Reviewer note (free text ahead of the first "diff --git" header; it belongs to
no hunk). This patch touches gen/__init__.py, gen/indexer.py and
gen/installer.py:
  * gen/__init__.py no longer defines defaultIndexes but imports it from
    appy.gen.indexer; String.getIndexType and Ref.getIndexType return the
    names 'ListIndex', 'TextIndex' or 'XhtmlIndex' instead of 'ZCTextIndex';
    Search.getSearchValue stops decoding the title value from UTF-8.
  * gen/indexer.py receives defaultIndexes, the TextIndexInfo, XhtmlIndexInfo
    and ListIndexInfo parameter classes, updateIndexes, splitIntoWords, and the
    TextIndexer and ListIndexer classes; factory registration is wrapped in a
    try/except ImportError.
  * gen/installer.py imports defaultIndexes and updateIndexes.
diff --git a/gen/__init__.py b/gen/__init__.py
index ba0e346..9426309 100644
--- a/gen/__init__.py
+++ b/gen/__init__.py
@@ -7,6 +7,7 @@ from appy.gen.layout import Table
 from appy.gen.layout import defaultFieldLayouts
 from appy.gen.po import PoMessage
 from appy.gen.mail import sendNotification
+from appy.gen.indexer import defaultIndexes
 from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
 import appy.pod
 from appy.pod.renderer import Renderer
@@ -33,13 +34,6 @@ def initMasterValue(v):
     else: res = v
     return [str(v) for v in res]
 
-# Default Appy indexes ---------------------------------------------------------
-defaultIndexes = {
-    'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
-    'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
-    'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
-    'Allowed': 'KeywordIndex'}
-
 # Descriptor classes used for refining descriptions of elements in types
 # (pages, groups,...) ----------------------------------------------------------
 class Page:
@@ -325,7 +319,7 @@ class Search:
             if usage == 'search': return 'Title'
             else: return 'SortableTitle'
             # Indeed, for field 'title', Appy has a specific index
-            # 'SortableTitle', because index 'Title' is a ZCTextIndex
+            # 'SortableTitle', because index 'Title' is a TextIndex
             # (for searchability) and can't be used for sorting.
         elif fieldName == 'state': return 'State'
         elif fieldName in defaultIndexes: return fieldName
@@ -337,8 +331,8 @@ class Search:
            value as required for searching in the index corresponding to
            p_fieldName.'''
         if fieldName == 'title':
-            # Title is a ZCTextIndex. We must split p_fieldValue into keywords.
-            res = Keywords(fieldValue.decode('utf-8')).get()
+            # Title is a TextIndex. We must split p_fieldValue into keywords.
+            res = Keywords(fieldValue).get()
         elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
             v = fieldValue[:-1]
             # Warning: 'z' is higher than 'Z'!
@@ -1436,10 +1430,14 @@ class String(Type):
 
     def getIndexType(self):
         '''Index type varies depending on String parameters.'''
-        # If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
+        # If String.isSelect, be it multivalued or not, we define a ListIndex:
         # this way we can use AND/OR operator.
-        if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
-            return 'ZCTextIndex'
+        if self.isSelect:
+            return 'ListIndex'
+        elif self.format == String.TEXT:
+            return 'TextIndex'
+        elif self.format == String.XHTML:
+            return 'XhtmlIndex'
         return Type.getIndexType(self)
 
     def getJs(self, layoutType, res):
@@ -1918,7 +1916,7 @@ class Ref(Type):
     def getFormattedValue(self, obj, value):
         return value
 
-    def getIndexType(self): return 'ZCTextIndex'
+    def getIndexType(self): return 'TextIndex'
 
     def getIndexValue(self, obj, forSearch=False):
         '''Value for indexing is the list of UIDs of linked objects. If
diff --git a/gen/indexer.py b/gen/indexer.py
index 915c10b..b817ee2 100644
--- a/gen/indexer.py
+++ b/gen/indexer.py
@@ -2,9 +2,87 @@
    indexed.'''
 
 # ------------------------------------------------------------------------------
-from Products.ZCTextIndex.PipelineFactory import element_factory
 from appy.shared.xml_parser import XmlParser
-from appy.shared.utils import normalizeString
+from appy.shared.utils import normalizeText
+
+# Default Appy indexes ---------------------------------------------------------
+defaultIndexes = {
+    'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex',
+    'SortableTitle': 'FieldIndex', 'SearchableText': 'XhtmlIndex',
+    'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
+    'Allowed': 'KeywordIndex'}
+
+# Stuff for creating or updating the indexes -----------------------------------
+class TextIndexInfo:
+    '''Parameters for a text ZCTextIndex.'''
+    lexicon_id = "text_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+class XhtmlIndexInfo:
+    '''Parameters for a html ZCTextIndex.'''
+    lexicon_id = "xhtml_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+class ListIndexInfo:
+    '''Parameters for a list ZCTextIndex.'''
+    lexicon_id = "list_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+def updateIndexes(installer, indexInfo):
+    '''This function updates the indexes defined in the catalog.'''
+    catalog = installer.app.catalog
+    logger = installer.logger
+    for indexName, indexType in indexInfo.iteritems():
+        indexRealType = indexType
+        if indexType in ('XhtmlIndex', 'TextIndex', 'ListIndex'):
+            indexRealType = 'ZCTextIndex'
+        # If this index already exists but with a different type (or with a
+        # deprecated lexicon), remove it.
+        if indexName in catalog.indexes():
+            indexObject = catalog.Indexes[indexName]
+            oldType = indexObject.__class__.__name__
+            toDelete = False
+            if (oldType != indexRealType):
+                toDelete = True
+                info = indexRealType
+            elif (oldType == 'ZCTextIndex') and \
+                 (indexObject.lexicon_id == 'lexicon'):
+                toDelete = True
+                info = '%s (%s)' % (oldType, indexType)
+            if toDelete:
+                catalog.delIndex(indexName)
+                logger.info('Index %s (%s) to replace as %s.' % \
+                            (indexName, oldType, info))
+        if indexName not in catalog.indexes():
+            # We need to (re-)create this index.
+            if indexType == 'TextIndex':
+                catalog.addIndex(indexName, indexRealType, extra=TextIndexInfo)
+            elif indexType == 'XhtmlIndex':
+                catalog.addIndex(indexName, indexRealType, extra=XhtmlIndexInfo)
+            elif indexType == 'ListIndex':
+                catalog.addIndex(indexName, indexRealType, extra=ListIndexInfo)
+            else:
+                catalog.addIndex(indexName, indexType)
+            # Indexing database content based on this index.
+            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
+            catalog.reindexIndex(indexName, installer.app.REQUEST)
+            logger.info('Done.')
+
+# ------------------------------------------------------------------------------
+def splitIntoWords(text):
+    '''Split the cleaned index value p_text into words (returns a list of
+       words). Words of a single char are ignored, excepted digits which are
+       always kept. Duplicate words are removed (result is a set and not a
+       list).'''
+    res = text.split(' ')
+    # Remove tokens of a single char (excepted if this char is a digit).
+    i = len(res)-1
+    while i > -1 :
+        if (len(res[i]) < 2) and not res[i].isdigit():
+            del res[i]
+        i -= 1
+    # Remove duplicates
+    return set(res)
 
 # ------------------------------------------------------------------------------
 class XhtmlTextExtractor(XmlParser):
@@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
         return XmlParser.endDocument(self)
 
     def characters(self, content):
-        c = normalizeString(content, usage='extractedText').strip().lower()
+        c = normalizeText(content)
         if len(c) > 1: self.res.append(c)
-        return self.env
-
-    # Do not raise exceptions when errors occur.
-    def error(self, error): pass
-    def fatalError(self, error): pass
-    def warning(self, error): pass
 
 # ------------------------------------------------------------------------------
 class XhtmlIndexer:
     '''Extracts, from XHTML field values, the text to index.'''
-    def process(self, text):
-        # Wrap the XHTML chunk into a root tag, to get valid XML.
-        text = '<p>%s</p>' % text[0]
-        parser = XhtmlTextExtractor()
-        text = parser.parse(text)
-        res = text.split(' ')
-        # Remove tokens of a single char.
-        i = len(res)-1
-        while i > -1 :
-            if (len(res[i]) < 2) and not res[i].isdigit():
-                del res[i]
-            i -= 1
-        return res
+    def process(self, texts):
+        res = set()
+        for text in texts:
+            extractor = XhtmlTextExtractor(raiseOnError=False)
+            cleanText = extractor.parse('<p>%s</p>' % text)
+            res = res.union(splitIntoWords(cleanText))
+        return list(res)
 
 # ------------------------------------------------------------------------------
-element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
+class TextIndexer:
+    '''Extracts, from text field values, a normalized value to index.'''
+    def process(self, texts):
+        res = set()
+        for text in texts:
+            cleanText = normalizeText(text)
+            res = res.union(splitIntoWords(cleanText))
+        return list(res)
+
+class ListIndexer:
+    '''This lexicon does nothing: list of values must be indexed as is.'''
+    def process(self, texts): return texts
+
+# ------------------------------------------------------------------------------
+try:
+    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
+    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
+    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
+    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
+except ImportError:
+    # May occur at generation time.
+    pass
 # ------------------------------------------------------------------------------
diff --git a/gen/installer.py b/gen/installer.py
index 68f21d9..a0670cb 100644
--- a/gen/installer.py
+++ b/gen/installer.py
@@ -8,6 +8,7 @@ import appy.version
 import appy.gen as gen
 from appy.gen.po import PoParser
 from appy.gen.utils import updateRolesForPermission, createObject
+from appy.gen.indexer import defaultIndexes, updateIndexes
 from appy.gen.migrator import Migrator
 from appy.shared.data import languages
 
@@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
         resp.write('