From 8d1a88bd2780f4bc2d5e4eb79b11759f37b95c93 Mon Sep 17 00:00:00 2001 From: Gaetan Delannay Date: Wed, 26 Sep 2012 23:13:02 +0200 Subject: [PATCH] [shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. --- gen/__init__.py | 26 ++++---- gen/indexer.py | 133 ++++++++++++++++++++++++++++++++------- gen/installer.py | 60 ++++++------------ gen/utils.py | 7 ++- gen/wrappers/__init__.py | 4 +- shared/utils.py | 7 +++ shared/xml_parser.py | 16 +++-- 7 files changed, 164 insertions(+), 89 deletions(-) diff --git a/gen/__init__.py b/gen/__init__.py index ba0e346..9426309 100644 --- a/gen/__init__.py +++ b/gen/__init__.py @@ -7,6 +7,7 @@ from appy.gen.layout import Table from appy.gen.layout import defaultFieldLayouts from appy.gen.po import PoMessage from appy.gen.mail import sendNotification +from appy.gen.indexer import defaultIndexes from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects import appy.pod from appy.pod.renderer import Renderer @@ -33,13 +34,6 @@ def initMasterValue(v): else: res = v return [str(v) for v in res] -# Default Appy indexes --------------------------------------------------------- -defaultIndexes = { - 'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex', - 'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex', - 'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex', - 'Allowed': 'KeywordIndex'} - # Descriptor classes used for refining descriptions of elements in types # (pages, groups,...) ---------------------------------------------------------- class Page: @@ -325,7 +319,7 @@ class Search: if usage == 'search': return 'Title' else: return 'SortableTitle' # Indeed, for field 'title', Appy has a specific index - # 'SortableTitle', because index 'Title' is a ZCTextIndex + # 'SortableTitle', because index 'Title' is a TextIndex # (for searchability) and can't be used for sorting. elif fieldName == 'state': return 'State' elif fieldName in defaultIndexes: return fieldName @@ -337,8 +331,8 @@ class Search: value as required for searching in the index corresponding to p_fieldName.''' if fieldName == 'title': - # Title is a ZCTextIndex. We must split p_fieldValue into keywords. - res = Keywords(fieldValue.decode('utf-8')).get() + # Title is a TextIndex. We must split p_fieldValue into keywords. + res = Keywords(fieldValue).get() elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'): v = fieldValue[:-1] # Warning: 'z' is higher than 'Z'! @@ -1436,10 +1430,14 @@ class String(Type): def getIndexType(self): '''Index type varies depending on String parameters.''' - # If String.isSelect, be it multivalued or not, we define a ZCTextIndex: + # If String.isSelect, be it multivalued or not, we define a ListIndex: # this way we can use AND/OR operator. - if self.isSelect or (self.format in (String.TEXT, String.XHTML)): - return 'ZCTextIndex' + if self.isSelect: + return 'ListIndex' + elif self.format == String.TEXT: + return 'TextIndex' + elif self.format == String.XHTML: + return 'XhtmlIndex' return Type.getIndexType(self) def getJs(self, layoutType, res): @@ -1918,7 +1916,7 @@ class Ref(Type): def getFormattedValue(self, obj, value): return value - def getIndexType(self): return 'ZCTextIndex' + def getIndexType(self): return 'TextIndex' def getIndexValue(self, obj, forSearch=False): '''Value for indexing is the list of UIDs of linked objects. If diff --git a/gen/indexer.py b/gen/indexer.py index 915c10b..b817ee2 100644 --- a/gen/indexer.py +++ b/gen/indexer.py @@ -2,9 +2,87 @@ indexed.''' # ------------------------------------------------------------------------------ -from Products.ZCTextIndex.PipelineFactory import element_factory from appy.shared.xml_parser import XmlParser -from appy.shared.utils import normalizeString +from appy.shared.utils import normalizeText + +# Default Appy indexes --------------------------------------------------------- +defaultIndexes = { + 'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex', + 'SortableTitle': 'FieldIndex', 'SearchableText': 'XhtmlIndex', + 'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex', + 'Allowed': 'KeywordIndex'} + +# Stuff for creating or updating the indexes ----------------------------------- +class TextIndexInfo: + '''Parameters for a text ZCTextIndex.''' + lexicon_id = "text_lexicon" + index_type = 'Okapi BM25 Rank' + +class XhtmlIndexInfo: + '''Parameters for a html ZCTextIndex.''' + lexicon_id = "xhtml_lexicon" + index_type = 'Okapi BM25 Rank' + +class ListIndexInfo: + '''Parameters for a list ZCTextIndex.''' + lexicon_id = "list_lexicon" + index_type = 'Okapi BM25 Rank' + +def updateIndexes(installer, indexInfo): + '''This function updates the indexes defined in the catalog.''' + catalog = installer.app.catalog + logger = installer.logger + for indexName, indexType in indexInfo.iteritems(): + indexRealType = indexType + if indexType in ('XhtmlIndex', 'TextIndex', 'ListIndex'): + indexRealType = 'ZCTextIndex' + # If this index already exists but with a different type (or with a + # deprecated lexicon), remove it. + if indexName in catalog.indexes(): + indexObject = catalog.Indexes[indexName] + oldType = indexObject.__class__.__name__ + toDelete = False + if (oldType != indexRealType): + toDelete = True + info = indexRealType + elif (oldType == 'ZCTextIndex') and \ + (indexObject.lexicon_id == 'lexicon'): + toDelete = True + info = '%s (%s)' % (oldType, indexType) + if toDelete: + catalog.delIndex(indexName) + logger.info('Index %s (%s) to replace as %s.' % \ + (indexName, oldType, info)) + if indexName not in catalog.indexes(): + # We need to (re-)create this index. + if indexType == 'TextIndex': + catalog.addIndex(indexName, indexRealType, extra=TextIndexInfo) + elif indexType == 'XhtmlIndex': + catalog.addIndex(indexName, indexRealType, extra=XhtmlIndexInfo) + elif indexType == 'ListIndex': + catalog.addIndex(indexName, indexRealType, extra=ListIndexInfo) + else: + catalog.addIndex(indexName, indexType) + # Indexing database content based on this index. + logger.info('Reindexing %s (%s)...' % (indexName, indexType)) + catalog.reindexIndex(indexName, installer.app.REQUEST) + logger.info('Done.') + +# ------------------------------------------------------------------------------ +def splitIntoWords(text): + '''Split the cleaned index value p_text into words (returns a list of + words). Words of a single char are ignored, excepted digits which are + always kept. Duplicate words are removed (result is a set and not a + list).''' + res = text.split(' ') + # Remove tokens of a single char (excepted if this char is a digit). + i = len(res)-1 + while i > -1 : + if (len(res[i]) < 2) and not res[i].isdigit(): + del res[i] + i -= 1 + # Remove duplicates + return set(res) # ------------------------------------------------------------------------------ class XhtmlTextExtractor(XmlParser): @@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser): return XmlParser.endDocument(self) def characters(self, content): - c = normalizeString(content, usage='extractedText').strip().lower() + c = normalizeText(content) if len(c) > 1: self.res.append(c) - return self.env - - # Do not raise exceptions when errors occur. - def error(self, error): pass - def fatalError(self, error): pass - def warning(self, error): pass # ------------------------------------------------------------------------------ class XhtmlIndexer: '''Extracts, from XHTML field values, the text to index.''' - def process(self, text): - # Wrap the XHTML chunk into a root tag, to get valid XML. - text = '

%s

' % text[0] - parser = XhtmlTextExtractor() - text = parser.parse(text) - res = text.split(' ') - # Remove tokens of a single char. - i = len(res)-1 - while i > -1 : - if (len(res[i]) < 2) and not res[i].isdigit(): - del res[i] - i -= 1 - return res + def process(self, texts): + res = set() + for text in texts: + extractor = XhtmlTextExtractor(raiseOnError=False) + cleanText = extractor.parse('

%s

' % text) + res = res.union(splitIntoWords(cleanText)) + return list(res) # ------------------------------------------------------------------------------ -element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer) +class TextIndexer: + '''Extracts, from text field values, a normalized value to index.''' + def process(self, texts): + res = set() + for text in texts: + cleanText = normalizeText(text) + res = res.union(splitIntoWords(cleanText)) + return list(res) + +class ListIndexer: + '''This lexicon does nothing: list of values must be indexed as is.''' + def process(self, texts): return texts + +# ------------------------------------------------------------------------------ +try: + from Products.ZCTextIndex.PipelineFactory import element_factory as ef + ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer) + ef.registerFactory('Text indexer', 'Text indexer', TextIndexer) + ef.registerFactory('List indexer', 'List indexer', ListIndexer) +except ImportError: + # May occur at generation time. + pass # ------------------------------------------------------------------------------ diff --git a/gen/installer.py b/gen/installer.py index 68f21d9..a0670cb 100644 --- a/gen/installer.py +++ b/gen/installer.py @@ -8,6 +8,7 @@ import appy.version import appy.gen as gen from appy.gen.po import PoParser from appy.gen.utils import updateRolesForPermission, createObject +from appy.gen.indexer import defaultIndexes, updateIndexes from appy.gen.migrator import Migrator from appy.shared.data import languages @@ -63,11 +64,6 @@ def onDelSession(sessionObject, container): resp.write('
For security reasons, your session has ' \ 'expired.
') -class ZCTextIndexInfo: - '''Silly class used for storing information about a ZCTextIndex.''' - lexicon_id = "lexicon" - index_type = 'Okapi BM25 Rank' - # ------------------------------------------------------------------------------ class ZopeInstaller: '''This Zope installer runs every time Zope starts and encounters this @@ -148,35 +144,6 @@ class ZopeInstaller: self.app.manage_delObjects(['standard_error_message']) manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage) - def installIndexes(self, indexInfo): - '''Updates indexes in the catalog.''' - catalog = self.app.catalog - logger = self.logger - for indexName, indexType in indexInfo.iteritems(): - # If this index already exists but with a different type, remove it. - if indexName in catalog.indexes(): - oldType = catalog.Indexes[indexName].__class__.__name__ - if oldType != indexType: - catalog.delIndex(indexName) - logger.info('Existing index "%s" of type "%s" was removed:'\ - ' we need to recreate it with type "%s".' % \ - (indexName, oldType, indexType)) - if indexName not in catalog.indexes(): - # We need to create this index - if indexType != 'ZCTextIndex': - catalog.addIndex(indexName, indexType) - else: - catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo) - # Indexing database content based on this index. - catalog.reindexIndex(indexName, self.app.REQUEST) - logger.info('Created index "%s" of type "%s"...' % \ - (indexName, indexType)) - - lexiconInfos = [ - appy.Object(group='Case Normalizer', name='Case Normalizer'), - appy.Object(group='Stop Words', name=" Don't remove stop words"), - appy.Object(group='Word Splitter', name='Whitespace splitter') - ] def installCatalog(self): '''Create the catalog at the root of Zope if id does not exist.''' if 'catalog' not in self.app.objectIds(): @@ -185,19 +152,30 @@ class ZopeInstaller: manage_addZCatalog(self.app, 'catalog', '') self.logger.info('Appy catalog created.') - # Create a lexicon for ZCTextIndexes - if 'lexicon' not in self.app.catalog.objectIds(): - from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon - manage_addLexicon(self.app.catalog, 'lexicon', - elements=self.lexiconInfos) + # Create lexicons for ZCTextIndexes + catalog = self.app.catalog + lexicons = catalog.objectIds() + from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon + if 'xhtml_lexicon' not in lexicons: + lex = appy.Object(group='XHTML indexer', name='XHTML indexer') + manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex]) + if 'text_lexicon' not in lexicons: + lex = appy.Object(group='Text indexer', name='Text indexer') + manage_addLexicon(catalog, 'text_lexicon', elements=[lex]) + if 'list_lexicon' not in lexicons: + lex = appy.Object(group='List indexer', name='List indexer') + manage_addLexicon(catalog, 'list_lexicon', elements=[lex]) + + # Delete the deprecated one if it exists + if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon']) # Create or update Appy-wide indexes and field-related indexes - indexInfo = gen.defaultIndexes.copy() + indexInfo = defaultIndexes.copy() tool = self.app.config for className in self.config.attributes.iterkeys(): wrapperClass = tool.getAppyClass(className, wrapper=True) indexInfo.update(wrapperClass.getIndexes(includeDefaults=False)) - self.installIndexes(indexInfo) + updateIndexes(self, indexInfo) def getAddPermission(self, className): '''What is the name of the permission allowing to create instances of diff --git a/gen/utils.py b/gen/utils.py index 0578106..54d87d8 100644 --- a/gen/utils.py +++ b/gen/utils.py @@ -1,5 +1,6 @@ # ------------------------------------------------------------------------------ import re, os, os.path +from appy.shared.utils import normalizeText # Function for creating a Zope object ------------------------------------------ def createObject(folder, id, className, appName, wf=True, noSecurity=False): @@ -243,12 +244,12 @@ class SomeObjects: # ------------------------------------------------------------------------------ class Keywords: '''This class allows to handle keywords that a user enters and that will be - used as basis for performing requests in a Zope ZCTextIndex.''' + used as basis for performing requests in a TextIndex/XhtmlIndex.''' toRemove = '?-+*()' def __init__(self, keywords, operator='AND'): # Clean the p_keywords that the user has entered. - words = keywords.strip() + words = normalizeText(keywords) if words == '*': words = '' for c in self.toRemove: words = words.replace(c, ' ') self.keywords = words.split() @@ -267,7 +268,7 @@ class Keywords: self.keywords.insert(0, word) def get(self): - '''Returns the keywords as needed by the ZCTextIndex.''' + '''Returns the keywords as needed by the TextIndex.''' if self.keywords: op = ' %s ' % self.operator return op.join(self.keywords)+'*' diff --git a/gen/wrappers/__init__.py b/gen/wrappers/__init__.py index 3763ee8..674b011 100644 --- a/gen/wrappers/__init__.py +++ b/gen/wrappers/__init__.py @@ -4,8 +4,8 @@ # ------------------------------------------------------------------------------ import os, os.path, mimetypes import appy.pod -from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \ - defaultIndexes +from appy.gen import Type, Search, Ref, String, WorkflowAnonymous +from appy.gen.indexer import defaultIndexes from appy.gen.utils import createObject from appy.shared.utils import getOsTempFolder, executeCommand, \ normalizeString, sequenceTypes diff --git a/shared/utils.py b/shared/utils.py index d9db99b..f727d2c 100644 --- a/shared/utils.py +++ b/shared/utils.py @@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'): res = s return res +# ------------------------------------------------------------------------------ +def normalizeText(s): + '''Normalizes p_s: remove special chars, lowerizes it, etc, for indexing + purposes.''' + return normalizeString(s, usage='extractedText').strip().lower() + +# ------------------------------------------------------------------------------ def formatNumber(n, sep=',', precision=2, tsep=' '): '''Returns a string representation of number p_n, which can be a float or integer. p_sep is the decimal separator to use. p_precision is the diff --git a/shared/xml_parser.py b/shared/xml_parser.py index 0386889..930802d 100644 --- a/shared/xml_parser.py +++ b/shared/xml_parser.py @@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler): - remembering the currently parsed element; - managing namespace declarations. This parser also knows about HTML entities.''' - def __init__(self, env=None, caller=None): + def __init__(self, env=None, caller=None, raiseOnError=True): '''p_env should be an instance of a class that inherits from XmlEnvironment: it specifies the environment to use for this SAX parser.''' @@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler): self.caller = caller # The class calling this parser self.parser = xml.sax.make_parser() # Fast, standard expat parser self.res = None # The result of parsing. + # Raise or not an error when a parsing error is encountered. + self.raiseOnError = raiseOnError # ContentHandler methods --------------------------------------------------- def startDocument(self): @@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler): self.characters('?') # ErrorHandler methods --------------------------------------------------- - # Define methods below in your subclass if you want error handling that - # does not raise exceptions, but produces a partial result instead. - #def error(self, error): pass - #def fatalError(self, error): pass - #def warning(self, error): pass + def error(self, error): + if self.raiseOnError: raise error + else: print 'SAX error', error + def fatalError(self, error): + if self.raiseOnError: raise error + else: print 'SAX fatal error', error + def warning(self, error): pass def parse(self, xml, source='string'): '''Parses a XML stream.