[shared] xml_parser.XmlParser: added param 'raiseOnError', which determines whether an exception is raised when a SAX fatal parsing error is encountered; [gen] fine-tuned the indexing machinery with more accurate text extraction from text and xhtml fields.

2012-09-26 23:13:02 +02:00 · 2012-09-26 23:13:02 +02:00 · 8d1a88bd27
commit 8d1a88bd27
parent a2ae839704
7 changed files with 164 additions and 89 deletions
--- a/gen/init.py
+++ b/gen/init.py
@ -7,6 +7,7 @@ from appy.gen.layout import Table
 from appy.gen.layout import defaultFieldLayouts
 from appy.gen.po import PoMessage
 from appy.gen.mail import sendNotification
 from appy.gen.indexer import defaultIndexes
 from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
 import appy.pod
 from appy.pod.renderer import Renderer
@ -33,13 +34,6 @@ def initMasterValue(v):
    else: res = v
    return [str(v) for v in res]
 # Default Appy indexes ---------------------------------------------------------
 defaultIndexes = {
    'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
    'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
    'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
    'Allowed': 'KeywordIndex'}
 # Descriptor classes used for refining descriptions of elements in types
 # (pages, groups,...) ----------------------------------------------------------
 class Page:
@ -325,7 +319,7 @@ class Search:
            if usage == 'search':  return 'Title'
            else:                  return 'SortableTitle'
            # Indeed, for field 'title', Appy has a specific index
-            # 'SortableTitle', because index 'Title' is a ZCTextIndex
+            # 'SortableTitle', because index 'Title' is a TextIndex
            # (for searchability) and can't be used for sorting.
        elif fieldName == 'state': return 'State'
        elif fieldName in defaultIndexes: return fieldName
@ -337,8 +331,8 @@ class Search:
           value as required for searching in the index corresponding to
           p_fieldName.'''
        if fieldName == 'title':
-            # Title is a ZCTextIndex. We must split p_fieldValue into keywords.
+            # Title is a TextIndex. We must split p_fieldValue into keywords.
-            res = Keywords(fieldValue.decode('utf-8')).get()
+            res = Keywords(fieldValue).get()
        elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
            v = fieldValue[:-1]
            # Warning: 'z' is higher than 'Z'!
@ -1436,10 +1430,14 @@ class String(Type):
    def getIndexType(self):
        '''Index type varies depending on String parameters.'''
-        # If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
+        # If String.isSelect, be it multivalued or not, we define a ListIndex:
        # this way we can use AND/OR operator.
-        if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
+        if self.isSelect:
-            return 'ZCTextIndex'
+            return 'ListIndex'
        elif self.format == String.TEXT:
            return 'TextIndex'
        elif self.format == String.XHTML:
            return 'XhtmlIndex'
        return Type.getIndexType(self)
    def getJs(self, layoutType, res):
@ -1918,7 +1916,7 @@ class Ref(Type):
    def getFormattedValue(self, obj, value):
        return value
-    def getIndexType(self): return 'ZCTextIndex'
+    def getIndexType(self): return 'TextIndex'
    def getIndexValue(self, obj, forSearch=False):
        '''Value for indexing is the list of UIDs of linked objects. If
--- a/gen/indexer.py
+++ b/gen/indexer.py
@ -2,9 +2,87 @@
   indexed.'''
 # ------------------------------------------------------------------------------
 from Products.ZCTextIndex.PipelineFactory import element_factory
 from appy.shared.xml_parser import XmlParser
-from appy.shared.utils import normalizeString
+from appy.shared.utils import normalizeText
 # Default Appy indexes ---------------------------------------------------------
 defaultIndexes = {
    # Index name: Appy index type.
    'State': 'FieldIndex',
    'UID': 'FieldIndex',
    'Title': 'TextIndex',
    'SortableTitle': 'FieldIndex',
    'SearchableText': 'XhtmlIndex',
    'Creator': 'FieldIndex',
    'Created': 'DateIndex',
    'ClassName': 'FieldIndex',
    'Allowed': 'KeywordIndex',
 }
 # Stuff for creating or updating the indexes -----------------------------------
 class TextIndexInfo:
    '''Settings passed as "extra" when creating the ZCTextIndex that backs an
       Appy "TextIndex": the ranking algorithm and the id of the lexicon to
       use.'''
    index_type = "Okapi BM25 Rank"
    lexicon_id = 'text_lexicon'
 class XhtmlIndexInfo:
    '''Settings passed as "extra" when creating the ZCTextIndex that backs an
       Appy "XhtmlIndex": the ranking algorithm and the id of the lexicon to
       use.'''
    index_type = "Okapi BM25 Rank"
    lexicon_id = 'xhtml_lexicon'
 class ListIndexInfo:
    '''Settings passed as "extra" when creating the ZCTextIndex that backs an
       Appy "ListIndex": the ranking algorithm and the id of the lexicon to
       use.'''
    index_type = "Okapi BM25 Rank"
    lexicon_id = 'list_lexicon'
 def updateIndexes(installer, indexInfo):
    '''Synchronizes the catalog indexes with p_indexInfo, a dict whose keys
       are index names and whose values are Appy index types. The catalog
       (installer.app.catalog), the logger (installer.logger) and the request
       used for reindexing (installer.app.REQUEST) are got from
       p_installer.'''
    catalog = installer.app.catalog
    logger = installer.logger
    # Appy index types that are all implemented by a Zope ZCTextIndex, each
    # one with the class holding the parameters for creating the index.
    zcTextInfos = {'TextIndex': TextIndexInfo, 'XhtmlIndex': XhtmlIndexInfo,
                   'ListIndex': ListIndexInfo}
    for name, appyType in indexInfo.iteritems():
        # Determine the name of the Zope class implementing this index.
        if appyType in zcTextInfos:
            zopeType = 'ZCTextIndex'
        else:
            zopeType = appyType
        if name in catalog.indexes():
            # An index with this name exists. Drop it if its type is not the
            # expected one or if it still uses the deprecated lexicon.
            existing = catalog.Indexes[name]
            oldType = existing.__class__.__name__
            if oldType != zopeType:
                obsolete = True
                newDescr = zopeType
            elif (oldType == 'ZCTextIndex') and \
                 (existing.lexicon_id == 'lexicon'):
                obsolete = True
                newDescr = '%s (%s)' % (oldType, appyType)
            else:
                obsolete = False
            if obsolete:
                catalog.delIndex(name)
                logger.info('Index %s (%s) to replace as %s.' % \
                            (name, oldType, newDescr))
        if name not in catalog.indexes():
            # The index is missing (or has just been dropped): create it.
            if appyType in zcTextInfos:
                catalog.addIndex(name, zopeType, extra=zcTextInfos[appyType])
            else:
                catalog.addIndex(name, appyType)
            # Populate the new index with the existing database content.
            logger.info('Reindexing %s (%s)...' % (name, appyType))
            catalog.reindexIndex(name, installer.app.REQUEST)
            logger.info('Done.')
 # ------------------------------------------------------------------------------
 def splitIntoWords(text):
    '''Splits the cleaned index value p_text into words and returns them as a
       set (so duplicate words are removed). p_text is split on single space
       chars. Words of a single char are ignored, except digits, which are
       always kept. Empty tokens, produced by consecutive spaces, are ignored
       as well.'''
    # A token is kept if it counts at least 2 chars or if it is a digit. A
    # single filtering pass replaces the former in-place deletions, each of
    # them being linear in the number of tokens.
    return set(word for word in text.split(' ') \
               if (len(word) > 1) or word.isdigit())
 # ------------------------------------------------------------------------------
 class XhtmlTextExtractor(XmlParser):
@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
        return XmlParser.endDocument(self)
    def characters(self, content):
-        c = normalizeString(content, usage='extractedText').strip().lower()
+        c = normalizeText(content)
        if len(c) > 1: self.res.append(c)
        return self.env
    # Do not raise exceptions when errors occur.
    def error(self, error): pass
    def fatalError(self, error): pass
    def warning(self, error): pass
 # ------------------------------------------------------------------------------
 class XhtmlIndexer:
    '''Extracts, from XHTML field values, the text to index.'''
-    def process(self, text):
+    def process(self, texts):
-        # Wrap the XHTML chunk into a root tag, to get valid XML.
+        res = set()
-        text = '<p>%s</p>' % text[0]
+        for text in texts:
-        parser = XhtmlTextExtractor()
+            extractor = XhtmlTextExtractor(raiseOnError=False)
-        text = parser.parse(text)
+            cleanText = extractor.parse('<p>%s</p>' % text)
-        res = text.split(' ')
+            res = res.union(splitIntoWords(cleanText))
-        # Remove tokens of a single char.
+        return list(res)
        i = len(res)-1
        while i > -1 :
            if (len(res[i]) < 2) and not res[i].isdigit():
                del res[i]
            i -= 1
        return res
 # ------------------------------------------------------------------------------
-element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
+class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''
    def process(self, texts):
        '''Returns, as a list without duplicates, the words found in all the
           values from p_texts. Every value is normalized via normalizeText
           before being split via splitIntoWords.'''
        words = set()
        for value in texts:
            words = words | splitIntoWords(normalizeText(value))
        return list(words)
 class ListIndexer:
    '''Pass-through indexer: a list of values must be indexed as is.'''
    def process(self, texts):
        # No normalization and no splitting: p_texts is returned untouched.
        return texts
 # ------------------------------------------------------------------------------
 try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    # Every indexer is registered with the same label as group and as name.
    for label, indexer in (('XHTML indexer', XhtmlIndexer),
                           ('Text indexer', TextIndexer),
                           ('List indexer', ListIndexer)):
        ef.registerFactory(label, label, indexer)
 except ImportError:
    # May occur at generation time.
    pass
 # ------------------------------------------------------------------------------
--- a/gen/installer.py
+++ b/gen/installer.py
@ -8,6 +8,7 @@ import appy.version
 import appy.gen as gen
 from appy.gen.po import PoParser
 from appy.gen.utils import updateRolesForPermission, createObject
 from appy.gen.indexer import defaultIndexes, updateIndexes
 from appy.gen.migrator import Migrator
 from appy.shared.data import languages
@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
        resp.write('<center>For security reasons, your session has ' \
                   'expired.</center>')
 class ZCTextIndexInfo:
    '''Silly class used for storing information about a ZCTextIndex.'''
    lexicon_id = "lexicon"
    index_type = 'Okapi BM25 Rank'
 # ------------------------------------------------------------------------------
 class ZopeInstaller:
    '''This Zope installer runs every time Zope starts and encounters this
@ -148,35 +144,6 @@ class ZopeInstaller:
            self.app.manage_delObjects(['standard_error_message'])
        manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
    def installIndexes(self, indexInfo):
        '''Updates indexes in the catalog.'''
        catalog = self.app.catalog
        logger = self.logger
        for indexName, indexType in indexInfo.iteritems():
            # If this index already exists but with a different type, remove it.
            if indexName in catalog.indexes():
                oldType = catalog.Indexes[indexName].__class__.__name__
                if oldType != indexType:
                    catalog.delIndex(indexName)
                    logger.info('Existing index "%s" of type "%s" was removed:'\
                                ' we need to recreate it with type "%s".' % \
                                (indexName, oldType, indexType))
            if indexName not in catalog.indexes():
                # We need to create this index
                if indexType != 'ZCTextIndex':
                    catalog.addIndex(indexName, indexType)
                else:
                    catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
                # Indexing database content based on this index.
                catalog.reindexIndex(indexName, self.app.REQUEST)
                logger.info('Created index "%s" of type "%s"...' % \
                            (indexName, indexType))
    lexiconInfos = [
        appy.Object(group='Case Normalizer', name='Case Normalizer'),
        appy.Object(group='Stop Words', name=" Don't remove stop words"),
        appy.Object(group='Word Splitter', name='Whitespace splitter')
    ]
    def installCatalog(self):
        '''Create the catalog at the root of Zope if id does not exist.'''
        if 'catalog' not in self.app.objectIds():
@ -185,19 +152,30 @@ class ZopeInstaller:
            manage_addZCatalog(self.app, 'catalog', '')
            self.logger.info('Appy catalog created.')
-        # Create a lexicon for ZCTextIndexes
+        # Create lexicons for ZCTextIndexes
-        if 'lexicon' not in self.app.catalog.objectIds():
+        catalog = self.app.catalog
        lexicons = catalog.objectIds()
        from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
-            manage_addLexicon(self.app.catalog, 'lexicon',
+        if 'xhtml_lexicon' not in lexicons:
-                              elements=self.lexiconInfos)
+            lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
            manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
        if 'text_lexicon' not in lexicons:
            lex = appy.Object(group='Text indexer', name='Text indexer')
            manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
        if 'list_lexicon' not in lexicons:
            lex = appy.Object(group='List indexer', name='List indexer')
            manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
        # Delete the deprecated one if it exists
        if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
        # Create or update Appy-wide indexes and field-related indexes
-        indexInfo = gen.defaultIndexes.copy()
+        indexInfo = defaultIndexes.copy()
        tool = self.app.config
        for className in self.config.attributes.iterkeys():
            wrapperClass = tool.getAppyClass(className, wrapper=True)
            indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
-        self.installIndexes(indexInfo)
+        updateIndexes(self, indexInfo)
    def getAddPermission(self, className):
        '''What is the name of the permission allowing to create instances of
--- a/gen/utils.py
+++ b/gen/utils.py
@ -1,5 +1,6 @@
 # ------------------------------------------------------------------------------
 import re, os, os.path
 from appy.shared.utils import normalizeText
 # Function for creating a Zope object ------------------------------------------
 def createObject(folder, id, className, appName, wf=True, noSecurity=False):
@ -243,12 +244,12 @@ class SomeObjects:
 # ------------------------------------------------------------------------------
 class Keywords:
    '''This class allows to handle keywords that a user enters and that will be
-       used as basis for performing requests in a Zope ZCTextIndex.'''
+       used as basis for performing requests in a TextIndex/XhtmlIndex.'''
    toRemove = '?-+*()'
    def __init__(self, keywords, operator='AND'):
        # Clean the p_keywords that the user has entered.
-        words = keywords.strip()
+        words = normalizeText(keywords)
        if words == '*': words = ''
        for c in self.toRemove: words = words.replace(c, ' ')
        self.keywords = words.split()
@ -267,7 +268,7 @@ class Keywords:
                    self.keywords.insert(0, word)
    def get(self):
-        '''Returns the keywords as needed by the ZCTextIndex.'''
+        '''Returns the keywords as needed by the TextIndex.'''
        if self.keywords:
            op = ' %s ' % self.operator
            return op.join(self.keywords)+'*'
--- a/gen/wrappers/init.py
+++ b/gen/wrappers/init.py
@ -4,8 +4,8 @@
 # ------------------------------------------------------------------------------
 import os, os.path, mimetypes
 import appy.pod
-from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \
+from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
-                     defaultIndexes
+from appy.gen.indexer import defaultIndexes
 from appy.gen.utils import createObject
 from appy.shared.utils import getOsTempFolder, executeCommand, \
                              normalizeString, sequenceTypes
--- a/shared/utils.py
+++ b/shared/utils.py
@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
        res = s
    return res
 # ------------------------------------------------------------------------------
 def normalizeText(s):
    '''Returns p_s prepared for indexing purposes: p_s is first normalized
       via normalizeString (usage "extractedText"); the result is then
       stripped and lowercased.'''
    cleaned = normalizeString(s, usage='extractedText')
    return cleaned.strip().lower()
 # ------------------------------------------------------------------------------
 def formatNumber(n, sep=',', precision=2, tsep=' '):
    '''Returns a string representation of number p_n, which can be a float
       or integer. p_sep is the decimal separator to use. p_precision is the
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
      - remembering the currently parsed element;
      - managing namespace declarations.
      This parser also knows about HTML entities.'''
-    def __init__(self, env=None, caller=None):
+    def __init__(self, env=None, caller=None, raiseOnError=True):
        '''p_env should be an instance of a class that inherits from
           XmlEnvironment: it specifies the environment to use for this SAX
           parser.'''
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
        self.caller = caller # The class calling this parser
        self.parser = xml.sax.make_parser() # Fast, standard expat parser
        self.res = None # The result of parsing.
        # Raise or not an error when a parsing error is encountered.
        self.raiseOnError = raiseOnError
    # ContentHandler methods ---------------------------------------------------
    def startDocument(self):
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
            self.characters('?')
    # ErrorHandler methods ---------------------------------------------------
-    # Define methods below in your subclass if you want error handling that
+    def error(self, error):
-    # does not raise exceptions, but produces a partial result instead.
+        if self.raiseOnError: raise error
-    #def error(self, error): pass
+        else: print 'SAX error', error
-    #def fatalError(self, error): pass
+    def fatalError(self, error):
-    #def warning(self, error): pass
+        if self.raiseOnError: raise error
        else: print 'SAX fatal error', error
    def warning(self, error): pass
    def parse(self, xml, source='string'):
        '''Parses a XML stream.