From 8d1a88bd2780f4bc2d5e4eb79b11759f37b95c93 Mon Sep 17 00:00:00 2001
From: Gaetan Delannay <gaetan.delannay@gmail.com>
Date: Wed, 26 Sep 2012 23:13:02 +0200
Subject: [PATCH] [shared] xml_parser.XmlParser: added param 'raiseOnError'
 controlling whether an exception is raised when a SAX parsing error is
 encountered; [gen] fine-tuned indexing machinery with more accurate text
 extraction from text and xhtml fields.

---
 gen/__init__.py          |  26 ++++----
 gen/indexer.py           | 133 ++++++++++++++++++++++++++++++++-------
 gen/installer.py         |  60 ++++++------------
 gen/utils.py             |   7 ++-
 gen/wrappers/__init__.py |   4 +-
 shared/utils.py          |   7 +++
 shared/xml_parser.py     |  16 +++--
 7 files changed, 164 insertions(+), 89 deletions(-)

diff --git a/gen/__init__.py b/gen/__init__.py
index ba0e346..9426309 100644
--- a/gen/__init__.py
+++ b/gen/__init__.py
@@ -7,6 +7,7 @@ from appy.gen.layout import Table
 from appy.gen.layout import defaultFieldLayouts
 from appy.gen.po import PoMessage
 from appy.gen.mail import sendNotification
+from appy.gen.indexer import defaultIndexes
 from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
 import appy.pod
 from appy.pod.renderer import Renderer
@@ -33,13 +34,6 @@ def initMasterValue(v):
     else: res = v
     return [str(v) for v in res]
 
-# Default Appy indexes ---------------------------------------------------------
-defaultIndexes = {
-    'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
-    'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
-    'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
-    'Allowed': 'KeywordIndex'}
-
 # Descriptor classes used for refining descriptions of elements in types
 # (pages, groups,...) ----------------------------------------------------------
 class Page:
@@ -325,7 +319,7 @@ class Search:
             if usage == 'search':  return 'Title'
             else:                  return 'SortableTitle'
             # Indeed, for field 'title', Appy has a specific index
-            # 'SortableTitle', because index 'Title' is a ZCTextIndex
+            # 'SortableTitle', because index 'Title' is a TextIndex
             # (for searchability) and can't be used for sorting.
         elif fieldName == 'state': return 'State'
         elif fieldName in defaultIndexes: return fieldName
@@ -337,8 +331,8 @@ class Search:
            value as required for searching in the index corresponding to
            p_fieldName.'''
         if fieldName == 'title':
-            # Title is a ZCTextIndex. We must split p_fieldValue into keywords.
-            res = Keywords(fieldValue.decode('utf-8')).get()
+            # Title is a TextIndex. We must split p_fieldValue into keywords.
+            res = Keywords(fieldValue).get()
         elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
             v = fieldValue[:-1]
             # Warning: 'z' is higher than 'Z'!
@@ -1436,10 +1430,14 @@ class String(Type):
 
     def getIndexType(self):
         '''Index type varies depending on String parameters.'''
-        # If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
+        # If String.isSelect, be it multivalued or not, we define a ListIndex:
         # this way we can use AND/OR operator.
-        if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
-            return 'ZCTextIndex'
+        if self.isSelect:
+            return 'ListIndex'
+        elif self.format == String.TEXT:
+            return 'TextIndex'
+        elif self.format == String.XHTML:
+            return 'XhtmlIndex'
         return Type.getIndexType(self)
 
     def getJs(self, layoutType, res):
@@ -1918,7 +1916,7 @@ class Ref(Type):
     def getFormattedValue(self, obj, value):
         return value
 
-    def getIndexType(self): return 'ZCTextIndex'
+    def getIndexType(self): return 'TextIndex'
 
     def getIndexValue(self, obj, forSearch=False):
         '''Value for indexing is the list of UIDs of linked objects. If
diff --git a/gen/indexer.py b/gen/indexer.py
index 915c10b..b817ee2 100644
--- a/gen/indexer.py
+++ b/gen/indexer.py
@@ -2,9 +2,87 @@
    indexed.'''
 
 # ------------------------------------------------------------------------------
-from Products.ZCTextIndex.PipelineFactory import element_factory
 from appy.shared.xml_parser import XmlParser
-from appy.shared.utils import normalizeString
+from appy.shared.utils import normalizeText
+
+# Default Appy indexes ---------------------------------------------------------
+defaultIndexes = {
+    'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex',
+    'SortableTitle': 'FieldIndex', 'SearchableText': 'XhtmlIndex',
+    'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
+    'Allowed': 'KeywordIndex'}
+
+# Stuff for creating or updating the indexes -----------------------------------
+class TextIndexInfo:
+    '''Parameters for a text ZCTextIndex.'''
+    lexicon_id = "text_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+class XhtmlIndexInfo:
+    '''Parameters for an XHTML ZCTextIndex.'''
+    lexicon_id = "xhtml_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+class ListIndexInfo:
+    '''Parameters for a list ZCTextIndex.'''
+    lexicon_id = "list_lexicon"
+    index_type = 'Okapi BM25 Rank'
+
+def updateIndexes(installer, indexInfo):
+    '''This function updates the indexes defined in the catalog.'''
+    catalog = installer.app.catalog
+    logger = installer.logger
+    for indexName, indexType in indexInfo.iteritems():
+        indexRealType = indexType
+        if indexType in ('XhtmlIndex', 'TextIndex', 'ListIndex'):
+            indexRealType = 'ZCTextIndex'
+        # If this index already exists but with a different type (or with a
+        # deprecated lexicon), remove it.
+        if indexName in catalog.indexes():
+            indexObject = catalog.Indexes[indexName]
+            oldType = indexObject.__class__.__name__
+            toDelete = False
+            if (oldType != indexRealType):
+                toDelete = True
+                info = indexRealType
+            elif (oldType == 'ZCTextIndex') and \
+                 (indexObject.lexicon_id == 'lexicon'):
+                toDelete = True
+                info = '%s (%s)' % (oldType, indexType)
+            if toDelete:
+                catalog.delIndex(indexName)
+                logger.info('Index %s (%s) to replace as %s.' % \
+                            (indexName, oldType, info))
+        if indexName not in catalog.indexes():
+            # We need to (re-)create this index.
+            if indexType == 'TextIndex':
+                catalog.addIndex(indexName, indexRealType, extra=TextIndexInfo)
+            elif indexType == 'XhtmlIndex':
+                catalog.addIndex(indexName, indexRealType, extra=XhtmlIndexInfo)
+            elif indexType == 'ListIndex':
+                catalog.addIndex(indexName, indexRealType, extra=ListIndexInfo)
+            else:
+                catalog.addIndex(indexName, indexType)
+            # Indexing database content based on this index.
+            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
+            catalog.reindexIndex(indexName, installer.app.REQUEST)
+            logger.info('Done.')
+
+# ------------------------------------------------------------------------------
+def splitIntoWords(text):
+    '''Split the cleaned index value p_text into words (returns a set of
+       words). Words of a single char are ignored, except digits which are
+       always kept. Duplicate words are removed (result is a set and not a
+       list).'''
+    res = text.split(' ')
+    # Remove tokens of a single char (except if this char is a digit).
+    i = len(res)-1
+    while i > -1 :
+        if (len(res[i]) < 2) and not res[i].isdigit():
+            del res[i]
+        i -= 1
+    # Remove duplicates
+    return set(res)
 
 # ------------------------------------------------------------------------------
 class XhtmlTextExtractor(XmlParser):
@@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
         return XmlParser.endDocument(self)
 
     def characters(self, content):
-        c = normalizeString(content, usage='extractedText').strip().lower()
+        c = normalizeText(content)
         if len(c) > 1: self.res.append(c)
-        return self.env
-
-    # Do not raise exceptions when errors occur.
-    def error(self, error): pass
-    def fatalError(self, error): pass
-    def warning(self, error): pass
 
 # ------------------------------------------------------------------------------
 class XhtmlIndexer:
     '''Extracts, from XHTML field values, the text to index.'''
-    def process(self, text):
-        # Wrap the XHTML chunk into a root tag, to get valid XML.
-        text = '<p>%s</p>' % text[0]
-        parser = XhtmlTextExtractor()
-        text = parser.parse(text)
-        res = text.split(' ')
-        # Remove tokens of a single char.
-        i = len(res)-1
-        while i > -1 :
-            if (len(res[i]) < 2) and not res[i].isdigit():
-                del res[i]
-            i -= 1
-        return res
+    def process(self, texts):
+        res = set()
+        for text in texts:
+            extractor = XhtmlTextExtractor(raiseOnError=False)
+            cleanText = extractor.parse('<p>%s</p>' % text)
+            res = res.union(splitIntoWords(cleanText))
+        return list(res)
 
 # ------------------------------------------------------------------------------
-element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
+class TextIndexer:
+    '''Extracts, from text field values, a normalized value to index.'''
+    def process(self, texts):
+        res = set()
+        for text in texts:
+            cleanText = normalizeText(text)
+            res = res.union(splitIntoWords(cleanText))
+        return list(res)
+
+class ListIndexer:
+    '''This lexicon does nothing: list of values must be indexed as is.'''
+    def process(self, texts): return texts
+
+# ------------------------------------------------------------------------------
+try:
+    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
+    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
+    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
+    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
+except ImportError:
+    # May occur at generation time.
+    pass
 # ------------------------------------------------------------------------------
diff --git a/gen/installer.py b/gen/installer.py
index 68f21d9..a0670cb 100644
--- a/gen/installer.py
+++ b/gen/installer.py
@@ -8,6 +8,7 @@ import appy.version
 import appy.gen as gen
 from appy.gen.po import PoParser
 from appy.gen.utils import updateRolesForPermission, createObject
+from appy.gen.indexer import defaultIndexes, updateIndexes
 from appy.gen.migrator import Migrator
 from appy.shared.data import languages
 
@@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
         resp.write('<center>For security reasons, your session has ' \
                    'expired.</center>')
 
-class ZCTextIndexInfo:
-    '''Silly class used for storing information about a ZCTextIndex.'''
-    lexicon_id = "lexicon"
-    index_type = 'Okapi BM25 Rank'
-
 # ------------------------------------------------------------------------------
 class ZopeInstaller:
     '''This Zope installer runs every time Zope starts and encounters this
@@ -148,35 +144,6 @@ class ZopeInstaller:
             self.app.manage_delObjects(['standard_error_message'])
         manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
 
-    def installIndexes(self, indexInfo):
-        '''Updates indexes in the catalog.'''
-        catalog = self.app.catalog
-        logger = self.logger
-        for indexName, indexType in indexInfo.iteritems():
-            # If this index already exists but with a different type, remove it.
-            if indexName in catalog.indexes():
-                oldType = catalog.Indexes[indexName].__class__.__name__
-                if oldType != indexType:
-                    catalog.delIndex(indexName)
-                    logger.info('Existing index "%s" of type "%s" was removed:'\
-                                ' we need to recreate it with type "%s".' % \
-                                (indexName, oldType, indexType))
-            if indexName not in catalog.indexes():
-                # We need to create this index
-                if indexType != 'ZCTextIndex':
-                    catalog.addIndex(indexName, indexType)
-                else:
-                    catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
-                # Indexing database content based on this index.
-                catalog.reindexIndex(indexName, self.app.REQUEST)
-                logger.info('Created index "%s" of type "%s"...' % \
-                            (indexName, indexType))
-
-    lexiconInfos = [
-        appy.Object(group='Case Normalizer', name='Case Normalizer'),
-        appy.Object(group='Stop Words', name=" Don't remove stop words"),
-        appy.Object(group='Word Splitter', name='Whitespace splitter')
-    ]
     def installCatalog(self):
         '''Create the catalog at the root of Zope if id does not exist.'''
         if 'catalog' not in self.app.objectIds():
@@ -185,19 +152,30 @@ class ZopeInstaller:
             manage_addZCatalog(self.app, 'catalog', '')
             self.logger.info('Appy catalog created.')
 
-        # Create a lexicon for ZCTextIndexes
-        if 'lexicon' not in self.app.catalog.objectIds():
-            from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
-            manage_addLexicon(self.app.catalog, 'lexicon',
-                              elements=self.lexiconInfos)
+        # Create lexicons for ZCTextIndexes
+        catalog = self.app.catalog
+        lexicons = catalog.objectIds()
+        from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
+        if 'xhtml_lexicon' not in lexicons:
+            lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
+            manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
+        if 'text_lexicon' not in lexicons:
+            lex = appy.Object(group='Text indexer', name='Text indexer')
+            manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
+        if 'list_lexicon' not in lexicons:
+            lex = appy.Object(group='List indexer', name='List indexer')
+            manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
+
+        # Delete the deprecated one if it exists
+        if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
 
         # Create or update Appy-wide indexes and field-related indexes
-        indexInfo = gen.defaultIndexes.copy()
+        indexInfo = defaultIndexes.copy()
         tool = self.app.config
         for className in self.config.attributes.iterkeys():
             wrapperClass = tool.getAppyClass(className, wrapper=True)
             indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
-        self.installIndexes(indexInfo)
+        updateIndexes(self, indexInfo)
 
     def getAddPermission(self, className):
         '''What is the name of the permission allowing to create instances of
diff --git a/gen/utils.py b/gen/utils.py
index 0578106..54d87d8 100644
--- a/gen/utils.py
+++ b/gen/utils.py
@@ -1,5 +1,6 @@
 # ------------------------------------------------------------------------------
 import re, os, os.path
+from appy.shared.utils import normalizeText
 
 # Function for creating a Zope object ------------------------------------------
 def createObject(folder, id, className, appName, wf=True, noSecurity=False):
@@ -243,12 +244,12 @@ class SomeObjects:
 # ------------------------------------------------------------------------------
 class Keywords:
     '''This class allows to handle keywords that a user enters and that will be
-       used as basis for performing requests in a Zope ZCTextIndex.'''
+       used as basis for performing requests in a TextIndex/XhtmlIndex.'''
 
     toRemove = '?-+*()'
     def __init__(self, keywords, operator='AND'):
         # Clean the p_keywords that the user has entered.
-        words = keywords.strip()
+        words = normalizeText(keywords)
         if words == '*': words = ''
         for c in self.toRemove: words = words.replace(c, ' ')
         self.keywords = words.split()
@@ -267,7 +268,7 @@ class Keywords:
                     self.keywords.insert(0, word)
 
     def get(self):
-        '''Returns the keywords as needed by the ZCTextIndex.'''
+        '''Returns the keywords as needed by the TextIndex.'''
         if self.keywords:
             op = ' %s ' % self.operator
             return op.join(self.keywords)+'*'
diff --git a/gen/wrappers/__init__.py b/gen/wrappers/__init__.py
index 3763ee8..674b011 100644
--- a/gen/wrappers/__init__.py
+++ b/gen/wrappers/__init__.py
@@ -4,8 +4,8 @@
 # ------------------------------------------------------------------------------
 import os, os.path, mimetypes
 import appy.pod
-from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \
-                     defaultIndexes
+from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
+from appy.gen.indexer import defaultIndexes
 from appy.gen.utils import createObject
 from appy.shared.utils import getOsTempFolder, executeCommand, \
                               normalizeString, sequenceTypes
diff --git a/shared/utils.py b/shared/utils.py
index d9db99b..f727d2c 100644
--- a/shared/utils.py
+++ b/shared/utils.py
@@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
         res = s
     return res
 
+# ------------------------------------------------------------------------------
+def normalizeText(s):
+    '''Normalizes p_s for indexing purposes: removes special chars, strips
+       surrounding whitespace and converts the result to lowercase.'''
+    return normalizeString(s, usage='extractedText').strip().lower()
+
+# ------------------------------------------------------------------------------
 def formatNumber(n, sep=',', precision=2, tsep=' '):
     '''Returns a string representation of number p_n, which can be a float
        or integer. p_sep is the decimal separator to use. p_precision is the
diff --git a/shared/xml_parser.py b/shared/xml_parser.py
index 0386889..930802d 100644
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
       - remembering the currently parsed element;
       - managing namespace declarations.
       This parser also knows about HTML entities.'''
-    def __init__(self, env=None, caller=None):
+    def __init__(self, env=None, caller=None, raiseOnError=True):
         '''p_env should be an instance of a class that inherits from
            XmlEnvironment: it specifies the environment to use for this SAX
            parser.'''
@@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
         self.caller = caller # The class calling this parser
         self.parser = xml.sax.make_parser() # Fast, standard expat parser
         self.res = None # The result of parsing.
+        # Whether to raise an exception when a parsing error is encountered.
+        self.raiseOnError = raiseOnError
 
     # ContentHandler methods ---------------------------------------------------
     def startDocument(self):
@@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
             self.characters('?')
 
     # ErrorHandler methods ---------------------------------------------------
-    # Define methods below in your subclass if you want error handling that
-    # does not raise exceptions, but produces a partial result instead.
-    #def error(self, error): pass
-    #def fatalError(self, error): pass
-    #def warning(self, error): pass
+    def error(self, error):
+        if self.raiseOnError: raise error
+        else: print 'SAX error', error
+    def fatalError(self, error):
+        if self.raiseOnError: raise error
+        else: print 'SAX fatal error', error
+    def warning(self, error): pass
 
     def parse(self, xml, source='string'):
         '''Parses a XML stream.