appypod-rattail/gen/indexer.py

'''This file defines code for extracting, from field values, the text to be
   indexed.'''

# ------------------------------------------------------------------------------
from appy.shared.xml_parser import XmlParser
from appy.shared.utils import normalizeText

# Default Appy indexes ---------------------------------------------------------
# Maps the name of every index Appy defines by default to its index type.
defaultIndexes = {
    'State': 'ListIndex',
    'UID': 'FieldIndex',
    'Title': 'TextIndex',
    'SortableTitle': 'FieldIndex',
    'SearchableText': 'TextIndex',
    'Creator': 'FieldIndex',
    'Created': 'DateIndex',
    'Modified': 'DateIndex',
    'ClassName': 'FieldIndex',
    'Allowed': 'KeywordIndex',
}

# Stuff for creating or updating the indexes -----------------------------------
class TextIndexInfo:
    '''Holds the "extra" parameters given to catalog.addIndex when the
       ZCTextIndex backing an Appy "TextIndex" is created.'''
    index_type = 'Okapi BM25 Rank'
    lexicon_id = 'text_lexicon'

class XhtmlIndexInfo:
    '''Holds the "extra" parameters given to catalog.addIndex when the
       ZCTextIndex backing an Appy "XhtmlIndex" is created.'''
    index_type = 'Okapi BM25 Rank'
    lexicon_id = 'xhtml_lexicon'

class ListIndexInfo:
    '''Holds the "extra" parameters given to catalog.addIndex when the
       ZCTextIndex backing an Appy "ListIndex" is created.'''
    index_type = 'Okapi BM25 Rank'
    lexicon_id = 'list_lexicon'

def updateIndexes(installer, indexInfo):
    '''Brings the indexes of the catalog in line with p_indexInfo, a dict
       whose keys are index names and whose values are index types.

       The catalog is read from p_installer.app.catalog and messages are
       logged via p_installer.logger. An existing index is deleted when its
       class name differs from the wanted type, or when it is a ZCTextIndex
       whose lexicon_id is "lexicon". Every index that is missing afterwards
       is created, then reindexed with p_installer.app.REQUEST.'''
    catalog = installer.app.catalog
    logger = installer.logger
    # Appy index types that are all implemented by a ZCTextIndex, each one
    # being created with its own "extra" parameters.
    zcExtras = {'TextIndex': TextIndexInfo, 'XhtmlIndex': XhtmlIndexInfo,
                'ListIndex': ListIndexInfo}
    for indexName, indexType in indexInfo.iteritems():
        if indexType in zcExtras:
            indexRealType = 'ZCTextIndex'
        else:
            indexRealType = indexType
        # Drop the index if it exists with another type or with the
        # deprecated lexicon.
        if indexName in catalog.indexes():
            existing = catalog.Indexes[indexName]
            oldType = existing.__class__.__name__
            if oldType != indexRealType:
                outdated = True
                target = indexRealType
            elif (oldType == 'ZCTextIndex') and \
                 (existing.lexicon_id == 'lexicon'):
                outdated = True
                target = '%s (%s)' % (oldType, indexType)
            else:
                outdated = False
            if outdated:
                catalog.delIndex(indexName)
                logger.info('Index %s (%s) to replace as %s.' % \
                            (indexName, oldType, target))
        # The index may be absent because it never existed or because it has
        # just been deleted: in both cases, create it and fill it.
        if indexName not in catalog.indexes():
            if indexType in zcExtras:
                catalog.addIndex(indexName, indexRealType,
                                 extra=zcExtras[indexType])
            else:
                catalog.addIndex(indexName, indexType)
            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
            catalog.reindexIndex(indexName, installer.app.REQUEST)
            logger.info('Done.')

# ------------------------------------------------------------------------------
def splitIntoWords(text):
    '''Splits p_text (a cleaned index value) on single spaces and returns the
       resulting words as a set, which removes duplicates. Tokens of less than
       2 chars (empty tokens included) are discarded, unless the token is a
       digit.'''
    return set(word for word in text.split(' ')
               if (len(word) > 1) or word.isdigit())

# ------------------------------------------------------------------------------
class XhtmlTextExtractor(XmlParser):
    '''Extracts text from XHTML: every chunk of character data is normalized,
       collected in self.res and, at the end of the document, self.res is
       turned into a single space-separated string.'''
    def startDocument(self):
        XmlParser.startDocument(self)
        # Normalized text chunks, in the order they are encountered
        self.res = []

    def endDocument(self):
        # From now on, self.res is a string and not a list anymore
        self.res = ' '.join(self.res)
        return XmlParser.endDocument(self)

    def characters(self, content):
        '''Stores the normalized version of p_content. Normalized chunks of
           less than 2 chars are ignored, excepted digits, which are kept:
           this is the rule function splitIntoWords applies to words, so that
           a lone digit (ie, "2" in "<b>2</b>") remains searchable.'''
        # NOTE(review): assumes normalizeText returns a string -- confirm
        c = normalizeText(content)
        if (len(c) > 1) or c.isdigit():
            self.res.append(c)

# ------------------------------------------------------------------------------
class XhtmlIndexer:
    '''Produces, from XHTML field values, the list of words to index.'''
    def process(self, texts):
        '''Returns, as a list without duplicates, the words found in the text
           extracted from every XHTML chunk in p_texts.'''
        words = set()
        for text in texts:
            # Wrap the chunk in a single root tag before parsing it; parsing
            # errors do not raise (raiseOnError=False).
            parser = XhtmlTextExtractor(raiseOnError=False)
            extracted = parser.parse('<p>%s</p>' % text)
            words = words | splitIntoWords(extracted)
        return list(words)

# ------------------------------------------------------------------------------
class TextIndexer:
    '''Produces, from text field values, the list of normalized words to
       index.'''
    def process(self, texts):
        '''Returns, as a list without duplicates, the words found in the
           normalized version of every text in p_texts.'''
        words = set()
        for text in texts:
            words = words | splitIntoWords(normalizeText(text))
        return list(words)

class ListIndexer:
    '''Pass-through indexer: a list of values must be indexed as is.'''
    def process(self, texts):
        '''Returns p_texts itself, untouched.'''
        return texts

# ------------------------------------------------------------------------------
# Make the 3 indexer classes defined in this file known to ZCTextIndex, when
# this package can be imported.
try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    # Every class is registered with the same string as first and second
    # argument. NOTE(review): presumably the group and the name under which
    # the lexicons retrieve the pipeline element -- confirm against the
    # ZCTextIndex API.
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
    # May occur at generation time. In that case, the indexers are simply not
    # registered.
    pass
# ------------------------------------------------------------------------------
[gen] Added the possibility to extract in a clean way text from XHTML field values. 2012-09-25 14:43:45 -05:00			`'''This file defines code for extracting, from field values, the text to be`
			`indexed.'''`

			`# ------------------------------------------------------------------------------`
			`from appy.shared.xml_parser import XmlParser`
[shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. 2012-09-26 16:13:02 -05:00			`from appy.shared.utils import normalizeText`

			`# Default Appy indexes ---------------------------------------------------------`
			`defaultIndexes = {`
[gen] Added param Search.default allowing to define a default Search. The default search, if present, will be triggered when clicking on the main link for a class, instead of the query that collects all instances of this class; appy.gen.Type: removed 3 obsolete params: 'index', 'editDefault' and 'optional'. To achieve the same result as using 'editDefault', one may define 'by hand' an attribute on the Tool for storing the editable default value, and define, on the appropriate field in param 'default', a method that returns the value of the tool attribute; Added Type.defaultForSearch, allowing, for some sub-types, to define a default value when displaying the corresponding widget on the search screen; added a default 'state' field allowing to include workflow state among search criteria in the search screens; removed obsolete test applications. 2012-10-31 07:20:25 -05:00			`'State': 'ListIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex',`
[gen] Bugfixes in the management of indexes. 2012-10-05 09:38:15 -05:00			`'SortableTitle': 'FieldIndex', 'SearchableText': 'TextIndex',`
[gen] Added an index 'Modified' on every object (it represents the date of the last modification); removed obsolete workflow-related code; removed attributes Tool.showAllStatesInPhaseFor...; changed the way to display the object's current state in the UI. 2012-11-05 03:21:27 -06:00			`'Creator': 'FieldIndex', 'Created': 'DateIndex', 'Modified': 'DateIndex',`
			`'ClassName': 'FieldIndex', 'Allowed': 'KeywordIndex'}`
[shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. 2012-09-26 16:13:02 -05:00
			`# Stuff for creating or updating the indexes -----------------------------------`
			`class TextIndexInfo:`
			`'''Parameters for a text ZCTextIndex.'''`
			`lexicon_id = "text_lexicon"`
			`index_type = 'Okapi BM25 Rank'`

			`class XhtmlIndexInfo:`
			`'''Parameters for a html ZCTextIndex.'''`
			`lexicon_id = "xhtml_lexicon"`
			`index_type = 'Okapi BM25 Rank'`

			`class ListIndexInfo:`
			`'''Parameters for a list ZCTextIndex.'''`
			`lexicon_id = "list_lexicon"`
			`index_type = 'Okapi BM25 Rank'`

			`def updateIndexes(installer, indexInfo):`
			`'''This function updates the indexes defined in the catalog.'''`
			`catalog = installer.app.catalog`
			`logger = installer.logger`
			`for indexName, indexType in indexInfo.iteritems():`
			`indexRealType = indexType`
			`if indexType in ('XhtmlIndex', 'TextIndex', 'ListIndex'):`
			`indexRealType = 'ZCTextIndex'`
			`# If this index already exists but with a different type (or with a`
			`# deprecated lexicon), remove it.`
			`if indexName in catalog.indexes():`
			`indexObject = catalog.Indexes[indexName]`
			`oldType = indexObject.__class__.__name__`
			`toDelete = False`
			`if (oldType != indexRealType):`
			`toDelete = True`
			`info = indexRealType`
			`elif (oldType == 'ZCTextIndex') and \`
			`(indexObject.lexicon_id == 'lexicon'):`
			`toDelete = True`
			`info = '%s (%s)' % (oldType, indexType)`
			`if toDelete:`
			`catalog.delIndex(indexName)`
			`logger.info('Index %s (%s) to replace as %s.' % \`
			`(indexName, oldType, info))`
			`if indexName not in catalog.indexes():`
			`# We need to (re-)create this index.`
			`if indexType == 'TextIndex':`
			`catalog.addIndex(indexName, indexRealType, extra=TextIndexInfo)`
			`elif indexType == 'XhtmlIndex':`
			`catalog.addIndex(indexName, indexRealType, extra=XhtmlIndexInfo)`
			`elif indexType == 'ListIndex':`
			`catalog.addIndex(indexName, indexRealType, extra=ListIndexInfo)`
			`else:`
			`catalog.addIndex(indexName, indexType)`
			`# Indexing database content based on this index.`
			`logger.info('Reindexing %s (%s)...' % (indexName, indexType))`
			`catalog.reindexIndex(indexName, installer.app.REQUEST)`
			`logger.info('Done.')`

			`# ------------------------------------------------------------------------------`
			`def splitIntoWords(text):`
			`'''Split the cleaned index value p_text into words (returns a list of`
			`words). Words of a single char are ignored, excepted digits which are`
			`always kept. Duplicate words are removed (result is a set and not a`
			`list).'''`
			`res = text.split(' ')`
			`# Remove tokens of a single char (excepted if this char is a digit).`
			`i = len(res)-1`
			`while i > -1 :`
			`if (len(res[i]) < 2) and not res[i].isdigit():`
			`del res[i]`
			`i -= 1`
			`# Remove duplicates`
			`return set(res)`
[gen] Added the possibility to extract in a clean way text from XHTML field values. 2012-09-25 14:43:45 -05:00
			`# ------------------------------------------------------------------------------`
			`class XhtmlTextExtractor(XmlParser):`
			`'''Extracts text from XHTML.'''`
			`def startDocument(self):`
			`XmlParser.startDocument(self)`
			`self.res = []`

			`def endDocument(self):`
			`self.res = ' '.join(self.res)`
			`return XmlParser.endDocument(self)`

			`def characters(self, content):`
[shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. 2012-09-26 16:13:02 -05:00			`c = normalizeText(content)`
[gen] Added the possibility to extract in a clean way text from XHTML field values. 2012-09-25 14:43:45 -05:00			`if len(c) > 1: self.res.append(c)`

			`# ------------------------------------------------------------------------------`
			`class XhtmlIndexer:`
			`'''Extracts, from XHTML field values, the text to index.'''`
[shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. 2012-09-26 16:13:02 -05:00			`def process(self, texts):`
			`res = set()`
			`for text in texts:`
			`extractor = XhtmlTextExtractor(raiseOnError=False)`
			`cleanText = extractor.parse('<p>%s</p>' % text)`
			`res = res.union(splitIntoWords(cleanText))`
			`return list(res)`

			`# ------------------------------------------------------------------------------`
			`class TextIndexer:`
			`'''Extracts, from text field values, a normalized value to index.'''`
			`def process(self, texts):`
			`res = set()`
			`for text in texts:`
			`cleanText = normalizeText(text)`
			`res = res.union(splitIntoWords(cleanText))`
			`return list(res)`

			`class ListIndexer:`
			`'''This lexicon does nothing: list of values must be indexed as is.'''`
			`def process(self, texts): return texts`
[gen] Added the possibility to extract in a clean way text from XHTML field values. 2012-09-25 14:43:45 -05:00
			`# ------------------------------------------------------------------------------`
[shared] xml_parser.XmlParser: added param 'raiseOnError' allowing to raise or not an exception when a SAX fatal parsing error is encountered; [gen] fine-tuned indexing machinery with more accurate text extraction from text and xhtml fields. 2012-09-26 16:13:02 -05:00			`try:`
			`from Products.ZCTextIndex.PipelineFactory import element_factory as ef`
			`ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)`
			`ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)`
			`ef.registerFactory('List indexer', 'List indexer', ListIndexer)`
			`except ImportError:`
			`# May occur at generation time.`
			`pass`
[gen] Added the possibility to extract in a clean way text from XHTML field values. 2012-09-25 14:43:45 -05:00			`# ------------------------------------------------------------------------------`