[shared] xml_parser.XmlParser: added param 'raiseOnError', which determines whether an exception is raised when a SAX fatal parsing error is encountered; [gen] fine-tuned the indexing machinery for more accurate text extraction from text and xhtml fields.

This commit is contained in:
Gaetan Delannay 2012-09-26 23:13:02 +02:00
parent a2ae839704
commit 8d1a88bd27
7 changed files with 164 additions and 89 deletions

View file

@ -7,6 +7,7 @@ from appy.gen.layout import Table
from appy.gen.layout import defaultFieldLayouts from appy.gen.layout import defaultFieldLayouts
from appy.gen.po import PoMessage from appy.gen.po import PoMessage
from appy.gen.mail import sendNotification from appy.gen.mail import sendNotification
from appy.gen.indexer import defaultIndexes
from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
import appy.pod import appy.pod
from appy.pod.renderer import Renderer from appy.pod.renderer import Renderer
@ -33,13 +34,6 @@ def initMasterValue(v):
else: res = v else: res = v
return [str(v) for v in res] return [str(v) for v in res]
# Default Appy indexes ---------------------------------------------------------
defaultIndexes = {
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
'Allowed': 'KeywordIndex'}
# Descriptor classes used for refining descriptions of elements in types # Descriptor classes used for refining descriptions of elements in types
# (pages, groups,...) ---------------------------------------------------------- # (pages, groups,...) ----------------------------------------------------------
class Page: class Page:
@ -325,7 +319,7 @@ class Search:
if usage == 'search': return 'Title' if usage == 'search': return 'Title'
else: return 'SortableTitle' else: return 'SortableTitle'
# Indeed, for field 'title', Appy has a specific index # Indeed, for field 'title', Appy has a specific index
# 'SortableTitle', because index 'Title' is a ZCTextIndex # 'SortableTitle', because index 'Title' is a TextIndex
# (for searchability) and can't be used for sorting. # (for searchability) and can't be used for sorting.
elif fieldName == 'state': return 'State' elif fieldName == 'state': return 'State'
elif fieldName in defaultIndexes: return fieldName elif fieldName in defaultIndexes: return fieldName
@ -337,8 +331,8 @@ class Search:
value as required for searching in the index corresponding to value as required for searching in the index corresponding to
p_fieldName.''' p_fieldName.'''
if fieldName == 'title': if fieldName == 'title':
# Title is a ZCTextIndex. We must split p_fieldValue into keywords. # Title is a TextIndex. We must split p_fieldValue into keywords.
res = Keywords(fieldValue.decode('utf-8')).get() res = Keywords(fieldValue).get()
elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'): elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
v = fieldValue[:-1] v = fieldValue[:-1]
# Warning: 'z' is higher than 'Z'! # Warning: 'z' is higher than 'Z'!
@ -1436,10 +1430,14 @@ class String(Type):
def getIndexType(self): def getIndexType(self):
'''Index type varies depending on String parameters.''' '''Index type varies depending on String parameters.'''
# If String.isSelect, be it multivalued or not, we define a ZCTextIndex: # If String.isSelect, be it multivalued or not, we define a ListIndex:
# this way we can use AND/OR operator. # this way we can use AND/OR operator.
if self.isSelect or (self.format in (String.TEXT, String.XHTML)): if self.isSelect:
return 'ZCTextIndex' return 'ListIndex'
elif self.format == String.TEXT:
return 'TextIndex'
elif self.format == String.XHTML:
return 'XhtmlIndex'
return Type.getIndexType(self) return Type.getIndexType(self)
def getJs(self, layoutType, res): def getJs(self, layoutType, res):
@ -1918,7 +1916,7 @@ class Ref(Type):
def getFormattedValue(self, obj, value): def getFormattedValue(self, obj, value):
return value return value
def getIndexType(self): return 'ZCTextIndex' def getIndexType(self): return 'TextIndex'
def getIndexValue(self, obj, forSearch=False): def getIndexValue(self, obj, forSearch=False):
'''Value for indexing is the list of UIDs of linked objects. If '''Value for indexing is the list of UIDs of linked objects. If

View file

@ -2,9 +2,87 @@
indexed.''' indexed.'''
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
from Products.ZCTextIndex.PipelineFactory import element_factory
from appy.shared.xml_parser import XmlParser from appy.shared.xml_parser import XmlParser
from appy.shared.utils import normalizeString from appy.shared.utils import normalizeText
# Default Appy indexes ---------------------------------------------------------
# Maps the name of every index that Appy defines by default to its index type.
# 'TextIndex' and 'XhtmlIndex' are Appy-specific types: function updateIndexes
# creates them as ZCTextIndexes.
defaultIndexes = {
    'State': 'FieldIndex',
    'UID': 'FieldIndex',
    'Title': 'TextIndex',
    'SortableTitle': 'FieldIndex',
    'SearchableText': 'XhtmlIndex',
    'Creator': 'FieldIndex',
    'Created': 'DateIndex',
    'ClassName': 'FieldIndex',
    'Allowed': 'KeywordIndex',
}
# Stuff for creating or updating the indexes -----------------------------------
class TextIndexInfo:
    '''Settings given as "extra" parameter when creating the ZCTextIndex that
       implements an Appy "TextIndex".'''
    # Index type requested from ZCTextIndex
    index_type = 'Okapi BM25 Rank'
    # Id of the lexicon used by this kind of index
    lexicon_id = "text_lexicon"
class XhtmlIndexInfo:
    '''Settings given as "extra" parameter when creating the ZCTextIndex that
       implements an Appy "XhtmlIndex".'''
    # Index type requested from ZCTextIndex
    index_type = 'Okapi BM25 Rank'
    # Id of the lexicon used by this kind of index
    lexicon_id = "xhtml_lexicon"
class ListIndexInfo:
    '''Settings given as "extra" parameter when creating the ZCTextIndex that
       implements an Appy "ListIndex".'''
    # Index type requested from ZCTextIndex
    index_type = 'Okapi BM25 Rank'
    # Id of the lexicon used by this kind of index
    lexicon_id = "list_lexicon"
def updateIndexes(installer, indexInfo):
    '''Creates or updates, in the catalog, the indexes listed in p_indexInfo.

       p_installer gives access to the catalog (installer.app.catalog), to
       the request (installer.app.REQUEST) and to a logger
       (installer.logger). p_indexInfo is a dict of the form
       ~{s_indexName: s_indexType}~. Appy index types "TextIndex",
       "XhtmlIndex" and "ListIndex" are all created as ZCTextIndexes: they
       only differ by the lexicon they use.

       An existing index is deleted and re-created when its class does not
       correspond to the wanted type or, for a ZCTextIndex, when it uses the
       deprecated lexicon "lexicon" or a lexicon that is not the one
       required by the wanted type. Every index that is (re-)created is then
       reindexed.'''
    catalog = installer.app.catalog
    logger = installer.logger
    # Parameters to use for creating every ZCTextIndex-based index type
    zcTextInfos = {'TextIndex': TextIndexInfo, 'XhtmlIndex': XhtmlIndexInfo,
                   'ListIndex': ListIndexInfo}
    # "items" is used instead of "iteritems": it exists in Python 2 and 3.
    for indexName, indexType in indexInfo.items():
        extra = zcTextInfos.get(indexType)
        if extra is not None:
            indexRealType = 'ZCTextIndex'
        else:
            indexRealType = indexType
        # If this index already exists but with a different type (or with a
        # deprecated or wrong lexicon), remove it.
        if indexName in catalog.indexes():
            indexObject = catalog.Indexes[indexName]
            oldType = indexObject.__class__.__name__
            info = None # Will be set if the index must be replaced
            if oldType != indexRealType:
                info = indexRealType
            elif oldType == 'ZCTextIndex':
                lexiconId = indexObject.lexicon_id
                # Without the second check, an index switching from one
                # ZCTextIndex-based type to another (ie, TextIndex >
                # XhtmlIndex) would silently keep its old lexicon.
                if (lexiconId == 'lexicon') or \
                   ((extra is not None) and (lexiconId != extra.lexicon_id)):
                    info = '%s (%s)' % (oldType, indexType)
            if info is not None:
                catalog.delIndex(indexName)
                logger.info('Index %s (%s) to replace as %s.' % \
                            (indexName, oldType, info))
        if indexName not in catalog.indexes():
            # We need to (re-)create this index.
            if extra is not None:
                catalog.addIndex(indexName, indexRealType, extra=extra)
            else:
                catalog.addIndex(indexName, indexType)
            # Indexing database content based on this index.
            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
            catalog.reindexIndex(indexName, installer.app.REQUEST)
            logger.info('Done.')
# ------------------------------------------------------------------------------
def splitIntoWords(text):
    '''Splits the cleaned index value p_text into words and returns them as
       a set (so duplicate words are removed). p_text is split on single
       space chars. Words of a single char are ignored, excepted digits
       which are always kept; empty tokens, produced by consecutive spaces,
       are ignored as well.'''
    # A single pass keeps tokens of at least 2 chars, and digits. This avoids
    # repeatedly deleting elements from within a list, which is quadratic.
    return set(word for word in text.split(' ') \
               if (len(word) > 1) or word.isdigit())
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class XhtmlTextExtractor(XmlParser): class XhtmlTextExtractor(XmlParser):
@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
return XmlParser.endDocument(self) return XmlParser.endDocument(self)
def characters(self, content): def characters(self, content):
c = normalizeString(content, usage='extractedText').strip().lower() c = normalizeText(content)
if len(c) > 1: self.res.append(c) if len(c) > 1: self.res.append(c)
return self.env
# Do not raise exceptions when errors occur.
def error(self, error): pass
def fatalError(self, error): pass
def warning(self, error): pass
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class XhtmlIndexer: class XhtmlIndexer:
'''Extracts, from XHTML field values, the text to index.''' '''Extracts, from XHTML field values, the text to index.'''
def process(self, text): def process(self, texts):
# Wrap the XHTML chunk into a root tag, to get valid XML. res = set()
text = '<p>%s</p>' % text[0] for text in texts:
parser = XhtmlTextExtractor() extractor = XhtmlTextExtractor(raiseOnError=False)
text = parser.parse(text) cleanText = extractor.parse('<p>%s</p>' % text)
res = text.split(' ') res = res.union(splitIntoWords(cleanText))
# Remove tokens of a single char. return list(res)
i = len(res)-1
while i > -1 :
if (len(res[i]) < 2) and not res[i].isdigit():
del res[i]
i -= 1
return res
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer) class TextIndexer:
'''Extracts, from text field values, a normalized value to index.'''
def process(self, texts):
res = set()
for text in texts:
cleanText = normalizeText(text)
res = res.union(splitIntoWords(cleanText))
return list(res)
class ListIndexer:
    '''Lexicon element that performs no transformation at all: the values of
       a list must be indexed as is.'''
    def process(self, texts):
        '''Returns p_texts, untouched.'''
        return texts
# ------------------------------------------------------------------------------
# Register the Appy indexers as ZCTextIndex pipeline elements. For every
# indexer, the same string is used as group and as name.
try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
except ImportError:
    # May occur at generation time. Only the import is protected: an
    # ImportError raised while registering the indexers is not swallowed.
    pass
else:
    ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
    ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
    ef.registerFactory('List indexer', 'List indexer', ListIndexer)
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------

View file

@ -8,6 +8,7 @@ import appy.version
import appy.gen as gen import appy.gen as gen
from appy.gen.po import PoParser from appy.gen.po import PoParser
from appy.gen.utils import updateRolesForPermission, createObject from appy.gen.utils import updateRolesForPermission, createObject
from appy.gen.indexer import defaultIndexes, updateIndexes
from appy.gen.migrator import Migrator from appy.gen.migrator import Migrator
from appy.shared.data import languages from appy.shared.data import languages
@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
resp.write('<center>For security reasons, your session has ' \ resp.write('<center>For security reasons, your session has ' \
'expired.</center>') 'expired.</center>')
class ZCTextIndexInfo:
'''Silly class used for storing information about a ZCTextIndex.'''
lexicon_id = "lexicon"
index_type = 'Okapi BM25 Rank'
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class ZopeInstaller: class ZopeInstaller:
'''This Zope installer runs every time Zope starts and encounters this '''This Zope installer runs every time Zope starts and encounters this
@ -148,35 +144,6 @@ class ZopeInstaller:
self.app.manage_delObjects(['standard_error_message']) self.app.manage_delObjects(['standard_error_message'])
manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage) manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
def installIndexes(self, indexInfo):
'''Updates indexes in the catalog.'''
catalog = self.app.catalog
logger = self.logger
for indexName, indexType in indexInfo.iteritems():
# If this index already exists but with a different type, remove it.
if indexName in catalog.indexes():
oldType = catalog.Indexes[indexName].__class__.__name__
if oldType != indexType:
catalog.delIndex(indexName)
logger.info('Existing index "%s" of type "%s" was removed:'\
' we need to recreate it with type "%s".' % \
(indexName, oldType, indexType))
if indexName not in catalog.indexes():
# We need to create this index
if indexType != 'ZCTextIndex':
catalog.addIndex(indexName, indexType)
else:
catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
# Indexing database content based on this index.
catalog.reindexIndex(indexName, self.app.REQUEST)
logger.info('Created index "%s" of type "%s"...' % \
(indexName, indexType))
lexiconInfos = [
appy.Object(group='Case Normalizer', name='Case Normalizer'),
appy.Object(group='Stop Words', name=" Don't remove stop words"),
appy.Object(group='Word Splitter', name='Whitespace splitter')
]
def installCatalog(self): def installCatalog(self):
'''Create the catalog at the root of Zope if id does not exist.''' '''Create the catalog at the root of Zope if id does not exist.'''
if 'catalog' not in self.app.objectIds(): if 'catalog' not in self.app.objectIds():
@ -185,19 +152,30 @@ class ZopeInstaller:
manage_addZCatalog(self.app, 'catalog', '') manage_addZCatalog(self.app, 'catalog', '')
self.logger.info('Appy catalog created.') self.logger.info('Appy catalog created.')
# Create a lexicon for ZCTextIndexes # Create lexicons for ZCTextIndexes
if 'lexicon' not in self.app.catalog.objectIds(): catalog = self.app.catalog
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon lexicons = catalog.objectIds()
manage_addLexicon(self.app.catalog, 'lexicon', from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
elements=self.lexiconInfos) if 'xhtml_lexicon' not in lexicons:
lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
if 'text_lexicon' not in lexicons:
lex = appy.Object(group='Text indexer', name='Text indexer')
manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
if 'list_lexicon' not in lexicons:
lex = appy.Object(group='List indexer', name='List indexer')
manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
# Delete the deprecated one if it exists
if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
# Create or update Appy-wide indexes and field-related indexes # Create or update Appy-wide indexes and field-related indexes
indexInfo = gen.defaultIndexes.copy() indexInfo = defaultIndexes.copy()
tool = self.app.config tool = self.app.config
for className in self.config.attributes.iterkeys(): for className in self.config.attributes.iterkeys():
wrapperClass = tool.getAppyClass(className, wrapper=True) wrapperClass = tool.getAppyClass(className, wrapper=True)
indexInfo.update(wrapperClass.getIndexes(includeDefaults=False)) indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
self.installIndexes(indexInfo) updateIndexes(self, indexInfo)
def getAddPermission(self, className): def getAddPermission(self, className):
'''What is the name of the permission allowing to create instances of '''What is the name of the permission allowing to create instances of

View file

@ -1,5 +1,6 @@
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
import re, os, os.path import re, os, os.path
from appy.shared.utils import normalizeText
# Function for creating a Zope object ------------------------------------------ # Function for creating a Zope object ------------------------------------------
def createObject(folder, id, className, appName, wf=True, noSecurity=False): def createObject(folder, id, className, appName, wf=True, noSecurity=False):
@ -243,12 +244,12 @@ class SomeObjects:
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class Keywords: class Keywords:
'''This class allows to handle keywords that a user enters and that will be '''This class allows to handle keywords that a user enters and that will be
used as basis for performing requests in a Zope ZCTextIndex.''' used as basis for performing requests in a TextIndex/XhtmlIndex.'''
toRemove = '?-+*()' toRemove = '?-+*()'
def __init__(self, keywords, operator='AND'): def __init__(self, keywords, operator='AND'):
# Clean the p_keywords that the user has entered. # Clean the p_keywords that the user has entered.
words = keywords.strip() words = normalizeText(keywords)
if words == '*': words = '' if words == '*': words = ''
for c in self.toRemove: words = words.replace(c, ' ') for c in self.toRemove: words = words.replace(c, ' ')
self.keywords = words.split() self.keywords = words.split()
@ -267,7 +268,7 @@ class Keywords:
self.keywords.insert(0, word) self.keywords.insert(0, word)
def get(self): def get(self):
'''Returns the keywords as needed by the ZCTextIndex.''' '''Returns the keywords as needed by the TextIndex.'''
if self.keywords: if self.keywords:
op = ' %s ' % self.operator op = ' %s ' % self.operator
return op.join(self.keywords)+'*' return op.join(self.keywords)+'*'

View file

@ -4,8 +4,8 @@
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
import os, os.path, mimetypes import os, os.path, mimetypes
import appy.pod import appy.pod
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \ from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
defaultIndexes from appy.gen.indexer import defaultIndexes
from appy.gen.utils import createObject from appy.gen.utils import createObject
from appy.shared.utils import getOsTempFolder, executeCommand, \ from appy.shared.utils import getOsTempFolder, executeCommand, \
normalizeString, sequenceTypes normalizeString, sequenceTypes

View file

@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
res = s res = s
return res return res
# ------------------------------------------------------------------------------
def normalizeText(s):
    '''Returns p_s in the form used for indexing purposes: p_s is normalized
       via normalizeString (usage "extractedText"); the result is then
       stripped and lowerized.'''
    extracted = normalizeString(s, usage='extractedText')
    stripped = extracted.strip()
    return stripped.lower()
# ------------------------------------------------------------------------------
def formatNumber(n, sep=',', precision=2, tsep=' '): def formatNumber(n, sep=',', precision=2, tsep=' '):
'''Returns a string representation of number p_n, which can be a float '''Returns a string representation of number p_n, which can be a float
or integer. p_sep is the decimal separator to use. p_precision is the or integer. p_sep is the decimal separator to use. p_precision is the

View file

@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
- remembering the currently parsed element; - remembering the currently parsed element;
- managing namespace declarations. - managing namespace declarations.
This parser also knows about HTML entities.''' This parser also knows about HTML entities.'''
def __init__(self, env=None, caller=None): def __init__(self, env=None, caller=None, raiseOnError=True):
'''p_env should be an instance of a class that inherits from '''p_env should be an instance of a class that inherits from
XmlEnvironment: it specifies the environment to use for this SAX XmlEnvironment: it specifies the environment to use for this SAX
parser.''' parser.'''
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
self.caller = caller # The class calling this parser self.caller = caller # The class calling this parser
self.parser = xml.sax.make_parser() # Fast, standard expat parser self.parser = xml.sax.make_parser() # Fast, standard expat parser
self.res = None # The result of parsing. self.res = None # The result of parsing.
# Raise or not an error when a parsing error is encountered.
self.raiseOnError = raiseOnError
# ContentHandler methods --------------------------------------------------- # ContentHandler methods ---------------------------------------------------
def startDocument(self): def startDocument(self):
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
self.characters('?') self.characters('?')
# ErrorHandler methods --------------------------------------------------- # ErrorHandler methods ---------------------------------------------------
# Define methods below in your subclass if you want error handling that def error(self, error):
# does not raise exceptions, but produces a partial result instead. if self.raiseOnError: raise error
#def error(self, error): pass else: print 'SAX error', error
#def fatalError(self, error): pass def fatalError(self, error):
#def warning(self, error): pass if self.raiseOnError: raise error
else: print 'SAX fatal error', error
def warning(self, error): pass
def parse(self, xml, source='string'): def parse(self, xml, source='string'):
'''Parses a XML stream. '''Parses a XML stream.