[shared] xml_parser.XmlParser: added param 'raiseOnError', which controls whether an exception is raised when a SAX parsing error (fatal or not) is encountered; [gen] fine-tuned the indexing machinery, with more accurate text extraction from text and xhtml fields.

This commit is contained in:
Gaetan Delannay 2012-09-26 23:13:02 +02:00
parent a2ae839704
commit 8d1a88bd27
7 changed files with 164 additions and 89 deletions

View file

@ -7,6 +7,7 @@ from appy.gen.layout import Table
from appy.gen.layout import defaultFieldLayouts
from appy.gen.po import PoMessage
from appy.gen.mail import sendNotification
from appy.gen.indexer import defaultIndexes
from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
import appy.pod
from appy.pod.renderer import Renderer
@ -33,13 +34,6 @@ def initMasterValue(v):
else: res = v
return [str(v) for v in res]
# Default Appy indexes ---------------------------------------------------------
defaultIndexes = {
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
'Allowed': 'KeywordIndex'}
# Descriptor classes used for refining descriptions of elements in types
# (pages, groups,...) ----------------------------------------------------------
class Page:
@ -325,7 +319,7 @@ class Search:
if usage == 'search': return 'Title'
else: return 'SortableTitle'
# Indeed, for field 'title', Appy has a specific index
# 'SortableTitle', because index 'Title' is a ZCTextIndex
# 'SortableTitle', because index 'Title' is a TextIndex
# (for searchability) and can't be used for sorting.
elif fieldName == 'state': return 'State'
elif fieldName in defaultIndexes: return fieldName
@ -337,8 +331,8 @@ class Search:
value as required for searching in the index corresponding to
p_fieldName.'''
if fieldName == 'title':
# Title is a ZCTextIndex. We must split p_fieldValue into keywords.
res = Keywords(fieldValue.decode('utf-8')).get()
# Title is a TextIndex. We must split p_fieldValue into keywords.
res = Keywords(fieldValue).get()
elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
v = fieldValue[:-1]
# Warning: 'z' is higher than 'Z'!
@ -1436,10 +1430,14 @@ class String(Type):
def getIndexType(self):
'''Index type varies depending on String parameters.'''
# If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
# If String.isSelect, be it multivalued or not, we define a ListIndex:
# this way we can use AND/OR operator.
if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
return 'ZCTextIndex'
if self.isSelect:
return 'ListIndex'
elif self.format == String.TEXT:
return 'TextIndex'
elif self.format == String.XHTML:
return 'XhtmlIndex'
return Type.getIndexType(self)
def getJs(self, layoutType, res):
@ -1918,7 +1916,7 @@ class Ref(Type):
def getFormattedValue(self, obj, value):
return value
def getIndexType(self): return 'ZCTextIndex'
def getIndexType(self): return 'TextIndex'
def getIndexValue(self, obj, forSearch=False):
'''Value for indexing is the list of UIDs of linked objects. If

View file

@ -2,9 +2,87 @@
indexed.'''
# ------------------------------------------------------------------------------
from Products.ZCTextIndex.PipelineFactory import element_factory
from appy.shared.xml_parser import XmlParser
from appy.shared.utils import normalizeString
from appy.shared.utils import normalizeText
# Default Appy indexes ---------------------------------------------------------
# Maps the name of every index that Appy defines by default to its index type.
# 'TextIndex' and 'XhtmlIndex' are Appy-specific type names: function
# updateIndexes creates such indexes as ZCTextIndex instances, each one being
# tied to its own lexicon.
defaultIndexes = {
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex',
'SortableTitle': 'FieldIndex', 'SearchableText': 'XhtmlIndex',
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
'Allowed': 'KeywordIndex'}
# Stuff for creating or updating the indexes -----------------------------------
class TextIndexInfo:
    '''Parameters for a ZCTextIndex that indexes plain text (Appy index type
       "TextIndex").'''
    # Id of the lexicon, within the catalog, that this kind of index uses
    lexicon_id = "text_lexicon"
    # Ranking algorithm used by the ZCTextIndex
    index_type = 'Okapi BM25 Rank'

class XhtmlIndexInfo:
    '''Parameters for a ZCTextIndex that indexes XHTML content (Appy index
       type "XhtmlIndex").'''
    lexicon_id = "xhtml_lexicon"
    index_type = 'Okapi BM25 Rank'

class ListIndexInfo:
    '''Parameters for a ZCTextIndex that indexes lists of values (Appy index
       type "ListIndex").'''
    lexicon_id = "list_lexicon"
    index_type = 'Okapi BM25 Rank'

def updateIndexes(installer, indexInfo):
    '''Creates or updates, in the catalog, the indexes described in
       p_indexInfo, a dict ~{s_indexName: s_indexType}~.

       p_installer must have attributes "app" (giving access to
       "app.catalog" and "app.REQUEST") and "logger".

       An existing index is deleted and re-created when its class differs from
       the required one or, for a ZCTextIndex, when it is tied to the
       deprecated lexicon "lexicon" or to a lexicon that is not the one
       required by its Appy index type. Every index that is (re-)created is
       then reindexed. Returns None.'''
    # Appy-specific index types. All of them are implemented by a ZCTextIndex;
    # they only differ by the lexicon they use.
    zcTextInfos = {'TextIndex': TextIndexInfo, 'XhtmlIndex': XhtmlIndexInfo,
                   'ListIndex': ListIndexInfo}
    catalog = installer.app.catalog
    logger = installer.logger
    for indexName, indexType in indexInfo.items():
        extra = zcTextInfos.get(indexType)
        if extra is not None:
            indexRealType = 'ZCTextIndex'
        else:
            indexRealType = indexType
        # If this index already exists but with a different type (or with a
        # deprecated or wrong lexicon), remove it.
        if indexName in catalog.indexes():
            indexObject = catalog.Indexes[indexName]
            oldType = indexObject.__class__.__name__
            info = None # Will be filled if the index must be replaced
            if oldType != indexRealType:
                info = indexRealType
            elif oldType == 'ZCTextIndex':
                lexiconId = getattr(indexObject, 'lexicon_id', None)
                deprecated = lexiconId == 'lexicon'
                # The index may have been created for another Appy index type
                # (ie, a field whose format has changed from text to XHTML).
                wrong = (extra is not None) and (lexiconId != extra.lexicon_id)
                if deprecated or wrong:
                    info = '%s (%s)' % (oldType, indexType)
            if info is not None:
                catalog.delIndex(indexName)
                logger.info('Index %s (%s) to replace as %s.' % \
                            (indexName, oldType, info))
        if indexName not in catalog.indexes():
            # We need to (re-)create this index.
            if extra is not None:
                catalog.addIndex(indexName, indexRealType, extra=extra)
            else:
                catalog.addIndex(indexName, indexType)
            # Indexing database content based on this index.
            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
            catalog.reindexIndex(indexName, installer.app.REQUEST)
            logger.info('Done.')
# ------------------------------------------------------------------------------
def splitIntoWords(text):
    '''Splits the cleaned index value p_text into words, using the space as
       separator, and returns them as a set (duplicate words are thus
       removed). Tokens of less than 2 chars are ignored, excepted
       single-digit tokens, which are always kept.'''
    # A single filtering pass: empty tokens (produced by consecutive spaces)
    # have a length of 0 and are not digits, so they are dropped as well.
    return set(word for word in text.split(' ') \
               if (len(word) > 1) or word.isdigit())
# ------------------------------------------------------------------------------
class XhtmlTextExtractor(XmlParser):
@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
return XmlParser.endDocument(self)
def characters(self, content):
c = normalizeString(content, usage='extractedText').strip().lower()
c = normalizeText(content)
if len(c) > 1: self.res.append(c)
return self.env
# Do not raise exceptions when errors occur.
def error(self, error): pass
def fatalError(self, error): pass
def warning(self, error): pass
# ------------------------------------------------------------------------------
class XhtmlIndexer:
'''Extracts, from XHTML field values, the text to index.'''
def process(self, text):
# Wrap the XHTML chunk into a root tag, to get valid XML.
text = '<p>%s</p>' % text[0]
parser = XhtmlTextExtractor()
text = parser.parse(text)
res = text.split(' ')
# Remove tokens of a single char.
i = len(res)-1
while i > -1 :
if (len(res[i]) < 2) and not res[i].isdigit():
del res[i]
i -= 1
return res
def process(self, texts):
    '''Returns, as a list without duplicates, the words extracted from
       every XHTML chunk in p_texts.'''
    words = set()
    for chunk in texts:
        # Wrap the chunk into a root tag before parsing it. The extractor
        # is created with raiseOnError=False.
        parser = XhtmlTextExtractor(raiseOnError=False)
        extracted = parser.parse('<p>%s</p>' % chunk)
        words = words | splitIntoWords(extracted)
    return list(words)
# ------------------------------------------------------------------------------
element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
class TextIndexer:
    '''Pipeline element that produces, from text field values, the
       normalized words to index.'''
    def process(self, texts):
        '''Normalizes every value in p_texts, splits it into words and
           returns all the words as a list without duplicates.'''
        words = set()
        for value in texts:
            words = words | splitIntoWords(normalizeText(value))
        return list(words)
class ListIndexer:
    '''Pipeline element that does nothing: lists of values must be indexed
       as is.'''
    def process(self, texts):
        '''Returns p_texts, unchanged.'''
        return texts
# ------------------------------------------------------------------------------
# Register the 3 indexer classes defined above as ZCTextIndex pipeline elements.
# Each one is registered with the same string twice (presumably group and name,
# as used when the corresponding lexicon is created -- confirm).
try:
from Products.ZCTextIndex.PipelineFactory import element_factory as ef
ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
ef.registerFactory('List indexer', 'List indexer', ListIndexer)
except ImportError:
# May occur at generation time.
# In that case, the import error is ignored and nothing is registered.
pass
# ------------------------------------------------------------------------------

View file

@ -8,6 +8,7 @@ import appy.version
import appy.gen as gen
from appy.gen.po import PoParser
from appy.gen.utils import updateRolesForPermission, createObject
from appy.gen.indexer import defaultIndexes, updateIndexes
from appy.gen.migrator import Migrator
from appy.shared.data import languages
@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
resp.write('<center>For security reasons, your session has ' \
'expired.</center>')
class ZCTextIndexInfo:
'''Silly class used for storing information about a ZCTextIndex.'''
lexicon_id = "lexicon"
index_type = 'Okapi BM25 Rank'
# ------------------------------------------------------------------------------
class ZopeInstaller:
'''This Zope installer runs every time Zope starts and encounters this
@ -148,35 +144,6 @@ class ZopeInstaller:
self.app.manage_delObjects(['standard_error_message'])
manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
def installIndexes(self, indexInfo):
'''Updates indexes in the catalog.'''
catalog = self.app.catalog
logger = self.logger
for indexName, indexType in indexInfo.iteritems():
# If this index already exists but with a different type, remove it.
if indexName in catalog.indexes():
oldType = catalog.Indexes[indexName].__class__.__name__
if oldType != indexType:
catalog.delIndex(indexName)
logger.info('Existing index "%s" of type "%s" was removed:'\
' we need to recreate it with type "%s".' % \
(indexName, oldType, indexType))
if indexName not in catalog.indexes():
# We need to create this index
if indexType != 'ZCTextIndex':
catalog.addIndex(indexName, indexType)
else:
catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
# Indexing database content based on this index.
catalog.reindexIndex(indexName, self.app.REQUEST)
logger.info('Created index "%s" of type "%s"...' % \
(indexName, indexType))
lexiconInfos = [
appy.Object(group='Case Normalizer', name='Case Normalizer'),
appy.Object(group='Stop Words', name=" Don't remove stop words"),
appy.Object(group='Word Splitter', name='Whitespace splitter')
]
def installCatalog(self):
'''Create the catalog at the root of Zope if id does not exist.'''
if 'catalog' not in self.app.objectIds():
@ -185,19 +152,30 @@ class ZopeInstaller:
manage_addZCatalog(self.app, 'catalog', '')
self.logger.info('Appy catalog created.')
# Create a lexicon for ZCTextIndexes
if 'lexicon' not in self.app.catalog.objectIds():
# Create lexicons for ZCTextIndexes
catalog = self.app.catalog
lexicons = catalog.objectIds()
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
manage_addLexicon(self.app.catalog, 'lexicon',
elements=self.lexiconInfos)
if 'xhtml_lexicon' not in lexicons:
lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
if 'text_lexicon' not in lexicons:
lex = appy.Object(group='Text indexer', name='Text indexer')
manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
if 'list_lexicon' not in lexicons:
lex = appy.Object(group='List indexer', name='List indexer')
manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
# Delete the deprecated one if it exists
if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
# Create or update Appy-wide indexes and field-related indexes
indexInfo = gen.defaultIndexes.copy()
indexInfo = defaultIndexes.copy()
tool = self.app.config
for className in self.config.attributes.iterkeys():
wrapperClass = tool.getAppyClass(className, wrapper=True)
indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
self.installIndexes(indexInfo)
updateIndexes(self, indexInfo)
def getAddPermission(self, className):
'''What is the name of the permission allowing to create instances of

View file

@ -1,5 +1,6 @@
# ------------------------------------------------------------------------------
import re, os, os.path
from appy.shared.utils import normalizeText
# Function for creating a Zope object ------------------------------------------
def createObject(folder, id, className, appName, wf=True, noSecurity=False):
@ -243,12 +244,12 @@ class SomeObjects:
# ------------------------------------------------------------------------------
class Keywords:
'''This class allows to handle keywords that a user enters and that will be
used as basis for performing requests in a Zope ZCTextIndex.'''
used as basis for performing requests in a TextIndex/XhtmlIndex.'''
toRemove = '?-+*()'
def __init__(self, keywords, operator='AND'):
# Clean the p_keywords that the user has entered.
words = keywords.strip()
words = normalizeText(keywords)
if words == '*': words = ''
for c in self.toRemove: words = words.replace(c, ' ')
self.keywords = words.split()
@ -267,7 +268,7 @@ class Keywords:
self.keywords.insert(0, word)
def get(self):
'''Returns the keywords as needed by the ZCTextIndex.'''
'''Returns the keywords as needed by the TextIndex.'''
if self.keywords:
op = ' %s ' % self.operator
return op.join(self.keywords)+'*'

View file

@ -4,8 +4,8 @@
# ------------------------------------------------------------------------------
import os, os.path, mimetypes
import appy.pod
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \
defaultIndexes
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
from appy.gen.indexer import defaultIndexes
from appy.gen.utils import createObject
from appy.shared.utils import getOsTempFolder, executeCommand, \
normalizeString, sequenceTypes

View file

@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
res = s
return res
# ------------------------------------------------------------------------------
def normalizeText(s):
    '''Returns p_s normalized for indexing purposes: p_s is passed to
       normalizeString with usage "extractedText"; the result is then
       stripped and converted to lowercase.'''
    extracted = normalizeString(s, usage='extractedText')
    return extracted.strip().lower()
# ------------------------------------------------------------------------------
def formatNumber(n, sep=',', precision=2, tsep=' '):
'''Returns a string representation of number p_n, which can be a float
or integer. p_sep is the decimal separator to use. p_precision is the

View file

@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
- remembering the currently parsed element;
- managing namespace declarations.
This parser also knows about HTML entities.'''
def __init__(self, env=None, caller=None):
def __init__(self, env=None, caller=None, raiseOnError=True):
'''p_env should be an instance of a class that inherits from
XmlEnvironment: it specifies the environment to use for this SAX
parser.'''
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
self.caller = caller # The class calling this parser
self.parser = xml.sax.make_parser() # Fast, standard expat parser
self.res = None # The result of parsing.
# Raise or not an error when a parsing error is encountered.
self.raiseOnError = raiseOnError
# ContentHandler methods ---------------------------------------------------
def startDocument(self):
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
self.characters('?')
# ErrorHandler methods ---------------------------------------------------
# Define methods below in your subclass if you want error handling that
# does not raise exceptions, but produces a partial result instead.
#def error(self, error): pass
#def fatalError(self, error): pass
#def warning(self, error): pass
def error(self, error):
    '''Called when a SAX error is encountered. p_error is raised if
       self.raiseOnError is True; else, it is simply printed.'''
    if not self.raiseOnError:
        print('SAX error ' + str(error))
        return
    raise error
def fatalError(self, error):
    '''Called when a SAX fatal error is encountered. p_error is raised if
       self.raiseOnError is True; else, it is simply printed.'''
    if not self.raiseOnError:
        print('SAX fatal error ' + str(error))
        return
    raise error
def warning(self, error):
    '''Called when a SAX warning is encountered: it is ignored, whatever
       the value of self.raiseOnError.'''
    return None
def parse(self, xml, source='string'):
'''Parses a XML stream.