[shared] xml_parser.XmlParser: added param 'raiseOnError', which controls whether an exception is raised when a SAX error or fatal parsing error is encountered; [gen] fine-tuned the indexing machinery with more accurate text extraction from text and XHTML fields.
This commit is contained in:
parent
a2ae839704
commit
8d1a88bd27
|
@ -7,6 +7,7 @@ from appy.gen.layout import Table
|
|||
from appy.gen.layout import defaultFieldLayouts
|
||||
from appy.gen.po import PoMessage
|
||||
from appy.gen.mail import sendNotification
|
||||
from appy.gen.indexer import defaultIndexes
|
||||
from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
|
||||
import appy.pod
|
||||
from appy.pod.renderer import Renderer
|
||||
|
@ -33,13 +34,6 @@ def initMasterValue(v):
|
|||
else: res = v
|
||||
return [str(v) for v in res]
|
||||
|
||||
# Default Appy indexes ---------------------------------------------------------
|
||||
defaultIndexes = {
|
||||
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
|
||||
'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
|
||||
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
|
||||
'Allowed': 'KeywordIndex'}
|
||||
|
||||
# Descriptor classes used for refining descriptions of elements in types
|
||||
# (pages, groups,...) ----------------------------------------------------------
|
||||
class Page:
|
||||
|
@ -325,7 +319,7 @@ class Search:
|
|||
if usage == 'search': return 'Title'
|
||||
else: return 'SortableTitle'
|
||||
# Indeed, for field 'title', Appy has a specific index
|
||||
# 'SortableTitle', because index 'Title' is a ZCTextIndex
|
||||
# 'SortableTitle', because index 'Title' is a TextIndex
|
||||
# (for searchability) and can't be used for sorting.
|
||||
elif fieldName == 'state': return 'State'
|
||||
elif fieldName in defaultIndexes: return fieldName
|
||||
|
@ -337,8 +331,8 @@ class Search:
|
|||
value as required for searching in the index corresponding to
|
||||
p_fieldName.'''
|
||||
if fieldName == 'title':
|
||||
# Title is a ZCTextIndex. We must split p_fieldValue into keywords.
|
||||
res = Keywords(fieldValue.decode('utf-8')).get()
|
||||
# Title is a TextIndex. We must split p_fieldValue into keywords.
|
||||
res = Keywords(fieldValue).get()
|
||||
elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
|
||||
v = fieldValue[:-1]
|
||||
# Warning: 'z' is higher than 'Z'!
|
||||
|
@ -1436,10 +1430,14 @@ class String(Type):
|
|||
|
||||
def getIndexType(self):
|
||||
'''Index type varies depending on String parameters.'''
|
||||
# If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
|
||||
# If String.isSelect, be it multivalued or not, we define a ListIndex:
|
||||
# this way we can use AND/OR operator.
|
||||
if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
|
||||
return 'ZCTextIndex'
|
||||
if self.isSelect:
|
||||
return 'ListIndex'
|
||||
elif self.format == String.TEXT:
|
||||
return 'TextIndex'
|
||||
elif self.format == String.XHTML:
|
||||
return 'XhtmlIndex'
|
||||
return Type.getIndexType(self)
|
||||
|
||||
def getJs(self, layoutType, res):
|
||||
|
@ -1918,7 +1916,7 @@ class Ref(Type):
|
|||
def getFormattedValue(self, obj, value):
|
||||
return value
|
||||
|
||||
def getIndexType(self): return 'ZCTextIndex'
|
||||
def getIndexType(self): return 'TextIndex'
|
||||
|
||||
def getIndexValue(self, obj, forSearch=False):
|
||||
'''Value for indexing is the list of UIDs of linked objects. If
|
||||
|
|
133
gen/indexer.py
133
gen/indexer.py
|
@ -2,9 +2,87 @@
|
|||
indexed.'''
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
from Products.ZCTextIndex.PipelineFactory import element_factory
|
||||
from appy.shared.xml_parser import XmlParser
|
||||
from appy.shared.utils import normalizeString
|
||||
from appy.shared.utils import normalizeText
|
||||
|
||||
# Default Appy indexes ---------------------------------------------------------
|
||||
# Maps the name of every index that Appy defines by default to its index type.
# 'TextIndex' and 'XhtmlIndex' are Appy-level type names: function
# "updateIndexes" creates them as ZCTextIndexes.
defaultIndexes = {
    'Allowed': 'KeywordIndex',
    'ClassName': 'FieldIndex',
    'Created': 'DateIndex',
    'Creator': 'FieldIndex',
    'SearchableText': 'XhtmlIndex',
    'SortableTitle': 'FieldIndex',
    'State': 'FieldIndex',
    'Title': 'TextIndex',
    'UID': 'FieldIndex',
}
|
||||
|
||||
# Stuff for creating or updating the indexes -----------------------------------
|
||||
class TextIndexInfo:
    '''Parameters passed (as "extra") when creating the ZCTextIndex that backs
       an Appy "TextIndex": such an index is bound to lexicon
       "text_lexicon".'''
    lexicon_id = "text_lexicon"
    index_type = 'Okapi BM25 Rank'

class XhtmlIndexInfo:
    '''Parameters passed (as "extra") when creating the ZCTextIndex that backs
       an Appy "XhtmlIndex": such an index is bound to lexicon
       "xhtml_lexicon".'''
    lexicon_id = "xhtml_lexicon"
    index_type = 'Okapi BM25 Rank'

class ListIndexInfo:
    '''Parameters passed (as "extra") when creating the ZCTextIndex that backs
       an Appy "ListIndex": such an index is bound to lexicon
       "list_lexicon".'''
    lexicon_id = "list_lexicon"
    index_type = 'Okapi BM25 Rank'

def updateIndexes(installer, indexInfo):
    '''Creates or updates, in the catalog, the indexes described in
       p_indexInfo, a dict of the form ~{s_indexName: s_indexType}~.

       p_installer must give access to the catalog (installer.app.catalog),
       to the request (installer.app.REQUEST) and to a logger
       (installer.logger).

       An existing index is deleted and re-created when:
       * its class name differs from the requested (real) index type;
       * it is a ZCTextIndex bound to the deprecated lexicon "lexicon";
       * it is a ZCTextIndex bound to a lexicon that is not the one required
         by the requested Appy type (ie, the index was a "TextIndex" and must
         now be a "XhtmlIndex").
       Every index that gets (re-)created is then reindexed.'''
    catalog = installer.app.catalog
    logger = installer.logger
    for indexName, indexType in indexInfo.items():
        # Appy types 'TextIndex', 'XhtmlIndex' and 'ListIndex' are all
        # implemented as ZCTextIndexes: they only differ by their lexicon.
        if indexType == 'TextIndex': extra = TextIndexInfo
        elif indexType == 'XhtmlIndex': extra = XhtmlIndexInfo
        elif indexType == 'ListIndex': extra = ListIndexInfo
        else: extra = None
        if extra is not None:
            indexRealType = 'ZCTextIndex'
        else:
            indexRealType = indexType
        # If this index already exists but with a different type (or with a
        # deprecated or wrong lexicon), remove it.
        if indexName in catalog.indexes():
            indexObject = catalog.Indexes[indexName]
            oldType = indexObject.__class__.__name__
            info = None # Will describe the replacement if one is needed
            if oldType != indexRealType:
                info = indexRealType
            elif oldType == 'ZCTextIndex':
                lexiconId = indexObject.lexicon_id
                if (lexiconId == 'lexicon') or \
                   ((extra is not None) and (lexiconId != extra.lexicon_id)):
                    info = '%s (%s)' % (oldType, indexType)
            if info is not None:
                catalog.delIndex(indexName)
                logger.info('Index %s (%s) to replace as %s.' % \
                            (indexName, oldType, info))
        if indexName not in catalog.indexes():
            # We need to (re-)create this index.
            if extra is not None:
                catalog.addIndex(indexName, indexRealType, extra=extra)
            else:
                catalog.addIndex(indexName, indexType)
            # Indexing database content based on this index.
            logger.info('Reindexing %s (%s)...' % (indexName, indexType))
            catalog.reindexIndex(indexName, installer.app.REQUEST)
            logger.info('Done.')
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
def splitIntoWords(text):
    '''Splits the cleaned index value p_text into words and returns them as a
       set: duplicate words are thus removed. Words are delimited by single
       space chars. Empty tokens and tokens made of a single char are
       ignored, except single digits, which are always kept.'''
    # A single filtering pass replaces the former in-place deletion loop, that
    # shifted the end of the list every time a token was removed.
    return set(word for word in text.split(' ')
               if (len(word) > 1) or word.isdigit())
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
class XhtmlTextExtractor(XmlParser):
|
||||
|
@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
|
|||
return XmlParser.endDocument(self)
|
||||
|
||||
def characters(self, content):
|
||||
c = normalizeString(content, usage='extractedText').strip().lower()
|
||||
c = normalizeText(content)
|
||||
if len(c) > 1: self.res.append(c)
|
||||
return self.env
|
||||
|
||||
# Do not raise exceptions when errors occur.
|
||||
def error(self, error): pass
|
||||
def fatalError(self, error): pass
|
||||
def warning(self, error): pass
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
class XhtmlIndexer:
|
||||
'''Extracts, from XHTML field values, the text to index.'''
|
||||
def process(self, text):
|
||||
# Wrap the XHTML chunk into a root tag, to get valid XML.
|
||||
text = '<p>%s</p>' % text[0]
|
||||
parser = XhtmlTextExtractor()
|
||||
text = parser.parse(text)
|
||||
res = text.split(' ')
|
||||
# Remove tokens of a single char.
|
||||
i = len(res)-1
|
||||
while i > -1 :
|
||||
if (len(res[i]) < 2) and not res[i].isdigit():
|
||||
del res[i]
|
||||
i -= 1
|
||||
return res
|
||||
def process(self, texts):
|
||||
res = set()
|
||||
for text in texts:
|
||||
extractor = XhtmlTextExtractor(raiseOnError=False)
|
||||
cleanText = extractor.parse('<p>%s</p>' % text)
|
||||
res = res.union(splitIntoWords(cleanText))
|
||||
return list(res)
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
|
||||
class TextIndexer:
    '''Extracts, from text field values, a normalized value to index.'''
    def process(self, texts):
        '''Normalizes every text from p_texts, splits it into words and
           returns the list of all distinct words found (in no particular
           order).'''
        res = set()
        for text in texts:
            # Update the set in place instead of building a new one, via
            # "union", for every text.
            res.update(splitIntoWords(normalizeText(text)))
        return list(res)
|
||||
|
||||
class ListIndexer:
    '''Indexer that performs no processing at all: a list of values must be
       indexed as is.'''
    def process(self, texts):
        '''Returns p_texts, unchanged.'''
        return texts
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# Register every Appy indexer as a ZCTextIndex pipeline element: the same
# string is used as group and as name.
try:
    from Products.ZCTextIndex.PipelineFactory import element_factory as ef
    for name, indexer in (('XHTML indexer', XhtmlIndexer),
                          ('Text indexer', TextIndexer),
                          ('List indexer', ListIndexer)):
        ef.registerFactory(name, name, indexer)
except ImportError:
    # May occur at generation time.
    pass
|
||||
# ------------------------------------------------------------------------------
|
||||
|
|
|
@ -8,6 +8,7 @@ import appy.version
|
|||
import appy.gen as gen
|
||||
from appy.gen.po import PoParser
|
||||
from appy.gen.utils import updateRolesForPermission, createObject
|
||||
from appy.gen.indexer import defaultIndexes, updateIndexes
|
||||
from appy.gen.migrator import Migrator
|
||||
from appy.shared.data import languages
|
||||
|
||||
|
@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
|
|||
resp.write('<center>For security reasons, your session has ' \
|
||||
'expired.</center>')
|
||||
|
||||
class ZCTextIndexInfo:
|
||||
'''Silly class used for storing information about a ZCTextIndex.'''
|
||||
lexicon_id = "lexicon"
|
||||
index_type = 'Okapi BM25 Rank'
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
class ZopeInstaller:
|
||||
'''This Zope installer runs every time Zope starts and encounters this
|
||||
|
@ -148,35 +144,6 @@ class ZopeInstaller:
|
|||
self.app.manage_delObjects(['standard_error_message'])
|
||||
manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
|
||||
|
||||
def installIndexes(self, indexInfo):
|
||||
'''Updates indexes in the catalog.'''
|
||||
catalog = self.app.catalog
|
||||
logger = self.logger
|
||||
for indexName, indexType in indexInfo.iteritems():
|
||||
# If this index already exists but with a different type, remove it.
|
||||
if indexName in catalog.indexes():
|
||||
oldType = catalog.Indexes[indexName].__class__.__name__
|
||||
if oldType != indexType:
|
||||
catalog.delIndex(indexName)
|
||||
logger.info('Existing index "%s" of type "%s" was removed:'\
|
||||
' we need to recreate it with type "%s".' % \
|
||||
(indexName, oldType, indexType))
|
||||
if indexName not in catalog.indexes():
|
||||
# We need to create this index
|
||||
if indexType != 'ZCTextIndex':
|
||||
catalog.addIndex(indexName, indexType)
|
||||
else:
|
||||
catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
|
||||
# Indexing database content based on this index.
|
||||
catalog.reindexIndex(indexName, self.app.REQUEST)
|
||||
logger.info('Created index "%s" of type "%s"...' % \
|
||||
(indexName, indexType))
|
||||
|
||||
lexiconInfos = [
|
||||
appy.Object(group='Case Normalizer', name='Case Normalizer'),
|
||||
appy.Object(group='Stop Words', name=" Don't remove stop words"),
|
||||
appy.Object(group='Word Splitter', name='Whitespace splitter')
|
||||
]
|
||||
def installCatalog(self):
|
||||
'''Create the catalog at the root of Zope if id does not exist.'''
|
||||
if 'catalog' not in self.app.objectIds():
|
||||
|
@ -185,19 +152,30 @@ class ZopeInstaller:
|
|||
manage_addZCatalog(self.app, 'catalog', '')
|
||||
self.logger.info('Appy catalog created.')
|
||||
|
||||
# Create a lexicon for ZCTextIndexes
|
||||
if 'lexicon' not in self.app.catalog.objectIds():
|
||||
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
|
||||
manage_addLexicon(self.app.catalog, 'lexicon',
|
||||
elements=self.lexiconInfos)
|
||||
# Create lexicons for ZCTextIndexes
|
||||
catalog = self.app.catalog
|
||||
lexicons = catalog.objectIds()
|
||||
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
|
||||
if 'xhtml_lexicon' not in lexicons:
|
||||
lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
|
||||
manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
|
||||
if 'text_lexicon' not in lexicons:
|
||||
lex = appy.Object(group='Text indexer', name='Text indexer')
|
||||
manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
|
||||
if 'list_lexicon' not in lexicons:
|
||||
lex = appy.Object(group='List indexer', name='List indexer')
|
||||
manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
|
||||
|
||||
# Delete the deprecated one if it exists
|
||||
if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
|
||||
|
||||
# Create or update Appy-wide indexes and field-related indexes
|
||||
indexInfo = gen.defaultIndexes.copy()
|
||||
indexInfo = defaultIndexes.copy()
|
||||
tool = self.app.config
|
||||
for className in self.config.attributes.iterkeys():
|
||||
wrapperClass = tool.getAppyClass(className, wrapper=True)
|
||||
indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
|
||||
self.installIndexes(indexInfo)
|
||||
updateIndexes(self, indexInfo)
|
||||
|
||||
def getAddPermission(self, className):
|
||||
'''What is the name of the permission allowing to create instances of
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
# ------------------------------------------------------------------------------
|
||||
import re, os, os.path
|
||||
from appy.shared.utils import normalizeText
|
||||
|
||||
# Function for creating a Zope object ------------------------------------------
|
||||
def createObject(folder, id, className, appName, wf=True, noSecurity=False):
|
||||
|
@ -243,12 +244,12 @@ class SomeObjects:
|
|||
# ------------------------------------------------------------------------------
|
||||
class Keywords:
|
||||
'''This class allows to handle keywords that a user enters and that will be
|
||||
used as basis for performing requests in a Zope ZCTextIndex.'''
|
||||
used as basis for performing requests in a TextIndex/XhtmlIndex.'''
|
||||
|
||||
toRemove = '?-+*()'
|
||||
def __init__(self, keywords, operator='AND'):
|
||||
# Clean the p_keywords that the user has entered.
|
||||
words = keywords.strip()
|
||||
words = normalizeText(keywords)
|
||||
if words == '*': words = ''
|
||||
for c in self.toRemove: words = words.replace(c, ' ')
|
||||
self.keywords = words.split()
|
||||
|
@ -267,7 +268,7 @@ class Keywords:
|
|||
self.keywords.insert(0, word)
|
||||
|
||||
def get(self):
|
||||
'''Returns the keywords as needed by the ZCTextIndex.'''
|
||||
'''Returns the keywords as needed by the TextIndex.'''
|
||||
if self.keywords:
|
||||
op = ' %s ' % self.operator
|
||||
return op.join(self.keywords)+'*'
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
# ------------------------------------------------------------------------------
|
||||
import os, os.path, mimetypes
|
||||
import appy.pod
|
||||
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \
|
||||
defaultIndexes
|
||||
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
|
||||
from appy.gen.indexer import defaultIndexes
|
||||
from appy.gen.utils import createObject
|
||||
from appy.shared.utils import getOsTempFolder, executeCommand, \
|
||||
normalizeString, sequenceTypes
|
||||
|
|
|
@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
|
|||
res = s
|
||||
return res
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
def normalizeText(s):
    '''Returns a version of p_s that is suitable for indexing purposes: p_s
       is normalized via function "normalizeString" (with usage
       "extractedText"), then stripped and lowerized.'''
    res = normalizeString(s, usage='extractedText')
    return res.strip().lower()
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
def formatNumber(n, sep=',', precision=2, tsep=' '):
|
||||
'''Returns a string representation of number p_n, which can be a float
|
||||
or integer. p_sep is the decimal separator to use. p_precision is the
|
||||
|
|
|
@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
- remembering the currently parsed element;
|
||||
- managing namespace declarations.
|
||||
This parser also knows about HTML entities.'''
|
||||
def __init__(self, env=None, caller=None):
|
||||
def __init__(self, env=None, caller=None, raiseOnError=True):
|
||||
'''p_env should be an instance of a class that inherits from
|
||||
XmlEnvironment: it specifies the environment to use for this SAX
|
||||
parser.'''
|
||||
|
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
self.caller = caller # The class calling this parser
|
||||
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
||||
self.res = None # The result of parsing.
|
||||
# Raise or not an error when a parsing error is encountered.
|
||||
self.raiseOnError = raiseOnError
|
||||
|
||||
# ContentHandler methods ---------------------------------------------------
|
||||
def startDocument(self):
|
||||
|
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
self.characters('?')
|
||||
|
||||
# ErrorHandler methods ---------------------------------------------------
|
||||
# Define methods below in your subclass if you want error handling that
|
||||
# does not raise exceptions, but produces a partial result instead.
|
||||
#def error(self, error): pass
|
||||
#def fatalError(self, error): pass
|
||||
#def warning(self, error): pass
|
||||
def error(self, error):
|
||||
if self.raiseOnError: raise error
|
||||
else: print 'SAX error', error
|
||||
def fatalError(self, error):
|
||||
if self.raiseOnError: raise error
|
||||
else: print 'SAX fatal error', error
|
||||
def warning(self, error): pass
|
||||
|
||||
def parse(self, xml, source='string'):
|
||||
'''Parses a XML stream.
|
||||
|
|
Loading…
Reference in a new issue