[shared] xml_parser.XmlParser: added param 'raiseOnError', which controls whether an exception is raised when a SAX parsing error (fatal or non-fatal) is encountered; [gen] fine-tuned the indexing machinery, with more accurate text extraction from text and xhtml fields.
This commit is contained in:
parent
a2ae839704
commit
8d1a88bd27
|
@ -7,6 +7,7 @@ from appy.gen.layout import Table
|
||||||
from appy.gen.layout import defaultFieldLayouts
|
from appy.gen.layout import defaultFieldLayouts
|
||||||
from appy.gen.po import PoMessage
|
from appy.gen.po import PoMessage
|
||||||
from appy.gen.mail import sendNotification
|
from appy.gen.mail import sendNotification
|
||||||
|
from appy.gen.indexer import defaultIndexes
|
||||||
from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
|
from appy.gen.utils import GroupDescr, Keywords, getClassName, SomeObjects
|
||||||
import appy.pod
|
import appy.pod
|
||||||
from appy.pod.renderer import Renderer
|
from appy.pod.renderer import Renderer
|
||||||
|
@ -33,13 +34,6 @@ def initMasterValue(v):
|
||||||
else: res = v
|
else: res = v
|
||||||
return [str(v) for v in res]
|
return [str(v) for v in res]
|
||||||
|
|
||||||
# Default Appy indexes ---------------------------------------------------------
|
|
||||||
defaultIndexes = {
|
|
||||||
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'ZCTextIndex',
|
|
||||||
'SortableTitle': 'FieldIndex', 'SearchableText': 'ZCTextIndex',
|
|
||||||
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
|
|
||||||
'Allowed': 'KeywordIndex'}
|
|
||||||
|
|
||||||
# Descriptor classes used for refining descriptions of elements in types
|
# Descriptor classes used for refining descriptions of elements in types
|
||||||
# (pages, groups,...) ----------------------------------------------------------
|
# (pages, groups,...) ----------------------------------------------------------
|
||||||
class Page:
|
class Page:
|
||||||
|
@ -325,7 +319,7 @@ class Search:
|
||||||
if usage == 'search': return 'Title'
|
if usage == 'search': return 'Title'
|
||||||
else: return 'SortableTitle'
|
else: return 'SortableTitle'
|
||||||
# Indeed, for field 'title', Appy has a specific index
|
# Indeed, for field 'title', Appy has a specific index
|
||||||
# 'SortableTitle', because index 'Title' is a ZCTextIndex
|
# 'SortableTitle', because index 'Title' is a TextIndex
|
||||||
# (for searchability) and can't be used for sorting.
|
# (for searchability) and can't be used for sorting.
|
||||||
elif fieldName == 'state': return 'State'
|
elif fieldName == 'state': return 'State'
|
||||||
elif fieldName in defaultIndexes: return fieldName
|
elif fieldName in defaultIndexes: return fieldName
|
||||||
|
@ -337,8 +331,8 @@ class Search:
|
||||||
value as required for searching in the index corresponding to
|
value as required for searching in the index corresponding to
|
||||||
p_fieldName.'''
|
p_fieldName.'''
|
||||||
if fieldName == 'title':
|
if fieldName == 'title':
|
||||||
# Title is a ZCTextIndex. We must split p_fieldValue into keywords.
|
# Title is a TextIndex. We must split p_fieldValue into keywords.
|
||||||
res = Keywords(fieldValue.decode('utf-8')).get()
|
res = Keywords(fieldValue).get()
|
||||||
elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
|
elif isinstance(fieldValue, basestring) and fieldValue.endswith('*'):
|
||||||
v = fieldValue[:-1]
|
v = fieldValue[:-1]
|
||||||
# Warning: 'z' is higher than 'Z'!
|
# Warning: 'z' is higher than 'Z'!
|
||||||
|
@ -1436,10 +1430,14 @@ class String(Type):
|
||||||
|
|
||||||
def getIndexType(self):
|
def getIndexType(self):
|
||||||
'''Index type varies depending on String parameters.'''
|
'''Index type varies depending on String parameters.'''
|
||||||
# If String.isSelect, be it multivalued or not, we define a ZCTextIndex:
|
# If String.isSelect, be it multivalued or not, we define a ListIndex:
|
||||||
# this way we can use AND/OR operator.
|
# this way we can use AND/OR operator.
|
||||||
if self.isSelect or (self.format in (String.TEXT, String.XHTML)):
|
if self.isSelect:
|
||||||
return 'ZCTextIndex'
|
return 'ListIndex'
|
||||||
|
elif self.format == String.TEXT:
|
||||||
|
return 'TextIndex'
|
||||||
|
elif self.format == String.XHTML:
|
||||||
|
return 'XhtmlIndex'
|
||||||
return Type.getIndexType(self)
|
return Type.getIndexType(self)
|
||||||
|
|
||||||
def getJs(self, layoutType, res):
|
def getJs(self, layoutType, res):
|
||||||
|
@ -1918,7 +1916,7 @@ class Ref(Type):
|
||||||
def getFormattedValue(self, obj, value):
|
def getFormattedValue(self, obj, value):
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def getIndexType(self): return 'ZCTextIndex'
|
def getIndexType(self): return 'TextIndex'
|
||||||
|
|
||||||
def getIndexValue(self, obj, forSearch=False):
|
def getIndexValue(self, obj, forSearch=False):
|
||||||
'''Value for indexing is the list of UIDs of linked objects. If
|
'''Value for indexing is the list of UIDs of linked objects. If
|
||||||
|
|
133
gen/indexer.py
133
gen/indexer.py
|
@ -2,9 +2,87 @@
|
||||||
indexed.'''
|
indexed.'''
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
from Products.ZCTextIndex.PipelineFactory import element_factory
|
|
||||||
from appy.shared.xml_parser import XmlParser
|
from appy.shared.xml_parser import XmlParser
|
||||||
from appy.shared.utils import normalizeString
|
from appy.shared.utils import normalizeText
|
||||||
|
|
||||||
|
# Default Appy indexes ---------------------------------------------------------
|
||||||
|
defaultIndexes = {
|
||||||
|
'State': 'FieldIndex', 'UID': 'FieldIndex', 'Title': 'TextIndex',
|
||||||
|
'SortableTitle': 'FieldIndex', 'SearchableText': 'XhtmlIndex',
|
||||||
|
'Creator': 'FieldIndex', 'Created': 'DateIndex', 'ClassName': 'FieldIndex',
|
||||||
|
'Allowed': 'KeywordIndex'}
|
||||||
|
|
||||||
|
# Stuff for creating or updating the indexes -----------------------------------
|
||||||
|
class TextIndexInfo:
|
||||||
|
'''Parameters for a text ZCTextIndex.'''
|
||||||
|
lexicon_id = "text_lexicon"
|
||||||
|
index_type = 'Okapi BM25 Rank'
|
||||||
|
|
||||||
|
class XhtmlIndexInfo:
|
||||||
|
'''Parameters for a html ZCTextIndex.'''
|
||||||
|
lexicon_id = "xhtml_lexicon"
|
||||||
|
index_type = 'Okapi BM25 Rank'
|
||||||
|
|
||||||
|
class ListIndexInfo:
|
||||||
|
'''Parameters for a list ZCTextIndex.'''
|
||||||
|
lexicon_id = "list_lexicon"
|
||||||
|
index_type = 'Okapi BM25 Rank'
|
||||||
|
|
||||||
|
def updateIndexes(installer, indexInfo):
|
||||||
|
'''This function updates the indexes defined in the catalog.'''
|
||||||
|
catalog = installer.app.catalog
|
||||||
|
logger = installer.logger
|
||||||
|
for indexName, indexType in indexInfo.iteritems():
|
||||||
|
indexRealType = indexType
|
||||||
|
if indexType in ('XhtmlIndex', 'TextIndex', 'ListIndex'):
|
||||||
|
indexRealType = 'ZCTextIndex'
|
||||||
|
# If this index already exists but with a different type (or with a
|
||||||
|
# deprecated lexicon), remove it.
|
||||||
|
if indexName in catalog.indexes():
|
||||||
|
indexObject = catalog.Indexes[indexName]
|
||||||
|
oldType = indexObject.__class__.__name__
|
||||||
|
toDelete = False
|
||||||
|
if (oldType != indexRealType):
|
||||||
|
toDelete = True
|
||||||
|
info = indexRealType
|
||||||
|
elif (oldType == 'ZCTextIndex') and \
|
||||||
|
(indexObject.lexicon_id == 'lexicon'):
|
||||||
|
toDelete = True
|
||||||
|
info = '%s (%s)' % (oldType, indexType)
|
||||||
|
if toDelete:
|
||||||
|
catalog.delIndex(indexName)
|
||||||
|
logger.info('Index %s (%s) to replace as %s.' % \
|
||||||
|
(indexName, oldType, info))
|
||||||
|
if indexName not in catalog.indexes():
|
||||||
|
# We need to (re-)create this index.
|
||||||
|
if indexType == 'TextIndex':
|
||||||
|
catalog.addIndex(indexName, indexRealType, extra=TextIndexInfo)
|
||||||
|
elif indexType == 'XhtmlIndex':
|
||||||
|
catalog.addIndex(indexName, indexRealType, extra=XhtmlIndexInfo)
|
||||||
|
elif indexType == 'ListIndex':
|
||||||
|
catalog.addIndex(indexName, indexRealType, extra=ListIndexInfo)
|
||||||
|
else:
|
||||||
|
catalog.addIndex(indexName, indexType)
|
||||||
|
# Indexing database content based on this index.
|
||||||
|
logger.info('Reindexing %s (%s)...' % (indexName, indexType))
|
||||||
|
catalog.reindexIndex(indexName, installer.app.REQUEST)
|
||||||
|
logger.info('Done.')
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
def splitIntoWords(text):
|
||||||
|
'''Split the cleaned index value p_text into words (returns a list of
|
||||||
|
words). Words of a single char are ignored, excepted digits which are
|
||||||
|
always kept. Duplicate words are removed (result is a set and not a
|
||||||
|
list).'''
|
||||||
|
res = text.split(' ')
|
||||||
|
# Remove tokens of a single char (excepted if this char is a digit).
|
||||||
|
i = len(res)-1
|
||||||
|
while i > -1 :
|
||||||
|
if (len(res[i]) < 2) and not res[i].isdigit():
|
||||||
|
del res[i]
|
||||||
|
i -= 1
|
||||||
|
# Remove duplicates
|
||||||
|
return set(res)
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class XhtmlTextExtractor(XmlParser):
|
class XhtmlTextExtractor(XmlParser):
|
||||||
|
@ -18,32 +96,41 @@ class XhtmlTextExtractor(XmlParser):
|
||||||
return XmlParser.endDocument(self)
|
return XmlParser.endDocument(self)
|
||||||
|
|
||||||
def characters(self, content):
|
def characters(self, content):
|
||||||
c = normalizeString(content, usage='extractedText').strip().lower()
|
c = normalizeText(content)
|
||||||
if len(c) > 1: self.res.append(c)
|
if len(c) > 1: self.res.append(c)
|
||||||
return self.env
|
|
||||||
|
|
||||||
# Do not raise exceptions when errors occur.
|
|
||||||
def error(self, error): pass
|
|
||||||
def fatalError(self, error): pass
|
|
||||||
def warning(self, error): pass
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class XhtmlIndexer:
|
class XhtmlIndexer:
|
||||||
'''Extracts, from XHTML field values, the text to index.'''
|
'''Extracts, from XHTML field values, the text to index.'''
|
||||||
def process(self, text):
|
def process(self, texts):
|
||||||
# Wrap the XHTML chunk into a root tag, to get valid XML.
|
res = set()
|
||||||
text = '<p>%s</p>' % text[0]
|
for text in texts:
|
||||||
parser = XhtmlTextExtractor()
|
extractor = XhtmlTextExtractor(raiseOnError=False)
|
||||||
text = parser.parse(text)
|
cleanText = extractor.parse('<p>%s</p>' % text)
|
||||||
res = text.split(' ')
|
res = res.union(splitIntoWords(cleanText))
|
||||||
# Remove tokens of a single char.
|
return list(res)
|
||||||
i = len(res)-1
|
|
||||||
while i > -1 :
|
|
||||||
if (len(res[i]) < 2) and not res[i].isdigit():
|
|
||||||
del res[i]
|
|
||||||
i -= 1
|
|
||||||
return res
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
element_factory.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
|
class TextIndexer:
|
||||||
|
'''Extracts, from text field values, a normalized value to index.'''
|
||||||
|
def process(self, texts):
|
||||||
|
res = set()
|
||||||
|
for text in texts:
|
||||||
|
cleanText = normalizeText(text)
|
||||||
|
res = res.union(splitIntoWords(cleanText))
|
||||||
|
return list(res)
|
||||||
|
|
||||||
|
class ListIndexer:
|
||||||
|
'''This lexicon does nothing: list of values must be indexed as is.'''
|
||||||
|
def process(self, texts): return texts
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
try:
|
||||||
|
from Products.ZCTextIndex.PipelineFactory import element_factory as ef
|
||||||
|
ef.registerFactory('XHTML indexer', 'XHTML indexer', XhtmlIndexer)
|
||||||
|
ef.registerFactory('Text indexer', 'Text indexer', TextIndexer)
|
||||||
|
ef.registerFactory('List indexer', 'List indexer', ListIndexer)
|
||||||
|
except ImportError:
|
||||||
|
# May occur at generation time.
|
||||||
|
pass
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
|
|
|
@ -8,6 +8,7 @@ import appy.version
|
||||||
import appy.gen as gen
|
import appy.gen as gen
|
||||||
from appy.gen.po import PoParser
|
from appy.gen.po import PoParser
|
||||||
from appy.gen.utils import updateRolesForPermission, createObject
|
from appy.gen.utils import updateRolesForPermission, createObject
|
||||||
|
from appy.gen.indexer import defaultIndexes, updateIndexes
|
||||||
from appy.gen.migrator import Migrator
|
from appy.gen.migrator import Migrator
|
||||||
from appy.shared.data import languages
|
from appy.shared.data import languages
|
||||||
|
|
||||||
|
@ -63,11 +64,6 @@ def onDelSession(sessionObject, container):
|
||||||
resp.write('<center>For security reasons, your session has ' \
|
resp.write('<center>For security reasons, your session has ' \
|
||||||
'expired.</center>')
|
'expired.</center>')
|
||||||
|
|
||||||
class ZCTextIndexInfo:
|
|
||||||
'''Silly class used for storing information about a ZCTextIndex.'''
|
|
||||||
lexicon_id = "lexicon"
|
|
||||||
index_type = 'Okapi BM25 Rank'
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class ZopeInstaller:
|
class ZopeInstaller:
|
||||||
'''This Zope installer runs every time Zope starts and encounters this
|
'''This Zope installer runs every time Zope starts and encounters this
|
||||||
|
@ -148,35 +144,6 @@ class ZopeInstaller:
|
||||||
self.app.manage_delObjects(['standard_error_message'])
|
self.app.manage_delObjects(['standard_error_message'])
|
||||||
manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
|
manage_addPageTemplate(self.app, 'standard_error_message', '',errorPage)
|
||||||
|
|
||||||
def installIndexes(self, indexInfo):
|
|
||||||
'''Updates indexes in the catalog.'''
|
|
||||||
catalog = self.app.catalog
|
|
||||||
logger = self.logger
|
|
||||||
for indexName, indexType in indexInfo.iteritems():
|
|
||||||
# If this index already exists but with a different type, remove it.
|
|
||||||
if indexName in catalog.indexes():
|
|
||||||
oldType = catalog.Indexes[indexName].__class__.__name__
|
|
||||||
if oldType != indexType:
|
|
||||||
catalog.delIndex(indexName)
|
|
||||||
logger.info('Existing index "%s" of type "%s" was removed:'\
|
|
||||||
' we need to recreate it with type "%s".' % \
|
|
||||||
(indexName, oldType, indexType))
|
|
||||||
if indexName not in catalog.indexes():
|
|
||||||
# We need to create this index
|
|
||||||
if indexType != 'ZCTextIndex':
|
|
||||||
catalog.addIndex(indexName, indexType)
|
|
||||||
else:
|
|
||||||
catalog.addIndex(indexName, indexType,extra=ZCTextIndexInfo)
|
|
||||||
# Indexing database content based on this index.
|
|
||||||
catalog.reindexIndex(indexName, self.app.REQUEST)
|
|
||||||
logger.info('Created index "%s" of type "%s"...' % \
|
|
||||||
(indexName, indexType))
|
|
||||||
|
|
||||||
lexiconInfos = [
|
|
||||||
appy.Object(group='Case Normalizer', name='Case Normalizer'),
|
|
||||||
appy.Object(group='Stop Words', name=" Don't remove stop words"),
|
|
||||||
appy.Object(group='Word Splitter', name='Whitespace splitter')
|
|
||||||
]
|
|
||||||
def installCatalog(self):
|
def installCatalog(self):
|
||||||
'''Create the catalog at the root of Zope if id does not exist.'''
|
'''Create the catalog at the root of Zope if id does not exist.'''
|
||||||
if 'catalog' not in self.app.objectIds():
|
if 'catalog' not in self.app.objectIds():
|
||||||
|
@ -185,19 +152,30 @@ class ZopeInstaller:
|
||||||
manage_addZCatalog(self.app, 'catalog', '')
|
manage_addZCatalog(self.app, 'catalog', '')
|
||||||
self.logger.info('Appy catalog created.')
|
self.logger.info('Appy catalog created.')
|
||||||
|
|
||||||
# Create a lexicon for ZCTextIndexes
|
# Create lexicons for ZCTextIndexes
|
||||||
if 'lexicon' not in self.app.catalog.objectIds():
|
catalog = self.app.catalog
|
||||||
|
lexicons = catalog.objectIds()
|
||||||
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
|
from Products.ZCTextIndex.ZCTextIndex import manage_addLexicon
|
||||||
manage_addLexicon(self.app.catalog, 'lexicon',
|
if 'xhtml_lexicon' not in lexicons:
|
||||||
elements=self.lexiconInfos)
|
lex = appy.Object(group='XHTML indexer', name='XHTML indexer')
|
||||||
|
manage_addLexicon(catalog, 'xhtml_lexicon', elements=[lex])
|
||||||
|
if 'text_lexicon' not in lexicons:
|
||||||
|
lex = appy.Object(group='Text indexer', name='Text indexer')
|
||||||
|
manage_addLexicon(catalog, 'text_lexicon', elements=[lex])
|
||||||
|
if 'list_lexicon' not in lexicons:
|
||||||
|
lex = appy.Object(group='List indexer', name='List indexer')
|
||||||
|
manage_addLexicon(catalog, 'list_lexicon', elements=[lex])
|
||||||
|
|
||||||
|
# Delete the deprecated one if it exists
|
||||||
|
if 'lexicon' in lexicons: catalog.manage_delObjects(['lexicon'])
|
||||||
|
|
||||||
# Create or update Appy-wide indexes and field-related indexes
|
# Create or update Appy-wide indexes and field-related indexes
|
||||||
indexInfo = gen.defaultIndexes.copy()
|
indexInfo = defaultIndexes.copy()
|
||||||
tool = self.app.config
|
tool = self.app.config
|
||||||
for className in self.config.attributes.iterkeys():
|
for className in self.config.attributes.iterkeys():
|
||||||
wrapperClass = tool.getAppyClass(className, wrapper=True)
|
wrapperClass = tool.getAppyClass(className, wrapper=True)
|
||||||
indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
|
indexInfo.update(wrapperClass.getIndexes(includeDefaults=False))
|
||||||
self.installIndexes(indexInfo)
|
updateIndexes(self, indexInfo)
|
||||||
|
|
||||||
def getAddPermission(self, className):
|
def getAddPermission(self, className):
|
||||||
'''What is the name of the permission allowing to create instances of
|
'''What is the name of the permission allowing to create instances of
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
import re, os, os.path
|
import re, os, os.path
|
||||||
|
from appy.shared.utils import normalizeText
|
||||||
|
|
||||||
# Function for creating a Zope object ------------------------------------------
|
# Function for creating a Zope object ------------------------------------------
|
||||||
def createObject(folder, id, className, appName, wf=True, noSecurity=False):
|
def createObject(folder, id, className, appName, wf=True, noSecurity=False):
|
||||||
|
@ -243,12 +244,12 @@ class SomeObjects:
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class Keywords:
|
class Keywords:
|
||||||
'''This class allows to handle keywords that a user enters and that will be
|
'''This class allows to handle keywords that a user enters and that will be
|
||||||
used as basis for performing requests in a Zope ZCTextIndex.'''
|
used as basis for performing requests in a TextIndex/XhtmlIndex.'''
|
||||||
|
|
||||||
toRemove = '?-+*()'
|
toRemove = '?-+*()'
|
||||||
def __init__(self, keywords, operator='AND'):
|
def __init__(self, keywords, operator='AND'):
|
||||||
# Clean the p_keywords that the user has entered.
|
# Clean the p_keywords that the user has entered.
|
||||||
words = keywords.strip()
|
words = normalizeText(keywords)
|
||||||
if words == '*': words = ''
|
if words == '*': words = ''
|
||||||
for c in self.toRemove: words = words.replace(c, ' ')
|
for c in self.toRemove: words = words.replace(c, ' ')
|
||||||
self.keywords = words.split()
|
self.keywords = words.split()
|
||||||
|
@ -267,7 +268,7 @@ class Keywords:
|
||||||
self.keywords.insert(0, word)
|
self.keywords.insert(0, word)
|
||||||
|
|
||||||
def get(self):
|
def get(self):
|
||||||
'''Returns the keywords as needed by the ZCTextIndex.'''
|
'''Returns the keywords as needed by the TextIndex.'''
|
||||||
if self.keywords:
|
if self.keywords:
|
||||||
op = ' %s ' % self.operator
|
op = ' %s ' % self.operator
|
||||||
return op.join(self.keywords)+'*'
|
return op.join(self.keywords)+'*'
|
||||||
|
|
|
@ -4,8 +4,8 @@
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
import os, os.path, mimetypes
|
import os, os.path, mimetypes
|
||||||
import appy.pod
|
import appy.pod
|
||||||
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous, \
|
from appy.gen import Type, Search, Ref, String, WorkflowAnonymous
|
||||||
defaultIndexes
|
from appy.gen.indexer import defaultIndexes
|
||||||
from appy.gen.utils import createObject
|
from appy.gen.utils import createObject
|
||||||
from appy.shared.utils import getOsTempFolder, executeCommand, \
|
from appy.shared.utils import getOsTempFolder, executeCommand, \
|
||||||
normalizeString, sequenceTypes
|
normalizeString, sequenceTypes
|
||||||
|
|
|
@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
|
||||||
res = s
|
res = s
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
|
def normalizeText(s):
|
||||||
|
'''Normalizes p_s: remove special chars, lowerizes it, etc, for indexing
|
||||||
|
purposes.'''
|
||||||
|
return normalizeString(s, usage='extractedText').strip().lower()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------------------
|
||||||
def formatNumber(n, sep=',', precision=2, tsep=' '):
|
def formatNumber(n, sep=',', precision=2, tsep=' '):
|
||||||
'''Returns a string representation of number p_n, which can be a float
|
'''Returns a string representation of number p_n, which can be a float
|
||||||
or integer. p_sep is the decimal separator to use. p_precision is the
|
or integer. p_sep is the decimal separator to use. p_precision is the
|
||||||
|
|
|
@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
- remembering the currently parsed element;
|
- remembering the currently parsed element;
|
||||||
- managing namespace declarations.
|
- managing namespace declarations.
|
||||||
This parser also knows about HTML entities.'''
|
This parser also knows about HTML entities.'''
|
||||||
def __init__(self, env=None, caller=None):
|
def __init__(self, env=None, caller=None, raiseOnError=True):
|
||||||
'''p_env should be an instance of a class that inherits from
|
'''p_env should be an instance of a class that inherits from
|
||||||
XmlEnvironment: it specifies the environment to use for this SAX
|
XmlEnvironment: it specifies the environment to use for this SAX
|
||||||
parser.'''
|
parser.'''
|
||||||
|
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
self.caller = caller # The class calling this parser
|
self.caller = caller # The class calling this parser
|
||||||
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
||||||
self.res = None # The result of parsing.
|
self.res = None # The result of parsing.
|
||||||
|
# Raise or not an error when a parsing error is encountered.
|
||||||
|
self.raiseOnError = raiseOnError
|
||||||
|
|
||||||
# ContentHandler methods ---------------------------------------------------
|
# ContentHandler methods ---------------------------------------------------
|
||||||
def startDocument(self):
|
def startDocument(self):
|
||||||
|
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
self.characters('?')
|
self.characters('?')
|
||||||
|
|
||||||
# ErrorHandler methods ---------------------------------------------------
|
# ErrorHandler methods ---------------------------------------------------
|
||||||
# Define methods below in your subclass if you want error handling that
|
def error(self, error):
|
||||||
# does not raise exceptions, but produces a partial result instead.
|
if self.raiseOnError: raise error
|
||||||
#def error(self, error): pass
|
else: print 'SAX error', error
|
||||||
#def fatalError(self, error): pass
|
def fatalError(self, error):
|
||||||
#def warning(self, error): pass
|
if self.raiseOnError: raise error
|
||||||
|
else: print 'SAX fatal error', error
|
||||||
|
def warning(self, error): pass
|
||||||
|
|
||||||
def parse(self, xml, source='string'):
|
def parse(self, xml, source='string'):
|
||||||
'''Parses a XML stream.
|
'''Parses a XML stream.
|
||||||
|
|
Loading…
Reference in a new issue