[shared] xml_parser.XmlParser: added param 'raiseOnError', which controls whether an exception is raised when a SAX parsing error (fatal or not) is encountered; [gen] fine-tuned the indexing machinery for more accurate text extraction from text and xhtml fields.

This commit is contained in:
Gaetan Delannay 2012-09-26 23:13:02 +02:00
parent a2ae839704
commit 8d1a88bd27
7 changed files with 164 additions and 89 deletions

View file

@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
res = s
return res
# ------------------------------------------------------------------------------
def normalizeText(s):
    '''Returns p_s prepared for indexing purposes: it is passed through
       normalizeString (usage "extractedText"), then stripped of leading and
       trailing whitespace and converted to lowercase.'''
    cleaned = normalizeString(s, usage='extractedText')
    return cleaned.strip().lower()
# ------------------------------------------------------------------------------
def formatNumber(n, sep=',', precision=2, tsep=' '):
'''Returns a string representation of number p_n, which can be a float
or integer. p_sep is the decimal separator to use. p_precision is the

View file

@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
- remembering the currently parsed element;
- managing namespace declarations.
This parser also knows about HTML entities.'''
def __init__(self, env=None, caller=None):
def __init__(self, env=None, caller=None, raiseOnError=True):
'''p_env should be an instance of a class that inherits from
XmlEnvironment: it specifies the environment to use for this SAX
parser.'''
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
self.caller = caller # The class calling this parser
self.parser = xml.sax.make_parser() # Fast, standard expat parser
self.res = None # The result of parsing.
# Whether to raise an exception when a parsing error is encountered.
self.raiseOnError = raiseOnError
# ContentHandler methods ---------------------------------------------------
def startDocument(self):
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
self.characters('?')
# ErrorHandler methods ---------------------------------------------------
# Define methods below in your subclass if you want error handling that
# does not raise exceptions, but produces a partial result instead.
#def error(self, error): pass
#def fatalError(self, error): pass
#def warning(self, error): pass
def error(self, error):
    '''SAX ErrorHandler hook receiving p_error, the exception describing a
       parsing error. If self.raiseOnError is True, p_error is raised.
       Else, it is printed on stdout, prefixed with "SAX error", and
       nothing is raised.'''
    if self.raiseOnError: raise error
    # A single parenthesized argument produces the same output line with the
    # Python 2 print statement and with the Python 3 print function.
    print('%s %s' % ('SAX error', error))
def fatalError(self, error):
    '''SAX ErrorHandler hook receiving p_error, the exception describing a
       fatal parsing error. If self.raiseOnError is True, p_error is
       raised. Else, it is printed on stdout, prefixed with
       "SAX fatal error", and nothing is raised.'''
    if self.raiseOnError: raise error
    # A single parenthesized argument produces the same output line with the
    # Python 2 print statement and with the Python 3 print function.
    print('%s %s' % ('SAX fatal error', error))
def warning(self, error):
    '''SAX ErrorHandler hook receiving p_error, a parsing warning. The
       warning is ignored: nothing is raised nor printed.'''
    return None
def parse(self, xml, source='string'):
'''Parses a XML stream.