[shared] xml_parser.XmlParser: added param 'raiseOnError', which controls whether an exception is raised when a SAX fatal parsing error is encountered; [gen] fine-tuned the indexing machinery with more accurate text extraction from text and xhtml fields.
This commit is contained in:
parent
a2ae839704
commit
8d1a88bd27
7 changed files with 164 additions and 89 deletions
|
@ -227,6 +227,13 @@ def normalizeString(s, usage='fileName'):
|
|||
res = s
|
||||
return res
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
def normalizeText(s):
    '''Prepares p_s for indexing purposes: it is passed through
       normalizeString (usage "extractedText") to get rid of special chars,
       then surrounding whitespace is stripped and the result is
       lowercased.'''
    cleaned = normalizeString(s, usage='extractedText')
    trimmed = cleaned.strip()
    return trimmed.lower()
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
def formatNumber(n, sep=',', precision=2, tsep=' '):
|
||||
'''Returns a string representation of number p_n, which can be a float
|
||||
or integer. p_sep is the decimal separator to use. p_precision is the
|
||||
|
|
|
@ -125,7 +125,7 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
- remembering the currently parsed element;
|
||||
- managing namespace declarations.
|
||||
This parser also knows about HTML entities.'''
|
||||
def __init__(self, env=None, caller=None):
|
||||
def __init__(self, env=None, caller=None, raiseOnError=True):
|
||||
'''p_env should be an instance of a class that inherits from
|
||||
XmlEnvironment: it specifies the environment to use for this SAX
|
||||
parser.'''
|
||||
|
@ -136,6 +136,8 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
self.caller = caller # The class calling this parser
|
||||
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
||||
self.res = None # The result of parsing.
|
||||
# Whether to raise an exception when a parsing error is encountered.
|
||||
self.raiseOnError = raiseOnError
|
||||
|
||||
# ContentHandler methods ---------------------------------------------------
|
||||
def startDocument(self):
|
||||
|
@ -170,11 +172,13 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
self.characters('?')
|
||||
|
||||
# ErrorHandler methods ---------------------------------------------------
|
||||
# Define methods below in your subclass if you want error handling that
|
||||
# does not raise exceptions, but produces a partial result instead.
|
||||
#def error(self, error): pass
|
||||
#def fatalError(self, error): pass
|
||||
#def warning(self, error): pass
|
||||
def error(self, error):
    '''SAX ErrorHandler callback, called with the exception object p_error.

       If self.raiseOnError is True, p_error is re-raised. Otherwise, it is
       reported on stdout, prefixed with "SAX error", and the method returns
       None.'''
    if self.raiseOnError:
        raise error
    # sys.stdout.write is used instead of the "print" statement: it produces
    # the same output but is valid syntax on both Python 2 and Python 3.
    import sys
    sys.stdout.write('SAX error %s\n' % (error,))
|
||||
def fatalError(self, error):
    '''SAX ErrorHandler callback for fatal errors, called with the exception
       object p_error.

       If self.raiseOnError is True, p_error is re-raised. Otherwise, it is
       reported on stdout, prefixed with "SAX fatal error", and the method
       returns None.'''
    if self.raiseOnError:
        raise error
    # sys.stdout.write is used instead of the "print" statement: it produces
    # the same output but is valid syntax on both Python 2 and Python 3.
    import sys
    sys.stdout.write('SAX fatal error %s\n' % (error,))
|
||||
def warning(self, error):
    '''SAX ErrorHandler callback for warnings: p_error is ignored.'''
    return None
|
||||
|
||||
def parse(self, xml, source='string'):
|
||||
'''Parses a XML stream.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue