[gen] Added the possibility to cleanly extract text from XHTML field values.
This commit is contained in:
parent
d3e2478d6b
commit
a2ae839704
3 changed files with 60 additions and 2 deletions
|
@@ -206,6 +206,9 @@ def normalizeString(s, usage='fileName'):
|
|||
# We work in unicode. Convert p_s to unicode if not unicode.
|
||||
if isinstance(s, str): s = s.decode('utf-8')
|
||||
elif not isinstance(s, unicode): s = unicode(s)
|
||||
if usage == 'extractedText':
|
||||
# Replace single quotes with blanks.
|
||||
s = s.replace("'", " ").replace(u'’', ' ')
|
||||
# Remove any special char like accents.
|
||||
s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
|
||||
# Remove any other char, depending on p_usage.
|
||||
|
|
|
@@ -19,8 +19,7 @@
|
|||
|
||||
# ------------------------------------------------------------------------------
|
||||
import xml.sax, difflib, types, cgi
|
||||
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
|
||||
property_interning_dict
|
||||
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges
|
||||
from xml.sax.xmlreader import InputSource
|
||||
from xml.sax import SAXParseException
|
||||
from appy.shared import UnicodeBuffer, xmlPrologue
|
||||
|
@@ -170,6 +169,13 @@ class XmlParser(ContentHandler, ErrorHandler):
|
|||
# Put a question mark instead of raising an exception.
|
||||
self.characters('?')
|
||||
|
||||
# ErrorHandler methods ---------------------------------------------------
|
||||
# Define methods below in your subclass if you want error handling that
|
||||
# does not raise exceptions, but produces a partial result instead.
|
||||
#def error(self, error): pass
|
||||
#def fatalError(self, error): pass
|
||||
#def warning(self, error): pass
|
||||
|
||||
def parse(self, xml, source='string'):
|
||||
'''Parses a XML stream.
|
||||
* If p_source is "string", p_xml must be a string containing
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue