[gen] Added the possibility to extract text from XHTML field values in a clean way.

2012-09-25 21:43:45 +02:00 · 2012-09-25 21:43:45 +02:00 · a2ae839704
commit a2ae839704
parent d3e2478d6b
3 changed files with 60 additions and 2 deletions
--- a/shared/utils.py
+++ b/shared/utils.py
@ -206,6 +206,9 @@ def normalizeString(s, usage='fileName'):
    # We work in unicode. Convert p_s to unicode if not unicode.
    if isinstance(s, str):           s = s.decode('utf-8')
    elif not isinstance(s, unicode): s = unicode(s)
+    if usage == 'extractedText':
+        # Replace single quotes with blanks.
+        s = s.replace("'", " ").replace(u'’', ' ')
    # Remove any special char like accents.
    s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
    # Remove any other char, depending on p_usage.
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@ -19,8 +19,7 @@

 # ------------------------------------------------------------------------------
 import xml.sax, difflib, types, cgi
-from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
-                            property_interning_dict
+from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges
 from xml.sax.xmlreader import InputSource
 from xml.sax import SAXParseException
 from appy.shared import UnicodeBuffer, xmlPrologue
@ -170,6 +169,13 @@ class XmlParser(ContentHandler, ErrorHandler):
            # Put a question mark instead of raising an exception.
            self.characters('?')

+    # ErrorHandler methods ---------------------------------------------------
+    # Define methods below in your subclass if you want error handling that
+    # does not raise exceptions, but produces a partial result instead.
+    #def error(self, error): pass
+    #def fatalError(self, error): pass
+    #def warning(self, error): pass
+
    def parse(self, xml, source='string'):
        '''Parses a XML stream.
           * If p_source is "string", p_xml must be a string containing