Implemented management of HTML entities with expat (no need to use an external validating parser from _xmlplus anymore). Works with Python 2.4.4 and 2.4.6, but not from Python 2.5 onwards.

2010-12-23 10:25:27 +01:00 · 2010-12-23 10:25:27 +01:00 · f3604624de
commit f3604624de
parent a30949a621
8 changed files with 1723 additions and 1721 deletions
--- a/pod/init.py
+++ b/pod/init.py
@ -29,7 +29,6 @@ XHTML_INNER_TAGS = ('b', 'i', 'u', 'em')
 XHTML_UNSTYLABLE_TAGS = XHTML_LISTS + ('li', 'a')
 XML_SPECIAL_CHARS = {'<': '&lt;', '>': '&gt;', '&': '&amp;', '"': '&quot;',
                     "'": '&apos;'}
-XML_ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}

 # ------------------------------------------------------------------------------
 class PodError(Exception):
--- a/pod/test/Tests.rtf
+++ b/pod/test/Tests.rtf
--- a/pod/test/results/xhtmlComplex.odt
+++ b/pod/test/results/xhtmlComplex.odt
--- a/pod/test/results/xhtmlEntities.odt
+++ b/pod/test/results/xhtmlEntities.odt
--- a/pod/test/results/xhtmlNominal.odt
+++ b/pod/test/results/xhtmlNominal.odt
--- a/pod/xhtml2odt.py
+++ b/pod/xhtml2odt.py
@ -37,39 +37,6 @@ NOT_INSIDE_P = XHTML_HEADINGS + XHTML_LISTS + ('table',) # Those elements
 # can't be rendered inside paragraphs.
 NOT_INSIDE_LIST = ('table',)
 IGNORABLE_TAGS = ('meta', 'title', 'style')
-HTML_ENTITIES = {
-        'iexcl': '¡',  'cent': '¢', 'pound': '£', 'curren': '€', 'yen': '¥',
-        'brvbar': 'Š', 'sect': '§', 'uml': '¨', 'copy':'©', 'ordf':'ª',
-        'laquo':'«', 'not':'¬', 'shy':'', 'reg':'®', 'macr':'¯', 'deg':'°',
-        'plusmn':'±', 'sup2':'²', 'sup3':'³', 'acute':'Ž',
-        'micro':'µ', 'para':'¶', 'middot':'·', 'cedil':'ž', 'sup1':'¹',
-        'ordm':'º', 'raquo':'»', 'frac14':'Œ', 'frac12':'œ', 'frac34':'Ÿ',
-        'iquest':'¿', 'Agrave':'À', 'Aacute':'Á', 'Acirc':'Â', 'Atilde':'Ã',
-        'Auml':'Ä', 'Aring':'Å', 'AElig':'Æ', 'Ccedil':'Ç', 'Egrave':'È',
-        'Eacute':'É', 'Ecirc':'Ê', 'Euml':'Ë', 'Igrave':'Ì', 'Iacute':'Í',
-        'Icirc':'Î', 'Iuml':'Ï', 'ETH':'Ð', 'Ntilde':'Ñ', 'Ograve':'Ò',
-        'Oacute':'Ó', 'Ocirc':'Ó', 'Otilde':'Õ', 'Ouml':'Ö', 'times':'×',
-        'Oslash':'Ø', 'Ugrave':'Ù', 'Uacute':'Ú', 'Ucirc':'Û', 'Uuml':'Ü',
-        'Yacute':'Ý', 'THORN':'Þ', 'szlig':'ß', 'agrave':'à', 'aacute':'á',
-        'acirc':'â', 'atilde':'ã', 'auml':'ä', 'aring':'å', 'aelig':'æ',
-        'ccedil':'ç', 'egrave':'è', 'eacute':'é', 'ecirc':'ê', 'euml':'ë',
-        'igrave':'ì', 'iacute':'í', 'icirc':'î', 'iuml':'ï', 'eth':'ð',
-        'ntilde':'ñ', 'ograve':'ò', 'oacute':'ó', 'ocirc':'ô', 'otilde':'õ',
-        'ouml':'ö', 'divide':'÷', 'oslash':'ø', 'ugrave':'ù', 'uacute':'ú',
-        'ucirc':'û', 'uuml':'ü', 'yacute':'ý', 'thorn':'þ', 'yuml':'ÿ',
-        'euro':'€', 'nbsp':' ', "rsquo":"'", "lsquo":"'", "ldquo":"'",
-        "rdquo":"'", 'ndash': ' ', 'oelig':'oe', 'quot': "'", 'mu': 'µ'}
-import htmlentitydefs
-for k, v in htmlentitydefs.entitydefs.iteritems():
-    if not HTML_ENTITIES.has_key(k) and not XML_ENTITIES.has_key(k):
-        HTML_ENTITIES[k] = ''
-
-# ------------------------------------------------------------------------------
-class Entity:
-    def __init__(self, name, value):
-        self.name = name
-        self.value = value.decode('utf-8')
-    def is_internal(self): return True

 # ------------------------------------------------------------------------------
 class HtmlElement:
@ -397,21 +364,6 @@ class XhtmlEnvironment(XmlEnvironment):

 # ------------------------------------------------------------------------------
 class XhtmlParser(XmlParser):
-    # Initialize entities recognized by this parser
-    entities = {}
-    for name, value in HTML_ENTITIES.iteritems():
-        entities[name] = Entity(name, value)
-
-    def __init__(self, *args, **kwargs):
-        XmlParser.__init__(self, *args, **kwargs)
-        # We override self.parser: we will use a different low-level
-        # xml.sax parser because we need to be able to tackle HTML as well as
-        # XML entities.
-        self.parser = xml.sax.make_parser(["xml.sax.drivers2.drv_xmlproc"])
-        # This parser is maybe less performant than the standard expat parser
-        # coded in C, but I could not find a way to manage unknown entities
-        # with the expat parser.
-
    def lowerizeInput(self, elem, attrs=None):
        '''Because (X)HTML is case insensitive, we may receive input p_elem and
           p_attrs in lower-, upper- or mixed-case. So here we produce lowercase
@ -427,15 +379,6 @@ class XhtmlParser(XmlParser):
        else:
            return resElem, resAttrs

-    def startDocument(self):
-        if hasattr(self.parser._parser, 'dtd'):
-            # If the parser is the standard expat, we can't deal with XHTML
-            # entities
-            dtd = self.parser._parser.dtd
-            # Add to the list of known entities the list of XHMLT entities.
-            # dtd.gen_ents only contains the 5 XML entities by default.
-            dtd.gen_ents.update(self.entities)
-
    def startElement(self, elem, attrs):
        elem, attrs = self.lowerizeInput(elem, attrs)
        e = XmlParser.startElement(self, elem, attrs)
--- a/shared/test.py
+++ b/shared/test.py
@ -104,7 +104,7 @@ class Test:
            if os.path.exists(expectedFlavourSpecific):
                expected = expectedFlavourSpecific
        # Perform the comparison
-        comparator = XmlComparator(expected, actual, areXml, xmlTagsToIgnore,
+        comparator = XmlComparator(actual, expected, areXml, xmlTagsToIgnore,
                                   xmlAttrsToIgnore)
        return not comparator.filesAreIdentical(
            report=self.report, encoding=encoding)
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # ------------------------------------------------------------------------------
 # Appy is a framework for building applications in the Python language.
 # Copyright (C) 2007 Gaetan Delannay
@ -18,16 +19,44 @@

 # ------------------------------------------------------------------------------
 import xml.sax, difflib, types
-from xml.sax.handler import ContentHandler, ErrorHandler
+from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
+                            property_interning_dict
 from xml.sax.xmlreader import InputSource
 from appy.shared import UnicodeBuffer, xmlPrologue
 from appy.shared.errors import AppyError

-# Error-related constants ------------------------------------------------------
+# Constants --------------------------------------------------------------------
 CONVERSION_ERROR = '"%s" value "%s" could not be converted by the XML ' \
                   'unmarshaller.'
 CUSTOM_CONVERSION_ERROR = 'Custom converter for "%s" values produced an ' \
                          'error while converting value "%s". %s'
+XML_ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': "'", 'apos': "'"}
+HTML_ENTITIES = {
+        'iexcl': '¡',  'cent': '¢', 'pound': '£', 'curren': '€', 'yen': '¥',
+        'brvbar': 'Š', 'sect': '§', 'uml': '¨', 'copy':'©', 'ordf':'ª',
+        'laquo':'«', 'not':'¬', 'shy':'', 'reg':'®', 'macr':'¯', 'deg':'°',
+        'plusmn':'±', 'sup2':'²', 'sup3':'³', 'acute':'Ž',
+        'micro':'µ', 'para':'¶', 'middot':'·', 'cedil':'ž', 'sup1':'¹',
+        'ordm':'º', 'raquo':'»', 'frac14':'Œ', 'frac12':'œ', 'frac34':'Ÿ',
+        'iquest':'¿', 'Agrave':'À', 'Aacute':'Á', 'Acirc':'Â', 'Atilde':'Ã',
+        'Auml':'Ä', 'Aring':'Å', 'AElig':'Æ', 'Ccedil':'Ç', 'Egrave':'È',
+        'Eacute':'É', 'Ecirc':'Ê', 'Euml':'Ë', 'Igrave':'Ì', 'Iacute':'Í',
+        'Icirc':'Î', 'Iuml':'Ï', 'ETH':'Ð', 'Ntilde':'Ñ', 'Ograve':'Ò',
+        'Oacute':'Ó', 'Ocirc':'Ó', 'Otilde':'Õ', 'Ouml':'Ö', 'times':'×',
+        'Oslash':'Ø', 'Ugrave':'Ù', 'Uacute':'Ú', 'Ucirc':'Û', 'Uuml':'Ü',
+        'Yacute':'Ý', 'THORN':'Þ', 'szlig':'ß', 'agrave':'à', 'aacute':'á',
+        'acirc':'â', 'atilde':'ã', 'auml':'ä', 'aring':'å', 'aelig':'æ',
+        'ccedil':'ç', 'egrave':'è', 'eacute':'é', 'ecirc':'ê', 'euml':'ë',
+        'igrave':'ì', 'iacute':'í', 'icirc':'î', 'iuml':'ï', 'eth':'ð',
+        'ntilde':'ñ', 'ograve':'ò', 'oacute':'ó', 'ocirc':'ô', 'otilde':'õ',
+        'ouml':'ö', 'divide':'÷', 'oslash':'ø', 'ugrave':'ù', 'uacute':'ú',
+        'ucirc':'û', 'uuml':'ü', 'yacute':'ý', 'thorn':'þ', 'yuml':'ÿ',
+        'euro':'€', 'nbsp':' ', "rsquo":"'", "lsquo":"'", "ldquo":"'",
+        "rdquo":"'", 'ndash': ' ', 'oelig':'oe', 'quot': "'", 'mu': 'µ'}
+import htmlentitydefs
+for k, v in htmlentitydefs.entitydefs.iteritems():
+    if not HTML_ENTITIES.has_key(k) and not XML_ENTITIES.has_key(k):
+        HTML_ENTITIES[k] = ''

 # ------------------------------------------------------------------------------
 class XmlElement:
@ -90,9 +119,10 @@ class XmlEnvironment:
        return self.namespaces[nsUri]

 class XmlParser(ContentHandler, ErrorHandler):
-    '''Basic XML content handler that does things like :
+    '''Basic expat-based XML parser that does things like :
      - remembering the currently parsed element;
-      - managing namespace declarations.'''
+      - managing namespace declarations.
+      This parser also knows about HTML entities.'''
    def __init__(self, env=None, caller=None):
        '''p_env should be an instance of a class that inherits from
           XmlEnvironment: it specifies the environment to use for this SAX
@ -104,6 +134,10 @@ class XmlParser(ContentHandler, ErrorHandler):
        self.caller = caller # The class calling this parser
        self.parser = xml.sax.make_parser() # Fast, standard expat parser
        self.res = None # The result of parsing.
+
+    # ContentHandler methods ---------------------------------------------------
+    def startDocument(self):
+        self.parser._parser.UseForeignDTD(True)
    def setDocumentLocator(self, locator):
        self.locator = locator
        return self.env
@ -123,14 +157,25 @@ class XmlParser(ContentHandler, ErrorHandler):
        return self.env
    def characters(self, content):
        return self.env
+
+    def skippedEntity(self, name):
+        '''This method is called every time expat does not recognize an entity.
+           We provide here support for HTML entities.'''
+        if HTML_ENTITIES.has_key(name):
+            self.characters(HTML_ENTITIES[name].decode('utf-8'))
+        else:
+            # Put a question mark instead of raising an exception.
+            self.characters('?')
+
    def parse(self, xmlContent, source='string'):
-        '''Parsers the XML file or string p_xmlContent.'''
+        '''Parses the XML file or string p_xmlContent.'''
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        self.parser.setContentHandler(self)
        self.parser.setErrorHandler(self)
+        self.parser.setFeature(feature_external_ges, False)
        inputSource = InputSource()
        if source == 'string':
            inputSource.setByteStream(StringIO(xmlContent))