Implemented management of HTML entities with expat (no need to get an external validating parser from _xmlplus). Works with Python 2.4.4 and 2.4.6, but not from Python 2.5 onwards.
This commit is contained in:
parent
a30949a621
commit
f3604624de
|
@ -29,7 +29,6 @@ XHTML_INNER_TAGS = ('b', 'i', 'u', 'em')
|
||||||
XHTML_UNSTYLABLE_TAGS = XHTML_LISTS + ('li', 'a')
|
XHTML_UNSTYLABLE_TAGS = XHTML_LISTS + ('li', 'a')
|
||||||
XML_SPECIAL_CHARS = {'<': '&lt;', '>': '&gt;', '&': '&amp;', '"': '&quot;',
|
XML_SPECIAL_CHARS = {'<': '&lt;', '>': '&gt;', '&': '&amp;', '"': '&quot;',
|
||||||
"'": '&apos;'}
|
"'": '&apos;'}
|
||||||
XML_ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class PodError(Exception):
|
class PodError(Exception):
|
||||||
|
|
3323
pod/test/Tests.rtf
3323
pod/test/Tests.rtf
File diff suppressed because it is too large
Load diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -37,39 +37,6 @@ NOT_INSIDE_P = XHTML_HEADINGS + XHTML_LISTS + ('table',) # Those elements
|
||||||
# can't be rendered inside paragraphs.
|
# can't be rendered inside paragraphs.
|
||||||
NOT_INSIDE_LIST = ('table',)
|
NOT_INSIDE_LIST = ('table',)
|
||||||
IGNORABLE_TAGS = ('meta', 'title', 'style')
|
IGNORABLE_TAGS = ('meta', 'title', 'style')
|
||||||
HTML_ENTITIES = {
|
|
||||||
'iexcl': '¡', 'cent': '¢', 'pound': '£', 'curren': '€', 'yen': '¥',
|
|
||||||
'brvbar': 'Š', 'sect': '§', 'uml': '¨', 'copy':'©', 'ordf':'ª',
|
|
||||||
'laquo':'«', 'not':'¬', 'shy':'', 'reg':'®', 'macr':'¯', 'deg':'°',
|
|
||||||
'plusmn':'±', 'sup2':'²', 'sup3':'³', 'acute':'Ž',
|
|
||||||
'micro':'µ', 'para':'¶', 'middot':'·', 'cedil':'ž', 'sup1':'¹',
|
|
||||||
'ordm':'º', 'raquo':'»', 'frac14':'Œ', 'frac12':'œ', 'frac34':'Ÿ',
|
|
||||||
'iquest':'¿', 'Agrave':'À', 'Aacute':'Á', 'Acirc':'Â', 'Atilde':'Ã',
|
|
||||||
'Auml':'Ä', 'Aring':'Å', 'AElig':'Æ', 'Ccedil':'Ç', 'Egrave':'È',
|
|
||||||
'Eacute':'É', 'Ecirc':'Ê', 'Euml':'Ë', 'Igrave':'Ì', 'Iacute':'Í',
|
|
||||||
'Icirc':'Î', 'Iuml':'Ï', 'ETH':'Ð', 'Ntilde':'Ñ', 'Ograve':'Ò',
|
|
||||||
'Oacute':'Ó', 'Ocirc':'Ó', 'Otilde':'Õ', 'Ouml':'Ö', 'times':'×',
|
|
||||||
'Oslash':'Ø', 'Ugrave':'Ù', 'Uacute':'Ú', 'Ucirc':'Û', 'Uuml':'Ü',
|
|
||||||
'Yacute':'Ý', 'THORN':'Þ', 'szlig':'ß', 'agrave':'à', 'aacute':'á',
|
|
||||||
'acirc':'â', 'atilde':'ã', 'auml':'ä', 'aring':'å', 'aelig':'æ',
|
|
||||||
'ccedil':'ç', 'egrave':'è', 'eacute':'é', 'ecirc':'ê', 'euml':'ë',
|
|
||||||
'igrave':'ì', 'iacute':'í', 'icirc':'î', 'iuml':'ï', 'eth':'ð',
|
|
||||||
'ntilde':'ñ', 'ograve':'ò', 'oacute':'ó', 'ocirc':'ô', 'otilde':'õ',
|
|
||||||
'ouml':'ö', 'divide':'÷', 'oslash':'ø', 'ugrave':'ù', 'uacute':'ú',
|
|
||||||
'ucirc':'û', 'uuml':'ü', 'yacute':'ý', 'thorn':'þ', 'yuml':'ÿ',
|
|
||||||
'euro':'€', 'nbsp':' ', "rsquo":"'", "lsquo":"'", "ldquo":"'",
|
|
||||||
"rdquo":"'", 'ndash': ' ', 'oelig':'oe', 'quot': "'", 'mu': 'µ'}
|
|
||||||
import htmlentitydefs
|
|
||||||
for k, v in htmlentitydefs.entitydefs.iteritems():
|
|
||||||
if not HTML_ENTITIES.has_key(k) and not XML_ENTITIES.has_key(k):
|
|
||||||
HTML_ENTITIES[k] = ''
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
|
||||||
class Entity:
|
|
||||||
def __init__(self, name, value):
|
|
||||||
self.name = name
|
|
||||||
self.value = value.decode('utf-8')
|
|
||||||
def is_internal(self): return True
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class HtmlElement:
|
class HtmlElement:
|
||||||
|
@ -397,21 +364,6 @@ class XhtmlEnvironment(XmlEnvironment):
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class XhtmlParser(XmlParser):
|
class XhtmlParser(XmlParser):
|
||||||
# Initialize entities recognized by this parser
|
|
||||||
entities = {}
|
|
||||||
for name, value in HTML_ENTITIES.iteritems():
|
|
||||||
entities[name] = Entity(name, value)
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
XmlParser.__init__(self, *args, **kwargs)
|
|
||||||
# We override self.parser: we will use a different low-level
|
|
||||||
# xml.sax parser because we need to be able to tackle HTML as well as
|
|
||||||
# XML entities.
|
|
||||||
self.parser = xml.sax.make_parser(["xml.sax.drivers2.drv_xmlproc"])
|
|
||||||
# This parser is maybe less performant than the standard expat parser
|
|
||||||
# coded in C, but I could not find a way to manage unknown entities
|
|
||||||
# with the expat parser.
|
|
||||||
|
|
||||||
def lowerizeInput(self, elem, attrs=None):
|
def lowerizeInput(self, elem, attrs=None):
|
||||||
'''Because (X)HTML is case insensitive, we may receive input p_elem and
|
'''Because (X)HTML is case insensitive, we may receive input p_elem and
|
||||||
p_attrs in lower-, upper- or mixed-case. So here we produce lowercase
|
p_attrs in lower-, upper- or mixed-case. So here we produce lowercase
|
||||||
|
@ -427,15 +379,6 @@ class XhtmlParser(XmlParser):
|
||||||
else:
|
else:
|
||||||
return resElem, resAttrs
|
return resElem, resAttrs
|
||||||
|
|
||||||
def startDocument(self):
|
|
||||||
if hasattr(self.parser._parser, 'dtd'):
|
|
||||||
# If the parser is the standard expat, we can't deal with XHTML
|
|
||||||
# entities
|
|
||||||
dtd = self.parser._parser.dtd
|
|
||||||
# Add to the list of known entities the list of XHMLT entities.
|
|
||||||
# dtd.gen_ents only contains the 5 XML entities by default.
|
|
||||||
dtd.gen_ents.update(self.entities)
|
|
||||||
|
|
||||||
def startElement(self, elem, attrs):
|
def startElement(self, elem, attrs):
|
||||||
elem, attrs = self.lowerizeInput(elem, attrs)
|
elem, attrs = self.lowerizeInput(elem, attrs)
|
||||||
e = XmlParser.startElement(self, elem, attrs)
|
e = XmlParser.startElement(self, elem, attrs)
|
||||||
|
|
|
@ -104,8 +104,8 @@ class Test:
|
||||||
if os.path.exists(expectedFlavourSpecific):
|
if os.path.exists(expectedFlavourSpecific):
|
||||||
expected = expectedFlavourSpecific
|
expected = expectedFlavourSpecific
|
||||||
# Perform the comparison
|
# Perform the comparison
|
||||||
comparator = XmlComparator(expected, actual, areXml, xmlTagsToIgnore,
|
comparator = XmlComparator(actual, expected, areXml, xmlTagsToIgnore,
|
||||||
xmlAttrsToIgnore)
|
xmlAttrsToIgnore)
|
||||||
return not comparator.filesAreIdentical(
|
return not comparator.filesAreIdentical(
|
||||||
report=self.report, encoding=encoding)
|
report=self.report, encoding=encoding)
|
||||||
def run(self):
|
def run(self):
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
# Appy is a framework for building applications in the Python language.
|
# Appy is a framework for building applications in the Python language.
|
||||||
# Copyright (C) 2007 Gaetan Delannay
|
# Copyright (C) 2007 Gaetan Delannay
|
||||||
|
@ -18,16 +19,44 @@
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
import xml.sax, difflib, types
|
import xml.sax, difflib, types
|
||||||
from xml.sax.handler import ContentHandler, ErrorHandler
|
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
|
||||||
|
property_interning_dict
|
||||||
from xml.sax.xmlreader import InputSource
|
from xml.sax.xmlreader import InputSource
|
||||||
from appy.shared import UnicodeBuffer, xmlPrologue
|
from appy.shared import UnicodeBuffer, xmlPrologue
|
||||||
from appy.shared.errors import AppyError
|
from appy.shared.errors import AppyError
|
||||||
|
|
||||||
# Error-related constants ------------------------------------------------------
|
# Constants --------------------------------------------------------------------
|
||||||
CONVERSION_ERROR = '"%s" value "%s" could not be converted by the XML ' \
|
CONVERSION_ERROR = '"%s" value "%s" could not be converted by the XML ' \
|
||||||
'unmarshaller.'
|
'unmarshaller.'
|
||||||
CUSTOM_CONVERSION_ERROR = 'Custom converter for "%s" values produced an ' \
|
CUSTOM_CONVERSION_ERROR = 'Custom converter for "%s" values produced an ' \
|
||||||
'error while converting value "%s". %s'
|
'error while converting value "%s". %s'
|
||||||
|
XML_ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': "'", 'apos': "'"}
|
||||||
|
HTML_ENTITIES = {
|
||||||
|
'iexcl': '¡', 'cent': '¢', 'pound': '£', 'curren': '€', 'yen': '¥',
|
||||||
|
'brvbar': 'Š', 'sect': '§', 'uml': '¨', 'copy':'©', 'ordf':'ª',
|
||||||
|
'laquo':'«', 'not':'¬', 'shy':'', 'reg':'®', 'macr':'¯', 'deg':'°',
|
||||||
|
'plusmn':'±', 'sup2':'²', 'sup3':'³', 'acute':'Ž',
|
||||||
|
'micro':'µ', 'para':'¶', 'middot':'·', 'cedil':'ž', 'sup1':'¹',
|
||||||
|
'ordm':'º', 'raquo':'»', 'frac14':'Œ', 'frac12':'œ', 'frac34':'Ÿ',
|
||||||
|
'iquest':'¿', 'Agrave':'À', 'Aacute':'Á', 'Acirc':'Â', 'Atilde':'Ã',
|
||||||
|
'Auml':'Ä', 'Aring':'Å', 'AElig':'Æ', 'Ccedil':'Ç', 'Egrave':'È',
|
||||||
|
'Eacute':'É', 'Ecirc':'Ê', 'Euml':'Ë', 'Igrave':'Ì', 'Iacute':'Í',
|
||||||
|
'Icirc':'Î', 'Iuml':'Ï', 'ETH':'Ð', 'Ntilde':'Ñ', 'Ograve':'Ò',
|
||||||
|
'Oacute':'Ó', 'Ocirc':'Ó', 'Otilde':'Õ', 'Ouml':'Ö', 'times':'×',
|
||||||
|
'Oslash':'Ø', 'Ugrave':'Ù', 'Uacute':'Ú', 'Ucirc':'Û', 'Uuml':'Ü',
|
||||||
|
'Yacute':'Ý', 'THORN':'Þ', 'szlig':'ß', 'agrave':'à', 'aacute':'á',
|
||||||
|
'acirc':'â', 'atilde':'ã', 'auml':'ä', 'aring':'å', 'aelig':'æ',
|
||||||
|
'ccedil':'ç', 'egrave':'è', 'eacute':'é', 'ecirc':'ê', 'euml':'ë',
|
||||||
|
'igrave':'ì', 'iacute':'í', 'icirc':'î', 'iuml':'ï', 'eth':'ð',
|
||||||
|
'ntilde':'ñ', 'ograve':'ò', 'oacute':'ó', 'ocirc':'ô', 'otilde':'õ',
|
||||||
|
'ouml':'ö', 'divide':'÷', 'oslash':'ø', 'ugrave':'ù', 'uacute':'ú',
|
||||||
|
'ucirc':'û', 'uuml':'ü', 'yacute':'ý', 'thorn':'þ', 'yuml':'ÿ',
|
||||||
|
'euro':'€', 'nbsp':' ', "rsquo":"'", "lsquo":"'", "ldquo":"'",
|
||||||
|
"rdquo":"'", 'ndash': ' ', 'oelig':'oe', 'quot': "'", 'mu': 'µ'}
|
||||||
|
import htmlentitydefs
|
||||||
|
for k, v in htmlentitydefs.entitydefs.iteritems():
|
||||||
|
if not HTML_ENTITIES.has_key(k) and not XML_ENTITIES.has_key(k):
|
||||||
|
HTML_ENTITIES[k] = ''
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
class XmlElement:
|
class XmlElement:
|
||||||
|
@ -90,9 +119,10 @@ class XmlEnvironment:
|
||||||
return self.namespaces[nsUri]
|
return self.namespaces[nsUri]
|
||||||
|
|
||||||
class XmlParser(ContentHandler, ErrorHandler):
|
class XmlParser(ContentHandler, ErrorHandler):
|
||||||
'''Basic XML content handler that does things like :
|
'''Basic expat-based XML parser that does things like :
|
||||||
- remembering the currently parsed element;
|
- remembering the currently parsed element;
|
||||||
- managing namespace declarations.'''
|
- managing namespace declarations.
|
||||||
|
This parser also knows about HTML entities.'''
|
||||||
def __init__(self, env=None, caller=None):
|
def __init__(self, env=None, caller=None):
|
||||||
'''p_env should be an instance of a class that inherits from
|
'''p_env should be an instance of a class that inherits from
|
||||||
XmlEnvironment: it specifies the environment to use for this SAX
|
XmlEnvironment: it specifies the environment to use for this SAX
|
||||||
|
@ -104,6 +134,10 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
self.caller = caller # The class calling this parser
|
self.caller = caller # The class calling this parser
|
||||||
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
self.parser = xml.sax.make_parser() # Fast, standard expat parser
|
||||||
self.res = None # The result of parsing.
|
self.res = None # The result of parsing.
|
||||||
|
|
||||||
|
# ContentHandler methods ---------------------------------------------------
|
||||||
|
def startDocument(self):
|
||||||
|
self.parser._parser.UseForeignDTD(True)
|
||||||
def setDocumentLocator(self, locator):
|
def setDocumentLocator(self, locator):
|
||||||
self.locator = locator
|
self.locator = locator
|
||||||
return self.env
|
return self.env
|
||||||
|
@ -123,14 +157,25 @@ class XmlParser(ContentHandler, ErrorHandler):
|
||||||
return self.env
|
return self.env
|
||||||
def characters(self, content):
|
def characters(self, content):
|
||||||
return self.env
|
return self.env
|
||||||
|
|
||||||
|
def skippedEntity(self, name):
|
||||||
|
'''This method is called every time expat does not recognize an entity.
|
||||||
|
We provide here support for HTML entities.'''
|
||||||
|
if HTML_ENTITIES.has_key(name):
|
||||||
|
self.characters(HTML_ENTITIES[name].decode('utf-8'))
|
||||||
|
else:
|
||||||
|
# Put a question mark instead of raising an exception.
|
||||||
|
self.characters('?')
|
||||||
|
|
||||||
def parse(self, xmlContent, source='string'):
|
def parse(self, xmlContent, source='string'):
|
||||||
'''Parsers the XML file or string p_xmlContent.'''
|
'''Parses the XML file or string p_xmlContent.'''
|
||||||
try:
|
try:
|
||||||
from cStringIO import StringIO
|
from cStringIO import StringIO
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from StringIO import StringIO
|
from StringIO import StringIO
|
||||||
self.parser.setContentHandler(self)
|
self.parser.setContentHandler(self)
|
||||||
self.parser.setErrorHandler(self)
|
self.parser.setErrorHandler(self)
|
||||||
|
self.parser.setFeature(feature_external_ges, False)
|
||||||
inputSource = InputSource()
|
inputSource = InputSource()
|
||||||
if source == 'string':
|
if source == 'string':
|
||||||
inputSource.setByteStream(StringIO(xmlContent))
|
inputSource.setByteStream(StringIO(xmlContent))
|
||||||
|
|
Loading…
Reference in a new issue