[gen] More robust XHTML cleaning.

This commit is contained in:
Gaetan Delannay 2012-05-22 16:42:20 +02:00
parent 36257b1b3a
commit e66daeb151
3 changed files with 42 additions and 14 deletions

View file

@ -1239,7 +1239,13 @@ class String(Type):
# When image upload is allowed, ckeditor inserts some "style" attrs # When image upload is allowed, ckeditor inserts some "style" attrs
# (ie for image size when images are resized). So in this case we # (ie for image size when images are resized). So in this case we
# can't remove style-related information. # can't remove style-related information.
value = XhtmlCleaner().clean(value, keepStyles=self.richText) try:
value = XhtmlCleaner().clean(value, keepStyles=self.richText)
except XhtmlCleaner.Error, e:
# Errors while parsing p_value can't prevent the user from
# storing it.
obj.log('Unparsable XHTML content in field "%s".' % self.name,
type='warning')
Type.store(self, obj, value) Type.store(self, obj, value)
def getFormattedValue(self, obj, value): def getFormattedValue(self, obj, value):

View file

@ -214,6 +214,7 @@ class Debianizer:
f = file(name, 'w') f = file(name, 'w')
f.write(initScript % ('oo', 'oo', 'Start OpenOffice in server mode', f.write(initScript % ('oo', 'oo', 'Start OpenOffice in server mode',
'startoo', 'startoo', "#Can't stop OO.")) 'startoo', 'startoo', "#Can't stop OO."))
f.write('\n')
f.close() f.close()
os.chmod(name, 0744) # Make it executable by owner. os.chmod(name, 0744) # Make it executable by owner.
# Get the size of the app, in Kb. # Get the size of the app, in Kb.

View file

@ -22,6 +22,7 @@ import xml.sax, difflib, types, cgi
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\ from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
property_interning_dict property_interning_dict
from xml.sax.xmlreader import InputSource from xml.sax.xmlreader import InputSource
from xml.sax import SAXParseException
from appy.shared import UnicodeBuffer, xmlPrologue from appy.shared import UnicodeBuffer, xmlPrologue
from appy.shared.errors import AppyError from appy.shared.errors import AppyError
from appy.shared.utils import sequenceTypes from appy.shared.utils import sequenceTypes
@ -890,13 +891,16 @@ class XmlComparator:
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class XhtmlCleaner(XmlParser): class XhtmlCleaner(XmlParser):
'''This class cleans XHTML content, so it becomes ready to be stored into an
Appy-compliant format.'''
class Error(Exception): pass
# Tags that will not be in the result, content included, if keepStyles is # Tags that will not be in the result, content included, if keepStyles is
# False. # False.
tagsToIgnoreWithContent = ('style', 'colgroup') tagsToIgnoreWithContent = ('style', 'colgroup')
# Tags that will be removed from the result, but whose content will be kept, # Tags that will be removed from the result, but whose content will be kept,
# if keepStyles is False. # if keepStyles is False.
tagsToIgnoreKeepContent= ('x', 'font') tagsToIgnoreKeepContent= ('x', 'font', 'center')
# All tags to ignore # All tags to ignore
tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent
# Attributes to ignore, if keepStyles is False. # Attributes to ignore, if keepStyles is False.
@ -909,20 +913,33 @@ class XhtmlCleaner(XmlParser):
# Tags that require a line break to be inserted after them. # Tags that require a line break to be inserted after them.
lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td') lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td')
'''This class has 2 objectives: # A pre-cleaning phase consists in performing some replacements before
# running the XML SAX parsing. The dict below contains such repls.
preCleanRepls = {'&nbsp;': ' '}
1. The main objective is to format XHTML p_s to be storable in the ZODB def preClean(self, s):
according to Appy rules. '''Before true XHTML cleaning, this method performs pre-cleaning by
a. Every <p> or <li> must be on a single line (ending with a carriage performing, on p_s, replacements as defined in self.preCleanRepls.'''
return); else, appy.shared.diff will not be able to compute XHTML for item, repl in self.preCleanRepls.iteritems():
diffs; if item in s:
b. Optimize size: HTML comments are removed. s = s.replace(item, repl)
return s
2. If p_keepStyles (or m_clean) is False, some style-related information
will be removed, in order to get a standardized content that can be
dumped in an elegant and systematic manner into a POD template.
'''
def clean(self, s, keepStyles=True): def clean(self, s, keepStyles=True):
'''Cleaning XHTML code is done for 2 reasons:
1. The main objective is to format XHTML p_s to be storable in the
ZODB according to Appy rules.
a. Every <p> or <li> must be on a single line (ending with a
carriage return); else, appy.shared.diff will not be able to
compute XHTML diffs;
b. Optimize size: HTML comments are removed.
2. If p_keepStyles (or m_clean) is False, some style-related
information will be removed, in order to get a standardized
content that can be dumped in an elegant and systematic manner
into a POD template.
'''
# Must we keep style-related information or not? # Must we keep style-related information or not?
self.env.keepStyles = keepStyles self.env.keepStyles = keepStyles
self.env.currentContent = '' self.env.currentContent = ''
@ -934,7 +951,11 @@ class XhtmlCleaner(XmlParser):
# 'ignoreContent' is True if, within the currently ignored tag, we must # 'ignoreContent' is True if, within the currently ignored tag, we must
# also ignore its content. # also ignore its content.
self.env.ignoreContent = False self.env.ignoreContent = False
return self.parse('<x>%s</x>' % s).encode('utf-8') try:
res = self.parse('<x>%s</x>' % self.preClean(s)).encode('utf-8')
except SAXParseException, e:
raise self.Error(str(e))
return res
def startDocument(self): def startDocument(self):
# The result will be cleaned XHTML, joined from self.res. # The result will be cleaned XHTML, joined from self.res.