[gen] More robust XHTML cleaning.

This commit is contained in:
Gaetan Delannay 2012-05-22 16:42:20 +02:00
parent 36257b1b3a
commit e66daeb151
3 changed files with 42 additions and 14 deletions

View file

@ -1239,7 +1239,13 @@ class String(Type):
# When image upload is allowed, ckeditor inserts some "style" attrs
# (ie for image size when images are resized). So in this case we
# can't remove style-related information.
value = XhtmlCleaner().clean(value, keepStyles=self.richText)
try:
value = XhtmlCleaner().clean(value, keepStyles=self.richText)
except XhtmlCleaner.Error, e:
# Errors while parsing p_value can't prevent the user from
# storing it.
obj.log('Unparsable XHTML content in field "%s".' % self.name,
type='warning')
Type.store(self, obj, value)
def getFormattedValue(self, obj, value):

View file

@ -214,6 +214,7 @@ class Debianizer:
f = file(name, 'w')
f.write(initScript % ('oo', 'oo', 'Start OpenOffice in server mode',
'startoo', 'startoo', "#Can't stop OO."))
f.write('\n')
f.close()
os.chmod(name, 0744) # Make it executable by owner.
# Get the size of the app, in Kb.

View file

@ -22,6 +22,7 @@ import xml.sax, difflib, types, cgi
from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
property_interning_dict
from xml.sax.xmlreader import InputSource
from xml.sax import SAXParseException
from appy.shared import UnicodeBuffer, xmlPrologue
from appy.shared.errors import AppyError
from appy.shared.utils import sequenceTypes
@ -890,13 +891,16 @@ class XmlComparator:
# ------------------------------------------------------------------------------
class XhtmlCleaner(XmlParser):
'''This class cleans XHTML content, so it becomes ready to be stored into an
Appy-compliant format.'''
class Error(Exception):
    '''Raised when some XHTML content can't be parsed, and thus can't be
       cleaned.'''
# Tags that will not be in the result, content included, if keepStyles is
# False.
tagsToIgnoreWithContent = ('style', 'colgroup')
# Tags that will be removed from the result, but whose content will be kept,
# if keepStyles is False.
tagsToIgnoreKeepContent= ('x', 'font')
tagsToIgnoreKeepContent= ('x', 'font', 'center')
# All tags to ignore
tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent
# Attributes to ignore, if keepStyles is False.
@ -909,20 +913,33 @@ class XhtmlCleaner(XmlParser):
# Tags that require a line break to be inserted after them.
lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td')
'''This class has 2 objectives:
# A pre-cleaning phase consists in performing some replacements before
# running the XML SAX parsing. The dict below contains such repls.
preCleanRepls = {' ': ' '}
1. The main objective is to format XHTML p_s to be storable in the ZODB
according to Appy rules.
a. Every <p> or <li> must be on a single line (ending with a carriage
return); else, appy.shared.diff will not be able to compute XHTML
diffs;
b. Optimize size: HTML comments are removed.
def preClean(self, s):
    '''Before true XHTML cleaning, this method performs pre-cleaning on p_s:
       every key of self.preCleanRepls found in p_s is replaced with its
       corresponding value. The (possibly modified) string is returned.'''
    # "items" is used instead of the Python 2-only "iteritems": it behaves
    # identically here and works on any Python version.
    for item, repl in self.preCleanRepls.items():
        # No preliminary "item in s" test is needed: str.replace returns the
        # string unchanged when the searched substring is absent.
        s = s.replace(item, repl)
    return s
2. If p_keepStyles (or m_clean) is False, some style-related information
will be removed, in order to get a standardized content that can be
dumped in an elegant and systematic manner into a POD template.
'''
def clean(self, s, keepStyles=True):
'''Cleaning XHTML code is done for 2 reasons:
1. The main objective is to format XHTML p_s to be storable in the
ZODB according to Appy rules.
a. Every <p> or <li> must be on a single line (ending with a
carriage return); else, appy.shared.diff will not be able to
compute XHTML diffs;
b. Optimize size: HTML comments are removed.
2. If p_keepStyles (or m_clean) is False, some style-related
information will be removed, in order to get a standardized
content that can be dumped in an elegant and systematic manner
into a POD template.
'''
# Must we keep style-related information or not?
self.env.keepStyles = keepStyles
self.env.currentContent = ''
@ -934,7 +951,11 @@ class XhtmlCleaner(XmlParser):
# 'ignoreContent' is True if, within the currently ignored tag, we must
# also ignore its content.
self.env.ignoreContent = False
return self.parse('<x>%s</x>' % s).encode('utf-8')
try:
res = self.parse('<x>%s</x>' % self.preClean(s)).encode('utf-8')
except SAXParseException, e:
raise self.Error(str(e))
return res
def startDocument(self):
# The result will be cleaned XHTML, joined from self.res.