From e66daeb151dcf8c9b3af601c579a38b4c2f63212 Mon Sep 17 00:00:00 2001 From: Gaetan Delannay Date: Tue, 22 May 2012 16:42:20 +0200 Subject: [PATCH] [gen] More robust XHTML cleaning. --- gen/__init__.py | 8 +++++++- shared/packaging.py | 1 + shared/xml_parser.py | 47 ++++++++++++++++++++++++++++++++------------ 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/gen/__init__.py b/gen/__init__.py index 6a480e2..f44b27c 100644 --- a/gen/__init__.py +++ b/gen/__init__.py @@ -1239,7 +1239,13 @@ class String(Type): # When image upload is allowed, ckeditor inserts some "style" attrs # (ie for image size when images are resized). So in this case we # can't remove style-related information. - value = XhtmlCleaner().clean(value, keepStyles=self.richText) + try: + value = XhtmlCleaner().clean(value, keepStyles=self.richText) + except XhtmlCleaner.Error, e: + # Errors while parsing p_value can't prevent the user from + # storing it. + obj.log('Unparsable XHTML content in field "%s".' % self.name, + type='warning') Type.store(self, obj, value) def getFormattedValue(self, obj, value): diff --git a/shared/packaging.py b/shared/packaging.py index 53c5b34..9c47627 100644 --- a/shared/packaging.py +++ b/shared/packaging.py @@ -214,6 +214,7 @@ class Debianizer: f = file(name, 'w') f.write(initScript % ('oo', 'oo', 'Start OpenOffice in server mode', 'startoo', 'startoo', "#Can't stop OO.")) + f.write('\n') f.close() os.chmod(name, 0744) # Make it executable by owner. # Get the size of the app, in Kb. 
diff --git a/shared/xml_parser.py b/shared/xml_parser.py index cd8f649..c53764c 100644 --- a/shared/xml_parser.py +++ b/shared/xml_parser.py @@ -22,6 +22,7 @@ import xml.sax, difflib, types, cgi from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\ property_interning_dict from xml.sax.xmlreader import InputSource +from xml.sax import SAXParseException from appy.shared import UnicodeBuffer, xmlPrologue from appy.shared.errors import AppyError from appy.shared.utils import sequenceTypes @@ -890,13 +891,16 @@ class XmlComparator: # ------------------------------------------------------------------------------ class XhtmlCleaner(XmlParser): + '''This class cleans XHTML content, so it becomes ready to be stored into a + Appy-compliant format.''' + class Error(Exception): pass # Tags that will not be in the result, content included, if keepStyles is # False. tagsToIgnoreWithContent = ('style', 'colgroup') # Tags that will be removed from the result, but whose content will be kept, # if keepStyles is False. - tagsToIgnoreKeepContent= ('x', 'font') + tagsToIgnoreKeepContent= ('x', 'font', 'center') # All tags to ignore tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent # Attributes to ignore, if keepStyles if False. @@ -909,20 +913,33 @@ class XhtmlCleaner(XmlParser): # Tags that required a line break to be inserted after them. lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td') - '''This class has 2 objectives: + # A pre-cleaning phase consists in performing some replacements before + # running the XML SAX parsing. The dict below contains such repls. + preCleanRepls = {'&nbsp;': ' '} - 1. The main objective is to format XHTML p_s to be storable in the ZODB - according to Appy rules. - a. Every <p> or <li> must be on a single line (ending with a carriage - return); else, appy.shared.diff will not be able to compute XHTML - diffs; - b. Optimize size: HTML comments are removed. + def preClean(self, s): + '''Before true XHTML cleaning, this method performs pre-cleaning by + performing, on p_s, replacements as defined in self.preCleanRepls.''' + for item, repl in self.preCleanRepls.iteritems(): + if item in s: + s = s.replace(item, repl) + return s - 2. If p_keepStyles (or m_clean) is False, some style-related information - will be removed, in order to get a standardized content that can be - dumped in an elegant and systematic manner into a POD template. - ''' def clean(self, s, keepStyles=True): + '''Cleaning XHTML code is done for 2 reasons: + + 1. The main objective is to format XHTML p_s to be storable in the + ZODB according to Appy rules. + a. Every

<p> or <li> must be on a single line (ending with a + carriage return); else, appy.shared.diff will not be able to + compute XHTML diffs; + b. Optimize size: HTML comments are removed. + + 2. If p_keepStyles (or m_clean) is False, some style-related + information will be removed, in order to get a standardized + content that can be dumped in an elegant and systematic manner + into a POD template. + ''' # Must we keep style-related information or not? self.env.keepStyles = keepStyles self.env.currentContent = '' @@ -934,7 +951,11 @@ class XhtmlCleaner(XmlParser): # 'ignoreContent' is True if, within the currently ignored tag, we must # also ignore its content. self.env.ignoreContent = False - return self.parse('<x>%s</x>' % s).encode('utf-8') + try: + res = self.parse('<x>%s</x>' % self.preClean(s)).encode('utf-8') + except SAXParseException, e: + raise self.Error(str(e)) + return res def startDocument(self): # The result will be cleaned XHTML, joined from self.res.