From e66daeb151dcf8c9b3af601c579a38b4c2f63212 Mon Sep 17 00:00:00 2001
From: Gaetan Delannay <gaetan.delannay@gmail.com>
Date: Tue, 22 May 2012 16:42:20 +0200
Subject: [PATCH] [gen] More robust XHTML cleaning.

---
 gen/__init__.py      |  8 +++++++-
 shared/packaging.py  |  1 +
 shared/xml_parser.py | 47 ++++++++++++++++++++++++++++++++------------
 3 files changed, 42 insertions(+), 14 deletions(-)
diff --git a/gen/__init__.py b/gen/__init__.py
index 6a480e2..f44b27c 100644
--- a/gen/__init__.py
+++ b/gen/__init__.py
@@ -1239,7 +1239,13 @@ class String(Type):
             # When image upload is allowed, ckeditor inserts some "style" attrs
             # (ie for image size when images are resized). So in this case we
             # can't remove style-related information.
-            value = XhtmlCleaner().clean(value, keepStyles=self.richText)
+            try:
+                value = XhtmlCleaner().clean(value, keepStyles=self.richText)
+            except XhtmlCleaner.Error, e:
+                # An error while parsing p_value must not prevent the user from
+                # storing it: in that case, the uncleaned value is stored as is.
+                obj.log('Unparsable XHTML content in field "%s".' % self.name,
+                        type='warning')
         Type.store(self, obj, value)
 
     def getFormattedValue(self, obj, value):
diff --git a/shared/packaging.py b/shared/packaging.py
index 53c5b34..9c47627 100644
--- a/shared/packaging.py
+++ b/shared/packaging.py
@@ -214,6 +214,7 @@ class Debianizer:
             f = file(name, 'w')
             f.write(initScript % ('oo', 'oo', 'Start OpenOffice in server mode',
                                   'startoo', 'startoo', "#Can't stop OO."))
+            f.write('\n')
             f.close()
             os.chmod(name, 0744) # Make it executable by owner.
         # Get the size of the app, in Kb.
diff --git a/shared/xml_parser.py b/shared/xml_parser.py
index cd8f649..c53764c 100644
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@@ -22,6 +22,7 @@ import xml.sax, difflib, types, cgi
 from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
                             property_interning_dict
 from xml.sax.xmlreader import InputSource
+from xml.sax import SAXParseException
 from appy.shared import UnicodeBuffer, xmlPrologue
 from appy.shared.errors import AppyError
 from appy.shared.utils import sequenceTypes
@@ -890,13 +891,16 @@ class XmlComparator:
 
 # ------------------------------------------------------------------------------
 class XhtmlCleaner(XmlParser):
+    '''This class cleans XHTML content, so it becomes ready to be stored in an
+       Appy-compliant format.'''
+    class Error(Exception): pass
 
     # Tags that will not be in the result, content included, if keepStyles is
     # False.
     tagsToIgnoreWithContent = ('style', 'colgroup')
     # Tags that will be removed from the result, but whose content will be kept,
     # if keepStyles is False.
-    tagsToIgnoreKeepContent= ('x', 'font')
+    tagsToIgnoreKeepContent= ('x', 'font', 'center')
     # All tags to ignore
     tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent
     # Attributes to ignore, if keepStyles if False.
@@ -909,20 +913,33 @@ class XhtmlCleaner(XmlParser):
 
     # Tags that required a line break to be inserted after them.
     lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td')
-    '''This class has 2 objectives:
+    # A pre-cleaning phase consists in performing some replacements before
+    # running the XML SAX parsing. The dict below contains such repls.
+    preCleanRepls = {'&nbsp;': ' '}
 
-       1. The main objective is to format XHTML p_s to be storable in the ZODB
-          according to Appy rules.
-          a. Every <p> or <li> must be on a single line (ending with a carriage
-             return); else, appy.shared.diff will not be able to compute XHTML
-             diffs;
-          b. Optimize size: HTML comments are removed.
+    def preClean(self, s):
+        '''Returns p_s after applying every replacement defined in
+           self.preCleanRepls. m_clean calls it before the SAX parsing.'''
+        for item, repl in self.preCleanRepls.iteritems():
+            if item in s:
+                s = s.replace(item, repl)
+        return s
 
-       2. If p_keepStyles (or m_clean) is False, some style-related information
-          will be removed, in order to get a standardized content that can be
-          dumped in an elegant and systematic manner into a POD template.
-    '''
     def clean(self, s, keepStyles=True):
+        '''Cleaning XHTML code is done for 2 reasons:
+
+           1. The main objective is to format XHTML p_s to be storable in the
+              ZODB according to Appy rules.
+              a. Every <p> or <li> must be on a single line (ending with a
+                 carriage return); else, appy.shared.diff will not be able to
+                 compute XHTML diffs;
+              b. Optimize size: HTML comments are removed.
+
+           2. If p_keepStyles (of m_clean) is False, some style-related
+              information will be removed, in order to get a standardized
+              content that can be dumped in an elegant and systematic manner
+              into a POD template.
+        '''
         # Must we keep style-related information or not?
         self.env.keepStyles = keepStyles
         self.env.currentContent = ''
@@ -934,7 +951,11 @@ class XhtmlCleaner(XmlParser):
         # 'ignoreContent' is True if, within the currently ignored tag, we must
         # also ignore its content.
         self.env.ignoreContent = False
-        return self.parse('<x>%s</x>' % s).encode('utf-8')
+        try:
+            res = self.parse('<x>%s</x>' % self.preClean(s)).encode('utf-8')
+        except SAXParseException, e:
+            raise self.Error(str(e))
+        return res
 
     def startDocument(self):
         # The result will be cleaned XHTML, joined from self.res.