appy.gen: improved cleaning and formatting of XHTML content; appy.pod: added some default appy-related table styles for producing cells with text in bold/normal, aligned right/left, etc.

2012-05-14 17:35:34 +02:00 · 2012-05-14 17:35:34 +02:00 · 028040351c
commit 028040351c
parent d3a2b85a10
11 changed files with 195 additions and 54 deletions
--- a/shared/packaging.py
+++ b/shared/packaging.py
@ -106,7 +106,8 @@ class Debianizer:

    def __init__(self, app, out, appVersion='0.1.0',
                 pythonVersions=('2.6',), zopePort=8080,
-                 depends=('openoffice.org', 'imagemagick'), sign=False):
+                 depends=('zope2.12', 'openoffice.org', 'imagemagick'),
+                 sign=False):
        # app is the path to the Python package to Debianize.
        self.app = app
        self.appName = os.path.basename(app)
@ -261,10 +262,6 @@ class Debianizer:
        # Create postinst, a script that will:
        # - bytecompile Python files after the Debian install
        # - change ownership of some files if required
-        # - [in the case of a app-package] execute:
-        #   apt-get -t squeeze-backports install zope2.12
-        #   (if zope2.12 is defined as a simple dependency in field "Depends:"
-        #   it will fail because it will not be searched in squeeze-backports).
        # - [in the case of an app-package] call update-rc.d for starting it at
        #   boot time.
        f = file('postinst', 'w')
@ -276,8 +273,6 @@ class Debianizer:
                                                                  self.appName)
            content += 'if [ -e %s ]\nthen\n%sfi\n' % (bin, cmds)
        if self.appName != 'appy':
-            # Install zope2.12 from squeeze-backports
-            content += 'apt-get -t squeeze-backports install zope2.12\n'
            # Allow user "zope", that runs the Zope instance, to write the
            # database and log files.
            content += 'chown -R zope:root /var/lib/%s\n' % self.appNameLower
--- a/shared/utils.py
+++ b/shared/utils.py
@ -263,35 +263,6 @@ def formatNumber(n, sep=',', precision=2, tsep=' '):
            res += sep + splitted[1]
    return res

-# ------------------------------------------------------------------------------
-class XhtmlCleaner:
-    # Regular expressions used for cleaning.
-    classAttr = re.compile('class\s*=\s*".*?"')
-    comment = re.compile('<!--.*?-->', re.S)
-
-    '''This class has 2 objectives:
-
-       1. The main objective is to format XHTML p_s to be storable in the ZODB
-          according to Appy rules.
-          a. Every <p> or <li> must be on a single line (ending with a carriage
-             return); else, appy.shared.diff will not be able to compute XHTML
-             diffs;
-          b. Optimize size: HTML comments are removed.
-
-       2. If p_keepStyles (or m_clean) is False, some style-related information
-          will be removed, in order to get a standardized content that can be
-          dumped in an elegant and systematic manner into a POD template.
-    '''
-    @classmethod
-    def clean(klass, s, keepStyles=False):
-        '''Returns the cleaned variant of p_s.'''
-        if not keepStyles:
-            # Format p_s according to objective 2.
-            s = klass.classAttr.sub('', s)
-        # Format p_s according to objective 1.
-        s = klass.comment.sub('', s)
-        return s
-
 # ------------------------------------------------------------------------------
 def lower(s):
    '''French-accents-aware variant of string.lower.'''
--- a/shared/xml_parser.py
+++ b/shared/xml_parser.py
@ -18,7 +18,7 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,USA.

 # ------------------------------------------------------------------------------
-import xml.sax, difflib, types
+import xml.sax, difflib, types, cgi
 from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges,\
                            property_interning_dict
 from xml.sax.xmlreader import InputSource
@ -887,4 +887,127 @@ class XmlComparator:
            else:
                lastLinePrinted = False
        return not atLeastOneDiff
+
+# ------------------------------------------------------------------------------
+class XhtmlCleaner(XmlParser):
+    '''This class has 2 objectives:
+
+       1. The main objective is to format XHTML p_s to be storable in the ZODB
+          according to Appy rules.
+          a. Every <p> or <li> must be on a single line (ending with a carriage
+             return); else, appy.shared.diff will not be able to compute XHTML
+             diffs;
+          b. Optimize size: HTML comments are removed.
+
+       2. If p_keepStyles (of m_clean) is False, some style-related information
+          will be removed, in order to get a standardized content that can be
+          dumped in an elegant and systematic manner into a POD template.
+    '''
+
+    # Tags that will not be in the result, content included, if keepStyles is
+    # False.
+    tagsToIgnoreWithContent = ('style', 'colgroup')
+    # Tags that will be removed from the result, but whose content will be kept,
+    # if keepStyles is False.
+    tagsToIgnoreKeepContent = ('x', 'font')
+    # All tags to ignore
+    tagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent
+    # Attributes to ignore, if keepStyles is False.
+    attrsToIgnore = ('align', 'valign', 'cellpadding', 'cellspacing', 'width',
+                     'height', 'bgcolor', 'lang', 'border', 'class')
+    # Attrs to add, if not present, to ensure good formatting, be it at the web
+    # or ODT levels.
+    attrsToAdd = {'table': {'cellspacing':'0', 'cellpadding':'6', 'border':'1'},
+                  'tr':    {'valign': 'top'}}
+
+    # Tags that require a line break to be inserted after them.
+    lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td')
+
+    def clean(self, s, keepStyles=True):
+        '''Parses XHTML chunk p_s, wrapped into a single root tag <x>, and
+           returns the result of the inherited m_parse (presumably the cleaned
+           XHTML built in self.res; confirm against XmlParser.parse). If
+           p_keepStyles is False, the tags and attributes listed in the
+           class-level tuples are removed.'''
+        # NOTE(review): the <x> wrapper is only dropped from the result when
+        # p_keepStyles is False ('x' is in tagsToIgnoreKeepContent). With the
+        # default keepStyles=True it looks like it is dumped into the result;
+        # confirm what callers expect.
+        # Must we keep style-related information or not?
+        self.env.keepStyles = keepStyles
+        self.env.currentContent = ''
+        # The stack of currently parsed elements (will contain only ignored
+        # ones).
+        self.env.currentElems = []
+        # 'ignoreTag' is True if we must ignore the currently walked tag.
+        self.env.ignoreTag = False
+        # 'ignoreContent' is True if, within the currently ignored tag, we must
+        # also ignore its content.
+        self.env.ignoreContent = False
+        return self.parse('<x>%s</x>' % s)
+
+    def startDocument(self):
+        '''Initialises the result.'''
+        # The result will be cleaned XHTML, joined from self.res.
+        self.res = []
+
+    def endDocument(self):
+        '''Converts the list of dumped chunks into a single string.'''
+        self.res = ''.join(self.res)
+
+    def startElement(self, elem, attrs):
+        '''Dumps the start tag for p_elem with its p_attrs, unless the tag must
+           be ignored.'''
+        e = self.env
+        # Dump any previously gathered content if any
+        if e.currentContent:
+            self.res.append(e.currentContent)
+            e.currentContent = ''
+        if e.ignoreTag and e.ignoreContent: return
+        if not e.keepStyles and (elem in self.tagsToIgnore):
+            e.ignoreTag = True
+            e.ignoreContent = elem in self.tagsToIgnoreWithContent
+            e.currentElems.append( (elem, e.ignoreContent) )
+            return
+        # Add a line break before the start tag if required (ie: xhtml differ
+        # needs to get paragraphs and other elements on separate lines).
+        if (elem in self.lineBreakTags) and self.res and \
+           (self.res[-1][-1] != '\n'):
+            prefix = '\n'
+        else:
+            prefix = ''
+        res = ['%s<%s' % (prefix, elem)]
+        # Include the found attributes, except those that must be ignored.
+        # Remember their names: default attributes must not be added twice.
+        dumped = set()
+        for name, value in attrs.items():
+            if not e.keepStyles and (name in self.attrsToIgnore): continue
+            # The SAX parser has resolved entities within attribute values:
+            # re-escape them (double quotes included), else the dumped tag
+            # could be malformed.
+            res.append(' %s="%s"' % (name, cgi.escape(value, True)))
+            dumped.add(name)
+        # Include additional attributes if required, but only those that were
+        # not dumped yet: a duplicate attribute is not well-formed XML.
+        for name, value in self.attrsToAdd.get(elem, {}).items():
+            if name in dumped: continue
+            res.append(' %s="%s"' % (name, value))
+        res.append('>')
+        self.res.append(''.join(res))
+
+    def endElement(self, elem):
+        '''Dumps the content gathered for p_elem and its end tag, or updates
+           the stack of ignored tags if p_elem is an ignored one.'''
+        e = self.env
+        if e.ignoreTag and (elem in self.tagsToIgnore):
+            # NOTE(review): a tag from tagsToIgnore nested inside a tag whose
+            # content is ignored is not pushed by m_startElement but would be
+            # popped here; verify whether such nesting can occur.
+            # Pop the currently ignored tag
+            e.currentElems.pop()
+            if e.currentElems:
+                # Keep ignoring tags.
+                e.ignoreContent = e.currentElems[-1][1]
+            else:
+                # Stop ignoring elems
+                e.ignoreTag = e.ignoreContent = False
+        elif e.ignoreTag and e.ignoreContent:
+            # This is the end of a sub-tag within a region that we must ignore.
+            pass
+        else:
+            self.res.append(e.currentContent)
+            # Add a line break after the end tag if required (ie: xhtml differ
+            # needs to get paragraphs and other elements on separate lines).
+            if elem in self.lineBreakTags:
+                suffix = '\n'
+            else:
+                suffix = ''
+            self.res.append('</%s>%s' % (elem, suffix))
+            e.currentContent = ''
+
+    def characters(self, content):
+        '''Gathers p_content into the current content, unless content must be
+           ignored.'''
+        e = self.env
+        if e.ignoreContent: return
+        # Remove blanks that ckeditor may add just after a start tag: at the
+        # beginning of some content, any run of leading whitespace is reduced
+        # to a single space. No space is inserted if p_content did not start
+        # with whitespace, else words split by inline tags would be altered.
+        if not e.currentContent or (e.currentContent == ' '):
+            stripped = content.lstrip()
+            if (stripped != content) and not e.currentContent:
+                toAdd = ' ' + stripped
+            else:
+                toAdd = stripped
+        else:
+            toAdd = content
+        # Re-transform XML special chars to entities.
+        e.currentContent += cgi.escape(toAdd)
 # ------------------------------------------------------------------------------