[pod] Performance improvement: stop rebuilding namespace-qualified tag and attribute names during parsing; create a dict of precomputed names once, before parsing starts.

2013-01-31 12:50:25 +01:00 · 2013-01-31 12:50:25 +01:00 · d5d99b67eb
commit d5d99b67eb
parent ad94fee755
1 changed files with 44 additions and 24 deletions
--- a/pod/pod_parser.py
+++ b/pod/pod_parser.py
@ -60,10 +60,6 @@ class OdInsert:
 class PodEnvironment(OdfEnvironment):
    '''Contains all elements representing the current parser state during
       parsing.'''
    # Elements we must ignore (they will not be included in the result
    ignorableElements = None # Will be set after namespace propagation
    # Elements that may be impacted by POD statements
    impactableElements = None # Idem
    # Possibles modes
    # ADD_IN_BUFFER: when encountering an impactable element, we must
    #                continue to dump it in the current buffer
@ -92,6 +88,10 @@ class PodEnvironment(OdfEnvironment):
        self.mode = self.ADD_IN_SUBBUFFER
        # Current state
        self.state = self.READING_CONTENT
        # Elements we must ignore (they will not be included in the result)
        self.ignorableElements = None # Will be set after namespace propagation
        # Elements that may be impacted by POD statements
        self.impactableElements = None # Idem
        # Stack of currently visited tables
        self.tableStack = []
        self.tableIndex = -1
@ -110,6 +110,8 @@ class PodEnvironment(OdfEnvironment):
        self.namedIfActions = {} #~{s_statementName: IfAction}~
        # Currently parsed expression within an ODS template
        self.currentOdsExpression = None
        # Names of some tags, that we will compute after namespace propagation
        self.tags = None
    def getTable(self):
        '''Gets the currently parsed table.'''
@ -157,15 +159,15 @@ class PodEnvironment(OdfEnvironment):
            self.getTable().curRowAttrs = self.currentElem.attrs
        elif elem == Cell.OD.elem:
            colspan = 1
-            attrSpan = '%s:number-columns-spanned' % tableNs
+            attrSpan = self.tags['number-columns-spanned']
            if self.currentElem.attrs.has_key(attrSpan):
                colspan = int(self.currentElem.attrs[attrSpan])
            self.getTable().curColIndex += colspan
-        elif elem == ('%s:table-column' % tableNs):
+        elif elem == self.tags['table-column']:
            attrs = self.currentElem.attrs
-            if attrs.has_key('%s:number-columns-repeated' % tableNs):
+            if attrs.has_key(self.tags['number-columns-repeated']):
                self.getTable().nbOfColumns += int(
-                    attrs['%s:number-columns-repeated' % tableNs])
+                    attrs[self.tags['number-columns-repeated']])
            else:
                self.getTable().nbOfColumns += 1
        return ns
@ -190,8 +192,28 @@ class PodEnvironment(OdfEnvironment):
            xmlElemDef = eval(elemName[0].upper() + elemName[1:]).OD
            elemFullName = xmlElemDef.getFullName(ns)
            xmlElemDef.__init__(elemFullName)
-        self.ignorableElements = ('%s:tracked-changes' % ns[self.NS_TEXT],
+        # Create a table of names of used tags and attributes (precomputed,
-                                  '%s:change' % ns[self.NS_TEXT])
+        # including namespace, for performance).
        self.tags = {
          'tracked-changes': '%s:tracked-changes' % ns[self.NS_TEXT],
          'change': '%s:change' % ns[self.NS_TEXT],
          'annotation': '%s:annotation' % ns[self.NS_OFFICE],
          'change-start': '%s:change-start' % ns[self.NS_TEXT],
          'change-end': '%s:change-end' % ns[self.NS_TEXT],
          'conditional-text': '%s:conditional-text' % ns[self.NS_TEXT],
          'table-cell': '%s:table-cell' % ns[self.NS_TABLE],
          'formula': '%s:formula' % ns[self.NS_TABLE],
          'value-type': '%s:value-type' % ns[self.NS_OFFICE],
          'string-value': '%s:string-value' % ns[self.NS_OFFICE],
          'span': '%s:span' % ns[self.NS_TEXT],
          'number-columns-spanned': '%s:number-columns-spanned' % \
                                    ns[self.NS_TABLE],
          'number-columns-repeated': '%s:number-columns-repeated' % \
                                    ns[self.NS_TABLE],
          'table-column': '%s:table-column' % ns[self.NS_TABLE],
        }
        self.ignorableElements = (self.tags['tracked-changes'],
                                  self.tags['change'])
        self.impactableElements = (
           Text.OD.elem, Title.OD.elem, Table.OD.elem, Row.OD.elem,
           Cell.OD.elem, Section.OD.elem)
@ -213,19 +235,18 @@ class PodParser(OdfParser):
        tableNs = ns[e.NS_TABLE]
        if elem in e.ignorableElements:
            e.state = e.IGNORING
-        elif elem == ('%s:annotation' % officeNs):
+        elif elem == e.tags['annotation']:
            # Be it in an ODT or ODS template, an annotation is considered to
            # contain a POD statement.
            e.state = e.READING_STATEMENT
-        elif (elem == ('%s:change-start' % textNs)) or \
+        elif elem in (e.tags['change-start'], e.tags['conditional-text']):
             (elem == ('%s:conditional-text' % textNs)):
            # In an ODT template, any text in track-changes or any conditional
            # field is considered to contain a POD expression.
            e.state = e.READING_EXPRESSION
            e.exprHasStyle = False
-        elif (elem == ('%s:table-cell' % tableNs)) and \
+        elif (elem == e.tags['table-cell']) and \
-             attrs.has_key('%s:formula' % tableNs) and \
+             attrs.has_key(e.tags['formula']) and \
-             (attrs['%s:value-type' % officeNs] == 'string'):
+             (attrs[e.tags['value-type']] == 'string'):
            # In an ODS template, any cell containing a formula of type "string"
            # is considered to contain a POD expression. But here it is a
            # special case: we need to dump the cell; the expression is not
@ -237,9 +258,9 @@ class PodParser(OdfParser):
                e.addSubBuffer()
            e.currentBuffer.addElement(e.currentElem.name)
            e.currentBuffer.dumpStartElement(elem, attrs,
-                ignoreAttrs=('%s:formula'%tableNs, '%s:string-value'%officeNs))
+                ignoreAttrs=(e.tags['formula'], e.tags['string-value']))
            # We already have the POD expression: remember it on the env.
-            e.currentOdsExpression = attrs['%s:string-value' % officeNs]
+            e.currentOdsExpression = attrs[e.tags['string-value']]
        else:
            if e.state == e.IGNORING:
                pass
@ -252,8 +273,7 @@ class PodParser(OdfParser):
            elif e.state == e.READING_STATEMENT:
                pass
            elif e.state == e.READING_EXPRESSION:
-                if (elem == ('%s:span' % textNs)) and \
+                if (elem == (e.tags['span'])) and not e.currentContent.strip():
                   not e.currentContent.strip():
                    e.currentBuffer.dumpStartElement(elem, attrs)
                    e.exprHasStyle = True
        e.manageInserts()
@ -265,7 +285,7 @@ class PodParser(OdfParser):
        textNs = ns[e.NS_TEXT]
        if elem in e.ignorableElements:
            e.state = e.READING_CONTENT
-        elif elem == ('%s:annotation' % officeNs):
+        elif elem == e.tags['annotation']:
            # Manage statement
            oldCb = e.currentBuffer
            actionElemIndex = oldCb.createAction(e.currentStatement)
@ -316,14 +336,14 @@ class PodParser(OdfParser):
                        e.currentStatement.append(statementLine)
                    e.currentContent = ''
            elif e.state == e.READING_EXPRESSION:
-                if (elem == ('%s:change-end' % textNs)) or \
+                if (elem == e.tags['change-end']) or \
-                   (elem == ('%s:conditional-text' % textNs)):
+                   (elem == e.tags['conditional-text']):
                    expression = e.currentContent.strip()
                    e.currentContent = ''
                    # Manage expression
                    e.currentBuffer.addExpression(expression)
                    if e.exprHasStyle:
-                        e.currentBuffer.dumpEndElement('%s:span' % textNs)
+                        e.currentBuffer.dumpEndElement(e.tags['span'])
                    e.state = e.READING_CONTENT
    def characters(self, content):