From d5d99b67eb9128ae17a6b5b350c5222bd041eca0 Mon Sep 17 00:00:00 2001 From: Gaetan Delannay Date: Thu, 31 Jan 2013 12:50:25 +0100 Subject: [PATCH] [pod] Performance improvement: stop computing tag names, create a dict of precomputed tags before starting parsing. --- pod/pod_parser.py | 68 ++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/pod/pod_parser.py b/pod/pod_parser.py index c56a69f..18e70ab 100644 --- a/pod/pod_parser.py +++ b/pod/pod_parser.py @@ -60,10 +60,6 @@ class OdInsert: class PodEnvironment(OdfEnvironment): '''Contains all elements representing the current parser state during parsing.''' - # Elements we must ignore (they will not be included in the result - ignorableElements = None # Will be set after namespace propagation - # Elements that may be impacted by POD statements - impactableElements = None # Idem # Possibles modes # ADD_IN_BUFFER: when encountering an impactable element, we must # continue to dump it in the current buffer @@ -92,6 +88,10 @@ class PodEnvironment(OdfEnvironment): self.mode = self.ADD_IN_SUBBUFFER # Current state self.state = self.READING_CONTENT + # Elements we must ignore (they will not be included in the result) + self.ignorableElements = None # Will be set after namespace propagation + # Elements that may be impacted by POD statements + self.impactableElements = None # Idem # Stack of currently visited tables self.tableStack = [] self.tableIndex = -1 @@ -110,6 +110,8 @@ class PodEnvironment(OdfEnvironment): self.namedIfActions = {} #~{s_statementName: IfAction}~ # Currently parsed expression within an ODS template self.currentOdsExpression = None + # Names of some tags, that we will compute after namespace propagation + self.tags = None def getTable(self): '''Gets the currently parsed table.''' @@ -157,15 +159,15 @@ class PodEnvironment(OdfEnvironment): self.getTable().curRowAttrs = self.currentElem.attrs elif elem == Cell.OD.elem: colspan = 1 - attrSpan = '%s:number-columns-spanned' % tableNs + attrSpan = self.tags['number-columns-spanned'] if self.currentElem.attrs.has_key(attrSpan): colspan = int(self.currentElem.attrs[attrSpan]) self.getTable().curColIndex += colspan - elif elem == ('%s:table-column' % tableNs): + elif elem == self.tags['table-column']: attrs = self.currentElem.attrs - if attrs.has_key('%s:number-columns-repeated' % tableNs): + if attrs.has_key(self.tags['number-columns-repeated']): self.getTable().nbOfColumns += int( - attrs['%s:number-columns-repeated' % tableNs]) + attrs[self.tags['number-columns-repeated']]) else: self.getTable().nbOfColumns += 1 return ns @@ -190,8 +192,28 @@ class PodEnvironment(OdfEnvironment): xmlElemDef = eval(elemName[0].upper() + elemName[1:]).OD elemFullName = xmlElemDef.getFullName(ns) xmlElemDef.__init__(elemFullName) - self.ignorableElements = ('%s:tracked-changes' % ns[self.NS_TEXT], - '%s:change' % ns[self.NS_TEXT]) + # Create a table of names of used tags and attributes (precomputed, + # including namespace, for performance). + self.tags = { + 'tracked-changes': '%s:tracked-changes' % ns[self.NS_TEXT], + 'change': '%s:change' % ns[self.NS_TEXT], + 'annotation': '%s:annotation' % ns[self.NS_OFFICE], + 'change-start': '%s:change-start' % ns[self.NS_TEXT], + 'change-end': '%s:change-end' % ns[self.NS_TEXT], + 'conditional-text': '%s:conditional-text' % ns[self.NS_TEXT], + 'table-cell': '%s:table-cell' % ns[self.NS_TABLE], + 'formula': '%s:formula' % ns[self.NS_TABLE], + 'value-type': '%s:value-type' % ns[self.NS_OFFICE], + 'string-value': '%s:string-value' % ns[self.NS_OFFICE], + 'span': '%s:span' % ns[self.NS_TEXT], + 'number-columns-spanned': '%s:number-columns-spanned' % \ + ns[self.NS_TABLE], + 'number-columns-repeated': '%s:number-columns-repeated' % \ + ns[self.NS_TABLE], + 'table-column': '%s:table-column' % ns[self.NS_TABLE], + } + self.ignorableElements = (self.tags['tracked-changes'], + self.tags['change']) self.impactableElements = ( Text.OD.elem, Title.OD.elem, Table.OD.elem, Row.OD.elem, Cell.OD.elem, Section.OD.elem) @@ -213,19 +235,18 @@ class PodParser(OdfParser): tableNs = ns[e.NS_TABLE] if elem in e.ignorableElements: e.state = e.IGNORING - elif elem == ('%s:annotation' % officeNs): + elif elem == e.tags['annotation']: # Be it in an ODT or ODS template, an annotation is considered to # contain a POD statement. e.state = e.READING_STATEMENT - elif (elem == ('%s:change-start' % textNs)) or \ - (elem == ('%s:conditional-text' % textNs)): + elif elem in (e.tags['change-start'], e.tags['conditional-text']): # In an ODT template, any text in track-changes or any conditional # field is considered to contain a POD expression. e.state = e.READING_EXPRESSION e.exprHasStyle = False - elif (elem == ('%s:table-cell' % tableNs)) and \ - attrs.has_key('%s:formula' % tableNs) and \ - (attrs['%s:value-type' % officeNs] == 'string'): + elif (elem == e.tags['table-cell']) and \ + attrs.has_key(e.tags['formula']) and \ + (attrs[e.tags['value-type']] == 'string'): # In an ODS template, any cell containing a formula of type "string" # is considered to contain a POD expression. But here it is a # special case: we need to dump the cell; the expression is not @@ -237,9 +258,9 @@ class PodParser(OdfParser): e.addSubBuffer() e.currentBuffer.addElement(e.currentElem.name) e.currentBuffer.dumpStartElement(elem, attrs, - ignoreAttrs=('%s:formula'%tableNs, '%s:string-value'%officeNs)) + ignoreAttrs=(e.tags['formula'], e.tags['string-value'])) # We already have the POD expression: remember it on the env. - e.currentOdsExpression = attrs['%s:string-value' % officeNs] + e.currentOdsExpression = attrs[e.tags['string-value']] else: if e.state == e.IGNORING: pass @@ -252,8 +273,7 @@ class PodParser(OdfParser): elif e.state == e.READING_STATEMENT: pass elif e.state == e.READING_EXPRESSION: - if (elem == ('%s:span' % textNs)) and \ - not e.currentContent.strip(): + if (elem == (e.tags['span'])) and not e.currentContent.strip(): e.currentBuffer.dumpStartElement(elem, attrs) e.exprHasStyle = True e.manageInserts() @@ -265,7 +285,7 @@ class PodParser(OdfParser): textNs = ns[e.NS_TEXT] if elem in e.ignorableElements: e.state = e.READING_CONTENT - elif elem == ('%s:annotation' % officeNs): + elif elem == e.tags['annotation']: # Manage statement oldCb = e.currentBuffer actionElemIndex = oldCb.createAction(e.currentStatement) @@ -316,14 +336,14 @@ class PodParser(OdfParser): e.currentStatement.append(statementLine) e.currentContent = '' elif e.state == e.READING_EXPRESSION: - if (elem == ('%s:change-end' % textNs)) or \ - (elem == ('%s:conditional-text' % textNs)): + if (elem == e.tags['change-end']) or \ + (elem == e.tags['conditional-text']): expression = e.currentContent.strip() e.currentContent = '' # Manage expression e.currentBuffer.addExpression(expression) if e.exprHasStyle: - e.currentBuffer.dumpEndElement('%s:span' % textNs) + e.currentBuffer.dumpEndElement(e.tags['span']) e.state = e.READING_CONTENT def characters(self, content):