# appypod-rattail/pod/pod_parser.py
#
# 383 lines
# 18 KiB
# Python

# ------------------------------------------------------------------------------
# This file is part of Appy, a framework for building applications in the Python
# language. Copyright (C) 2007 Gaetan Delannay
# Appy is free software; you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
# Appy is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
# You should have received a copy of the GNU General Public License along with
# Appy. If not, see <http://www.gnu.org/licenses/>.
# ------------------------------------------------------------------------------
import re
from appy.shared.xml_parser import XmlElement
from appy.pod.buffers import FileBuffer, MemoryBuffer
from appy.pod.odf_parser import OdfEnvironment, OdfParser
from appy.pod.elements import *
# ------------------------------------------------------------------------------
class OdTable:
    '''Bookkeeping record for the Open Document (Od) table that is currently
       being parsed.'''
    def __init__(self):
        # Nothing is known yet about the current row or the current column
        self.curRowAttrs = self.curColIndex = None
        # Dimension counters, both starting at zero
        self.nbOfRows = self.nbOfColumns = 0

    def isOneCell(self):
        '''Returns True if exactly one row and one column were counted for
           this table.'''
        return (self.nbOfRows, self.nbOfColumns) == (1, 1)
class OdInsert:
    '''While parsing an odt/pod file, we may need to insert a specific odt chunk
       at a given place in the odt file (ie: add the pod-specific fonts and
       styles). OdInsert instances define such 'inserts' (what to insert and
       when).'''
    def __init__(self, odtChunk, elem, nsUris=None):
        '''p_odtChunk is the odt chunk to insert, as a UTF-8 encoded string.
           p_elem is the element after whose start tag the chunk is inserted.
           p_nsUris is an optional dict of namespace placeholders (see
           below).'''
        # The odt chunk to insert. Anything that can be decoded is decoded
        # from UTF-8; text that has no "decode" method (already-decoded text
        # on Python 3) is stored as is.
        if hasattr(odtChunk, 'decode'):
            odtChunk = odtChunk.decode('utf-8')
        self.odtChunk = odtChunk
        # The p_odtChunk will be inserted just after the p_elem start, which
        # must be an XmlElement instance. If more than one p_elem is present in
        # the odt file, the p_odtChunk will be inserted only at the first p_elem
        # occurrence.
        self.elem = elem
        # The URI replacements that need to be done in p_odtChunk. It is a dict
        # whose keys are names used in p_odtChunk (in the form @name@) to refer
        # to XML namespaces, and values are URIs of those namespaces. A fresh
        # dict is created per instance when none is given, so that instances
        # never share a single default dict.
        if nsUris is None:
            nsUris = {}
        self.nsUris = nsUris

    def resolve(self, namespaces):
        '''Replaces all unresolved namespaces in p_odtChunk, thanks to the dict
           of p_namespaces (keys are namespace URIs, values are the strings
           that must replace the corresponding @name@ placeholders). The
           resolved chunk is stored on the instance and returned. A KeyError
           is raised if a URI from self.nsUris is missing in p_namespaces.'''
        for nsName, nsUri in self.nsUris.items():
            # Placeholders are replaced literally: neither the placeholder
            # name nor the replacement text is interpreted as a regular
            # expression.
            self.odtChunk = self.odtChunk.replace('@%s@' % nsName,
                                                  namespaces[nsUri])
        return self.odtChunk
class PodEnvironment(OdfEnvironment):
    '''Contains all elements representing the current parser state during
       parsing.'''
    # Possible modes
    # ADD_IN_BUFFER: when encountering an impactable element, we must
    #                continue to dump it in the current buffer
    ADD_IN_BUFFER = 0
    # ADD_IN_SUBBUFFER: when encountering an impactable element, we must
    #                   create a new sub-buffer and dump it in it.
    ADD_IN_SUBBUFFER = 1
    # Possible states
    IGNORING = 0 # We are ignoring what we are currently reading
    READING_CONTENT = 1 # We are reading "normal" content
    READING_STATEMENT = 2 # We are reading a POD statement (for, if...)
    READING_EXPRESSION = 3 # We are reading a POD expression.

    def __init__(self, context, inserts=None):
        '''p_context is the evaluation context. p_inserts is an optional
           sequence of inserts (objects having an "elem" attribute and a
           "resolve" method, like OdInsert instances); when omitted, a new
           empty list is used.'''
        OdfEnvironment.__init__(self)
        # Buffer where we must dump the content we are currently reading
        self.currentBuffer = None
        # XML element content we are currently reading
        self.currentContent = ''
        # Current statement (a list of lines) that we are currently reading
        self.currentStatement = []
        # Current mode
        self.mode = self.ADD_IN_SUBBUFFER
        # Current state
        self.state = self.READING_CONTENT
        # Elements we must ignore (they will not be included in the result)
        self.ignorableElems = None # Will be set after namespace propagation
        # Elements that may be impacted by POD statements
        self.impactableElems = None # Idem
        # Elements representing start and end tags surrounding expressions
        self.exprStartElems = self.exprEndElems = None # Idem
        # Stack of currently visited tables
        self.tableStack = []
        self.tableIndex = -1
        # Evaluation context
        self.context = context
        # For the currently read expression, is there style-related information
        # associated with it?
        self.exprHasStyle = False
        # Namespace definitions have not been encountered yet.
        self.gotNamespaces = False
        # Store inserts. The default is created here and not in the method
        # signature, in order to avoid sharing one list among all instances.
        # This attribute is replaced by a dict in m_propagateNamespaces.
        if inserts is None:
            inserts = []
        self.inserts = inserts
        # Currently walked "if" actions
        self.ifActions = []
        # Currently walked named "if" actions
        self.namedIfActions = {} #~{s_statementName: IfAction}~
        # Currently parsed expression within an ODS template
        self.currentOdsExpression = None
        self.currentOdsHook = None
        # Names of some tags, that we will compute after namespace propagation
        self.tags = None
        # When an error occurs, must we raise it or write it into the current
        # buffer?
        self.raiseOnError = None # Will be initialized by PodParser.__init__

    def getTable(self):
        '''Gets the currently parsed table, or None if we are not within a
           table.'''
        res = None
        if self.tableIndex != -1:
            res = self.tableStack[self.tableIndex]
        return res

    def transformInserts(self):
        '''Now the namespaces were parsed; I can put p_inserts in the form of a
           dict for easier and more performant access while parsing. Keys are
           the full names of the target elements; if several inserts target
           the same element, only the first one is kept.'''
        res = {}
        for insert in self.inserts:
            elemName = insert.elem.getFullName(self.namespaces)
            if elemName not in res:
                res[elemName] = insert
        return res

    def manageInserts(self):
        '''We just dumped the start of an elem. Here we will insert any odt
           chunk if needed.'''
        elemName = self.currentElem.elem
        if elemName in self.inserts:
            insert = self.inserts[elemName]
            self.currentBuffer.write(insert.resolve(self.namespaces))
            # The insert is destroyed after single use
            del self.inserts[elemName]

    def onStartElement(self):
        '''Updates table-related information according to the element that
           has just been started, and returns the dict of namespaces. The
           first call also triggers namespace propagation.'''
        ns = self.namespaces
        if not self.gotNamespaces:
            # We suppose that all the interesting (from the POD point of view)
            # XML namespace definitions are defined at the root XML element.
            # Here we propagate them in XML element definitions that we use
            # throughout POD.
            self.gotNamespaces = True
            self.propagateNamespaces()
        elem = self.currentElem.elem
        # NOTE(review): the result of this call is not used in this method.
        # The call is kept because m_ns is defined outside this file; confirm
        # that it has no side effect before removing it.
        tableNs = self.ns(self.NS_TABLE)
        if elem == Table.OD.elem:
            # A new (possibly nested) table starts
            self.tableStack.append(OdTable())
            self.tableIndex += 1
        elif elem == Row.OD.elem:
            table = self.getTable()
            table.nbOfRows += 1
            # No cell was encountered yet in this row
            table.curColIndex = -1
            table.curRowAttrs = self.currentElem.attrs
        elif elem == Cell.OD.elem:
            colspan = 1
            attrSpan = self.tags['number-columns-spanned']
            # "has_key" is kept on attrs objects: their type is not defined in
            # this file.
            if self.currentElem.attrs.has_key(attrSpan):
                colspan = int(self.currentElem.attrs[attrSpan])
            self.getTable().curColIndex += colspan
        elif elem == self.tags['table-column']:
            attrs = self.currentElem.attrs
            attrRepeated = self.tags['number-columns-repeated']
            if attrs.has_key(attrRepeated):
                # A single column definition may stand for several columns
                self.getTable().nbOfColumns += int(attrs[attrRepeated])
            else:
                self.getTable().nbOfColumns += 1
        return ns

    def onEndElement(self):
        '''Pops the current table from the stack if the element that has just
           ended is a table, and returns the dict of namespaces.'''
        ns = self.namespaces
        if self.currentElem.elem == Table.OD.elem:
            self.tableStack.pop()
            self.tableIndex -= 1
        return ns

    def addSubBuffer(self):
        '''Creates a sub-buffer within the current buffer, makes it the
           current buffer and switches to mode ADD_IN_BUFFER.'''
        subBuffer = self.currentBuffer.addSubBuffer()
        self.currentBuffer = subBuffer
        self.mode = self.ADD_IN_BUFFER

    def propagateNamespaces(self):
        '''Propagates the namespaces in all XML element definitions that are
           used throughout POD. Also precomputes the names of tags and
           attributes, the tuples of ignorable, impactable and
           expression-related elements, and converts the inserts to a dict.'''
        ns = self.namespaces
        # The classes corresponding to POD elements are found among the names
        # defined in this module: their names are the capitalized element
        # names.
        moduleNames = globals()
        for elemName in PodElement.POD_ELEMS:
            className = elemName[0].upper() + elemName[1:]
            xmlElemDef = moduleNames[className].OD
            elemFullName = xmlElemDef.getFullName(ns)
            # Re-initialise the element definition with its full name
            xmlElemDef.__init__(elemFullName)
        # Create a table of names of used tags and attributes (precomputed,
        # including namespace, for performance).
        table = ns[self.NS_TABLE]
        text = ns[self.NS_TEXT]
        office = ns[self.NS_OFFICE]
        tags = {
          'tracked-changes': '%s:tracked-changes' % text,
          'change': '%s:change' % text,
          'annotation': '%s:annotation' % office,
          'change-start': '%s:change-start' % text,
          'change-end': '%s:change-end' % text,
          'conditional-text': '%s:conditional-text' % text,
          'text-input': '%s:text-input' % text,
          'table': '%s:table' % table,
          'table-name': '%s:name' % table,
          'table-cell': '%s:table-cell' % table,
          'table-column': '%s:table-column' % table,
          'formula': '%s:formula' % table,
          'value-type': '%s:value-type' % office,
          'value': '%s:value' % office,
          'string-value': '%s:string-value' % office,
          'span': '%s:span' % text,
          'number-columns-spanned': '%s:number-columns-spanned' % table,
          'number-columns-repeated': '%s:number-columns-repeated' % table,
        }
        self.tags = tags
        self.ignorableElems = (tags['tracked-changes'], tags['change'])
        self.exprStartElems = (tags['change-start'], tags['conditional-text'],
                               tags['text-input'])
        self.exprEndElems = (tags['change-end'], tags['conditional-text'],
                             tags['text-input'])
        self.impactableElems = (Text.OD.elem, Title.OD.elem, Table.OD.elem,
                                Row.OD.elem, Cell.OD.elem, Section.OD.elem)
        # From now on, self.inserts is a dict keyed by full element names
        self.inserts = self.transformInserts()
# ------------------------------------------------------------------------------
class PodParser(OdfParser):
    '''Parser whose callbacks (m_startElement, m_endElement, m_characters and
       m_endDocument) update a PodEnvironment instance and dump the parsed
       content, POD statements and POD expressions into the environment's
       current buffer.'''
    def __init__(self, env, caller):
        '''p_env is the environment to work on; p_caller must have an
           attribute "raiseOnError", that is copied on p_env.'''
        OdfParser.__init__(self, env, caller)
        env.raiseOnError = caller.raiseOnError

    def endDocument(self):
        '''Closes the content of the environment's current buffer.'''
        self.env.currentBuffer.content.close()

    def startElement(self, elem, attrs):
        '''Called when the start of p_elem (a full element name), having
           attributes p_attrs, is encountered. Depending on p_elem and on the
           current state, it changes the state or dumps the start tag into
           the current buffer.'''
        # The base method returns the environment (presumably self.env)
        e = OdfParser.startElement(self, elem, attrs)
        ns = e.onStartElement()
        # NOTE(review): the 3 variables below are not used in this method.
        officeNs = ns[e.NS_OFFICE]
        textNs = ns[e.NS_TEXT]
        tableNs = ns[e.NS_TABLE]
        if elem in e.ignorableElems:
            e.state = e.IGNORING
        elif elem == e.tags['annotation']:
            # Be it in an ODT or ODS template, an annotation is considered to
            # contain a POD statement.
            e.state = e.READING_STATEMENT
        elif elem in e.exprStartElems:
            # Any track-changed text or being in a conditional or input field is
            # considered to be a POD expression.
            e.state = e.READING_EXPRESSION
            e.exprHasStyle = False
        elif (elem == e.tags['table-cell']) and \
             attrs.has_key(e.tags['formula']) and \
             attrs.has_key(e.tags['value-type']) and \
             (attrs[e.tags['value-type']] == 'string') and \
             attrs[e.tags['formula']].startswith('of:="'):
            # In an ODS template, any cell containing a formula of type "string"
            # and whose content is expressed as a string between double quotes
            # (="...") is considered to contain a POD expression. But here it
            # is a special case: we need to dump the cell; the expression is not
            # directly contained within this cell; the expression will be
            # contained in the next inner paragraph. So we must here dump the
            # cell, but without some attributes, because the "formula" will be
            # converted to the result of evaluating the POD expression.
            if e.mode == e.ADD_IN_SUBBUFFER:
                e.addSubBuffer()
            e.currentBuffer.addElement(e.currentElem.name)
            hook = e.currentBuffer.dumpStartElement(elem, attrs,
                     ignoreAttrs=(e.tags['formula'], e.tags['string-value'],
                                  e.tags['value-type']),
                     hook=True)
            # We already have the POD expression: remember it on the env.
            e.currentOdsExpression = attrs[e.tags['string-value']]
            e.currentOdsHook = hook
        else:
            # No state change: react according to the current state
            if e.state == e.IGNORING:
                pass
            elif e.state == e.READING_CONTENT:
                if elem in e.impactableElems:
                    # An impactable element is registered in the current
                    # buffer, after creation of a sub-buffer if the mode
                    # requires it.
                    if e.mode == e.ADD_IN_SUBBUFFER:
                        e.addSubBuffer()
                    e.currentBuffer.addElement(e.currentElem.name)
                e.currentBuffer.dumpStartElement(elem, attrs)
            elif e.state == e.READING_STATEMENT:
                pass
            elif e.state == e.READING_EXPRESSION:
                # Within an expression, a "span" start tag is dumped only if
                # no (non-blank) expression content was read before it. The
                # corresponding end tag is dumped in m_endElement.
                if (elem == (e.tags['span'])) and not e.currentContent.strip():
                    e.currentBuffer.dumpStartElement(elem, attrs)
                    e.exprHasStyle = True
        e.manageInserts()

    def endElement(self, elem):
        '''Called when the end of p_elem (a full element name) is
           encountered. Depending on p_elem and on the current state, it
           creates an action from the statement that was read, adds the
           expression that was read, or dumps the end tag and manages the
           current buffer.'''
        # The base method returns the environment (presumably self.env)
        e = OdfParser.endElement(self, elem)
        ns = e.onEndElement()
        # NOTE(review): the 2 variables below are not used in this method.
        officeNs = ns[e.NS_OFFICE]
        textNs = ns[e.NS_TEXT]
        if elem in e.ignorableElems:
            e.state = e.READING_CONTENT
        elif elem == e.tags['annotation']:
            # Manage statement
            oldCb = e.currentBuffer
            actionElemIndex = oldCb.createAction(e.currentStatement)
            e.currentStatement = []
            # -1 is the only value for which no content is transferred
            if actionElemIndex != -1:
                e.currentBuffer = oldCb.\
                    transferActionIndependentContent(actionElemIndex)
                # The mode depends on whether the current buffer has changed
                if e.currentBuffer == oldCb:
                    e.mode = e.ADD_IN_SUBBUFFER
                else:
                    e.mode = e.ADD_IN_BUFFER
            e.state = e.READING_CONTENT
        else:
            if e.state == e.IGNORING:
                pass
            elif e.state == e.READING_CONTENT:
                # Dump the ODS POD expression if any
                if e.currentOdsExpression:
                    e.currentBuffer.addExpression(e.currentOdsExpression,
                                                  tiedHook=e.currentOdsHook)
                    e.currentOdsExpression = None
                    e.currentOdsHook = None
                # Dump the ending tag
                e.currentBuffer.dumpEndElement(elem)
                if elem in e.impactableElems:
                    if isinstance(e.currentBuffer, MemoryBuffer):
                        isMainElement = e.currentBuffer.isMainElement(elem)
                        # Unreference the element among buffer.elements
                        e.currentBuffer.unreferenceElement(elem)
                        if isMainElement:
                            parent = e.currentBuffer.parent
                            if not e.currentBuffer.action:
                                # Delete this buffer and transfer content to
                                # parent.
                                e.currentBuffer.transferAllContent()
                                parent.removeLastSubBuffer()
                                e.currentBuffer = parent
                            else:
                                if isinstance(parent, FileBuffer):
                                    # Execute buffer action and delete the
                                    # buffer.
                                    e.currentBuffer.action.execute(parent,
                                                                   e.context)
                                    parent.removeLastSubBuffer()
                                # NOTE(review): the nesting level of the
                                # following statement must be confirmed; it is
                                # understood as being executed whatever the
                                # type of the parent.
                                e.currentBuffer = parent
                            e.mode = e.ADD_IN_SUBBUFFER
            elif e.state == e.READING_STATEMENT:
                # Every non-blank text element read within an annotation
                # produces one line of the current statement.
                if e.currentElem.elem == Text.OD.elem:
                    statementLine = e.currentContent.strip()
                    if statementLine:
                        e.currentStatement.append(statementLine)
                    e.currentContent = ''
            elif e.state == e.READING_EXPRESSION:
                if elem in e.exprEndElems:
                    expression = e.currentContent.strip()
                    e.currentContent = ''
                    # Manage expression
                    e.currentBuffer.addExpression(expression)
                    # Close the "span" whose start tag was dumped in
                    # m_startElement.
                    if e.exprHasStyle:
                        e.currentBuffer.dumpEndElement(e.tags['span'])
                    e.state = e.READING_CONTENT

    def characters(self, content):
        '''Called when textual p_content is encountered: depending on the
           current state, p_content is ignored, dumped into the current
           buffer or accumulated in e.currentContent.'''
        # The base method returns the environment (presumably self.env)
        e = OdfParser.characters(self, content)
        if e.state == e.IGNORING:
            pass
        elif e.state == e.READING_CONTENT:
            if e.currentOdsExpression:
                # Do not write content if we have encountered an ODS expression:
                # we will replace this content with the expression's result.
                pass
            else:
                e.currentBuffer.dumpContent(content)
        elif e.state == e.READING_STATEMENT:
            # Only content of elements whose full name starts with the prefix
            # of the "text" namespace is part of the statement.
            if e.currentElem.elem.startswith(e.namespaces[e.NS_TEXT]):
                e.currentContent += content
        elif e.state == e.READING_EXPRESSION:
            e.currentContent += content
# ------------------------------------------------------------------------------