# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------
# Appy is a framework for building applications in the Python language.
# Copyright (C) 2007-2011 Gaetan Delannay
#
# Distributed under the GNU General Public License.
#
# Thanks to Fabio Marcuzzi and Gauthier Bastien for management of strike and
# underline.
# ------------------------------------------------------------------------------
import xml.sax
from appy.shared.xml_parser import XmlEnvironment, XmlParser
from appy.pod.odf_parser import OdfEnvironment
from appy.pod import *
# To which ODT tags do HTML tags correspond ?
HTML_2_ODT = {'h1':'h', 'h2':'h', 'h3':'h', 'h4':'h', 'h5':'h', 'h6':'h',
'p':'p', 'b':'span', 'i':'span', 'strong':'span', 'strike':'span',
'u':'span', 'em': 'span', 'sub': 'span', 'sup': 'span',
'br': 'line-break', 'div': 'span'}
DEFAULT_ODT_STYLES = {'b': 'podBold', 'strong':'podBold', 'i': 'podItalic',
'u': 'podUnderline', 'strike': 'podStrike',
'em': 'podItalic', 'sup': 'podSup', 'sub':'podSub',
'td': 'podCell', 'th': 'podHeaderCell'}
INNER_TAGS = ('b', 'strong', 'i', 'u', 'em', 'sup', 'sub', 'span', 'div')
TABLE_CELL_TAGS = ('td', 'th')
OUTER_TAGS = TABLE_CELL_TAGS + ('li',)
# The following elements can't be rendered inside paragraphs
NOT_INSIDE_P = XHTML_HEADINGS + XHTML_LISTS + ('table',)
NOT_INSIDE_P_OR_P = NOT_INSIDE_P + ('p',)
NOT_INSIDE_LIST = ('table',)
IGNORABLE_TAGS = ('meta', 'title', 'style')
# ------------------------------------------------------------------------------
class HtmlElement:
'''Every time an HTML element is encountered during the SAX parsing,
an instance of this class is pushed on the stack of currently parsed
elements.'''
elemTypes = {'p':'para', 'li':'para','ol':'list','ul':'list'}
def __init__(self, elem, attrs):
self.elem = elem
# Keep "class" attribute (useful for finding the corresponding ODT
# style) in some cases. Normally, basic XmlElement class stores attrs,
# but for a strange reason those attrs are back to None (probably for
# performance reasons they become inaccessible after a while).
self.classAttr = None
if attrs.has_key('class'):
self.classAttr = attrs['class']
self.tagsToReopen = [] # When the HTML element corresponding to self
# is completely dumped, if there was a problem related to tags
# inclusion, we may need to dump start tags corresponding to
# tags that we had to close before dumping this element. This list
# contains HtmlElement instances.
self.tagsToClose = [] # Before dumping the closing tag corresponding
# to self, we may need to close other tags (ie closing a paragraph
# before closing a cell). This list contains HtmlElement instances.
self.elemType = self.elem
if self.elemTypes.has_key(self.elem):
self.elemType = self.elemTypes[self.elem]
# If a conflict occurs on this element, we will note it.
self.isConflictual = False
def setConflictual(self):
'''Note p_self as conflictual.'''
self.isConflictual = True
return self
def getOdfTag(self, env):
'''Gets the raw ODF tag that corresponds to me.'''
res = ''
if HTML_2_ODT.has_key(self.elem):
res += '%s:%s' % (env.textNs, HTML_2_ODT[self.elem])
elif self.elem == 'a':
res += '%s:a' % env.textNs
elif self.elem in XHTML_LISTS:
res += '%s:list' % env.textNs
elif self.elem == 'li':
res += '%s:list-item' % env.textNs
elif self.elem == 'table':
res += '%s:table' % env.tableNs
elif self.elem == 'thead':
res += '%s:table-header-rows' % env.tableNs
elif self.elem == 'tr':
res += '%s:table-row' % env.tableNs
elif self.elem in TABLE_CELL_TAGS:
res += '%s:table-cell' % env.tableNs
return res
def getOdfTags(self, env):
'''Gets the start and end tags corresponding to p_self.'''
tag = self.getOdfTag(env)
if not tag: return (None, None)
return ('<%s>' % tag, '%s>' % tag)
def getConflictualElements(self, env):
'''self was just parsed. In some cases, this element can't be dumped
in the result because there are conflictual elements among previously
parsed opening elements (p_env.currentElements). For example, if we
just dumped a "p", we can't dump a table within the "p". Such
constraints do not hold in XHTML code but hold in ODF code.'''
if not env.currentElements: return ()
parentElem = env.currentElements[-1]
# Check elements that can't be found within a paragraph
if (parentElem.elemType == 'para') and \
(self.elem in NOT_INSIDE_P_OR_P):
# Oups, li->p wrongly considered as a conflict.
if (parentElem.elem == 'li') and (self.elem == 'p'): return ()
return (parentElem.setConflictual(),)
# Check inner paragraphs
if (parentElem.elem in INNER_TAGS) and (self.elemType == 'para'):
res = [parentElem.setConflictual()]
if len(env.currentElements) > 1:
i = 2
visitParents = True
while visitParents:
try:
nextParent = env.currentElements[-i]
i += 1
res.insert(0, nextParent.setConflictual())
if nextParent.elemType == 'para':
visitParents = False
except IndexError:
visitParents = False
return res
if parentElem.tagsToClose and \
(parentElem.tagsToClose[-1].elemType == 'para') and \
(self.elem in NOT_INSIDE_P):
return (parentElem.tagsToClose[-1].setConflictual(),)
# Check elements that can't be found within a list
if (parentElem.elemType=='list') and (self.elem in NOT_INSIDE_LIST):
return (parentElem.setConflictual(),)
return ()
def addInnerParagraph(self, env):
'''Dump an inner paragraph inside self (if not already done).'''
if not self.tagsToClose:
# We did not do it yet
env.dumpString('<%s:p' % env.textNs)
if self.elem == 'li':
itemStyle = env.getCurrentElement(isList=True).elem # ul or ol
# Which 'li'-related style must I use?
if self.classAttr:
odtStyle = env.parser.caller.findStyle(
self.elem, classValue=self.classAttr)
if odtStyle and (odtStyle.name == 'podItemKeepWithNext'):
itemStyle += '_kwn'
env.dumpString(' %s:style-name="%s"' % (env.textNs,
env.itemStyles[itemStyle]))
env.dumpString('>')
self.tagsToClose.append(HtmlElement('p',{}))
def dump(self, start, env):
'''Dumps the start or end (depending on p_start) tag of this HTML
element. We must take care of potential innerTags.'''
# Compute the tag in itself
tag = ''
prefix = '<'
if not start: prefix += '/'
# Compute tag attributes
attrs = ''
if start:
if self.elemType == 'list':
# I must specify the list style
attrs += ' %s:style-name="%s"' % (
env.textNs, env.listStyles[self.elem])
if self.elem == 'ol':
# I have interrupted a numbered list. I need to continue
# the numbering.
attrs += ' %s:continue-numbering="true"' % env.textNs
else:
attrs = env.getOdtAttributes(self)
tag = prefix + self.getOdfTag(env) + attrs + '>'
# Close/open subTags if any
for subElem in self.tagsToClose:
subTag = subElem.dump(start, env)
if start: tag += subTag
else: tag = subTag + tag
return tag
def __repr__(self):
return '' % self.elem
# ------------------------------------------------------------------------------
class HtmlTable:
'''Represents an HTML table, and also a sub-buffer. When parsing elements
corresponding to an HTML table (
, , , etc), we can't dump
corresponding ODF elements directly into the global result buffer
(XhtmlEnvironment.res). Indeed, when dumping an ODF table, we must
dump columns declarations at the beginning of the table. So before
dumping rows and cells, we must know how much columns will be present
in the table. It means that we must first parse the first |
entirely
in order to know how much columns are present in the HTML table before
dumping the ODF table. So we use this class as a sub-buffer that will
be constructed as we parse the HTML table; when encountering the end
of the HTML table, we will dump the result of this sub-buffer into
the parent buffer, which may be the global buffer or another table
buffer.'''
def __init__(self):
self.res = u'' # The sub-buffer.
self.tempRes = u'' # The temporary sub-buffer, into which we will
# dump all table sub-elements, until we encounter the end of the first
# row. Then, we will know how much columns are defined in the table;
# we will dump columns declarations into self.res and dump self.tempRes
# into self.res.
self.firstRowParsed = False # Was the first table row completely parsed?
self.nbOfColumns = 0
# ------------------------------------------------------------------------------
class XhtmlEnvironment(XmlEnvironment):
itemStyles = {'ul': 'podBulletItem', 'ol': 'podNumberItem',
'ul_kwn': 'podBulletItemKeepWithNext',
'ol_kwn': 'podNumberItemKeepWithNext'}
listStyles = {'ul': 'podBulletedList', 'ol': 'podNumberedList'}
def __init__(self, ns):
XmlEnvironment.__init__(self)
self.res = u''
self.currentContent = u''
self.currentElements = [] # Stack of currently walked elements
self.currentLists = [] # Stack of currently walked lists (ul or ol)
self.currentTables = [] # Stack of currently walked tables
self.creatingRootParagraph = False
# Within the XHTML chunk given to this parser, there may be some
# content that is not enclosed within any tag (at "root" level). When I
# encounter such content, I will include it into a root paragraph with
# default style. This content may include sub-tags of course (span,
# div, img, a...) or may already be dumped entirely if I encounter
# "paragraph-style" sub-tags (h1, h2, p...). self.creatingRootParagraph
# tells me if I am still in a root paragraph. So when I encounter a
# "root" content I know if I must reopen a new paragraph or not, for
# example.
self.textNs = ns[OdfEnvironment.NS_TEXT]
self.linkNs = ns[OdfEnvironment.NS_XLINK]
self.tableNs = ns[OdfEnvironment.NS_TABLE]
self.ignore = False # Will be True when parsing parts of the XHTML that
# must be ignored.
def getCurrentElement(self, isList=False):
'''Gets the element that is on the top of self.currentElements or
self.currentLists.'''
res = None
if isList:
elements = self.currentLists # Stack of list elements only
else:
elements = self.currentElements # Stack of all elements (including
# elements also pushed on other stacks, like lists and tables).
if elements:
res = elements[-1]
return res
def anElementIsMissing(self, previousElem, currentElem):
res = False
if previousElem and (previousElem.elem in OUTER_TAGS) and \
((not currentElem) or (currentElem.elem in INNER_TAGS)):
res = True
return res
def dumpCurrentContent(self):
'''Dumps content that was temporarily stored in self.currentContent
into the result.'''
if self.currentContent.strip():
# Manage missing elements
currentElem = self.getCurrentElement()
if self.anElementIsMissing(currentElem, None):
currentElem.addInnerParagraph(self)
# Dump and reinitialize the current content
for c in self.currentContent.strip('\n'):
# We remove leading and trailing carriage returns, but not
# whitespace because whitespace may be part of the text to dump.
if XML_SPECIAL_CHARS.has_key(c):
self.dumpString(XML_SPECIAL_CHARS[c])
else:
self.dumpString(c)
self.currentContent = u''
def getOdtAttributes(self, htmlElem, htmlAttrs={}):
'''Gets the ODT attributes to dump for p_currentElem. p_htmlAttrs are
the parsed attributes from the XHTML p_currentElem.'''
odtStyle = self.parser.caller.findStyle(htmlElem.elem, htmlAttrs)
styleName = None
if odtStyle:
styleName = odtStyle.name
elif DEFAULT_ODT_STYLES.has_key(htmlElem.elem):
styleName = DEFAULT_ODT_STYLES[htmlElem.elem]
res = ''
if styleName:
res += ' %s:style-name="%s"' % (self.textNs, styleName)
if (htmlElem.elem in XHTML_HEADINGS) and \
(odtStyle.outlineLevel != None):
res += ' %s:outline-level="%d"' % (self.textNs, \
odtStyle.outlineLevel)
return res
def dumpStyledElement(self, htmlElem, odfTag, attrs):
'''Dumps an element that potentially has associated style
information.'''
self.dumpString('<' + odfTag)
self.dumpString(self.getOdtAttributes(htmlElem, attrs))
self.dumpString('>')
def getTags(self, elems, start=True):
'''This method returns a series of start or end tags (depending on
p_start) that correspond to HtmlElement instances in p_elems.'''
res = ''
for elem in elems:
tag = elem.dump(start, self)
if start: res += tag
else: res = tag + res
return res
def closeConflictualElements(self, conflictElems):
'''This method dumps end tags for p_conflictElems, excepted if those
tags would be empty. In this latter case, tags are purely removed
from the result.'''
startTags = self.getTags(conflictElems, start=True)
if self.res.endswith(startTags):
# In this case I would dump an empty (series of) tag(s). Instead, I
# will remove those tags.
self.res = self.res[:-len(startTags)]
else:
self.dumpString(self.getTags(conflictElems, start=False))
def dumpString(self, s):
'''Dumps arbitrary content p_s.
If the table stack is not empty, we must dump p_s into the buffer
corresponding to the last parsed table. Else, we must dump p_s
into the global buffer (self.res).'''
if self.currentTables:
currentTable = self.currentTables[-1]
if (not currentTable.res) or currentTable.firstRowParsed:
currentTable.res += s
else:
currentTable.tempRes += s
else:
self.res += s
def getTagsToReopen(self, conflictElems):
'''Normally, tags to reopen are equal to p_conflictElems. But we have a
special case. Indeed, if a conflict elem has itself tagsToClose,
the last tag to close may not be needed anymore on the tag to
reopen, so we remove it.'''
conflictElems[-1].tagsToClose = []
return conflictElems
def onElementStart(self, elem, attrs):
previousElem = self.getCurrentElement()
self.dumpCurrentContent()
currentElem = HtmlElement(elem, attrs)
# Manage conflictual elements
conflictElems = currentElem.getConflictualElements(self)
if conflictElems:
# We must close the conflictual elements, and once the currentElem
# will be dumped, we will re-open the conflictual elements.
self.closeConflictualElements(conflictElems)
currentElem.tagsToReopen = self.getTagsToReopen(conflictElems)
# Manage missing elements
if self.anElementIsMissing(previousElem, currentElem):
previousElem.addInnerParagraph(self)
# Add the current element on the stack of walked elements
self.currentElements.append(currentElem)
if elem in XHTML_LISTS:
# Update stack of current lists
self.currentLists.append(currentElem)
elif elem == 'table':
# Update stack of current tables
self.currentTables.append(HtmlTable())
elif elem in TABLE_CELL_TAGS:
# If we are in the first row of a table, update columns count
currentTable = self.currentTables[-1]
if not currentTable.firstRowParsed:
nbOfCols = 1
if attrs.has_key('colspan'):
nbOfCols = int(attrs['colspan'])
currentTable.nbOfColumns += nbOfCols
return currentElem
def onElementEnd(self, elem):
res = None
self.dumpCurrentContent()
currentElem = self.currentElements.pop()
if elem in XHTML_LISTS:
self.currentLists.pop()
elif elem == 'table':
lastTable = self.currentTables.pop()
# Dumps the content of the last parsed table into the parent buffer
self.dumpString(lastTable.res)
elif elem == 'tr':
lastTable = self.currentTables[-1]
if not lastTable.firstRowParsed:
lastTable.firstRowParsed = True
# First row is parsed. I know the number of columns in the
# table: I can dump the columns declarations.
lastTable.res += ('<%s:table-column/>' % self.tableNs) * \
lastTable.nbOfColumns
lastTable.res += lastTable.tempRes
lastTable.tempRes = u''
if currentElem.tagsToClose:
self.closeConflictualElements(currentElem.tagsToClose)
if currentElem.tagsToReopen:
res = currentElem.tagsToReopen
return currentElem, res
# ------------------------------------------------------------------------------
class XhtmlParser(XmlParser):
def lowerizeInput(self, elem, attrs=None):
'''Because (X)HTML is case insensitive, we may receive input p_elem and
p_attrs in lower-, upper- or mixed-case. So here we produce lowercase
versions that will be used throughout our parser.'''
resElem = elem.lower()
resAttrs = attrs
if attrs:
resAttrs = {}
for attrName in attrs.keys():
resAttrs[attrName.lower()] = attrs[attrName]
if attrs == None:
return resElem
else:
return resElem, resAttrs
def startElement(self, elem, attrs):
elem, attrs = self.lowerizeInput(elem, attrs)
e = XmlParser.startElement(self, elem, attrs)
currentElem = e.onElementStart(elem, attrs)
odfTag = currentElem.getOdfTag(e)
if HTML_2_ODT.has_key(elem):
e.dumpStyledElement(currentElem, odfTag, attrs)
elif elem == 'a':
e.dumpString('<%s %s:type="simple"' % (odfTag, e.linkNs))
if attrs.has_key('href'):
e.dumpString(' %s:href="%s"' % (e.linkNs, attrs['href']))
e.dumpString('>')
elif elem in XHTML_LISTS:
prologue = ''
if len(e.currentLists) >= 2:
# It is a list into another list. In this case the inner list
# must be surrounded by a list-item element.
prologue = '<%s:list-item>' % e.textNs
numbering = ''
if elem == 'ol':
numbering = ' %s:continue-numbering="false"' % e.textNs
e.dumpString('%s<%s %s:style-name="%s"%s>' % (
prologue, odfTag, e.textNs, e.listStyles[elem], numbering))
elif elem in ('li', 'thead', 'tr'):
e.dumpString('<%s>' % odfTag)
elif elem == 'table':
# Here we must call "dumpString" only once
e.dumpString('<%s %s:style-name="podTable">' % (odfTag, e.tableNs))
elif elem in TABLE_CELL_TAGS:
e.dumpString('<%s %s:style-name="%s"' % \
(odfTag, e.tableNs, DEFAULT_ODT_STYLES[elem]))
if attrs.has_key('colspan'):
e.dumpString(' %s:number-columns-spanned="%s"' % \
(e.tableNs, attrs['colspan']))
e.dumpString('>')
elif elem in IGNORABLE_TAGS:
e.ignore = True
def endElement(self, elem):
elem = self.lowerizeInput(elem)
e = XmlParser.endElement(self, elem)
currentElem, elemsToReopen = e.onElementEnd(elem)
# Determine the tag to dump
startTag, endTag = currentElem.getOdfTags(e)
if currentElem.isConflictual and e.res.endswith(startTag):
# We will not dump it, it would constitute a silly empty tag.
e.res = e.res[:-len(startTag)]
else:
# Dump the end tag. But dump some additional stuff if required.
if elem == 'div':
# For "div" elements, we append a carriage return.
endTag = '<%s:line-break/>%s' % (e.textNs, endTag)
elif elem in XHTML_LISTS:
if len(e.currentLists) >= 1:
# We were in an inner list. So we must close the list-item
# tag that surrounds it.
endTag = '%s%s:list-item>' % (endTag, e.textNs)
if endTag:
e.dumpString(endTag)
if elem in IGNORABLE_TAGS:
e.ignore = False
if elemsToReopen:
e.dumpString(e.getTags(elemsToReopen, start=True))
def characters(self, content):
e = XmlParser.characters(self, content)
if not e.ignore:
e.currentContent += content
# -------------------------------------------------------------------------------
class Xhtml2OdtConverter:
'''Converts a chunk of XHTML into a chunk of ODT.'''
def __init__(self, xhtmlString, encoding, stylesManager, localStylesMapping,
ns):
self.xhtmlString = xhtmlString
self.encoding = encoding # Todo: manage encoding that is not utf-8
self.stylesManager = stylesManager
self.odtStyles = stylesManager.styles
self.globalStylesMapping = stylesManager.stylesMapping
self.localStylesMapping = localStylesMapping
self.odtChunk = None
self.xhtmlParser = XhtmlParser(XhtmlEnvironment(ns), self)
def run(self):
self.xhtmlParser.parse(self.xhtmlString)
return self.xhtmlParser.env.res
def findStyle(self, elem, attrs=None, classValue=None):
'''Finds the ODT style that must be applied to XHTML p_elem that has
attrs p_attrs. In some cases, p_attrs is not given; the value of the
"class" attribute is given instead (in p_classValue).
Here are the places where we will search, ordered by
priority (highest first):
(1) local styles mapping (CSS style in "class" attr)
(2) " (HTML elem)
(3) global styles mapping (CSS style in "class" attr)
(4) " (HTML elem)
(5) ODT style that has the same name as CSS style in "class" attr
(6) Prefefined pod-specific ODT style that has the same name as
CSS style in "class" attr
(7) ODT style that has the same outline level as HTML elem.'''
res = None
cssStyleName = None
if attrs and attrs.has_key('class'):
cssStyleName = attrs['class']
if classValue:
cssStyleName = classValue
# (1)
if self.localStylesMapping.has_key(cssStyleName):
res = self.localStylesMapping[cssStyleName]
# (2)
elif self.localStylesMapping.has_key(elem):
res = self.localStylesMapping[elem]
# (3)
elif self.globalStylesMapping.has_key(cssStyleName):
res = self.globalStylesMapping[cssStyleName]
# (4)
elif self.globalStylesMapping.has_key(elem):
res = self.globalStylesMapping[elem]
# (5)
elif self.odtStyles.has_key(cssStyleName):
res = self.odtStyles[cssStyleName]
# (6)
elif self.stylesManager.podSpecificStyles.has_key(cssStyleName):
res = self.stylesManager.podSpecificStyles[cssStyleName]
# (7)
else:
# Try to find a style with the correct outline level
if elem in XHTML_HEADINGS:
# Is there a delta that must be taken into account ?
outlineDelta = 0
if self.localStylesMapping.has_key('h*'):
outlineDelta += self.localStylesMapping['h*']
elif self.globalStylesMapping.has_key('h*'):
outlineDelta += self.globalStylesMapping['h*']
outlineLevel = int(elem[1]) + outlineDelta
# Normalize the outline level
if outlineLevel < 1: outlineLevel = 1
res = self.odtStyles.getParagraphStyleAtLevel(outlineLevel)
if res:
self.stylesManager.checkStylesAdequation(elem, res)
return res
# ------------------------------------------------------------------------------