# Source: appypod-rattail/appy/shared/rtf.py (2015-11-03 20:55:38 +01:00)
# -*- coding: iso-8859-15 -*-
# ------------------------------------------------------------------------------
# Appy is a framework for building applications in the Python language.
# Copyright (C) 2007 Gaetan Delannay
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA.
'''RTF table parser.
This parser reads RTF documents that conform to the following.
- Each table must have a first row with only one cell: the table name.
- The other rows must all have the same number of columns. This number must
be strictly greater than 1.'''
# -----------------------------------------------------------------------------
import re, sys, collections
from collections import UserDict
from io import StringIO
# -----------------------------------------------------------------------------
class ParserError(Exception):
    '''Raised when the content of the parsed RTF tables is inconsistent (bad
       parent reference, unknown column, value that cannot be converted...).'''

class TypeError(Exception):
    '''Raised when a column type declaration or a cell value is invalid.
       Note that, within this module, this class shadows the builtin
       exception having the same name.'''
# ParserError-related constants ------------------------------------------------
# Every message below is a "%"-format template; the number and kind of
# placeholders must match the tuples given where the messages are raised.
BAD_PARENT_ROW = ('For table "%s", you specified "%s" as parent '
                  'table, but you referred to row number "%s" '
                  'within the parent. This value must be a positive '
                  'integer or zero (we start counting rows at 0).')
PARENT_NOT_FOUND = ('I cannot find table "%s" that you defined as being '
                    'parent of "%s".')
TABLE_KEY_ERROR = ('Within a row of table "%s", you mention a column named '
                   '"%s" which does not exist neither in "%s" itself, '
                   'neither in its parent row(s). ')
PARENT_ROW_NOT_FOUND = ('You specified table "%s" as inheriting from table '
                        '"%s", row "%d", but this row does not exist (table '
                        '"%s" as a length = %d). Note that we start counting '
                        'rows at 0.')
PARENT_COLUMN_NOT_FOUND = ('You specified table "%s" as inheriting from table '
                           '"%s", column "%s", but this column does not exist '
                           'in table "%s" or parents.')
PARENT_ROW_COL_NOT_FOUND = ('You specified table "%s" as inheriting from '
                            'table "%s", column "%s", value "%s", but it does '
                            'not correspond to any row in table "%s".')
NO_ROWS_IN_TABLE_YET = ('In first row of table "%s", you use value \' " \' '
                        'for referencing the cell value in previous row, '
                        'which does not exist.')
VALUE_ERROR = 'Value error for column "%s" of table "%s". %s'
TYPE_ERROR = 'Type error for column "%s" of table "%s". %s'
# TypeError-related constants -------------------------------------------------
LIST_TYPE_ERROR = 'Maximum number of nested lists is 4.'
BASIC_TYPE_ERROR = ('Letter "%s" does not correspond to any valid type. '
                    'Valid types are f (float), i (int), g (long) and b (bool).')
BASIC_VALUE_ERROR = 'Value "%s" can\'t be converted to type "%s".'
LIST_VALUE_ERROR = ('Value "%s" is malformed: within it, %s. You should check '
                    'the use of separators ( , : ; - ) to obtain a schema '
                    'conform to the type "%s"')
# -----------------------------------------------------------------------------
class Type:
    '''Represents the type declared for a table column. A type declaration is
       made of zero or more letters "l" (each one adds a level of list
       nesting) optionally followed by one letter identifying a basic type
       (see p_basicTypes). When no basic type letter is given, values are
       strings. Characters following the basic type letter are ignored.'''
    basicTypes = {'f': float, 'i':int, 'g':int, 'b':bool}
    separators = ['-', ';', ',', ':']
    # Stripped, lowercased cell contents that a bool column reads as False.
    # Any other content is read as True.
    falseValues = ('', '0', 'false')

    def __init__(self, typeDecl):
        '''Builds the type from p_typeDecl. Raises TypeError if p_typeDecl
           contains an unknown type letter or declares more than 4 levels of
           nested lists.'''
        self.basicType = None # The python basic type
        self.listNumber = 0
        # If = 1 : it is a list. If = 2: it is a list of lists. If = 3...
        self.analyseTypeDecl(typeDecl)
        if self.listNumber > 4:
            raise TypeError(LIST_TYPE_ERROR)
        self.name = self.computeName()

    def analyseTypeDecl(self, typeDecl):
        '''Reads p_typeDecl and sets self.listNumber and self.basicType
           accordingly. Raises TypeError on an unknown type letter.'''
        for char in typeDecl:
            if char == 'l':
                self.listNumber += 1
                continue
            # Any letter other than "l" must designate the basic type
            if char not in Type.basicTypes:
                raise TypeError(BASIC_TYPE_ERROR % char)
            self.basicType = Type.basicTypes[char]
            break
        if self.basicType is None:
            self.basicType = str

    def convertBasicValue(self, value):
        '''Converts string p_value (surrounding whitespace is ignored) to
           self.basicType. Raises TypeError if the conversion fails.'''
        stripped = value.strip()
        if self.basicType is bool:
            # bool(aString) is True for any non-empty string, "False" and "0"
            # included: the falsy spellings must be recognized explicitly.
            return stripped.lower() not in Type.falseValues
        try:
            return self.basicType(stripped)
        except ValueError:
            raise TypeError(BASIC_VALUE_ERROR % (value,
                                                 self.basicType.__name__))

    def convertValue(self, value):
        '''Converts a p_value which is a string into a value conform
           to self: a basic value if self.listNumber is 0, (nested) lists of
           basic values else. Raises TypeError if a basic value cannot be
           converted.'''
        if self.listNumber == 0:
            return self.convertBasicValue(value)
        # Get separators in their order of appearance
        separators = []
        for char in value:
            if (char in Type.separators) and (char not in separators):
                separators.append(char)
        # Remove surplus separators: the ones that appeared first are dropped
        nbOfSurplusSeps = len(separators) - self.listNumber
        if nbOfSurplusSeps > 0:
            separators = separators[nbOfSurplusSeps:]
        nbOfMissingSeps = self.listNumber - len(separators)
        # The outermost lists are split on the separator that appeared last
        separators.reverse()
        if nbOfMissingSeps > 0:
            # Not enough separators for the declared nesting level
            if not value:
                return []
            # Wrap the converted value into as many single-element lists as
            # there are missing separators.
            res = innerList = []
            for i in range(nbOfMissingSeps-1):
                newInnerList = []
                innerList.append(newInnerList)
                innerList = newInnerList
            innerList.append(self.convertListItem(value, separators))
            return res
        try:
            return self.convertListItem(value, separators)
        except TypeError as te:
            raise TypeError(LIST_VALUE_ERROR % (value, te, self.name))

    def convertListItem(self, stringItem, remainingSeps):
        '''Recursively splits p_stringItem on the first separator of
           p_remainingSeps and converts the resulting parts with the
           remaining separators. When there is no separator left,
           p_stringItem is converted as a basic value.'''
        if not remainingSeps:
            return self.convertBasicValue(stringItem)
        parts = stringItem.split(remainingSeps[0])
        if (len(parts) == 1) and (not parts[0]):
            # There was no value within value, so we produce an empty list.
            return []
        return [self.convertListItem(part, remainingSeps[1:]) \
                for part in parts]

    def computeName(self):
        '''Returns a readable name for this type, like "<list of int>".'''
        prefix = 'list of ' * self.listNumber
        return '<%s%s>' % (prefix, self.basicType.__name__)

    def __repr__(self):
        return self.name
# -----------------------------------------------------------------------------
class Table(collections.UserList):
    '''A named list of rows. A table may reference a parent table (and a row
       within it) from which its rows inherit the values they do not define
       themselves.'''
    def __init__(self, initlist=None):
        '''Creates a table, empty unless p_initlist (a sequence of rows) is
           given. UserList builds the result of slicing, "+" and "*" by
           calling self.__class__(data): the constructor must thus accept
           this optional argument. A table built that way gets no name and
           no parent.'''
        collections.UserList.__init__(self, initlist)
        self.name = None
        self.parent = None
        self.parentRow = None
        # Either ~i~ (the ith row in table self.parent, index starts at 0) or
        # ~(s_columnName:s_columnValue)~ (identifies the 1st row that have
        # s_columnValue for the column named s_columnName)

    def _hasParent(self):
        '''Is a parent defined for this table? A parent being a Table
           instance counts even when it contains no row (a UserList without
           element is falsy); any other value is judged on its truth
           value.'''
        return isinstance(self.parent, Table) or bool(self.parent)

    def dump(self, withContent=True):
        '''Returns a string describing this table: its name, its parent (if
           any) and, if p_withContent is True, the string representation of
           every row.'''
        res = 'Table "%s"' % self.name
        if self._hasParent():
            res += ' extends table "%s"' % self.parent.name
            if isinstance(self.parentRow, int):
                res += '(%d)' % self.parentRow
            else:
                # self.parentRow is a tuple (columnName, columnValue)
                res += '(%s=%s)' % self.parentRow
        if withContent:
            res += '\n' + ''.join(str(line) for line in self)
        return res

    def instanceOf(self, tableName):
        '''Returns True if a table named p_tableName is found among the
           ancestors (parent, parent of the parent...) of this table.'''
        if not self._hasParent():
            return False
        if self.parent.name == tableName:
            return True
        return self.parent.instanceOf(tableName)

    def asDict(self):
        '''If this table as only 2 columns named "key" and "value", it can be
           represented as a Python dict. This method produces this dict.
           Entries from the rows of the direct parent table are included;
           rows of this table override them.'''
        infoDict = {}
        if self._hasParent():
            for info in self.parent:
                infoDict[info["key"]] = info["value"]
        for info in self:
            infoDict[info["key"]] = info["value"]
        return infoDict
# -----------------------------------------------------------------------------
class TableRow(UserDict):
    '''A row of a table: a dict whose keys are column names. Keys that are
       absent from the row are looked up in the parent row, if any.'''
    def __init__(self, table):
        UserDict.__init__(self)
        # The table this row belongs to
        self.table = table

    def __getitem__(self, key):
        '''Returns the value stored in this row for p_key. If the row has no
           such key, the value is searched in the parent row, as defined by
           attributes "parent" and "parentRow" of self.table.

           Raises KeyError if p_key is found neither in this row nor in its
           parent row, and ParserError if the parent row itself cannot be
           determined.'''
        if key in self:
            return UserDict.__getitem__(self, key)
        owner = self.table
        found = False
        if owner.parent:
            if isinstance(owner.parentRow, int):
                # The parent row is identified by its index
                if owner.parentRow >= len(owner.parent):
                    raise ParserError(PARENT_ROW_NOT_FOUND %
                                      (owner.name, owner.parent.name,
                                       owner.parentRow, owner.parent.name,
                                       len(owner.parent)))
                try:
                    res = owner.parent[owner.parentRow][key]
                    found = True
                except KeyError:
                    pass
            else:
                # The parent row is the 1st one having tColumn = tValue
                tColumn, tValue = owner.parentRow
                for row in owner.parent:
                    try:
                        curVal = row[tColumn]
                    except KeyError:
                        raise ParserError(PARENT_COLUMN_NOT_FOUND %
                                          (owner.name, owner.parent.name,
                                           tColumn, owner.parent.name))
                    if curVal != tValue:
                        continue
                    try:
                        res = row[key]
                        found = True
                    except KeyError:
                        pass
                    break
                else:
                    # The loop ended without meeting any matching row
                    raise ParserError(PARENT_ROW_COL_NOT_FOUND %
                                      (owner.name, owner.parent.name, tColumn,
                                       tValue, owner.parent.name))
        if not found:
            raise KeyError(TABLE_KEY_ERROR % (owner.name, key, owner.name))
        return res
# -----------------------------------------------------------------------------
class NameResolver:
    '''Replaces, within parsed tables, parent table names by the parent
       tables themselves.'''
    def resolveNames(self, tables):
        '''p_tables is a dict of tables keyed by table name. For every table
           whose attribute "parent" holds a name, this name is replaced by
           the corresponding table from p_tables. Raises ParserError if no
           table has this name.'''
        for table in tables.values():
            parentName = table.parent
            if not parentName:
                # This table has no parent
                continue
            if parentName not in tables:
                raise ParserError(PARENT_NOT_FOUND %
                                  (parentName, table.name))
            table.parent = tables[parentName]
# -----------------------------------------------------------------------------
class TableParser:
    '''Character-by-character reader that extracts, from a RTF file, the
       tables it contains. A row made of a single cell gives the name of a
       new table; the row that follows it gives the column headers; other
       rows are data rows.'''
    # Parser possible states
    IGNORE = 0
    READING_CONTROL_WORD = 1
    READING_CONTENT = 2
    READING_SPECIAL_CHAR = 3

    def __init__(self, fileName):
        '''Opens the file named p_fileName. The file is closed by m_parse.'''
        self.input = open(fileName)
        self.state = None
        # RTF character types
        self.alpha = re.compile(r'[a-zA-Z_\-\*]')
        self.numeric = re.compile('[0-9]')
        self.whiteSpaces = (' ', '\t', '\n', '\r', '\f', '\v')
        # NOTE(review): this dict is not read anywhere in this class, and the
        # value for key 81 looks mis-encoded. It is kept untouched in case
        # external code reads it.
        self.specialChars = {91:"'", 92:"'", 93:'"', 94:'"', 85:'...', 81:'<EFBFBD>',
                             4:'', 5:''}
        # Parser state
        self.state = TableParser.READING_CONTENT
        # Parser buffers
        self.controlWordBuffer = ''
        self.contentBuffer = StringIO()
        self.specialCharBuffer = ''
        # Resulting RTF output tables
        self.rtfTables = {}
        # Attributes needed by onRow and onColumn
        self.nbOfColumns = 0
        self.currentRow = []
        self.previousRow = []
        self.currentTable = Table()
        self.currentTableName = None
        self.currentColumnNames = None # ~[]~
        self.currentColumnTypes = None # ~[]~
        self.rowIsHeader = False
        # Table name regular expression: a name, optionally followed by a
        # parent specification between parentheses.
        self.tableNameRex = re.compile(r'([^\(]+)(?:\((.*)\))?')

    def isGroupDelimiter(self, char):
        return (char == '{') or (char == '}')

    def isControlWordStart(self, char):
        return (char == '\\')

    def isAlpha(self, char):
        return self.alpha.match(char)

    def isNumeric(self, char):
        return self.numeric.match(char)

    def isWhiteSpace(self, char):
        return (char in self.whiteSpaces)

    def isQuote(self, char):
        return char == "'"

    def manageControlWord(self):
        '''Reacts to the control word stored in self.controlWordBuffer, then
           empties this buffer. Only the control words delimiting rows and
           cells, and the bookmark ones, have an effect.'''
        self.state = TableParser.READING_CONTENT
        cWord = self.controlWordBuffer
        if cWord == 'trowd':
            self.contentBuffer = StringIO()
        elif cWord == 'row':
            self.onRow()
            self.contentBuffer = StringIO()
        elif cWord == 'cell':
            self.onColumn(self.contentBuffer.getvalue().strip())
            self.contentBuffer = StringIO()
        elif cWord in ('bkmkstart', 'bkmkend'):
            # In this state, m_bufferize drops the chars it receives
            self.state = TableParser.IGNORE
        self.controlWordBuffer = ''

    def manageSpecialChar(self):
        '''Once the 2 hexadecimal digits following a "\\'" have been read,
           writes the corresponding character in the content buffer and
           switches back to state READING_CONTENT. Does nothing as long as
           less than 2 chars are buffered.'''
        if len(self.specialCharBuffer) < 2:
            return
        hexDigits = self.specialCharBuffer
        self.specialCharBuffer = ''
        self.state = TableParser.READING_CONTENT
        try:
            rawByte = bytes.fromhex(hexDigits)
        except ValueError:
            # Not a valid hexadecimal escape: keep the chars as plain content
            # instead of aborting the whole parsing.
            self.contentBuffer.write(hexDigits)
            return
        # The escape holds a single byte. A byte >= 0x80 is never valid UTF-8
        # on its own: decoding as UTF-8 fails for any accented char or smart
        # quote. NOTE(review): cp1252 is assumed as the document code page;
        # the "\ansicpg" declaration of the file is not read.
        try:
            specialChar = rawByte.decode('cp1252')
        except UnicodeDecodeError:
            # A few byte values are undefined in cp1252
            specialChar = rawByte.decode('latin-1')
        if specialChar != '':
            self.contentBuffer.write(specialChar)

    def bufferize(self, char):
        '''Appends p_char to the buffer corresponding to the current state.
           In state IGNORE, p_char is dropped.'''
        if self.state == TableParser.READING_CONTROL_WORD:
            self.controlWordBuffer += char
        elif self.state == TableParser.READING_CONTENT:
            self.contentBuffer.write(char)
        elif self.state == TableParser.READING_SPECIAL_CHAR:
            self.specialCharBuffer += char

    def parse(self):
        '''Reads the input file, closes it and returns the dict of parsed
           tables, keyed by table name. Parent tables are still referred to
           by their names in the result. Calling this method again does not
           read the (closed) file anymore.'''
        if not self.input.closed:
            try:
                for line in self.input:
                    for char in line:
                        if self.state == TableParser.READING_SPECIAL_CHAR:
                            # Any char following "\'" is part of the escape
                            self.bufferize(char)
                            self.manageSpecialChar()
                            continue
                        if self.isGroupDelimiter(char):
                            self.state = TableParser.READING_CONTENT
                        elif self.isControlWordStart(char):
                            # A backslash ends the pending control word, if
                            # any, and starts a new one.
                            self.manageControlWord()
                            self.state = TableParser.READING_CONTROL_WORD
                        elif self.isAlpha(char):
                            self.bufferize(char)
                        elif self.isNumeric(char):
                            self.bufferize(char)
                        elif self.isWhiteSpace(char):
                            if self.state == TableParser.READING_CONTROL_WORD:
                                self.manageControlWord()
                            elif self.state == TableParser.READING_CONTENT:
                                # Line breaks of the RTF source are not content
                                if char not in ['\n', '\r']:
                                    self.contentBuffer.write(char)
                        elif self.isQuote(char):
                            if (self.state == \
                                TableParser.READING_CONTROL_WORD) and \
                               not self.controlWordBuffer:
                                # A quote directly following a backslash
                                # starts a hexadecimal escape.
                                self.state = TableParser.READING_SPECIAL_CHAR
                            else:
                                self.bufferize(char)
                        else:
                            self.contentBuffer.write(char)
            finally:
                # The file is opened by the constructor: release it even if
                # parsing fails.
                self.input.close()
        # Manage the control word that may end the file
        if self.controlWordBuffer:
            self.manageControlWord()
        # Store the last table
        if self.currentTableName:
            self.addTable(self.currentTableName, self.currentTable)
        return self.rtfTables

    def getColumnInfos(self, columnHeaders):
        '''Get, from the column headers, column names and types. Returns a
           tuple (names, types), "types" containing None for every column
           having no type declaration. Raises ParserError if a type
           declaration is wrong.'''
        columnNames = []
        columnTypes = []
        for header in columnHeaders:
            if ':' in header:
                # We have a type declaration
                name, typeDecl = header.split(':')
                columnNames.append(name.strip())
                try:
                    columnTypes.append(Type(typeDecl.strip()))
                except TypeError as te:
                    raise ParserError(TYPE_ERROR %
                                      (header, self.currentTableName, te))
            else:
                # No type declaration: implicitly it is a string
                columnNames.append(header)
                columnTypes.append(None)
        return columnNames, columnTypes

    def onRow(self):
        '''Called at the end of every row: according to the context, the
           cells collected in self.currentRow are the name of a new table,
           column headers or a data row. Rows without cell are ignored.'''
        if self.nbOfColumns and self.currentRow:
            if self.rowIsHeader:
                self.currentColumnNames, self.currentColumnTypes = \
                    self.getColumnInfos(self.currentRow)
                self.rowIsHeader = False
            elif self.nbOfColumns == 1:
                # A single cell: it is the name of a new table, whose headers
                # are in the next row. Store the previous table, if any.
                self.rowIsHeader = True
                if self.currentTableName:
                    self.addTable(self.currentTableName, self.currentTable)
                    self.currentTable = Table()
                self.currentTableName = self.currentRow[0]
            else:
                self.addRow()
        del self.currentRow[:]
        self.nbOfColumns = 0

    def onColumn(self, content):
        '''Called at the end of every cell, whose text is p_content.'''
        self.currentRow.append(content)
        self.nbOfColumns += 1

    def addRow(self):
        '''Converts the cells of self.currentRow into a TableRow and appends
           it to the current table. A cell containing a single double quote
           takes the value of the same column in the previous row. Raises
           ParserError if there is no previous row or if a value cannot be
           converted to the type of its column.'''
        row = TableRow(self.currentTable)
        for i, columnName in enumerate(self.currentColumnNames):
            columnValue = self.currentRow[i]
            if columnValue == '"':
                if len(self.currentTable) == 0:
                    raise ParserError(
                        NO_ROWS_IN_TABLE_YET % self.currentTableName)
                columnValue = self.currentTable[-1][columnName]
            else:
                columnType = self.currentColumnTypes[i]
                if columnType:
                    try:
                        columnValue = columnType.convertValue(columnValue)
                    except TypeError as te:
                        raise ParserError(VALUE_ERROR %
                                          (columnName, self.currentTableName,
                                           te))
            row[columnName] = columnValue
        self.currentTable.append(row)

    def addTable(self, tableName, table):
        '''Stores p_table in self.rtfTables. p_tableName may be followed by a
           parent specification between parentheses: "parent",
           "parent:rowNumber" or "parent:column=value". The name of the
           parent and the reference to the parent row are stored on p_table.
           Raises ParserError if the row number is not a positive integer or
           zero.'''
        res = self.tableNameRex.search(tableName)
        tName, parentSpec = res.groups()
        table.name = tName
        if parentSpec:
            res = parentSpec.split(':')
            if len(res) == 1:
                # No row specified: the parent row is the first one
                table.parent = parentSpec.strip()
                table.parentRow = 0
            else:
                table.parent = res[0].strip()
                res = res[1].split('=')
                if len(res) == 1:
                    try:
                        table.parentRow = int(res[0])
                    except ValueError:
                        raise ParserError(BAD_PARENT_ROW %
                                          (table.name, table.parent,
                                           res[0]))
                    if table.parentRow < 0:
                        raise ParserError(BAD_PARENT_ROW %
                                          (table.name, table.parent,
                                           res[0]))
                else:
                    table.parentRow = (res[0].strip(), res[1].strip())
        self.rtfTables[table.name] = table
# -----------------------------------------------------------------------------
class RtfTablesParser:
    '''Entry point of this module: parses the tables from a RTF file, then
       links every table to its parent table.'''
    def __init__(self, fileName):
        '''p_fileName is the name of the RTF file to parse.'''
        self.tableParser = TableParser(fileName)
        self.nameResolver = NameResolver()

    def parse(self):
        '''Returns the dict of parsed tables, keyed by table name, parent
           names being replaced by the parent tables themselves.'''
        parsedTables = self.tableParser.parse()
        self.nameResolver.resolveNames(parsedTables)
        return parsedTables
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    # When run as a script: parse file "Tests.rtf" from the current folder
    # and print the name and the content of every table found in it.
    tables = RtfTablesParser("Tests.rtf").parse()
    for key in tables:
        print('Table %s' % key)
        print(tables[key])
# -----------------------------------------------------------------------------