490 lines
21 KiB
Python
490 lines
21 KiB
Python
# -*- coding: iso-8859-15 -*-
|
||
# ------------------------------------------------------------------------------
|
||
# Appy is a framework for building applications in the Python language.
|
||
# Copyright (C) 2007 Gaetan Delannay
|
||
|
||
# This program is free software; you can redistribute it and/or
|
||
# modify it under the terms of the GNU General Public License
|
||
# as published by the Free Software Foundation; either version 2
|
||
# of the License, or (at your option) any later version.
|
||
|
||
# This program is distributed in the hope that it will be useful,
|
||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
# GNU General Public License for more details.
|
||
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with this program; if not, write to the Free Software
|
||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA.
|
||
|
||
'''RTF table parser.
|
||
|
||
This parser reads RTF documents that conform to the following.
|
||
- Each table must have a first row with only one cell: the table name.
|
||
- The other rows must all have the same number of columns. This number must
|
||
be strictly greater than 1.'''
|
||
|
||
# -----------------------------------------------------------------------------
|
||
import re, sys, collections
|
||
from collections import UserDict
|
||
from io import StringIO
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class ParserError(Exception):
    '''Raised when the content of the parsed RTF tables is invalid (unknown
       parent table, bad parent row, bad column type or value...).'''


# NOTE(review): this class deliberately keeps its historical name although it
# shadows the builtin TypeError for the whole module: every "except TypeError"
# written below catches this class, not the builtin one.
class TypeError(Exception):
    '''Raised by class Type when a type declaration or a value to convert is
       invalid.'''
|
||
|
||
# ParserError-related constants ------------------------------------------------
|
||
# Every message below is a %-format template; the arguments are supplied at
# the place where the corresponding ParserError (or KeyError) is raised.
BAD_PARENT_ROW = (
    'For table "%s", you specified "%s" as parent '
    'table, but you referred to row number "%s" '
    'within the parent. This value must be a positive '
    'integer or zero (we start counting rows at 0).'
)
PARENT_NOT_FOUND = (
    'I cannot find table "%s" that you defined as being '
    'parent of "%s".'
)
TABLE_KEY_ERROR = (
    'Within a row of table "%s", you mention a column named '
    '"%s" which does not exist neither in "%s" itself, '
    'neither in its parent row(s). '
)
PARENT_ROW_NOT_FOUND = (
    'You specified table "%s" as inheriting from table '
    '"%s", row "%d", but this row does not exist (table '
    '"%s" as a length = %d). Note that we start counting '
    'rows at 0.'
)
PARENT_COLUMN_NOT_FOUND = (
    'You specified table "%s" as inheriting from table '
    '"%s", column "%s", but this column does not exist '
    'in table "%s" or parents.'
)
PARENT_ROW_COL_NOT_FOUND = (
    'You specified table "%s" as inheriting from '
    'table "%s", column "%s", value "%s", but it does '
    'not correspond to any row in table "%s".'
)
NO_ROWS_IN_TABLE_YET = (
    'In first row of table "%s", you use value \' " \' '
    'for referencing the cell value in previous row, '
    'which does not exist.'
)
VALUE_ERROR = 'Value error for column "%s" of table "%s". %s'
TYPE_ERROR = 'Type error for column "%s" of table "%s". %s'
|
||
|
||
# TypeError-related constants -------------------------------------------------
|
||
# %-format templates used when class Type raises the module's own TypeError.
LIST_TYPE_ERROR = 'Maximum number of nested lists is 4.'
BASIC_TYPE_ERROR = (
    'Letter "%s" does not correspond to any valid type. '
    'Valid types are f (float), i (int), g (long) and b (bool).'
)
BASIC_VALUE_ERROR = 'Value "%s" can\'t be converted to type "%s".'
LIST_VALUE_ERROR = (
    'Value "%s" is malformed: within it, %s. You should check '
    'the use of separators ( , : ; - ) to obtain a schema '
    'conform to the type "%s"'
)
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class Type:
    '''Type of a table column, built from a type declaration string.

       A declaration is read character by character: every "l" adds one
       level of list nesting; the first other character must be a key of
       Type.basicTypes and selects the Python type of the basic values.
       Characters after that one are ignored. When no basic type letter is
       present, basic values are strings.'''

    basicTypes = {'f': float, 'i': int, 'g': int, 'b': bool}
    separators = ['-', ';', ',', ':']

    def __init__(self, typeDecl):
        '''Analyses p_typeDecl. Raises the module's TypeError when it contains
           an unknown type letter or more than 4 levels of list nesting.'''
        self.basicType = None # The python basic type
        # If = 1 : it is a list. If = 2: it is a list of lists. If = 3...
        self.listNumber = 0
        self.analyseTypeDecl(typeDecl)
        if self.listNumber > 4:
            raise TypeError(LIST_TYPE_ERROR)
        self.name = self.computeName()

    def analyseTypeDecl(self, typeDecl):
        '''Sets self.listNumber and self.basicType from p_typeDecl.'''
        for char in typeDecl:
            if char == 'l':
                self.listNumber += 1
                continue
            # First non-"l" character: it must designate the basic type.
            if char not in Type.basicTypes:
                raise TypeError(BASIC_TYPE_ERROR % char)
            self.basicType = Type.basicTypes[char]
            break
        if self.basicType is None:
            self.basicType = str

    def convertBasicValue(self, value):
        '''Converts string p_value (stripped) to self.basicType. Raises the
           module's TypeError when the conversion fails with a ValueError.'''
        # NOTE(review): for "b" columns, bool() is applied to a string, so
        # every non-empty value (including "False" or "0") converts to True.
        try:
            return self.basicType(value.strip())
        except ValueError as err:
            raise TypeError(BASIC_VALUE_ERROR %
                            (value, self.basicType.__name__)) from err

    def convertValue(self, value):
        '''Converts a p_value which is a string into a value conform
           to self.

           For a list type, the separators (see Type.separators) found in
           p_value, in their order of first appearance, are mapped to the
           nesting levels: the first one found delimits the innermost lists.
           When p_value uses more distinct separators than there are levels,
           the first ones found are ignored; when it uses fewer, the result
           is wrapped into additional single-element lists. An empty p_value
           gives an empty list. Raises the module's TypeError when a basic
           value can't be converted.'''
        if self.listNumber == 0:
            return self.convertBasicValue(value)
        # Get separators in their order of appearance
        separators = []
        for char in value:
            if (char in Type.separators) and (char not in separators):
                separators.append(char)
        # Remove surplus separators
        if len(separators) > self.listNumber:
            separators = separators[len(separators) - self.listNumber:]
        res = None
        innerList = None
        if len(separators) < self.listNumber:
            if not value:
                return []
            # Not enough separators: begin with enclosing list(s). The last
            # missing level is produced by appending the converted value.
            nbOfMissingSeps = self.listNumber - len(separators)
            res = []
            innerList = res
            for _ in range(nbOfMissingSeps - 1):
                newInnerList = []
                innerList.append(newInnerList)
                innerList = newInnerList
        # We can now convert the value, outermost separator first
        separators.reverse()
        try:
            converted = self.convertListItem(value, separators)
        except TypeError as te:
            raise TypeError(LIST_VALUE_ERROR % (value, te, self.name)) from te
        if innerList is not None:
            innerList.append(converted)
        else:
            res = converted
        return res

    def convertListItem(self, stringItem, remainingSeps):
        '''Recursively splits p_stringItem on the first of p_remainingSeps and
           converts the parts; when no separator remains, p_stringItem is
           converted as a basic value.'''
        if not remainingSeps:
            return self.convertBasicValue(stringItem)
        parts = stringItem.split(remainingSeps[0])
        if parts == ['']:
            # There was no value within value, so we produce an empty list.
            return []
        return [self.convertListItem(part, remainingSeps[1:])
                for part in parts]

    def computeName(self):
        '''Returns a readable name like "<list of list of int>".'''
        prefix = 'list of ' * self.listNumber
        return '<%s%s>' % (prefix, self.basicType.__name__)

    def __repr__(self):
        return self.name
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class Table(collections.UserList):
    '''A list of rows, with a name and an optional parent table.'''

    def __init__(self, initlist=None):
        '''p_initlist is optional and only exists because UserList builds the
           result of slicing or concatenation via self.__class__(data).'''
        collections.UserList.__init__(self, initlist)
        self.name = None
        self.parent = None
        # Either ~i~ (the ith row in table self.parent, index starts at 0) or
        # ~(s_columnName:s_columnValue)~ (identifies the 1st row that have
        # s_columnValue for the column named s_columnName)
        self.parentRow = None

    def _hasParent(self):
        '''Is a parent defined? A Table without rows is falsy (UserList
           defines __len__), so a plain truth test would wrongly ignore an
           empty parent table.'''
        return isinstance(self.parent, Table) or bool(self.parent)

    def dump(self, withContent=True):
        '''Returns a string made of the table name, the name of the parent
           table and the parent row specification if a parent is defined
           and, if p_withContent is True, a newline followed by str() of
           every row.'''
        parts = ['Table "%s"' % self.name]
        if self._hasParent():
            parts.append(' extends table "%s"' % self.parent.name)
            if isinstance(self.parentRow, int):
                parts.append('(%d)' % self.parentRow)
            else:
                # A (columnName, columnValue) pair
                parts.append('(%s=%s)' % self.parentRow)
        if withContent:
            parts.append('\n')
            parts.extend(str(line) for line in self)
        return ''.join(parts)

    def instanceOf(self, tableName):
        '''Is p_tableName the name of the parent of this table, or of any of
           its ancestors?'''
        if not self._hasParent():
            return False
        if self.parent.name == tableName:
            return True
        return self.parent.instanceOf(tableName)

    def asDict(self):
        '''If this table has only 2 columns named "key" and "value", it can be
           represented as a Python dict. This method produces this dict. All
           the rows of the direct parent table (if any) are added first, so
           that rows of this table override them.'''
        infoDict = {}
        if self._hasParent():
            for info in self.parent:
                infoDict[info["key"]] = info["value"]
        for info in self:
            infoDict[info["key"]] = info["value"]
        return infoDict
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class TableRow(UserDict):
    '''A row of a Table: a dict of column values keyed by column name, that
       falls back on a row of the parent table for missing keys.'''

    def __init__(self, table):
        UserDict.__init__(self)
        self.table = table

    def __getitem__(self, key):
        '''Implements row inheritance: when this row holds no value for
           p_key, the value is searched in the parent row, designated by
           self.table.parent and self.table.parentRow.

           Raises KeyError when p_key is found nowhere, and ParserError when
           the parent row itself can't be determined.'''
        if key in self:
            return UserDict.__getitem__(self, key)
        owner = self.table
        found = False
        value = None
        # NOTE(review): this is a truth test, so a parent table that has no
        # rows is handled like an absent parent (KeyError below).
        if owner.parent:
            if isinstance(owner.parentRow, int):
                # The parent row is designated by its index
                if not (owner.parentRow < len(owner.parent)):
                    raise ParserError(PARENT_ROW_NOT_FOUND %
                                      (owner.name, owner.parent.name,
                                       owner.parentRow, owner.parent.name,
                                       len(owner.parent)))
                try:
                    value = owner.parent[owner.parentRow][key]
                    found = True
                except KeyError:
                    pass
            else:
                # The parent row is the 1st one having a given value in a
                # given column
                colName, colValue = owner.parentRow
                matched = False
                for candidate in owner.parent:
                    try:
                        current = candidate[colName]
                    except KeyError:
                        raise ParserError(PARENT_COLUMN_NOT_FOUND %
                                          (owner.name, owner.parent.name,
                                           colName, owner.parent.name))
                    if current != colValue:
                        continue
                    matched = True
                    try:
                        value = candidate[key]
                        found = True
                    except KeyError:
                        pass
                    break
                if not matched:
                    raise ParserError(PARENT_ROW_COL_NOT_FOUND %
                                      (owner.name, owner.parent.name, colName,
                                       colValue, owner.parent.name))
        if not found:
            raise KeyError(TABLE_KEY_ERROR % (owner.name, key, owner.name))
        return value
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class NameResolver:
    '''Replaces, within tables, parent names by the parent tables.'''

    def resolveNames(self, tables):
        '''For every table in dict p_tables (keyed by table name) whose
           "parent" attribute is a table name, replaces this name by the
           corresponding table from p_tables.

           Parents that are not (non-empty) strings are left untouched, so
           calling this method again on already resolved tables is harmless.
           Raises ParserError when a parent name is not a key of p_tables.'''
        for table in tables.values():
            parentName = table.parent
            if not isinstance(parentName, str) or not parentName:
                # No parent, or parent already resolved
                continue
            if parentName not in tables:
                raise ParserError(PARENT_NOT_FOUND %
                                  (parentName, table.name))
            table.parent = tables[parentName]
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class TableParser:
    '''Character-level parser that extracts tables from a RTF file.

       Cell contents are accumulated in a buffer; control words "cell" and
       "row" trigger m_onColumn and m_onRow. A row with a single cell holds a
       table name, the row that follows it holds the column headers and the
       next rows hold data.'''

    # Parser possible states
    IGNORE = 0
    READING_CONTROL_WORD = 1
    READING_CONTENT = 2
    READING_SPECIAL_CHAR = 3

    def __init__(self, fileName):
        '''Opens p_fileName; the file is read and closed by m_parse.'''
        self.input = open(fileName)
        # RTF character types
        self.alpha = re.compile(r'[a-zA-Z_\-\*]')
        self.numeric = re.compile('[0-9]')
        self.whiteSpaces = (' ', '\t', '\n', '\r', '\f', '\v')
        # NOTE(review): this mapping is not used anywhere in this class. Its
        # keys look like the 2 hex digits of \'hh escapes for Windows-1252
        # quotes and ellipsis; the value for 81 looks encoding-damaged. To be
        # confirmed against the upstream source.
        self.specialChars = {91:"'", 92:"'", 93:'"', 94:'"', 85:'...', 81:'<EFBFBD>',
                             4:'', 5:''}
        # Parser state
        self.state = TableParser.READING_CONTENT
        # Parser buffers
        self.controlWordBuffer = ''
        self.contentBuffer = StringIO()
        self.specialCharBuffer = ''
        # Resulting RTF output tables
        self.rtfTables = {}
        # Attributes needed by onRow and onColumn
        self.nbOfColumns = 0
        self.currentRow = []
        self.previousRow = []
        self.currentTable = Table()
        self.currentTableName = None
        self.currentColumnNames = None # ~[]~
        self.currentColumnTypes = None # ~[]~
        self.rowIsHeader = False
        # Table name regular expression: "name" or "name(parentSpec)"
        self.tableNameRex = re.compile(r'([^\(]+)(?:\((.*)\))?')

    def isGroupDelimiter(self, char):
        return (char == '{') or (char == '}')

    def isControlWordStart(self, char):
        return (char == '\\')

    def isAlpha(self, char):
        return self.alpha.match(char)

    def isNumeric(self, char):
        return self.numeric.match(char)

    def isWhiteSpace(self, char):
        return (char in self.whiteSpaces)

    def isQuote(self, char):
        return char == "'"

    def manageControlWord(self):
        '''Reacts to the control word that has been buffered, then empties
           the control word buffer and goes back to reading content (or to
           ignoring it, for bookmarks).'''
        self.state = TableParser.READING_CONTENT
        cWord = self.controlWordBuffer
        if cWord == 'trowd':
            # Drop the content collected so far
            self.contentBuffer = StringIO()
        elif cWord == 'row':
            self.onRow()
            self.contentBuffer = StringIO()
        elif cWord == 'cell':
            self.onColumn(self.contentBuffer.getvalue().strip())
            self.contentBuffer = StringIO()
        elif cWord in ('bkmkstart', 'bkmkend'):
            # What follows is not bufferized (see m_bufferize)
            self.state = TableParser.IGNORE
        self.controlWordBuffer = ''

    def manageSpecialChar(self):
        '''Called after each character of a \\'hh escape. Once the 2 hex
           digits are there, writes the corresponding character into the
           content buffer and goes back to reading content.

           Raises ValueError if the 2 characters are not hex digits.'''
        if len(self.specialCharBuffer) != 2:
            return
        # A \'hh escape holds a single byte, so it can't be decoded as UTF-8
        # as soon as it is >= 0x80. Windows-1252 is assumed here (the most
        # common RTF "ansi" code page - to be confirmed for other documents);
        # bytes undefined in this code page become U+FFFD.
        specialChar = bytes.fromhex(self.specialCharBuffer).decode(
            'cp1252', errors='replace')
        self.specialCharBuffer = ''
        self.state = TableParser.READING_CONTENT
        if specialChar:
            self.contentBuffer.write(specialChar)

    def bufferize(self, char):
        '''Adds p_char to the buffer that corresponds to the current state
           (nothing is stored in state IGNORE).'''
        if self.state == TableParser.READING_CONTROL_WORD:
            self.controlWordBuffer += char
        elif self.state == TableParser.READING_CONTENT:
            self.contentBuffer.write(char)
        elif self.state == TableParser.READING_SPECIAL_CHAR:
            self.specialCharBuffer += char

    def parse(self):
        '''Reads the input file character by character and returns a dict
           ~{s_tableName: Table}~. Parent tables are still mentioned by name
           in the result. The input file is closed when this method exits.'''
        try:
            for line in self.input:
                for char in line:
                    self._parseChar(char)
        finally:
            self.input.close()
        if self.controlWordBuffer:
            self.manageControlWord()
        if self.currentTableName:
            self.addTable(self.currentTableName, self.currentTable)
        return self.rtfTables

    def _parseChar(self, char):
        '''Updates the parser state and buffers with a single p_char.'''
        if self.state == TableParser.READING_SPECIAL_CHAR:
            self.bufferize(char)
            self.manageSpecialChar()
            return
        if self.isGroupDelimiter(char):
            self.state = TableParser.READING_CONTENT
        elif self.isControlWordStart(char):
            # A new control word starts: manage the previous one, if any
            self.manageControlWord()
            self.state = TableParser.READING_CONTROL_WORD
        elif self.isAlpha(char) or self.isNumeric(char):
            self.bufferize(char)
        elif self.isWhiteSpace(char):
            if self.state == TableParser.READING_CONTROL_WORD:
                self.manageControlWord()
            elif self.state == TableParser.READING_CONTENT:
                if char not in ('\n', '\r'):
                    self.contentBuffer.write(char)
        elif self.isQuote(char):
            if (self.state == TableParser.READING_CONTROL_WORD) and \
               not self.controlWordBuffer:
                # A quote directly after the backslash starts a \'hh escape
                self.state = TableParser.READING_SPECIAL_CHAR
            else:
                self.bufferize(char)
        else:
            self.contentBuffer.write(char)

    def getColumnInfos(self, columnHeaders):
        '''Get, from the column headers, column names and types. Returns a
           tuple (names, types); the type is None for a header that has no
           type declaration. Raises ParserError on a bad type declaration.'''
        columnNames = []
        columnTypes = []
        for header in columnHeaders:
            if ':' in header:
                # We have a type declaration
                name, typeDecl = header.split(':')
                columnNames.append(name.strip())
                try:
                    columnTypes.append(Type(typeDecl.strip()))
                except TypeError as te:
                    raise ParserError(TYPE_ERROR %
                        (header, self.currentTableName, te)) from te
            else:
                # No type declaration: implicitly it is a string
                columnNames.append(header)
                columnTypes.append(None)
        return columnNames, columnTypes

    def onRow(self):
        '''Called at the end of a row: interprets the collected cells as
           column headers, as a table name (single cell) or as a data row,
           then resets the row buffers.'''
        if self.nbOfColumns and self.currentRow:
            if self.rowIsHeader:
                self.currentColumnNames, self.currentColumnTypes = \
                    self.getColumnInfos(self.currentRow)
                self.rowIsHeader = False
            elif self.nbOfColumns == 1:
                # A single cell: a new table begins; next row is its header
                self.rowIsHeader = True
                if self.currentTableName:
                    self.addTable(self.currentTableName, self.currentTable)
                # Rows of the new table go into a fresh Table
                self.currentTable = Table()
                self.currentTableName = self.currentRow[0]
            else:
                self.addRow()
        del self.currentRow[:]
        self.nbOfColumns = 0

    def onColumn(self, content):
        '''Called at the end of a cell whose p_content is added to the
           current row.'''
        self.currentRow.append(content)
        self.nbOfColumns += 1

    def addRow(self):
        '''Converts the cells of the current row into a TableRow and appends
           it to the current table. A cell containing only a double quote
           takes the value that the previous row has for this column.

           Raises ParserError if such a cell is found in the first row or if
           a value can't be converted to the type of its column.'''
        row = TableRow(self.currentTable)
        for i, columnName in enumerate(self.currentColumnNames):
            columnValue = self.currentRow[i]
            if columnValue == '"':
                if not self.currentTable:
                    raise ParserError(
                        NO_ROWS_IN_TABLE_YET % self.currentTableName)
                columnValue = self.currentTable[-1][columnName]
            else:
                columnType = self.currentColumnTypes[i]
                if columnType:
                    try:
                        columnValue = columnType.convertValue(columnValue)
                    except TypeError as te:
                        raise ParserError(VALUE_ERROR %
                            (columnName, self.currentTableName, te)) from te
            row[columnName] = columnValue
        self.currentTable.append(row)

    def addTable(self, tableName, table):
        '''Sets name, parent (as a name) and parentRow on p_table from
           p_tableName, then stores p_table in self.rtfTables.

           p_tableName is "name", "name(parent)" (parent row 0),
           "name(parent:i)" (parent row i) or "name(parent:column=value)".
           Raises ParserError if the parent row index is not an integer
           >= 0.'''
        # NOTE(review): the name is not stripped, so "name (parent)" is stored
        # with its trailing space.
        tName, parentSpec = self.tableNameRex.search(tableName).groups()
        table.name = tName
        if parentSpec:
            res = parentSpec.split(':')
            if len(res) == 1:
                table.parent = parentSpec.strip()
                table.parentRow = 0
            else:
                table.parent = res[0].strip()
                res = res[1].split('=')
                if len(res) == 1:
                    try:
                        table.parentRow = int(res[0])
                    except ValueError as err:
                        raise ParserError(BAD_PARENT_ROW %
                            (table.name, table.parent, res[0])) from err
                    if table.parentRow < 0:
                        raise ParserError(BAD_PARENT_ROW %
                            (table.name, table.parent, res[0]))
                else:
                    # The value is kept as a string
                    table.parentRow = (res[0].strip(), res[1].strip())
        self.rtfTables[table.name] = table
|
||
|
||
# -----------------------------------------------------------------------------
|
||
class RtfTablesParser:
    '''Entry point of this module: parses the tables of a RTF file, then
       links every table to its parent table.'''

    def __init__(self, fileName):
        '''p_fileName is the name of the RTF file to parse.'''
        self.tableParser = TableParser(fileName)
        self.nameResolver = NameResolver()

    def parse(self):
        '''Returns a dict of the parsed tables, keyed by table name, in which
           parent names have been replaced by the parent tables.'''
        parsedTables = self.tableParser.parse()
        self.nameResolver.resolveNames(parsedTables)
        return parsedTables
|
||
|
||
# -----------------------------------------------------------------------------
|
||
if __name__ == '__main__':
    # Parse the test document and print every table that was found in it
    parsedTables = RtfTablesParser("Tests.rtf").parse()
    for tableName in parsedTables:
        print('Table %s' % tableName)
        print(parsedTables[tableName])
|
||
# -----------------------------------------------------------------------------
|