Optimized XML marshall/unmarshall process for transferring large binary files.

This commit is contained in:
Gaetan Delannay 2009-08-28 15:14:26 +02:00
parent 2d82dc4e0b
commit 599396a838
2 changed files with 80 additions and 53 deletions

View file

@ -26,6 +26,14 @@ class UnmarshalledObject:
res = res.strip() + '>' res = res.strip() + '>'
return res.encode('utf-8') return res.encode('utf-8')
class UnmarshalledFile:
    '''Plain holder for a file object rebuilt from a marshalled Python
       object. A fresh instance describes an empty, unnamed file.'''
    def __init__(self):
        # Length of the file, in bytes.
        self.size = 0
        # Binary content of the file.
        self.content = ''
        # MIME type of the file (None as long as it is unknown).
        self.mimeType = None
        # Name of the file on disk.
        self.name = ''
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
class Dummy: pass class Dummy: pass
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------

View file

@ -17,7 +17,7 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA.
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
import xml.sax, base64 import xml.sax
from xml.sax.handler import ContentHandler, ErrorHandler from xml.sax.handler import ContentHandler, ErrorHandler
from xml.sax.xmlreader import InputSource from xml.sax.xmlreader import InputSource
from StringIO import StringIO from StringIO import StringIO
@ -133,8 +133,7 @@ class XmlParser(ContentHandler, ErrorHandler):
return self.res return self.res
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
from appy.shared import UnmarshalledObject, Dummy from appy.shared import UnmarshalledObject, UnmarshalledFile
from appy.gen.plone25.wrappers import FileWrapper
try: try:
from DateTime import DateTime from DateTime import DateTime
except ImportError: except ImportError:
@ -169,14 +168,10 @@ class XmlUnmarshaller(XmlParser):
# to store the next parsed element. A container can be a list, a tuple, # to store the next parsed element. A container can be a list, a tuple,
# an object (the root object of the whole web or a sub-object). # an object (the root object of the whole web or a sub-object).
self.env.currentBasicType = None # Will hold the name of the currently self.env.currentBasicType = None # Will hold the name of the currently
# parsed basic type (unicode, float, ...) # parsed basic type (unicode, float...)
self.env.currentContent = '' # We store here the content of tags. self.env.currentContent = '' # We store here the content of tags.
self.env.currentFileName = '' # If current tag contains a file, we
# store here the file name.
self.env.currentMimeType = '' # If current tag contains a file, we
# store here the MIME type.
containerTags = ('tuple', 'list', 'object') containerTags = ('tuple', 'list', 'object', 'file')
numericTypes = ('bool', 'int', 'float', 'long') numericTypes = ('bool', 'int', 'float', 'long')
def startElement(self, elem, attrs): def startElement(self, elem, attrs):
e = XmlParser.startElement(self, elem, attrs) e = XmlParser.startElement(self, elem, attrs)
@ -189,18 +184,19 @@ class XmlUnmarshaller(XmlParser):
if elemType == 'object': newObject = UnmarshalledObject() if elemType == 'object': newObject = UnmarshalledObject()
elif elemType == 'tuple': newObject = [] # Tuples become lists elif elemType == 'tuple': newObject = [] # Tuples become lists
elif elemType == 'list': newObject = [] elif elemType == 'list': newObject = []
elif elemType == 'file':
newObject = UnmarshalledFile()
if attrs.has_key('name'):
newObject.name = attrs['name']
if attrs.has_key('mimeType'):
newObject.mimeType = attrs['mimeType']
else: newObject = UnmarshalledObject() else: newObject = UnmarshalledObject()
# Store the value on the last container, or on the root object. # Store the value on the last container, or on the root object.
self.storeValue(elem, newObject) self.storeValue(elem, newObject)
# Push the new object on the container stack # Push the new object on the container stack
e.containerStack.append(newObject) e.containerStack.append(newObject)
else: else:
# We are parsing a basic type
e.currentBasicType = elemType e.currentBasicType = elemType
if elemType == 'file':
if attrs.has_key('name'): e.currentFileName = attrs['name']
if attrs.has_key('mimeType'):
e.currentMimeType = attrs['mimeType']
def storeValue(self, name, value): def storeValue(self, name, value):
'''Stores the newly parsed p_value (contained in tag p_name) on the '''Stores the newly parsed p_value (contained in tag p_name) on the
@ -214,8 +210,10 @@ class XmlUnmarshaller(XmlParser):
self.res.__class__ = self.klass self.res.__class__ = self.klass
else: else:
currentContainer = e.containerStack[-1] currentContainer = e.containerStack[-1]
if type(currentContainer) == list: if isinstance(currentContainer, list):
currentContainer.append(value) currentContainer.append(value)
elif isinstance(currentContainer, UnmarshalledFile):
currentContainer.content += value
else: else:
# Current container is an object # Current container is an object
setattr(currentContainer, name, value) setattr(currentContainer, name, value)
@ -236,13 +234,8 @@ class XmlUnmarshaller(XmlParser):
value = None value = None
elif e.currentBasicType == 'DateTime': elif e.currentBasicType == 'DateTime':
value = DateTime(e.currentContent.strip()) value = DateTime(e.currentContent.strip())
elif e.currentBasicType == 'file': elif e.currentBasicType == 'base64':
value = Dummy() value = e.currentContent.decode('base64')
value.name = e.currentFileName
value.content = base64.b64decode(e.currentContent.strip())
value.mimeType = e.currentMimeType
value.size = len(value.content)
value.__class__ = FileWrapper
else: else:
value = e.currentContent.strip() value = e.currentContent.strip()
# Store the value on the last container # Store the value on the last container
@ -271,24 +264,59 @@ class XmlMarshaller:
fieldsToExclude = [] fieldsToExclude = []
atFiles = ('image', 'file') # Types of archetypes fields that contain files. atFiles = ('image', 'file') # Types of archetypes fields that contain files.
def dumpValue(self, res, value, fieldType='basic'): def dumpString(self, res, s):
'''Dumps the XML version of p_value to p_res.''' '''Dumps a string into the result.'''
if fieldType == 'file':
# p_value contains the (possibly binary) content of a file. We will
# encode it in Base64.
if hasattr(value, 'data'):
v = value.data # Simple wrap for images
if hasattr(v, 'data'): v = v.data # Double wrap for files
else:
v = value
res.write(base64.b64encode(v))
elif isinstance(value, basestring):
# Replace special chars by XML entities # Replace special chars by XML entities
for c in value: for c in s:
if self.xmlEntities.has_key(c): if self.xmlEntities.has_key(c):
res.write(self.xmlEntities[c]) res.write(self.xmlEntities[c])
else: else:
res.write(c) res.write(c)
def dumpFile(self, res, v):
    '''Writes into p_res the content of file p_v, encoded in Base64 and
       wrapped in one or several <part> tags.

       p_res is any object with a "write" method. p_v is either the
       (possibly binary) content itself, or an object holding it in its
       "data" attribute. Nothing is returned.'''
    write = res.write
    # The first part is always opened, whatever the shape of p_v.
    write('<part type="base64" number="1">')
    if not hasattr(v, 'data'):
        # p_v is the raw content: it fits in a single part.
        write(v.encode('base64'))
    elif v.data.__class__.__name__ != 'Pdata':
        # p_v wraps its content, in one piece, in attribute "data"
        # (presumably an Archetypes file -- confirm against callers).
        write(v.data.encode('base64'))
    else:
        # The content is a chain of "Pdata" chunks linked through their
        # "next" attribute: every chunk is encoded on its own and gets
        # its own numbered part.
        chunk = v.data
        write(chunk.data.encode('base64'))
        partNumber = 1
        chunk = chunk.next
        while chunk:
            partNumber += 1
            write('</part>') # Close the previous part
            write('<part type="base64" number="%d">' % partNumber)
            write(chunk.data.encode('base64'))
            chunk = chunk.next
    # Close the last (or only) part.
    write('</part>')
def dumpValue(self, res, value, fieldType):
'''Dumps the XML version of p_value to p_res.'''
if fieldType == 'file':
self.dumpFile(res, value)
elif fieldType == 'ref':
if value:
if type(value) in self.sequenceTypes:
for elem in value:
self.dumpField(res, 'url', elem.absolute_url_path())
else:
self.dumpField(res, 'url', value.absolute_url_path())
elif type(value) in self.sequenceTypes:
# The previous condition must be checked before this one because
# Referred objects may be stored in lists or tuples, too.
for elem in value:
self.dumpField(res, 'e', elem)
elif isinstance(value, basestring):
self.dumpString(res, value)
elif isinstance(value, bool): elif isinstance(value, bool):
res.write(self.trueFalse[value]) res.write(self.trueFalse[value])
else: else:
@ -298,7 +326,7 @@ class XmlMarshaller:
'''Dumps in p_res, the value of the p_field for p_instance.''' '''Dumps in p_res, the value of the p_field for p_instance.'''
res.write('<'); res.write(fieldName); res.write('<'); res.write(fieldName);
# Dump the type of the field as an XML attribute # Dump the type of the field as an XML attribute
fType = None # No type will mean "string". fType = None # No type will mean "unicode".
if fieldType == 'file': fType ='file' if fieldType == 'file': fType ='file'
elif fieldType == 'ref': fType = 'list' elif fieldType == 'ref': fType = 'list'
elif isinstance(fieldValue, bool): fType = 'bool' elif isinstance(fieldValue, bool): fType = 'bool'
@ -309,28 +337,19 @@ class XmlMarshaller:
elif isinstance(fieldValue, list): fType = 'list' elif isinstance(fieldValue, list): fType = 'list'
elif fieldValue.__class__.__name__ == 'DateTime': fType = 'DateTime' elif fieldValue.__class__.__name__ == 'DateTime': fType = 'DateTime'
if fType: res.write(' type="%s"' % fType) if fType: res.write(' type="%s"' % fType)
# Dump other attributes if needed
if type(fieldValue) in self.sequenceTypes: if type(fieldValue) in self.sequenceTypes:
res.write(' count="%d"' % len(fieldValue)) res.write(' count="%d"' % len(fieldValue))
if fieldType == 'file': if fieldType == 'file':
if hasattr(fieldValue, 'content_type'): if hasattr(fieldValue, 'content_type'):
res.write(' mimeType="%s"' % fieldValue.content_type) res.write(' mimeType="%s"' % fieldValue.content_type)
if hasattr(fieldValue, 'filename'): if hasattr(fieldValue, 'filename'):
res.write(' name="%s"' % fieldValue.filename) res.write(' name="')
self.dumpString(res, fieldValue.filename)
res.write('"')
res.write('>') res.write('>')
# Dump the child elements if any # Dump the field value
if fieldType == 'ref': self.dumpValue(res, fieldValue, fieldType)
if fieldValue:
for elem in fieldValue:
self.dumpField(res, 'url', elem.absolute_url_path())
else:
self.dumpField(res, 'url', '')
elif type(fieldValue) in self.sequenceTypes:
# The previous condition must be checked before this one because
# Referred objects are stored in lists or tuples, too.
for elem in fieldValue:
self.dumpField(res, 'e', elem)
else:
res.write(self.dumpValue(res, fieldValue, fieldType))
res.write('</'); res.write(fieldName); res.write('>') res.write('</'); res.write(fieldName); res.write('>')
def marshall(self, instance, objectType='popo'): def marshall(self, instance, objectType='popo'):