# -*- coding: utf-8 -*- # ------------------------------------------------------------------------------ # Appy is a framework for building applications in the Python language. # Copyright (C) 2007 Gaetan Delannay # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,USA. # ------------------------------------------------------------------------------ import xml.sax, difflib, types, cgi from xml.parsers.expat import XML_PARAM_ENTITY_PARSING_NEVER from xml.sax.handler import ContentHandler, ErrorHandler, feature_external_ges from xml.sax.xmlreader import InputSource from xml.sax import SAXParseException from appy.shared import UnicodeBuffer from appy.shared.errors import AppyError from appy.shared.utils import sequenceTypes from appy.shared.css import parseStyleAttribute # Constants -------------------------------------------------------------------- xmlPrologue = '\n' xhtmlPrologue = '\n' CONVERSION_ERROR = '"%s" value "%s" could not be converted by the XML ' \ 'unmarshaller.' CUSTOM_CONVERSION_ERROR = 'Custom converter for "%s" values produced an ' \ 'error while converting value "%s". %s' XML_SPECIAL_CHARS = {'<': '<', '>': '>', '&': '&', '"': '"', "'": '''} XML_SPECIAL_CHARS_NO_APOS = XML_SPECIAL_CHARS.copy() del XML_SPECIAL_CHARS_NO_APOS["'"] XML_ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"} HTML_ENTITIES = { 'iexcl': '¡', 'cent': '¢', 'pound': '£', 'curren': '€', 'yen': '¥', 'brvbar': 'Š', 'sect': '§', 'uml': '¨', 'copy':'©', 'ordf':'ª', 'laquo':'«', 'not':'¬', 'shy':'­', 'reg':'®', 'macr':'¯', 'deg':'°', 'plusmn':'±', 'sup2':'²', 'sup3':'³', 'acute':'Ž', 'micro':'µ', 'para':'¶', 'middot':'·', 'cedil':'ž', 'sup1':'¹', 'ordm':'º', 'raquo':'»', 'frac14':'Œ', 'frac12':'œ', 'frac34':'Ÿ', 'iquest':'¿', 'Agrave':'À', 'Aacute':'Á', 'Acirc':'Â', 'Atilde':'Ã', 'Auml':'Ä', 'Aring':'Å', 'AElig':'Æ', 'Ccedil':'Ç', 'Egrave':'È', 'Eacute':'É', 'Ecirc':'Ê', 'Euml':'Ë', 'Igrave':'Ì', 'Iacute':'Í', 'Icirc':'Î', 'Iuml':'Ï', 'ETH':'Ð', 'Ntilde':'Ñ', 'Ograve':'Ò', 'Oacute':'Ó', 'Ocirc':'Ó', 'Otilde':'Õ', 'Ouml':'Ö', 'times':'×', 'Oslash':'Ø', 'Ugrave':'Ù', 'Uacute':'Ú', 'Ucirc':'Û', 'Uuml':'Ü', 'Yacute':'Ý', 'THORN':'Þ', 'szlig':'ß', 'agrave':'à', 'aacute':'á', 'acirc':'â', 'atilde':'ã', 'auml':'ä', 'aring':'å', 'aelig':'æ', 'ccedil':'ç', 'egrave':'è', 'eacute':'é', 'ecirc':'ê', 'euml':'ë', 'igrave':'ì', 'iacute':'í', 'icirc':'î', 'iuml':'ï', 'eth':'ð', 'ntilde':'ñ', 'ograve':'ò', 'oacute':'ó', 'ocirc':'ô', 'otilde':'õ', 'ouml':'ö', 'divide':'÷', 'oslash':'ø', 'ugrave':'ù', 'uacute':'ú', 'ucirc':'û', 'uuml':'ü', 'yacute':'ý', 'thorn':'þ', 'yuml':'ÿ', 'euro':'€', 'nbsp':' ', "rsquo":"'", "lsquo":"'", "ldquo":"'", "rdquo":"'", 'ndash': '—', 'mdash': '—', 'oelig':'oe', 'quot': "'", 'mu': 'µ'} import htmlentitydefs for k, v in htmlentitydefs.entitydefs.iteritems(): if not HTML_ENTITIES.has_key(k) and not XML_ENTITIES.has_key(k): HTML_ENTITIES[k] = '' def escapeXml(s, format='xml', nsText='text'): '''Returns p_s, whose XML special chars have been replaced with escaped XML entities. If p_format is "odf", line breaks and tabs are converted to their ODF counterparts. In this case, it is needed to give the name of the "text" namespace (p_nsText) as defined in the ODF document where the line breaks and tabs must be inserted.''' if isinstance(s, unicode): res = u'' else: res = '' odf = format == 'odf' for c in s: if XML_SPECIAL_CHARS_NO_APOS.has_key(c): # We do not escape 'apos': there is no particular need for that. res += XML_SPECIAL_CHARS_NO_APOS[c] elif odf and (c == '\n'): res += '<%s:line-break/>' % nsText elif odf and (c == '\t'): res += '<%s:tab/>' % nsText elif odf and (c == '\r'): pass else: res += c return res def escapeXhtml(s): '''Return p_s, whose XHTML special chars and carriage return chars have been replaced with corresponding XHTML entities.''' if isinstance(s, unicode): res = u'' else: res = '' for c in s: if XML_SPECIAL_CHARS_NO_APOS.has_key(c): res += XML_SPECIAL_CHARS_NO_APOS[c] elif c == '\n': res += '
' elif c == '\r': pass else: res += c return res # ------------------------------------------------------------------------------ class XmlElement: '''Represents an XML tag.''' def __init__(self, elem, attrs=None, nsUri=None): '''An XmlElement instance may represent: - an already parsed tag (in this case, p_elem may be prefixed with a namespace); - the definition of an XML element (in this case, no namespace can be found in p_elem; but a namespace URI may be defined in p_nsUri).''' self.elem = elem self.attrs = attrs if elem.find(':') != -1: self.ns, self.name = elem.split(':') else: self.ns = '' self.name = elem self.nsUri = nsUri def equalsTo(self, other, namespaces=None): '''Does p_elem == p_other? If a p_namespaces dict is given, p_other must define a nsUri.''' res = None if namespaces: res = self.elem == ('%s:%s' % (namespaces[other.nsUri], other.name)) else: res = self.elem == other.elem return res def __repr__(self): res = self.elem if self.attrs: res += '(' for attrName, attrValue in self.attrs.items(): res += '%s="%s"' % (attrName, attrValue) res += ')' return res def getFullName(self, namespaces=None): '''Gets the name of the element including the namespace prefix.''' if not namespaces: res = self.elem else: res = '%s:%s' % (namespaces[self.nsUri], self.name) return res class XmlEnvironment: '''An XML environment remembers a series of elements during a SAX parsing. This class is an abstract class that gathers basic things like namespaces.''' def __init__(self): # This dict contains the xml namespace declarations encountered so far self.namespaces = {} # ~{s_namespaceUri:s_namespaceName}~ self.currentElem = None # The currently parsed element self.parser = None def manageNamespaces(self, attrs): '''Manages namespaces definitions encountered in p_attrs.''' for attrName, attrValue in attrs.items(): if attrName.startswith('xmlns:'): self.namespaces[attrValue] = attrName[6:] def ns(self, nsUri): '''Returns the namespace corresponding to o_nsUri.''' return self.namespaces[nsUri] class XmlParser(ContentHandler, ErrorHandler): '''Basic expat-based XML parser that does things like : - remembering the currently parsed element; - managing namespace declarations. This parser also knows about HTML entities.''' def __init__(self, env=None, caller=None, raiseOnError=True): '''p_env should be an instance of a class that inherits from XmlEnvironment: it specifies the environment to use for this SAX parser.''' ContentHandler.__init__(self) if not env: env = XmlEnvironment() self.env = env self.env.parser = self self.caller = caller # The class calling this parser self.parser = xml.sax.make_parser() # Fast, standard expat parser self.res = None # The result of parsing. # Raise or not an error when a parsing error is encountered. self.raiseOnError = raiseOnError # ContentHandler methods --------------------------------------------------- def startDocument(self): parser = self.parser._parser parser.UseForeignDTD(True) parser.SetParamEntityParsing(XML_PARAM_ENTITY_PARSING_NEVER) def setDocumentLocator(self, locator): self.locator = locator return self.env def endDocument(self): return self.env def startElement(self, elem, attrs): self.env.manageNamespaces(attrs) if self.env.currentElem == None: self.env.currentElem = XmlElement(elem, attrs=attrs) else: # Reuse the exiting instance in order to avoid creating one instance # every time an elem is met in the XML file. self.env.currentElem.__init__(elem, attrs) return self.env def endElement(self, elem): self.env.currentElem.__init__(elem) return self.env def characters(self, content): return self.env def skippedEntity(self, name): '''This method is called every time expat does not recognize an entity. We provide here support for HTML entities.''' if HTML_ENTITIES.has_key(name): self.characters(HTML_ENTITIES[name].decode('utf-8')) else: # Put a question mark instead of raising an exception. self.characters('?') # ErrorHandler methods --------------------------------------------------- def error(self, error): if self.raiseOnError: raise error else: print('SAX error %s' % str(error)) def fatalError(self, error): if self.raiseOnError: raise error else: print('SAX fatal error %s' % str(error)) def warning(self, error): pass def parse(self, xml, source='string'): '''Parses a XML stream. * If p_source is "string", p_xml must be a string containing valid XML content. * If p_source is "file": p_xml can be: - a string containing the path to the XML file on disk; - a file instance opened for reading. Note that in this case, this method will close it. ''' try: from cStringIO import StringIO except ImportError: from StringIO import StringIO self._xml = xml self.parser.setContentHandler(self) self.parser.setErrorHandler(self) self.parser.setFeature(feature_external_ges, False) inputSource = InputSource() if source == 'string': inputSource.setByteStream(StringIO(xml)) else: if not isinstance(xml, file): xml = file(xml) inputSource.setByteStream(xml) self.parser.parse(inputSource) if isinstance(xml, file): xml.close() return self.res # ------------------------------------------------------------------------------ from appy.shared import UnmarshalledFile from appy import Object try: from DateTime import DateTime except ImportError: DateTime = 'unicode' class XmlUnmarshaller(XmlParser): '''This class allows to parse a XML file and recreate the corresponding web of Python objects. This parser assumes that the XML file respects this convention: any tag may define in attribute "type" storing the type of its content, which may be: bool * int * float * long * DateTime * tuple * list * object If "object" is specified, it means that the tag contains sub-tags, each one corresponding to the value of an attribute for this object. if "tuple" is specified, it will be converted to a list.''' def __init__(self, classes={}, tagTypes={}, conversionFunctions={}, utf8=True): XmlParser.__init__(self) # self.classes below is a dict whose keys are tag names and values are # Python classes. During the unmarshalling process, when an object is # encountered, instead of creating an instance of Object, we will create # an instance of the class specified in self.classes. # Root tag is named "xmlPythonData" by default by the XmlMarshaller. # This will not work if the object in the specified tag is not an # Object instance (ie it is a list or tuple or simple value). Note that # we will not call the constructor of the specified class. We will # simply create an instance of Objects and dynamically change the class # of the created instance to this class. if not isinstance(classes, dict) and classes: # The user may only need to define a class for the root tag self.classes = {'xmlPythonData': classes} else: self.classes = classes # We expect that the parsed XML file will follow some conventions # (ie, a tag that corresponds to a list has attribute type="list" or a # tag that corresponds to an object has attribute type="object".). If # it is not the case of p_xmlContent, you can provide the missing type # information in p_tagTypes. Here is an example of p_tagTypes: # {"information": "list", "days": "list", "person": "object"}. self.tagTypes = tagTypes # The parser assumes that data is represented in some standard way. If # it is not the case, you may provide, in this dict, custom functions # allowing to convert values of basic types (long, float, DateTime...). # Every such function must take a single arg which is the value to # convert and return the converted value. Dict keys are strings # representing types ('bool', 'int', 'unicode', etc) and dict values are # conversion functions. Here is an example: # {'int': convertInteger, 'DateTime': convertDate} # NOTE: you can even invent a new basic type, put it in self.tagTypes, # and create a specific conversionFunction for it. This way, you can # for example convert strings that have specific values (in this case, # knowing that the value is a 'string' is not sufficient). self.conversionFunctions = conversionFunctions self.utf8 = utf8 def encode(self, value): '''Depending on self.utf8 we may need to encode p_value.''' if self.utf8: return value return value.encode('utf-8') def convertAttrs(self, attrs): '''Converts XML attrs to a dict.''' res = {} for k, v in attrs.items(): if ':' in k: # An attr prefixed with a namespace. Remove this. k = k.split(':')[-1] res[str(k)] = self.encode(v) return res def startDocument(self): XmlParser.startDocument(self) self.res = None # The resulting web of Python objects (Object instances) self.env.containerStack = [] # The stack of current "containers" where # to store the next parsed element. A container can be a list, a tuple, # an object (the root object of the whole web or a sub-object). self.env.currentBasicType = None # Will hold the name of the currently # parsed basic type (unicode, float...) self.env.currentContent = '' # We store here the content of tags. containerTags = ('tuple', 'list', 'dict', 'object', 'file') numericTypes = ('bool', 'int', 'float', 'long') def startElement(self, elem, attrs): # Remember the name of the previous element previousElem = None if self.env.currentElem: previousElem = self.env.currentElem.name e = XmlParser.startElement(self, elem, attrs) # Determine the type of the element. elemType = 'unicode' # Default value if attrs.has_key('type'): elemType = attrs['type'] elif self.tagTypes.has_key(elem): elemType = self.tagTypes[elem] if elemType in self.containerTags: # I must create a new container object. if elemType == 'object': newObject = Object(**self.convertAttrs(attrs)) elif elemType == 'tuple': newObject = [] # Tuples become lists elif elemType == 'list': newObject = [] elif elemType == 'dict': newObject = {} elif elemType == 'file': newObject = UnmarshalledFile() if attrs.has_key('name'): newObject.name = self.encode(attrs['name']) if attrs.has_key('mimeType'): newObject.mimeType = self.encode(attrs['mimeType']) else: newObject = Object(**self.convertAttrs(attrs)) # Store the value on the last container, or on the root object. self.storeValue(elem, newObject) # Push the new object on the container stack e.containerStack.append(newObject) else: # If we are already parsing a basic type, it means that we were # wrong for our diagnotsic of the containing element: it was not # basic. We will make the assumption that the containing element is # then an object. if e.currentBasicType: # Previous elem was an object: create it on the stack. newObject = Object() self.storeValue(previousElem, newObject) e.containerStack.append(newObject) e.currentBasicType = elemType def storeValue(self, name, value): '''Stores the newly parsed p_value (contained in tag p_name) on the current container in environment self.env.''' e = self.env # Remove namespace prefix when relevant if ':' in name: name = name.split(':')[-1] # Change the class of the value if relevant if (name in self.classes) and isinstance(value, Object): value.__class__ = self.classes[name] # Where must I store this value? if not e.containerStack: # I store the object at the root of the web. self.res = value else: currentContainer = e.containerStack[-1] if isinstance(currentContainer, list): currentContainer.append(value) elif isinstance(currentContainer, dict): # If the current container is a dict, it means that p_value is # a dict entry object named "entry" by convention and having # attributes "k" and "v" that store, respectively, the key and # the value of the entry. But this object is under construction: # at this time, attributes "k" and "v" are not created yet. We # will act in m_endElement, when the object will be finalized. pass elif isinstance(currentContainer, UnmarshalledFile): val = value or '' currentContainer.content += val currentContainer.size += len(val) else: # Current container is an object if hasattr(currentContainer, name) and \ getattr(currentContainer, name): # We have already encountered a sub-object with this name. # Having several sub-objects with the same name, we will # create a list. attrValue = getattr(currentContainer, name) if not isinstance(attrValue, list): attrValue = [attrValue, value] else: attrValue.append(value) else: attrValue = value setattr(currentContainer, name, attrValue) def characters(self, content): content = self.encode(content) e = XmlParser.characters(self, content) if e.currentBasicType: e.currentContent += content def endElement(self, elem): e = XmlParser.endElement(self, elem) if e.currentBasicType: value = e.currentContent.strip() if not value: value = None else: # If we have a custom converter for values of this type, use it. if self.conversionFunctions.has_key(e.currentBasicType): try: value = self.conversionFunctions[e.currentBasicType]( value) except Exception, err: raise AppyError(CUSTOM_CONVERSION_ERROR % ( e.currentBasicType, value, str(err))) # If not, try a standard conversion elif e.currentBasicType in self.numericTypes: try: exec 'value = %s' % value except SyntaxError: raise AppyError(CONVERSION_ERROR % ( e.currentBasicType, value)) except NameError: raise AppyError(CONVERSION_ERROR % ( e.currentBasicType, value)) # Check that the value is of the correct type. For instance, # a float value with a comma in it could have been converted # to a tuple instead of a float. if not isinstance(value, eval(e.currentBasicType)): raise AppyError(CONVERSION_ERROR % ( e.currentBasicType, value)) elif e.currentBasicType == 'DateTime': value = DateTime(value) elif e.currentBasicType == 'base64': value = e.currentContent.decode('base64') # Store the value on the last container self.storeValue(elem, value) # Clean the environment e.currentBasicType = None e.currentContent = '' else: elem = e.containerStack.pop() # This element can be a temporary "entry" object representing a dict # entry. if e.containerStack: lastContainer = e.containerStack[-1] if isinstance(lastContainer, dict): lastContainer[elem.k] = elem.v # Alias: "unmarshall" -> "parse" unmarshall = XmlParser.parse # ------------------------------------------------------------------------------ class XmlMarshaller: '''This class allows to produce a XML version of a Python object, which respects some conventions as described in the doc of the corresponding Unmarshaller (see above).''' xmlEntities = {'<': '<', '>': '>', '&': '&', '"': '"', "'": '''} trueFalse = {True: 'True', False: 'False'} fieldsToMarshall = 'all' fieldsToExclude = [] atFiles = ('image', 'file') # Types of archetypes fields that contain files. def __init__(self, cdata=False, dumpUnicode=False, conversionFunctions={}, dumpXmlPrologue=True, rootTag='xmlPythonData', namespaces={}, namespacedTags={}): # If p_cdata is True, all string values will be dumped as XML CDATA. self.cdata = cdata # If p_dumpUnicode is True, the result will be unicode. self.dumpUnicode = dumpUnicode # The following dict stores specific conversion (=Python to XML) # functions. A specific conversion function is useful when you are not # happy with the way built-in converters work, or if you want to # specify a specific way to represent, in XML, some particular Python # object or value. In this dict, every key represents a given type # (class names must be full-path class names); every value is a function # accepting 2 args: first one is the UnicodeBuffer where the result is # being dumped, while the second one is the Python object or value to # dump. self.conversionFunctions = conversionFunctions # If dumpXmlPrologue is True, the XML prologue will be dumped. self.dumpXmlPrologue = dumpXmlPrologue # The name of the root tag self.rootElementName = rootTag # The namespaces that will be defined at the root of the XML message. # It is a dict whose keys are namespace prefixes and whose values are # namespace URLs. If you want to specify a default namespace, specify an # entry with an empty string as a key. self.namespaces = namespaces # The following dict will tell which XML tags will get which namespace # prefix ({s_tagName: s_prefix}). Special optional dict entry # '*':s_prefix will indicate a default prefix that will be applied to # any tag that does not have it own key in this dict. self.namespacedTags = namespacedTags self.objectType = None # Will be given by method m_marshal def getTagName(self, name): '''Returns the name of tag p_name as will be dumped. It can be p_name, or p_name prefixed with a namespace prefix (will depend on self.prefixedTags).''' # Determine the prefix prefix = '' if name in self.namespacedTags: prefix = self.namespacedTags[name] elif '*' in self.namespacedTags: prefix = self.namespacedTags['*'] if prefix: return '%s:%s' % (prefix, name) return name def isAnObject(self, instance): '''Returns True if p_instance is a class instance, False if it is a basic type, or tuple, sequence, etc.''' if instance.__class__.__name__ == 'LazyMap': return False iType = type(instance) if iType == types.InstanceType: return True elif iType.__name__ == 'ImplicitAcquirerWrapper': # This is the case with Archetype instances return True elif iType.__class__.__name__ == 'ExtensionClass': return True return False def isAList(self, value): '''Is p_value a list?''' return value.__class__.__name__ in ('list', 'PersistentList', 'LazyMap') def isADict(self, value): '''Is p_value a dict?''' return value.__class__.__name__ in ('dict', 'PersistentMapping') def dumpRootTag(self, res, instance): '''Dumps the root tag.''' # Dumps the name of the tag. tagName = self.getTagName(self.rootElementName) res.write('<'); res.write(tagName) # Dumps namespace definitions if any for prefix, url in self.namespaces.iteritems(): if not prefix: pre = 'xmlns' # The default namespace else: pre = 'xmlns:%s' % prefix res.write(' %s="%s"' % (pre, url)) # Dumps Appy- or Plone-specific attribute if self.objectType != 'popo': res.write(' type="object" id="%s"' % instance.UID()) res.write('>') return tagName def dumpString(self, res, s): '''Dumps a string into the result.''' if self.cdata: res.write('') def dumpFile(self, res, v): '''Dumps a file into the result.''' if not v: return w = res.write # p_value contains the (possibly binary) content of a file. We will # encode it in Base64, in one or several parts. partTag = self.getTagName('part') res.write('<%s type="base64" number="1">' % partTag) if hasattr(v, 'data'): # The file is an Archetypes file. if v.data.__class__.__name__ == 'Pdata': # There will be several parts. w(v.data.data.encode('base64')) # Write subsequent parts nextPart = v.data.next nextPartNb = 2 while nextPart: w('' % partTag) # Close the previous part w('<%s type="base64" number="%d">' % (partTag, nextPartNb)) w(nextPart.data.encode('base64')) nextPart = nextPart.next nextPartNb += 1 else: w(v.data.encode('base64')) w('' % partTag) elif hasattr(v, 'uploadName'): # The file is a Appy FileInfo instance. Read the file from disk. filePath = v.getFilePath(self.instance) f = file(filePath, 'rb') partNb = 1 while True: chunk = f.read(v.BYTES) if not chunk: break # We have one more chunk. Dump the start tag (excepted if it is # the first chunk: the start tag has already been dumped, see # above). if partNb > 1: w('<%s type="base64" number="%d">' % (partTag, partNb)) w(chunk.encode('base64')) w('' % partTag) # Close the tag partNb += 1 f.close() else: w(v.encode('base64')) w('' % partTag) def dumpDict(self, res, v): '''Dumps the XML version of dict p_v.''' for key, value in v.iteritems(): res.write('') self.dumpField(res, 'k', key) self.dumpField(res, 'v', value) res.write('') def dumpValue(self, res, value, fieldType, isRef=False): '''Dumps the XML version of p_value to p_res.''' # Use a custom function if one is defined for this type of value. className = value.__class__.__name__ if className in self.conversionFunctions: self.conversionFunctions[className](res, value) return # Use a standard conversion else. if fieldType == 'file': self.dumpFile(res, value) elif fieldType == 'dict': self.dumpDict(res, value) elif isRef: if value: if self.objectType == 'appy': suffix = '/xml' else: suffix = '' if type(value) in sequenceTypes: for elem in value: self.dumpField(res, 'url', elem.absolute_url()+suffix) else: self.dumpField(res, 'url', value.absolute_url()+suffix) elif fieldType in ('list', 'tuple'): # The previous condition must be checked before this one because # referred objects may be stored in lists or tuples, too. for elem in value: self.dumpField(res, 'e', elem) elif isinstance(value, basestring): self.dumpString(res, value) elif isinstance(value, bool): res.write(self.trueFalse[value]) elif fieldType == 'object': if hasattr(value, 'absolute_url'): # Dump the URL to the object only res.write(value.absolute_url()) else: # Dump the entire object content for k, v in value.__dict__.iteritems(): if not k.startswith('__'): self.dumpField(res, k, v) # Maybe we could add a parameter to the marshaller to know how # to marshall objects (produce an ID, an URL, include the entire # tag but we need to take care of circular references,...) else: res.write(value) def dumpField(self, res, fieldName, fieldValue, fieldType='basic'): '''Dumps in p_res, the value of the p_field for p_instance.''' # As a preamble, manage special case of p_fieldName == "_any". In that # case, p_fieldValue corresponds to a previously marshalled string that # must be included as is here, without dumping the tag name. if fieldName == '_any': res.write(value) return # Now, dump "normal" fields. fieldTag = self.getTagName(fieldName) res.write('<'); res.write(fieldTag) # Dump the type of the field as an XML attribute fType = None # No type will mean "unicode". if fieldType == 'file': fType = 'file' elif fieldType == 'ref': fType = 'list' elif isinstance(fieldValue, bool): fType = 'bool' elif isinstance(fieldValue, int): fType = 'int' elif isinstance(fieldValue, float): fType = 'float' elif isinstance(fieldValue, long): fType = 'long' elif isinstance(fieldValue, tuple): fType = 'tuple' elif self.isAList(fieldValue): fType = 'list' elif self.isADict(fieldValue): fType = 'dict' elif fieldValue.__class__.__name__ == 'DateTime': fType = 'DateTime' elif self.isAnObject(fieldValue): fType = 'object' if self.objectType != 'popo': if fType: res.write(' type="%s"' % fType) # Dump other attributes if needed if fType in ('list', 'tuple'): length = 0 if fieldValue: length = len(fieldValue) res.write(' count="%d"' % length) if fType == 'file': # Get the MIME type mimeType = None if hasattr(fieldValue, 'content_type'): mimeType = fieldValue.content_type elif hasattr(fieldValue, 'mimeType'): mimeType = fieldValue.mimeType if mimeType: res.write(' mimeType="%s"' % mimeType) # Get the file name fileName = None if hasattr(fieldValue, 'filename'): fileName = fieldValue.filename elif hasattr(fieldValue, 'uploadName'): fileName = fieldValue.uploadName if fileName: res.write(' name="') self.dumpString(res, fileName) res.write('"') res.write('>') # Dump the field value self.dumpValue(res, fieldValue, fType, isRef=(fieldType=='ref')) res.write('') def marshall(self, instance, objectType='popo', conversionFunctions={}): '''Returns in a UnicodeBuffer the XML version of p_instance. If p_instance corresponds to a Plain Old Python Object, specify 'popo' for p_objectType. If p_instance corresponds to an Archetypes object (Zope/Plone), specify 'archetype' for p_objectType. if p_instance is a Appy object, specify "appy" as p_objectType. If p_instance is not an instance at all, but another Python data structure or basic type, p_objectType is ignored.''' self.objectType = objectType # The Appy object is needed to marshall its File fields. if objectType == 'appy': self.instance = instance # Call the XmlMarshaller constructor if it hasn't been called yet. if not hasattr(self, 'cdata'): XmlMarshaller.__init__(self) if conversionFunctions: self.conversionFunctions.update(conversionFunctions) # Create the buffer where the XML result will be dumped. res = UnicodeBuffer() # Dump the XML prologue if required if self.dumpXmlPrologue: res.write(xmlPrologue) if self.isAnObject(instance): # Dump the root tag rootTagName = self.dumpRootTag(res, instance) # Dump the fields of this root object if objectType == 'popo': for fieldName, fieldValue in instance.__dict__.iteritems(): mustDump = False if fieldName in self.fieldsToExclude: mustDump = False elif self.fieldsToMarshall == 'all': mustDump = True else: if (type(self.fieldsToMarshall) in sequenceTypes) \ and (fieldName in self.fieldsToMarshall): mustDump = True if mustDump: self.dumpField(res, fieldName, fieldValue) elif objectType == 'archetype': for field in instance.schema.fields(): # Dump only needed fields mustDump = False if field.getName() in self.fieldsToExclude: mustDump = False elif (self.fieldsToMarshall == 'all') and \ (field.schemata != 'metadata'): mustDump = True elif self.fieldsToMarshall == 'all_with_metadata': mustDump = True else: if (type(self.fieldsToMarshall) in sequenceTypes) \ and (field.getName() in self.fieldsToMarshall): mustDump = True if mustDump: fieldType = 'basic' if field.type in self.atFiles: fieldType = 'file' elif field.type == 'reference': fieldType = 'ref' self.dumpField(res, field.getName(),field.get(instance), fieldType=fieldType) elif objectType == 'appy': for field in instance.getAppyTypes('view', None): # Dump only needed fields if (field.type == 'Computed') and not field.plainText: # Ignore fields used for producing custom chunks of HTML # within the web UI. continue if field.name in self.fieldsToExclude: continue if (type(self.fieldsToMarshall) in sequenceTypes) \ and (field.name not in self.fieldsToMarshall): continue # Determine field type and value fieldType = 'basic' if field.type == 'File': fieldType = 'file' v = field.getValue(instance) elif field.type == 'Ref': fieldType = 'ref' v = field.getValue(instance, appy=False) else: v = field.getValue(instance) self.dumpField(res, field.name, v, fieldType=fieldType) # Dump the object history. if hasattr(instance.aq_base, 'workflow_history'): histTag = self.getTagName('history') eventTag = self.getTagName('event') res.write('<%s type="list">' % histTag) key = instance.workflow_history.keys()[0] history = instance.workflow_history[key] for event in history: res.write('<%s type="object">' % eventTag) for k, v in event.iteritems(): self.dumpField(res, k, v) res.write('' % eventTag) res.write('' % histTag) self.marshallSpecificElements(instance, res) res.write('') else: self.dumpField(res, self.rootElementName, instance) # Return the result res = res.getValue() if not self.dumpUnicode: res = res.encode('utf-8') return res def marshallSpecificElements(self, instance, res): '''You can use this marshaller as a base class for creating your own. In this case, this method will be called by the marshall method for allowing your concrete marshaller to insert more things in the result. p_res is the UnicodeBuffer buffer where the result of the marshalling process is currently dumped; p_instance is the instance currently marshalled.''' # ------------------------------------------------------------------------------ class XmlHandler(ContentHandler): '''This handler is used for producing, in self.res, a readable XML (with carriage returns) and for removing some tags that always change (like dates) from a file that need to be compared to another file.''' def __init__(self, xmlTagsToIgnore, xmlAttrsToIgnore): ContentHandler.__init__(self) self.res = unicode(xmlPrologue) self.namespaces = {} # ~{s_namespaceUri:s_namespaceName}~ self.indentLevel = -1 self.tabWidth = 3 self.tagsToIgnore = xmlTagsToIgnore self.attrsToIgnore = xmlAttrsToIgnore self.ignoring = False # Some content must be ignored, and not dumped # into the result. def isIgnorable(self, elem): '''Is p_elem an ignorable element ?''' res = False for tagName in self.tagsToIgnore: if isinstance(tagName, list) or isinstance(tagName, tuple): # We have a namespace nsUri, elemName = tagName try: nsName = self.ns(nsUri) elemFullName = '%s:%s' % (nsName, elemName) except KeyError: elemFullName = '' else: # No namespace elemFullName = tagName if elemFullName == elem: res = True break return res def setDocumentLocator(self, locator): self.locator = locator def endDocument(self): pass def dumpSpaces(self): self.res += '\n' + (' ' * self.indentLevel * self.tabWidth) def manageNamespaces(self, attrs): '''Manage namespaces definitions encountered in attrs''' for attrName, attrValue in attrs.items(): if attrName.startswith('xmlns:'): self.namespaces[attrValue] = attrName[6:] def ns(self, nsUri): return self.namespaces[nsUri] def startElement(self, elem, attrs): self.manageNamespaces(attrs) # Do we enter into a ignorable element ? if self.isIgnorable(elem): self.ignoring = True else: if not self.ignoring: self.indentLevel += 1 self.dumpSpaces() self.res += '<%s' % elem attrsNames = attrs.keys() attrsNames.sort() for attrToIgnore in self.attrsToIgnore: if attrToIgnore in attrsNames: attrsNames.remove(attrToIgnore) for attrName in attrsNames: self.res += ' %s="%s"' % (attrName, attrs[attrName]) self.res += '>' def endElement(self, elem): if self.isIgnorable(elem): self.ignoring = False else: if not self.ignoring: self.dumpSpaces() self.indentLevel -= 1 self.res += '' % elem def characters(self, content): if not self.ignoring: self.res += content.replace('\n', '') # ------------------------------------------------------------------------------ class XmlComparator: '''Compares 2 XML files and produces a diff.''' def __init__(self, fileNameA, fileNameB, areXml=True, xmlTagsToIgnore=(), xmlAttrsToIgnore=()): self.fileNameA = fileNameA self.fileNameB = fileNameB self.areXml = areXml # Can also diff non-XML files. self.xmlTagsToIgnore = xmlTagsToIgnore self.xmlAttrsToIgnore = xmlAttrsToIgnore def filesAreIdentical(self, report=None, encoding=None): '''Compares the 2 files and returns True if they are identical (if we ignore xmlTagsToIgnore and xmlAttrsToIgnore). If p_report is specified, it must be an instance of appy.shared.test.TestReport; the diffs will be dumped in it.''' # Perform the comparison differ = difflib.Differ() if self.areXml: f = file(self.fileNameA) contentA = f.read() f.close() f = file(self.fileNameB) contentB = f.read() f.close() xmlHandler = XmlHandler(self.xmlTagsToIgnore, self.xmlAttrsToIgnore) xml.sax.parseString(contentA, xmlHandler) contentA = xmlHandler.res.split('\n') xmlHandler = XmlHandler(self.xmlTagsToIgnore, self.xmlAttrsToIgnore) xml.sax.parseString(contentB, xmlHandler) contentB = xmlHandler.res.split('\n') else: f = file(self.fileNameA) contentA = f.readlines() f.close() f = file(self.fileNameB) contentB = f.readlines() f.close() diffResult = list(differ.compare(contentA, contentB)) # Analyse, format and report the result. atLeastOneDiff = False lastLinePrinted = False i = -1 for line in diffResult: i += 1 if line and (line[0] != ' '): if not atLeastOneDiff: msg = 'Difference(s) detected between files %s and %s:' % \ (self.fileNameA, self.fileNameB) if report: report.say(msg, encoding='utf-8') else: print(msg) atLeastOneDiff = True if not lastLinePrinted: if report: report.say('...') else: print('...') if self.areXml: if report: report.say(line, encoding=encoding) else: print(line) else: if report: report.say(line[:-1], encoding=encoding) else: print(line[:-1]) lastLinePrinted = True else: lastLinePrinted = False return not atLeastOneDiff # ------------------------------------------------------------------------------ class XhtmlCleaner(XmlParser): '''This class cleans XHTML content, so it becomes ready to be stored into a Appy-compliant format.''' class Error(Exception): pass # Tags that will never be in the result, content included. tagsToIgnoreWithContent = ('style', 'colgroup', 'head') # Tags that will be removed from the result, but whose content will be kept, tagsToIgnoreKeepContent = ('x', 'html', 'body') allTagsToIgnore = tagsToIgnoreWithContent + tagsToIgnoreKeepContent # Additional tags that will be removed, but content kept, if keepStyles is # False. tagsToIgnoreKeepContentDropStyles = ('font', 'center') # Attributes to ignore, if keepStyles if False. attrsToIgnore = ('align', 'valign', 'cellpadding', 'cellspacing', 'width', 'height', 'bgcolor', 'lang', 'border', 'class', 'rules', 'id', 'name') # CSS attributes to keep even if keepStyles if False. These attributes can # be used by pod (to align a paragraph, center/resize an image...). cssAttrsToKeep = ('width', 'height', 'float', 'text-align', 'font-style', 'font-weight') # Attrs to add, if not present, to ensure good formatting, be it at the web # or ODT levels. attrsToAdd = {'table': {'cellspacing':'0', 'cellpadding':'6', 'border':'1'}, 'tr': {'valign': 'top'}} # Tags that require a line break to be inserted after them. lineBreakTags = ('p', 'li', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'td') # No-end tags noEndTags = ('br', 'img') def __init__(self, keepStyles=True): XmlParser.__init__(self) self.keepStyles = keepStyles # Compute tags to ignore, which may vary according to p_keepStyles. self.tagsToIgnore = self.allTagsToIgnore if not keepStyles: self.tagsToIgnore += self.tagsToIgnoreKeepContentDropStyles def clean(self, s): '''Cleaning XHTML code is done for 2 reasons: 1. The main objective is to format XHTML p_s to be storable in the ZODB according to Appy rules. a. Every

or

  • must be on a single line (ending with a carriage return); else, appy.shared.diff will not be able to compute XHTML diffs; b. Optimize size: HTML comments are removed. 2. If p_keepStyles (or m_clean) is False, some style-related information will be removed, in order to get a standardized content that can be dumped in an elegant and systematic manner into a POD template. ''' self.env.currentContent = '' # The stack of currently parsed elements (will contain only ignored # ones). self.env.currentElems = [] # 'ignoreTag' is True if we must ignore the currently walked tag. self.env.ignoreTag = False # 'ignoreContent' is True if, within the currently ignored tag, we must # also ignore its content. self.env.ignoreContent = False try: res = self.parse('%s' % s).encode('utf-8') except SAXParseException, e: raise self.Error(str(e)) return res def cleanStyleAttribute(self, value): '''p_value contains some CSS attributes from a "style" attribute. We keep those that pod can manage.''' res = [] for name, v in parseStyleAttribute(value): if name in self.cssAttrsToKeep: res.append('%s: %s' % (name, v)) return '; '.join(res) def startDocument(self): # The result will be cleaned XHTML, joined from self.res. XmlParser.startDocument(self) self.res = [] def endDocument(self): self.res = ''.join(self.res) def startElement(self, elem, attrs): e = self.env # Dump any previously gathered content if any if e.currentContent: self.res.append(e.currentContent) e.currentContent = '' if e.ignoreTag and e.ignoreContent: return if elem in self.tagsToIgnore: e.ignoreTag = True if elem in self.tagsToIgnoreWithContent: e.ignoreContent = True else: e.ignoreContent = False e.currentElems.append( (elem, e.ignoreContent) ) return # Add a line break before the start tag if required (ie: xhtml differ # needs to get paragraphs and other elements on separate lines). if (elem in self.lineBreakTags) and self.res and \ (self.res[-1][-1] != '\n'): prefix = '\n' else: prefix = '' res = '%s<%s' % (prefix, elem) # Include the found attributes, excepted those that must be ignored. for name, value in attrs.items(): if not self.keepStyles: if name in self.attrsToIgnore: continue elif name == 'style': value = self.cleanStyleAttribute(value) if not value: continue res += ' %s="%s"' % (name, value) # Include additional attributes if required. if elem in self.attrsToAdd: for name, value in self.attrsToAdd[elem].iteritems(): res += ' %s="%s"' % (name, value) # Close the tag if it is a no-end tag if elem in self.noEndTags: suffix = '/>' else: suffix = '>' self.res.append('%s%s' % (res, suffix)) def endElement(self, elem): e = self.env if e.ignoreTag and (elem in self.tagsToIgnore): # Pop the currently ignored tag e.currentElems.pop() if e.currentElems: # Keep ignoring tags. e.ignoreContent = e.currentElems[-1][1] else: # Stop ignoring elems e.ignoreTag = e.ignoreContent = False elif e.ignoreTag and e.ignoreContent: # This is the end of a sub-tag within a region that we must ignore. pass else: if self.env.currentContent: self.res.append(self.env.currentContent) # Close the tag only if it is a no-end tag. if elem not in self.noEndTags: # Add a line break after the end tag if required (ie: xhtml # differ needs to get paragraphs and other elements on separate # lines). if (elem in self.lineBreakTags) and self.res and \ (self.res[-1][-1] != '\n'): suffix = '\n' else: suffix = '' self.res.append('%s' % (elem, suffix)) self.env.currentContent = '' def characters(self, content): if self.env.ignoreContent: return # Remove blanks that ckeditor may add just after a start tag or # between tags. if not self.env.currentContent or \ self.env.currentContent[-1] in ('\n', ' '): # I give here to lstrip an explicit list of what is to be considered # as blank chars, because I do not want unicode NBSP chars to be in # this list. toAdd = content.lstrip(u' \n\r\t') else: toAdd = content # Re-transform XML special chars to entities. self.env.currentContent += cgi.escape(toAdd) # ------------------------------------------------------------------------------ class XhtmlToText(XmlParser): '''Produces a text version of XHTML content.''' paraTags = ('p', 'li', 'center', 'div') def startDocument(self): XmlParser.startDocument(self) self.res = [] def endDocument(self): self.res = ''.join(self.res) return XmlParser.endDocument(self) def characters(self, content): self.res.append(content.replace('\n', '')) def startElement(self, elem, attrs): '''Dumps a carriage return every time a "br" tag is encountered.''' if elem == 'br': self.res.append('\n') def endElement(self, elem): '''Dumps a carriage return every time a paragraph is encountered.''' if elem in self.paraTags: self.res.append('\n') # ------------------------------------------------------------------------------