From 02fce031433df736f0130a387c2a4d7ccd2e0758 Mon Sep 17 00:00:00 2001
From: Gaetan Delannay <gaetan.delannay@gmail.com>
Date: Sat, 2 Jul 2011 11:46:49 +0200
Subject: [PATCH] appy.pod: optimized image importing: when an image is
 imported several times in a pod template through a call to 'do ... from
 document(at=path)', pod inserts only one copy of the file into the ODT
 result.

---
 pod/doc_importers.py | 128 ++++++++++++++++++++-----------------------
 pod/renderer.py      |  23 ++++----
 2 files changed, 73 insertions(+), 78 deletions(-)

diff --git a/pod/doc_importers.py b/pod/doc_importers.py
index 155bdb1..50807ec 100644
--- a/pod/doc_importers.py
+++ b/pod/doc_importers.py
@@ -17,7 +17,7 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301,USA.
 
 # ------------------------------------------------------------------------------
-import os, os.path, time, shutil, struct
+import os, os.path, time, shutil, struct, random
 from appy.pod import PodError
 from appy.pod.odf_parser import OdfEnvironment
 
@@ -31,10 +31,13 @@ PDF_TO_IMG_ERROR = 'A PDF file could not be converted into images. Please ' \
 class DocImporter:
     '''Base class used for importing external content into a pod template (an
        image, another pod template, another odt document...'''
-    def __init__(self, content, at, format, tempFolder, ns):
+    def __init__(self, content, at, format, tempFolder, ns, fileNames):
         self.content = content
-        self.at = at # If content is None, p_at tells us where to find it
-        # (file system path, url, etc)
+        # If content is None, p_at tells us where to find it (file system path,
+        # url, etc)
+        self.at = at
+        # Ensure this path exists.
+        if at and not os.path.isfile(at): raise PodError(FILE_NOT_FOUND % at)
         self.format = format
         self.res = u''
         self.ns = ns
@@ -45,64 +48,51 @@ class DocImporter:
         self.svgNs = ns[OdfEnvironment.NS_SVG]
         self.tempFolder = tempFolder
         self.importFolder = self.getImportFolder()
-        # If the importer generates one or several images, we will retain their
-        # names here, because we will need to declare them in
-        # META-INF/manifest.xml
-        self.fileNames = []
-        if self.at:
-            # Check that the file exists
-            if not os.path.isfile(self.at):
-                raise PodError(FILE_NOT_FOUND % self.at)
-            self.importPath = self.moveFile(self.at)
+        # Create the import folder if it does not exist.
+        if not os.path.exists(self.importFolder): os.mkdir(self.importFolder)
+        self.importPath = self.getImportPath(at, format)
+        # A link to the global fileNames dict (explained in renderer.py)
+        self.fileNames = fileNames
+        if at:
+            # Move the file within the ODT, if it is an image and if this image
+            # has not already been imported.
+            self.importPath = self.moveFile(at, self.importPath)
         else:
             # We need to dump the file content (in self.content) in a temp file
             # first. self.content may be binary or a file handler.
-            if not os.path.exists(self.importFolder):
-                os.mkdir(self.importFolder)
             if isinstance(self.content, file):
-                self.fileName = os.path.basename(self.content.name)
                 fileContent = self.content.read()
             else:
-                self.fileName = 'f%f.%s' % (time.time(), self.format)
                 fileContent = self.content
-            self.importPath = self.getImportPath(self.fileName)
-            theFile = file(self.importPath, 'w')
-            theFile.write(fileContent)
-            theFile.close()
-        self.importPath = os.path.abspath(self.importPath)
+            f = file(self.importPath, 'w')
+            f.write(fileContent)
+            f.close()
+
     def getImportFolder(self):
         '''This method must be overridden and gives the path where to dump the
            content of the document or image. In the case of a document it is a
            temp folder; in the case of an image it is a folder within the ODT
            result.'''
-        pass
-    def getImportPath(self, fileName):
-        '''Import path is the path to the external file or image that is now
-           stored on disk. We check here that this name does not correspond
-           to an existing file; if yes, we change the path until we get a path
-           that does not correspond to an existing file.'''
-        res = '%s/%s' % (self.importFolder, fileName)
-        resIsGood = False
-        while not resIsGood:
-            if not os.path.exists(res):
-                resIsGood = True
-            else:
-                # We must find another file name, this one already exists.
-                name, ext = os.path.splitext(res)
-                name += 'g'
-                res = name + ext
-        return res
-    def moveFile(self, at):
+
+    def getImportPath(self, at, format):
+        '''Gets the path name of the file to dump on disk (within the ODT for
+           images, in a temp folder for docs).'''
+        if not format:
+            format = os.path.splitext(at)[1][1:]
+        fileName = 'f.%d.%f.%s' % (random.randint(0,10), time.time(), format)
+        return os.path.abspath('%s/%s' % (self.importFolder, fileName))
+
+    def moveFile(self, at, importPath):
         '''In the case parameter "at" was used, we may want to move the file at
-           p_at within the ODT result (for images) or do nothing (for
-           documents).'''
+           p_at within the ODT result in p_importPath (for images) or do
+           nothing (for docs). In the latter case, the file to import stays
+           at p_at, and is not copied into p_importPath.'''
         return at
 
 class OdtImporter(DocImporter):
     '''This class allows to import the content of another ODT document into a
        pod template.'''
-    def getImportFolder(self):
-        return '%s/docImports' % self.tempFolder
+    def getImportFolder(self): return '%s/docImports' % self.tempFolder
     def run(self):
         self.res += '<%s:section %s:name="PodImportSection%f">' \
                     '<%s:section-source %s:href="%s" ' \
@@ -116,8 +106,7 @@ class PdfImporter(DocImporter):
        template. It calls gs to split the PDF into images and calls the
        ImageImporter for importing it into the result.'''
     imagePrefix = 'PdfPart'
-    def getImportFolder(self):
-        return '%s/docImports' % self.tempFolder
+    def getImportFolder(self): return '%s/docImports' % self.tempFolder
     def run(self):
         # Split the PDF into images with Ghostscript
         imagesFolder = os.path.dirname(self.importPath)
@@ -132,8 +121,7 @@ class PdfImporter(DocImporter):
             if fileName == firstImage:
                 succeeded = True
                 break
-        if not succeeded:
-            raise PodError(PDF_TO_IMG_ERROR)
+        if not succeeded: raise PodError(PDF_TO_IMG_ERROR)
         # Insert images into the result.
         noMoreImages = False
         i = 0
@@ -143,10 +131,9 @@ class PdfImporter(DocImporter):
             if os.path.exists(nextImage):
                 # Use internally an Image importer for doing this job.
                 imgImporter = ImageImporter(None, nextImage, 'jpg',
-                    self.tempFolder, self.ns)
+                    self.tempFolder, self.ns, self.fileNames)
                 imgImporter.setAnchor('paragraph')
                 self.res += imgImporter.run()
-                self.fileNames += imgImporter.fileNames
                 os.remove(nextImage)
             else:
                 noMoreImages = True
@@ -194,21 +181,25 @@ class ImageImporter(DocImporter):
        externally.'''
     anchorTypes = ('page', 'paragraph', 'char', 'as-char')
     WRONG_ANCHOR = 'Wrong anchor. Valid values for anchors are: %s.'
-    def getImportFolder(self):
-        return '%s/unzip/Pictures' % self.tempFolder
-    def moveFile(self, at):
-        '''Image to insert is at p_at. We must move it into the ODT result.'''
-        fileName = os.path.basename(at)
-        folderName = self.getImportFolder()
-        if not os.path.exists(folderName):
-            os.mkdir(folderName)
-        res = self.getImportPath(fileName)
-        shutil.copy(at, res)
-        return res
+    def getImportFolder(self): return '%s/unzip/Pictures' % self.tempFolder
+
+    def moveFile(self, at, importPath):
+        '''Copies file at p_at into the ODT file at p_importPath.'''
+        # Has this image already been imported ?
+        for imagePath, imageAt in self.fileNames.iteritems():
+            if imageAt == at:
+                # Yes!
+                i = importPath.rfind('/Pictures/') + 1
+                return importPath[:i] + imagePath
+        # If I am here, the image has not already been imported: copy it.
+        shutil.copy(at, importPath)
+        return importPath
+
     def setAnchor(self, anchor):
         if anchor not in self.anchorTypes:
             raise PodError(self.WRONG_ANCHOR % str(self.anchorTypes))
         self.anchor = anchor
+
     def run(self):
         # Some shorcuts for the used xml namespaces
         d = self.drawNs
@@ -219,18 +210,19 @@ class ImageImporter(DocImporter):
         # Compute path to image
         i = self.importPath.rfind('/Pictures/')
         imagePath = self.importPath[i+1:]
-        self.fileNames.append(imagePath)
+        self.fileNames[imagePath] = self.at
         # Compute image size
         width, height = getSize(self.importPath, self.format)
         if width != None:
             size = ' %s:width="%fcm" %s:height="%fcm"' % (s, width, s, height)
         else:
             size = ''
-        self.res += '<%s:p><%s:frame %s:name="%s" %s:z-index="0" ' \
-                    '%s:anchor-type="%s"%s><%s:image %s:type="simple" ' \
-                    '%s:show="embed" %s:href="%s" %s:actuate="onLoad"/>' \
-                    '</%s:frame></%s:p>' % \
-                    (t, d, d, imageName, d, t, self.anchor, size, d, x, x, x,
-                     imagePath, x, d, t)
+        image = '<%s:frame %s:name="%s" %s:z-index="0" %s:anchor-type="%s"%s>' \
+                '<%s:image %s:type="simple" %s:show="embed" %s:href="%s" ' \
+                '%s:actuate="onLoad"/></%s:frame>' % (d, d, imageName, d, t, \
+                self.anchor, size, d, x, x, x, imagePath, x, d)
+        if hasattr(self, 'wrapInPara') and self.wrapInPara:
+            image = '<%s:p>%s</%s:p>' % (t, image, t)
+        self.res += image
         return self.res
 # ------------------------------------------------------------------------------
diff --git a/pod/renderer.py b/pod/renderer.py
index bbc36e9..d6224e6 100644
--- a/pod/renderer.py
+++ b/pod/renderer.py
@@ -142,10 +142,14 @@ class Renderer:
         self.forceOoCall = forceOoCall
         self.finalizeFunction = finalizeFunction
         self.overwriteExisting = overwriteExisting
-        # Retain potential files or images that will be included through
+        # Remember potential files or images that will be included through
         # "do ... from document" statements: we will need to declare them in
-        # META-INF/manifest.xml.
-        self.fileNames = []
+        # META-INF/manifest.xml. Keys are file names as they appear within the
+        # ODT file (to dump in manifest.xml); values are the original paths of
+        # the included images. The latter are used to avoid creating multiple
+        # copies, within the ODT result, of a file which is imported several
+        # times.
+        self.fileNames = {}
         self.prepareFolders()
         # Unzip template
         self.unzipFolder = os.path.join(self.tempFolder, 'unzip')
@@ -255,12 +259,12 @@ class Renderer:
     imageFormats = ('png', 'jpeg', 'jpg', 'gif')
     ooFormats = ('odt',)
     def importDocument(self, content=None, at=None, format=None,
-                       anchor='as-char'):
+                       anchor='as-char', wrapInPara=True):
         '''If p_at is not None, it represents a path or url allowing to find
            the document. If p_at is None, the content of the document is
            supposed to be in binary format in p_content. The document
-           p_format may be: odt or any format in imageFormats. p_anchor is only
-           relevant for images.'''
+           p_format may be: odt or any format in imageFormats. p_anchor and
+           p_wrapInPara are only relevant for images.'''
         ns = self.currentParser.env.namespaces
         importer = None
         # Is there someting to import?
@@ -287,12 +291,11 @@ class Renderer:
             importer = PdfImporter
         else:
             raise PodError(DOC_WRONG_FORMAT % format)
-        imp = importer(content, at, format, self.tempFolder, ns)
+        imp = importer(content, at, format, self.tempFolder, ns, self.fileNames)
         if isImage:
             imp.setAnchor(anchor)
+            imp.wrapInPara = wrapInPara
         res = imp.run()
-        if imp.fileNames:
-            self.fileNames += imp.fileNames
         return res
 
     def prepareFolders(self):
@@ -323,7 +326,7 @@ class Renderer:
         if self.fileNames:
             j = os.path.join
             toInsert = ''
-            for fileName in self.fileNames:
+            for fileName in self.fileNames.iterkeys():
                 mimeType = mimetypes.guess_type(fileName)[0]
                 toInsert += ' <manifest:file-entry manifest:media-type="%s" ' \
                             'manifest:full-path="%s"/>\n' % (mimeType, fileName)