document (.mht file) generation with images OK

2012-12-19 11:09:39 +01:00 · 2012-12-19 11:09:39 +01:00 · c62122dcd9
commit c62122dcd9
parent 87d323a550
2 changed files with 42 additions and 20 deletions
--- a/docgen/README.txt
+++ b/docgen/README.txt
@@ -20,12 +20,7 @@ Working with MHT Files
  >>> data = f.read()
  >>> f.close()
-  >>> imagePath = os.path.join(basePath, 'test_image.jpg')
+  >>> xbody = '''<div class="WordSection1">
  >>> f = open(imagePath, 'rt')
  >>> imageData = f.read()
  >>> f.close()
  >>> body = '''<div class="WordSection1">
  ... <v:shape id="Grafik_x0020_2" o:spid="_x0000_i1025" type="#_x0000_t75"
  ...     style="width:320pt;height:240pt;visibility:visible;mso-wrap-style:square">
  ...   <v:imagedata src="FB-Besprechungsprotokoll-Dateien/image002.jpg" o:title=""/>
@@ -33,15 +28,27 @@ Working with MHT Files
  ... </div>
  ... '''
  >>> body = '''<div class="WordSection1">
  ... <img src="files/test_image.jpg" />
  ... </div>
  ... '''
  >>> from cybertools.docgen.mht import MHTFile
  >>> document = MHTFile(data, body)
-  >>> document.addImage(imageData, 'files/test_image.jpg')
+
  >>> imageRefs = document.htmlDoc.getImageRefs()
  >>> for path in imageRefs:
  ...     imagePath = os.path.join(basePath, os.path.basename(path))
  ...     f = open(imagePath, 'rt')
  ...     imageData = f.read()
  ...     f.close()
  ...     document.addImage(imageData, path)
  >>> document.insertBody()
  >>> output = document.asString()
  >>> len(data), len(output)
-  (294996, 336268)
+  (294996, 336140)
  >>> outPath = os.path.join(basePath, 'out_doc.mht')
  >>> #f = open(outPath, 'wt')
--- a/docgen/mht.py
+++ b/docgen/mht.py
@@ -21,10 +21,14 @@ Working with MHT Files.
 """
 import base64
 from cStringIO import StringIO
 import email
 import Image
 import mimetypes
 import os
 from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Tag
 class MHTFile(object):
@@ -53,7 +57,7 @@ class MHTFile(object):
        self.body = body
        self.htmlDoc = HTMLDoc(body)
        self.lastImageNum = 0
-        self.imageMappings = []
+        self.imageMappings = {}
        for idx, part in enumerate(self.msg.walk()):
            docPath = part['Content-Location']
            contentType = part.get_content_type()
@@ -68,28 +72,30 @@ class MHTFile(object):
    def getImageRefs(self):
        return self.htmlDoc.getImageRefs()
-    def addImage(self, imageData, path, contentType='image/jpeg'):
+    def addImage(self, imageData, path):
        image = Image.open(StringIO(imageData))
        width, height = image.size
        contentType, enc = mimetypes.guess_type(path)
        bp, ext = os.path.splitext(path)
        self.lastImageNum += 1
        name = 'image%03i%s' % (self.lastImageNum, ext)
-        self.imageMappings.append((path, name))
+        self.imageMappings[path] = (name, width, height)
        flpos = self.indexes['filelist']
        vars = dict(path=self.path, docname=self.documentName,  
                    suffix=self.foldernameSuffix,
                    imgname=name, ctype=contentType,
                    imgdata=base64.encodestring(imageData))
        content = self. imageTemplate % vars
-        self.parts.insert(flpos, content)
+        self.parts.insert(flpos, str(content))
        filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
        filelist = self.parts[flpos]
-        self.parts[flpos] = filelist.replace(self.filelistPattern, filelistRep)
+        self.parts[flpos] = str(filelist.replace(self.filelistPattern, filelistRep))
    def insertBody(self):
-        self.htmlDoc.updateImageRefs(self.imageMappings)
+        path = '-'.join((self.documentName, self.foldernameSuffix))
-        # TODO: convert changed self.htmlDoc to new body
+        self.htmlDoc.updateImageRefs(self.imageMappings, path)
-        content = self.body.encode(self.encoding)
+        content = self.htmlDoc.doc.renderContents(self.encoding)
        bodyIndex = self.indexes['body']
        baseDocument = self.parts[bodyIndex]
        self.parts[bodyIndex] =  baseDocument.replace(self.bodyMarker, 
@@ -106,11 +112,20 @@ class HTMLDoc(object):
    def __init__(self, data):
        self.data = data
        self.doc = BeautifulSoup(data)
    def getImageRefs(self):
-        return []
+        return [img['src'] for img in self.doc('img')]
-    def updateImageRefs(self, mappings):
+    def updateImageRefs(self, mappings, path=''):
-        for old, new in mappings:
+        for img in self.doc('img'):
-            pass
+            name, width, height = mappings[img['src']]
            imgdata = Tag(self.doc, 'v:imagedata')
            imgdata['src'] = '/'.join((path, name))
            imgdata.isSelfClosing = True
            img.append(imgdata)
            del img['src']
            img['style'] = 'width:%spt;height:%spt' % (width, height)
            img.isSelfClosing = False
            img.name='v:shape'