work in progress: document generation with embedded images via MHT file

2012-12-05 10:07:09 +01:00 · 2012-12-05 10:07:09 +01:00 · 77eb301edd
commit 77eb301edd
parent efe5ff20da
2 changed files with 61 additions and 17 deletions
--- a/docgen/README.txt
+++ b/docgen/README.txt
@@ -22,18 +22,18 @@ Working with MHT Files

  >>> imagePath = os.path.join(basePath, 'test_image.jpg')

-  >>> from cybertools.docgen.mht import MHTFile
-  >>> document = MHTFile(data)
-  >>> document.addImage(imagePath)
-
  >>> body = '''<img src="test_image.jpg" />
  ... '''

-  >>> document.setBody(body)
+  >>> from cybertools.docgen.mht import MHTFile
+  >>> document = MHTFile(data, body)
+  >>> document.addImage(imagePath)  # TODO: provide imageData, path
+
+  >>> document.insertBody()

  >>> output = document.asString()
  >>> len(data), len(output)
-  (294996, 295017)
+  (294996, 295346)

  >>> outPath = os.path.join(basePath, 'out_doc.mht')
  >>> #f = open(outPath, 'wt')
--- a/docgen/mht.py
+++ b/docgen/mht.py
@@ -20,8 +20,9 @@
 Working with MHT Files.
 """

-from email import message_from_string
-#from email.multipart import MIMEMultipart
+import base64
+import email
+import os


 class MHTFile(object):
@@ -32,30 +33,73 @@ class MHTFile(object):
    bodyMarker = 'lxdoc_body'
    indexes = dict(body=2, filelist=-2)

+    imageTemplate = ('\n'
+        'Content-Location: file:///C:/AF2749EC/%(docname)s-Dateien/$(imgname)s\n'
+        'Content-Transfer-Encoding: base64\n'
+        'Content-Type: %(ctype)s\n\n%(imgdata)s\n\n')

-    def __init__(self, data):
+    filelistItemTemplate = ' <o:File HRef=3D"%s"/>\n'
+    filelistPattern ='<o:File HRef=3D"filelist.xml"/>'
+
+    def __init__(self, data, body):
        self.data = data
-        self.msg = message_from_string(data)
+        self.msg = email.message_from_string(data)
        self.boundary = self.msg.get_boundary()
        self.parts = data.split(self.boundary)
+        self.body = body
+        self.htmlDoc = HTMLDoc(body)
+        self.lastImageNum = 0
+        self.imageMappings = []
        #print '***', len(self.parts)
-        #for idx, part in enumerate(self.msg.walk()):
-        #    print '***', idx, part['Content-Location'], part.get_content_type()
+        for idx, part in enumerate(self.msg.walk()):
+        #    print '***', idx, , part.get_content_type()
+            if idx == 1:
+                docPath = part['Content-Location']
+        self.documentName = docPath
+        # TODO: collect existing images to provide consistent naming

-    def addImage(self, imagePath):
-        pass
+    def getImageRefs(self):
+        return self.htmlDoc.getImageRefs()

-    def setBody(self, body):
-        content = body.encode(self.encoding)
+    def addImage(self, imageData, path='image001.jpg', contentType='image/jpeg'):
+        flpos = self.indexes['filelist']
+        # TODO: get contentType from path
+        # TODO: generate name, update self.imageMappings
+        name = path
+        vars = dict(docname=self.documentName, imgname=name, ctype=contentType,
+                    imgdata=base64.encodestring(imageData))
+        content = self. imageTemplate % vars
+        self.parts.insert(flpos, content)
+        filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
+        filelist = self.parts[flpos]
+        self.parts[flpos] = filelist.replace(self.filelistPattern, filelistRep)
+
+
+    def insertBody(self):
+        # self.htmlDoc.updateImageRefs(self.imageMappings)
+        # TODO: convert changed self.htmlDoc to new body
+        content = self.body.encode(self.encoding)
        bodyIndex = self.indexes['body']
        baseDocument = self.parts[bodyIndex]
        self.parts[bodyIndex] =  baseDocument.replace(self.bodyMarker, 
                                        self.quopri(content))

    def asString(self):
-        #msg = MIMEMultipart('related')
        return self.boundary.join(self.parts)

    def quopri(self, s):
        return s.replace('="', '=3D"')

+
+class HTMLDoc(object):
+
+    def __init__(self, data):
+        self.data = data
+
+    def getImageRefs(self):
+        return []
+
+    def updateImageRefs(self, mappings):
+        for old, new in mappings:
+            pass
+