diff --git a/docgen/README.txt b/docgen/README.txt
index 19cf90e..f378c85 100644
--- a/docgen/README.txt
+++ b/docgen/README.txt
@@ -22,18 +22,18 @@ Working with MHT Files
>>> imagePath = os.path.join(basePath, 'test_image.jpg')
- >>> from cybertools.docgen.mht import MHTFile
- >>> document = MHTFile(data)
- >>> document.addImage(imagePath)
-
>>> body = '''
... '''
- >>> document.setBody(body)
+ >>> from cybertools.docgen.mht import MHTFile
+ >>> document = MHTFile(data, body)
+ >>> document.addImage(imagePath) # TODO: provide imageData, path
+
+ >>> document.insertBody()
>>> output = document.asString()
>>> len(data), len(output)
- (294996, 295017)
+ (294996, 295346)
>>> outPath = os.path.join(basePath, 'out_doc.mht')
>>> #f = open(outPath, 'wt')
diff --git a/docgen/mht.py b/docgen/mht.py
index fe06424..cff3e4b 100644
--- a/docgen/mht.py
+++ b/docgen/mht.py
@@ -20,8 +20,9 @@
Working with MHT Files.
"""
-from email import message_from_string
-#from email.multipart import MIMEMultipart
+import base64
+import email
+import os
class MHTFile(object):
@@ -32,30 +33,73 @@ class MHTFile(object):
+ # Placeholder text in the base document that insertBody() replaces.
bodyMarker = 'lxdoc_body'
+ # Positions within self.parts (the source split on the MIME boundary) used
+ # by insertBody() ('body') and addImage() ('filelist').
indexes = dict(body=2, filelist=-2)
+ # Text of one embedded-image part; addImage() fills it via %-formatting
+ # with the keys docname, imgname, ctype and imgdata.
+ imageTemplate = ('\n'
+     'Content-Location: file:///C:/AF2749EC/%(docname)s-Dateien/%(imgname)s\n'
+     'Content-Transfer-Encoding: base64\n'
+     'Content-Type: %(ctype)s\n\n%(imgdata)s\n\n')
- def __init__(self, data):
+ # NOTE(review): the two literals below look truncated -- whatever markup
+ # they were meant to hold appears to be missing. As written, addImage()
+ # cannot work: ' \n' % name raises TypeError (no conversion specifier),
+ # and str.replace('', ...) would insert text between every character.
+ # TODO: restore the intended filelist entry template and marker.
+ filelistItemTemplate = ' \n'
+ filelistPattern =''
+
+ def __init__(self, data, body):
+ """Parse the MHT source and keep the body text for later insertion.
+
+ data -- complete MHT document as a string; it is parsed as an email
+ message and also split on the MIME boundary into self.parts
+ body -- body text; stored, wrapped in an HTMLDoc, and inserted into
+ the document by insertBody()
+ """
self.data = data
- self.msg = message_from_string(data)
+ self.msg = email.message_from_string(data)
self.boundary = self.msg.get_boundary()
self.parts = data.split(self.boundary)
+ self.body = body
+ self.htmlDoc = HTMLDoc(body)
+ self.lastImageNum = 0
+ self.imageMappings = []
#print '***', len(self.parts)
- #for idx, part in enumerate(self.msg.walk()):
- # print '***', idx, part['Content-Location'], part.get_content_type()
+ # The Content-Location of the part at walk() index 1 is kept as the
+ # document name (presumably the main document part -- TODO confirm).
+ # NOTE(review): documentName appears to be assigned only for idx 1; a
+ # message without such a part would leave it unset -- confirm.
+ for idx, part in enumerate(self.msg.walk()):
+ # print '***', idx, part.get_content_type()
+ if idx == 1:
+ docPath = part['Content-Location']
+ self.documentName = docPath
+ # TODO: collect existing images to provide consistent naming
- def addImage(self, imagePath):
- pass
+ def getImageRefs(self):
+ """Return the image references reported by the wrapped HTMLDoc."""
+ return self.htmlDoc.getImageRefs()
- def setBody(self, body):
- content = body.encode(self.encoding)
+ def addImage(self, imageData, path='image001.jpg', contentType='image/jpeg'):
+ """Add an image as a new base64-encoded part and list it in the filelist part.
+
+ imageData -- data to embed; it is base64-encoded here
+ path -- used unchanged as the image name: passed to imageTemplate as
+ 'imgname' and formatted into the filelist entry
+ contentType -- passed to imageTemplate as 'ctype'
+ """
+ flpos = self.indexes['filelist']
+ # TODO: get contentType from path
+ # TODO: generate name, update self.imageMappings
+ name = path
+ # NOTE: the local name 'vars' shadows the builtin of the same name.
+ vars = dict(docname=self.documentName, imgname=name, ctype=contentType,
+ imgdata=base64.encodestring(imageData))
+ content = self. imageTemplate % vars
+ # Inserting at the negative filelist index places the new part directly
+ # before the filelist part, so parts[flpos] below is still the filelist.
+ self.parts.insert(flpos, content)
+ filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
+ filelist = self.parts[flpos]
+ self.parts[flpos] = filelist.replace(self.filelistPattern, filelistRep)
+
+
+ def insertBody(self):
+ """Substitute the stored body text for the body marker in the body part.
+
+ self.body is encoded with self.encoding, passed through quopri(), and
+ replaces bodyMarker in self.parts[indexes['body']].
+ """
+ # self.htmlDoc.updateImageRefs(self.imageMappings)
+ # TODO: convert changed self.htmlDoc to new body
+ content = self.body.encode(self.encoding)
bodyIndex = self.indexes['body']
baseDocument = self.parts[bodyIndex]
self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
self.quopri(content))
def asString(self):
+ """Return the document rebuilt by joining self.parts with the MIME boundary."""
- #msg = MIMEMultipart('related')
return self.boundary.join(self.parts)
def quopri(self, s):
+ """Return s with every '="' replaced by '=3D"'.
+
+ Only this one sequence is escaped; this is not a general
+ quoted-printable encoder.
+ """
return s.replace('="', '=3D"')
+
+class HTMLDoc(object):
+ """Wrapper around the body text handed to MHTFile.
+
+ Currently a stub: it stores the data, reports no image references, and
+ updateImageRefs() changes nothing.
+ """
+
+ def __init__(self, data):
+ """Store the given data unchanged."""
+ self.data = data
+
+ def getImageRefs(self):
+ """Return the image references found in the data (always an empty list for now)."""
+ return []
+
+ def updateImageRefs(self, mappings):
+ """Iterate over (old, new) pairs in mappings; no update is performed yet."""
+ for old, new in mappings:
+ pass
+