"""
Working with MHT Files.
"""

import base64
import email
import os

# ``base64.encodestring`` is the Python 2 spelling and was removed in
# Python 3.9; prefer ``encodebytes`` when the interpreter provides it.
_b64encode = getattr(base64, 'encodebytes', None) or base64.encodestring


class MHTFile(object):
    """An MHT (MIME HTML) document used as a template for generated output.

    The raw message text is split on the MIME boundary string and the
    resulting chunks are edited in place: ``addImage`` inserts a new
    base64-encoded part, ``insertBody`` substitutes the HTML body for the
    ``bodyMarker`` placeholder, and ``asString`` joins the chunks again.

    Typical call sequence::

        document = MHTFile(data, body)
        document.addImage(imageData, 'image001.jpg')
        document.insertBody()
        output = document.asString()
    """

    # NOTE(review): ``encoding`` is referenced by the code but its definition
    # was not visible when this block was written; 'UTF-8' is an assumption --
    # confirm against the real class attribute.
    encoding = 'UTF-8'
    # Placeholder text inside the template document that insertBody replaces.
    bodyMarker = 'lxdoc_body'
    # Positions inside ``self.parts``.  'filelist' must stay negative:
    # addImage inserts at that index and then expects the same index to
    # still address the file-list part.
    indexes = dict(body=2, filelist=-2)
    # The trailing '--' is required: the message is split on the bare
    # boundary string, so each interior chunk has to end with the '--'
    # prefix of the delimiter line that follows it.
    imageTemplate = ('\n'
        'Content-Location: file:///C:/AF2749EC/%(docname)s-Dateien/%(imgname)s\n'
        'Content-Transfer-Encoding: base64\n'
        'Content-Type: %(ctype)s\n\n%(imgdata)s\n\n--')

    # NOTE(review): both literals below look truncated (markup appears to
    # have been stripped from them): the item template has no '%s'
    # placeholder and the pattern is empty.  Restore the real values;
    # addImage skips the file-list update while the pattern is empty.
    filelistItemTemplate = ' \n'
    filelistPattern = ''

    def __init__(self, data, body):
        """Parse the template message ``data`` and remember ``body``.

        ``data`` is the complete MHT message as a string; ``body`` is the
        HTML fragment that insertBody will put in place of ``bodyMarker``.

        Raises ValueError if ``data`` has no MIME boundary, because every
        other operation relies on splitting and joining on it.
        """
        self.data = data
        self.msg = email.message_from_string(data)
        self.boundary = self.msg.get_boundary()
        if self.boundary is None:
            raise ValueError('data is not a multipart MIME message')
        self.parts = data.split(self.boundary)
        self.body = body
        self.htmlDoc = HTMLDoc(body)
        self.lastImageNum = 0
        self.imageMappings = []
        # Stays empty when the message has no sub-part at all.
        self.documentName = ''
        for idx, part in enumerate(self.msg.walk()):
            # Index 0 is the multipart container itself, index 1 is the
            # first sub-part; its Content-Location is taken as the name.
            # NOTE(review): this is the full location, not a bare document
            # name, so image locations built from it look doubled -- confirm
            # the intended value.
            if idx == 1:
                self.documentName = part['Content-Location']
                break
        # TODO: collect existing images to provide consistent naming

    def getImageRefs(self):
        """Return the image references reported by the HTML body document."""
        return self.htmlDoc.getImageRefs()

    def addImage(self, imageData, path='image001.jpg', contentType='image/jpeg'):
        """Insert ``imageData`` as a base64 part in front of the file list.

        ``imageData`` is the raw image content (bytes; text is encoded with
        ``self.encoding`` first), ``path`` is used as the image name and
        ``contentType`` as the part's Content-Type.
        """
        # TODO: get contentType from path
        # TODO: generate name, update self.imageMappings
        name = path
        if not isinstance(imageData, bytes):
            imageData = imageData.encode(self.encoding)
        encoded = _b64encode(imageData)
        if not isinstance(encoded, str):
            # Python 3 returns bytes; the template is text.
            encoded = encoded.decode('ascii')
        values = dict(docname=self.documentName, imgname=name,
                      ctype=contentType, imgdata=encoded)
        content = self.imageTemplate % values
        flpos = self.indexes['filelist']
        self.parts.insert(flpos, content)
        # With a negative index the file-list part is still at ``flpos``
        # after the insert (the new part landed just before it).
        if self.filelistPattern:
            # An empty pattern would make str.replace interleave the new
            # entry between every character, so the update is skipped.
            filelistRep = ((self.filelistItemTemplate % name)
                           + self.filelistPattern)
            filelist = self.parts[flpos]
            self.parts[flpos] = filelist.replace(self.filelistPattern,
                                                 filelistRep)

    def insertBody(self):
        """Replace ``bodyMarker`` in the body part with the escaped body."""
        # self.htmlDoc.updateImageRefs(self.imageMappings)
        # TODO: convert changed self.htmlDoc to new body
        bodyIndex = self.indexes['body']
        baseDocument = self.parts[bodyIndex]
        content = self.body
        # Only a byte-string document needs an encoded body (Python 2 with a
        # unicode body); text into text is substituted unchanged.
        if isinstance(baseDocument, bytes) and not isinstance(content, bytes):
            content = content.encode(self.encoding)
        self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
                                                     self.quopri(content))

    def asString(self):
        """Return the document with all parts joined on the boundary."""
        return self.boundary.join(self.parts)

    def quopri(self, s):
        """Return ``s`` with every '="' rewritten as '=3D"'.

        Only this one sequence is escaped; no other quoted-printable
        encoding is applied.
        """
        return s.replace('="', '=3D"')


class HTMLDoc(object):
    """Wrapper around the HTML body; image handling is not implemented yet."""

    def __init__(self, data):
        """Store the HTML text ``data``."""
        self.data = data

    def getImageRefs(self):
        """Return the image references in the document (currently none)."""
        return []

    def updateImageRefs(self, mappings):
        """Placeholder: walk the (old, new) pairs without changing anything."""
        for old, new in mappings:
            pass