"""
Working with MHT Files.
"""

from email import message_from_string
from quopri import encodestring as _qp_encodestring


class MHTFile(object):
    """A template MHT (MIME HTML) document whose body placeholder can be filled.

    The raw document text is split at the MIME boundary token; individual
    pieces are then patched in place and re-joined by ``asString()``.

    Typical use::

        document = MHTFile(data)
        document.addImage(imagePath)
        document.setBody(body)
        output = document.asString()
    """

    # Character set used to encode the body before quoted-printable escaping.
    encoding = 'Windows-1252'
    # Placeholder text in the template that setBody() replaces.
    bodyMarker = 'lxdoc_body'
    # Positions in ``self.parts``. The boundary token also occurs once in the
    # top-level Content-Type header, so piece 0 and 1 are header/preamble and
    # piece 2 is the first MIME part. ``filelist`` is not used by the code
    # here -- presumably the part listing attached files; TODO confirm.
    indexes = dict(body=2, filelist=-2)

    def __init__(self, data):
        """Parse ``data`` (the full text of a multipart MHT document).

        Raises:
            ValueError: if ``data`` declares no multipart boundary.
        """
        self.data = data
        self.msg = message_from_string(data)
        self.boundary = self.msg.get_boundary()
        if not self.boundary:
            # Without this check ``data.split(None)`` would silently split on
            # whitespace and asString() would fail much later on None.join().
            raise ValueError('MHT data has no multipart boundary')
        self.parts = data.split(self.boundary)

    def addImage(self, imagePath):
        """Add an image to the document (not implemented yet; a no-op)."""
        pass

    def setBody(self, body):
        """Replace the body placeholder with ``body``.

        ``body`` is encoded with ``self.encoding`` and quoted-printable
        escaped before it is inserted. If the placeholder does not occur in
        the body part, the document is left unchanged.

        Raises:
            UnicodeEncodeError: if ``body`` contains characters that
                ``self.encoding`` cannot represent.
            IndexError: if the document has fewer pieces than
                ``indexes['body']`` requires.
        """
        bodyIndex = self.indexes['body']
        baseDocument = self.parts[bodyIndex]
        self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
                                                     self.quopri(body))

    def asString(self):
        """Return the complete document text, including any changes made."""
        return self.boundary.join(self.parts)

    def quopri(self, s):
        """Return ``s`` in quoted-printable form as a pure-ASCII native string.

        ``s`` may be text (encoded with ``self.encoding`` first) or an
        already encoded byte string. Every ``=`` and every non-ASCII byte is
        escaped as ``=XX``; over-long lines get soft line breaks.
        """
        if not isinstance(s, bytes):
            s = s.encode(self.encoding)
        encoded = _qp_encodestring(s)
        if not isinstance(encoded, str):
            # Python 3: encodestring() returns bytes; the result is ASCII only.
            encoded = encoded.decode('ascii')
        return encoded