diff --git a/docgen/README.txt b/docgen/README.txt
index f378c85..2e5b281 100644
--- a/docgen/README.txt
+++ b/docgen/README.txt
@@ -21,19 +21,27 @@ Working with MHT Files
>>> f.close()
>>> imagePath = os.path.join(basePath, 'test_image.jpg')
+ >>> f = open(imagePath, 'rt')
+ >>> imageData = f.read()
+ >>> f.close()
- >>> body = '''
+ >>> body = '''
+ ...
+ ...
+ ...
+ ...
... '''
>>> from cybertools.docgen.mht import MHTFile
>>> document = MHTFile(data, body)
- >>> document.addImage(imagePath) # TODO: provide imageData, path
+ >>> document.addImage(imageData, 'files/test_image.jpg')
>>> document.insertBody()
>>> output = document.asString()
>>> len(data), len(output)
- (294996, 295346)
+ (294996, 336268)
>>> outPath = os.path.join(basePath, 'out_doc.mht')
>>> #f = open(outPath, 'wt')
diff --git a/docgen/mht.py b/docgen/mht.py
index cff3e4b..2eb9367 100644
--- a/docgen/mht.py
+++ b/docgen/mht.py
@@ -22,6 +22,7 @@ Working with MHT Files.
import base64
import email
+import mimetypes
import os
@@ -31,15 +32,18 @@ class MHTFile(object):
#encoding = 'ISO8859-15'
encoding = 'Windows-1252'
bodyMarker = 'lxdoc_body'
+ foldernameSuffix = 'Dateien'
indexes = dict(body=2, filelist=-2)
+ path = documentName = None
+
imageTemplate = ('\n'
- 'Content-Location: file:///C:/AF2749EC/%(docname)s-Dateien/$(imgname)s\n'
+ 'Content-Location: %(path)s/%(docname)s-%(suffix)s/%(imgname)s\n'
'Content-Transfer-Encoding: base64\n'
'Content-Type: %(ctype)s\n\n%(imgdata)s\n\n')
filelistItemTemplate = ' \n'
- filelistPattern =''
+ filelistPattern =' '
def __init__(self, data, body):
self.data = data
@@ -50,23 +54,30 @@ class MHTFile(object):
self.htmlDoc = HTMLDoc(body)
self.lastImageNum = 0
self.imageMappings = []
- #print '***', len(self.parts)
for idx, part in enumerate(self.msg.walk()):
- # print '***', idx, , part.get_content_type()
- if idx == 1:
- docPath = part['Content-Location']
- self.documentName = docPath
- # TODO: collect existing images to provide consistent naming
+ docPath = part['Content-Location']
+ contentType = part.get_content_type()
+ #print '***', idx, docPath, contentType
+ if idx == self.indexes['body'] - 1:
+ self.path, docname = os.path.split(docPath)
+ self.documentName, ext = os.path.splitext(docname)
+ if contentType.startswith('image/'):
+ self.lastImageNum += 1
+ #print '###', self.path, self.documentName, self.lastImageNum
def getImageRefs(self):
return self.htmlDoc.getImageRefs()
- def addImage(self, imageData, path='image001.jpg', contentType='image/jpeg'):
+ def addImage(self, imageData, path, contentType='image/jpeg'):
+ contentType, enc = mimetypes.guess_type(path)
+ bp, ext = os.path.splitext(path)
+ self.lastImageNum += 1
+ name = 'image%03i%s' % (self.lastImageNum, ext)
+ self.imageMappings.append((path, name))
flpos = self.indexes['filelist']
- # TODO: get contentType from path
- # TODO: generate name, update self.imageMappings
- name = path
- vars = dict(docname=self.documentName, imgname=name, ctype=contentType,
+ vars = dict(path=self.path, docname=self.documentName,
+ suffix=self.foldernameSuffix,
+ imgname=name, ctype=contentType,
imgdata=base64.encodestring(imageData))
content = self. imageTemplate % vars
self.parts.insert(flpos, content)
@@ -76,7 +87,7 @@ class MHTFile(object):
def insertBody(self):
- # self.htmlDoc.updateImageRefs(self.imageMappings)
+ self.htmlDoc.updateImageRefs(self.imageMappings)
# TODO: convert changed self.htmlDoc to new body
content = self.body.encode(self.encoding)
bodyIndex = self.indexes['body']
diff --git a/docgen/testing/test_image.jpg b/docgen/testing/test_image.jpg
new file mode 100644
index 0000000..9f33480
Binary files /dev/null and b/docgen/testing/test_image.jpg differ