diff --git a/docgen/README.txt b/docgen/README.txt
index 2e5b281..8eb768a 100644
--- a/docgen/README.txt
+++ b/docgen/README.txt
@@ -20,12 +20,7 @@ Working with MHT Files
>>> data = f.read()
>>> f.close()
- >>> imagePath = os.path.join(basePath, 'test_image.jpg')
- >>> f = open(imagePath, 'rt')
- >>> imageData = f.read()
- >>> f.close()
-
- >>> body = '''
+ >>> xbody = '''
...
...
@@ -33,15 +28,27 @@ Working with MHT Files
...
... '''
+ >>> body = '''
+ ...

+ ...
+ ... '''
+
>>> from cybertools.docgen.mht import MHTFile
>>> document = MHTFile(data, body)
- >>> document.addImage(imageData, 'files/test_image.jpg')
+
+ >>> imageRefs = document.htmlDoc.getImageRefs()
+ >>> for path in imageRefs:
+ ... imagePath = os.path.join(basePath, os.path.basename(path))
+ ... f = open(imagePath, 'rb')  # binary mode: image bytes must be read untranslated
+ ... imageData = f.read()
+ ... f.close()
+ ... document.addImage(imageData, path)
>>> document.insertBody()
>>> output = document.asString()
>>> len(data), len(output)
- (294996, 336268)
+ (294996, 336140)
>>> outPath = os.path.join(basePath, 'out_doc.mht')
>>> #f = open(outPath, 'wt')
diff --git a/docgen/mht.py b/docgen/mht.py
index 2eb9367..1242973 100644
--- a/docgen/mht.py
+++ b/docgen/mht.py
@@ -21,10 +21,14 @@ Working with MHT Files.
"""
import base64
+from cStringIO import StringIO
import email
+import Image
import mimetypes
import os
+from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Tag
+
class MHTFile(object):
@@ -53,7 +57,7 @@ class MHTFile(object):
self.body = body
self.htmlDoc = HTMLDoc(body)
self.lastImageNum = 0
- self.imageMappings = []
+ self.imageMappings = {}
for idx, part in enumerate(self.msg.walk()):
docPath = part['Content-Location']
contentType = part.get_content_type()
@@ -68,28 +72,30 @@ class MHTFile(object):
def getImageRefs(self):
return self.htmlDoc.getImageRefs()
- def addImage(self, imageData, path, contentType='image/jpeg'):
+ def addImage(self, imageData, path):
+ image = Image.open(StringIO(imageData))
+ width, height = image.size
contentType, enc = mimetypes.guess_type(path)
bp, ext = os.path.splitext(path)
self.lastImageNum += 1
name = 'image%03i%s' % (self.lastImageNum, ext)
- self.imageMappings.append((path, name))
+ self.imageMappings[path] = (name, width, height)
flpos = self.indexes['filelist']
vars = dict(path=self.path, docname=self.documentName,
suffix=self.foldernameSuffix,
imgname=name, ctype=contentType,
imgdata=base64.encodestring(imageData))
content = self. imageTemplate % vars
- self.parts.insert(flpos, content)
+ self.parts.insert(flpos, str(content))
filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
filelist = self.parts[flpos]
- self.parts[flpos] = filelist.replace(self.filelistPattern, filelistRep)
+ self.parts[flpos] = str(filelist.replace(self.filelistPattern, filelistRep))
def insertBody(self):
- self.htmlDoc.updateImageRefs(self.imageMappings)
- # TODO: convert changed self.htmlDoc to new body
- content = self.body.encode(self.encoding)
+ path = '-'.join((self.documentName, self.foldernameSuffix))
+ self.htmlDoc.updateImageRefs(self.imageMappings, path)
+ content = self.htmlDoc.doc.renderContents(self.encoding)
bodyIndex = self.indexes['body']
baseDocument = self.parts[bodyIndex]
self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
@@ -106,11 +112,20 @@ class HTMLDoc(object):
def __init__(self, data):
self.data = data
+ self.doc = BeautifulSoup(data)
def getImageRefs(self):
- return []
+ return [img['src'] for img in self.doc('img')]
- def updateImageRefs(self, mappings):
- for old, new in mappings:
- pass
+ def updateImageRefs(self, mappings, path=''):
+ for img in self.doc('img'):
+ name, width, height = mappings[img['src']]
+ imgdata = Tag(self.doc, 'v:imagedata')
+ imgdata['src'] = '/'.join((path, name))
+ imgdata.isSelfClosing = True
+ img.append(imgdata)
+ del img['src']
+ img['style'] = 'width:%spt;height:%spt' % (width, height)
+ img.isSelfClosing = False
+ img.name='v:shape'