document (.mht file) generation with images OK

This commit is contained in:
Helmut Merz 2012-12-19 11:09:39 +01:00
parent 87d323a550
commit c62122dcd9
2 changed files with 42 additions and 20 deletions

View file

@@ -20,12 +20,7 @@ Working with MHT Files
>>> data = f.read() >>> data = f.read()
>>> f.close() >>> f.close()
>>> imagePath = os.path.join(basePath, 'test_image.jpg') >>> xbody = '''<div class="WordSection1">
>>> f = open(imagePath, 'rt')
>>> imageData = f.read()
>>> f.close()
>>> body = '''<div class="WordSection1">
... <v:shape id="Grafik_x0020_2" o:spid="_x0000_i1025" type="#_x0000_t75" ... <v:shape id="Grafik_x0020_2" o:spid="_x0000_i1025" type="#_x0000_t75"
... style="width:320pt;height:240pt;visibility:visible;mso-wrap-style:square"> ... style="width:320pt;height:240pt;visibility:visible;mso-wrap-style:square">
... <v:imagedata src="FB-Besprechungsprotokoll-Dateien/image002.jpg" o:title=""/> ... <v:imagedata src="FB-Besprechungsprotokoll-Dateien/image002.jpg" o:title=""/>
@@ -33,15 +28,27 @@ Working with MHT Files
... </div> ... </div>
... ''' ... '''
>>> body = '''<div class="WordSection1">
... <img src="files/test_image.jpg" />
... </div>
... '''
>>> from cybertools.docgen.mht import MHTFile >>> from cybertools.docgen.mht import MHTFile
>>> document = MHTFile(data, body) >>> document = MHTFile(data, body)
>>> document.addImage(imageData, 'files/test_image.jpg')
>>> imageRefs = document.htmlDoc.getImageRefs()
>>> for path in imageRefs:
... imagePath = os.path.join(basePath, os.path.basename(path))
... f = open(imagePath, 'rt')
... imageData = f.read()
... f.close()
... document.addImage(imageData, path)
>>> document.insertBody() >>> document.insertBody()
>>> output = document.asString() >>> output = document.asString()
>>> len(data), len(output) >>> len(data), len(output)
(294996, 336268) (294996, 336140)
>>> outPath = os.path.join(basePath, 'out_doc.mht') >>> outPath = os.path.join(basePath, 'out_doc.mht')
>>> #f = open(outPath, 'wt') >>> #f = open(outPath, 'wt')

View file

@@ -21,10 +21,14 @@ Working with MHT Files.
""" """
import base64 import base64
from cStringIO import StringIO
import email import email
import Image
import mimetypes import mimetypes
import os import os
from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Tag
class MHTFile(object): class MHTFile(object):
@@ -53,7 +57,7 @@ class MHTFile(object):
self.body = body self.body = body
self.htmlDoc = HTMLDoc(body) self.htmlDoc = HTMLDoc(body)
self.lastImageNum = 0 self.lastImageNum = 0
self.imageMappings = [] self.imageMappings = {}
for idx, part in enumerate(self.msg.walk()): for idx, part in enumerate(self.msg.walk()):
docPath = part['Content-Location'] docPath = part['Content-Location']
contentType = part.get_content_type() contentType = part.get_content_type()
@@ -68,28 +72,30 @@ class MHTFile(object):
def getImageRefs(self): def getImageRefs(self):
return self.htmlDoc.getImageRefs() return self.htmlDoc.getImageRefs()
def addImage(self, imageData, path, contentType='image/jpeg'): def addImage(self, imageData, path):
image = Image.open(StringIO(imageData))
width, height = image.size
contentType, enc = mimetypes.guess_type(path) contentType, enc = mimetypes.guess_type(path)
bp, ext = os.path.splitext(path) bp, ext = os.path.splitext(path)
self.lastImageNum += 1 self.lastImageNum += 1
name = 'image%03i%s' % (self.lastImageNum, ext) name = 'image%03i%s' % (self.lastImageNum, ext)
self.imageMappings.append((path, name)) self.imageMappings[path] = (name, width, height)
flpos = self.indexes['filelist'] flpos = self.indexes['filelist']
vars = dict(path=self.path, docname=self.documentName, vars = dict(path=self.path, docname=self.documentName,
suffix=self.foldernameSuffix, suffix=self.foldernameSuffix,
imgname=name, ctype=contentType, imgname=name, ctype=contentType,
imgdata=base64.encodestring(imageData)) imgdata=base64.encodestring(imageData))
content = self. imageTemplate % vars content = self. imageTemplate % vars
self.parts.insert(flpos, content) self.parts.insert(flpos, str(content))
filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
filelist = self.parts[flpos] filelist = self.parts[flpos]
self.parts[flpos] = filelist.replace(self.filelistPattern, filelistRep) self.parts[flpos] = str(filelist.replace(self.filelistPattern, filelistRep))
def insertBody(self): def insertBody(self):
self.htmlDoc.updateImageRefs(self.imageMappings) path = '-'.join((self.documentName, self.foldernameSuffix))
# TODO: convert changed self.htmlDoc to new body self.htmlDoc.updateImageRefs(self.imageMappings, path)
content = self.body.encode(self.encoding) content = self.htmlDoc.doc.renderContents(self.encoding)
bodyIndex = self.indexes['body'] bodyIndex = self.indexes['body']
baseDocument = self.parts[bodyIndex] baseDocument = self.parts[bodyIndex]
self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker, self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
@@ -106,11 +112,20 @@ class HTMLDoc(object):
def __init__(self, data): def __init__(self, data):
self.data = data self.data = data
self.doc = BeautifulSoup(data)
def getImageRefs(self): def getImageRefs(self):
return [] return [img['src'] for img in self.doc('img')]
def updateImageRefs(self, mappings): def updateImageRefs(self, mappings, path=''):
for old, new in mappings: for img in self.doc('img'):
pass name, width, height = mappings[img['src']]
imgdata = Tag(self.doc, 'v:imagedata')
imgdata['src'] = '/'.join((path, name))
imgdata.isSelfClosing = True
img.append(imgdata)
del img['src']
img['style'] = 'width:%spt;height:%spt' % (width, height)
img.isSelfClosing = False
img.name='v:shape'