130 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			130 lines
		
	
	
	
		
			4.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
#
#  Copyright (c) 2012 Helmut Merz helmutm@cy55.de
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
 | |
| 
 | |
| """
 | |
| Working with MHT Files.
 | |
| """
 | |
| 
 | |
| import base64
 | |
| from cStringIO import StringIO
 | |
| import email
 | |
| from PIL import Image
 | |
| import mimetypes
 | |
| import os
 | |
| 
 | |
| from cybertools.text.lib.BeautifulSoup import BeautifulSoup, Tag
 | |
| 
 | |
| 
 | |
class MHTFile(object):
    """An MHT document into which an HTML body and images are inserted.

    The raw text is split at the MIME boundary into ``parts``. ``addImage``
    and ``insertBody`` modify that list in place; ``asString`` joins the
    parts with the boundary again to produce the resulting document.
    """

    #encoding = 'UTF-8'
    #encoding = 'ISO8859-15'
    # Encoding passed to renderContents() when the body is rendered.
    encoding = 'Windows-1252'
    # Placeholder in the body part that insertBody() replaces.
    bodyMarker = 'lxdoc_body'
    # Suffix of the folder name used in image locations and references.
    foldernameSuffix = 'Dateien'
    # ``body`` is used both as index into msg.walk() and into ``parts``;
    # ``filelist`` is the index into ``parts`` of the file list part.
    indexes = dict(body=1, filelist=-2)

    path = documentName = None

    # MIME part (without boundary line) for one base64-encoded image.
    imageTemplate = ('\n'
        'Content-Location: %(path)s/%(docname)s-%(suffix)s/%(imgname)s\n'
        'Content-Transfer-Encoding: base64\n'
        'Content-Type: %(ctype)s\n\n%(imgdata)s\n\n')

    # Entry added to the file list for each image; new entries are put
    # directly in front of the text matching ``filelistPattern``.
    filelistItemTemplate = ' <o:File HRef=3D"%s"/>\n'
    filelistPattern =' <o:File HRef=3D"filelist.xml"/>'

    def __init__(self, data, body):
        """Parse the MHT text ``data`` and the HTML text ``body``.

        Sets ``path`` and ``documentName`` from the Content-Location header
        of the body part and counts the image parts already present so that
        new images get consecutive numbers.
        """
        self.data = data
        self.msg = email.message_from_string(data)
        self.boundary = '--' + self.msg.get_boundary()
        self.parts = data.split(self.boundary)
        self.body = body
        self.htmlDoc = HTMLDoc(body)
        self.lastImageNum = 0
        self.imageMappings = {}
        bodyIndex = self.indexes['body']
        for idx, part in enumerate(self.msg.walk()):
            if idx == bodyIndex:
                location = part['Content-Location']
                self.path, docname = os.path.split(location)
                self.documentName = os.path.splitext(docname)[0]
            if part.get_content_type().startswith('image/'):
                self.lastImageNum += 1

    def getImageRefs(self):
        """Return the image references found in the HTML body."""
        return self.htmlDoc.getImageRefs()

    def addImage(self, imageData, path):
        """Add the image ``imageData`` as a new part of the document.

        ``path`` is the reference used for the image in the HTML body; its
        extension determines the content type and the extension of the
        generated name ('imageNNN<ext>'). The generated name and the image
        size are recorded in ``imageMappings`` under ``path`` and a
        corresponding entry is added to the file list part.
        """
        image = Image.open(StringIO(imageData))
        width, height = image.size
        contentType, enc = mimetypes.guess_type(path)
        if contentType is None:
            # Unknown extension: emit a valid generic type instead of
            # the literal text 'None' in the Content-Type header.
            contentType = 'application/octet-stream'
        bp, ext = os.path.splitext(path)
        self.lastImageNum += 1
        name = 'image%03i%s' % (self.lastImageNum, ext)
        self.imageMappings[path] = (name, width, height)
        flpos = self.indexes['filelist']
        values = dict(path=self.path, docname=self.documentName,
                      suffix=self.foldernameSuffix,
                      imgname=name, ctype=contentType,
                      imgdata=base64.encodestring(imageData))
        content = self.imageTemplate % values
        # With a negative index the new part is inserted in front of the
        # file list part, which therefore stays at position ``flpos``.
        self.parts.insert(flpos, str(content))
        filelistRep = (self.filelistItemTemplate % name) + self.filelistPattern
        filelist = self.parts[flpos]
        self.parts[flpos] = str(filelist.replace(self.filelistPattern,
                                                 filelistRep))

    def insertBody(self):
        """Replace the body marker in the body part with the rendered HTML.

        The image references of the HTML document are updated from
        ``imageMappings`` before rendering.
        """
        path = '-'.join((self.documentName, self.foldernameSuffix))
        self.htmlDoc.updateImageRefs(self.imageMappings, path)
        content = self.htmlDoc.doc.renderContents(self.encoding)
        bodyIndex = self.indexes['body']
        baseDocument = self.parts[bodyIndex]
        self.parts[bodyIndex] = baseDocument.replace(self.bodyMarker,
                                                     self.quopri(content))

    def asString(self):
        """Return the document text: all parts joined with the boundary."""
        return self.boundary.join(self.parts)

    def quopri(self, s):
        """Return ``s`` with every '="' replaced by '=3D"'.

        Only this one sequence is escaped; no other characters are
        encoded and no line wrapping is applied.
        """
        return s.replace('="', '=3D"')
 | |
| 
 | |
| 
 | |
class HTMLDoc(object):
    """An HTML text parsed with BeautifulSoup, giving access to its images."""

    def __init__(self, data):
        """Keep the HTML text ``data`` and parse it into ``doc``."""
        self.data = data
        self.doc = BeautifulSoup(data)

    def getImageRefs(self):
        """Return the ``src`` values of all ``img`` tags, in document order.

        Tags without a ``src`` attribute are skipped.
        """
        return [img['src'] for img in self.doc('img')
                if img.get('src') is not None]

    def updateImageRefs(self, mappings, path=''):
        """Turn each ``img`` tag into a ``v:shape`` with a ``v:imagedata`` child.

        ``mappings`` maps the original ``src`` value to a tuple
        ``(name, width, height)``; the new reference is ``path/name``.
        Tags without a ``src`` attribute are left unchanged. A ``src``
        that is missing from ``mappings`` raises KeyError.
        """
        for img in self.doc('img'):
            src = img.get('src')
            if src is None:
                # Nothing to look up; consistent with getImageRefs().
                continue
            name, width, height = mappings[src]
            imgdata = Tag(self.doc, 'v:imagedata')
            imgdata['src'] = '/'.join((path, name))
            imgdata.isSelfClosing = True
            img.append(imgdata)
            del img['src']
            # NOTE(review): width/height are written with unit 'pt' exactly
            # as given in mappings -- confirm the unit matches the caller.
            img['style'] = 'width:%spt;height:%spt' % (width, height)
            # The tag now has a child, so it must not be self-closing.
            img.isSelfClosing = False
            img.name = 'v:shape'
 | |
| 
 |