Replaced createEmailMime with an overriding version of createResource, and moved the collection of the metadata into the new method assembleMetadata.

Added a new module in agent/util (codepages.py) which contains a dictionary of the more or less commonly used codepages and their representation in an Outlook mail object. For now, the attachments are only represented by their filenames, as a list named subResources.

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2589 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
scrat 2008-05-10 15:39:25 +00:00
parent ac5013a026
commit 52d6d9348c
3 changed files with 130 additions and 19 deletions

View file

@ -26,6 +26,7 @@ from zope.interface import implements
from cybertools.agent.base.agent import Agent, Master
from cybertools.agent.crawl.base import Resource
from cybertools.agent.crawl.base import Metadata
from cybertools.agent.crawl.base import Crawler
from cybertools.agent.components import agents
from twisted.internet.defer import succeed
@ -51,10 +52,14 @@ class MailCrawler(Crawler):
def loadMailsFromFolder(self, folder):
pass
def createResource(self, mail, path=None, application=None, metadata=None):
resource = MailResource(mail, path=path, application=application,
def createResource(self, data, path=None, application=None, metadata=None):
resource = MailResource(data=data, path=path, application=application,
metadata=metadata)
self.result.append(resource)
def createMetadata(self, metadata):
metadata = Metadata(metadata)
return metadata
def login(self):
pass

View file

@ -40,6 +40,7 @@ from cybertools.agent.crawl.mail import MailResource
from cybertools.agent.components import agents
from cybertools.agent.system.windows import api
from cybertools.agent.util.task import coiterate
from cybertools.agent.util.codepages import codepages
# some constants
COMMASPACE = ', '
@ -127,10 +128,9 @@ class OutlookCrawler(MailCrawler):
record[key] = "Invalid data format"
except:
record[key] = "Requested attribute not available"
# Create the mime email object
msg = self.createEmailMime(record)
metadata = self.assembleMetadata(record)
# Create a resource and append it to the result list
self.createResource(msg, application='outlook')
self.createResource(mail, folder, metadata)
yield None
def login(self):
@ -177,23 +177,71 @@ class OutlookCrawler(MailCrawler):
pass
return outlookFound
def createEmailMime(self, emails):
# Create the container (outer) email message.
msg = MIMEMultipart.MIMEMultipart()
for key in emails.keys():
if isinstance(emails[key], (str, unicode)):
msg[key] = emails[key].encode('utf-8')
elif isinstance(emails[key], (list, tuple, dict)):
def assembleMetadata(self, mailAttr):
meta = {}
for key in mailAttr.keys():
if isinstance(mailAttr[key], (str, unicode))\
and mailAttr[key] != 'Body' and mailAttr[key] != 'HTMLBody':
meta[key] = mailAttr[key].encode('utf-8')
elif isinstance(mailAttr[key], (list, tuple, dict)):
lst = []
for rec in emails[key]:
for rec in mailAttr[key]:
lst.append(rec)
msg[key] = COMMASPACE.join(lst)
meta[key] = COMMASPACE.join(lst)
else:
msg[key] = emails[key]
if emails.has_key('Body'):
msg.preamble = emails['Body'].encode('utf-8')
meta[key] = mailAttr[key]
metadata = self.createMetadata(meta)
return metadata
def createResource(self, mail, folder, metadata):
enc = "not available"
textType = "not available"
attachments = []
ident = "EntryID not available"
if (hasattr(mail, 'BodyFormat')):
value = getattr(mail, 'BodyFormat')
if value == 1:
#1: it is a plain text mail, that is maybe decorated with
#some html Tags by Outlook for formatting
#so save it as plain text mail
if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
textType = "text/plain"
else:
mailContent = "Could not retrieve mail body"
textType = "text/plain"
elif value == 2:
#2: it is a HTML mail
if hasattr(mail, 'HTMLBody'):
mailContent = getattr(mail, 'HTMLBody')
textType = "text/html"
else:
mailContent = "Could not retrieve HTMLBody of mail"
textType = "text/html"
else:
msg.preamble = "e-Mail body not available"
return msg
#Could not determine BodyFormat. Try to retrieve plain text
if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
else:
mailContent = "Could not retrieve mail body"
if hasattr(mail, 'InternetCodepage'):
Codepage = getattr(mail, 'InternetCodepage')
if codepages.has_key(Codepage):
enc = codepages[Codepage]
if hasattr(mail, 'EntryID'):
ident = getattr(mail, 'EntryID')
if hasattr(mail, 'Attachments'):
attachedElems = getattr(mail, 'Attachments')
for item in range(1, len(attachedElems)+1):
attachments.append(attachedElems.Item(item).FileName)
resource = MailResource(data=mailContent,\
contentType=textType,\
encoding=enc,\
path=folder,\
application='outlook',\
identifier=ident,\
metadata=metadata,\
subResources=attachments)
self.result.append(resource)
agents.register(OutlookCrawler, Master, name='crawl.outlook')

58
agent/util/codepages.py Normal file
View file

@ -0,0 +1,58 @@
#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Codepages Module
"""
# Lookup table: integer code page number -> character set name.
# The Outlook crawler looks up the value of a mail item's
# ``InternetCodepage`` attribute here to fill in the resource encoding.
# NOTE(review): 932 is the Windows Shift-JIS code page but is mapped to
# 'iso-2022-jp' below -- presumably taken over from Outlook's own table;
# confirm before relying on these names for decoding.
codepages = {
    28596: 'iso-8859-6',
    1256: 'windows-1256',
    28594: 'iso-8859-4',
    1257: 'windows-1257',
    28592: 'iso-8859-2',
    1250: 'windows-1250',
    936: 'gb2312',
    52936: 'hz-gb-2312',
    950: 'big5',
    28595: 'iso-8859-5',
    20866: 'koi8-r',
    21866: 'koi8-u',
    1251: 'windows-1251',
    28597: 'iso-8859-7',
    1253: 'windows-1253',
    38598: 'iso-8859-8-i',
    1255: 'windows-1255',
    51932: 'euc-jp',
    50220: 'iso-2022-jp',
    50221: 'csISO2022JP',
    932: 'iso-2022-jp',
    949: 'ks_c_5601-1987',
    51949: 'euc-kr',
    28593: 'iso-8859-3',
    28605: 'iso-8859-15',
    874: 'windows-874',
    28599: 'iso-8859-9',
    1254: 'windows-1254',
    65000: 'utf-7',
    65001: 'utf-8',
    20127: 'us-ascii',
    1258: 'windows-1258',
    28591: 'iso-8859-1',
    1252: 'Windows-1252',
}