Replaced createEmailMime with an overriding version of createResource and moved the collection of the metadata into the new method assembleMetadata.

Added a new module in agent/util which contains a dictionary of the more commonly used codepages, keyed by the numeric identifier used in an Outlook Mail object. Until now, the attachments are only represented by their filenames, as a list named subResources.

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2589 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
scrat 2008-05-10 15:39:25 +00:00
parent ac5013a026
commit 52d6d9348c
3 changed files with 130 additions and 19 deletions

View file

@ -26,6 +26,7 @@ from zope.interface import implements
from cybertools.agent.base.agent import Agent, Master from cybertools.agent.base.agent import Agent, Master
from cybertools.agent.crawl.base import Resource from cybertools.agent.crawl.base import Resource
from cybertools.agent.crawl.base import Metadata
from cybertools.agent.crawl.base import Crawler from cybertools.agent.crawl.base import Crawler
from cybertools.agent.components import agents from cybertools.agent.components import agents
from twisted.internet.defer import succeed from twisted.internet.defer import succeed
@ -51,10 +52,14 @@ class MailCrawler(Crawler):
def loadMailsFromFolder(self, folder): def loadMailsFromFolder(self, folder):
pass pass
def createResource(self, mail, path=None, application=None, metadata=None): def createResource(self, data, path=None, application=None, metadata=None):
resource = MailResource(mail, path=path, application=application, resource = MailResource(data=data, path=path, application=application,
metadata=metadata) metadata=metadata)
self.result.append(resource) self.result.append(resource)
def createMetadata(self, metadata):
metadata = Metadata(metadata)
return metadata
def login(self): def login(self):
pass pass

View file

@ -40,6 +40,7 @@ from cybertools.agent.crawl.mail import MailResource
from cybertools.agent.components import agents from cybertools.agent.components import agents
from cybertools.agent.system.windows import api from cybertools.agent.system.windows import api
from cybertools.agent.util.task import coiterate from cybertools.agent.util.task import coiterate
from cybertools.agent.util.codepages import codepages
# some constants # some constants
COMMASPACE = ', ' COMMASPACE = ', '
@ -127,10 +128,9 @@ class OutlookCrawler(MailCrawler):
record[key] = "Invalid data format" record[key] = "Invalid data format"
except: except:
record[key] = "Requested attribute not available" record[key] = "Requested attribute not available"
# Create the mime email object metadata = self.assembleMetadata(record)
msg = self.createEmailMime(record)
# Create a resource and append it to the result list # Create a resource and append it to the result list
self.createResource(msg, application='outlook') self.createResource(mail, folder, metadata)
yield None yield None
def login(self): def login(self):
@ -177,23 +177,71 @@ class OutlookCrawler(MailCrawler):
pass pass
return outlookFound return outlookFound
def createEmailMime(self, emails): def assembleMetadata(self, mailAttr):
# Create the container (outer) email message. meta = {}
msg = MIMEMultipart.MIMEMultipart() for key in mailAttr.keys():
for key in emails.keys(): if isinstance(mailAttr[key], (str, unicode))\
if isinstance(emails[key], (str, unicode)): and mailAttr[key] != 'Body' and mailAttr[key] != 'HTMLBody':
msg[key] = emails[key].encode('utf-8') meta[key] = mailAttr[key].encode('utf-8')
elif isinstance(emails[key], (list, tuple, dict)): elif isinstance(mailAttr[key], (list, tuple, dict)):
lst = [] lst = []
for rec in emails[key]: for rec in mailAttr[key]:
lst.append(rec) lst.append(rec)
msg[key] = COMMASPACE.join(lst) meta[key] = COMMASPACE.join(lst)
else: else:
msg[key] = emails[key] meta[key] = mailAttr[key]
if emails.has_key('Body'): metadata = self.createMetadata(meta)
msg.preamble = emails['Body'].encode('utf-8') return metadata
def createResource(self, mail, folder, metadata):
enc = "not available"
textType = "not available"
attachments = []
ident = "EntryID not available"
if (hasattr(mail, 'BodyFormat')):
value = getattr(mail, 'BodyFormat')
if value == 1:
#1: it is a plain text mail, that is maybe decorated with
#some html Tags by Outlook for formatting
#so save it as plain text mail
if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
textType = "text/plain"
else:
mailContent = "Could not retrieve mail body"
textType = "text/plain"
elif value == 2:
#2: it is a HTML mail
if hasattr(mail, 'HTMLBody'):
mailContent = getattr(mail, 'HTMLBody')
textType = "text/html"
else:
mailContent = "Could not retrieve HTMLBody of mail"
textType = "text/html"
else: else:
msg.preamble = "e-Mail body not available" #Could not determine BodyFormat. Try to retrieve plain text
return msg if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
else:
mailContent = "Could not retrieve mail body"
if hasattr(mail, 'InternetCodepage'):
Codepage = getattr(mail, 'InternetCodepage')
if codepages.has_key(Codepage):
enc = codepages[Codepage]
if hasattr(mail, 'EntryID'):
ident = getattr(mail, 'EntryID')
if hasattr(mail, 'Attachments'):
attachedElems = getattr(mail, 'Attachments')
for item in range(1, len(attachedElems)+1):
attachments.append(attachedElems.Item(item).FileName)
resource = MailResource(data=mailContent,\
contentType=textType,\
encoding=enc,\
path=folder,\
application='outlook',\
identifier=ident,\
metadata=metadata,\
subResources=attachments)
self.result.append(resource)
agents.register(OutlookCrawler, Master, name='crawl.outlook') agents.register(OutlookCrawler, Master, name='crawl.outlook')

58
agent/util/codepages.py Normal file
View file

@ -0,0 +1,58 @@
#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Codepages Module
"""
# Maps numeric Windows codepage identifiers to charset names.
# The Outlook crawler looks up a mail's ``InternetCodepage`` value here to
# determine the encoding it records for the mail body; identifiers that are
# missing from this table are simply not translated by the caller.
codepages = {
    28596: 'iso-8859-6',
    1256: 'windows-1256',
    28594: 'iso-8859-4',
    1257: 'windows-1257',
    28592: 'iso-8859-2',
    1250: 'windows-1250',
    936: 'gb2312',
    52936: 'hz-gb-2312',
    950: 'big5',
    28595: 'iso-8859-5',
    20866: 'koi8-r',
    21866: 'koi8-u',
    1251: 'windows-1251',
    28597: 'iso-8859-7',
    1253: 'windows-1253',
    38598: 'iso-8859-8-i',
    1255: 'windows-1255',
    51932: 'euc-jp',
    50220: 'iso-2022-jp',
    50221: 'csISO2022JP',
    # Codepage 932 is Shift-JIS; ISO-2022-JP is codepage 50220 (above).
    932: 'shift_jis',
    949: 'ks_c_5601-1987',
    51949: 'euc-kr',
    28593: 'iso-8859-3',
    28605: 'iso-8859-15',
    874: 'windows-874',
    28599: 'iso-8859-9',
    1254: 'windows-1254',
    65000: 'utf-7',
    65001: 'utf-8',
    20127: 'us-ascii',
    1258: 'windows-1258',
    28591: 'iso-8859-1',
    1252: 'Windows-1252',
}