diff --git a/agent/crawl/mail.py b/agent/crawl/mail.py
index b9f0857..90a4436 100644
--- a/agent/crawl/mail.py
+++ b/agent/crawl/mail.py
@@ -26,6 +26,7 @@ from zope.interface import implements
 
 from cybertools.agent.base.agent import Agent, Master
 from cybertools.agent.crawl.base import Resource
+from cybertools.agent.crawl.base import Metadata
 from cybertools.agent.crawl.base import Crawler
 from cybertools.agent.components import agents
 from twisted.internet.defer import succeed
@@ -51,10 +52,15 @@ class MailCrawler(Crawler):
     def loadMailsFromFolder(self, folder):
         pass
 
-    def createResource(self, mail, path=None, application=None, metadata=None):
-        resource = MailResource(mail, path=path, application=application,
+    def createResource(self, data, path=None, application=None, metadata=None):
+        """ Create a MailResource and append it to the result list. """
+        resource = MailResource(data=data, path=path, application=application,
                                 metadata=metadata)
         self.result.append(resource)
+
+    def createMetadata(self, metadata):
+        """ Wrap the given metadata in a Metadata object. """
+        return Metadata(metadata)
 
     def login(self):
         pass
diff --git a/agent/crawl/outlook.py b/agent/crawl/outlook.py
index 878c5fd..0e80188 100644
--- a/agent/crawl/outlook.py
+++ b/agent/crawl/outlook.py
@@ -40,6 +40,7 @@ from cybertools.agent.crawl.mail import MailResource
 from cybertools.agent.components import agents
 from cybertools.agent.system.windows import api
 from cybertools.agent.util.task import coiterate
+from cybertools.agent.util.codepages import codepages
 
 # some constants
 COMMASPACE = ', '
@@ -127,10 +128,9 @@ class OutlookCrawler(MailCrawler):
                         record[key] = "Invalid data format"
                 except:
                     record[key] = "Requested attribute not available"
-            # Create the mime email object
-            msg = self.createEmailMime(record)
+            metadata = self.assembleMetadata(record)
             # Create a resource and append it to the result list
-            self.createResource(msg, application='outlook')
+            self.createResource(mail, folder, metadata)
             yield None
 
     def login(self):
@@ -177,23 +177,80 @@ class OutlookCrawler(MailCrawler):
             pass
         return outlookFound
 
-    def createEmailMime(self, emails):
-        # Create the container (outer) email message.
-        msg = MIMEMultipart.MIMEMultipart()
-        for key in emails.keys():
-            if isinstance(emails[key], (str, unicode)):
-                msg[key] = emails[key].encode('utf-8')
-            elif isinstance(emails[key], (list, tuple, dict)):
-                lst = []
-                for rec in emails[key]:
-                    lst.append(rec)
-                msg[key] = COMMASPACE.join(lst)
-            else:
-                msg[key] = emails[key]
-        if emails.has_key('Body'):
-            msg.preamble = emails['Body'].encode('utf-8')
-        else:
-            msg.preamble = "e-Mail body not available"
-        return msg
+    def assembleMetadata(self, mailAttr):
+        """ Collect the mail attributes in a Metadata object.
+
+            The mail text ('Body', 'HTMLBody') is skipped; createResource()
+            passes it on as the resource's data instead.
+        """
+        meta = {}
+        for key in mailAttr.keys():
+            if key in ('Body', 'HTMLBody'):
+                continue
+            value = mailAttr[key]
+            if isinstance(value, (str, unicode)):
+                meta[key] = value.encode('utf-8')
+            elif isinstance(value, (list, tuple, dict)):
+                meta[key] = COMMASPACE.join(value)
+            else:
+                meta[key] = value
+        return self.createMetadata(meta)
+
+    def createResource(self, mail, folder, metadata):
+        """ Create a MailResource for the given mail and append it to
+            the result list.
+
+            folder is passed on as the resource's path, metadata is
+            handed over unchanged.
+        """
+        enc = "not available"
+        textType = "not available"
+        attachments = []
+        ident = "EntryID not available"
+        value = None
+        if hasattr(mail, 'BodyFormat'):
+            value = getattr(mail, 'BodyFormat')
+        if value == 1:
+            #1: it is a plain text mail, that is maybe decorated with
+            #some html Tags by Outlook for formatting
+            #so save it as plain text mail
+            textType = "text/plain"
+            if hasattr(mail, 'Body'):
+                mailContent = getattr(mail, 'Body')
+            else:
+                mailContent = "Could not retrieve mail body"
+        elif value == 2:
+            #2: it is a HTML mail
+            textType = "text/html"
+            if hasattr(mail, 'HTMLBody'):
+                mailContent = getattr(mail, 'HTMLBody')
+            else:
+                mailContent = "Could not retrieve HTMLBody of mail"
+        else:
+            #BodyFormat is missing or neither 1 nor 2, so mailContent
+            #must still be set here. Try to retrieve plain text
+            if hasattr(mail, 'Body'):
+                mailContent = getattr(mail, 'Body')
+            else:
+                mailContent = "Could not retrieve mail body"
+        if hasattr(mail, 'InternetCodepage'):
+            codepage = getattr(mail, 'InternetCodepage')
+            if codepage in codepages:
+                enc = codepages[codepage]
+        if hasattr(mail, 'EntryID'):
+            ident = getattr(mail, 'EntryID')
+        if hasattr(mail, 'Attachments'):
+            attachedElems = getattr(mail, 'Attachments')
+            #items are addressed with 1-based indexes here
+            for item in range(1, len(attachedElems)+1):
+                attachments.append(attachedElems.Item(item).FileName)
+        resource = MailResource(data=mailContent,
+                                contentType=textType,
+                                encoding=enc,
+                                path=folder,
+                                application='outlook',
+                                identifier=ident,
+                                metadata=metadata,
+                                subResources=attachments)
+        self.result.append(resource)
 
 agents.register(OutlookCrawler, Master, name='crawl.outlook')
diff --git a/agent/util/codepages.py b/agent/util/codepages.py
new file mode 100644
index 0000000..a7aee62
--- /dev/null
+++ b/agent/util/codepages.py
@@ -0,0 +1,59 @@
+#
+#  Copyright (c) 2008 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+"""
+Codepages Module: maps code page numbers to character set names.
+"""
+
+# NOTE(review): Windows code page 932 is Shift-JIS; the 'iso-2022-jp' entry
+# below presumably mirrors Microsoft's InternetCodepage table -- confirm.
+codepages = {28596: 'iso-8859-6',
+             1256: 'windows-1256',
+             28594: 'iso-8859-4',
+             1257: 'windows-1257',
+             28592: 'iso-8859-2',
+             1250: 'windows-1250',
+             936: 'gb2312',
+             52936: 'hz-gb-2312',
+             950: 'big5',
+             28595: 'iso-8859-5',
+             20866: 'koi8-r',
+             21866: 'koi8-u',
+             1251: 'windows-1251',
+             28597: 'iso-8859-7',
+             1253: 'windows-1253',
+             38598: 'iso-8859-8-i',
+             1255: 'windows-1255',
+             51932: 'euc-jp',
+             50220: 'iso-2022-jp',
+             50221: 'csISO2022JP',
+             932: 'iso-2022-jp',
+             949: 'ks_c_5601-1987',
+             51949: 'euc-kr',
+             28593: 'iso-8859-3',
+             28605: 'iso-8859-15',
+             874: 'windows-874',
+             28599: 'iso-8859-9',
+             1254: 'windows-1254',
+             65000: 'utf-7',
+             65001: 'utf-8',
+             20127: 'us-ascii',
+             1258: 'windows-1258',
+             28591: 'iso-8859-1',
+             1252: 'Windows-1252'
+             }