Replaced createEmailMime with an overridden version of createResource and also moved the collection of the metadata to the method assembleMetadata.
Added a new module in agent/util which contains a dictionary with all more or less often used codepages and their representation in an Outlook Mail object. Until now, the attachments are only represented by their filenames as a list named subResources. git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2589 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
ac5013a026
commit
52d6d9348c
3 changed files with 130 additions and 19 deletions
|
@ -26,6 +26,7 @@ from zope.interface import implements
|
|||
|
||||
from cybertools.agent.base.agent import Agent, Master
|
||||
from cybertools.agent.crawl.base import Resource
|
||||
from cybertools.agent.crawl.base import Metadata
|
||||
from cybertools.agent.crawl.base import Crawler
|
||||
from cybertools.agent.components import agents
|
||||
from twisted.internet.defer import succeed
|
||||
|
@ -51,10 +52,14 @@ class MailCrawler(Crawler):
|
|||
def loadMailsFromFolder(self, folder):
|
||||
pass
|
||||
|
||||
def createResource(self, mail, path=None, application=None, metadata=None):
|
||||
resource = MailResource(mail, path=path, application=application,
|
||||
def createResource(self, data, path=None, application=None, metadata=None):
|
||||
resource = MailResource(data=data, path=path, application=application,
|
||||
metadata=metadata)
|
||||
self.result.append(resource)
|
||||
|
||||
def createMetadata(self, metadata):
|
||||
metadata = Metadata(metadata)
|
||||
return metadata
|
||||
|
||||
def login(self):
|
||||
pass
|
||||
|
|
|
@ -40,6 +40,7 @@ from cybertools.agent.crawl.mail import MailResource
|
|||
from cybertools.agent.components import agents
|
||||
from cybertools.agent.system.windows import api
|
||||
from cybertools.agent.util.task import coiterate
|
||||
from cybertools.agent.util.codepages import codepages
|
||||
|
||||
# some constants
|
||||
COMMASPACE = ', '
|
||||
|
@ -127,10 +128,9 @@ class OutlookCrawler(MailCrawler):
|
|||
record[key] = "Invalid data format"
|
||||
except:
|
||||
record[key] = "Requested attribute not available"
|
||||
# Create the mime email object
|
||||
msg = self.createEmailMime(record)
|
||||
metadata = self.assembleMetadata(record)
|
||||
# Create a resource and append it to the result list
|
||||
self.createResource(msg, application='outlook')
|
||||
self.createResource(mail, folder, metadata)
|
||||
yield None
|
||||
|
||||
def login(self):
|
||||
|
@ -177,23 +177,71 @@ class OutlookCrawler(MailCrawler):
|
|||
pass
|
||||
return outlookFound
|
||||
|
||||
def createEmailMime(self, emails):
|
||||
# Create the container (outer) email message.
|
||||
msg = MIMEMultipart.MIMEMultipart()
|
||||
for key in emails.keys():
|
||||
if isinstance(emails[key], (str, unicode)):
|
||||
msg[key] = emails[key].encode('utf-8')
|
||||
elif isinstance(emails[key], (list, tuple, dict)):
|
||||
def assembleMetadata(self, mailAttr):
|
||||
meta = {}
|
||||
for key in mailAttr.keys():
|
||||
if isinstance(mailAttr[key], (str, unicode))\
|
||||
and mailAttr[key] != 'Body' and mailAttr[key] != 'HTMLBody':
|
||||
meta[key] = mailAttr[key].encode('utf-8')
|
||||
elif isinstance(mailAttr[key], (list, tuple, dict)):
|
||||
lst = []
|
||||
for rec in emails[key]:
|
||||
for rec in mailAttr[key]:
|
||||
lst.append(rec)
|
||||
msg[key] = COMMASPACE.join(lst)
|
||||
meta[key] = COMMASPACE.join(lst)
|
||||
else:
|
||||
msg[key] = emails[key]
|
||||
if emails.has_key('Body'):
|
||||
msg.preamble = emails['Body'].encode('utf-8')
|
||||
meta[key] = mailAttr[key]
|
||||
metadata = self.createMetadata(meta)
|
||||
return metadata
|
||||
|
||||
def createResource(self, mail, folder, metadata):
|
||||
enc = "not available"
|
||||
textType = "not available"
|
||||
attachments = []
|
||||
ident = "EntryID not available"
|
||||
if (hasattr(mail, 'BodyFormat')):
|
||||
value = getattr(mail, 'BodyFormat')
|
||||
if value == 1:
|
||||
#1: it is a plain text mail, that is maybe decorated with
|
||||
#some html Tags by Outlook for formatting
|
||||
#so save it as plain text mail
|
||||
if hasattr(mail, 'Body'):
|
||||
mailContent = getattr(mail, 'Body')
|
||||
textType = "text/plain"
|
||||
else:
|
||||
mailContent = "Could not retrieve mail body"
|
||||
textType = "text/plain"
|
||||
elif value == 2:
|
||||
#2: it is a HTML mail
|
||||
if hasattr(mail, 'HTMLBody'):
|
||||
mailContent = getattr(mail, 'HTMLBody')
|
||||
textType = "text/html"
|
||||
else:
|
||||
mailContent = "Could not retrieve HTMLBody of mail"
|
||||
textType = "text/html"
|
||||
else:
|
||||
msg.preamble = "e-Mail body not available"
|
||||
return msg
|
||||
#Could not determine BodyFormat. Try to retrieve plain text
|
||||
if hasattr(mail, 'Body'):
|
||||
mailContent = getattr(mail, 'Body')
|
||||
else:
|
||||
mailContent = "Could not retrieve mail body"
|
||||
if hasattr(mail, 'InternetCodepage'):
|
||||
Codepage = getattr(mail, 'InternetCodepage')
|
||||
if codepages.has_key(Codepage):
|
||||
enc = codepages[Codepage]
|
||||
if hasattr(mail, 'EntryID'):
|
||||
ident = getattr(mail, 'EntryID')
|
||||
if hasattr(mail, 'Attachments'):
|
||||
attachedElems = getattr(mail, 'Attachments')
|
||||
for item in range(1, len(attachedElems)+1):
|
||||
attachments.append(attachedElems.Item(item).FileName)
|
||||
resource = MailResource(data=mailContent,\
|
||||
contentType=textType,\
|
||||
encoding=enc,\
|
||||
path=folder,\
|
||||
application='outlook',\
|
||||
identifier=ident,\
|
||||
metadata=metadata,\
|
||||
subResources=attachments)
|
||||
self.result.append(resource)
|
||||
|
||||
agents.register(OutlookCrawler, Master, name='crawl.outlook')
|
||||
|
|
58
agent/util/codepages.py
Normal file
58
agent/util/codepages.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
#
|
||||
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Codepages Module
|
||||
"""
|
||||
|
||||
codepages = {28596: 'iso-8859-6',\
|
||||
1256: 'windows-1256',\
|
||||
28594: 'iso-8859-4',\
|
||||
1257: 'windows-1257',\
|
||||
28592: 'iso-8859-2',\
|
||||
1250: 'windows-1250',\
|
||||
936: 'gb2312',\
|
||||
52936: 'hz-gb-2312',\
|
||||
950: 'big5',\
|
||||
28595: 'iso-8859-5',\
|
||||
20866: 'koi8-r',\
|
||||
21866: 'koi8-u',\
|
||||
1251: 'windows-1251',\
|
||||
28597: 'iso-8859-7',\
|
||||
1253: 'windows-1253',\
|
||||
38598: 'iso-8859-8-i',\
|
||||
1255: 'windows-1255',\
|
||||
51932: 'euc-jp',\
|
||||
50220: 'iso-2022-jp',\
|
||||
50221: 'csISO2022JP',\
|
||||
932: 'iso-2022-jp',\
|
||||
949: 'ks_c_5601-1987',\
|
||||
51949: 'euc-kr',\
|
||||
28593: 'iso-8859-3',\
|
||||
28605: 'iso-8859-15',\
|
||||
874: 'windows-874',\
|
||||
28599: 'iso-8859-9',\
|
||||
1254: 'windows-1254',\
|
||||
65000: 'utf-7',\
|
||||
65001: 'utf-8',\
|
||||
20127: 'us-ascii',\
|
||||
1258: 'windows-1258',\
|
||||
28591: 'iso-8859-1',\
|
||||
1252: 'Windows-1252'
|
||||
}
|
||||
|
Loading…
Add table
Reference in a new issue