cybertools/agent/crawl/outlook.py
scrat c367f55637 added dummy sftp.py for testing purposes
remote.py: now the file on the remote server gets the basename of the local file
outlook.py: added utf-8 encoding for the mail content to avoid problems with special characters

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2759 fd906abe-77d9-0310-91a1-e0d9ade77398
2008-07-13 09:06:28 +00:00

231 lines
9 KiB
Python

#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Outlook Crawler Class.
$Id$
"""
import re
from email import MIMEMultipart
import tempfile
import os
from zope.interface import implements
from twisted.internet import defer
#from pywintypes import com_error
#The watsup import is needed as soon as we start handling the Outlook Pop-Up
#again
#This should also be integrated within the wrapper-api for doctests
#from watsup.winGuiAuto import findTopWindow, findControl, findControls, clickButton, \
# getComboboxItems, selectComboboxItem, setCheckBox
from cybertools.agent.base.agent import Agent, Master
from cybertools.agent.crawl.mail import MailCrawler
from cybertools.agent.crawl.mail import MailResource
from cybertools.agent.crawl.filesystem import FileResource
from cybertools.agent.components import agents
from cybertools.agent.system.windows import api
from cybertools.agent.util.task import coiterate
from cybertools.agent.system.windows.codepages import codepages
# some constants
COMMASPACE = ', '
class OutlookCrawler(MailCrawler):
keys = ""
inbox = ""
subfolders = ""
pattern = ""
def collect(self, filter=None):
self.result = []
self.d = defer.Deferred()
self.oOutlookApp = None
if self.findOutlook():
self.fetchCriteria()
coiterate(self.crawlFolders()).addCallback(self.finished).addErrback(self.error)
else:
pass
#self.d.addErrback([])
return self.d
def error(self, reason):
print '***** error',
print reason
def finished(self, result):
self.d.callback(self.result)
def fetchCriteria(self):
criteria = self.params
self.keys = criteria.get('keys')
self.inbox = criteria.get('inbox') #boolean
self.subfolders = criteria.get('subfolders') #boolean
self.pattern = criteria.get('pattern')
if self.pattern != '' and self.pattern != None:
self.pattern = re.compile(criteria.get('pattern') or '.*')
def crawlFolders(self):
onMAPI = self.oOutlookApp.GetNamespace("MAPI")
ofInbox = \
onMAPI.GetDefaultFolder(api.client.constants.olFolderInbox)
# fetch mails from inbox
if self.inbox:
for m in self.loadMailsFromFolder(ofInbox):
yield None
# fetch mails of inbox subfolders
if self.subfolders and self.pattern is None:
lInboxSubfolders = getattr(ofInbox, 'Folders')
for of in range(lInboxSubfolders.__len__()):
# get a MAPI-subfolder object and load its emails
for m in self.loadMailsFromFolder(lInboxSubfolders.Item(of + 1)):
yield None
elif self.subfolders and self.pattern:
lInboxSubfolders = getattr(ofInbox, 'Folders')
for of in range(lInboxSubfolders.__len__()):
# get specified MAPI-subfolder object and load its emails
if self.pattern.match(getattr(lInboxSubfolders.Item(of + 1), 'Name')):
for m in self.loadMailsFromFolder(lInboxSubfolders.Item(of + 1)):
yield None
def loadMailsFromFolder(self, folder):
# get items of the folder
folderItems = getattr(folder, 'Items')
for item in range(len(folderItems)):
mail = folderItems.Item(item+1)
if mail.Class == api.client.constants.olMail:
if self.keys is None:
self.keys = []
for key in mail._prop_map_get_.items():
try:
if isinstance(key[0], (int, str, unicode, bool)):
self.keys.append(key[0])
except api.com_error:
pass
record = {}
for key in self.keys:
try:
if (hasattr(mail, key)):
value = getattr(mail, key)
if isinstance(value, (int, str, unicode, bool)):
record[key] = value
else:
record[key] = None
except:
pass
metadata = self.assembleMetadata(folder, record)
# Create a resource and append it to the result list
self.createResource(mail, folder, metadata)
yield None
def findOutlook(self):
outlookFound = False
try:
self.oOutlookApp = \
api.client.gencache.EnsureDispatch("Outlook.Application")
outlookFound = True
except com_error:
pass
return outlookFound
def assembleMetadata(self, folder, mailAttr):
meta = {}
for key in mailAttr.keys():
if isinstance(mailAttr[key], (str, unicode))\
and mailAttr[key] != 'Body' and mailAttr[key] != 'HTMLBody':
meta[key] = mailAttr[key].encode('utf-8')
elif isinstance(mailAttr[key], (list, tuple, dict)):
lst = []
for rec in mailAttr[key]:
lst.append(rec)
meta[key] = COMMASPACE.join(lst)
else:
meta[key] = mailAttr[key]
meta["path"] = folder
metadata = self.createMetadata(meta)
return metadata
def createResource(self, mail, folder, metadata):
enc = None
textType = "application/octet-stream"
attachments = []
mailContent = ""
ident = None
if (hasattr(mail, 'BodyFormat')):
value = getattr(mail, 'BodyFormat')
if value == 1:
#1: it is a plain text mail, that is maybe decorated with
#some html Tags by Outlook for formatting
#so save it as plain text mail
if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
textType = "text/plain"
else:
mailContent = ""
textType = "text/plain"
elif value == 2:
#2: it is a HTML mail
if hasattr(mail, 'HTMLBody'):
mailContent = getattr(mail, 'HTMLBody')
textType = "text/html"
else:
mailContent = ""
textType = "text/html"
else:
#Could not determine BodyFormat. Try to retrieve plain text
if hasattr(mail, 'Body'):
mailContent = getattr(mail, 'Body')
else:
mailContent = ""
if hasattr(mail, 'InternetCodepage'):
Codepage = getattr(mail, 'InternetCodepage')
if codepages.has_key(Codepage):
enc = codepages[Codepage]
if hasattr(mail, 'EntryID'):
ident = getattr(mail, 'EntryID')
if hasattr(mail, 'Attachments'):
attachedElems = getattr(mail, 'Attachments')
for item in range(1, len(attachedElems)+1):
fileHandle, filePath = tempfile.mkstemp(prefix="outlook")
attachedItem = attachedElems.Item(item)
attachedItem.SaveAsFile(filePath)
os.close(fileHandle)
metadat = self.createMetadata(dict(filename=filePath))
fileRes = FileResource(data=None,
path=filePath,
metadata=metadat)
attachments.append(fileRes)
fileHandle, filePath = tempfile.mkstemp(prefix="olmail")
filePointer = os.fdopen(fileHandle, "w")
mailContent = mailContent.encode('utf-8')
filePointer.write(mailContent)
filePointer.close()
resource = MailResource(data=mailContent,
contentType=textType,
encoding=enc,
path=filePath,
application='outlook',
identifier=ident,
metadata=metadata,
subResources=attachments)
self.result.append(resource)
agents.register(OutlookCrawler, Master, name='crawl.outlook')