diff --git a/agent/crawl/base.py b/agent/crawl/base.py index 188b481..738bfc1 100644 --- a/agent/crawl/base.py +++ b/agent/crawl/base.py @@ -61,16 +61,18 @@ class Resource(object): implements(IResource) + data = file = path = None + type = 'sample' + contentType = 'text/plain' + encoding = '' application = 'sample' + metadata = None - def __init__(self, data=None, file=None, path=None, application=None, - metadata=None): - self.data = data - self.file = file - self.path = path - if application: - self.application = application - self.metadata = metadata + def __init__(self, data=None, **kw): + if data is not None: + self.data = data + for k, v in kw.items(): + setattr(self, k, v) self.subResources = [] @@ -79,8 +81,7 @@ class Metadata(dict): implements(IMetadataSet) def __init__(self, data=dict()): - for k in data: - self[k] = data[k] + self.update(data) def asXML(self): # TODO... diff --git a/agent/crawl/filesystem.py b/agent/crawl/filesystem.py index 690486f..cead352 100644 --- a/agent/crawl/filesystem.py +++ b/agent/crawl/filesystem.py @@ -35,7 +35,6 @@ from cybertools.agent.crawl.base import Crawler from cybertools.agent.util.task import coiterate - class FilesystemCrawler(Crawler): def collect(self): @@ -67,7 +66,7 @@ class FilesystemCrawler(Crawler): meta = dict( path=filename, ) - self.collected.append(FileResource(filename, Metadata(meta))) + self.collected.append(FileResource(path=filename, metadata=Metadata(meta))) yield None agents.register(FilesystemCrawler, Master, name='crawl.filesystem') @@ -75,13 +74,12 @@ agents.register(FilesystemCrawler, Master, name='crawl.filesystem') class FileResource(Resource): - def __init__(self, path, metadata=None): - self.path = path - self.metadata = metadata - + type = 'file' application = 'filesystem' @property def data(self): - return open(self.path, 'r') - + f = open(self.path, 'r') + text = f.read() + f.close() + return text diff --git a/agent/crawl/filesystem.txt b/agent/crawl/filesystem.txt index 2755c99..5783e1b 100644 --- a/agent/crawl/filesystem.txt +++ b/agent/crawl/filesystem.txt @@ -38,5 +38,5 @@ the twisted reactor first. Job 00001 completed; result: [..., ...]; >>> r0 = controller.result[0] - >>> r0.metadata, r0.data.read() + >>> r0.metadata, r0.data ({'path': '...file1.txt'}, 'Data from file1.txt') diff --git a/agent/crawl/mail.py b/agent/crawl/mail.py index 78a7b5a..b9f0857 100644 --- a/agent/crawl/mail.py +++ b/agent/crawl/mail.py @@ -59,9 +59,11 @@ class MailCrawler(Crawler): def login(self): pass +agents.register(MailCrawler, Master, name='crawl.mail') + class MailResource(Resource): - application = 'outlook' + type = 'email' + application = 'mailclient' -agents.register(MailCrawler, Master, name='crawl.mail') diff --git a/agent/crawl/outlook.py b/agent/crawl/outlook.py index 95086ea..878c5fd 100644 --- a/agent/crawl/outlook.py +++ b/agent/crawl/outlook.py @@ -130,7 +130,7 @@ class OutlookCrawler(MailCrawler): # Create the mime email object msg = self.createEmailMime(record) # Create a resource and append it to the result list - self.createResource(msg, folder, "Microsoft Office Outlook") + self.createResource(msg, application='outlook') yield None def login(self): diff --git a/agent/interfaces.py b/agent/interfaces.py index e602972..2d87d18 100644 --- a/agent/interfaces.py +++ b/agent/interfaces.py @@ -214,6 +214,12 @@ class IResource(Interface): 'None if the data or file attribute is given.') identifier = Attribute('A string (usually derived from the path) that ' 'uniquely identifies the resource.') + type = Attribute('A string denoting the type of the resource, e.g. ' + '"file" or "email".') + contentType = Attribute('A string denoting the MIME type of the data, ' + 'e.g. "text/plain" or "application/octet-stream"') + encoding = Attribute('Optional: a string denoting the encoding of the ' + 'file data, e.g. "UTF-8".') application = Attribute('The name of the application that provided ' 'the resource, e.g. "filesystem" or "mail".') metadata = Attribute('Information describing this resource; '