From db31c2d25244e4bc6c067b14ced8d99700f0aa37 Mon Sep 17 00:00:00 2001
From: helmutm
Date: Thu, 9 Aug 2007 14:55:07 +0000
Subject: [PATCH] work in progress: filesystem crawler - added basic metadata handling

git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@1893 fd906abe-77d9-0310-91a1-e0d9ade77398
---
Review note: Metadata.set() in agent/crawl/base.py now stores the value under
the key that was passed in; the original hunk wrote to the literal string
'key', so every call overwrote the same entry. Because of that one-token
change the blob id on the base.py "index" line no longer matches the
resulting file. Whitespace in the hunks below was reconstructed.

 agent/crawl/base.py        |  6 ++++++
 agent/crawl/filesystem.py  | 20 +++++++++++---------
 agent/crawl/filesystem.txt |  2 ++
 agent/interfaces.py        |  2 +-
 agent/testing/crawl.py     |  9 ++-------
 agent/testing/transport.py |  2 ++
 6 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/agent/crawl/base.py b/agent/crawl/base.py
index baa7944..0902484 100644
--- a/agent/crawl/base.py
+++ b/agent/crawl/base.py
@@ -49,4 +49,10 @@ class Metadata(object):
     def __init__(self, data=dict()):
         self.data = data
 
+    def asXml(self):
+        # TODO...
+        return ''
+
+    def set(self, key, value):
+        self.data[key] = value
 
diff --git a/agent/crawl/filesystem.py b/agent/crawl/filesystem.py
index 0b4b6f1..c1dcab6 100644
--- a/agent/crawl/filesystem.py
+++ b/agent/crawl/filesystem.py
@@ -36,15 +36,14 @@ from loops.agent.crawl.base import Metadata
 class CrawlingJob(BaseCrawlingJob):
 
     def collect(self):
-        self.data = []
-        #deferred = reactor.deferToThread(self.crawlFilesystem, dataAvailable)
-        deferred = self.deferred = Deferred()
-        self.internalDeferred = coiterate(self.crawlFilesystem())
-        self.internalDeferred.addCallback(self.finished)
-        return deferred
+        self.collected = []
+        coiterate(self.crawlFilesystem()).addCallback(self.finished)
+        # TODO: addErrback()
+        self.deferred = Deferred()
+        return self.deferred
 
     def finished(self, result):
-        self.deferred.callback(self.data)
+        self.deferred.callback(self.collected)
 
     def crawlFilesystem(self):
         criteria = self.params
@@ -59,8 +58,11 @@ class CrawlingJob(BaseCrawlingJob):
                     mtime = datetime.fromtimestamp(
                                 os.stat(filename)[stat.ST_MTIME])
                     # TODO: check modification time
-                    self.data.append((FileResource(filename),
-                                      Metadata(dict())))
+                    meta = dict(
+                        path=filename,
+                    )
+                    self.collected.append((FileResource(filename),
+                                           Metadata(meta)))
                     yield None
 
 
diff --git a/agent/crawl/filesystem.txt b/agent/crawl/filesystem.txt
index e569a03..26a63b9 100644
--- a/agent/crawl/filesystem.txt
+++ b/agent/crawl/filesystem.txt
@@ -35,6 +35,8 @@ We are now ready to schedule the job and let the reactor execute it.
 
   >>> scheduler.schedule(crawlJob, int(time()))
   >>> tester.iterate()
+  Metadata: {'path': '...data...file1.txt'}
   Transferring: Data from file1.txt
+  Metadata: {'path': '...data...subdir...file2.txt'}
   Transferring: Data from file2.txt
 
diff --git a/agent/interfaces.py b/agent/interfaces.py
index 7c30acf..44798bf 100644
--- a/agent/interfaces.py
+++ b/agent/interfaces.py
@@ -124,7 +124,7 @@ class IMetadataSet(Interface):
             (nested metadata) this will be converted to XML as well.
         """
 
-    def setData(key, value):
+    def set(key, value):
         """ Set a metadata element.
 
             The value may be a string or another metadata set
diff --git a/agent/testing/crawl.py b/agent/testing/crawl.py
index 110f3ef..52929a8 100644
--- a/agent/testing/crawl.py
+++ b/agent/testing/crawl.py
@@ -26,7 +26,7 @@ from twisted.internet import reactor
 from twisted.internet.defer import Deferred
 from zope.interface import implements
 
-from loops.agent.interfaces import ICrawlingJob, IResource, IMetadataSet
+from loops.agent.interfaces import ICrawlingJob, IResource
 from loops.agent.crawl.base import CrawlingJob as BaseCrawlingJob
 
 
@@ -39,12 +39,7 @@ class CrawlingJob(BaseCrawlingJob):
         return deferred
 
     def dataAvailable(self):
-        self.deferred.callback([(DummyResource(), Metadata())])
-
-
-class Metadata(object):
-
-    implements(IMetadataSet)
+        self.deferred.callback([(DummyResource(), None)])
 
 
 class DummyResource(object):
diff --git a/agent/testing/transport.py b/agent/testing/transport.py
index ae1dad4..d755546 100644
--- a/agent/testing/transport.py
+++ b/agent/testing/transport.py
@@ -55,6 +55,8 @@ class Transporter(BaseTransporter):
             data.close()
         else:
             text = data
+        if metadata is not None:
+            print 'Metadata:', metadata.data
         print 'Transferring:', text
         return Deferred()
 