add basic filesystem crawler

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2570 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2008-05-01 18:59:06 +00:00
parent 3e6ac40962
commit 4c4ada47da
7 changed files with 146 additions and 4 deletions

View file

@ -27,7 +27,7 @@ from zope.interface import implements
from cybertools.agent.base.agent import Master
from cybertools.agent.core.agent import QueueableAgent
from cybertools.agent.interfaces import ICrawler
from cybertools.agent.interfaces import IResource, IMetadataSet
from cybertools.agent.components import agents
from twisted.internet.defer import succeed
@ -35,7 +35,7 @@ from twisted.internet.defer import succeed
class Crawler(QueueableAgent):

    implements(ICrawler)

    def __init__(self, master, params={}):
        super(Crawler, self).__init__(master)
@ -54,6 +54,8 @@ class SampleCrawler(Crawler):
        d = succeed([])
        return d
agents.register(SampleCrawler, Master, name='crawl.sample')
class Resource(object):
@ -71,6 +73,18 @@ class Resource(object):
        self.metadata = metadata
class Metadata(dict):
    """Dictionary-based set of metadata describing a crawled resource."""

    implements(IMetadataSet)

    def __init__(self, data=dict()):
        # Copy the supplied mapping into self; ``data`` is only read,
        # never stored or mutated, so the shared default dict is safe.
        for k in data:
            self[k] = data[k]

    def asXML(self):
        """Return an XML representation of the metadata set."""
        # TODO: real serialization still to be implemented.
        return ''

    def set(self, key, value):
        """Store ``value`` under ``key``.

        Bug fix: the original wrote ``self['key'] = value``, storing
        every value under the literal string ``'key'`` instead of the
        key that was passed in.
        """
        self[key] = value

87
agent/crawl/filesystem.py Normal file
View file

@ -0,0 +1,87 @@
#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Filesystem crawler.
$Id$
"""
import os
from fnmatch import filter
from datetime import datetime
from twisted.internet.defer import Deferred
from zope.interface import implements
from cybertools.agent.base.agent import Master
from cybertools.agent.components import agents
from cybertools.agent.crawl.base import Resource, Metadata
from cybertools.agent.crawl.base import Crawler
from cybertools.agent.util.task import coiterate
class FilesystemCrawler(Crawler):
    """Crawler that walks a directory tree and collects file resources."""

    def collect(self):
        # Start the cooperative directory walk; the accumulated result
        # is delivered asynchronously via self.finished.
        self.collected = []
        coiterate(self.crawlFilesystem()).addCallback(self.finished)
        # TODO: addErrback()
        self.deferred = Deferred()
        return self.deferred

    def finished(self, result):
        # Fire the deferred with everything gathered during the walk.
        self.deferred.callback(self.collected)

    def crawlFilesystem(self):
        """Generator yielding once per processed file, for coiterate()."""
        base = self.params.get('directory')
        pat = self.params.get('pattern') or '*'
        since = self.params.get('lastrun') or datetime(1980, 1, 1)
        for dirpath, subdirs, names in os.walk(base):
            # never descend into Subversion bookkeeping directories
            if '.svn' in subdirs:
                subdirs.remove('.svn')
            for dummy in self.loadFiles(dirpath, names, pat, since):
                yield None

    def loadFiles(self, path, files, pattern, lastRun):
        """Collect files under ``path`` matching ``pattern`` and
        modified after ``lastRun``; yields once per file processed."""
        for name in filter(files, pattern):
            fullPath = os.path.join(path, name)
            modified = datetime.fromtimestamp(os.path.getmtime(fullPath))
            if modified <= lastRun:     # file not changed
                continue
            metadata = Metadata(dict(path=fullPath))
            self.collected.append(FileResource(fullPath, metadata))
            yield None


agents.register(FilesystemCrawler, Master, name='crawl.filesystem')
class FileResource(Resource):
    """A resource representing a single file found on the filesystem."""

    # application this kind of resource belongs to
    application = 'filesystem'

    def __init__(self, path, metadata=None):
        self.path = path
        self.metadata = metadata

    @property
    def data(self):
        # NOTE(review): opens in text mode and hands the file object to
        # the caller, who is then responsible for closing it — confirm
        # callers do so.
        return open(self.path, 'r')

View file

@ -0,0 +1,38 @@
================================================
Agents for Job Execution and Communication Tasks
================================================
($Id$)
>>> import os
>>> from time import time
>>> from cybertools.agent.tests import tester, baseDir
>>> config = '''
... controller(names=['core.sample'])
... scheduler(name='core')
... logger(name='default', standard=30)
... '''
>>> from cybertools.agent.main import setup
>>> master = setup(config)
Starting agent application...
Using controllers core.sample.
Filesystem Crawler
==================
>>> controller = master.controllers[0]
>>> controller.createAgent('crawl.filesystem', 'sample03')
In the next step we request the start of a job, again via the controller.
>>> path = os.path.join(baseDir, 'testing', 'data')
>>> controller.enterJob('sample', 'sample03', params=dict(directory=path))
The job is not executed immediately - we have to hand over control to
the twisted reactor first.
>>> from cybertools.agent.tests import tester
>>> tester.iterate()
Job 00001 completed; result: [..., ...];

View file

@ -58,7 +58,7 @@ def setupEnvironment(config):
    from cybertools.agent.control import cmdline
    from cybertools.agent.system.windows import api
    api.setup(config)
    from cybertools.agent.crawl import base, filesystem, outlook
def startReactor(): def startReactor():

View file

@ -0,0 +1 @@
Data from file1.txt

View file

@ -0,0 +1 @@
Data from file2.txt

View file

@ -43,6 +43,7 @@ def test_suite():
        unittest.makeSuite(Test),
        DocFileSuite('README.txt', optionflags=flags),
        DocFileSuite('crawl/README.txt', optionflags=flags),
        DocFileSuite('crawl/filesystem.txt', optionflags=flags),
        DocFileSuite('crawl/outlook.txt', optionflags=flags),
        ))
return testSuite return testSuite