add basic filesystem crawler
git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2570 fd906abe-77d9-0310-91a1-e0d9ade77398
parent 3e6ac40962
commit 4c4ada47da

7 changed files with 146 additions and 4 deletions
agent/crawl/base.py

@@ -27,7 +27,7 @@ from zope.interface import implements
 from cybertools.agent.base.agent import Master
 from cybertools.agent.core.agent import QueueableAgent
 from cybertools.agent.interfaces import ICrawler
-from cybertools.agent.interfaces import IResource
+from cybertools.agent.interfaces import IResource, IMetadataSet
 from cybertools.agent.components import agents
 from twisted.internet.defer import succeed
@@ -54,6 +54,8 @@ class SampleCrawler(Crawler):
         d = succeed([])
         return d
 
+agents.register(SampleCrawler, Master, name='crawl.sample')
+
 
 class Resource(object):
 
@@ -71,6 +73,18 @@ class Resource(object):
         self.metadata = metadata
 
 
-agents.register(SampleCrawler, Master, name='crawl.sample')
+class Metadata(dict):
+
+    implements(IMetadataSet)
+
+    def __init__(self, data=dict()):
+        for k in data:
+            self[k] = data[k]
+
+    def asXML(self):
+        # TODO...
+        return ''
+
+    def set(self, key, value):
+        self[key] = value
 
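The new Metadata class is deliberately thin: a dict marked with IMetadataSet, seeded from a plain mapping, with a set() convenience method and a still-stubbed asXML(). A minimal usage sketch (the import path follows the filesystem crawler below; the path and size values are illustrative):

    from cybertools.agent.crawl.base import Metadata

    # seed the metadata set from a plain mapping, then add another key via set()
    meta = Metadata(dict(path='/tmp/data/file1.txt'))
    meta.set('size', 1024)            # same effect as meta['size'] = 1024
    assert meta['path'] == '/tmp/data/file1.txt'
    assert meta['size'] == 1024
    assert meta.asXML() == ''         # still a stub, see the TODO above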
							
								
								
									
agent/crawl/filesystem.py (new file, 87 lines)
@@ -0,0 +1,87 @@
+#
+#  Copyright (c) 2008 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+"""
+Filesystem crawler.
+
+$Id$
+"""
+
+import os
+from fnmatch import filter
+from datetime import datetime
+from twisted.internet.defer import Deferred
+from zope.interface import implements
+
+from cybertools.agent.base.agent import Master
+from cybertools.agent.components import agents
+from cybertools.agent.crawl.base import Resource, Metadata
+from cybertools.agent.crawl.base import Crawler
+from cybertools.agent.util.task import coiterate
+
+
+class FilesystemCrawler(Crawler):
+
+    def collect(self):
+        self.collected = []
+        coiterate(self.crawlFilesystem()).addCallback(self.finished)
+        # TODO: addErrback()
+        self.deferred = Deferred()
+        return self.deferred
+
+    def finished(self, result):
+        self.deferred.callback(self.collected)
+
+    def crawlFilesystem(self):
+        directory = self.params.get('directory')
+        pattern = self.params.get('pattern') or '*'
+        lastRun = self.params.get('lastrun') or datetime(1980, 1, 1)
+        for path, dirs, files in os.walk(directory):
+            if '.svn' in dirs:
+                del dirs[dirs.index('.svn')]
+            for x in self.loadFiles(path, files, pattern, lastRun):
+                yield None
+
+    def loadFiles(self, path, files, pattern, lastRun):
+        for f in filter(files, pattern):
+            filename = os.path.join(path, f)
+            mtime = datetime.fromtimestamp(os.path.getmtime(filename))
+            if mtime <= lastRun:  # file not changed
+                continue
+            meta = dict(
+                path=filename,
+            )
+            self.collected.append(FileResource(filename, Metadata(meta)))
+            yield None
+
+agents.register(FilesystemCrawler, Master, name='crawl.filesystem')
+
+
+class FileResource(Resource):
+
+    def __init__(self, path, metadata=None):
+        self.path = path
+        self.metadata = metadata
+
+    application = 'filesystem'
+
+    @property
+    def data(self):
+        return open(self.path, 'r')
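FilesystemCrawler defers the actual walk to coiterate(): crawlFilesystem() yields None after every processed file so the reactor can interleave other work, and the Deferred returned by collect() fires with the collected FileResource list once the walk completes. Stripped of the Twisted plumbing, the file-selection logic amounts to the following standalone sketch (the function name and defaults are illustrative, not part of the module):

    import os
    from datetime import datetime
    from fnmatch import fnmatch

    def changed_files(directory, pattern='*', last_run=datetime(1980, 1, 1)):
        """Yield paths below directory that match pattern and were modified
        after last_run; .svn directories are pruned, as in the crawler."""
        for path, dirs, files in os.walk(directory):
            if '.svn' in dirs:
                dirs.remove('.svn')    # prune so os.walk never descends into it
            for name in files:
                if not fnmatch(name, pattern):
                    continue
                filename = os.path.join(path, name)
                mtime = datetime.fromtimestamp(os.path.getmtime(filename))
                if mtime > last_run:   # only files touched since the last run
                    yield filename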
							
								
								
									
agent/crawl/filesystem.txt (new file, 38 lines)
@@ -0,0 +1,38 @@
+================================================
+Agents for Job Execution and Communication Tasks
+================================================
+
+  ($Id$)
+
+  >>> import os
+  >>> from time import time
+
+  >>> from cybertools.agent.tests import tester, baseDir
+  >>> config = '''
+  ... controller(names=['core.sample'])
+  ... scheduler(name='core')
+  ... logger(name='default', standard=30)
+  ... '''
+  >>> from cybertools.agent.main import setup
+  >>> master = setup(config)
+  Starting agent application...
+  Using controllers core.sample.
+
+
+Filesystem Crawler
+==================
+
+  >>> controller = master.controllers[0]
+  >>> controller.createAgent('crawl.filesystem', 'sample03')
+
+In the next step we request the start of a job, again via the controller.
+
+  >>> path = os.path.join(baseDir, 'testing', 'data')
+  >>> controller.enterJob('sample', 'sample03', params=dict(directory=path))
+
+The job is not executed immediately - we have to hand over control to
+the twisted reactor first.
+
+  >>> from cybertools.agent.tests import tester
+  >>> tester.iterate()
+  Job 00001 completed; result: [..., ...];
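The doctest only passes a directory. crawlFilesystem() also reads optional pattern and lastrun parameters, so an incremental, filtered run could be requested like this (a sketch continuing the session above; the pattern and date values are illustrative):

    from datetime import datetime

    params = dict(
        directory=path,                # root of the tree to crawl
        pattern='*.txt',               # fnmatch-style filename filter
        lastrun=datetime(2008, 1, 1),  # only collect files modified since then
    )
    controller.enterJob('sample', 'sample03', params=params)
    tester.iterate()                   # hand control to the reactor again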
agent/main.py

@@ -58,7 +58,7 @@ def setupEnvironment(config):
     from cybertools.agent.control import cmdline
     from cybertools.agent.system.windows import api
     api.setup(config)
-    from cybertools.agent.crawl import base, outlook
+    from cybertools.agent.crawl import base, filesystem, outlook
 
 
 def startReactor():
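The extra import matters because crawler classes are registered as a module-level side effect (the agents.register(...) call at the bottom of each crawl module), so setupEnvironment() only has to import the module to make the name 'crawl.filesystem' resolvable. A stripped-down, hypothetical illustration of that pattern (registry, register and DummyCrawler are made up for the sketch):

    registry = {}

    def register(cls, name):
        # what agents.register(...) conceptually does for a class/name pair
        registry[name] = cls

    class DummyCrawler(object):
        pass

    # performed at import time by the crawl module:
    register(DummyCrawler, 'crawl.filesystem')

    # what createAgent('crawl.filesystem', ...) can later resolve:
    assert registry['crawl.filesystem'] is DummyCrawler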
							
								
								
									
agent/testing/data/file1.txt (new file, 1 line)

@@ -0,0 +1 @@
+Data from file1.txt
							
								
								
									
agent/testing/data/subdir/file2.txt (new file, 1 line)

@@ -0,0 +1 @@
+Data from file2.txt
agent/tests.py

@@ -43,6 +43,7 @@ def test_suite():
             unittest.makeSuite(Test),
             DocFileSuite('README.txt', optionflags=flags),
             DocFileSuite('crawl/README.txt', optionflags=flags),
+            DocFileSuite('crawl/filesystem.txt', optionflags=flags),
             DocFileSuite('crawl/outlook.txt', optionflags=flags),
     ))
     return testSuite