From 907dda565ab932c8cd0157c31cf12b27e7e6d34e Mon Sep 17 00:00:00 2001 From: helmutm Date: Wed, 10 Oct 2007 11:53:00 +0000 Subject: [PATCH] more extractors and analyzers git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2108 fd906abe-77d9-0310-91a1-e0d9ade77398 --- classifier/README.txt | 10 +++++++++ classifier/base.py | 9 +++++--- classifier/browser.py | 1 - classifier/configure.zcml | 32 ++++++++++++++++++++-------- classifier/sample.py | 8 +++++-- classifier/standard.py | 44 ++++++++++++++++++++++++++++++++++++++- 6 files changed, 88 insertions(+), 16 deletions(-) diff --git a/classifier/README.txt b/classifier/README.txt index 5a88272..db42ea7 100644 --- a/classifier/README.txt +++ b/classifier/README.txt @@ -84,6 +84,9 @@ and follow the classifier step by step. >>> infoSet {'filename': 'cust_im_contract_webbg_20071015.txt'} +Let's now use the sample analyzer - an example that interprets very carefully +the underscore-separated parts of the filename. + >>> analyzer = component.getAdapter(classifier, name=classifier.analyzer) >>> statements = analyzer.extractStatements(infoSet) >>> statements @@ -138,6 +141,13 @@ So we are now ready to have the whole stuff run in one call. >>> len(webbg.getResources((concepts['ownedby'],))) 3 +We can repeat the process without getting additional assignments. + + >>> for name in rnames[1:]: + ... classifier.process(resources[name]) + >>> len(webbg.getResources()) + 4 + Fin de partie ============= diff --git a/classifier/base.py b/classifier/base.py index 27b4348..622883b 100644 --- a/classifier/base.py +++ b/classifier/base.py @@ -69,10 +69,13 @@ class Classifier(AdapterBase): statement.subject = resource if statement.predicate is None: statement.predicate = defaultPredicate - self.assignConcept(statement) + self.assignConcept(statement.subject, statement.object, + statement.predicate) - def assignConcept(self, statement): - statement.object.assignResource(statement.subject, statement.predicate) + def assignConcept(self, resource, concept, predicate): + resources = concept.getResources([predicate]) + if resource not in resources: + concept.assignResource(resource, predicate) class Extractor(object): diff --git a/classifier/browser.py b/classifier/browser.py index 6d32e9f..3ef92a5 100644 --- a/classifier/browser.py +++ b/classifier/browser.py @@ -43,7 +43,6 @@ class ClassifierView(ConceptView): cta = adapted(self.context) if cta is not None: for r in collectResources(self.context): - print '***', r.title cta.process(r) return True diff --git a/classifier/configure.zcml b/classifier/configure.zcml index bf5c528..0c91342 100644 --- a/classifier/configure.zcml +++ b/classifier/configure.zcml @@ -3,13 +3,11 @@ + i18n_domain="zope"> - @@ -19,9 +17,7 @@ - + name="filename" trusted="True" /> @@ -30,10 +26,28 @@ + factory="loops.classifier.standard.PathExtractor" + name="path" trusted="True" /> + + + + + + + + + + + diff --git a/classifier/sample.py b/classifier/sample.py index c1436ef..89c1fa4 100644 --- a/classifier/sample.py +++ b/classifier/sample.py @@ -32,6 +32,7 @@ from cybertools.typology.interfaces import IType from loops.classifier.base import Analyzer from loops.classifier.base import Statement from loops.common import adapted +from loops.query import ConceptQuery class SampleAnalyzer(Analyzer): @@ -46,6 +47,10 @@ class SampleAnalyzer(Analyzer): resource. """ + @Lazy + def query(self): + return ConceptQuery(self.context) + def handleCustomer(self, name): custTypes = self.getTypes(('institution', 'customer',)) for c in self.findConcepts(name): @@ -90,8 +95,7 @@ class SampleAnalyzer(Analyzer): return result def findConcepts(self, name): - cat = component.getUtility(ICatalog) - return cat.searchResults(loops_text=name) + return self.query.query(name, 'loops:concept:*') @Lazy def conceptManager(self): diff --git a/classifier/standard.py b/classifier/standard.py index cf8a52b..dc5c795 100644 --- a/classifier/standard.py +++ b/classifier/standard.py @@ -22,11 +22,16 @@ Standard implementations of classifier components. $Id$ """ +import re + +from zope.cachedescriptors.property import Lazy from zope.component import adapts -from loops.classifier.base import Extractor +from loops.classifier.base import Analyzer, Extractor from loops.classifier.base import InformationSet +from loops.classifier.base import Statement from loops.interfaces import IExternalFile +from loops.query import ConceptQuery class FilenameExtractor(Extractor): @@ -39,3 +44,40 @@ class FilenameExtractor(Extractor): def extractInformationSet(self): filename = self.context.externalAddress return InformationSet(filename=filename) + + +class PathExtractor(Extractor): + + adapts(IExternalFile) + + def __init__(self, context): + self.context = context + + def extractInformationSet(self): + params = self.context.storageParams + if 'subdir' in params: + return InformationSet(path=params['subdir']) + else: + return InformationSet() + + +class WordBasedAnalyzer(Analyzer): + + @Lazy + def query(self): + return ConceptQuery(self.context) + + def extractStatements(self, informationSet): + result = [] + for key, value in informationSet.items(): + words = self.split(value) + for w in words: + result.extend([Statement(c) for c in self.findConcepts(w)]) + return result + + def split(self, text): + return re.split('\W+', text) + + def findConcepts(self, word): + return self.query.query(word, 'loops:concept:*') +