more extractors and analyzers

git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2108 fd906abe-77d9-0310-91a1-e0d9ade77398
2007-10-10 11:53:00 +00:00 · 2007-10-10 11:53:00 +00:00 · 907dda565a
commit 907dda565a
parent 96d0462b18
6 changed files with 88 additions and 16 deletions
--- a/classifier/README.txt
+++ b/classifier/README.txt
@ -84,6 +84,9 @@ and follow the classifier step by step.
  >>> infoSet
  {'filename': 'cust_im_contract_webbg_20071015.txt'}

+Let's now use the sample analyzer - an example that carefully interprets
+the underscore-separated parts of the filename.
+
  >>> analyzer = component.getAdapter(classifier, name=classifier.analyzer)
  >>> statements = analyzer.extractStatements(infoSet)
  >>> statements
@ -138,6 +141,13 @@ So we are now ready to have the whole stuff run in one call.
  >>> len(webbg.getResources((concepts['ownedby'],)))
  3

+We can repeat the process without getting additional assignments.
+
+  >>> for name in rnames[1:]:
+  ...     classifier.process(resources[name])
+  >>> len(webbg.getResources())
+  4
+

 Fin de partie
 =============
--- a/classifier/base.py
+++ b/classifier/base.py
@ -69,10 +69,13 @@ class Classifier(AdapterBase):
                statement.subject = resource
            if statement.predicate is None:
                statement.predicate = defaultPredicate
-            self.assignConcept(statement)
+            self.assignConcept(statement.subject, statement.object,
+                               statement.predicate)

-    def assignConcept(self, statement):
-        statement.object.assignResource(statement.subject, statement.predicate)
+    def assignConcept(self, resource, concept, predicate):
+        """Assign ``resource`` to ``concept`` via ``predicate`` if not yet done.
+
+        The concept's resources for the given predicate are looked up
+        first; the assignment is only made when the resource is not among
+        them, so processing the same resource again adds nothing.
+        """
+        if resource in concept.getResources([predicate]):
+            return
+        concept.assignResource(resource, predicate)


 class Extractor(object):
--- a/classifier/browser.py
+++ b/classifier/browser.py
@ -43,7 +43,6 @@ class ClassifierView(ConceptView):
            cta = adapted(self.context)
            if cta is not None:
                for r in collectResources(self.context):
-                    print '***', r.title
                    cta.process(r)
        return True

--- a/classifier/configure.zcml
+++ b/classifier/configure.zcml
@ -3,13 +3,11 @@
 <configure
   xmlns:zope="http://namespaces.zope.org/zope"
   xmlns:browser="http://namespaces.zope.org/browser"
-   i18n_domain="zope"
-   >
+   i18n_domain="zope">

  <zope:adapter
        factory="loops.classifier.base.Classifier"
        trusted="True" />
-
  <zope:class class="loops.classifier.base.Classifier">
    <require permission="zope.View"
        interface="loops.classifier.interfaces.IClassifier" />
@ -19,9 +17,7 @@

  <zope:adapter
        factory="loops.classifier.standard.FilenameExtractor"
-        name="filename"
-        trusted="True" />
-
+        name="filename" trusted="True" />
  <zope:class class="loops.classifier.standard.FilenameExtractor">
    <require permission="zope.View"
        interface="loops.classifier.interfaces.IExtractor" />
@ -30,10 +26,28 @@
  </zope:class>

  <zope:adapter
-        factory="loops.classifier.sample.SampleAnalyzer"
-        name="sample"
-        trusted="True" />
+        factory="loops.classifier.standard.PathExtractor"
+        name="path" trusted="True" />
+  <zope:class class="loops.classifier.standard.PathExtractor">
+    <require permission="zope.View"
+        interface="loops.classifier.interfaces.IExtractor" />
+    <require permission="zope.ManageContent"
+        set_schema="loops.classifier.interfaces.IExtractor" />
+  </zope:class>

+  <zope:adapter
+        factory="loops.classifier.standard.WordBasedAnalyzer"
+        name="word-based" trusted="True" />
+  <zope:class class="loops.classifier.standard.WordBasedAnalyzer">
+    <require permission="zope.View"
+        interface="loops.classifier.interfaces.IAnalyzer" />
+    <require permission="zope.ManageContent"
+        set_schema="loops.classifier.interfaces.IAnalyzer" />
+  </zope:class>
+
+  <zope:adapter
+        factory="loops.classifier.sample.SampleAnalyzer"
+        name="sample" trusted="True" />
  <zope:class class="loops.classifier.sample.SampleAnalyzer">
    <require permission="zope.View"
        interface="loops.classifier.interfaces.IAnalyzer" />
--- a/classifier/sample.py
+++ b/classifier/sample.py
@ -32,6 +32,7 @@ from cybertools.typology.interfaces import IType
 from loops.classifier.base import Analyzer
 from loops.classifier.base import Statement
 from loops.common import adapted
+from loops.query import ConceptQuery


 class SampleAnalyzer(Analyzer):
@ -46,6 +47,10 @@ class SampleAnalyzer(Analyzer):
        resource.
    """

+    @Lazy
+    def query(self):
+        """Return a ConceptQuery created for this analyzer's context.
+
+        NOTE(review): ``Lazy`` presumably caches the value per instance
+        after the first access - confirm against its import.
+        """
+        return ConceptQuery(self.context)
+
    def handleCustomer(self, name):
        custTypes = self.getTypes(('institution', 'customer',))
        for c in self.findConcepts(name):
@ -90,8 +95,7 @@ class SampleAnalyzer(Analyzer):
        return result

    def findConcepts(self, name):
-        cat = component.getUtility(ICatalog)
-        return cat.searchResults(loops_text=name)
+        return self.query.query(name, 'loops:concept:*')

    @Lazy
    def conceptManager(self):
--- a/classifier/standard.py
+++ b/classifier/standard.py
@ -22,11 +22,16 @@ Standard implementations of classifier components.
 $Id$
 """

+import re
+
+from zope.cachedescriptors.property import Lazy
 from zope.component import adapts

-from loops.classifier.base import Extractor
+from loops.classifier.base import Analyzer, Extractor
 from loops.classifier.base import InformationSet
+from loops.classifier.base import Statement
 from loops.interfaces import IExternalFile
+from loops.query import ConceptQuery


 class FilenameExtractor(Extractor):
@ -39,3 +44,40 @@ class FilenameExtractor(Extractor):
    def extractInformationSet(self):
        filename = self.context.externalAddress
        return InformationSet(filename=filename)
+
+
+class PathExtractor(Extractor):
+    """Extractor that provides the 'subdir' storage parameter as ``path``."""
+
+    adapts(IExternalFile)
+
+    def __init__(self, context):
+        self.context = context
+
+    def extractInformationSet(self):
+        """Return an InformationSet holding the context's 'subdir' value.
+
+        The value is stored under the key ``path``; if the context's
+        ``storageParams`` have no 'subdir' entry the set is empty.
+        """
+        storageParams = self.context.storageParams
+        if 'subdir' not in storageParams:
+            return InformationSet()
+        return InformationSet(path=storageParams['subdir'])
+
+
+class WordBasedAnalyzer(Analyzer):
+    """Analyzer that runs a concept query for each word of the given values.
+
+    Every value of the information set is split into words; each word is
+    passed to the query and a statement is created for every result.
+    """
+
+    @Lazy
+    def query(self):
+        """Return a ConceptQuery created for this analyzer's context."""
+        return ConceptQuery(self.context)
+
+    def extractStatements(self, informationSet):
+        """Return a list with one Statement per query result.
+
+        ``informationSet`` must provide ``items()``; its values are the
+        texts that get split into words.  Results are not de-duplicated,
+        so the same object may show up in more than one statement.
+        """
+        result = []
+        for key, value in informationSet.items():
+            for word in self.split(value):
+                result.extend([Statement(c) for c in self.findConcepts(word)])
+        return result
+
+    def split(self, text):
+        """Return the words of ``text``, split at runs of non-word chars.
+
+        Empty strings - which ``re.split`` yields when the text starts or
+        ends with a separator, e.g. a path like '/a/b' - are dropped so
+        that no query is run with an empty search word.
+        """
+        return [word for word in re.split(r'\W+', text) if word]
+
+    def findConcepts(self, word):
+        """Run the query for ``word`` restricted to 'loops:concept:*'."""
+        return self.query.query(word, 'loops:concept:*')
+