From bf8a641654e2e10a82ec82d6d6eb7531bc7fb65a Mon Sep 17 00:00:00 2001 From: helmutm Date: Wed, 26 Sep 2007 16:38:46 +0000 Subject: [PATCH] work in progress: automatic classification git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2071 fd906abe-77d9-0310-91a1-e0d9ade77398 --- classifier/README.txt | 81 +++++++++++++- classifier/base.py | 21 ++-- classifier/interfaces.py | 12 ++- classifier/sample.py | 101 ++++++++++++++++++ classifier/standard.py | 41 +++++++ .../cust_im_contract_webbg_20071015.txt | 1 + .../testdata/cust_im_note_smitha_20071004.txt | 1 + .../testdata/cust_im_note_webbg_20070924.txt | 1 + .../testdata/cust_mc_note_webbg_20070824.txt | 1 + .../emp_watersj_note_miller_20070822.txt | 1 + .../emp_webbg_note_millerj_20070804.txt | 1 + classifier/testdata/note_smitha_20070824.txt | 1 + classifier/testsetup.py | 39 +++++-- integrator/README.txt | 2 +- 14 files changed, 288 insertions(+), 16 deletions(-) create mode 100644 classifier/sample.py create mode 100644 classifier/standard.py create mode 100644 classifier/testdata/cust_im_contract_webbg_20071015.txt create mode 100644 classifier/testdata/cust_im_note_smitha_20071004.txt create mode 100644 classifier/testdata/cust_im_note_webbg_20070924.txt create mode 100644 classifier/testdata/cust_mc_note_webbg_20070824.txt create mode 100644 classifier/testdata/emp_watersj_note_miller_20070822.txt create mode 100644 classifier/testdata/emp_webbg_note_millerj_20070804.txt create mode 100644 classifier/testdata/note_smitha_20070824.txt diff --git a/classifier/README.txt b/classifier/README.txt index f48ab40..c9ccbb7 100644 --- a/classifier/README.txt +++ b/classifier/README.txt @@ -26,8 +26,85 @@ configuration): >>> t = TestSite(site) >>> concepts, resources, views = t.setup() - >>> len(concepts) + len(resources) - 18 + >>> len(concepts), len(resources) + (20, 0) + +Let's now add an external collection that reads in a set of resources +from external files so we have something to work with. 
+ + >>> from loops.concept import Concept + >>> from loops.setup import addObject + >>> from loops.common import adapted + >>> from loops.classifier.testsetup import dataDir + + >>> tExternalCollection = concepts['extcollection'] + >>> coll01 = addObject(concepts, Concept, 'coll01', + ... title=u'Collection One', conceptType=tExternalCollection) + >>> aColl01 = adapted(coll01) + >>> aColl01.baseAddress = dataDir + >>> aColl01.address = '' + + >>> aColl01.update() + >>> len(resources) + 7 + >>> rnames = list(sorted(resources.keys())) + >>> rnames[0] + u'cust_im_contract_webbg_20071015.txt' + + +Filename-based Classification +============================= + +Let's first look at the external address (i.e. the file name) of the +resource we want to classify. + + >>> r1 = resources[rnames[0]] + >>> adapted(r1) + + >>> adapted(r1).externalAddress + 'cust_im_contract_webbg_20071015.txt' + +OK, that's what we need. So we get the preconfigured classifier +(see testsetup.py) and let it classify the resource. + + >>> classifier = adapted(concepts['fileclassifier']) + +Before just processing the resource we'll have a look at the details +and follow the classifier step by step. + + >>> from loops.classifier.base import InformationSet + >>> from loops.classifier.interfaces import IExtractor, IAnalyzer + >>> infoSet = InformationSet() + >>> for name in classifier.extractors.split(): + ... print 'extractor:', name + ... extractor = component.getAdapter(adapted(r1), IExtractor, name=name) + ... infoSet.update(extractor.extractInformationSet()) + extractor: filename + + >>> infoSet + {'filename': 'cust_im_contract_webbg_20071015.txt'} + + >>> analyzer = component.getUtility(IAnalyzer, name=classifier.analyzer) + >>> statements = analyzer.extractStatements(infoSet, classifier) + >>> statements + [] + +So there seems to be something missing - we have to create concepts +that may be identified as being candidates for classification. 
+ + >>> tInstitution = addObject(concepts, Concept, 'institution', + ... title=u'Institution', conceptType=concepts['type']) + >>> cust_im = addObject(concepts, Concept, 'im_editors', + ... title=u'im Editors', conceptType=tInstitution) + >>> t.indexAll(concepts, resources) + + >>> statements = analyzer.extractStatements(infoSet, classifier) + >>> len(statements) + 1 + +So we are now ready to have the whole stuff run in one call. + + >>> classifier.process(r1) Fin de partie diff --git a/classifier/base.py b/classifier/base.py index 55a95f8..28f933f 100644 --- a/classifier/base.py +++ b/classifier/base.py @@ -51,7 +51,14 @@ class Classifier(AdapterBase): _contextAttributes = list(IClassifier) + list(IConcept) def process(self, resource): - pass + infoSet = InformationSet() + for name in self.extractors.split(): + extractor = component.getAdapter(adapted(resource), IExtractor, name=name) + infoSet.update(extractor.extractInformationSet()) + analyzer = component.getUtility(IAnalyzer, name=self.analyzer) + statements = analyzer.extractStatements(infoSet, self) + for statement in statements: + self.assignConcept(statement) def assignConcept(self, statement): pass @@ -73,7 +80,7 @@ class Analyzer(object): implements(IAnalyzer) - def extractStatements(self,informationSet): + def extractStatements(self, informationSet, classifier=None): return [] @@ -86,7 +93,9 @@ class Statement(object): implements(IStatement) - subject = None - predicate = None - object = None - relevance = 100 + def __init__(self, subject=None, predicate=None, object=None, relevance=100): + self.subject = subject + self.predicate = predicate + self.object = object + self.relevance = relevance + diff --git a/classifier/interfaces.py b/classifier/interfaces.py index 8a0d513..252e2a9 100644 --- a/classifier/interfaces.py +++ b/classifier/interfaces.py @@ -33,6 +33,12 @@ class IClassifier(Interface): concepts to assign. 
""" + extractors = schema.TextLine( + title=_(u'Extractors'), + description=_(u'Space-separated list of names of extractor adapters.'), + default=u'', + required=False) + analyzer = schema.TextLine( title=_(u'Analyzer'), description=_(u'Name of a utility that is able to analyze ' @@ -72,9 +78,13 @@ class IAnalyzer(Interface): provide a collection of statements about it. """ - def extractStatements(informationSet): + def extractStatements(informationSet, classifier=None): """ Return a collection of statements derived from the information set given. + + The ``classifier`` argument may be given in order to + check the environment of the classifier, e.g. available + concepts that may be used as attributes for statements. """ diff --git a/classifier/sample.py b/classifier/sample.py new file mode 100644 index 0000000..2bbaf9d --- /dev/null +++ b/classifier/sample.py @@ -0,0 +1,101 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Sample classifier implementation. 
+ +$Id$ +""" + +from zope import component +from zope.app.catalog.interfaces import ICatalog +from zope.component import adapts + +from cybertools.organize.interfaces import IPerson +from cybertools.typology.interfaces import IType +from loops.classifier.base import Analyzer +from loops.classifier.base import Statement + + +class SampleAnalyzer(Analyzer): + """ A fairly specific analyzer that expects filenames following this + format: + + ctype_name_doctype_owner_date.extension + + with ctype = ('cust', 'emp'), name is the short name of a customer or + an employee, doctype = ('note', 'contract'), and owner + being the short name of the user that is responsible for the + resource. + """ + + def handleCustomer(self, name, classifier): + result = [] + candidates = self.findConcepts(name) + cm = self.getConceptManager(classifier) + custTypes = [c for c in (cm.get('institution'), cm.get('customer'),) + if c is not None] + for c in candidates: + ctype = IType(c) + if ctype.typeProvider in custTypes: + result.append(Statement(c)) + return result + + def handleEmployee(self, name, classifier): + result = [] + #print 'employee', name + return result + + def handleOwner(self, name, classifier): + result = [] + #print 'owner', name + return result + + def handleDoctype(self, name, classifier): + result = [] + #print 'doctype', name + return result + + handlers = dict(cust=handleCustomer, emp=handleEmployee) + + def extractStatements(self, informationSet, classifier=None): + result = [] + if classifier is None: + return result # classifier is needed for getting access to concepts + fn = informationSet.get('filename') + if fn is None: + return result + parts = fn.split('_') + if len(parts) > 1: + ctype = parts.pop(0) + if ctype in self.handlers: + name = parts.pop(0) + result.extend(self.handlers[ctype](self, name, classifier)) + if len(parts) > 1: + result.extend(self.handleDoctype(parts.pop(0), classifier)) + if len(parts) > 1: + result.extend(self.handleOwner(parts.pop(0), 
classifier)) + return result + + def findConcepts(self, name): + cat = component.getUtility(ICatalog) + return cat.searchResults(loops_text=name) + + def getConceptManager(self, obj): + return obj.context.getLoopsRoot().getConceptManager() + diff --git a/classifier/standard.py b/classifier/standard.py new file mode 100644 index 0000000..cf8a52b --- /dev/null +++ b/classifier/standard.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2007 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Standard implementations of classifier components. 
+ +$Id$ +""" + +from zope.component import adapts + +from loops.classifier.base import Extractor +from loops.classifier.base import InformationSet +from loops.interfaces import IExternalFile + + +class FilenameExtractor(Extractor): + + adapts(IExternalFile) + + def __init__(self, context): + self.context = context + + def extractInformationSet(self): + filename = self.context.externalAddress + return InformationSet(filename=filename) diff --git a/classifier/testdata/cust_im_contract_webbg_20071015.txt b/classifier/testdata/cust_im_contract_webbg_20071015.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/cust_im_contract_webbg_20071015.txt @@ -0,0 +1 @@ + diff --git a/classifier/testdata/cust_im_note_smitha_20071004.txt b/classifier/testdata/cust_im_note_smitha_20071004.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/cust_im_note_smitha_20071004.txt @@ -0,0 +1 @@ + diff --git a/classifier/testdata/cust_im_note_webbg_20070924.txt b/classifier/testdata/cust_im_note_webbg_20070924.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/cust_im_note_webbg_20070924.txt @@ -0,0 +1 @@ + diff --git a/classifier/testdata/cust_mc_note_webbg_20070824.txt b/classifier/testdata/cust_mc_note_webbg_20070824.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/cust_mc_note_webbg_20070824.txt @@ -0,0 +1 @@ + diff --git a/classifier/testdata/emp_watersj_note_miller_20070822.txt b/classifier/testdata/emp_watersj_note_miller_20070822.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/emp_watersj_note_miller_20070822.txt @@ -0,0 +1 @@ + diff --git a/classifier/testdata/emp_webbg_note_millerj_20070804.txt b/classifier/testdata/emp_webbg_note_millerj_20070804.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/emp_webbg_note_millerj_20070804.txt @@ -0,0 +1 @@ + diff --git 
a/classifier/testdata/note_smitha_20070824.txt b/classifier/testdata/note_smitha_20070824.txt new file mode 100644 index 0000000..8d1c8b6 --- /dev/null +++ b/classifier/testdata/note_smitha_20070824.txt @@ -0,0 +1 @@ + diff --git a/classifier/testsetup.py b/classifier/testsetup.py index a20af82..f58e98e 100644 --- a/classifier/testsetup.py +++ b/classifier/testsetup.py @@ -6,12 +6,23 @@ $Id$ import os from zope import component +#from zope.app.catalog.interfaces import ICatalog +#from zope.app.catalog.text import TextIndex +from cybertools.storage.interfaces import IExternalStorage +from cybertools.storage.filesystem import fullPathStorage from loops import util -from loops.classifier.base import Classifier, Extractor, Analyzer +from loops.classifier.base import Classifier +from loops.classifier.sample import SampleAnalyzer +from loops.classifier.standard import FilenameExtractor from loops.classifier.interfaces import IClassifier, IAnalyzer +from loops.common import adapted from loops.concept import Concept -from loops.resource import Resource +from loops.resource import Resource, ExternalFileAdapter +from loops.interfaces import IIndexAttributes, IExternalFile +from loops.integrator.collection import DirectoryCollectionProvider +from loops.integrator.collection import ExternalCollectionAdapter +from loops.integrator.interfaces import IExternalCollection, IExternalCollectionProvider from loops.knowledge.setup import SetupManager as KnowledgeSetupManager from loops.setup import SetupManager, addAndConfigureObject from loops.tests.setup import TestSite as BaseTestSite @@ -28,18 +39,34 @@ class TestSite(BaseTestSite): component.provideAdapter(KnowledgeSetupManager, name='knowledge') concepts, resources, views = self.baseSetup() + #catalog = component.getUtility(ICatalog) + #catalog['loops_text'] = TextIndex('text', IIndexAttributes, True) + # classifier and Co tType = concepts.getTypeConcept() tClassifier = addAndConfigureObject(concepts, Concept, 'classifier', 
title=u'Classifier', conceptType=tType, typeInterface=IClassifier) - component.provideAdapter(Classifier) - fileClassifier = addAndConfigureObject(concepts, Concept, + sampleClassifier = addAndConfigureObject(concepts, Concept, 'fileclassifier', title=u'File Classifier', conceptType=tClassifier) + sampleClassifier = adapted(sampleClassifier) + sampleClassifier.extractors = 'filename' + sampleClassifier.analyzer = 'sample' + component.provideAdapter(FilenameExtractor, name='filename') + component.provideUtility(SampleAnalyzer(), IAnalyzer, name='sample') - component.provideAdapter(Extractor) - component.provideUtility(Analyzer, IAnalyzer) + # external file stuff for providing test files + component.provideAdapter(ExternalFileAdapter, provides=IExternalFile) + component.provideUtility(fullPathStorage(), IExternalStorage, name='fullpath') + component.provideAdapter(ExternalCollectionAdapter) + component.provideUtility(DirectoryCollectionProvider(), IExternalCollectionProvider) + tExtFile = addAndConfigureObject(concepts, Concept, 'extfile', + title=u'External File', conceptType=tType, + typeInterface=IExternalFile) + tExtCollection = addAndConfigureObject(concepts, Concept, 'extcollection', + title=u'External Collection', conceptType=tType, + typeInterface=IExternalCollection) self.indexAll(concepts, resources) return concepts, resources, views diff --git a/integrator/README.txt b/integrator/README.txt index 45e3e95..c52f6a1 100644 --- a/integrator/README.txt +++ b/integrator/README.txt @@ -111,7 +111,7 @@ Working with the External Collection (u'programming_zope_zope3.txt', u'zope3', 'fullpath')] We may update the collection after having changed the storage params. -This should also the settings for existing objects if they still +This should also change the settings for existing objects if they still can be found. >>> import os