work in progress: automatic classification

git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2071 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2007-09-26 16:38:46 +00:00
parent 52538479d8
commit bf8a641654
14 changed files with 288 additions and 16 deletions

View file

@ -26,8 +26,85 @@ configuration):
>>> t = TestSite(site) >>> t = TestSite(site)
>>> concepts, resources, views = t.setup() >>> concepts, resources, views = t.setup()
>>> len(concepts) + len(resources) >>> len(concepts), len(resources)
18 (20, 0)
Let's now add an external collection that reads in a set of resources
from external files so we have something to work with.
>>> from loops.concept import Concept
>>> from loops.setup import addObject
>>> from loops.common import adapted
>>> from loops.classifier.testsetup import dataDir
>>> tExternalCollection = concepts['extcollection']
>>> coll01 = addObject(concepts, Concept, 'coll01',
... title=u'Collection One', conceptType=tExternalCollection)
>>> aColl01 = adapted(coll01)
>>> aColl01.baseAddress = dataDir
>>> aColl01.address = ''
>>> aColl01.update()
>>> len(resources)
7
>>> rnames = list(sorted(resources.keys()))
>>> rnames[0]
u'cust_im_contract_webbg_20071015.txt'
Filename-based Classification
=============================
Let's first look at the external address (i.e. the file name) of the
resource we want to classify.
>>> r1 = resources[rnames[0]]
>>> adapted(r1)
<loops.resource.ExternalFileAdapter object ...>
>>> adapted(r1).externalAddress
'cust_im_contract_webbg_20071015.txt'
OK, that's what we need. So we get the preconfigured classifier
(see testsetup.py) and let it classify the resource.
>>> classifier = adapted(concepts['fileclassifier'])
Before just processing the resource we'll have a look at the details
and follow the classifier step by step.
>>> from loops.classifier.base import InformationSet
>>> from loops.classifier.interfaces import IExtractor, IAnalyzer
>>> infoSet = InformationSet()
>>> for name in classifier.extractors.split():
... print 'extractor:', name
... extractor = component.getAdapter(adapted(r1), IExtractor, name=name)
... infoSet.update(extractor.extractInformationSet())
extractor: filename
>>> infoSet
{'filename': 'cust_im_contract_webbg_20071015.txt'}
>>> analyzer = component.getUtility(IAnalyzer, name=classifier.analyzer)
>>> statements = analyzer.extractStatements(infoSet, classifier)
>>> statements
[]
So there seems to be something missing - we have to create concepts
that may be identified as being candidates for classification.
>>> tInstitution = addObject(concepts, Concept, 'institution',
... title=u'Institution', conceptType=concepts['type'])
>>> cust_im = addObject(concepts, Concept, 'im_editors',
... title=u'im Editors', conceptType=tInstitution)
>>> t.indexAll(concepts, resources)
>>> statements = analyzer.extractStatements(infoSet, classifier)
>>> len(statements)
1
So we are now ready to run the whole classification process in one call.
>>> classifier.process(r1)
Fin de partie Fin de partie

View file

@ -51,7 +51,14 @@ class Classifier(AdapterBase):
_contextAttributes = list(IClassifier) + list(IConcept) _contextAttributes = list(IClassifier) + list(IConcept)
def process(self, resource): def process(self, resource):
pass infoSet = InformationSet()
for name in self.extractors.split():
extractor = component.getAdapter(adapted(resource), IExtractor, name=name)
infoSet.update(extractor.extractInformationSet())
analyzer = component.getUtility(IAnalyzer, name=self.analyzer)
statements = analyzer.extractStatements(infoSet, self)
for statement in statements:
self.assignConcept(statement)
def assignConcept(self, statement): def assignConcept(self, statement):
pass pass
@ -73,7 +80,7 @@ class Analyzer(object):
implements(IAnalyzer) implements(IAnalyzer)
def extractStatements(self,informationSet): def extractStatements(self, informationSet, classifier=None):
return [] return []
@ -86,7 +93,9 @@ class Statement(object):
implements(IStatement) implements(IStatement)
subject = None def __init__(self, subject=None, predicate=None, object=None, relevance=100):
predicate = None self.subject = subject
object = None self.predicate = predicate
relevance = 100 self.object = object
self.relevance = relevance

View file

@ -33,6 +33,12 @@ class IClassifier(Interface):
concepts to assign. concepts to assign.
""" """
extractors = schema.TextLine(
title=_(u'Extractors'),
description=_(u'Space-separated list of names of extractor adapters.'),
default=u'',
required=False)
analyzer = schema.TextLine( analyzer = schema.TextLine(
title=_(u'Analyzer'), title=_(u'Analyzer'),
description=_(u'Name of a utility that is able to analyze ' description=_(u'Name of a utility that is able to analyze '
@ -72,9 +78,13 @@ class IAnalyzer(Interface):
provide a collection of statements about it. provide a collection of statements about it.
""" """
def extractStatements(informationSet): def extractStatements(informationSet, classifier=None):
""" Return a collection of statements derived from the """ Return a collection of statements derived from the
information set given. information set given.
The ``classifier`` argument may be given in order to
check the environment of the classifier, e.g. available
concepts that may be used as attributes for statements.
""" """

101
classifier/sample.py Normal file
View file

@ -0,0 +1,101 @@
#
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Sample classifier implementation.
$Id$
"""
from zope import component
from zope.app.catalog.interfaces import ICatalog
from zope.component import adapts
from cybertools.organize.interfaces import IPerson
from cybertools.typology.interfaces import IType
from loops.classifier.base import Analyzer
from loops.classifier.base import Statement
class SampleAnalyzer(Analyzer):
    """ Sample analyzer for file names that follow the pattern

          ctype_name_doctype_owner_date.extension

        where ``ctype`` is one of 'cust' or 'emp', ``name`` is the short
        name of a customer or an employee, ``doctype`` is one of 'note' or
        'contract', and ``owner`` is the short name of the user that is
        responsible for the resource.
    """

    def handleCustomer(self, name, classifier):
        """ Return a statement for every concept found for ``name`` whose
            type is one of the customer types ('institution', 'customer')
            present in the concept manager.
        """
        candidates = self.findConcepts(name)
        conceptManager = self.getConceptManager(classifier)
        # only type concepts that really exist can be matched against
        custTypes = []
        for typeName in ('institution', 'customer'):
            typeConcept = conceptManager.get(typeName)
            if typeConcept is not None:
                custTypes.append(typeConcept)
        return [Statement(concept) for concept in candidates
                if IType(concept).typeProvider in custTypes]

    def handleEmployee(self, name, classifier):
        """ Placeholder: yields no statements for the employee part yet.
        """
        return []

    def handleOwner(self, name, classifier):
        """ Placeholder: yields no statements for the owner part yet.
        """
        return []

    def handleDoctype(self, name, classifier):
        """ Placeholder: yields no statements for the doctype part yet.
        """
        return []

    # maps the leading ``ctype`` part of a file name to the plain function
    # handling the following ``name`` part; the functions are called with
    # the analyzer instance passed explicitly.
    handlers = {'cust': handleCustomer, 'emp': handleEmployee}

    def extractStatements(self, informationSet, classifier=None):
        """ Split the 'filename' entry of the information set at the
            underscores and collect the statements of the part handlers.

            Return an empty list if no classifier is given (it is needed
            for getting access to the concepts) or if the information set
            has no 'filename' entry.
        """
        if classifier is None:
            return []
        filename = informationSet.get('filename')
        if filename is None:
            return []
        statements = []
        parts = filename.split('_')
        # A part is only consumed while at least one more part follows it,
        # so the trailing part is never passed to a handler.
        if len(parts) > 1:
            ctype = parts.pop(0)
            handler = self.handlers.get(ctype)
            if handler is not None:
                statements.extend(handler(self, parts.pop(0), classifier))
        if len(parts) > 1:
            doctype = parts.pop(0)
            statements.extend(self.handleDoctype(doctype, classifier))
        if len(parts) > 1:
            owner = parts.pop(0)
            statements.extend(self.handleOwner(owner, classifier))
        return statements

    def findConcepts(self, name):
        """ Query the catalog utility's ``loops_text`` index for ``name``.
        """
        return component.getUtility(ICatalog).searchResults(loops_text=name)

    def getConceptManager(self, obj):
        """ Return the concept manager of the loops root of ``obj.context``.
        """
        loopsRoot = obj.context.getLoopsRoot()
        return loopsRoot.getConceptManager()

41
classifier/standard.py Normal file
View file

@ -0,0 +1,41 @@
#
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Standard implementations of classifier components.
$Id$
"""
from zope.component import adapts
from loops.classifier.base import Extractor
from loops.classifier.base import InformationSet
from loops.interfaces import IExternalFile
class FilenameExtractor(Extractor):
    """ Extractor adapter for external files that provides the external
        address of the file under the key 'filename'.
    """

    adapts(IExternalFile)

    def __init__(self, context):
        # the adapted external file object
        self.context = context

    def extractInformationSet(self):
        """ Return an information set whose only entry, 'filename', is the
            ``externalAddress`` of the context.
        """
        return InformationSet(filename=self.context.externalAddress)

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -0,0 +1 @@

View file

@ -6,12 +6,23 @@ $Id$
import os import os
from zope import component from zope import component
#from zope.app.catalog.interfaces import ICatalog
#from zope.app.catalog.text import TextIndex
from cybertools.storage.interfaces import IExternalStorage
from cybertools.storage.filesystem import fullPathStorage
from loops import util from loops import util
from loops.classifier.base import Classifier, Extractor, Analyzer from loops.classifier.base import Classifier
from loops.classifier.sample import SampleAnalyzer
from loops.classifier.standard import FilenameExtractor
from loops.classifier.interfaces import IClassifier, IAnalyzer from loops.classifier.interfaces import IClassifier, IAnalyzer
from loops.common import adapted
from loops.concept import Concept from loops.concept import Concept
from loops.resource import Resource from loops.resource import Resource, ExternalFileAdapter
from loops.interfaces import IIndexAttributes, IExternalFile
from loops.integrator.collection import DirectoryCollectionProvider
from loops.integrator.collection import ExternalCollectionAdapter
from loops.integrator.interfaces import IExternalCollection, IExternalCollectionProvider
from loops.knowledge.setup import SetupManager as KnowledgeSetupManager from loops.knowledge.setup import SetupManager as KnowledgeSetupManager
from loops.setup import SetupManager, addAndConfigureObject from loops.setup import SetupManager, addAndConfigureObject
from loops.tests.setup import TestSite as BaseTestSite from loops.tests.setup import TestSite as BaseTestSite
@ -28,18 +39,34 @@ class TestSite(BaseTestSite):
component.provideAdapter(KnowledgeSetupManager, name='knowledge') component.provideAdapter(KnowledgeSetupManager, name='knowledge')
concepts, resources, views = self.baseSetup() concepts, resources, views = self.baseSetup()
#catalog = component.getUtility(ICatalog)
#catalog['loops_text'] = TextIndex('text', IIndexAttributes, True)
# classifier and Co
tType = concepts.getTypeConcept() tType = concepts.getTypeConcept()
tClassifier = addAndConfigureObject(concepts, Concept, 'classifier', tClassifier = addAndConfigureObject(concepts, Concept, 'classifier',
title=u'Classifier', conceptType=tType, title=u'Classifier', conceptType=tType,
typeInterface=IClassifier) typeInterface=IClassifier)
component.provideAdapter(Classifier) component.provideAdapter(Classifier)
fileClassifier = addAndConfigureObject(concepts, Concept, sampleClassifier = addAndConfigureObject(concepts, Concept,
'fileclassifier', title=u'File Classifier', 'fileclassifier', title=u'File Classifier',
conceptType=tClassifier) conceptType=tClassifier)
sampleClassifier = adapted(sampleClassifier)
sampleClassifier.extractors = 'filename'
sampleClassifier.analyzer = 'sample'
component.provideAdapter(FilenameExtractor, name='filename')
component.provideUtility(SampleAnalyzer(), IAnalyzer, name='sample')
component.provideAdapter(Extractor) # external file stuff for providing test files
component.provideUtility(Analyzer, IAnalyzer) component.provideAdapter(ExternalFileAdapter, provides=IExternalFile)
component.provideUtility(fullPathStorage(), IExternalStorage, name='fullpath')
component.provideAdapter(ExternalCollectionAdapter)
component.provideUtility(DirectoryCollectionProvider(), IExternalCollectionProvider)
tExtFile = addAndConfigureObject(concepts, Concept, 'extfile',
title=u'External File', conceptType=tType,
typeInterface=IExternalFile)
tExtCollection = addAndConfigureObject(concepts, Concept, 'extcollection',
title=u'External Collection', conceptType=tType,
typeInterface=IExternalCollection)
self.indexAll(concepts, resources) self.indexAll(concepts, resources)
return concepts, resources, views return concepts, resources, views

View file

@ -111,7 +111,7 @@ Working with the External Collection
(u'programming_zope_zope3.txt', u'zope3', 'fullpath')] (u'programming_zope_zope3.txt', u'zope3', 'fullpath')]
We may update the collection after having changed the storage params. We may update the collection after having changed the storage params.
This should also the settings for existing objects if they still This should also change the settings for existing objects if they still
can be found. can be found.
>>> import os >>> import os