work in progress: automatic classification
git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2071 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
52538479d8
commit
bf8a641654
14 changed files with 288 additions and 16 deletions
|
@ -26,8 +26,85 @@ configuration):
|
||||||
>>> t = TestSite(site)
|
>>> t = TestSite(site)
|
||||||
>>> concepts, resources, views = t.setup()
|
>>> concepts, resources, views = t.setup()
|
||||||
|
|
||||||
>>> len(concepts) + len(resources)
|
>>> len(concepts), len(resources)
|
||||||
18
|
(20, 0)
|
||||||
|
|
||||||
|
Let's now add an external collection that reads in a set of resources
|
||||||
|
from external files so we have something to work with.
|
||||||
|
|
||||||
|
>>> from loops.concept import Concept
|
||||||
|
>>> from loops.setup import addObject
|
||||||
|
>>> from loops.common import adapted
|
||||||
|
>>> from loops.classifier.testsetup import dataDir
|
||||||
|
|
||||||
|
>>> tExternalCollection = concepts['extcollection']
|
||||||
|
>>> coll01 = addObject(concepts, Concept, 'coll01',
|
||||||
|
... title=u'Collection One', conceptType=tExternalCollection)
|
||||||
|
>>> aColl01 = adapted(coll01)
|
||||||
|
>>> aColl01.baseAddress = dataDir
|
||||||
|
>>> aColl01.address = ''
|
||||||
|
|
||||||
|
>>> aColl01.update()
|
||||||
|
>>> len(resources)
|
||||||
|
7
|
||||||
|
>>> rnames = list(sorted(resources.keys()))
|
||||||
|
>>> rnames[0]
|
||||||
|
u'cust_im_contract_webbg_20071015.txt'
|
||||||
|
|
||||||
|
|
||||||
|
Filename-based Classification
|
||||||
|
=============================
|
||||||
|
|
||||||
|
Let's first look at the external address (i.e. the file name) of the
|
||||||
|
resource we want to classify.
|
||||||
|
|
||||||
|
>>> r1 = resources[rnames[0]]
|
||||||
|
>>> adapted(r1)
|
||||||
|
<loops.resource.ExternalFileAdapter object ...>
|
||||||
|
>>> adapted(r1).externalAddress
|
||||||
|
'cust_im_contract_webbg_20071015.txt'
|
||||||
|
|
||||||
|
OK, that's what we need. So we get the preconfigured classifier
|
||||||
|
(see testsetup.py) and let it classify the resource.
|
||||||
|
|
||||||
|
>>> classifier = adapted(concepts['fileclassifier'])
|
||||||
|
|
||||||
|
Before just processing the resource we'll have a look at the details
|
||||||
|
and follow the classifier step by step.
|
||||||
|
|
||||||
|
>>> from loops.classifier.base import InformationSet
|
||||||
|
>>> from loops.classifier.interfaces import IExtractor, IAnalyzer
|
||||||
|
>>> infoSet = InformationSet()
|
||||||
|
>>> for name in classifier.extractors.split():
|
||||||
|
... print 'extractor:', name
|
||||||
|
... extractor = component.getAdapter(adapted(r1), IExtractor, name=name)
|
||||||
|
... infoSet.update(extractor.extractInformationSet())
|
||||||
|
extractor: filename
|
||||||
|
|
||||||
|
>>> infoSet
|
||||||
|
{'filename': 'cust_im_contract_webbg_20071015.txt'}
|
||||||
|
|
||||||
|
>>> analyzer = component.getUtility(IAnalyzer, name=classifier.analyzer)
|
||||||
|
>>> statements = analyzer.extractStatements(infoSet, classifier)
|
||||||
|
>>> statements
|
||||||
|
[]
|
||||||
|
|
||||||
|
So there seems to be something missing - we have to create concepts
|
||||||
|
that may be identified as being candidates for classification.
|
||||||
|
|
||||||
|
>>> tInstitution = addObject(concepts, Concept, 'institution',
|
||||||
|
... title=u'Institution', conceptType=concepts['type'])
|
||||||
|
>>> cust_im = addObject(concepts, Concept, 'im_editors',
|
||||||
|
... title=u'im Editors', conceptType=tInstitution)
|
||||||
|
>>> t.indexAll(concepts, resources)
|
||||||
|
|
||||||
|
>>> statements = analyzer.extractStatements(infoSet, classifier)
|
||||||
|
>>> len(statements)
|
||||||
|
1
|
||||||
|
|
||||||
|
So we are now ready to run the whole process in one call.
|
||||||
|
|
||||||
|
>>> classifier.process(r1)
|
||||||
|
|
||||||
|
|
||||||
Fin de partie
|
Fin de partie
|
||||||
|
|
|
@ -51,7 +51,14 @@ class Classifier(AdapterBase):
|
||||||
_contextAttributes = list(IClassifier) + list(IConcept)
|
_contextAttributes = list(IClassifier) + list(IConcept)
|
||||||
|
|
||||||
def process(self, resource):
|
def process(self, resource):
|
||||||
pass
|
infoSet = InformationSet()
|
||||||
|
for name in self.extractors.split():
|
||||||
|
extractor = component.getAdapter(adapted(resource), IExtractor, name=name)
|
||||||
|
infoSet.update(extractor.extractInformationSet())
|
||||||
|
analyzer = component.getUtility(IAnalyzer, name=self.analyzer)
|
||||||
|
statements = analyzer.extractStatements(infoSet, self)
|
||||||
|
for statement in statements:
|
||||||
|
self.assignConcept(statement)
|
||||||
|
|
||||||
def assignConcept(self, statement):
|
def assignConcept(self, statement):
|
||||||
pass
|
pass
|
||||||
|
@ -73,7 +80,7 @@ class Analyzer(object):
|
||||||
|
|
||||||
implements(IAnalyzer)
|
implements(IAnalyzer)
|
||||||
|
|
||||||
def extractStatements(self,informationSet):
|
def extractStatements(self, informationSet, classifier=None):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
@ -86,7 +93,9 @@ class Statement(object):
|
||||||
|
|
||||||
implements(IStatement)
|
implements(IStatement)
|
||||||
|
|
||||||
subject = None
|
def __init__(self, subject=None, predicate=None, object=None, relevance=100):
|
||||||
predicate = None
|
self.subject = subject
|
||||||
object = None
|
self.predicate = predicate
|
||||||
relevance = 100
|
self.object = object
|
||||||
|
self.relevance = relevance
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,12 @@ class IClassifier(Interface):
|
||||||
concepts to assign.
|
concepts to assign.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
extractors = schema.TextLine(
|
||||||
|
title=_(u'Extractors'),
|
||||||
|
description=_(u'Space-separated list of names of extractor adapters.'),
|
||||||
|
default=u'',
|
||||||
|
required=False)
|
||||||
|
|
||||||
analyzer = schema.TextLine(
|
analyzer = schema.TextLine(
|
||||||
title=_(u'Analyzer'),
|
title=_(u'Analyzer'),
|
||||||
description=_(u'Name of a utility that is able to analyze '
|
description=_(u'Name of a utility that is able to analyze '
|
||||||
|
@ -72,9 +78,13 @@ class IAnalyzer(Interface):
|
||||||
provide a collection of statements about it.
|
provide a collection of statements about it.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def extractStatements(informationSet):
|
def extractStatements(informationSet, classifier=None):
|
||||||
""" Return a collection of statements derived from the
|
""" Return a collection of statements derived from the
|
||||||
information set given.
|
information set given.
|
||||||
|
|
||||||
|
The ``classifier`` argument may be given in order to
|
||||||
|
check the environment of the classifier, e.g. available
|
||||||
|
concepts that may be used as attributes for statements.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
101
classifier/sample.py
Normal file
101
classifier/sample.py
Normal file
|
@ -0,0 +1,101 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Sample classifier implementation.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
from zope import component
|
||||||
|
from zope.app.catalog.interfaces import ICatalog
|
||||||
|
from zope.component import adapts
|
||||||
|
|
||||||
|
from cybertools.organize.interfaces import IPerson
|
||||||
|
from cybertools.typology.interfaces import IType
|
||||||
|
from loops.classifier.base import Analyzer
|
||||||
|
from loops.classifier.base import Statement
|
||||||
|
|
||||||
|
|
||||||
|
class SampleAnalyzer(Analyzer):
|
||||||
|
""" A fairly specific analyzer that expects filenames following this
|
||||||
|
format:
|
||||||
|
|
||||||
|
ctype_name_doctype_owner_date.extension
|
||||||
|
|
||||||
|
with ctype = ('cust', 'emp'), name is the short name of a customer or
|
||||||
|
an employee, doctype = ('note', 'contract'), and owner
|
||||||
|
being the short name of the user that is responsible for the
|
||||||
|
resource.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def handleCustomer(self, name, classifier):
|
||||||
|
result = []
|
||||||
|
candidates = self.findConcepts(name)
|
||||||
|
cm = self.getConceptManager(classifier)
|
||||||
|
custTypes = [c for c in (cm.get('institution'), cm.get('customer'),)
|
||||||
|
if c is not None]
|
||||||
|
for c in candidates:
|
||||||
|
ctype = IType(c)
|
||||||
|
if ctype.typeProvider in custTypes:
|
||||||
|
result.append(Statement(c))
|
||||||
|
return result
|
||||||
|
|
||||||
|
def handleEmployee(self, name, classifier):
|
||||||
|
result = []
|
||||||
|
#print 'employee', name
|
||||||
|
return result
|
||||||
|
|
||||||
|
def handleOwner(self, name, classifier):
|
||||||
|
result = []
|
||||||
|
#print 'owner', name
|
||||||
|
return result
|
||||||
|
|
||||||
|
def handleDoctype(self, name, classifier):
|
||||||
|
result = []
|
||||||
|
#print 'doctype', name
|
||||||
|
return result
|
||||||
|
|
||||||
|
handlers = dict(cust=handleCustomer, emp=handleEmployee)
|
||||||
|
|
||||||
|
def extractStatements(self, informationSet, classifier=None):
|
||||||
|
result = []
|
||||||
|
if classifier is None:
|
||||||
|
return result # classifier is needed for getting access to concepts
|
||||||
|
fn = informationSet.get('filename')
|
||||||
|
if fn is None:
|
||||||
|
return result
|
||||||
|
parts = fn.split('_')
|
||||||
|
if len(parts) > 1:
|
||||||
|
ctype = parts.pop(0)
|
||||||
|
if ctype in self.handlers:
|
||||||
|
name = parts.pop(0)
|
||||||
|
result.extend(self.handlers[ctype](self, name, classifier))
|
||||||
|
if len(parts) > 1:
|
||||||
|
result.extend(self.handleDoctype(parts.pop(0), classifier))
|
||||||
|
if len(parts) > 1:
|
||||||
|
result.extend(self.handleOwner(parts.pop(0), classifier))
|
||||||
|
return result
|
||||||
|
|
||||||
|
def findConcepts(self, name):
|
||||||
|
cat = component.getUtility(ICatalog)
|
||||||
|
return cat.searchResults(loops_text=name)
|
||||||
|
|
||||||
|
def getConceptManager(self, obj):
|
||||||
|
return obj.context.getLoopsRoot().getConceptManager()
|
||||||
|
|
41
classifier/standard.py
Normal file
41
classifier/standard.py
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation; either version 2 of the License, or
|
||||||
|
# (at your option) any later version.
|
||||||
|
#
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program; if not, write to the Free Software
|
||||||
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||||
|
#
|
||||||
|
|
||||||
|
"""
|
||||||
|
Standard implementations of classifier components.
|
||||||
|
|
||||||
|
$Id$
|
||||||
|
"""
|
||||||
|
|
||||||
|
from zope.component import adapts
|
||||||
|
|
||||||
|
from loops.classifier.base import Extractor
|
||||||
|
from loops.classifier.base import InformationSet
|
||||||
|
from loops.interfaces import IExternalFile
|
||||||
|
|
||||||
|
|
||||||
|
class FilenameExtractor(Extractor):
|
||||||
|
|
||||||
|
adapts(IExternalFile)
|
||||||
|
|
||||||
|
def __init__(self, context):
|
||||||
|
self.context = context
|
||||||
|
|
||||||
|
def extractInformationSet(self):
|
||||||
|
filename = self.context.externalAddress
|
||||||
|
return InformationSet(filename=filename)
|
1
classifier/testdata/cust_im_contract_webbg_20071015.txt
vendored
Normal file
1
classifier/testdata/cust_im_contract_webbg_20071015.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/cust_im_note_smitha_20071004.txt
vendored
Normal file
1
classifier/testdata/cust_im_note_smitha_20071004.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/cust_im_note_webbg_20070924.txt
vendored
Normal file
1
classifier/testdata/cust_im_note_webbg_20070924.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/cust_mc_note_webbg_20070824.txt
vendored
Normal file
1
classifier/testdata/cust_mc_note_webbg_20070824.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/emp_watersj_note_miller_20070822.txt
vendored
Normal file
1
classifier/testdata/emp_watersj_note_miller_20070822.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/emp_webbg_note_millerj_20070804.txt
vendored
Normal file
1
classifier/testdata/emp_webbg_note_millerj_20070804.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
1
classifier/testdata/note_smitha_20070824.txt
vendored
Normal file
1
classifier/testdata/note_smitha_20070824.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
|
|
@ -6,12 +6,23 @@ $Id$
|
||||||
|
|
||||||
import os
|
import os
|
||||||
from zope import component
|
from zope import component
|
||||||
|
#from zope.app.catalog.interfaces import ICatalog
|
||||||
|
#from zope.app.catalog.text import TextIndex
|
||||||
|
|
||||||
|
from cybertools.storage.interfaces import IExternalStorage
|
||||||
|
from cybertools.storage.filesystem import fullPathStorage
|
||||||
from loops import util
|
from loops import util
|
||||||
from loops.classifier.base import Classifier, Extractor, Analyzer
|
from loops.classifier.base import Classifier
|
||||||
|
from loops.classifier.sample import SampleAnalyzer
|
||||||
|
from loops.classifier.standard import FilenameExtractor
|
||||||
from loops.classifier.interfaces import IClassifier, IAnalyzer
|
from loops.classifier.interfaces import IClassifier, IAnalyzer
|
||||||
|
from loops.common import adapted
|
||||||
from loops.concept import Concept
|
from loops.concept import Concept
|
||||||
from loops.resource import Resource
|
from loops.resource import Resource, ExternalFileAdapter
|
||||||
|
from loops.interfaces import IIndexAttributes, IExternalFile
|
||||||
|
from loops.integrator.collection import DirectoryCollectionProvider
|
||||||
|
from loops.integrator.collection import ExternalCollectionAdapter
|
||||||
|
from loops.integrator.interfaces import IExternalCollection, IExternalCollectionProvider
|
||||||
from loops.knowledge.setup import SetupManager as KnowledgeSetupManager
|
from loops.knowledge.setup import SetupManager as KnowledgeSetupManager
|
||||||
from loops.setup import SetupManager, addAndConfigureObject
|
from loops.setup import SetupManager, addAndConfigureObject
|
||||||
from loops.tests.setup import TestSite as BaseTestSite
|
from loops.tests.setup import TestSite as BaseTestSite
|
||||||
|
@ -28,18 +39,34 @@ class TestSite(BaseTestSite):
|
||||||
component.provideAdapter(KnowledgeSetupManager, name='knowledge')
|
component.provideAdapter(KnowledgeSetupManager, name='knowledge')
|
||||||
concepts, resources, views = self.baseSetup()
|
concepts, resources, views = self.baseSetup()
|
||||||
|
|
||||||
|
#catalog = component.getUtility(ICatalog)
|
||||||
|
#catalog['loops_text'] = TextIndex('text', IIndexAttributes, True)
|
||||||
|
# classifier and Co
|
||||||
tType = concepts.getTypeConcept()
|
tType = concepts.getTypeConcept()
|
||||||
tClassifier = addAndConfigureObject(concepts, Concept, 'classifier',
|
tClassifier = addAndConfigureObject(concepts, Concept, 'classifier',
|
||||||
title=u'Classifier', conceptType=tType,
|
title=u'Classifier', conceptType=tType,
|
||||||
typeInterface=IClassifier)
|
typeInterface=IClassifier)
|
||||||
|
|
||||||
component.provideAdapter(Classifier)
|
component.provideAdapter(Classifier)
|
||||||
fileClassifier = addAndConfigureObject(concepts, Concept,
|
sampleClassifier = addAndConfigureObject(concepts, Concept,
|
||||||
'fileclassifier', title=u'File Classifier',
|
'fileclassifier', title=u'File Classifier',
|
||||||
conceptType=tClassifier)
|
conceptType=tClassifier)
|
||||||
|
sampleClassifier = adapted(sampleClassifier)
|
||||||
|
sampleClassifier.extractors = 'filename'
|
||||||
|
sampleClassifier.analyzer = 'sample'
|
||||||
|
component.provideAdapter(FilenameExtractor, name='filename')
|
||||||
|
component.provideUtility(SampleAnalyzer(), IAnalyzer, name='sample')
|
||||||
|
|
||||||
component.provideAdapter(Extractor)
|
# external file stuff for providing test files
|
||||||
component.provideUtility(Analyzer, IAnalyzer)
|
component.provideAdapter(ExternalFileAdapter, provides=IExternalFile)
|
||||||
|
component.provideUtility(fullPathStorage(), IExternalStorage, name='fullpath')
|
||||||
|
component.provideAdapter(ExternalCollectionAdapter)
|
||||||
|
component.provideUtility(DirectoryCollectionProvider(), IExternalCollectionProvider)
|
||||||
|
tExtFile = addAndConfigureObject(concepts, Concept, 'extfile',
|
||||||
|
title=u'External File', conceptType=tType,
|
||||||
|
typeInterface=IExternalFile)
|
||||||
|
tExtCollection = addAndConfigureObject(concepts, Concept, 'extcollection',
|
||||||
|
title=u'External Collection', conceptType=tType,
|
||||||
|
typeInterface=IExternalCollection)
|
||||||
|
|
||||||
self.indexAll(concepts, resources)
|
self.indexAll(concepts, resources)
|
||||||
return concepts, resources, views
|
return concepts, resources, views
|
||||||
|
|
|
@ -111,7 +111,7 @@ Working with the External Collection
|
||||||
(u'programming_zope_zope3.txt', u'zope3', 'fullpath')]
|
(u'programming_zope_zope3.txt', u'zope3', 'fullpath')]
|
||||||
|
|
||||||
We may update the collection after having changed the storage params.
|
We may update the collection after having changed the storage params.
|
||||||
This should also the settings for existing objects if they still
|
This should also change the settings for existing objects if they still
|
||||||
can be found.
|
can be found.
|
||||||
|
|
||||||
>>> import os
|
>>> import os
|
||||||
|
|
Loading…
Add table
Reference in a new issue