work in progress: automatic classification
git-svn-id: svn://svn.cy55.de/Zope3/src/loops/trunk@2071 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
parent
52538479d8
commit
bf8a641654
14 changed files with 288 additions and 16 deletions
|
@ -26,8 +26,85 @@ configuration):
|
|||
>>> t = TestSite(site)
|
||||
>>> concepts, resources, views = t.setup()
|
||||
|
||||
>>> len(concepts) + len(resources)
|
||||
18
|
||||
>>> len(concepts), len(resources)
|
||||
(20, 0)
|
||||
|
||||
Let's now add an external collection that reads in a set of resources
|
||||
from external files so we have something to work with.
|
||||
|
||||
>>> from loops.concept import Concept
|
||||
>>> from loops.setup import addObject
|
||||
>>> from loops.common import adapted
|
||||
>>> from loops.classifier.testsetup import dataDir
|
||||
|
||||
>>> tExternalCollection = concepts['extcollection']
|
||||
>>> coll01 = addObject(concepts, Concept, 'coll01',
|
||||
... title=u'Collection One', conceptType=tExternalCollection)
|
||||
>>> aColl01 = adapted(coll01)
|
||||
>>> aColl01.baseAddress = dataDir
|
||||
>>> aColl01.address = ''
|
||||
|
||||
>>> aColl01.update()
|
||||
>>> len(resources)
|
||||
7
|
||||
>>> rnames = list(sorted(resources.keys()))
|
||||
>>> rnames[0]
|
||||
u'cust_im_contract_webbg_20071015.txt'
|
||||
|
||||
|
||||
Filename-based Classification
|
||||
=============================
|
||||
|
||||
Let's first look at the external address (i.e. the file name) of the
|
||||
resource we want to classify.
|
||||
|
||||
>>> r1 = resources[rnames[0]]
|
||||
>>> adapted(r1)
|
||||
<loops.resource.ExternalFileAdapter object ...>
|
||||
>>> adapted(r1).externalAddress
|
||||
'cust_im_contract_webbg_20071015.txt'
|
||||
|
||||
OK, that's what we need. So we get the preconfigured classifier
|
||||
(see testsetup.py) and let it classify the resource.
|
||||
|
||||
>>> classifier = adapted(concepts['fileclassifier'])
|
||||
|
||||
Before just processing the resource we'll have a look at the details
|
||||
and follow the classifier step by step.
|
||||
|
||||
>>> from loops.classifier.base import InformationSet
|
||||
>>> from loops.classifier.interfaces import IExtractor, IAnalyzer
|
||||
>>> infoSet = InformationSet()
|
||||
>>> for name in classifier.extractors.split():
|
||||
... print 'extractor:', name
|
||||
... extractor = component.getAdapter(adapted(r1), IExtractor, name=name)
|
||||
... infoSet.update(extractor.extractInformationSet())
|
||||
extractor: filename
|
||||
|
||||
>>> infoSet
|
||||
{'filename': 'cust_im_contract_webbg_20071015.txt'}
|
||||
|
||||
>>> analyzer = component.getUtility(IAnalyzer, name=classifier.analyzer)
|
||||
>>> statements = analyzer.extractStatements(infoSet, classifier)
|
||||
>>> statements
|
||||
[]
|
||||
|
||||
So there seems to be something missing - we have to create concepts
|
||||
that may be identified as being candidates for classification.
|
||||
|
||||
>>> tInstitution = addObject(concepts, Concept, 'institution',
|
||||
... title=u'Institution', conceptType=concepts['type'])
|
||||
>>> cust_im = addObject(concepts, Concept, 'im_editors',
|
||||
... title=u'im Editors', conceptType=tInstitution)
|
||||
>>> t.indexAll(concepts, resources)
|
||||
|
||||
>>> statements = analyzer.extractStatements(infoSet, classifier)
|
||||
>>> len(statements)
|
||||
1
|
||||
|
||||
So we are now ready to run the whole classification process in a single call.
|
||||
|
||||
>>> classifier.process(r1)
|
||||
|
||||
|
||||
Fin de partie
|
||||
|
|
|
@ -51,7 +51,14 @@ class Classifier(AdapterBase):
|
|||
_contextAttributes = list(IClassifier) + list(IConcept)
|
||||
|
||||
def process(self, resource):
|
||||
pass
|
||||
infoSet = InformationSet()
|
||||
for name in self.extractors.split():
|
||||
extractor = component.getAdapter(adapted(resource), IExtractor, name=name)
|
||||
infoSet.update(extractor.extractInformationSet())
|
||||
analyzer = component.getUtility(IAnalyzer, name=self.analyzer)
|
||||
statements = analyzer.extractStatements(infoSet, self)
|
||||
for statement in statements:
|
||||
self.assignConcept(statement)
|
||||
|
||||
def assignConcept(self, statement):
|
||||
pass
|
||||
|
@ -73,7 +80,7 @@ class Analyzer(object):
|
|||
|
||||
implements(IAnalyzer)
|
||||
|
||||
def extractStatements(self,informationSet):
|
||||
def extractStatements(self, informationSet, classifier=None):
|
||||
return []
|
||||
|
||||
|
||||
|
@ -86,7 +93,9 @@ class Statement(object):
|
|||
|
||||
implements(IStatement)
|
||||
|
||||
subject = None
|
||||
predicate = None
|
||||
object = None
|
||||
relevance = 100
|
||||
def __init__(self, subject=None, predicate=None, object=None, relevance=100):
|
||||
self.subject = subject
|
||||
self.predicate = predicate
|
||||
self.object = object
|
||||
self.relevance = relevance
|
||||
|
||||
|
|
|
@ -33,6 +33,12 @@ class IClassifier(Interface):
|
|||
concepts to assign.
|
||||
"""
|
||||
|
||||
extractors = schema.TextLine(
|
||||
title=_(u'Extractors'),
|
||||
description=_(u'Space-separated list of names of extractor adapters.'),
|
||||
default=u'',
|
||||
required=False)
|
||||
|
||||
analyzer = schema.TextLine(
|
||||
title=_(u'Analyzer'),
|
||||
description=_(u'Name of a utility that is able to analyze '
|
||||
|
@ -72,9 +78,13 @@ class IAnalyzer(Interface):
|
|||
provide a collection of statements about it.
|
||||
"""
|
||||
|
||||
def extractStatements(informationSet):
|
||||
def extractStatements(informationSet, classifier=None):
|
||||
""" Return a collection of statements derived from the
|
||||
information set given.
|
||||
|
||||
The ``classifier`` argument may be given in order to
|
||||
check the environment of the classifier, e.g. available
|
||||
concepts that may be used as attributes for statements.
|
||||
"""
|
||||
|
||||
|
||||
|
|
101
classifier/sample.py
Normal file
101
classifier/sample.py
Normal file
|
@ -0,0 +1,101 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Sample classifier implementation.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
from zope import component
|
||||
from zope.app.catalog.interfaces import ICatalog
|
||||
from zope.component import adapts
|
||||
|
||||
from cybertools.organize.interfaces import IPerson
|
||||
from cybertools.typology.interfaces import IType
|
||||
from loops.classifier.base import Analyzer
|
||||
from loops.classifier.base import Statement
|
||||
|
||||
|
||||
class SampleAnalyzer(Analyzer):
|
||||
""" A fairly specific analyzer that expects filenames following this
|
||||
format:
|
||||
|
||||
ctype_name_doctype_owner_date.extension
|
||||
|
||||
with ctype = ('cust', 'emp'), name is the short name of a customer or
|
||||
an employee, doctype = ('note', 'contract'), and owner
|
||||
being the short name of the user that is responsible for the
|
||||
resource.
|
||||
"""
|
||||
|
||||
def handleCustomer(self, name, classifier):
|
||||
result = []
|
||||
candidates = self.findConcepts(name)
|
||||
cm = self.getConceptManager(classifier)
|
||||
custTypes = [c for c in (cm.get('institution'), cm.get('customer'),)
|
||||
if c is not None]
|
||||
for c in candidates:
|
||||
ctype = IType(c)
|
||||
if ctype.typeProvider in custTypes:
|
||||
result.append(Statement(c))
|
||||
return result
|
||||
|
||||
def handleEmployee(self, name, classifier):
|
||||
result = []
|
||||
#print 'employee', name
|
||||
return result
|
||||
|
||||
def handleOwner(self, name, classifier):
|
||||
result = []
|
||||
#print 'owner', name
|
||||
return result
|
||||
|
||||
def handleDoctype(self, name, classifier):
|
||||
result = []
|
||||
#print 'doctype', name
|
||||
return result
|
||||
|
||||
handlers = dict(cust=handleCustomer, emp=handleEmployee)
|
||||
|
||||
def extractStatements(self, informationSet, classifier=None):
|
||||
result = []
|
||||
if classifier is None:
|
||||
return result # classifier is needed for getting access to concepts
|
||||
fn = informationSet.get('filename')
|
||||
if fn is None:
|
||||
return result
|
||||
parts = fn.split('_')
|
||||
if len(parts) > 1:
|
||||
ctype = parts.pop(0)
|
||||
if ctype in self.handlers:
|
||||
name = parts.pop(0)
|
||||
result.extend(self.handlers[ctype](self, name, classifier))
|
||||
if len(parts) > 1:
|
||||
result.extend(self.handleDoctype(parts.pop(0), classifier))
|
||||
if len(parts) > 1:
|
||||
result.extend(self.handleOwner(parts.pop(0), classifier))
|
||||
return result
|
||||
|
||||
def findConcepts(self, name):
|
||||
cat = component.getUtility(ICatalog)
|
||||
return cat.searchResults(loops_text=name)
|
||||
|
||||
def getConceptManager(self, obj):
|
||||
return obj.context.getLoopsRoot().getConceptManager()
|
||||
|
41
classifier/standard.py
Normal file
41
classifier/standard.py
Normal file
|
@ -0,0 +1,41 @@
|
|||
#
|
||||
# Copyright (c) 2007 Helmut Merz helmutm@cy55.de
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
#
|
||||
|
||||
"""
|
||||
Standard implementations of classifier components.
|
||||
|
||||
$Id$
|
||||
"""
|
||||
|
||||
from zope.component import adapts
|
||||
|
||||
from loops.classifier.base import Extractor
|
||||
from loops.classifier.base import InformationSet
|
||||
from loops.interfaces import IExternalFile
|
||||
|
||||
|
||||
class FilenameExtractor(Extractor):
|
||||
|
||||
adapts(IExternalFile)
|
||||
|
||||
def __init__(self, context):
|
||||
self.context = context
|
||||
|
||||
def extractInformationSet(self):
|
||||
filename = self.context.externalAddress
|
||||
return InformationSet(filename=filename)
|
1
classifier/testdata/cust_im_contract_webbg_20071015.txt
vendored
Normal file
1
classifier/testdata/cust_im_contract_webbg_20071015.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/cust_im_note_smitha_20071004.txt
vendored
Normal file
1
classifier/testdata/cust_im_note_smitha_20071004.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/cust_im_note_webbg_20070924.txt
vendored
Normal file
1
classifier/testdata/cust_im_note_webbg_20070924.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/cust_mc_note_webbg_20070824.txt
vendored
Normal file
1
classifier/testdata/cust_mc_note_webbg_20070824.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/emp_watersj_note_miller_20070822.txt
vendored
Normal file
1
classifier/testdata/emp_watersj_note_miller_20070822.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/emp_webbg_note_millerj_20070804.txt
vendored
Normal file
1
classifier/testdata/emp_webbg_note_millerj_20070804.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
1
classifier/testdata/note_smitha_20070824.txt
vendored
Normal file
1
classifier/testdata/note_smitha_20070824.txt
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
|
|
@ -6,12 +6,23 @@ $Id$
|
|||
|
||||
import os
|
||||
from zope import component
|
||||
#from zope.app.catalog.interfaces import ICatalog
|
||||
#from zope.app.catalog.text import TextIndex
|
||||
|
||||
from cybertools.storage.interfaces import IExternalStorage
|
||||
from cybertools.storage.filesystem import fullPathStorage
|
||||
from loops import util
|
||||
from loops.classifier.base import Classifier, Extractor, Analyzer
|
||||
from loops.classifier.base import Classifier
|
||||
from loops.classifier.sample import SampleAnalyzer
|
||||
from loops.classifier.standard import FilenameExtractor
|
||||
from loops.classifier.interfaces import IClassifier, IAnalyzer
|
||||
from loops.common import adapted
|
||||
from loops.concept import Concept
|
||||
from loops.resource import Resource
|
||||
from loops.resource import Resource, ExternalFileAdapter
|
||||
from loops.interfaces import IIndexAttributes, IExternalFile
|
||||
from loops.integrator.collection import DirectoryCollectionProvider
|
||||
from loops.integrator.collection import ExternalCollectionAdapter
|
||||
from loops.integrator.interfaces import IExternalCollection, IExternalCollectionProvider
|
||||
from loops.knowledge.setup import SetupManager as KnowledgeSetupManager
|
||||
from loops.setup import SetupManager, addAndConfigureObject
|
||||
from loops.tests.setup import TestSite as BaseTestSite
|
||||
|
@ -28,18 +39,34 @@ class TestSite(BaseTestSite):
|
|||
component.provideAdapter(KnowledgeSetupManager, name='knowledge')
|
||||
concepts, resources, views = self.baseSetup()
|
||||
|
||||
#catalog = component.getUtility(ICatalog)
|
||||
#catalog['loops_text'] = TextIndex('text', IIndexAttributes, True)
|
||||
# classifier and Co
|
||||
tType = concepts.getTypeConcept()
|
||||
tClassifier = addAndConfigureObject(concepts, Concept, 'classifier',
|
||||
title=u'Classifier', conceptType=tType,
|
||||
typeInterface=IClassifier)
|
||||
|
||||
component.provideAdapter(Classifier)
|
||||
fileClassifier = addAndConfigureObject(concepts, Concept,
|
||||
sampleClassifier = addAndConfigureObject(concepts, Concept,
|
||||
'fileclassifier', title=u'File Classifier',
|
||||
conceptType=tClassifier)
|
||||
sampleClassifier = adapted(sampleClassifier)
|
||||
sampleClassifier.extractors = 'filename'
|
||||
sampleClassifier.analyzer = 'sample'
|
||||
component.provideAdapter(FilenameExtractor, name='filename')
|
||||
component.provideUtility(SampleAnalyzer(), IAnalyzer, name='sample')
|
||||
|
||||
component.provideAdapter(Extractor)
|
||||
component.provideUtility(Analyzer, IAnalyzer)
|
||||
# external file stuff for providing test files
|
||||
component.provideAdapter(ExternalFileAdapter, provides=IExternalFile)
|
||||
component.provideUtility(fullPathStorage(), IExternalStorage, name='fullpath')
|
||||
component.provideAdapter(ExternalCollectionAdapter)
|
||||
component.provideUtility(DirectoryCollectionProvider(), IExternalCollectionProvider)
|
||||
tExtFile = addAndConfigureObject(concepts, Concept, 'extfile',
|
||||
title=u'External File', conceptType=tType,
|
||||
typeInterface=IExternalFile)
|
||||
tExtCollection = addAndConfigureObject(concepts, Concept, 'extcollection',
|
||||
title=u'External Collection', conceptType=tType,
|
||||
typeInterface=IExternalCollection)
|
||||
|
||||
self.indexAll(concepts, resources)
|
||||
return concepts, resources, views
|
||||
|
|
|
@ -111,7 +111,7 @@ Working with the External Collection
|
|||
(u'programming_zope_zope3.txt', u'zope3', 'fullpath')]
|
||||
|
||||
We may update the collection after having changed the storage params.
|
||||
This should also the settings for existing objects if they still
|
||||
This should also change the settings for existing objects if they still
|
||||
can be found.
|
||||
|
||||
>>> import os
|
||||
|
|
Loading…
Add table
Reference in a new issue