added catalog package with a simple variation of hurry.query and a keyword index

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2550 fd906abe-77d9-0310-91a1-e0d9ade77398
This commit is contained in:
helmutm 2008-04-26 16:40:04 +00:00
parent 3acc628b80
commit 56a889540b
6 changed files with 535 additions and 0 deletions

198
catalog/README.txt Normal file
View file

@ -0,0 +1,198 @@
=========================
Catalog, Indexes, Queries
=========================
($Id$)
Set up Working Environment
==========================
We first have to set up an IntIds utility (we use a dummy implementation
for testing purposes here) and a catalog with a few indexes.
>>> from zope import component
>>> from cybertools.relation.tests import IntIdsStub
>>> intid = IntIdsStub()
>>> component.provideUtility(intid)
>>> from zope.app.catalog.interfaces import ICatalog
>>> from zope.app.catalog.catalog import Catalog
>>> catalog = Catalog()
>>> component.provideUtility(catalog, ICatalog)
>>> from zope.interface import Interface, Attribute, implements
>>> class IContent(Interface):
... f1 = Attribute('f1')
... f2 = Attribute('f2')
... f3 = Attribute('f3')
... t1 = Attribute('t1')
... t2 = Attribute('t2')
... k1 = Attribute('k1')
>>> from zope.app.catalog.field import FieldIndex
>>> from zope.app.catalog.text import TextIndex
>>> from cybertools.catalog.keyword import KeywordIndex
>>> catalog['f1'] = FieldIndex('f1', IContent)
>>> catalog['f2'] = FieldIndex('f2', IContent)
>>> catalog['f3'] = FieldIndex('f3', IContent)
>>> catalog['t1'] = TextIndex('t1', IContent)
>>> catalog['t2'] = TextIndex('t2', IContent)
>>> catalog['k1'] = KeywordIndex('k1', IContent)
In addition we need a class for the content objects that we want
to index and query.
>>> from zope.app.container.contained import Contained
>>> class Content(Contained):
... implements(IContent)
... def __init__(self, id, f1='', f2='', f3='', t1='', t2='', k1=[]):
... self.id = id
... self.f1 = f1
... self.f2 = f2
... self.f3 = f3
... self.t1 = t1
... self.t2 = t2
... self.k1 = k1
... def __cmp__(self, other):
... return cmp(self.id, other.id)
The id attribute is there only so that we can easily identify the objects
we find again. Including the __cmp__ method makes sure that search results
can be stably sorted.
We are now ready to create a few content objects so that they can be cataloged.
>>> content = [
... Content(1, 'a', 'b', 'd'),
... Content(2, 'a', 'c'),
... Content(3, 'X', 'c'),
... Content(4, 'a', 'b', 'e'),
... Content(5, 'X', 'b', 'e', k1=('zope', 'plone')),
... Content(6, 'Y', 'Z', t1='some interesting text')]
And catalog them now.
>>> for entry in content:
... catalog.index_doc(intid.register(entry), entry)
Let's provide a simple function for displaying query results.
>>> def displayQuery(q):
... return [intid.getObject(uid).id for uid in q.apply()]
Field Index Queries
===================
Now for a query where f1 equals a.
>>> from cybertools.catalog.query import Eq
>>> f1 = ('', 'f1')
>>> displayQuery(Eq(f1, 'a'))
[1, 2, 4]
Not equals (this is more efficient than the generic ~ operator).
>>> from cybertools.catalog.query import NotEq
>>> displayQuery(NotEq(f1, 'a'))
[3, 5, 6]
Testing whether a field is in a set.
>>> from cybertools.catalog.query import In
>>> displayQuery(In(f1, ['a', 'X']))
[1, 2, 3, 4, 5]
Whether documents are in a specified range.
>>> from cybertools.catalog.query import Between
>>> displayQuery(Between(f1, 'X', 'Y'))
[3, 5, 6]
You can leave out one end of the range.
>>> displayQuery(Between(f1, 'X', None)) # 'X' < 'a'
[1, 2, 3, 4, 5, 6]
>>> displayQuery(Between(f1, None, 'X'))
[3, 5]
You can also use greater-equals and less-equals for the same purpose.
>>> from cybertools.catalog.query import Ge, Le
>>> displayQuery(Ge(f1, 'X'))
[1, 2, 3, 4, 5, 6]
>>> displayQuery(Le(f1, 'X'))
[3, 5]
Negation is also possible with the generic ~ operator.
>>> displayQuery(~Eq(f1, 'a'))
[3, 5, 6]
Using and (&).
>>> f2 = ('', 'f2')
>>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b'))
[1, 4]
Using or (|).
>>> displayQuery(Eq(f1, 'a') | Eq(f2, 'b'))
[1, 2, 4, 5]
These can be chained.
>>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b') & Between(f1, 'a', 'b'))
[1, 4]
>>> displayQuery(Eq(f1, 'a') | Eq(f1, 'X') | Eq(f2, 'b'))
[1, 2, 3, 4, 5]
And nested.
>>> displayQuery((Eq(f1, 'a') | Eq(f1, 'X')) & (Eq(f2, 'b') | Eq(f2, 'c')))
[1, 2, 3, 4, 5]
"and" and "or" can also be spelled differently.
>>> from cybertools.catalog.query import And, Or
>>> displayQuery(And(Eq(f1, 'a'), Eq(f2, 'b')))
[1, 4]
>>> displayQuery(Or(Eq(f1, 'a'), Eq(f2, 'b')))
[1, 2, 4, 5]
Combination of In and &
-----------------------
A combination of 'In' and '&'.
>>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z']))
[1, 2, 3, 4, 5, 6]
>>> displayQuery(In(f1, ['Z']))
[]
>>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z']) & In(f1, ['Z']))
[]
Text Index Queries
==================
>>> from cybertools.catalog.query import Text
>>> t1 = ('', 't1')
>>> displayQuery(Text(t1, 'interesting'))
[6]
Keyword Index Queries
=====================
>>> from cybertools.catalog.query import AllOf, AnyOf
>>> k1 = ('', 'k1')
>>> displayQuery(AnyOf(k1, 'plone'))
[5]
>>> displayQuery(AllOf(k1, ['plone', 'zope', 'zms']))
[]
>>> displayQuery(AllOf(k1, ['plone', 'zope']))
[5]

3
catalog/__init__.py Normal file
View file

@ -0,0 +1,3 @@
"""
$Id$
"""

43
catalog/keyword.py Normal file
View file

@ -0,0 +1,43 @@
#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""Keyword catalog index.
$Id$
"""
import zope.index.keyword
import zope.interface
import zope.app.container.contained
import zope.app.catalog.attribute
import zope.app.catalog.interfaces
class IKeywordIndex(
        zope.app.catalog.interfaces.IAttributeIndex,
        zope.app.catalog.interfaces.ICatalogIndex):
    """Interface-based catalog keyword index.

    Combines the attribute index and catalog index interfaces; it
    declares no additional attributes or methods of its own.
    """
class KeywordIndex(
        zope.app.catalog.attribute.AttributeIndex,
        zope.index.keyword.KeywordIndex,
        zope.app.container.contained.Contained):
    """Keyword index for use inside a catalog.

    Mixes the attribute-based catalog index base class into the plain
    ``zope.index`` keyword index and makes the result containable.
    """

    zope.interface.implements(IKeywordIndex)

260
catalog/query.py Normal file
View file

@ -0,0 +1,260 @@
#
# Copyright (c) 2008 Helmut Merz helmutm@cy55.de
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
"""
Catalog query terms and their logical combinations.
This is mainly a simplified version of Martijn Faassen's hurry.query
(http://cheeseshop.python.org/pypi/hurry.query).
$Id$
"""
from BTrees.IFBTree import weightedIntersection, weightedUnion
from BTrees.IFBTree import difference, IFBTree, IFBucket
from zope.app.intid.interfaces import IIntIds
from zope.app.catalog.catalog import ResultSet
from zope.app.catalog.field import IFieldIndex
from zope.app.catalog.text import ITextIndex
from zope.app.catalog.interfaces import ICatalog
from zope import component
from cybertools.catalog.keyword import IKeywordIndex
class Term(object):
    """Base class of all query terms.

    Provides the operator spellings of the logical combinations:
    ``a & b`` builds an ``And``, ``a | b`` builds an ``Or`` and ``~a``
    builds a ``Not``.
    """

    # conjunction: ``self & other`` and the reflected ``other & self``

    def __and__(self, other):
        return And(self, other)

    def __rand__(self, other):
        return And(other, self)

    # disjunction: ``self | other`` and the reflected ``other | self``

    def __or__(self, other):
        return Or(self, other)

    def __ror__(self, other):
        return Or(other, self)

    # negation: ``~self``

    def __invert__(self):
        return Not(self)
class And(Term):
    """Logical conjunction: the intersection of the results of all terms."""

    def __init__(self, *terms):
        """Remember the terms (any objects with an ``apply()`` method)."""
        self.terms = terms

    def apply(self):
        """Apply every term and intersect the results.

        As soon as one term yields an empty result, that empty result is
        returned unchanged, since the intersection cannot contain anything.
        Without any terms an empty ``IFBucket`` is returned.
        """
        results = []
        for term in self.terms:
            r = term.apply()
            if not r:
                # empty results
                return r
            results.append((len(r), r))
        if not results:
            # no applicable terms at all
            return IFBucket()
        # Start with the smallest result. Sort on the length only: sorting
        # the bare (length, result) tuples would fall back to comparing the
        # result sets themselves whenever two lengths are equal.
        results.sort(key=lambda entry: entry[0])
        _, result = results[0]
        for _, r in results[1:]:
            # weightedIntersection returns (weight, result); only the
            # result is of interest here.
            _, result = weightedIntersection(result, r)
        return result
class Or(Term):
    """Logical disjunction: the union of the results of all terms."""

    def __init__(self, *terms):
        self.terms = terms

    def apply(self):
        """Apply every term and unite all non-empty results.

        Returns an empty ``IFBucket`` when no term yields anything.
        """
        # apply all terms in order, dropping empty results
        non_empty = [found for found in
                     (term.apply() for term in self.terms)
                     if found]
        if not non_empty:
            # no applicable terms at all
            return IFBucket()
        merged = non_empty[0]
        for found in non_empty[1:]:
            _, merged = weightedUnion(merged, found)
        return merged
class Not(Term):
    """Logical negation: everything known to the IntIds utility minus
    the result of the wrapped term."""

    def __init__(self, term):
        self.term = term

    def apply(self):
        """Return all registered ids that the wrapped term does not find."""
        universe = self._all()
        excluded = self.term.apply()
        return difference(universe, excluded)

    def _all(self):
        """Return an ``IFBucket`` mapping every id of the IntIds utility to 0."""
        # XXX may not work well/be efficient with extentcatalog
        # XXX not very efficient in general, better to use internal
        # IntIds datastructure but that would break abstraction..
        everything = IFBucket()
        for uid in component.getUtility(IIntIds):
            everything[uid] = 0
        return everything
class IndexTerm(Term):
    """Base class for terms that query one index of one catalog."""

    def __init__(self, index_id):
        """Store the catalog and index names.

        ``index_id`` is a ``(catalog_name, index_name)`` pair;
        ``catalog_name`` is the name under which the catalog is registered
        as an ``ICatalog`` utility, ``index_name`` the key of the index
        within that catalog.
        """
        # Unpack explicitly rather than with a tuple parameter in the
        # signature: tuple parameters are not introspectable and are not
        # supported by later Python versions. A malformed argument still
        # fails right here with the same TypeError/ValueError.
        catalog_name, index_name = index_id
        self.catalog_name = catalog_name
        self.index_name = index_name

    def getIndex(self):
        """Look up the catalog utility and return the named index."""
        catalog = component.getUtility(ICatalog, self.catalog_name)
        return catalog[self.index_name]
# field index
class FieldTerm(IndexTerm):
    """Base class for terms that work on a field index."""

    def getIndex(self):
        """Return the index, asserting that it provides ``IFieldIndex``."""
        field_index = super(FieldTerm, self).getIndex()
        assert IFieldIndex.providedBy(field_index)
        return field_index
class Eq(FieldTerm):
    """Field value equals the given value."""

    def __init__(self, index_id, value):
        # None is not accepted as a value to compare with
        assert value is not None
        super(Eq, self).__init__(index_id)
        self.value = value

    def apply(self):
        """Query the index with a range whose both ends are the value."""
        index = self.getIndex()
        bounds = (self.value, self.value)
        return index.apply(bounds)
class NotEq(FieldTerm):
    """Field value differs from the given value."""

    def __init__(self, index_id, not_value):
        super(NotEq, self).__init__(index_id)
        self.not_value = not_value

    def apply(self):
        """Return the index's open-range result minus the exact matches."""
        index = self.getIndex()
        # query with both range ends left open
        everything = index.apply((None, None))
        # query for exactly the unwanted value
        unwanted = index.apply((self.not_value, self.not_value))
        return difference(everything, unwanted)
class Between(FieldTerm):
    """Field value lies in the range from ``min_value`` to ``max_value``.

    Either end may be ``None``; it is passed to the index unchanged.
    """

    def __init__(self, index_id, min_value, max_value):
        super(Between, self).__init__(index_id)
        self.min_value, self.max_value = min_value, max_value

    def apply(self):
        """Query the index with the ``(min_value, max_value)`` pair."""
        value_range = (self.min_value, self.max_value)
        return self.getIndex().apply(value_range)
class Ge(Between):
    """A ``Between`` term with ``None`` as its maximum value."""

    def __init__(self, index_id, min_value):
        super(Ge, self).__init__(
            index_id, min_value=min_value, max_value=None)
class Le(Between):
    """A ``Between`` term with ``None`` as its minimum value."""

    def __init__(self, index_id, max_value):
        super(Le, self).__init__(
            index_id, min_value=None, max_value=max_value)
class In(FieldTerm):
    """Field value equals any one of the given values."""

    def __init__(self, index_id, values):
        # None is not accepted among the values to compare with
        assert None not in values
        super(In, self).__init__(index_id)
        self.values = values

    def apply(self):
        """Query the index once per value and unite the non-empty results.

        Returns an empty ``IFBucket`` when none of the values matches.
        """
        index = self.getIndex()
        # one exact-match query per value, dropping empty results
        hits = [found for found in
                (index.apply((value, value)) for value in self.values)
                if found]
        if not hits:
            # no applicable terms at all
            return IFBucket()
        merged = hits[0]
        for found in hits[1:]:
            _, merged = weightedUnion(merged, found)
        return merged
# text index
class Text(IndexTerm):
    """Term that passes a query text to a text index."""

    def __init__(self, index_id, text):
        super(Text, self).__init__(index_id)
        self.text = text

    def getIndex(self):
        """Return the index, asserting that it provides ``ITextIndex``."""
        text_index = super(Text, self).getIndex()
        assert ITextIndex.providedBy(text_index)
        return text_index

    def apply(self):
        """Apply the stored text to the index and return the result."""
        return self.getIndex().apply(self.text)
# keyword index
class KeywordTerm(IndexTerm):
    """Base class for terms that work on a keyword index."""

    def __init__(self, index_id, values):
        super(KeywordTerm, self).__init__(index_id)
        self.values = values

    def getIndex(self):
        """Return the index, asserting that it provides ``IKeywordIndex``."""
        keyword_index = super(KeywordTerm, self).getIndex()
        assert IKeywordIndex.providedBy(keyword_index)
        return keyword_index
class AnyOf(KeywordTerm):
    """Keyword search using the index's ``'or'`` operator."""

    def apply(self):
        return self.getIndex().search(self.values, 'or')
class AllOf(KeywordTerm):
    """Keyword search using the index's ``'and'`` operator."""

    def apply(self):
        return self.getIndex().search(self.values, 'and')

28
catalog/tests.py Executable file
View file

@ -0,0 +1,28 @@
#! /usr/bin/python
"""
Tests for the 'cybertools.catalog' package.
$Id$
"""
import unittest, doctest
from zope.testing.doctestunit import DocFileSuite
class Test(unittest.TestCase):
    """Basic tests for the cybertools.catalog package."""

    def testBasicStuff(self):
        # Intentionally empty placeholder: it does nothing and so always
        # passes.
        pass
def test_suite():
    """Return a suite with the ``Test`` case and the README.txt doctests.

    The doctests run with whitespace normalization and ellipsis
    matching enabled.
    """
    flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(Test))
    suite.addTest(DocFileSuite('README.txt', optionflags=flags))
    return suite
# When executed as a script, hand control to unittest and let it run the
# suite named by ``defaultTest``, i.e. the one returned by test_suite().
if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')

View file

@ -37,6 +37,9 @@ class IntIdsStub(object):
id = self.getId(ob)
self.objs[id] = None
def __iter__(self):
    # Iterate over the integers 0 .. len(self.objs) - 1.
    count = len(self.objs)
    return iter(xrange(count))
class TestRelation(unittest.TestCase):
"Basic tests for the relation package."