From 56a889540b08a123bf9acc433e7b42c1664b79e2 Mon Sep 17 00:00:00 2001 From: helmutm Date: Sat, 26 Apr 2008 16:40:04 +0000 Subject: [PATCH] added catalog package with a simple variation of hurry.query and a keyword index git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2550 fd906abe-77d9-0310-91a1-e0d9ade77398 --- catalog/README.txt | 198 +++++++++++++++++++++++++++++++++ catalog/__init__.py | 3 + catalog/keyword.py | 43 ++++++++ catalog/query.py | 260 ++++++++++++++++++++++++++++++++++++++++++++ catalog/tests.py | 28 +++++ relation/tests.py | 3 + 6 files changed, 535 insertions(+) create mode 100644 catalog/README.txt create mode 100644 catalog/__init__.py create mode 100644 catalog/keyword.py create mode 100644 catalog/query.py create mode 100755 catalog/tests.py diff --git a/catalog/README.txt b/catalog/README.txt new file mode 100644 index 0000000..3b48988 --- /dev/null +++ b/catalog/README.txt @@ -0,0 +1,198 @@ +========================= +Catalog, Indexes, Queries +========================= + + ($Id$) + + +Set up Working Environment +========================== + +We first have to set up an IntIds utility (we use a dummy implementation +for testing purposes here) and a catalog with a few indexes. + + >>> from zope import component + >>> from cybertools.relation.tests import IntIdsStub + >>> intid = IntIdsStub() + >>> component.provideUtility(intid) + + >>> from zope.app.catalog.interfaces import ICatalog + >>> from zope.app.catalog.catalog import Catalog + >>> catalog = Catalog() + >>> component.provideUtility(catalog, ICatalog) + + >>> from zope.interface import Interface, Attribute, implements + >>> class IContent(Interface): + ... f1 = Attribute('f1') + ... f2 = Attribute('f2') + ... f3 = Attribute('f3') + ... t1 = Attribute('t1') + ... t2 = Attribute('t2') + ... 
k1 = Attribute('k1') + + >>> from zope.app.catalog.field import FieldIndex + >>> from zope.app.catalog.text import TextIndex + >>> from cybertools.catalog.keyword import KeywordIndex + >>> catalog['f1'] = FieldIndex('f1', IContent) + >>> catalog['f2'] = FieldIndex('f2', IContent) + >>> catalog['f3'] = FieldIndex('f3', IContent) + >>> catalog['t1'] = TextIndex('t1', IContent) + >>> catalog['t2'] = TextIndex('t2', IContent) + >>> catalog['k1'] = KeywordIndex('k1', IContent) + +In addition we need a class for the content objects that we want +to index and query. + + >>> from zope.app.container.contained import Contained + >>> class Content(Contained): + ... implements(IContent) + ... def __init__(self, id, f1='', f2='', f3='', t1='', t2='', k1=[]): + ... self.id = id + ... self.f1 = f1 + ... self.f2 = f2 + ... self.f3 = f3 + ... self.t1 = t1 + ... self.t2 = t2 + ... self.k1 = k1 + ... def __cmp__(self, other): + ... return cmp(self.id, other.id) + +The id attribute is only there so that we can easily identify the objects +we find again. By including the __cmp__ method we make sure search results +can be stably sorted. + +We are now ready to create a few content objects. + +Let's create them now so that they can be cataloged afterwards. + + >>> content = [ + ... Content(1, 'a', 'b', 'd'), + ... Content(2, 'a', 'c'), + ... Content(3, 'X', 'c'), + ... Content(4, 'a', 'b', 'e'), + ... Content(5, 'X', 'b', 'e', k1=('zope', 'plone')), + ... Content(6, 'Y', 'Z', t1='some interesting text')] + +And catalog them now. + + >>> for entry in content: + ... catalog.index_doc(intid.register(entry), entry) + +Let's provide a simple function for displaying query results. + + >>> def displayQuery(q): + ... return [intid.getObject(uid).id for uid in q.apply()] + + +Field Index Queries +=================== + +Now for a query where f1 equals 'a'. 
+ + >>> from cybertools.catalog.query import Eq + >>> f1 = ('', 'f1') + >>> displayQuery(Eq(f1, 'a')) + [1, 2, 4] + +Not equals (this is more efficient than the generic ~ operator). + + >>> from cybertools.catalog.query import NotEq + >>> displayQuery(NotEq(f1, 'a')) + [3, 5, 6] + +Testing whether a field is in a set. + + >>> from cybertools.catalog.query import In + >>> displayQuery(In(f1, ['a', 'X'])) + [1, 2, 3, 4, 5] + +Testing whether documents are in a specified range. + + >>> from cybertools.catalog.query import Between + >>> displayQuery(Between(f1, 'X', 'Y')) + [3, 5, 6] + +You can leave out one end of the range. + + >>> displayQuery(Between(f1, 'X', None)) # 'X' < 'a' + [1, 2, 3, 4, 5, 6] + >>> displayQuery(Between(f1, None, 'X')) + [3, 5] + +You can also use greater-equals and less-equals for the same purpose. + + >>> from cybertools.catalog.query import Ge, Le + >>> displayQuery(Ge(f1, 'X')) + [1, 2, 3, 4, 5, 6] + >>> displayQuery(Le(f1, 'X')) + [3, 5] + +It's also possible to negate a term with the ~ operator. + + >>> displayQuery(~Eq(f1, 'a')) + [3, 5, 6] + +Using and (&). + + >>> f2 = ('', 'f2') + >>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b')) + [1, 4] + +Using or (|). + + >>> displayQuery(Eq(f1, 'a') | Eq(f2, 'b')) + [1, 2, 4, 5] + +These can be chained. + + >>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b') & Between(f1, 'a', 'b')) + [1, 4] + >>> displayQuery(Eq(f1, 'a') | Eq(f1, 'X') | Eq(f2, 'b')) + [1, 2, 3, 4, 5] + +And nested. + + >>> displayQuery((Eq(f1, 'a') | Eq(f1, 'X')) & (Eq(f2, 'b') | Eq(f2, 'c'))) + [1, 2, 3, 4, 5] + +"and" and "or" can also be spelled out as the And and Or classes. + + >>> from cybertools.catalog.query import And, Or + >>> displayQuery(And(Eq(f1, 'a'), Eq(f2, 'b'))) + [1, 4] + >>> displayQuery(Or(Eq(f1, 'a'), Eq(f2, 'b'))) + [1, 2, 4, 5] + +Combination of In and & +----------------------- + +A combination of 'In' and '&'. 
+ + >>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z'])) + [1, 2, 3, 4, 5, 6] + >>> displayQuery(In(f1, ['Z'])) + [] + >>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z']) & In(f1, ['Z'])) + [] + + +Text Index Queries +================== + + >>> from cybertools.catalog.query import Text + >>> t1 = ('', 't1') + >>> displayQuery(Text(t1, 'interesting')) + [6] + + +Keyword Index Queries +===================== + + >>> from cybertools.catalog.query import AllOf, AnyOf + >>> k1 = ('', 'k1') + >>> displayQuery(AnyOf(k1, 'plone')) + [5] + >>> displayQuery(AllOf(k1, ['plone', 'zope', 'zms'])) + [] + >>> displayQuery(AllOf(k1, ['plone', 'zope'])) + [5] diff --git a/catalog/__init__.py b/catalog/__init__.py new file mode 100644 index 0000000..38314f3 --- /dev/null +++ b/catalog/__init__.py @@ -0,0 +1,3 @@ +""" +$Id$ +""" diff --git a/catalog/keyword.py b/catalog/keyword.py new file mode 100644 index 0000000..4b362a2 --- /dev/null +++ b/catalog/keyword.py @@ -0,0 +1,43 @@ +# +# Copyright (c) 2008 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +"""Keyword catalog index. 
+ +$Id$ +""" + +import zope.index.keyword +import zope.interface + +import zope.app.container.contained +import zope.app.catalog.attribute +import zope.app.catalog.interfaces + + +class IKeywordIndex(zope.app.catalog.interfaces.IAttributeIndex, + zope.app.catalog.interfaces.ICatalogIndex): + """Interface-based catalog keyword index. + """ + + +class KeywordIndex(zope.app.catalog.attribute.AttributeIndex, + zope.index.keyword.KeywordIndex, + zope.app.container.contained.Contained): + + zope.interface.implements(IKeywordIndex) + diff --git a/catalog/query.py b/catalog/query.py new file mode 100644 index 0000000..8075c61 --- /dev/null +++ b/catalog/query.py @@ -0,0 +1,260 @@ +# +# Copyright (c) 2008 Helmut Merz helmutm@cy55.de +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +""" +Catalog query terms and their logical combinations. + +This is mainly a simplified version of Martijn Faassen's hurry.query +(http://cheeseshop.python.org/pypi/hurry.query). 
+ +$Id$ +""" + +from BTrees.IFBTree import weightedIntersection, weightedUnion +from BTrees.IFBTree import difference, IFBTree, IFBucket +from zope.app.intid.interfaces import IIntIds +from zope.app.catalog.catalog import ResultSet +from zope.app.catalog.field import IFieldIndex +from zope.app.catalog.text import ITextIndex +from zope.app.catalog.interfaces import ICatalog +from zope import component + +from cybertools.catalog.keyword import IKeywordIndex + + +class Term(object): + + def __and__(self, other): + return And(self, other) + + def __rand__(self, other): + return And(other, self) + + def __or__(self, other): + return Or(self, other) + + def __ror__(self, other): + return Or(other, self) + + def __invert__(self): + return Not(self) + + +class And(Term): + + def __init__(self, *terms): + self.terms = terms + + def apply(self): + results = [] + for term in self.terms: + r = term.apply() + if not r: + # empty results + return r + results.append((len(r), r)) + if not results: + # no applicable terms at all + return IFBucket() + results.sort() + _, result = results.pop(0) + for _, r in results: + w, result = weightedIntersection(result, r) + return result + + +class Or(Term): + + def __init__(self, *terms): + self.terms = terms + + def apply(self): + results = [] + for term in self.terms: + r = term.apply() + # empty results + if not r: + continue + results.append(r) + if not results: + # no applicable terms at all + return IFBucket() + result = results.pop(0) + for r in results: + w, result = weightedUnion(result, r) + return result + + +class Not(Term): + + def __init__(self, term): + self.term = term + + def apply(self): + return difference(self._all(), self.term.apply()) + + def _all(self): + # XXX may not work well/be efficient with extentcatalog + # XXX not very efficient in general, better to use internal + # IntIds datastructure but that would break abstraction.. 
+ intids = component.getUtility(IIntIds) + result = IFBucket() + for uid in intids: + result[uid] = 0 + return result + + +class IndexTerm(Term): + + def __init__(self, (catalog_name, index_name)): + self.catalog_name = catalog_name + self.index_name = index_name + + def getIndex(self): + catalog = component.getUtility(ICatalog, self.catalog_name) + index = catalog[self.index_name] + return index + + +# field index + +class FieldTerm(IndexTerm): + + def getIndex(self): + index = super(FieldTerm, self).getIndex() + assert IFieldIndex.providedBy(index) + return index + + +class Eq(FieldTerm): + + def __init__(self, index_id, value): + assert value is not None + super(Eq, self).__init__(index_id) + self.value = value + + def apply(self): + return self.getIndex().apply((self.value, self.value)) + + +class NotEq(FieldTerm): + + def __init__(self, index_id, not_value): + super(NotEq, self).__init__(index_id) + self.not_value = not_value + + def apply(self): + index = self.getIndex() + all = index.apply((None, None)) + r = index.apply((self.not_value, self.not_value)) + return difference(all, r) + + +class Between(FieldTerm): + + def __init__(self, index_id, min_value, max_value): + super(Between, self).__init__(index_id) + self.min_value = min_value + self.max_value = max_value + + def apply(self): + return self.getIndex().apply((self.min_value, self.max_value)) + + +class Ge(Between): + + def __init__(self, index_id, min_value): + super(Ge, self).__init__(index_id, min_value, None) + + +class Le(Between): + + def __init__(self, index_id, max_value): + super(Le, self).__init__(index_id, None, max_value) + + +class In(FieldTerm): + + def __init__(self, index_id, values): + assert None not in values + super(In, self).__init__(index_id) + self.values = values + + def apply(self): + results = [] + index = self.getIndex() + for value in self.values: + r = index.apply((value, value)) + # empty results + if not r: + continue + results.append(r) + if not results: + # no 
applicable terms at all + return IFBucket() + result = results.pop(0) + for r in results: + w, result = weightedUnion(result, r) + return result + + +# text index + +class Text(IndexTerm): + + def __init__(self, index_id, text): + super(Text, self).__init__(index_id) + self.text = text + + def getIndex(self): + index = super(Text, self).getIndex() + assert ITextIndex.providedBy(index) + return index + + def apply(self): + index = self.getIndex() + return index.apply(self.text) + + +# keyword index + +class KeywordTerm(IndexTerm): + + def __init__(self, index_id, values): + super(KeywordTerm, self).__init__(index_id) + self.values = values + + def getIndex(self): + index = super(KeywordTerm, self).getIndex() + assert IKeywordIndex.providedBy(index) + return index + + +class AnyOf(KeywordTerm): + + def apply(self): + index = self.getIndex() + return index.search(self.values, 'or') + + +class AllOf(KeywordTerm): + + def apply(self): + index = self.getIndex() + return index.search(self.values, 'and') + diff --git a/catalog/tests.py b/catalog/tests.py new file mode 100755 index 0000000..e20b183 --- /dev/null +++ b/catalog/tests.py @@ -0,0 +1,28 @@ +#! /usr/bin/python + +""" +Tests for the 'cybertools.catalog' package. + +$Id$ +""" + +import unittest, doctest +from zope.testing.doctestunit import DocFileSuite + + +class Test(unittest.TestCase): + "Basic tests for the cybertools.catalog package." 
+ + def testBasicStuff(self): + pass + + +def test_suite(): + flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS + return unittest.TestSuite(( + unittest.makeSuite(Test), + DocFileSuite('README.txt', optionflags=flags), + )) + +if __name__ == '__main__': + unittest.main(defaultTest='test_suite') diff --git a/relation/tests.py b/relation/tests.py index 1e3a610..ce86bcb 100755 --- a/relation/tests.py +++ b/relation/tests.py @@ -37,6 +37,9 @@ class IntIdsStub(object): id = self.getId(ob) self.objs[id] = None + def __iter__(self): + return iter(xrange(len(self.objs))) + class TestRelation(unittest.TestCase): "Basic tests for the relation package."