added catalog package with a simple variation of hurry.query and a keyword index

git-svn-id: svn://svn.cy55.de/Zope3/src/cybertools/trunk@2550 fd906abe-77d9-0310-91a1-e0d9ade77398
2008-04-26 16:40:04 +00:00 · 2008-04-26 16:40:04 +00:00 · 56a889540b
commit 56a889540b
parent 3acc628b80
6 changed files with 535 additions and 0 deletions
--- a/catalog/README.txt
+++ b/catalog/README.txt
@@ -0,0 +1,198 @@
+=========================
+Catalog, Indexes, Queries
+=========================
+
+  ($Id$)
+
+
+Set up Working Environment
+==========================
+
+We first have to set up an IntIds utility (we use a dummy implementation
+for testing purposes here) and a catalog with a few indexes.
+
+  >>> from zope import component
+  >>> from cybertools.relation.tests import IntIdsStub
+  >>> intid = IntIdsStub()
+  >>> component.provideUtility(intid)
+
+  >>> from zope.app.catalog.interfaces import ICatalog
+  >>> from zope.app.catalog.catalog import Catalog
+  >>> catalog = Catalog()
+  >>> component.provideUtility(catalog, ICatalog)
+
+  >>> from zope.interface import Interface, Attribute, implements
+  >>> class IContent(Interface):
+  ...     f1 = Attribute('f1')
+  ...     f2 = Attribute('f2')
+  ...     f3 = Attribute('f3')
+  ...     t1 = Attribute('t1')
+  ...     t2 = Attribute('t2')
+  ...     k1 = Attribute('k1')
+
+  >>> from zope.app.catalog.field import FieldIndex
+  >>> from zope.app.catalog.text import TextIndex
+  >>> from cybertools.catalog.keyword import KeywordIndex
+  >>> catalog['f1'] = FieldIndex('f1', IContent)
+  >>> catalog['f2'] = FieldIndex('f2', IContent)
+  >>> catalog['f3'] = FieldIndex('f3', IContent)
+  >>> catalog['t1'] = TextIndex('t1', IContent)
+  >>> catalog['t2'] = TextIndex('t2', IContent)
+  >>> catalog['k1'] = KeywordIndex('k1', IContent)
+
+In addition we need a class for the content objects that we want
+to index and query.
+
+  >>> from zope.app.container.contained import Contained
+  >>> class Content(Contained):
+  ...     implements(IContent)
+  ...     def __init__(self, id, f1='', f2='', f3='', t1='', t2='', k1=[]):
+  ...         self.id = id
+  ...         self.f1 = f1
+  ...         self.f2 = f2
+  ...         self.f3 = f3
+  ...         self.t1 = t1
+  ...         self.t2 = t2
+  ...         self.k1 = k1
+  ...     def __cmp__(self, other):
+  ...         return cmp(self.id, other.id)
+
+The id attribute simply makes it easy to identify the objects we find
+again. By including the __cmp__ method we make sure search results
+can be stably sorted.
+
+We are now ready to create a few content objects so that they can be
+cataloged.
+
+  >>> content = [
+  ... Content(1, 'a', 'b', 'd'),
+  ... Content(2, 'a', 'c'),
+  ... Content(3, 'X', 'c'),
+  ... Content(4, 'a', 'b', 'e'),
+  ... Content(5, 'X', 'b', 'e', k1=('zope', 'plone')),
+  ... Content(6, 'Y', 'Z', t1='some interesting text')]
+
+And catalog them now.
+
+  >>> for entry in content:
+  ...     catalog.index_doc(intid.register(entry), entry)
+
+Let's provide a simple function for displaying query results.
+
+  >>> def displayQuery(q):
+  ...     return [intid.getObject(uid).id for uid in q.apply()]
+
+
+Field Index Queries
+===================
+
+Now for a query where f1 equals 'a'.
+
+  >>> from cybertools.catalog.query import Eq
+  >>> f1 = ('', 'f1')
+  >>> displayQuery(Eq(f1, 'a'))
+  [1, 2, 4]
+
+Not equals (this is more efficient than the generic ~ operator).
+
+  >>> from cybertools.catalog.query import NotEq
+  >>> displayQuery(NotEq(f1, 'a'))
+  [3, 5, 6]
+
+Testing whether a field is in a set.
+
+  >>> from cybertools.catalog.query import In
+  >>> displayQuery(In(f1, ['a', 'X']))
+  [1, 2, 3, 4, 5]
+
+Testing whether field values are in a specified range.
+
+  >>> from cybertools.catalog.query import Between
+  >>> displayQuery(Between(f1, 'X', 'Y'))
+  [3, 5, 6]
+
+You can leave out one end of the range.
+
+  >>> displayQuery(Between(f1, 'X', None)) # 'X' < 'a'
+  [1, 2, 3, 4, 5, 6]
+  >>> displayQuery(Between(f1, None, 'X'))
+  [3, 5]
+
+You can also use greater-equals and less-equals for the same purpose.
+
+  >>> from cybertools.catalog.query import Ge, Le
+  >>> displayQuery(Ge(f1, 'X'))
+  [1, 2, 3, 4, 5, 6]
+  >>> displayQuery(Le(f1, 'X'))
+  [3, 5]
+
+It's also possible to negate a query with the ~ operator.
+
+  >>> displayQuery(~Eq(f1, 'a'))
+  [3, 5, 6]
+
+Using and (&).
+
+  >>> f2 = ('', 'f2')
+  >>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b'))
+  [1, 4]
+
+Using or (|).
+
+  >>> displayQuery(Eq(f1, 'a') | Eq(f2, 'b'))
+  [1, 2, 4, 5]
+
+These can be chained.
+
+  >>> displayQuery(Eq(f1, 'a') & Eq(f2, 'b') & Between(f1, 'a', 'b'))
+  [1, 4]
+  >>> displayQuery(Eq(f1, 'a') | Eq(f1, 'X') | Eq(f2, 'b'))
+  [1, 2, 3, 4, 5]
+
+And nested.
+
+  >>> displayQuery((Eq(f1, 'a') | Eq(f1, 'X')) & (Eq(f2, 'b') | Eq(f2, 'c')))
+  [1, 2, 3, 4, 5]
+
+"and" and "or" can also be spelled differently.
+
+  >>> from cybertools.catalog.query import And, Or
+  >>> displayQuery(And(Eq(f1, 'a'), Eq(f2, 'b')))
+  [1, 4]
+  >>> displayQuery(Or(Eq(f1, 'a'), Eq(f2, 'b')))
+  [1, 2, 4, 5]
+
+Combination of In and &
+-----------------------
+
+A combination of 'In' and '&'.
+
+  >>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z']))
+  [1, 2, 3, 4, 5, 6]
+  >>> displayQuery(In(f1, ['Z']))
+  []
+  >>> displayQuery(In(f1, ['a', 'X', 'Y', 'Z']) & In(f1, ['Z']))
+  []
+
+
+Text Index Queries
+==================
+
+  >>> from cybertools.catalog.query import Text
+  >>> t1 = ('', 't1')
+  >>> displayQuery(Text(t1, 'interesting'))
+  [6]
+
+
+Keyword Index Queries
+=====================
+
+  >>> from cybertools.catalog.query import AllOf, AnyOf
+  >>> k1 = ('', 'k1')
+  >>> displayQuery(AnyOf(k1, 'plone'))
+  [5]
+  >>> displayQuery(AllOf(k1, ['plone', 'zope', 'zms']))
+  []
+  >>> displayQuery(AllOf(k1, ['plone', 'zope']))
+  [5]
--- a/catalog/__init__.py
+++ b/catalog/__init__.py
@@ -0,0 +1,3 @@
+"""
+$Id$
+"""
--- a/catalog/keyword.py
+++ b/catalog/keyword.py
@@ -0,0 +1,43 @@
+#
+#  Copyright (c) 2008 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+"""Keyword catalog index.
+
+$Id$
+"""
+
+import zope.index.keyword
+import zope.interface
+
+import zope.app.container.contained
+import zope.app.catalog.attribute
+import zope.app.catalog.interfaces
+
+
+class IKeywordIndex(zope.app.catalog.interfaces.IAttributeIndex,
+                    zope.app.catalog.interfaces.ICatalogIndex):
+    """Interface-based catalog keyword index.
+    """
+
+
+class KeywordIndex(zope.app.catalog.attribute.AttributeIndex,
+                 zope.index.keyword.KeywordIndex,
+                 zope.app.container.contained.Contained):
+
+    zope.interface.implements(IKeywordIndex)
+
--- a/catalog/query.py
+++ b/catalog/query.py
@@ -0,0 +1,260 @@
+#
+#  Copyright (c) 2008 Helmut Merz helmutm@cy55.de
+#
+#  This program is free software; you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License as published by
+#  the Free Software Foundation; either version 2 of the License, or
+#  (at your option) any later version.
+#
+#  This program is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#  GNU General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with this program; if not, write to the Free Software
+#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+#
+
+"""
+Catalog query terms and their logical combinations.
+
+This is mainly a simplified version of Martijn Faassen's hurry.query
+(http://cheeseshop.python.org/pypi/hurry.query).
+
+$Id$
+"""
+
+from BTrees.IFBTree import weightedIntersection, weightedUnion
+from BTrees.IFBTree import difference, IFBTree, IFBucket
+from zope.app.intid.interfaces import IIntIds
+from zope.app.catalog.catalog import ResultSet
+from zope.app.catalog.field import IFieldIndex
+from zope.app.catalog.text import ITextIndex
+from zope.app.catalog.interfaces import ICatalog
+from zope import component
+
+from cybertools.catalog.keyword import IKeywordIndex
+
+
+class Term(object):
+
+    def __and__(self, other):
+        return And(self, other)
+
+    def __rand__(self, other):
+        return And(other, self)
+
+    def __or__(self, other):
+        return Or(self, other)
+
+    def __ror__(self, other):
+        return Or(other, self)
+
+    def __invert__(self):
+        return Not(self)
+
+
+class And(Term):
+
+    def __init__(self, *terms):
+        self.terms = terms
+
+    def apply(self):
+        results = []
+        for term in self.terms:
+            r = term.apply()
+            if not r:
+                # empty results
+                return r
+            results.append((len(r), r))
+        if not results:
+            # no applicable terms at all
+            return IFBucket()
+        results.sort()
+        _, result = results.pop(0)
+        for _, r in results:
+            w, result = weightedIntersection(result, r)
+        return result
+
+
+class Or(Term):
+
+    def __init__(self, *terms):
+        self.terms = terms
+
+    def apply(self):
+        results = []
+        for term in self.terms:
+            r = term.apply()
+            # empty results
+            if not r:
+                continue
+            results.append(r)
+        if not results:
+            # no applicable terms at all
+            return IFBucket()
+        result = results.pop(0)
+        for r in results:
+            w, result = weightedUnion(result, r)
+        return result
+
+
+class Not(Term):
+
+    def __init__(self, term):
+        self.term = term
+
+    def apply(self):
+        return difference(self._all(), self.term.apply())
+
+    def _all(self):
+        # XXX may not work well/be efficient with extentcatalog
+        # XXX not very efficient in general, better to use internal
+        # IntIds datastructure but that would break abstraction..
+        intids = component.getUtility(IIntIds)
+        result = IFBucket()
+        for uid in intids:
+            result[uid] = 0
+        return result
+
+
+class IndexTerm(Term):
+
+    def __init__(self, (catalog_name, index_name)):
+        self.catalog_name = catalog_name
+        self.index_name = index_name
+
+    def getIndex(self):
+        catalog = component.getUtility(ICatalog, self.catalog_name)
+        index = catalog[self.index_name]
+        return index
+
+
+# field index
+
+class FieldTerm(IndexTerm):
+
+    def getIndex(self):
+        index = super(FieldTerm, self).getIndex()
+        assert IFieldIndex.providedBy(index)
+        return index
+
+
+class Eq(FieldTerm):
+
+    def __init__(self, index_id, value):
+        assert value is not None
+        super(Eq, self).__init__(index_id)
+        self.value = value
+
+    def apply(self):
+        return self.getIndex().apply((self.value, self.value))
+
+
+class NotEq(FieldTerm):
+
+    def __init__(self, index_id, not_value):
+        super(NotEq, self).__init__(index_id)
+        self.not_value = not_value
+
+    def apply(self):
+        index = self.getIndex()
+        all = index.apply((None, None))
+        r = index.apply((self.not_value, self.not_value))
+        return difference(all, r)
+
+
+class Between(FieldTerm):
+
+    def __init__(self, index_id, min_value, max_value):
+        super(Between, self).__init__(index_id)
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def apply(self):
+        return self.getIndex().apply((self.min_value, self.max_value))
+
+
+class Ge(Between):
+
+    def __init__(self, index_id, min_value):
+        super(Ge, self).__init__(index_id, min_value, None)
+
+
+class Le(Between):
+
+    def __init__(self, index_id, max_value):
+        super(Le, self).__init__(index_id, None, max_value)
+
+
+class In(FieldTerm):
+
+    def __init__(self, index_id, values):
+        assert None not in values
+        super(In, self).__init__(index_id)
+        self.values = values
+
+    def apply(self):
+        results = []
+        index = self.getIndex()
+        for value in self.values:
+            r = index.apply((value, value))
+            # empty results
+            if not r:
+                continue
+            results.append(r)
+        if not results:
+            # no applicable terms at all
+            return IFBucket()
+        result = results.pop(0)
+        for r in results:
+            w, result = weightedUnion(result, r)
+        return result
+
+
+# text index
+
+class Text(IndexTerm):
+
+    def __init__(self, index_id, text):
+        super(Text, self).__init__(index_id)
+        self.text = text
+
+    def getIndex(self):
+        index = super(Text, self).getIndex()
+        assert ITextIndex.providedBy(index)
+        return index
+
+    def apply(self):
+        index = self.getIndex()
+        return index.apply(self.text)
+
+
+# keyword index
+
+class KeywordTerm(IndexTerm):
+
+    def __init__(self, index_id, values):
+        super(KeywordTerm, self).__init__(index_id)
+        self.values = values
+
+    def getIndex(self):
+        index = super(KeywordTerm, self).getIndex()
+        assert IKeywordIndex.providedBy(index)
+        return index
+
+
+class AnyOf(KeywordTerm):
+
+    def apply(self):
+        index = self.getIndex()
+        return index.search(self.values, 'or')
+
+
+class AllOf(KeywordTerm):
+
+    def apply(self):
+        index = self.getIndex()
+        return index.search(self.values, 'and')
+
--- a/catalog/tests.py
+++ b/catalog/tests.py
@@ -0,0 +1,28 @@
+#! /usr/bin/python
+
+"""
+Tests for the 'cybertools.catalog' package.
+
+$Id$
+"""
+
+import unittest, doctest
+from zope.testing.doctestunit import DocFileSuite
+
+
+class Test(unittest.TestCase):
+    "Basic tests for the cybertools.catalog package."
+
+    def testBasicStuff(self):
+        pass
+
+
+def test_suite():
+    flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
+    return unittest.TestSuite((
+        unittest.makeSuite(Test),
+        DocFileSuite('README.txt', optionflags=flags),
+        ))
+
+# When executed as a script, run the suite built by test_suite().
+if __name__ == '__main__':
+    unittest.main(defaultTest='test_suite')
--- a/relation/tests.py
+++ b/relation/tests.py
@@ -37,6 +37,9 @@ class IntIdsStub(object):
        id = self.getId(ob)
        self.objs[id] = None

+    def __iter__(self):
+        # Yields the integers 0 .. len(self.objs) - 1.
+        # NOTE(review): presumably these are exactly the ids the stub
+        # hands out (positions in self.objs) -- confirm against getId().
+        return iter(xrange(len(self.objs)))
+

 class TestRelation(unittest.TestCase):
    "Basic tests for the relation package."