[Checkins] SVN: zope.index/branches/nadako-sorting/ Optimize KeywordIndex searching. Make KeywordIndex implement IIndexSearch. Use IFSet rather than IISet to make it possible to merge KeywordIndex results with other indexes.

Dan Korostelev nadako at gmail.com
Sun Dec 28 01:40:45 EST 2008


Log message for revision 94402:
  Optimize KeywordIndex searching. Make KeywordIndex implement IIndexSearch. Use IFSet rather than IISet to make it possible to merge KeywordIndex results with other indexes.

Changed:
  U   zope.index/branches/nadako-sorting/CHANGES.txt
  U   zope.index/branches/nadako-sorting/src/zope/index/keyword/index.py
  U   zope.index/branches/nadako-sorting/src/zope/index/keyword/tests.py

-=-
Modified: zope.index/branches/nadako-sorting/CHANGES.txt
===================================================================
--- zope.index/branches/nadako-sorting/CHANGES.txt	2008-12-28 06:01:54 UTC (rev 94401)
+++ zope.index/branches/nadako-sorting/CHANGES.txt	2008-12-28 06:40:44 UTC (rev 94402)
@@ -5,9 +5,21 @@
 ------------------
 
 - Remove zope.testing from dependencies, as it's not really needed.
+
 - Define IIndexSort interface for indexes that support sorting and
   implement it in the FieldIndex.
 
+- Add an ``apply`` method for KeywordIndex, making it implementing
+  IIndexSearch that can be useful in catalog.
+
+- Make KeywordIndex use IFSets instead of IISets. This makes it more
+  compatible with other indexes. 
+
+- Optimize the ``search`` method of KeywordIndex by using multiunion
+  for the ``or`` operator and sorting before intersection for ``and``.
+
+Thanks Chris McDonough of repoze for supporting and useful code.
+
 3.4.1 (2007-09-28)
 ------------------
 

Modified: zope.index/branches/nadako-sorting/src/zope/index/keyword/index.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/keyword/index.py	2008-12-28 06:01:54 UTC (rev 94401)
+++ zope.index/branches/nadako-sorting/src/zope/index/keyword/index.py	2008-12-28 06:40:44 UTC (rev 94402)
@@ -21,7 +21,7 @@
 
 from BTrees.Length import Length
 
-from zope.index.interfaces import IInjection, IStatistics
+from zope.index.interfaces import IInjection, IStatistics, IIndexSearch
 from zope.index.keyword.interfaces import IKeywordQuerying
 from zope.interface import implements
 
@@ -31,7 +31,7 @@
 
     family = BTrees.family32
     normalize = True
-    implements(IInjection, IStatistics, IKeywordQuerying)
+    implements(IInjection, IStatistics, IIndexSearch, IKeywordQuerying)
 
     def __init__(self, family=None):
         if family is not None:
@@ -117,7 +117,7 @@
         has_key = idx.has_key
         for word in words:
             if not has_key(word):
-                idx[word] = self.family.II.Set()
+                idx[word] = self.family.IF.Set()
             idx[word].insert(docid)
 
     def _insert_reverse(self, docid, words):
@@ -134,20 +134,36 @@
         if self.normalize:
             query = [w.lower() for w in query]
 
-        f = {'and' : self.family.II.intersection,
-             'or' : self.family.II.union,
-             }[operator]
-    
-        rs = None
+        sets = []
         for word in query:
-            docids = self._fwd_index.get(word, self.family.II.Set())
-            rs = f(rs, docids)
-            
+            docids = self._fwd_index.get(word, self.family.IF.Set())
+            sets.append(docids)
+
+        if operator == 'or':
+            rs = self.family.IF.multiunion(sets)
+        elif operator == 'and':
+            sets.sort(key=len)
+            rs = None
+            for set in sets:
+                rs = self.family.IF.intersection(rs, set)
+                if not rs:
+                    break
+        else:
+            raise TypeError('Keyword index only supports `and` and `or` operators, not `%s`.' % operator)
+        
         if rs:
             return rs
         else:
-            return self.family.II.Set()
+            return self.family.IF.Set()
 
+    def apply(self, query):
+        operator = 'and'
+        if isinstance(query, dict):
+            if 'operator' in query:
+                operator = query.pop('operator')
+            query = query['query']
+        return self.search(query, operator=operator)
+
 class CaseSensitiveKeywordIndex(KeywordIndex):
     """ A case-sensitive keyword index """
     normalize = False        

Modified: zope.index/branches/nadako-sorting/src/zope/index/keyword/tests.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/keyword/tests.py	2008-12-28 06:01:54 UTC (rev 94401)
+++ zope.index/branches/nadako-sorting/src/zope/index/keyword/tests.py	2008-12-28 06:40:44 UTC (rev 94402)
@@ -23,7 +23,7 @@
 
 class KeywordIndexTest(TestCase):
 
-    from BTrees.IIBTree import IISet
+    from BTrees.IFBTree import IFSet
 
     def setUp(self):
         self.index = KeywordIndex()
@@ -41,7 +41,7 @@
 
         results = self.index.search(query, mode)
 
-        # results and expected are IISets() but we can not
+        # results and expected are IFSets() but we can not
         # compare them directly since __eq__() does not seem
         # to be implemented for BTrees
 
@@ -53,6 +53,18 @@
     def _search_or(self, query, expected):
         return self._search(query, expected, 'or')
 
+    def _apply(self, query, expected, mode='and'):
+        results = self.index.apply(query)
+        self.assertEqual(results.keys(), expected.keys())
+
+    def _apply_and(self, query, expected):
+        results = self.index.apply({'operator': 'and', 'query': query})
+        self.assertEqual(results.keys(), expected.keys())
+
+    def _apply_or(self, query, expected):
+        results = self.index.apply({'operator': 'or', 'query': query})
+        self.assertEqual(results.keys(), expected.keys())
+
     def test_interface(self):
         verifyClass(IInjection, KeywordIndex)
         verifyClass(IStatistics, KeywordIndex)
@@ -85,13 +97,13 @@
         self.index.index_doc(1,  ('foo', 'bar', 'doom'))
         self.index.index_doc(1,  ('bar', 'blabla'))
         self.assertEqual(self.index.documentCount(), 3)
-        self._search('quick',   self.IISet())
-        self._search('foo',   self.IISet())
-        self._search('bar',   self.IISet([1]))
-        self._search(['doom'],   self.IISet())
-        self._search(['blabla'],   self.IISet([1]))
-        self._search_and(('bar', 'blabla'),   self.IISet([1]))
-        self._search(['cmf'],   self.IISet([5]))
+        self._search('quick',   self.IFSet())
+        self._search('foo',   self.IFSet())
+        self._search('bar',   self.IFSet([1]))
+        self._search(['doom'],   self.IFSet())
+        self._search(['blabla'],   self.IFSet([1]))
+        self._search_and(('bar', 'blabla'),   self.IFSet([1]))
+        self._search(['cmf'],   self.IFSet([5]))
 
     def test_hasdoc(self):
         self._populate_index()
@@ -104,26 +116,47 @@
 
     def test_simplesearch(self):
         self._populate_index()
-        self._search([''],      self.IISet())
-        self._search(['cmf'],   self.IISet([1, 5]))
-        self._search(['zope'],  self.IISet([1, 3]))
-        self._search(['zope3'], self.IISet([1]))
-        self._search(['foo'],   self.IISet())
+        self._search([''],      self.IFSet())
+        self._search(['cmf'],   self.IFSet([1, 5]))
+        self._search(['zope'],  self.IFSet([1, 3]))
+        self._search(['zope3'], self.IFSet([1]))
+        self._search(['foo'],   self.IFSet())
 
     def test_search_and(self):
         self._populate_index()
-        self._search_and(('cmf', 'zope3'), self.IISet([1]))
-        self._search_and(('cmf', 'zope'),  self.IISet([1]))
-        self._search_and(('cmf', 'zope4'), self.IISet())
-        self._search_and(('zope', 'ZOPE'), self.IISet([1, 3]))
+        self._search_and(('cmf', 'zope3'), self.IFSet([1]))
+        self._search_and(('cmf', 'zope'),  self.IFSet([1]))
+        self._search_and(('cmf', 'zope4'), self.IFSet())
+        self._search_and(('zope', 'ZOPE'), self.IFSet([1, 3]))
 
     def test_search_or(self):
         self._populate_index()
-        self._search_or(('cmf', 'zope3'), self.IISet([1, 5]))
-        self._search_or(('cmf', 'zope'),  self.IISet([1, 3, 5]))
-        self._search_or(('cmf', 'zope4'), self.IISet([1, 5]))
-        self._search_or(('zope', 'ZOPE'), self.IISet([1,3]))
+        self._search_or(('cmf', 'zope3'), self.IFSet([1, 5]))
+        self._search_or(('cmf', 'zope'),  self.IFSet([1, 3, 5]))
+        self._search_or(('cmf', 'zope4'), self.IFSet([1, 5]))
+        self._search_or(('zope', 'ZOPE'), self.IFSet([1,3]))
 
+    def test_apply(self):
+        self._populate_index()
+        self._apply(('cmf', 'zope3'), self.IFSet([1]))
+        self._apply(('cmf', 'zope'),  self.IFSet([1]))
+        self._apply(('cmf', 'zope4'), self.IFSet())
+        self._apply(('zope', 'ZOPE'), self.IFSet([1, 3]))
+
+    def test_apply_and(self):
+        self._populate_index()
+        self._apply_and(('cmf', 'zope3'), self.IFSet([1]))
+        self._apply_and(('cmf', 'zope'),  self.IFSet([1]))
+        self._apply_and(('cmf', 'zope4'), self.IFSet())
+        self._apply_and(('zope', 'ZOPE'), self.IFSet([1, 3]))
+
+    def test_apply_or(self):
+        self._populate_index()
+        self._apply_or(('cmf', 'zope3'), self.IFSet([1, 5]))
+        self._apply_or(('cmf', 'zope'),  self.IFSet([1, 3, 5]))
+        self._apply_or(('cmf', 'zope4'), self.IFSet([1, 5]))
+        self._apply_or(('zope', 'ZOPE'), self.IFSet([1,3]))
+
     def test_index_input(self):
         self.assertRaises(
             TypeError, self.index.index_doc, 1, "non-sequence-string")
@@ -131,7 +164,7 @@
 
 class KeywordIndexTest64(KeywordIndexTest):
 
-    from BTrees.LLBTree import LLSet as IISet
+    from BTrees.LFBTree import LFSet as IFSet
 
     def setUp(self):
         self.index = KeywordIndex(family=BTrees.family64)



More information about the Checkins mailing list