[Checkins] SVN: zope.index/branches/nadako-sorting/ Add optimal sorting algorhythm selection for FieldIndex.

Sun Dec 28 05:31:45 EST 2008

Log message for revision 94409:
  Add optimal sorting algorhythm selection for FieldIndex.
  Change IIndexSort's ``sort`` method declaration to skip docids that are not indexed instead of raising KeyError, as it's not always trivial to efficiently implement.

Changed:
  U   zope.index/branches/nadako-sorting/CHANGES.txt
  U   zope.index/branches/nadako-sorting/src/zope/index/field/README.txt
  U   zope.index/branches/nadako-sorting/src/zope/index/field/index.py
  U   zope.index/branches/nadako-sorting/src/zope/index/field/tests.py
  U   zope.index/branches/nadako-sorting/src/zope/index/interfaces.py

-=-
Modified: zope.index/branches/nadako-sorting/CHANGES.txt
===================================================================

--- zope.index/branches/nadako-sorting/CHANGES.txt	2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/CHANGES.txt	2008-12-28 10:31:45 UTC (rev 94409)
@@ -6,9 +6,10 @@
 
 - Remove zope.testing from dependencies, as it's not really needed.
 
-- Define IIndexSort interface for indexes that support sorting and
-  implement it in the FieldIndex.
+- Define IIndexSort interface for indexes that support sorting.
 
+- Implement sorting for FieldIndex (adapted from repoze.catalog/ZCatalog).
+
 - Add an ``apply`` method for KeywordIndex/TopicIndex, making them
   implement IIndexSearch that can be useful in catalog.
 

Modified: zope.index/branches/nadako-sorting/src/zope/index/field/README.txt
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/README.txt	2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/README.txt	2008-12-28 10:31:45 UTC (rev 94409)
@@ -125,27 +125,25 @@
     >>> index.index_doc(8, 2)
     >>> index.index_doc(9, 1)
 
-    >>> list(index.sort([4, 2, 9, 7, 4, 1, 5]))
-    [9, 7, 5, 4, 4, 2, 1]
+    >>> list(index.sort([4, 2, 9, 7, 3, 1, 5]))
+    [9, 7, 5, 4, 3, 2, 1]
 
 We can also specify the ``reverse`` argument to reverse results:
 
-    >>> list(index.sort([4, 2, 9, 7, 4, 1, 5], reverse=True))
-    [1, 2, 4, 4, 5, 7, 9]
+    >>> list(index.sort([4, 2, 9, 7, 3, 1, 5], reverse=True))
+    [1, 2, 3, 4, 5, 7, 9]
 
 And as per IIndexSort, we can limit results by specifying the ``limit``
 argument:
 
-    >>> list(index.sort([4, 2, 9, 7, 4, 1, 5], limit=3)) 
+    >>> list(index.sort([4, 2, 9, 7, 3, 1, 5], limit=3)) 
     [9, 7, 5]
 
-If we pass an id that is not indexed by this index, the
-KeyError is raised as Interface declares:
+If we pass an id that is not indexed by this index, it won't be included
+in the result.
 
-    >>> list(index.sort([10]))
-    Traceback (most recent call last):
-    ...
-    KeyError: 'docid 10 is not indexed by <zope.index.field.index.FieldIndex object at 0x...>'
+    >>> list(index.sort([2, 10]))
+    [2]
 
     >>> index.clear()
 

Modified: zope.index/branches/nadako-sorting/src/zope/index/field/index.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/index.py	2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/index.py	2008-12-28 10:31:45 UTC (rev 94409)
@@ -15,14 +15,15 @@
 
 $Id$
 """
-import persistent
+import heapq
+import bisect
+from itertools import islice
 
 import BTrees
-
+import persistent
+import zope.interface
 from BTrees.Length import Length
 
-import zope.interface
-
 from zope.index import interfaces
 
 class FieldIndex(persistent.Persistent):
@@ -113,14 +114,97 @@
         if (limit is not None) and (limit < 1):
             raise ValueError('limit value must be 1 or greater')
 
+        numdocs = self._num_docs.value
+        if not numdocs:
+            raise StopIteration
+
+        if not isinstance(docids, (self.family.IF.Set, self.family.IF.TreeSet)):
+            docids = self.family.IF.Set(docids)
+        if not docids:
+            raise StopIteration
+
+        rlen = len(docids)
+
+        fwd_index = self._fwd_index
+        rev_index = self._rev_index
+        getValue = rev_index.get
         marker = object()
-        getValue = self._rev_index.get
+
+        # use_lazy and use_nbest computations lifted wholesale from
+        # Zope2 catalog without questioning reasoning
+        use_lazy = rlen > numdocs * (rlen / 100 + 1)
+        use_nbest = limit and limit * 4 < rlen
+
+        # overrides for unit tests
+        if getattr(self, '_use_lazy', False):
+            use_lazy = True
+        if getattr(self, '_use_nbest', False):
+            use_nbest = True
         
-        n = 0
-        for docid in sorted(docids, key=getValue, reverse=reverse):
-            if getValue(docid, marker) is marker:
-                raise KeyError('docid %d is not indexed by %s' % (docid, self))
-            n += 1
-            yield docid
-            if limit and n >= limit:
-                raise StopIteration()
+        if use_nbest:
+            # this is a sort with a limit that appears useful, try to
+            # take advantage of the fact that we can keep a smaller
+            # set of simultaneous values in memory; use generators
+            # and heapq functions to do so.
+
+            def nsort():
+                for docid in docids:
+                    val = getValue(docid, marker)
+                    if val is not marker:
+                        yield (val, docid)
+
+            iterable = nsort()
+
+            if reverse:
+                # we use a generator as an iterable in the reverse
+                # sort case because the nlargest implementation does
+                # not manifest the whole thing into memory at once if
+                # we do so.
+                for val in heapq.nlargest(limit, iterable):
+                    yield val[1]
+            else:
+                # lifted from heapq.nsmallest
+                it = iter(iterable)
+                result = sorted(islice(it, 0, limit))
+                if not result:
+                    raise StopIteration
+                insort = bisect.insort
+                pop = result.pop
+                los = result[-1]    # los --> Largest of the nsmallest
+                for elem in it:
+                    if los <= elem:
+                        continue
+                    insort(result, elem)
+                    pop()
+                    los = result[-1]
+                for val in result:
+                    yield val[1]
+
+        else:
+            if use_lazy and not reverse:
+                # Since this the sort is not reversed, and the number
+                # of results in the search result set is much larger
+                # than the number of items in this index, we assume it
+                # will be fastest to iterate over all of our forward
+                # BTree's items instead of using a full sort, as our
+                # forward index is already sorted in ascending order
+                # by value. The Zope 2 catalog implementation claims
+                # that this case is rarely exercised in practice.
+                n = 0
+                for stored_docids in fwd_index.values():
+                    for docid in self.family.IF.intersection(docids, stored_docids):
+                        n += 1
+                        yield docid
+                        if limit and n >= limit:
+                            raise StopIteration
+            else:
+                # If the result set is not much larger than the number
+                # of documents in this index, or if we need to sort in
+                # reverse order, use a non-lazy sort.
+                n = 0
+                for docid in sorted(docids, key=getValue, reverse=reverse):
+                    if getValue(docid, marker) is not marker:
+                        n += 1
+                        yield docid
+                        if limit and n >= limit:
+                            raise StopIteration

Modified: zope.index/branches/nadako-sorting/src/zope/index/field/tests.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/tests.py	2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/tests.py	2008-12-28 10:31:45 UTC (rev 94409)
@@ -16,10 +16,134 @@
 $Id$
 """
 import unittest
+from BTrees.IFBTree import IFSet
 from zope.testing import doctest
 
+from zope.index.field import FieldIndex
+
+class TestFieldIndexSorting(unittest.TestCase):
+
+    def _populateIndex(self, index):
+        index.index_doc(5, 1) # docid, obj
+        index.index_doc(2, 2)
+        index.index_doc(1, 3)
+        index.index_doc(3, 4) 
+        index.index_doc(4, 5)
+        index.index_doc(8, 6)
+        index.index_doc(9, 7)
+        index.index_doc(7, 8)
+        index.index_doc(6, 9)
+        index.index_doc(11, 10)
+        index.index_doc(10, 11)
+
+    def test_sort_lazy_nolimit(self):
+        index = FieldIndex()
+        index._use_lazy = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1)
+        self.assertEqual(list(result), [5, 2, 1, 3, 4])
+
+    def test_sort_lazy_withlimit(self):
+        index = FieldIndex()
+        index._use_lazy = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, limit=3)
+        self.assertEqual(list(result), [5, 2, 1])
+
+    def test_sort_nonlazy_nolimit(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1)
+        self.assertEqual(list(result), [5, 2, 1, 3, 4])
+
+    def test_sort_nonlazy_missingdocid(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5, 99])
+        result = index.sort(c1)
+        self.assertEqual(list(result), [5, 2, 1, 3, 4]) # 99 not present
+
+    def test_sort_nonlazy_withlimit(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, limit=3)
+        self.assertEqual(list(result), [5, 2, 1])
+
+    def test_sort_nonlazy_reverse_nolimit(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, reverse=True)
+        self.assertEqual(list(result), [4, 3, 1, 2, 5])
+
+    def test_sort_nonlazy_reverse_withlimit(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, reverse=True, limit=3)
+        self.assertEqual(list(result), [4, 3, 1])
+
+    def test_sort_nbest(self):
+        index = FieldIndex()
+        index._use_nbest = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, limit=3)
+        self.assertEqual(list(result), [5, 2, 1])
+
+    def test_sort_nbest_reverse(self):
+        index = FieldIndex()
+        index._use_nbest = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, reverse=True, limit=3)
+        self.assertEqual(list(result), [4, 3, 1])
+
+    def test_sort_nbest_missing(self):
+        index = FieldIndex()
+        index._use_nbest = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5, 99])
+        result = index.sort(c1, limit=3)
+        self.assertEqual(list(result), [5, 2, 1])
+
+    def test_sort_nbest_missing_reverse(self):
+        index = FieldIndex()
+        index._use_nbest = True
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5, 99])
+        result = index.sort(c1, reverse=True, limit=3)
+        self.assertEqual(list(result), [4, 3, 1])
+
+    def test_sort_nodocs(self):
+        index = FieldIndex()
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1)
+        self.assertEqual(list(result), [])
+
+    def test_sort_nodocids(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet()
+        result = index.sort(c1)
+        self.assertEqual(list(result), [])
+
+    def test_sort_badlimit(self):
+        index = FieldIndex()
+        self._populateIndex(index)
+        c1 = IFSet([1, 2, 3, 4, 5])
+        result = index.sort(c1, limit=0)
+        self.assertRaises(ValueError, list, result)
+
 def test_suite():
-    return doctest.DocFileSuite('README.txt', optionflags=doctest.ELLIPSIS)
+    return unittest.TestSuite((
+        doctest.DocFileSuite('README.txt', optionflags=doctest.ELLIPSIS),
+        unittest.makeSuite(TestFieldIndexSorting),
+        ))
 
 if __name__=='__main__':
     unittest.main(defaultTest='test_suite')

Modified: zope.index/branches/nadako-sorting/src/zope/index/interfaces.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/interfaces.py	2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/interfaces.py	2008-12-28 10:31:45 UTC (rev 94409)
@@ -87,8 +87,8 @@
     def sort(docids, limit=None, reverse=False):
         """Sort document ids sequence using indexed values
         
-        If some of docids are not in index, KeyError is 
-        raised.
+        If some of docids are not indexed they are skipped
+        from resulting iterable.
         
         Return an iterable of document ids. Limited by
         value of the "limit" argument and optionally