[Checkins] SVN: zope.index/branches/nadako-sorting/ Add optimal sorting algorhythm selection for FieldIndex.
Dan Korostelev
nadako at gmail.com
Sun Dec 28 05:31:45 EST 2008
Log message for revision 94409:
Add optimal sorting algorhythm selection for FieldIndex.
Change IIndexSort's ``sort`` method declaration to skip docids that are not indexed instead of raising KeyError, as it's not always trivial to efficiently implement.
Changed:
U zope.index/branches/nadako-sorting/CHANGES.txt
U zope.index/branches/nadako-sorting/src/zope/index/field/README.txt
U zope.index/branches/nadako-sorting/src/zope/index/field/index.py
U zope.index/branches/nadako-sorting/src/zope/index/field/tests.py
U zope.index/branches/nadako-sorting/src/zope/index/interfaces.py
-=-
Modified: zope.index/branches/nadako-sorting/CHANGES.txt
===================================================================
--- zope.index/branches/nadako-sorting/CHANGES.txt 2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/CHANGES.txt 2008-12-28 10:31:45 UTC (rev 94409)
@@ -6,9 +6,10 @@
- Remove zope.testing from dependencies, as it's not really needed.
-- Define IIndexSort interface for indexes that support sorting and
- implement it in the FieldIndex.
+- Define IIndexSort interface for indexes that support sorting.
+- Implement sorting for FieldIndex (adapted from repoze.catalog/ZCatalog).
+
- Add an ``apply`` method for KeywordIndex/TopicIndex, making them
implement IIndexSearch that can be useful in catalog.
Modified: zope.index/branches/nadako-sorting/src/zope/index/field/README.txt
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/README.txt 2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/README.txt 2008-12-28 10:31:45 UTC (rev 94409)
@@ -125,27 +125,25 @@
>>> index.index_doc(8, 2)
>>> index.index_doc(9, 1)
- >>> list(index.sort([4, 2, 9, 7, 4, 1, 5]))
- [9, 7, 5, 4, 4, 2, 1]
+ >>> list(index.sort([4, 2, 9, 7, 3, 1, 5]))
+ [9, 7, 5, 4, 3, 2, 1]
We can also specify the ``reverse`` argument to reverse results:
- >>> list(index.sort([4, 2, 9, 7, 4, 1, 5], reverse=True))
- [1, 2, 4, 4, 5, 7, 9]
+ >>> list(index.sort([4, 2, 9, 7, 3, 1, 5], reverse=True))
+ [1, 2, 3, 4, 5, 7, 9]
And as per IIndexSort, we can limit results by specifying the ``limit``
argument:
- >>> list(index.sort([4, 2, 9, 7, 4, 1, 5], limit=3))
+ >>> list(index.sort([4, 2, 9, 7, 3, 1, 5], limit=3))
[9, 7, 5]
-If we pass an id that is not indexed by this index, the
-KeyError is raised as Interface declares:
+If we pass an id that is not indexed by this index, it won't be included
+in the result.
- >>> list(index.sort([10]))
- Traceback (most recent call last):
- ...
- KeyError: 'docid 10 is not indexed by <zope.index.field.index.FieldIndex object at 0x...>'
+ >>> list(index.sort([2, 10]))
+ [2]
>>> index.clear()
Modified: zope.index/branches/nadako-sorting/src/zope/index/field/index.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/index.py 2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/index.py 2008-12-28 10:31:45 UTC (rev 94409)
@@ -15,14 +15,15 @@
$Id$
"""
-import persistent
+import heapq
+import bisect
+from itertools import islice
import BTrees
-
+import persistent
+import zope.interface
from BTrees.Length import Length
-import zope.interface
-
from zope.index import interfaces
class FieldIndex(persistent.Persistent):
@@ -113,14 +114,97 @@
if (limit is not None) and (limit < 1):
raise ValueError('limit value must be 1 or greater')
+ numdocs = self._num_docs.value
+ if not numdocs:
+ raise StopIteration
+
+ if not isinstance(docids, (self.family.IF.Set, self.family.IF.TreeSet)):
+ docids = self.family.IF.Set(docids)
+ if not docids:
+ raise StopIteration
+
+ rlen = len(docids)
+
+ fwd_index = self._fwd_index
+ rev_index = self._rev_index
+ getValue = rev_index.get
marker = object()
- getValue = self._rev_index.get
+
+ # use_lazy and use_nbest computations lifted wholesale from
+ # Zope2 catalog without questioning reasoning
+ use_lazy = rlen > numdocs * (rlen / 100 + 1)
+ use_nbest = limit and limit * 4 < rlen
+
+ # overrides for unit tests
+ if getattr(self, '_use_lazy', False):
+ use_lazy = True
+ if getattr(self, '_use_nbest', False):
+ use_nbest = True
- n = 0
- for docid in sorted(docids, key=getValue, reverse=reverse):
- if getValue(docid, marker) is marker:
- raise KeyError('docid %d is not indexed by %s' % (docid, self))
- n += 1
- yield docid
- if limit and n >= limit:
- raise StopIteration()
+ if use_nbest:
+ # this is a sort with a limit that appears useful, try to
+ # take advantage of the fact that we can keep a smaller
+ # set of simultaneous values in memory; use generators
+ # and heapq functions to do so.
+
+ def nsort():
+ for docid in docids:
+ val = getValue(docid, marker)
+ if val is not marker:
+ yield (val, docid)
+
+ iterable = nsort()
+
+ if reverse:
+ # we use a generator as an iterable in the reverse
+ # sort case because the nlargest implementation does
+ # not manifest the whole thing into memory at once if
+ # we do so.
+ for val in heapq.nlargest(limit, iterable):
+ yield val[1]
+ else:
+ # lifted from heapq.nsmallest
+ it = iter(iterable)
+ result = sorted(islice(it, 0, limit))
+ if not result:
+ raise StopIteration
+ insort = bisect.insort
+ pop = result.pop
+ los = result[-1] # los --> Largest of the nsmallest
+ for elem in it:
+ if los <= elem:
+ continue
+ insort(result, elem)
+ pop()
+ los = result[-1]
+ for val in result:
+ yield val[1]
+
+ else:
+ if use_lazy and not reverse:
+ # Since this the sort is not reversed, and the number
+ # of results in the search result set is much larger
+ # than the number of items in this index, we assume it
+ # will be fastest to iterate over all of our forward
+ # BTree's items instead of using a full sort, as our
+ # forward index is already sorted in ascending order
+ # by value. The Zope 2 catalog implementation claims
+ # that this case is rarely exercised in practice.
+ n = 0
+ for stored_docids in fwd_index.values():
+ for docid in self.family.IF.intersection(docids, stored_docids):
+ n += 1
+ yield docid
+ if limit and n >= limit:
+ raise StopIteration
+ else:
+ # If the result set is not much larger than the number
+ # of documents in this index, or if we need to sort in
+ # reverse order, use a non-lazy sort.
+ n = 0
+ for docid in sorted(docids, key=getValue, reverse=reverse):
+ if getValue(docid, marker) is not marker:
+ n += 1
+ yield docid
+ if limit and n >= limit:
+ raise StopIteration
Modified: zope.index/branches/nadako-sorting/src/zope/index/field/tests.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/field/tests.py 2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/field/tests.py 2008-12-28 10:31:45 UTC (rev 94409)
@@ -16,10 +16,134 @@
$Id$
"""
import unittest
+from BTrees.IFBTree import IFSet
from zope.testing import doctest
+from zope.index.field import FieldIndex
+
+class TestFieldIndexSorting(unittest.TestCase):
+
+ def _populateIndex(self, index):
+ index.index_doc(5, 1) # docid, obj
+ index.index_doc(2, 2)
+ index.index_doc(1, 3)
+ index.index_doc(3, 4)
+ index.index_doc(4, 5)
+ index.index_doc(8, 6)
+ index.index_doc(9, 7)
+ index.index_doc(7, 8)
+ index.index_doc(6, 9)
+ index.index_doc(11, 10)
+ index.index_doc(10, 11)
+
+ def test_sort_lazy_nolimit(self):
+ index = FieldIndex()
+ index._use_lazy = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1)
+ self.assertEqual(list(result), [5, 2, 1, 3, 4])
+
+ def test_sort_lazy_withlimit(self):
+ index = FieldIndex()
+ index._use_lazy = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, limit=3)
+ self.assertEqual(list(result), [5, 2, 1])
+
+ def test_sort_nonlazy_nolimit(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1)
+ self.assertEqual(list(result), [5, 2, 1, 3, 4])
+
+ def test_sort_nonlazy_missingdocid(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5, 99])
+ result = index.sort(c1)
+ self.assertEqual(list(result), [5, 2, 1, 3, 4]) # 99 not present
+
+ def test_sort_nonlazy_withlimit(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, limit=3)
+ self.assertEqual(list(result), [5, 2, 1])
+
+ def test_sort_nonlazy_reverse_nolimit(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, reverse=True)
+ self.assertEqual(list(result), [4, 3, 1, 2, 5])
+
+ def test_sort_nonlazy_reverse_withlimit(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, reverse=True, limit=3)
+ self.assertEqual(list(result), [4, 3, 1])
+
+ def test_sort_nbest(self):
+ index = FieldIndex()
+ index._use_nbest = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, limit=3)
+ self.assertEqual(list(result), [5, 2, 1])
+
+ def test_sort_nbest_reverse(self):
+ index = FieldIndex()
+ index._use_nbest = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, reverse=True, limit=3)
+ self.assertEqual(list(result), [4, 3, 1])
+
+ def test_sort_nbest_missing(self):
+ index = FieldIndex()
+ index._use_nbest = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5, 99])
+ result = index.sort(c1, limit=3)
+ self.assertEqual(list(result), [5, 2, 1])
+
+ def test_sort_nbest_missing_reverse(self):
+ index = FieldIndex()
+ index._use_nbest = True
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5, 99])
+ result = index.sort(c1, reverse=True, limit=3)
+ self.assertEqual(list(result), [4, 3, 1])
+
+ def test_sort_nodocs(self):
+ index = FieldIndex()
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1)
+ self.assertEqual(list(result), [])
+
+ def test_sort_nodocids(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet()
+ result = index.sort(c1)
+ self.assertEqual(list(result), [])
+
+ def test_sort_badlimit(self):
+ index = FieldIndex()
+ self._populateIndex(index)
+ c1 = IFSet([1, 2, 3, 4, 5])
+ result = index.sort(c1, limit=0)
+ self.assertRaises(ValueError, list, result)
+
def test_suite():
- return doctest.DocFileSuite('README.txt', optionflags=doctest.ELLIPSIS)
+ return unittest.TestSuite((
+ doctest.DocFileSuite('README.txt', optionflags=doctest.ELLIPSIS),
+ unittest.makeSuite(TestFieldIndexSorting),
+ ))
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
Modified: zope.index/branches/nadako-sorting/src/zope/index/interfaces.py
===================================================================
--- zope.index/branches/nadako-sorting/src/zope/index/interfaces.py 2008-12-28 08:26:26 UTC (rev 94408)
+++ zope.index/branches/nadako-sorting/src/zope/index/interfaces.py 2008-12-28 10:31:45 UTC (rev 94409)
@@ -87,8 +87,8 @@
def sort(docids, limit=None, reverse=False):
"""Sort document ids sequence using indexed values
- If some of docids are not in index, KeyError is
- raised.
+ If some of docids are not indexed they are skipped
+ from resulting iterable.
Return an iterable of document ids. Limited by
value of the "limit" argument and optionally
More information about the Checkins
mailing list