[Checkins] SVN: Products.ZCatalog/trunk/ Extend BooleanIndex by making the indexed value variable instead of hardcoding it to `True`. The indexed value will determine the smaller set automatically and choose its best value. An inline switch is done once the indexed value set grows larger than 60% of the total length. 60% was chosen to avoid constant switching for indexes that have an almost equal distribution of `True/False`.

Hanno Schlichting hannosch at hannosch.eu
Tue May 17 08:41:54 EDT 2011


Log message for revision 121686:
  Extend BooleanIndex by making the indexed value variable instead of hardcoding it to `True`. The indexed value will determine the smaller set automatically and choose its best value. An inline switch is done once the indexed value set grows larger than 60% of the total length. 60% was chosen to avoid constant switching for indexes that have an almost equal distribution of `True/False`.
  

Changed:
  U   Products.ZCatalog/trunk/CHANGES.txt
  U   Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/BooleanIndex.py
  U   Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/tests.py

-=-
Modified: Products.ZCatalog/trunk/CHANGES.txt
===================================================================
--- Products.ZCatalog/trunk/CHANGES.txt	2011-05-17 12:33:54 UTC (rev 121685)
+++ Products.ZCatalog/trunk/CHANGES.txt	2011-05-17 12:41:54 UTC (rev 121686)
@@ -4,6 +4,13 @@
 2.13.14 (unreleased)
 --------------------
 
+- Extend BooleanIndex by making the indexed value variable instead of
+  hardcoding it to `True`. The indexed value will determine the smaller set
+  automatically and choose its best value. An inline switch is done once the
+  indexed value set grows larger than 60% of the total length. 60% was chosen
+  to avoid constant switching for indexes that have an almost equal
+  distribution of `True/False`.
+
 - Substitute catalog entry in UUIDIndex error message.
 
 2.13.13 (2011-05-04)

Modified: Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/BooleanIndex.py
===================================================================
--- Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/BooleanIndex.py	2011-05-17 12:33:54 UTC (rev 121685)
+++ Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/BooleanIndex.py	2011-05-17 12:41:54 UTC (rev 121686)
@@ -32,6 +32,9 @@
        self._index = set([documentId1, documentId2])
        self._unindex = {documentId:[True/False]}
 
+       self._length is the length of the unindex
+       self._index_length is the length of the index
+
        False doesn't have actual entries in _index.
     """
 
@@ -50,33 +53,93 @@
     manage_main._setName('manage_main')
     manage_browse = DTMLFile('../dtml/browseIndex', globals())
 
+    _index_value = 1
+    _index_length = None
+
     def clear(self):
-        self._length = BTrees.Length.Length()
         self._index = IITreeSet()
+        self._index_length = BTrees.Length.Length()
+        self._index_value = 1
         self._unindex = IIBTree()
+        self._length = BTrees.Length.Length()
 
+    def histogram(self):
+        """Return a mapping which provides a histogram of the number of
+        elements found at each point in the index.
+        """
+        histogram = {}
+        indexed = bool(self._index_value)
+        histogram[indexed] = self._index_length.value
+        histogram[not indexed] = self._length.value - self._index_length.value
+        return histogram
+
+    def _invert_index(self, documentId):
+        self._index_value = indexed = int(not self._index_value)
+        self._index.clear()
+        length = 0
+        for rid, value in self._unindex.iteritems():
+            # documentId is the rid of the currently processed object that
+            # triggered the invert. in the case of unindexing, the rid hasn't
+            # been removed from the unindex yet. While indexing, the rid will
+            # be added to the index and unindex after this method is done
+            if value == indexed and rid != documentId:
+                self._index.add(rid)
+                length += 1
+        self._index_length = BTrees.Length.Length(length)
+
     def insertForwardIndexEntry(self, entry, documentId):
-        """If True, insert directly into treeset
+        """If the value matches the indexed one, insert into treeset
         """
-        if entry:
+        # when we get the first entry, decide to index the opposite of what
+        # we got, as indexing zero items is fewer than one
+        length = self._length.value
+        index_length = self._index_length.value
+        if length == 0:
+            self._index_value = int(not bool(entry))
+
+        if bool(entry) is bool(self._index_value):
+            # is the index (after adding the current entry) larger than 60%
+            # of the total length? than switch the indexed value
+            if (index_length + 1) >= ((length + 1) * 0.6):
+                self._invert_index(documentId)
+                return
+
             self._index.insert(documentId)
-            self._length.change(1)
+            # BBB inline migration
+            length = self._index_length
+            if length is None:
+                self._index_length = BTrees.Length.Length(len(self._index))
+            else:
+                length.change(1)
 
-    def removeForwardIndexEntry(self, entry, documentId):
+    def removeForwardIndexEntry(self, entry, documentId, check=True):
         """Take the entry provided and remove any reference to documentId
         in its entry in the index.
         """
-        try:
-            if entry:
+        if bool(entry) is bool(self._index_value):
+            try:
                 self._index.remove(documentId)
-                self._length.change(-1)
-        except ConflictError:
-            raise
-        except Exception:
-            LOG.exception('%s: unindex_object could not remove '
-                          'documentId %s from index %s. This '
-                          'should not happen.' % (self.__class__.__name__,
-                          str(documentId), str(self.id)))
+                # BBB inline migration
+                length = self._index_length
+                if length is None:
+                    self._index_length = BTrees.Length.Length(len(self._index))
+                else:
+                    length.change(-1)
+            except ConflictError:
+                raise
+            except Exception:
+                LOG.exception('%s: unindex_object could not remove '
+                              'documentId %s from index %s. This '
+                              'should not happen.' % (self.__class__.__name__,
+                              str(documentId), str(self.id)))
+        elif check:
+            length = self._length.value
+            index_length = self._index_length.value
+            # is the index (after removing the current entry) larger than
+            # 60% of the total length? than switch the indexed value
+            if (index_length) <= ((length - 1) * 0.6):
+                self._invert_index(documentId)
+                return
 
     def _index_object(self, documentId, obj, threshold=None, attr=''):
         """ index and object 'obj' with integer id 'documentId'"""
@@ -94,10 +157,11 @@
         oldDatum = self._unindex.get(documentId, _marker)
         if datum != oldDatum:
             if oldDatum is not _marker:
-                self.removeForwardIndexEntry(oldDatum, documentId)
+                self.removeForwardIndexEntry(oldDatum, documentId, check=False)
                 if datum is _marker:
                     try:
                         del self._unindex[documentId]
+                        self._length.change(-1)
                     except ConflictError:
                         raise
                     except Exception:
@@ -106,24 +170,44 @@
                                   documentId)
 
             if datum is not _marker:
-                if datum:
-                    self.insertForwardIndexEntry(datum, documentId)
+                self.insertForwardIndexEntry(datum, documentId)
                 self._unindex[documentId] = datum
+                self._length.change(1)
 
             returnStatus = 1
 
         return returnStatus
 
+    def unindex_object(self, documentId):
+        """ Unindex the object with integer id 'documentId' and don't
+        raise an exception if we fail
+        """
+        unindexRecord = self._unindex.get(documentId, _marker)
+        if unindexRecord is _marker:
+            return None
+
+        self.removeForwardIndexEntry(unindexRecord, documentId)
+
+        try:
+            del self._unindex[documentId]
+            self._length.change(-1)
+        except ConflictError:
+            raise
+        except:
+            LOG.debug('Attempt to unindex nonexistent document'
+                      ' with id %s' % documentId,exc_info=True)
+
     def _apply_index(self, request, resultset=None):
         record = parseIndexRequest(request, self.id, self.query_options)
         if record.keys is None:
             return None
 
         index = self._index
+        indexed = self._index_value
 
         for key in record.keys:
-            if key:
-                # If True, check index
+            if bool(key) is bool(indexed):
+                # If we match the indexed value, check index
                 return (intersection(index, resultset), (self.id, ))
             else:
                 # Otherwise, remove from resultset or _unindex
@@ -139,11 +223,14 @@
         return 2
 
     def items(self):
-        items = []
-        for v, k in self._unindex.items():
-            if isinstance(v, int):
-                v = IISet((v, ))
-            items.append((k, v))
+        # return a list of value to int set of rid tuples
+        indexed = self._index_value
+        items = [(bool(indexed), self._index.keys())]
+        false = IISet()
+        for rid, value in self._unindex.iteritems():
+            if value != indexed:
+                false.add(rid)
+        items.append((not bool(indexed), false))
         return items
 
 manage_addBooleanIndexForm = DTMLFile('dtml/addBooleanIndex', globals())

Modified: Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/tests.py
===================================================================
--- Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/tests.py	2011-05-17 12:33:54 UTC (rev 121685)
+++ Products.ZCatalog/trunk/src/Products/PluginIndexes/BooleanIndex/tests.py	2011-05-17 12:41:54 UTC (rev 121686)
@@ -36,15 +36,15 @@
         index = self._makeOne()
         obj = Dummy(1, True)
         index._index_object(obj.id, obj, attr='truth')
-        self.failUnless(1 in index._unindex)
-        self.failUnless(1 in index._index)
+        self.assertTrue(1 in index._unindex)
+        self.assertFalse(1 in index._index)
 
     def test_index_false(self):
         index = self._makeOne()
         obj = Dummy(1, False)
         index._index_object(obj.id, obj, attr='truth')
-        self.failUnless(1 in index._unindex)
-        self.failIf(1 in index._index)
+        self.assertTrue(1 in index._unindex)
+        self.assertFalse(1 in index._index)
 
     def test_index_missing_attribute(self):
         index = self._makeOne()
@@ -61,8 +61,8 @@
         index._index_object(obj.id, obj, attr='truth')
 
         res, idx = index._apply_index({'truth': True})
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [1])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [1])
 
     def test_search_false(self):
         index = self._makeOne()
@@ -72,8 +72,8 @@
         index._index_object(obj.id, obj, attr='truth')
 
         res, idx = index._apply_index({'truth': False})
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [2])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [2])
 
     def test_search_inputresult(self):
         index = self._makeOne()
@@ -83,27 +83,121 @@
         index._index_object(obj.id, obj, attr='truth')
 
         res, idx = index._apply_index({'truth': True}, resultset=IISet([]))
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [])
 
         res, idx = index._apply_index({'truth': True}, resultset=IISet([2]))
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [])
 
         res, idx = index._apply_index({'truth': True}, resultset=IISet([1]))
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [1])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [1])
 
         res, idx = index._apply_index({'truth': True}, resultset=IISet([1, 2]))
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [1])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [1])
 
         res, idx = index._apply_index({'truth': False},
                                       resultset=IISet([1, 2]))
-        self.failUnlessEqual(idx, ('truth', ))
-        self.failUnlessEqual(list(res), [2])
+        self.assertEqual(idx, ('truth', ))
+        self.assertEqual(list(res), [2])
 
+    def test_index_many_true(self):
+        index = self._makeOne()
+        for i in range(0, 100):
+            obj = Dummy(i, i < 80 and True or False)
+            index._index_object(obj.id, obj, attr='truth')
+        self.assertEqual(list(index._index), range(80, 100))
+        self.assertEqual(len(index._unindex), 100)
 
+        res, idx = index._apply_index({'truth': True})
+        self.assertEqual(list(res), range(0, 80))
+        res, idx = index._apply_index({'truth': False})
+        self.assertEqual(list(res), range(80, 100))
+
+    def test_index_many_false(self):
+        index = self._makeOne()
+        for i in range(0, 100):
+            obj = Dummy(i, i >= 80 and True or False)
+            index._index_object(obj.id, obj, attr='truth')
+        self.assertEqual(list(index._index), range(80, 100))
+        self.assertEqual(len(index._unindex), 100)
+
+        res, idx = index._apply_index({'truth': False})
+        self.assertEqual(list(res), range(0, 80))
+        res, idx = index._apply_index({'truth': True})
+        self.assertEqual(list(res), range(80, 100))
+
+    def test_index_many_change(self):
+        index = self._makeOne()
+        def add(i, value):
+            obj = Dummy(i, value)
+            index._index_object(obj.id, obj, attr='truth')
+        # First lets index only True values
+        for i in range(0, 4):
+            add(i, True)
+        self.assertEqual(list(index._index), [])
+        self.assertEqual(len(index._unindex), 4)
+        # Now add an equal number of False values
+        for i in range(4, 8):
+            add(i, False)
+        self.assertEqual(list(index._index), range(4, 8))
+        self.assertEqual(len(index._unindex), 8)
+        # Once False gets to be more than 60% of the indexed set, we switch
+        add(8, False)
+        self.assertEqual(list(index._index), range(4, 9))
+        add(9, False)
+        self.assertEqual(list(index._index), range(0, 4))
+        res, idx = index._apply_index({'truth': True})
+        self.assertEqual(list(res), range(0, 4))
+        res, idx = index._apply_index({'truth': False})
+        self.assertEqual(list(res), range(4, 10))
+        # and we can again switch if the percentages change again
+        for i in range(6, 10):
+            index.unindex_object(i)
+        self.assertEqual(list(index._index), range(4, 6))
+        self.assertEqual(len(index._unindex), 6)
+        res, idx = index._apply_index({'truth': True})
+        self.assertEqual(list(res), range(0, 4))
+        res, idx = index._apply_index({'truth': False})
+        self.assertEqual(list(res), range(4, 6))
+
+    def test_items(self):
+        index = self._makeOne()
+        # test empty
+        items = dict(index.items())
+        self.assertEqual(len(items[True]), 0)
+        self.assertEqual(len(items[False]), 0)
+        # test few trues
+        for i in range(0, 20):
+            obj = Dummy(i, i < 5 and True or False)
+            index._index_object(obj.id, obj, attr='truth')
+        items = dict(index.items())
+        self.assertEqual(len(items[True]), 5)
+        self.assertEqual(len(items[False]), 15)
+        # test many trues
+        for i in range(7, 20):
+            index.unindex_object(i)
+        items = dict(index.items())
+        self.assertEqual(len(items[True]), 5)
+        self.assertEqual(len(items[False]), 2)
+
+    def test_histogram(self):
+        index = self._makeOne()
+        # test empty
+        hist = index.histogram()
+        self.assertEqual(hist[True], 0)
+        self.assertEqual(hist[False], 0)
+        # test few trues
+        for i in range(0, 20):
+            obj = Dummy(i, i < 5 and True or False)
+            index._index_object(obj.id, obj, attr='truth')
+        hist = index.histogram()
+        self.assertEqual(hist[True], 5)
+        self.assertEqual(hist[False], 15)
+
+
 def test_suite():
     from unittest import TestSuite, makeSuite
     suite = TestSuite()



More information about the checkins mailing list