[Checkins] SVN: zope.index/trunk/ KeywordIndex: Store docids in TreeSet rather than a Set when the

Shane Hathaway shane at hathawaymix.org
Sat Dec 3 23:23:57 UTC 2011


Log message for revision 123565:
  KeywordIndex: Store docids in TreeSet rather than a Set when the
  number of documents matching a word reaches a configurable
  threshold (default 100). The rule is applied to individual words at
  indexing time, but you can call the new optimize method to optimize
  all the words in an index at once. Designed to fix LP #881950.
  

Changed:
  U   zope.index/trunk/CHANGES.txt
  U   zope.index/trunk/setup.py
  U   zope.index/trunk/src/zope/index/keyword/index.py
  U   zope.index/trunk/src/zope/index/keyword/tests.py

-=-
Modified: zope.index/trunk/CHANGES.txt
===================================================================
--- zope.index/trunk/CHANGES.txt	2011-12-03 21:52:54 UTC (rev 123564)
+++ zope.index/trunk/CHANGES.txt	2011-12-03 23:23:57 UTC (rev 123565)
@@ -4,7 +4,11 @@
 3.6.2 (unreleased)
 ------------------
 
-- TBD
+- KeywordIndex: Store docids in TreeSet rather than a Set when the
+  number of documents matching a word reaches a configurable
+  threshold (default 100). The rule is applied to individual words at
+  indexing time, but you can call the new optimize method to optimize
+  all the words in an index at once. Designed to fix LP #881950.
 
 3.6.1 (2010-07-08)
 ------------------

Modified: zope.index/trunk/setup.py
===================================================================
--- zope.index/trunk/setup.py	2011-12-03 21:52:54 UTC (rev 123564)
+++ zope.index/trunk/setup.py	2011-12-03 23:23:57 UTC (rev 123565)
@@ -61,7 +61,7 @@
         print >> sys.stderr, '*' * 80
 
 setup(name='zope.index',
-      version='3.6.2dev',
+      version='3.6.2',
       url='http://pypi.python.org/pypi/zope.index',
       license='ZPL 2.1',
       author='Zope Foundation and Contributors',

Modified: zope.index/trunk/src/zope/index/keyword/index.py
===================================================================
--- zope.index/trunk/src/zope/index/keyword/index.py	2011-12-03 21:52:54 UTC (rev 123564)
+++ zope.index/trunk/src/zope/index/keyword/index.py	2011-12-03 23:23:57 UTC (rev 123565)
@@ -30,6 +30,10 @@
     implements(IInjection, IStatistics, IIndexSearch, IKeywordQuerying)
     family = BTrees.family32
 
+    # If a word is referenced by at least tree_threshold docids,
+    # use a TreeSet for that word instead of a Set.
+    tree_threshold = 100
+
     def __init__(self, family=None):
         if family is not None:
             self.family = family
@@ -123,11 +127,19 @@
         """insert a sequence of words into the forward index """
 
         idx = self._fwd_index
-        has_key = idx.has_key
+        get_word_idx = idx.get
+        IF = self.family.IF
+        Set = IF.Set
+        TreeSet = IF.TreeSet
         for word in words:
-            if not has_key(word):
-                idx[word] = self.family.IF.Set()
-            idx[word].insert(docid)
+            word_idx = get_word_idx(word)
+            if word_idx is None:
+                idx[word] = word_idx = Set()
+            word_idx.insert(docid)
+            if (not isinstance(word_idx, TreeSet) and
+                    len(word_idx) >= self.tree_threshold):
+                # Convert to a TreeSet.
+                idx[word] = TreeSet(word_idx)
 
     def _insert_reverse(self, docid, words):
         """ add words to forward index """
@@ -175,6 +187,28 @@
             query = query['query']
         return self.search(query, operator=operator)
 
+    def optimize(self):
+        """Optimize the index. Call this after changing tree_threshold.
+
+        This converts internal data structures between
+        Sets and TreeSets based on tree_threshold.
+        """
+        idx = self._fwd_index
+        IF = self.family.IF
+        Set = IF.Set
+        TreeSet = IF.TreeSet
+        items = list(self._fwd_index.items())
+        for word, word_idx in items:
+            if len(word_idx) >= self.tree_threshold:
+                if not isinstance(word_idx, TreeSet):
+                    # Convert to a TreeSet.
+                    idx[word] = TreeSet(word_idx)
+            else:
+                if isinstance(word_idx, TreeSet):
+                    # Convert to a Set.
+                    idx[word] = Set(word_idx)
+
+
 class CaseInsensitiveKeywordIndex(KeywordIndex):
     """A case-normalizing keyword index (for strings as keywords)"""
 

Modified: zope.index/trunk/src/zope/index/keyword/tests.py
===================================================================
--- zope.index/trunk/src/zope/index/keyword/tests.py	2011-12-03 21:52:54 UTC (rev 123564)
+++ zope.index/trunk/src/zope/index/keyword/tests.py	2011-12-03 23:23:57 UTC (rev 123565)
@@ -85,6 +85,60 @@
         self._apply_or(index, ('cmf', 'zope4'), self.IFSet([5]))
         self._apply_or(index, ('zope', 'Zope'), self.IFSet([1,3]))
 
+    def test_apply_with_only_tree_set(self):
+        index = self._makeOne()
+        index.tree_threshold = 0
+        self._populate(index)
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFTreeSet()))
+        self._apply_and(index, ('CMF', 'Zope3'), self.IFSet([1]))
+        self._apply_and(index, ('CMF', 'zope'),  self.IFSet([1]))
+        self._apply_and(index, ('cmf', 'zope4'), self.IFSet())
+        self._apply_and(index, ('quick', 'FOX'), self.IFSet([2]))
+
+    def test_apply_with_mix_of_tree_set_and_simple_set(self):
+        index = self._makeOne()
+        index.tree_threshold = 2
+        self._populate(index)
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFSet()))
+        self._apply_and(index, ('CMF', 'Zope3'), self.IFSet([1]))
+        self._apply_and(index, ('CMF', 'zope'),  self.IFSet([1]))
+        self._apply_and(index, ('cmf', 'zope4'), self.IFSet())
+        self._apply_and(index, ('quick', 'FOX'), self.IFSet([2]))
+
+    def test_optimize_converts_to_tree_set(self):
+        index = self._makeOne()
+        self._populate(index)
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFSet()))
+        index.tree_threshold = 0
+        index.optimize()
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFTreeSet()))
+
+    def test_optimize_converts_to_simple_set(self):
+        index = self._makeOne()
+        index.tree_threshold = 0
+        self._populate(index)
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFTreeSet()))
+        index.tree_threshold = 99
+        index.optimize()
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFSet()))
+
+    def test_optimize_leaves_words_alone(self):
+        index = self._makeOne()
+        self._populate(index)
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFSet()))
+        index.tree_threshold = 99
+        index.optimize()
+        self.assertEqual(type(index._fwd_index['zope']),
+            type(self.IFSet()))
+
+
 class CaseInsensitiveKeywordIndexTestsBase:
 
     def _getTargetClass(self):
@@ -166,6 +220,10 @@
         from BTrees.IFBTree import IFSet
         return IFSet(*args, **kw)
 
+    def IFTreeSet(self, *args, **kw):
+        from BTrees.IFBTree import IFTreeSet
+        return IFTreeSet(*args, **kw)
+
 class _SixtyFourBitBase:
 
     def _get_family(self):
@@ -176,6 +234,10 @@
         from BTrees.LFBTree import LFSet
         return LFSet(*args, **kw)
 
+    def IFTreeSet(self, *args, **kw):
+        from BTrees.LFBTree import LFTreeSet
+        return LFTreeSet(*args, **kw)
+
 _marker = object()
 
 class _TestCaseBase:



More information about the checkins mailing list