[Checkins] SVN: Products.ZCTextIndex/trunk/ Changed word id creation algorithm in Lexicon. Instead of relying on an increasing length counter, we use a number from a randomized range. This avoids conflict errors while adding new words in multiple parallel transactions. Inspired by code from ``enfold.fixes``.

Hanno Schlichting hannosch at hannosch.eu
Sat Oct 2 06:38:52 EDT 2010


Log message for revision 117163:
  Changed word id creation algorithm in Lexicon. Instead of relying on an increasing length counter, we use a number from a randomized range. This avoids conflict errors while adding new words in multiple parallel transactions. Inspired by code from ``enfold.fixes``.
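
  For context: the old scheme derived each new wid from the persistent
  ``Length`` counter, so any two transactions adding words at the same time
  tried to claim the same counter value and the same wid, and ZODB aborted
  all but one of them with a ConflictError. A minimal sketch of the two
  strategies (simplified, not the committed code; ``words`` stands in for
  the lexicon's wid -> word mapping):

      from random import randrange

      def new_wid_counter(words, length):
          # Old scheme: every writer computes length + 1, so parallel
          # transactions race for the same wid and the same counter.
          wid = length + 1
          while wid in words:
              wid += 1
          return wid

      def new_wid_random(words):
          # New scheme: each writer starts from its own random point in a
          # large range, so parallel writers almost never pick the same wid.
          while True:
              wid = randrange(0x4000, 0x10000000)
              if wid not in words:
                  return wid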
  

Changed:
  U   Products.ZCTextIndex/trunk/CHANGES.txt
  U   Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py
  U   Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py
  U   Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py

-=-
Modified: Products.ZCTextIndex/trunk/CHANGES.txt
===================================================================
--- Products.ZCTextIndex/trunk/CHANGES.txt	2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/CHANGES.txt	2010-10-02 10:38:52 UTC (rev 117163)
@@ -4,6 +4,11 @@
 2.13.1 (unreleased)
 -------------------
 
+- Changed word id creation algorithm in Lexicon. Instead of relying on an
+  increasing length counter, we use a number from a randomized range. This
+  avoids conflict errors while adding new words in multiple parallel
+  transactions. Inspired by code from ``enfold.fixes``.
+
 - Lexicon: Added clear method.
 
 - Lexicon: Removed BBB code for instances created with Zope < 2.6.2.

Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py	2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py	2010-10-02 10:38:52 UTC (rev 117163)
@@ -14,6 +14,7 @@
 """Lexicon.
 """
 
+from random import randrange
 import re
 
 from BTrees.IOBTree import IOBTree
@@ -32,6 +33,9 @@
 
     implements(ILexicon)
 
+    _v_nextid = None
+    _wid_length_based = True # Flag to distinguish new and old lexica
+
     def __init__(self, *pipeline):
         self.clear()
         self._pipeline = pipeline
@@ -40,6 +44,7 @@
         """Empty the lexicon.
         """
         self.length = Length()
+        self._wid_length_based = False
         self._wids = OIBTree()  # word -> wid
         self._words = IOBTree() # wid -> word
         # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
@@ -67,11 +72,6 @@
         last = _text2list(text)
         for element in self._pipeline:
             last = element.process(last)
-        # Strategically unload the length value so that we get the most
-        # recent value written to the database to minimize conflicting wids
-        # Because length is independent, this will load the most
-        # recent value stored, regardless of whether MVCC is enabled
-        self.length._p_deactivate()
         return map(self._getWordIdCreate, last)
 
     def termToWordIds(self, text):
@@ -141,22 +141,37 @@
     def _getWordIdCreate(self, word):
         wid = self._wids.get(word)
         if wid is None:
-            wid = self._new_wid()
+            # WidCode requires us to use at least 0x4000 as a base number.
+            # The algorithm in versions before 2.13 used the length as a base
+            # number. So we don't even try to generate numbers below the
+            # length as they are likely all taken
+            minimum = 0x4000
+            if self._wid_length_based:
+                minimum = max(self.length(), 0x4000)
+
+            while True:
+                if self._v_nextid is None:
+                    self._v_nextid = randrange(minimum, 0x10000000)
+
+                wid = self._v_nextid
+                self._v_nextid += 1
+
+                if wid not in self._words:
+                    break
+
+                self._v_nextid = None
+
+            self.length.change(1)
             self._wids[word] = wid
             self._words[wid] = word
         return wid
 
-    def _new_wid(self):
-        self.length.change(1)
-        while self._words.has_key(self.length()): # just to be safe
-            self.length.change(1)
-        return self.length()
 
 def _text2list(text):
     # Helper: splitter input may be a string or a list of strings
     try:
         text + ""
-    except:
+    except Exception:
         return text
     else:
         return [text]
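
Note the ``_v_`` prefix on ``_v_nextid`` above: ZODB treats attributes named
``_v_*`` as volatile, so the cursor is never written to the database and may
disappear whenever the object is deactivated. Each connection therefore draws
its own random starting point and hands out consecutive wids from it, falling
back to a fresh random draw only when it hits an already-used wid. A rough
usage sketch (the concrete wid values are illustrative only, since the
starting point is random):

    from Products.ZCTextIndex.Lexicon import Lexicon, Splitter

    lexicon = Lexicon(Splitter())
    wids = lexicon.sourceToWordIds('cats and dogs')
    # e.g. [73620481, 73620482, 73620483] -- consecutive ids from a
    # random base, rather than 1, 2, 3 as in earlier releases.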

Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py	2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py	2010-10-02 10:38:52 UTC (rev 117163)
@@ -119,13 +119,3 @@
     _encoding = tuple(_encoding)
 
 _fill()
-
-def test():
-    for i in range(2**20):
-        if i % 1000 == 0: print i
-        wids = [i]
-        code = encode(wids)
-        assert decode(code) == wids, (wids, code, decode(code))
-
-if __name__ == "__main__":
-    test()
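
The module-level smoke test removed above exercised the real
``encode``/``decode`` pair; the same round-trip check can still be run by
hand. A minimal sketch using the module's public functions:

    from Products.ZCTextIndex.WidCode import encode, decode

    wids = [0x4000, 0x4001, 123456]
    code = encode(wids)            # list of wids -> byte string
    assert decode(code) == wids    # decode inverts encode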

Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py	2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py	2010-10-02 10:38:52 UTC (rev 117163)
@@ -20,7 +20,6 @@
 import sys
 
 import transaction
-import ZODB
 
 
 class StupidPipelineElement:
@@ -87,10 +86,8 @@
         verifyClass(ILexicon, self._getTargetClass())
 
     def test_clear(self):
-        from Products.ZCTextIndex.Lexicon import Splitter
-
         lexicon = self._makeOne()
-        wids = lexicon.sourceToWordIds('foo')
+        lexicon.sourceToWordIds('foo')
         self.assertEqual(len(lexicon._wids), 1)
         self.assertEqual(len(lexicon._words), 1)
         self.assertEqual(lexicon.length(), 1)
@@ -105,7 +102,9 @@
 
         lexicon = self._makeOne(Splitter())
         wids = lexicon.sourceToWordIds('cats and dogs')
-        self.assertEqual(wids, [1, 2, 3])
+        self.assertEqual(len(wids), 3)
+        first = wids[0]
+        self.assertEqual(wids, [first, first+1, first+2])
 
     def testTermToWordIds(self):
         from Products.ZCTextIndex.Lexicon import Splitter
@@ -113,7 +112,8 @@
         lexicon = self._makeOne(Splitter())
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('dogs')
-        self.assertEqual(wids, [3])
+        self.assertEqual(len(wids), 1)
+        self.assert_(wids[0] > 0)
 
     def testMissingTermToWordIds(self):
         from Products.ZCTextIndex.Lexicon import Splitter
@@ -134,7 +134,8 @@
         lexicon = self._makeOne(AddedSplitter())
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('dogs')
-        self.assertEqual(wids, [3])
+        self.assertEqual(len(wids), 1)
+        self.assert_(wids[0] > 0)
 
     def testMissingTermToWordIdsWithProcess_post_glob(self):
         """This test is for added process_post_glob"""
@@ -156,7 +157,8 @@
                                 StupidPipelineElement('dogs', 'fish'))
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('fish')
-        self.assertEqual(wids, [3])
+        self.assertEqual(len(wids), 1)
+        self.assert_(wids[0] > 0)
 
     def testSplitterAdaptorFold(self):
         from Products.ZCTextIndex.Lexicon import CaseNormalizer
@@ -165,7 +167,9 @@
         lexicon = self._makeOne(Splitter(), CaseNormalizer())
         wids = lexicon.sourceToWordIds('CATS and dogs')
         wids = lexicon.termToWordIds('cats and dogs')
-        self.assertEqual(wids, [1, 2, 3])
+        self.assertEqual(len(wids), 3)
+        first = wids[0]
+        self.assertEqual(wids, [first, first+1, first+2])
 
     def testSplitterAdaptorNofold(self):
         from Products.ZCTextIndex.Lexicon import Splitter
@@ -173,7 +177,9 @@
         lexicon = self._makeOne(Splitter())
         wids = lexicon.sourceToWordIds('CATS and dogs')
         wids = lexicon.termToWordIds('cats and dogs')
-        self.assertEqual(wids, [0, 2, 3])
+        self.assertEqual(len(wids), 3)
+        second = wids[1]
+        self.assertEqual(wids, [0, second, second+1])
 
     def testTwoElementPipeline(self):
         from Products.ZCTextIndex.Lexicon import Splitter
@@ -183,7 +189,8 @@
                           WackyReversePipelineElement('fish'))
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('hsif')
-        self.assertEqual(wids, [1])
+        self.assertEqual(len(wids), 1)
+        self.assert_(wids[0] > 0)
 
     def testThreeElementPipeline(self):
         from Products.ZCTextIndex.Lexicon import Splitter
@@ -194,7 +201,8 @@
                           WackyReversePipelineElement('fish'))
         wids = lexicon.sourceToWordIds('cats and dogs')
         wids = lexicon.termToWordIds('hsif')
-        self.assertEqual(wids, [2])
+        self.assertEqual(len(wids), 1)
+        self.assert_(wids[0] > 0)
 
     def testSplitterLocaleAwareness(self):
         import locale
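
The test changes above follow directly from the new allocation scheme:
absolute wid values such as ``[1, 2, 3]`` are no longer stable across runs,
but ids handed out within a single call are still consecutive, so the
assertions now pin down only the length and the relative layout of the
result. A sketch of the pattern (a hypothetical test in the style of the
changes above, assuming the same fixtures):

    def test_wids_consecutive(self):
        # The first wid is unpredictable; the relative order is not.
        lexicon = self._makeOne(Splitter())
        wids = lexicon.sourceToWordIds('cats and dogs')
        first = wids[0]
        self.assertEqual(wids, [first, first + 1, first + 2])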


