[Checkins] SVN: Products.ZCTextIndex/trunk/ Changed word id creation algorithm in Lexicon. Instead of relying on an increasing length counter, we use a number from a randomized range. This avoids conflict errors while adding new words in multiple parallel transactions. Inspired by code from ``enfold.fixes``.
Hanno Schlichting
hannosch at hannosch.eu
Sat Oct 2 06:38:52 EDT 2010
Log message for revision 117163:
Changed word id creation algorithm in Lexicon. Instead of relying on an increasing length counter, we use a number from a randomized range. This avoids conflict errors while adding new words in multiple parallel transactions. Inspired by code from ``enfold.fixes``.
Changed:
U Products.ZCTextIndex/trunk/CHANGES.txt
U Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py
U Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py
U Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py
-=-
Modified: Products.ZCTextIndex/trunk/CHANGES.txt
===================================================================
--- Products.ZCTextIndex/trunk/CHANGES.txt 2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/CHANGES.txt 2010-10-02 10:38:52 UTC (rev 117163)
@@ -4,6 +4,11 @@
2.13.1 (unreleased)
-------------------
+- Changed word id creation algorithm in Lexicon. Instead of relying on an
+ increasing length counter, we use a number from a randomized range. This
+ avoids conflict errors while adding new words in multiple parallel
+ transactions. Inspired by code from ``enfold.fixes``.
+
- Lexicon: Added clear method.
- Lexicon: Removed BBB code for instances created with Zope < 2.6.2.
Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py 2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/Lexicon.py 2010-10-02 10:38:52 UTC (rev 117163)
@@ -14,6 +14,7 @@
"""Lexicon.
"""
+from random import randrange
import re
from BTrees.IOBTree import IOBTree
@@ -32,6 +33,9 @@
implements(ILexicon)
+ _v_nextid = None
+ _wid_length_based = True # Flag to distinguish new and old lexica
+
def __init__(self, *pipeline):
self.clear()
self._pipeline = pipeline
@@ -40,6 +44,7 @@
"""Empty the lexicon.
"""
self.length = Length()
+ self._wid_length_based = False
self._wids = OIBTree() # word -> wid
self._words = IOBTree() # wid -> word
# wid 0 is reserved for words that aren't in the lexicon (OOV -- out
@@ -67,11 +72,6 @@
last = _text2list(text)
for element in self._pipeline:
last = element.process(last)
- # Strategically unload the length value so that we get the most
- # recent value written to the database to minimize conflicting wids
- # Because length is independent, this will load the most
- # recent value stored, regardless of whether MVCC is enabled
- self.length._p_deactivate()
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
@@ -141,22 +141,37 @@
def _getWordIdCreate(self, word):
wid = self._wids.get(word)
if wid is None:
- wid = self._new_wid()
+ # WidCode requires us to use at least 0x4000 as a base number.
+ # The algorithm in versions before 2.13 used the length as a base
+ # number. So we don't even try to generate numbers below the
+ # length as they are likely all taken
+ minimum = 0x4000
+ if self._wid_length_based:
+ minimum = max(self.length(), 0x4000)
+
+ while True:
+ if self._v_nextid is None:
+ self._v_nextid = randrange(minimum, 0x10000000)
+
+ wid = self._v_nextid
+ self._v_nextid += 1
+
+ if wid not in self._words:
+ break
+
+ self._v_nextid = None
+
+ self.length.change(1)
self._wids[word] = wid
self._words[wid] = word
return wid
- def _new_wid(self):
- self.length.change(1)
- while self._words.has_key(self.length()): # just to be safe
- self.length.change(1)
- return self.length()
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
- except:
+ except Exception:
return text
else:
return [text]
Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py 2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/WidCode.py 2010-10-02 10:38:52 UTC (rev 117163)
@@ -119,13 +119,3 @@
_encoding = tuple(_encoding)
_fill()
-
-def test():
- for i in range(2**20):
- if i % 1000 == 0: print i
- wids = [i]
- code = encode(wids)
- assert decode(code) == wids, (wids, code, decode(code))
-
-if __name__ == "__main__":
- test()
Modified: Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py
===================================================================
--- Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py 2010-10-02 09:59:04 UTC (rev 117162)
+++ Products.ZCTextIndex/trunk/src/Products/ZCTextIndex/tests/testLexicon.py 2010-10-02 10:38:52 UTC (rev 117163)
@@ -20,7 +20,6 @@
import sys
import transaction
-import ZODB
class StupidPipelineElement:
@@ -87,10 +86,8 @@
verifyClass(ILexicon, self._getTargetClass())
def test_clear(self):
- from Products.ZCTextIndex.Lexicon import Splitter
-
lexicon = self._makeOne()
- wids = lexicon.sourceToWordIds('foo')
+ lexicon.sourceToWordIds('foo')
self.assertEqual(len(lexicon._wids), 1)
self.assertEqual(len(lexicon._words), 1)
self.assertEqual(lexicon.length(), 1)
@@ -105,7 +102,9 @@
lexicon = self._makeOne(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
- self.assertEqual(wids, [1, 2, 3])
+ self.assertEqual(len(wids), 3)
+ first = wids[0]
+ self.assertEqual(wids, [first, first+1, first+2])
def testTermToWordIds(self):
from Products.ZCTextIndex.Lexicon import Splitter
@@ -113,7 +112,8 @@
lexicon = self._makeOne(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
- self.assertEqual(wids, [3])
+ self.assertEqual(len(wids), 1)
+ self.assert_(wids[0] > 0)
def testMissingTermToWordIds(self):
from Products.ZCTextIndex.Lexicon import Splitter
@@ -134,7 +134,8 @@
lexicon = self._makeOne(AddedSplitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
- self.assertEqual(wids, [3])
+ self.assertEqual(len(wids), 1)
+ self.assert_(wids[0] > 0)
def testMissingTermToWordIdsWithProcess_post_glob(self):
"""This test is for added process_post_glob"""
@@ -156,7 +157,8 @@
StupidPipelineElement('dogs', 'fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('fish')
- self.assertEqual(wids, [3])
+ self.assertEqual(len(wids), 1)
+ self.assert_(wids[0] > 0)
def testSplitterAdaptorFold(self):
from Products.ZCTextIndex.Lexicon import CaseNormalizer
@@ -165,7 +167,9 @@
lexicon = self._makeOne(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
- self.assertEqual(wids, [1, 2, 3])
+ self.assertEqual(len(wids), 3)
+ first = wids[0]
+ self.assertEqual(wids, [first, first+1, first+2])
def testSplitterAdaptorNofold(self):
from Products.ZCTextIndex.Lexicon import Splitter
@@ -173,7 +177,9 @@
lexicon = self._makeOne(Splitter())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
- self.assertEqual(wids, [0, 2, 3])
+ self.assertEqual(len(wids), 3)
+ second = wids[1]
+ self.assertEqual(wids, [0, second, second+1])
def testTwoElementPipeline(self):
from Products.ZCTextIndex.Lexicon import Splitter
@@ -183,7 +189,8 @@
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
- self.assertEqual(wids, [1])
+ self.assertEqual(len(wids), 1)
+ self.assert_(wids[0] > 0)
def testThreeElementPipeline(self):
from Products.ZCTextIndex.Lexicon import Splitter
@@ -194,7 +201,8 @@
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
- self.assertEqual(wids, [2])
+ self.assertEqual(len(wids), 1)
+ self.assert_(wids[0] > 0)
def testSplitterLocaleAwareness(self):
import locale
More information about the checkins mailing list