[Checkins] SVN: zope.index/trunk/src/zope/index/text/ The baseindex, okapiindex, and lexicon used plain counters for various

Chris McDonough chrism at plope.com
Mon Feb 16 08:55:51 EST 2009


Log message for revision 96591:
  The baseindex, okapiindex, and lexicon used plain counters for various
  lengths, which is unsuitable for production applications.
  
  Backport code from Zope2 indexes which opportunistically replaces the
  counters with BTrees.Length objects.
  

Changed:
  U   zope.index/trunk/src/zope/index/text/baseindex.py
  U   zope.index/trunk/src/zope/index/text/lexicon.py
  U   zope.index/trunk/src/zope/index/text/okapiindex.py
  U   zope.index/trunk/src/zope/index/text/textindex.txt

-=-
Modified: zope.index/trunk/src/zope/index/text/baseindex.py
===================================================================
--- zope.index/trunk/src/zope/index/text/baseindex.py	2009-02-16 12:30:52 UTC (rev 96590)
+++ zope.index/trunk/src/zope/index/text/baseindex.py	2009-02-16 13:55:51 UTC (rev 96591)
@@ -72,6 +72,7 @@
 
         # Use a BTree length for efficient length computation w/o conflicts
         self.wordCount = Length.Length()
+        self.documentCount = Length.Length()
 
     def clear(self):
         self.__init__(self._lexicon)
@@ -83,6 +84,7 @@
 
     def documentCount(self):
         """Return the number of documents in the index."""
+        # overridden per instance
         return len(self._docweight)
 
     def get_words(self, docid):
@@ -99,6 +101,11 @@
         self._mass_add_wordinfo(wid2weight, docid)
         self._docweight[docid] = docweight
         self._docwords[docid] = widcode.encode(wids)
+        try:
+            self.documentCount.change(1)
+        except AttributeError:
+            # upgrade documentCount to Length object
+            self.documentCount = Length.Length(self.documentCount())
         return len(wids)
 
     # A subclass may wish to extend or override this.  This is for adjusting
@@ -106,16 +113,11 @@
     # faster than simply unindexing the old version in its entirety and then
     # adding the new version in its entirety.
     def _reindex_doc(self, docid, text):
-
         # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
         old_wids = self.get_words(docid)
+        old_wid2w, old_docw = self._get_frequencies(old_wids)
+
         new_wids = self._lexicon.sourceToWordIds(text)
-
-        if old_wids == new_wids:
-            # we return -1 if not changed
-            return -1
-        
-        old_wid2w, old_docw = self._get_frequencies(old_wids)
         new_wid2w, new_docw = self._get_frequencies(new_wids)
 
         old_widset = self.family.IF.TreeSet(old_wid2w.keys())
@@ -168,6 +170,11 @@
             self._del_wordinfo(wid, docid)
         del self._docwords[docid]
         del self._docweight[docid]
+        try:
+            self.documentCount.change(-1)
+        except AttributeError:
+            # upgrade documentCount to Length object
+            self.documentCount = Length.Length(self.documentCount())
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)

Modified: zope.index/trunk/src/zope/index/text/lexicon.py
===================================================================
--- zope.index/trunk/src/zope/index/text/lexicon.py	2009-02-16 12:30:52 UTC (rev 96590)
+++ zope.index/trunk/src/zope/index/text/lexicon.py	2009-02-16 13:55:51 UTC (rev 96591)
@@ -21,6 +21,7 @@
 
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
+from BTrees.Length import Length
 
 from persistent import Persistent
 
@@ -32,7 +33,6 @@
 class Lexicon(Persistent):
 
     implements(ILexicon)
-
     def __init__(self, *pipeline):
         self._wids = OIBTree()  # word -> wid
         self._words = IOBTree() # wid -> word
@@ -41,16 +41,13 @@
         # we never saw before, and that isn't a known stopword (or otherwise
         # filtered out).  Returning a special wid value for OOV words is a
         # way to let clients know when an OOV word appears.
-        self._nextwid = 1
+        self.wordCount = Length()
         self._pipeline = pipeline
 
-        # Keep some statistics about indexing
-        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
-        self._nwords = 0 # Number of words indexed (after pipeline)
-
     def wordCount(self):
         """Return the number of unique terms in the lexicon."""
-        return self._nextwid - 1
+        # overridden per instance
+        return len(self._wids)
 
     def words(self):
         return self._wids.keys()
@@ -63,11 +60,16 @@
 
     def sourceToWordIds(self, text):
         last = _text2list(text)
-        for t in last:
-            self._nbytes += len(t)
         for element in self._pipeline:
             last = element.process(last)
-        self._nwords += len(last)
+        if not hasattr(self.wordCount, 'change'):
+            # Make sure wordCount is overridden with a BTrees.Length.Length
+            self.wordCount = Length(self.wordCount())        
+        # Strategically unload the length value so that we get the most
+        # recent value written to the database to minimize conflicting wids
+        # Because length is independent, this will load the most
+        # recent value stored, regardless of whether MVCC is enabled
+        self.wordCount._p_deactivate()
         return map(self._getWordIdCreate, last)
 
     def termToWordIds(self, text):
@@ -142,9 +144,14 @@
         return wid
 
     def _new_wid(self):
-        wid = self._nextwid
-        self._nextwid += 1
-        return wid
+        count = self.wordCount
+        try:
+            count.change(1)
+        except AttributeError:
+            count = self.wordCount = Length.Length(count())
+        while self._words.has_key(count()): # just to be safe
+            count.change(1)
+        return count()
 
 def _text2list(text):
     # Helper: splitter input may be a string or a list of strings

Modified: zope.index/trunk/src/zope/index/text/okapiindex.py
===================================================================
--- zope.index/trunk/src/zope/index/text/okapiindex.py	2009-02-16 12:30:52 UTC (rev 96590)
+++ zope.index/trunk/src/zope/index/text/okapiindex.py	2009-02-16 13:55:51 UTC (rev 96591)
@@ -193,6 +193,7 @@
 """
 from zope.index.text.baseindex import BaseIndex
 from zope.index.text.baseindex import inverse_doc_frequency
+from BTrees.Length import Length
 
 class OkapiIndex(BaseIndex):
 
@@ -217,28 +218,32 @@
         # sum(self._docweight.values()), the total # of words in all docs
         # This is a long for "better safe than sorry" reasons.  It isn't
         # used often enough that speed should matter.
-        self._totaldoclen = 0L
+        self._totaldoclen = Length(0)
 
     def index_doc(self, docid, text):
         count = BaseIndex.index_doc(self, docid, text)
-        if count == -1:
-            return count
-        self._totaldoclen += count
+        self._change_doc_len(count)
         return count
 
     def _reindex_doc(self, docid, text):
-        old_docw = self._docweight[docid]
-        count = BaseIndex._reindex_doc(self, docid, text)
-        if count > -1:
-            self._totaldoclen -= old_docw
-        return count
+        self._change_doc_len(-self._docweight[docid])
+        return BaseIndex._reindex_doc(self, docid, text)
 
     def unindex_doc(self, docid):
         if docid not in self._docwords:
             return
-        self._totaldoclen -= self._docweight.get(docid, 0)
+        self._change_doc_len(-self._docweight[docid])
         BaseIndex.unindex_doc(self, docid)
 
+    def _change_doc_len(self, delta):
+        # Change total doc length used for scoring
+        delta = int(delta)
+        try:
+            self._totaldoclen.change(delta)
+        except AttributeError:
+            # Opportunistically upgrade _totaldoclen attribute to Length object
+            self._totaldoclen = Length(long(self._totaldoclen + delta))
+
     # The workhorse.  Return a list of (IFBucket, weight) pairs, one pair
     # for each wid t in wids.  The IFBucket, times the weight, maps D to
     # TF(D,t) * IDF(t) for every docid D containing t.
@@ -250,7 +255,12 @@
         if not wids:
             return []
         N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         K1 = self.K1
         B = self.B
         K1_plus1 = K1 + 1.0
@@ -294,7 +304,12 @@
         if not wids:
             return []
         N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         #K1 = self.K1
         #B = self.B
         #K1_plus1 = K1 + 1.0

Modified: zope.index/trunk/src/zope/index/text/textindex.txt
===================================================================
--- zope.index/trunk/src/zope/index/text/textindex.txt	2009-02-16 12:30:52 UTC (rev 96590)
+++ zope.index/trunk/src/zope/index/text/textindex.txt	2009-02-16 13:55:51 UTC (rev 96591)
@@ -122,55 +122,36 @@
 Tracking Changes
 ================
 
-In order to have as few writes as possible, the index doesn't change
-its state if we index a docid with the same values twice. To
-test this behaviour we have to create a simple data manager.
+If we index a document the first time it updates the _totaldoclen of
+the underlying object.
 
-    >>> class DM:
-    ...     def __init__(self):
-    ...         self.called = 0
-    ...     def register(self, ob):
-    ...         self.called += 1
-    ...     def setstate(self, ob):
-    ...         ob.__setstate__({'x': 42})
-
-If we index a document the first time it changes the state of the
-underlying index. At the start _p_changed is False.
-
-    >>> index._p_jar = index.index._p_jar = DM()
-    >>> index.index._p_changed
-    False
+    >>> index = TextIndex()
+    >>> index.index._totaldoclen()
+    0
     >>> index.index_doc(100, u"a new funky value")
-    >>> index.index._p_changed
-    True
+    >>> index.index._totaldoclen()
+    3
 
-Now for testing we set the changed flag to false again.
+If we index it a second time, the underlying index length should not
+be changed.
 
-    >>> index.index._p_changed = False
-
-If we index it a second time, the underlying index should not be
-changed.
-
     >>> index.index_doc(100, u"a new funky value")
-    >>> index._p_changed is index.index._p_changed is False
-    True
-    >>> index.index._p_changed = False
+    >>> index.index._totaldoclen()
+    3
 
-But if we change it the state changes too.
+But if we change it the length changes too.
 
     >>> index.index_doc(100, u"an even newer funky value")
-    >>> index.index._p_changed
-    True
+    >>> index.index._totaldoclen()
+    5
 
 The same as for index_doc applies to unindex_doc: if an object is
 unindexed that is not indexed, no indexes should change state.
 
-    >>> index._p_changed = index.index._p_changed = False
     >>> index.unindex_doc(100)
-    >>> index.index._p_changed
-    True
+    >>> index.index._totaldoclen()
+    0
 
-    >>> index._p_changed = index.index._p_changed = False
     >>> index.unindex_doc(100)
-    >>> index._p_changed is index.index._p_changed is False
-    True
+    >>> index.index._totaldoclen()
+    0



More information about the Checkins mailing list