[Zope-Checkins] CVS: Zope/lib/python/Products/ZCTextIndex - BaseIndex.py:1.30 CosineIndex.py:1.23 IIndex.py:1.12 Lexicon.py:1.19 OkapiIndex.py:1.30 ZCTextIndex.py:1.47

Thu, 5 Jun 2003 15:44:25 -0400

Update of /cvs-repository/Zope/lib/python/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv20404

Modified Files:
	BaseIndex.py CosineIndex.py IIndex.py Lexicon.py OkapiIndex.py 
	ZCTextIndex.py 
Log Message:
Merge casey-zctextindex-fewer-conflicts-branch:

  - Indexes and Lexicon are now much less likely to generate write
    conflicts. Previously, *any* concurrent index/unindex operation would
    conflict.

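  - Performance and scalability fix for queries: the number of indexed
    documents is now read via document_count() instead of computing
    len(self._docweight) on every search.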


=== Zope/lib/python/Products/ZCTextIndex/BaseIndex.py 1.29 => 1.30 ===

--- Zope/lib/python/Products/ZCTextIndex/BaseIndex.py:1.29	Tue Feb  4 13:29:41 2003
+++ Zope/lib/python/Products/ZCTextIndex/BaseIndex.py	Thu Jun  5 15:43:54 2003
@@ -20,7 +20,7 @@
 from BTrees.IOBTree import IOBTree
 from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
 from BTrees.IIBTree import intersection, difference
-import BTrees.Length
+from BTrees.Length import Length
 
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
@@ -83,12 +83,18 @@
         self._docwords = IOBTree()
 
         # Use a BTree length for efficient length computation w/o conflicts
-        self.length = BTrees.Length.Length()
+        self.length = Length()
+        self.document_count = Length()
 
     def length(self):
         """Return the number of words in the index."""
         # This is overridden per instance
         return len(self._wordinfo)
+        
+    def document_count(self):
+        """Return the number of documents in the index"""
+        # This is overridden per instance
+        return len(self._docweight)        
 
     def get_words(self, docid):
         """Return a list of the wordids for a given docid."""
@@ -104,6 +110,11 @@
         self._mass_add_wordinfo(wid2weight, docid)
         self._docweight[docid] = docweight
         self._docwords[docid] = WidCode.encode(wids)
+        try:
+            self.document_count.change(1)
+        except AttributeError:
+            # Upgrade document_count to Length object
+            self.document_count = Length(self.document_count())
         return len(wids)
 
     # A subclass may wish to extend or override this.  This is for adjusting
@@ -165,6 +176,11 @@
             self._del_wordinfo(wid, docid)
         del self._docwords[docid]
         del self._docweight[docid]
+        try:
+            self.document_count.change(-1)
+        except AttributeError:
+            # Upgrade document_count to Length object
+            self.document_count = Length(self.document_count())
 
     def search(self, term):
         wids = self._lexicon.termToWordIds(term)


=== Zope/lib/python/Products/ZCTextIndex/CosineIndex.py 1.22 => 1.23 ===
--- Zope/lib/python/Products/ZCTextIndex/CosineIndex.py:1.22	Tue May 28 19:42:20 2002
+++ Zope/lib/python/Products/ZCTextIndex/CosineIndex.py	Thu Jun  5 15:43:54 2003
@@ -69,7 +69,7 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))
+        N = float(self.document_count())
         L = []
         DictType = type({})
         for wid in wids:
@@ -86,7 +86,7 @@
         wids = []
         for term in terms:
             wids += self._lexicon.termToWordIds(term)
-        N = float(len(self._docweight))
+        N = float(self.document_count())
         sum = 0.0
         for wid in self._remove_oov_wids(wids):
             wt = inverse_doc_frequency(len(self._wordinfo[wid]), N)


=== Zope/lib/python/Products/ZCTextIndex/IIndex.py 1.11 => 1.12 ===
--- Zope/lib/python/Products/ZCTextIndex/IIndex.py:1.11	Wed Aug 14 18:25:14 2002
+++ Zope/lib/python/Products/ZCTextIndex/IIndex.py	Thu Jun  5 15:43:54 2003
@@ -20,6 +20,9 @@
     """Interface for an Index."""
 
     def length():
+        """Return the number of words in the index."""
+        
+    def document_count():
         """Return the number of documents in the index."""
 
     def get_words(docid):
@@ -62,10 +65,13 @@
         """
 
     def index_doc(docid, text):
-        "XXX"
+        """Add a document with the specified id and text to the index. If a
+        document by that id already exists, replace its text with the new
+        text provided
+        """
 
     def unindex_doc(docid):
-        "XXX"
+        """Remove the document with the specified id from the index"""
 
     def has_doc(docid):
         """Returns true if docid is an id of a document in the index"""


=== Zope/lib/python/Products/ZCTextIndex/Lexicon.py 1.18 => 1.19 ===
--- Zope/lib/python/Products/ZCTextIndex/Lexicon.py:1.18	Mon Dec  2 00:27:51 2002
+++ Zope/lib/python/Products/ZCTextIndex/Lexicon.py	Thu Jun  5 15:43:54 2003
@@ -16,6 +16,7 @@
 
 from BTrees.IOBTree import IOBTree
 from BTrees.OIBTree import OIBTree
+from BTrees.Length import Length
 
 import ZODB
 from Persistence import Persistent
@@ -37,16 +38,13 @@
         # we never saw before, and that isn't a known stopword (or otherwise
         # filtered out).  Returning a special wid value for OOV words is a
         # way to let clients know when an OOV word appears.
-        self._nextwid = 1
+        self.length = Length()
         self._pipeline = pipeline
 
-        # Keep some statistics about indexing
-        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
-        self._nwords = 0 # Number of words indexed (after pipeline)
-
     def length(self):
         """Return the number of unique terms in the lexicon."""
-        return self._nextwid - 1
+        # Overridden in instances
+        return len(self._wids)
 
     def words(self):
         return self._wids.keys()
@@ -59,11 +57,15 @@
 
     def sourceToWordIds(self, text):
         last = _text2list(text)
-        for t in last:
-            self._nbytes += len(t)
         for element in self._pipeline:
             last = element.process(last)
-        self._nwords += len(last)
+        if not hasattr(self.length, 'change'):
+            # Make sure length is overridden with a BTrees.Length.Length
+            self.length = Length(self.length())        
+        # Strategically unload the length value so that we get the most
+        # recent value written to the database to minimize conflicting wids
+        # XXX this will not work when MVCC is implemented in the ZODB...
+        self.length._p_deactivate()
         return map(self._getWordIdCreate, last)
 
     def termToWordIds(self, text):
@@ -138,9 +140,10 @@
         return wid
 
     def _new_wid(self):
-        wid = self._nextwid
-        self._nextwid += 1
-        return wid
+        self.length.change(1)
+        while self._words.has_key(self.length()): # just to be safe
+            self.length.change(1)
+        return self.length()
 
 def _text2list(text):
     # Helper: splitter input may be a string or a list of strings


=== Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py 1.29 => 1.30 ===
--- Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py:1.29	Wed May 29 16:47:44 2002
+++ Zope/lib/python/Products/ZCTextIndex/OkapiIndex.py	Thu Jun  5 15:43:54 2003
@@ -18,6 +18,7 @@
 # understand what's going on.
 
 from BTrees.IIBTree import IIBucket
+from BTrees.Length import Length
 
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex.BaseIndex import BaseIndex, \
@@ -50,20 +51,29 @@
         # sum(self._docweight.values()), the total # of words in all docs
         # This is a long for "better safe than sorry" reasons.  It isn't
         # used often enough that speed should matter.
-        self._totaldoclen = 0L
+        # Use a BTree.Length.Length object to avoid concurrent write conflicts
+        self._totaldoclen = Length(0L)
 
     def index_doc(self, docid, text):
         count = BaseIndex.index_doc(self, docid, text)
-        self._totaldoclen += count
+        self._change_doc_len(count)
         return count
 
     def _reindex_doc(self, docid, text):
-        self._totaldoclen -= self._docweight[docid]
+        self._change_doc_len(-self._docweight[docid])
         return BaseIndex._reindex_doc(self, docid, text)
 
     def unindex_doc(self, docid):
-        self._totaldoclen -= self._docweight[docid]
+        self._change_doc_len(-self._docweight[docid])
         BaseIndex.unindex_doc(self, docid)
+    
+    def _change_doc_len(self, delta):
+        # Change total doc length used for scoring
+        try:
+            self._totaldoclen.change(delta)
+        except AttributeError:
+            # Opportunistically upgrade _totaldoclen attribute to Length object
+            self._totaldoclen = Length(long(self._totaldoclen + delta))
 
     # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
     # for each wid t in wids.  The IIBucket, times the weight, maps D to
@@ -76,8 +86,13 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        N = float(self.document_count())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         K1 = self.K1
         B = self.B
         K1_plus1 = K1 + 1.0
@@ -120,8 +135,13 @@
     def _search_wids(self, wids):
         if not wids:
             return []
-        N = float(len(self._docweight))  # total # of docs
-        meandoclen = self._totaldoclen / N
+        N = float(self.document_count())  # total # of docs
+        try:
+            doclen = self._totaldoclen()
+        except TypeError:
+            # _totaldoclen has not yet been upgraded
+            doclen = self._totaldoclen
+        meandoclen = doclen / N
         #K1 = self.K1
         #B = self.B
         #K1_plus1 = K1 + 1.0


=== Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py 1.46 => 1.47 ===
--- Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py:1.46	Fri Feb 28 17:22:57 2003
+++ Zope/lib/python/Products/ZCTextIndex/ZCTextIndex.py	Thu Jun  5 15:43:54 2003
@@ -173,13 +173,11 @@
         if text is None:
             return 0
         count = self.index.index_doc(docid, text)
-        self._p_changed = 1 # XXX
         return count
 
     def unindex_object(self, docid):
         if self.index.has_doc(docid):
             self.index.unindex_doc(docid)
-            self._p_changed = 1 # XXX
 
     def _apply_index(self, request, cid=''):
         """Apply query specified by request, a mapping containing the query.