[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.5 CosineIndex.py:1.13 IIndex.py:1.3 OkapiIndex.py:1.19

Tim Peters tim.one@comcast.net
Fri, 17 May 2002 02:03:19 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7484

Modified Files:
	BaseIndex.py CosineIndex.py IIndex.py OkapiIndex.py 
Log Message:
Combine _{add,del}wordinfo; make length() part of IIndex.


=== Products/ZCTextIndex/BaseIndex.py 1.4 => 1.5 ===
         return 10   # arbitrary
 
-    def _get_frequencies(self, wids):
-        """Return individual term frequencies."""
-        # Computes f(d, t) for each term.
-        # Returns a dict mapping wid to the number of times wid appeares
-        # in wids, {t -> f(d, t)}
-        d = {}
-        dget = d.get
-        for wid in wids:
-            d[wid] = dget(wid, 0) + 1
-        return d
-
     DICT_CUTOFF = 10
 
     def _add_wordinfo(self, wid, f, docid):


=== Products/ZCTextIndex/CosineIndex.py 1.12 => 1.13 ===
         return d.keys(), weights, scaled_int(W)
 
-    DICT_CUTOFF = 10
-
-    def _add_wordinfo(self, wid, f, docid):
-        # Store a wordinfo in a dict as long as there are less than
-        # DICT_CUTOFF docids in the dict.  Otherwise use an IIBTree.
-
-        # The pickle of a dict is smaller than the pickle of an
-        # IIBTree, substantially so for small mappings.  Thus, we use
-        # a dictionary until the mapping reaches DICT_CUTOFF elements.
-
-        # The cutoff is chosen based on the implementation
-        # characteristics of Python dictionaries.  The dict hashtable
-        # always has 2**N slots and is resized whenever it is 2/3s
-        # full.  A pickled dict with 10 elts is half the size of an
-        # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
-        # choose 10 as the cutoff for now.
-
-        # The IIBTree has a smaller in-memory representation than a
-        # dictionary, so pickle size isn't the only consideration when
-        # choosing the threshold.  The pickle of a 500-elt dict is 92%
-        # of the size of the same IIBTree, but the dict uses more
-        # space when it is live in memory.  An IIBTree stores two C
-        # arrays of ints, one for the keys and one for the values.  It
-        # holds upto 120 key-value pairs in a single bucket.
-        try:
-            map = self._wordinfo[wid]
-        except KeyError:
-            map = {}
-        else:
-            # _add_wordinfo() is called for each update.  If the map
-            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
-            if len(map) == self.DICT_CUTOFF:
-                map = IIBTree(map)
-        map[docid] = f
-        self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
-    def _del_wordinfo(self, wid, docid):
-        try:
-            map = self._wordinfo[wid]
-            del map[docid]
-        except KeyError:
-            return
-        if len(map) == 0:
-            del self._wordinfo[wid]
-            return
-        if len(map) == self.DICT_CUTOFF:
-            new = {}
-            for k, v in map.items():
-                new[k] = v
-            map = new
-        self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
     def _add_undoinfo(self, docid, wids):
         self._docwords[docid] = WidCode.encode(wids)
 


=== Products/ZCTextIndex/IIndex.py 1.2 => 1.3 ===
     """Interface for an Index."""
 
+    def length():
+        """Return the number of documents in the index."""
+
     def search(term):
         """Execute a search on a single term given as a string.
 


=== Products/ZCTextIndex/OkapiIndex.py 1.18 => 1.19 ===
         return d
 
-    DICT_CUTOFF = 10
-
-    def _add_wordinfo(self, wid, f, docid):
-        # Store a wordinfo in a dict as long as there are less than
-        # DICT_CUTOFF docids in the dict.  Otherwise use an IIBTree.
-
-        # The pickle of a dict is smaller than the pickle of an
-        # IIBTree, substantially so for small mappings.  Thus, we use
-        # a dictionary until the mapping reaches DICT_CUTOFF elements.
-
-        # The cutoff is chosen based on the implementation
-        # characteristics of Python dictionaries.  The dict hashtable
-        # always has 2**N slots and is resized whenever it is 2/3s
-        # full.  A pickled dict with 10 elts is half the size of an
-        # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4.  So
-        # choose 10 as the cutoff for now.
-
-        # The IIBTree has a smaller in-memory representation than a
-        # dictionary, so pickle size isn't the only consideration when
-        # choosing the threshold.  The pickle of a 500-elt dict is 92%
-        # of the size of the same IIBTree, but the dict uses more
-        # space when it is live in memory.  An IIBTree stores two C
-        # arrays of ints, one for the keys and one for the values.  It
-        # holds upto 120 key-value pairs in a single bucket.
-        try:
-            map = self._wordinfo[wid]
-        except KeyError:
-            map = {}
-        else:
-            # _add_wordinfo() is called for each update.  If the map
-            # size exceeds the DICT_CUTOFF, convert to an IIBTree.
-            if len(map) == self.DICT_CUTOFF:
-                map = IIBTree(map)
-        map[docid] = f
-        self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
-    def _del_wordinfo(self, wid, docid):
-        try:
-            map = self._wordinfo[wid]
-            del map[docid]
-        except KeyError:
-            return
-        if len(map) == 0:
-            del self._wordinfo[wid]
-            return
-        if len(map) == self.DICT_CUTOFF:
-            new = {}
-            for k, v in map.items():
-                new[k] = v
-            map = new
-        self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
 """
 "Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
 It's based on probability arguments about how words are distributed in