[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.8 CosineIndex.py:1.17 OkapiIndex.py:1.22

Tim Peters tim.one@comcast.net
Fri, 17 May 2002 02:56:01 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv20114

Modified Files:
	BaseIndex.py CosineIndex.py OkapiIndex.py 
Log Message:
Factor out most of the code for indexing a doc.  The cosine index may
take longer to construct now; both indexers' _get_frequencies routines
were fiddled to return the same kind of stuff again, and I had
previously fiddled the cosine indexer's _get_frequencies to do something
weirder but (probably) faster than this.


=== Products/ZCTextIndex/BaseIndex.py 1.7 => 1.8 ===
         return WidCode.decode(self._docwords[docid])
 
-    # Subclass must override.
+    # A subclass may wish to extend or override this.
     def index_doc(self, docid, text):
+        # XXX If docid is already known, do something smart.
+        wids = self._lexicon.sourceToWordIds(text)
+        wid2weight, docweight = self._get_frequencies(wids)
+        for wid, weight in wid2weight.items():
+            self._add_wordinfo(wid, weight, docid)
+        self._docweight[docid] = docweight
+        self._docwords[docid] = WidCode.encode(wids)
+        return len(wids)
+
+    # Subclass must override.
+    def _get_frequencies(self, wids):
+        # Compute term frequencies and a doc weight, whatever those mean
+        # to an indexer.
+        # Return pair:
+        #    {wid0: w(d, wid0), wid1: w(d, wid1),  ...},
+        #    docweight
+        # The wid->weight mappings are fed into _add_wordinfo, and docweight
+        # becomes the value of _docweight[docid].
         raise NotImplementedError
 
     # A subclass may wish to extend or override this.


=== Products/ZCTextIndex/CosineIndex.py 1.16 => 1.17 ===
     #        computed by self.query_weight()
 
-    def index_doc(self, docid, text):
-        wids = self._lexicon.sourceToWordIds(text)
-        uniqwids, freqs, docweight = self._get_frequencies(wids)
-        for i in range(len(uniqwids)):
-            self._add_wordinfo(uniqwids[i], freqs[i], docid)
-        self._docweight[docid] = docweight
-        self._docwords[docid] = WidCode.encode(wids)
-        return len(wids)
-
     def _search_wids(self, wids):
         if not wids:
             return []
@@ -111,30 +102,22 @@
         return scaled_int(math.sqrt(sum))
 
     def _get_frequencies(self, wids):
-        """Return individual doc-term weights and docweight."""
-        # Computes w(d, t) for each term, and W(d).
-        # Return triple:
-        #    [wid0, wid1, ...],
-        #    [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
-        #    W(d)
-        # The second list and W(d) are scaled_ints.
         d = {}
+        dget = d.get
         for wid in wids:
-            d[wid] = d.get(wid, 0) + 1
+            d[wid] = dget(wid, 0) + 1
         Wsquares = 0.0
-        weights = []
-        push = weights.append
-        for count in d.values():
+        for wid, count in d.items():
             w = doc_term_weight(count)
             Wsquares += w * w
-            push(w)
+            d[wid] = w
         W = math.sqrt(Wsquares)
         #print "W = %.3f" % W
-        for i in xrange(len(weights)):
-            #print i, ":", "%.3f" % weights[i],
-            weights[i] = scaled_int(weights[i] / W)
-            #print "->", weights[i]
-        return d.keys(), weights, scaled_int(W)
+        for wid, weight in d.items():
+            #print wid, ":", "%.3f" % weight,
+            d[wid] = scaled_int(weight / W)
+            #print "->", d[wid]
+        return d, scaled_int(W)
 
     # The rest are helper methods to support unit tests
 


=== Products/ZCTextIndex/OkapiIndex.py 1.21 => 1.22 ===
 
     def index_doc(self, docid, text):
-        wids = self._lexicon.sourceToWordIds(text)
-        self._docweight[docid] = len(wids)
-        self._totaldoclen += len(wids)
-
-        wid2count = self._get_frequencies(wids)
-        for wid, count in wid2count.items():
-            self._add_wordinfo(wid, count, docid)
-
-        self._docwords[docid] = WidCode.encode(wids)
-        return len(wids)
+        count = BaseIndex.index_doc(self, docid, text)
+        self._totaldoclen += count
 
     def unindex_doc(self, docid):
         self._totaldoclen -= self._docweight[docid]
@@ -125,15 +117,11 @@
         return 10   # arbitrary
 
     def _get_frequencies(self, wids):
-        """Return individual term frequencies."""
-        # Computes f(d, t) for each term.
-        # Returns a dict mapping wid to the number of times wid appeares
-        # in wids, {t -> f(d, t)}
         d = {}
         dget = d.get
         for wid in wids:
             d[wid] = dget(wid, 0) + 1
-        return d
+        return d, len(wids)
 
 """
 "Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.