[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.29

Tim Peters tim.one@comcast.net
Fri, 3 May 2002 18:17:38 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv25562

Modified Files:
      Tag: TextIndexDS9-branch
	Index.py 
Log Message:
In the maps storing (docid, weight) pairs, the weights have been stored as
scaled ints representing 1.+log(count).  This checkin changes the weights to
store the count directly instead when it's < 256.  This requires
compensating hair upon both storing and loading weights.  The payoff
is that virtually all counts are < 256, and a binary pickle can save
such a count in 1 byte, while a weight generally consumes 2 bytes.

Hmm.  Because a pickle stores a typecode too, it's really a reduction
from 3 bytes to 2 for most weights.  Maybe it's not worth the bother --
it does slow _get_frequencies by introducing irregularities.


=== Products/ZCTextIndex/Index.py 1.1.2.28 => 1.1.2.29 ===
 from Products.ZCTextIndex.IIndex import IIndex
 
-def scaled_int(f):
-    return int(round(f * 1024.))
+# Instead of storing floats, we generally store scaled ints.  Binary pickles
+# can store those more efficiently.  The default SCALE_FACTOR of 1024
+# is large enough to get about 3 decimal digits of fractional info, and
+# small enough so that scaled values should almost always fit in a signed
+# 16-bit int (we're generally storing logs, so a few bits before the radix
+# point goes a long way; on the flip side, for reasonably small numbers x
+# most of the info in log(x) is in the fractional bits, so we do want to
+# save a lot of those).
+SCALE_FACTOR = 1024.0
+
+def scaled_int(f, scale=SCALE_FACTOR):
+    return int(round(f * scale))
 
 class Index:
     __implements__ = IIndex
@@ -60,7 +70,9 @@
     #
     #    w(d, t) = 1 + log f(d, t)
     #        computed by doc_term_weight(); for a given word t,
-    #        self._wordinfo[t] is a map from d to w(d, t)
+    #        self._wordinfo[t] is a map from d to w(d, t).  To save space
+    #        in binary pickles, we actually store f(d, t) if that's < 256.
+    #        In that case, WDT[f(d, t)] retrieves the correct w(d, t).
     #
     #    w(q, t) = log(1 + N/f(t))
     #        computed by query_term_weight()
@@ -95,6 +107,8 @@
             d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
             idf = query_term_weight(len(d2w), N)  # this is an unscaled float
             for docid, tf in d2w.items():
+                if tf < 256:
+                    tf = WDT[tf]
                 # scaled int * unscaled float / scaled int -> unscaled float
                 w = tf * idf / self._docweight[docid]
                 result[docid] = result.get(docid, 0) + scaled_int(w)
@@ -115,18 +129,23 @@
         """Return individual doc-term weights and docweight."""
         # computes w(d, t) for each term, and W(d)
         # return triple
-        #    [wid0, wid1, ...]
-        #    [w(d, wid0), w(d, wid1), ...],
+        #    [wid0, wid1, ...].
+        #    an encoded form of [w(d, wid0), w(d, wid1), ...],
         #    W(d)
         d = {}
         for wid in wids:
             d[wid] = d.get(wid, 0) + 1
         Wsquares = 0.
-        freqs = map(doc_term_weight, d.values())
-        for f in freqs:
-            Wsquares += f * f
-        freqs = map(scaled_int, freqs)
-        return d.keys(), freqs, scaled_int(math.sqrt(Wsquares))
+        weights = []
+        push = weights.append
+        for count in d.values():
+            w = doc_term_weight(count)
+            Wsquares += w * w
+            if count < 256:
+                push(count)         # 1 byte in a binary pickle
+            else:
+                push(scaled_int(w)) # 2 bytes in a binary pickle, and rare
+        return d.keys(), weights, scaled_int(math.sqrt(Wsquares))
 
     DICT_CUTOFF = 10
 
@@ -176,7 +195,13 @@
     def _get_wdt(self, d, t):
         wid, = self._lexicon.termToWordIds(t)
         map = self._wordinfo[wid]
-        return map.get(d, 0)
+        w = map.get(d, 0)
+        if w == 0:
+            return 0
+        elif w < 256:
+            return WDT[w]
+        else:
+            return w
 
     def _get_Wd(self, d):
         return self._docweight[d]
@@ -203,3 +228,16 @@
     """
     # implements w(q, t) = log(1 + N/f(t))
     return math.log(1. + float(num_items) / term_count)
+
+# For 0 < i < 256, WDT[i] is the integer scaled value of doc_term_weight(i).
+# WDT[0] is None, because doc_term_weight(0) should never happen, and
+# returning None then should quickly lead to an exception.
+WDT = [scaled_int(doc_term_weight(i)) for i in range(1, 256)]
+del i
+WDT.insert(0, None)
+
+# Upon reading up a "w(d, t)" value, we need to know whether it's a true
+# scaled w(d,t), or a raw count that must be used to index WDT[] to get
+# the true scaled w(d,t).  The key is that for raw counts >= 256, their
+# scaled w(d,t) values are greater than 255.
+assert scaled_int(doc_term_weight(256)) > 255