[Zope-CVS] CVS: Products/ZCTextIndex - CosineIndex.py:1.11 OkapiIndex.py:1.17

Tim Peters tim.one@comcast.net
Fri, 17 May 2002 01:50:47 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv4501

Modified Files:
	CosineIndex.py OkapiIndex.py 
Log Message:
Compute inverse doc frequency the same way everywhere.


=== Products/ZCTextIndex/CosineIndex.py 1.10 => 1.11 ===
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
-from Products.ZCTextIndex.BaseIndex import BaseIndex
+from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
 from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
                                         mass_weightedUnion
 
@@ -77,7 +77,7 @@
     #        self._wordinfo[t] is a map from d to w(d, t).
     #
     #    w(q, t) = log(1 + N/f(t))
-    #        computed by query_term_weight()
+    #        computed by inverse_doc_frequency()
     #
     #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
     #        computed by _get_frequencies(), and remembered in
@@ -110,7 +110,7 @@
         for wid in wids:
             assert self._wordinfo.has_key(wid)  # caller responsible for OOV
             d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
-            idf = query_term_weight(len(d2w), N)  # this is an unscaled float
+            idf = inverse_doc_frequency(len(d2w), N)  # this is an unscaled float
             #print "idf = %.3f" % idf
             if isinstance(d2w, DictType):
                 d2w = IIBucket(d2w)
@@ -237,12 +237,3 @@
     """Return the doc-term weight for a term that appears count times."""
     # implements w(d, t) = 1 + log f(d, t)
     return 1.0 + math.log(count)
-
-def query_term_weight(term_count, num_items):
-    """Return the query-term weight for a term,
-
-    that appears in term_count items in a collection with num_items
-    total items.
-    """
-    # implements w(q, t) = log(1 + N/f(t))
-    return math.log(1.0 + float(num_items) / term_count)


=== Products/ZCTextIndex/OkapiIndex.py 1.16 => 1.17 ===
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
-from Products.ZCTextIndex.BaseIndex import BaseIndex
+from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
 from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
                                         mass_weightedUnion
 
@@ -211,15 +211,6 @@
                 new[k] = v
             map = new
         self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
-def inverse_doc_frequency(term_count, num_items):
-    """Return the inverse doc frequency for a term,
-
-    that appears in term_count items in a collection with num_items
-    total items.
-    """
-    # implements IDF(q, t) = log(1 + N/f(t))
-    return math.log(1.0 + float(num_items) / term_count)
 
 """
 "Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.