[Checkins] SVN: zope.index/trunk/ - Port okascore.c optimization used in okapiiindex from Zope2 catalog

Tue Jun 9 19:13:54 EDT 2009

Log message for revision 100775:
  - Port okascore.c optimization used in okapiiindex from Zope2 catalog
    implementation.  This module is compiled conditionally, based on
    whether your environment has a working C compiler.
  
  - Don't use ``len(self._docweight)`` in okapiindex _search_wids method
    (obtaining the length of a BTree is very expensive at scale).
    Instead use self.documentCount().  Also a Zope2 port.
  
  

Changed:
  U   zope.index/trunk/CHANGES.txt
  U   zope.index/trunk/setup.py
  _U  zope.index/trunk/src/zope/index/text/
  U   zope.index/trunk/src/zope/index/text/okapiindex.py
  A   zope.index/trunk/src/zope/index/text/okascore.c
  U   zope.index/trunk/src/zope/index/text/textindex.txt

-=-
Modified: zope.index/trunk/CHANGES.txt
===================================================================

--- zope.index/trunk/CHANGES.txt	2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/CHANGES.txt	2009-06-09 23:13:53 UTC (rev 100775)
@@ -4,8 +4,14 @@
 3.5.2 (unreleased)
 ------------------
 
-- ...
+- Port okascore.c optimization used in okapiiindex from Zope2 catalog
+  implementation.  This module is compiled conditionally, based on
+  whether your environment has a working C compiler.
 
+- Don't use ``len(self._docweight)`` in okapiindex _search_wids method
+  (obtaining the length of a BTree is very expensive at scale).
+  Instead use self.documentCount().  Also a Zope2 port.
+
 3.5.1 (2009-02-27)
 ------------------
 

Modified: zope.index/trunk/setup.py
===================================================================
--- zope.index/trunk/setup.py	2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/setup.py	2009-06-09 23:13:53 UTC (rev 100775)
@@ -15,12 +15,48 @@
 
 $Id$
 """
-from setuptools import setup, find_packages
+import sys
+import os
 
+from setuptools import setup, find_packages, Extension
+from distutils.command.build_ext import build_ext
+from distutils.errors import CCompilerError
+from distutils.errors import DistutilsExecError
+from distutils.errors import DistutilsPlatformError
+
 long_description = (open('README.txt').read() +
                     '\n\n' +
                     open('CHANGES.txt').read())
 
+class optional_build_ext(build_ext):
+    """This class subclasses build_ext and allows
+       the building of C extensions to fail.
+    """
+    def run(self):
+        try:
+            build_ext.run(self)
+        
+        except DistutilsPlatformError, e:
+            self._unavailable(e)
+
+    def build_extension(self, ext):
+       try:
+           build_ext.build_extension(self, ext)
+        
+       except (CCompilerError, DistutilsExecError), e:
+           self._unavailable(e)
+
+    def _unavailable(self, e):
+        print >> sys.stderr, '*' * 80
+        print >> sys.stderr, """WARNING:
+
+        An optional code optimization (C extension) could not be compiled.
+
+        Optimizations for this package will not be available!"""
+        print >> sys.stderr
+        print >> sys.stderr, e
+        print >> sys.stderr, '*' * 80
+
 setup(name='zope.index',
       version='3.5.2dev',
       url='http://pypi.python.org/pypi/zope.index',
@@ -37,5 +73,10 @@
                         'ZODB3>=3.8.0b1',
                         'zope.interface'],
       include_package_data = True,
+      ext_modules=[
+          Extension('zope.index.text.okascore',
+              [os.path.join('src', 'zope', 'index', 'text', 'okascore.c')]),
+      ],
       zip_safe=False,
+      cmdclass = {'build_ext':optional_build_ext},
       )


Property changes on: zope.index/trunk/src/zope/index/text
___________________________________________________________________
Added: svn:ignore
   + *.so


Modified: zope.index/trunk/src/zope/index/text/okapiindex.py
===================================================================
--- zope.index/trunk/src/zope/index/text/okapiindex.py	2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/src/zope/index/text/okapiindex.py	2009-06-09 23:13:53 UTC (rev 100775)
@@ -193,6 +193,10 @@
 """
 from zope.index.text.baseindex import BaseIndex
 from zope.index.text.baseindex import inverse_doc_frequency
+try:
+    from zope.index.text.okascore import score
+except ImportError:
+    score = None
 from BTrees.Length import Length
 
 class OkapiIndex(BaseIndex):
@@ -251,84 +255,87 @@
     # D to TF(D,t)*IDF(t) directly, where the product is computed as a float.
     # NOTE:  This may be overridden below, by a function that computes the
     # same thing but with the inner scoring loop in C.
-    def _search_wids(self, wids):
-        if not wids:
-            return []
-        N = float(len(self._docweight))  # total # of docs
-        try:
-            doclen = self._totaldoclen()
-        except TypeError:
-            # _totaldoclen has not yet been upgraded
-            doclen = self._totaldoclen
-        meandoclen = doclen / N
-        K1 = self.K1
-        B = self.B
-        K1_plus1 = K1 + 1.0
-        B_from1 = 1.0 - B
+    if score is None:
+        def _search_wids(self, wids):
+            if not wids:
+                return []
+            N = float(self.documentCount())  # total # of docs
+            try:
+                doclen = self._totaldoclen()
+            except TypeError:
+                # _totaldoclen has not yet been upgraded
+                doclen = self._totaldoclen
+            meandoclen = doclen / N
+            K1 = self.K1
+            B = self.B
+            K1_plus1 = K1 + 1.0
+            B_from1 = 1.0 - B
 
-        #                           f(D, t) * (k1 + 1)
-        #   TF(D, t) =  -------------------------------------------
-        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+            #                           f(D, t) * (k1 + 1)
+            #   TF(D, t) =  -------------------------------------------
+            #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
 
-        L = []
-        docid2len = self._docweight
-        for t in wids:
-            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
-            result = self.family.IF.Bucket()
-            for docid, f in d2f.items():
-                lenweight = B_from1 + B * docid2len[docid] / meandoclen
-                tf = f * K1_plus1 / (f + K1 * lenweight)
-                result[docid] = tf * idf
-            L.append((result, 1))
-        return L
+            L = []
+            docid2len = self._docweight
+            for t in wids:
+                d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+                idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+                result = self.family.IF.Bucket()
+                for docid, f in d2f.items():
+                    lenweight = B_from1 + B * docid2len[docid] / meandoclen
+                    tf = f * K1_plus1 / (f + K1 * lenweight)
+                    result[docid] = tf * idf
+                L.append((result, 1))
+            return L
 
-        # Note about the above:  the result is tf * idf.  tf is small -- it
-        # can't be larger than k1+1 = 2.2.  idf is formally unbounded, but
-        # is less than 14 for a term that appears in only 1 of a million
-        # documents.  So the product is probably less than 32, or 5 bits
-        # before the radix point.  If we did the scaled-int business on
-        # both of them, we'd be up to 25 bits.  Add 64 of those and we'd
-        # be in overflow territory.  That's pretty unlikely, so we *could*
-        # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
-        # as an invariant weight across the whole result.  But besides
-        # skating near the edge, it's not a speed cure, since the computation
-        # of tf would still be done at Python speed, and it's a lot more
-        # work than just multiplying by idf.
+            # Note about the above: the result is tf * idf.  tf is
+            # small -- it can't be larger than k1+1 = 2.2.  idf is
+            # formally unbounded, but is less than 14 for a term that
+            # appears in only 1 of a million documents.  So the
+            # product is probably less than 32, or 5 bits before the
+            # radix point.  If we did the scaled-int business on both
+            # of them, we'd be up to 25 bits.  Add 64 of those and
+            # we'd be in overflow territory.  That's pretty unlikely,
+            # so we *could* just store scaled_int(tf) in
+            # result[docid], and use scaled_int(idf) as an invariant
+            # weight across the whole result.  But besides skating
+            # near the edge, it's not a speed cure, since the
+            # computation of tf would still be done at Python speed,
+            # and it's a lot more work than just multiplying by idf.
+    else:
+        # The same function as _search_wids above, but with the inner scoring
+        # loop written in C (module okascore, function score()).
+        # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
+        # function.
+        def _search_wids(self, wids):
+            if not wids:
+                return []
+            N = float(self.documentCount())  # total # of docs
+            try:
+                doclen = self._totaldoclen()
+            except TypeError:
+                # _totaldoclen has not yet been upgraded
+                doclen = self._totaldoclen
+            meandoclen = doclen / N
+            #K1 = self.K1
+            #B = self.B
+            #K1_plus1 = K1 + 1.0
+            #B_from1 = 1.0 - B
 
-    # The same function as _search_wids above, but with the inner scoring
-    # loop written in C (module okascore, function score()).
-    # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
-    # function.
-    def _search_wids_NOTYET(self, wids):
-        if not wids:
-            return []
-        N = float(len(self._docweight))  # total # of docs
-        try:
-            doclen = self._totaldoclen()
-        except TypeError:
-            # _totaldoclen has not yet been upgraded
-            doclen = self._totaldoclen
-        meandoclen = doclen / N
-        #K1 = self.K1
-        #B = self.B
-        #K1_plus1 = K1 + 1.0
-        #B_from1 = 1.0 - B
+            #                           f(D, t) * (k1 + 1)
+            #   TF(D, t) =  -------------------------------------------
+            #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
 
-        #                           f(D, t) * (k1 + 1)
-        #   TF(D, t) =  -------------------------------------------
-        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+            L = []
+            docid2len = self._docweight
+            for t in wids:
+                d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+                idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+                result = self.family.IF.Bucket()
+                score(result, d2f.items(), docid2len, idf, meandoclen)
+                L.append((result, 1))
+            return L
 
-        L = []
-        docid2len = self._docweight
-        for t in wids:
-            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
-            result = self.family.IF.Bucket()
-            score(result, d2f.items(), docid2len, idf, meandoclen)
-            L.append((result, 1))
-        return L
-
     def query_weight(self, terms):
         # Get the wids.
         wids = []

Added: zope.index/trunk/src/zope/index/text/okascore.c
===================================================================
--- zope.index/trunk/src/zope/index/text/okascore.c	                        (rev 0)
+++ zope.index/trunk/src/zope/index/text/okascore.c	2009-06-09 23:13:53 UTC (rev 100775)
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+  Copyright (c) 2002 Zope Corporation and Contributors.
+  All Rights Reserved.
+
+  This software is subject to the provisions of the Zope Public License,
+  Version 2.1 (ZPL).  A copy of the ZPL should accompany this distribution.
+  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+  FOR A PARTICULAR PURPOSE
+
+ ****************************************************************************/
+
+/*	okascore.c
+ *
+ *	The inner scoring loop of OkapiIndex._search_wids() coded in C.
+ *
+ * Example from an indexed Python-Dev archive, where "python" shows up in all
+ * but 2 of the 19,058 messages.  With the Python scoring loop,
+ *
+ *      query: python
+ *      # results: 10 of 19056 in 534.77 ms
+ *      query: python
+ *      # results: 10 of 19056 in 277.52 ms
+ *
+ * The first timing is cold, the second timing from an immediate repeat of
+ * the same query.  With the scoring loop here in C:
+ *
+ *     query: python
+ *     # results: 10 of 19056 in 380.74 ms  -- 40% speedup
+ *     query: python
+ *     # results: 10 of 19056 in 118.96 ms  -- 133% speedup
+ */
+
+#include "Python.h"
+
+#define K1 1.2
+#define B  0.75
+
+#ifndef PyTuple_CheckExact
+#define PyTuple_CheckExact PyTuple_Check
+#endif
+
+static PyObject *
+score(PyObject *self, PyObject *args)
+{
+	/* Believe it or not, floating these common subexpressions "by hand"
+	   gets better code out of MSVC 6. */
+	const double B_FROM1 = 1.0 - B;
+	const double K1_PLUS1 = K1 + 1.0;
+
+	/* Inputs */
+	PyObject *result;	/* IIBucket result, maps d to score */
+	PyObject *d2fitems;	/* ._wordinfo[t].items(), maps d to f(d, t) */
+	PyObject *d2len;	/* ._docweight, maps d to # words in d */
+	double idf;		/* inverse doc frequency of t */
+	double meandoclen;	/* average number of words in a doc */
+
+	int n, i;
+
+	if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
+						   &idf, &meandoclen))
+		return NULL;
+
+	idf *= 1024.0;	/* float out part of the scaled_int computation */
+	n = PyObject_Length(d2fitems);
+	for (i = 0; i < n; ++i) {
+		PyObject *d_and_f;	/* d2f[i], a (d, f) pair */
+		PyObject *d;
+		double f;
+		PyObject *doclen;	/* ._docweight[d] */
+		double lenweight;
+		double tf;
+		PyObject *scaled_int;
+		int status;
+
+		d_and_f = PySequence_GetItem(d2fitems, i);
+		if (d_and_f == NULL)
+			return NULL;
+		if (!(PyTuple_CheckExact(d_and_f) &&
+		      PyTuple_GET_SIZE(d_and_f) == 2)) {
+			PyErr_SetString(PyExc_TypeError,
+				"d2fitems must produce 2-item tuples");
+			Py_DECREF(d_and_f);
+			return NULL;
+		}
+		d = PyTuple_GET_ITEM(d_and_f, 0);
+		f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
+
+		doclen = PyObject_GetItem(d2len, d);
+		if (doclen == NULL) {
+			Py_DECREF(d_and_f);
+			return NULL;
+		}
+		lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;
+
+		tf = f * K1_PLUS1 / (f + K1 * lenweight);
+		scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
+		if (scaled_int == NULL)
+			status = -1;
+		else
+			status = PyObject_SetItem(result, d, scaled_int);
+		Py_DECREF(d_and_f);
+		Py_DECREF(doclen);
+		Py_XDECREF(scaled_int);
+		if (status < 0)
+			return NULL;
+	}
+	Py_INCREF(Py_None);
+	return Py_None;
+}
+
+static char score__doc__[] =
+"score(result, d2fitems, d2len, idf, meandoclen)\n"
+"\n"
+"Do the inner scoring loop for an Okapi index.\n";
+
+static PyMethodDef okascore_functions[] = {
+	{"score",	   score,	  METH_VARARGS, score__doc__},
+	{NULL}
+};
+
+void
+initokascore(void)
+{
+	PyObject *m;
+
+	m = Py_InitModule3("okascore", okascore_functions,
+			    "inner scoring loop for Okapi rank");
+}

Modified: zope.index/trunk/src/zope/index/text/textindex.txt
===================================================================
--- zope.index/trunk/src/zope/index/text/textindex.txt	2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/src/zope/index/text/textindex.txt	2009-06-09 23:13:53 UTC (rev 100775)
@@ -6,6 +6,7 @@
 any arguments:
 
     >>> from zope.index.text.textindex import TextIndex
+    >>> from zope.index.text.okapiindex import score
     >>> index = TextIndex()
 
 By default, it uses an "Okapi" inverted index and a lexicon with a
@@ -65,13 +66,18 @@
     ... """)
 
 Then we can search using the apply method, which takes a search
-string:
+string.  If we use the okascore.c module, we get different results
+because it uses different constants:
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()]
-    [(1, '0.6153'), (2, '0.6734')]
+    >>> if score is not None: result = [(1, '787.6832'), (2, '839.0846')]
+    >>> if score is None: result = [(1, '0.6153'), (2, '0.6734')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()] == result
+    True
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()]
-    [(1, '0.6153')]
+    >>> if score is not None: result = [(1, '787.7337')]
+    >>> if score is None: result = [(1, '0.6153')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()] == result
+    True
 
     >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown python').items()]
     []
@@ -79,27 +85,41 @@
     >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'dalmatian').items()]
     []
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()]
-    [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
+    >>> if score is not None: result = [(1, '333.0285'), (2, '333.0285'), (8, '454.7052')]
+    >>> if score is None: result = [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()] == result
+    True
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()]
-    [(7, '0.6948')]
+    >>> from zope.index.text.okapiindex import score
+    >>> if score is not None: result = [(7, '787.7707')]
+    >>> if score is None: result =  [(7, '0.6948')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()] == result
+    True
 
 The outputs are mappings from document ids to integer scored. Items
 with higher scores are more relevent.
 
-We can use unicode characters in search strings:
+We can use unicode characters in search strings.  If we use the
+okascore.c module, we get different results because it uses different
+constants:
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()]
-    [(4, '0.7427')]
+    >>> if score is not None: result = [(4, '787.7707')]
+    >>> if score is None: result = [(4, '0.7427')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()] == result
+    True
 
-    >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()]
-    [(5, '0.7179')]
+    >>> if score is not None: result = [(5, '787.7707')]
+    >>> if score is None: result = [(5, '0.7179')]
+    >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()] == result
+    True
 
-We can use globbing in search strings:
+We can use globbing in search strings.  If we use the okascore.c
+module, we get different results because it uses different constants:
 
-    >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()]
-    [(1, '2.179'), (2, '2.651'), (3, '2.041')]
+    >>> if score is not None: result = [(1, '2789.000'), (2, '3153.000'), (3, '3808.000')]
+    >>> if score is None: result = [(1, '2.179'), (2, '2.651'), (3, '2.041')]
+    >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()] == result
+    True
 
 Text indexes support basic statistics:
 
@@ -115,10 +135,15 @@
     >>> index2.index_doc(1, [])
     >>> index2.index_doc(1, ["Zorro"])
 
-    >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()]
-    [(1, '0.4545')]
+If we use the okascore.c module, we get different results because it
+uses different constants:
 
+    >>> if score is not None: result = [(1, '787.5803')]
+    >>> if score is None: result = [(1, '0.4545')]
+    >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()] == result
+    True
 
+
 Tracking Changes
 ================