[Checkins] SVN: zope.index/trunk/ - Port okascore.c optimization used in okapiiindex from Zope2 catalog
Chris McDonough
chrism at plope.com
Tue Jun 9 19:13:54 EDT 2009
Log message for revision 100775:
- Port okascore.c optimization used in okapiiindex from Zope2 catalog
implementation. This module is compiled conditionally, based on
whether your environment has a working C compiler.
- Don't use ``len(self._docweight)`` in okapiindex _search_wids method
(obtaining the length of a BTree is very expensive at scale).
Instead use self.documentCount(). Also a Zope2 port.
Changed:
U zope.index/trunk/CHANGES.txt
U zope.index/trunk/setup.py
_U zope.index/trunk/src/zope/index/text/
U zope.index/trunk/src/zope/index/text/okapiindex.py
A zope.index/trunk/src/zope/index/text/okascore.c
U zope.index/trunk/src/zope/index/text/textindex.txt
-=-
Modified: zope.index/trunk/CHANGES.txt
===================================================================
--- zope.index/trunk/CHANGES.txt 2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/CHANGES.txt 2009-06-09 23:13:53 UTC (rev 100775)
@@ -4,8 +4,14 @@
3.5.2 (unreleased)
------------------
-- ...
+- Port okascore.c optimization used in okapiiindex from Zope2 catalog
+ implementation. This module is compiled conditionally, based on
+ whether your environment has a working C compiler.
+- Don't use ``len(self._docweight)`` in okapiindex _search_wids method
+ (obtaining the length of a BTree is very expensive at scale).
+ Instead use self.documentCount(). Also a Zope2 port.
+
3.5.1 (2009-02-27)
------------------
Modified: zope.index/trunk/setup.py
===================================================================
--- zope.index/trunk/setup.py 2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/setup.py 2009-06-09 23:13:53 UTC (rev 100775)
@@ -15,12 +15,48 @@
$Id$
"""
-from setuptools import setup, find_packages
+import sys
+import os
+from setuptools import setup, find_packages, Extension
+from distutils.command.build_ext import build_ext
+from distutils.errors import CCompilerError
+from distutils.errors import DistutilsExecError
+from distutils.errors import DistutilsPlatformError
+
long_description = (open('README.txt').read() +
'\n\n' +
open('CHANGES.txt').read())
+class optional_build_ext(build_ext):
+ """This class subclasses build_ext and allows
+ the building of C extensions to fail.
+ """
+ def run(self):
+ try:
+ build_ext.run(self)
+
+ except DistutilsPlatformError, e:
+ self._unavailable(e)
+
+ def build_extension(self, ext):
+ try:
+ build_ext.build_extension(self, ext)
+
+ except (CCompilerError, DistutilsExecError), e:
+ self._unavailable(e)
+
+ def _unavailable(self, e):
+ print >> sys.stderr, '*' * 80
+ print >> sys.stderr, """WARNING:
+
+ An optional code optimization (C extension) could not be compiled.
+
+ Optimizations for this package will not be available!"""
+ print >> sys.stderr
+ print >> sys.stderr, e
+ print >> sys.stderr, '*' * 80
+
setup(name='zope.index',
version='3.5.2dev',
url='http://pypi.python.org/pypi/zope.index',
@@ -37,5 +73,10 @@
'ZODB3>=3.8.0b1',
'zope.interface'],
include_package_data = True,
+ ext_modules=[
+ Extension('zope.index.text.okascore',
+ [os.path.join('src', 'zope', 'index', 'text', 'okascore.c')]),
+ ],
zip_safe=False,
+ cmdclass = {'build_ext':optional_build_ext},
)
Property changes on: zope.index/trunk/src/zope/index/text
___________________________________________________________________
Added: svn:ignore
+ *.so
Modified: zope.index/trunk/src/zope/index/text/okapiindex.py
===================================================================
--- zope.index/trunk/src/zope/index/text/okapiindex.py 2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/src/zope/index/text/okapiindex.py 2009-06-09 23:13:53 UTC (rev 100775)
@@ -193,6 +193,10 @@
"""
from zope.index.text.baseindex import BaseIndex
from zope.index.text.baseindex import inverse_doc_frequency
+try:
+ from zope.index.text.okascore import score
+except ImportError:
+ score = None
from BTrees.Length import Length
class OkapiIndex(BaseIndex):
@@ -251,84 +255,87 @@
# D to TF(D,t)*IDF(t) directly, where the product is computed as a float.
# NOTE: This may be overridden below, by a function that computes the
# same thing but with the inner scoring loop in C.
- def _search_wids(self, wids):
- if not wids:
- return []
- N = float(len(self._docweight)) # total # of docs
- try:
- doclen = self._totaldoclen()
- except TypeError:
- # _totaldoclen has not yet been upgraded
- doclen = self._totaldoclen
- meandoclen = doclen / N
- K1 = self.K1
- B = self.B
- K1_plus1 = K1 + 1.0
- B_from1 = 1.0 - B
+ if score is None:
+ def _search_wids(self, wids):
+ if not wids:
+ return []
+ N = float(self.documentCount()) # total # of docs
+ try:
+ doclen = self._totaldoclen()
+ except TypeError:
+ # _totaldoclen has not yet been upgraded
+ doclen = self._totaldoclen
+ meandoclen = doclen / N
+ K1 = self.K1
+ B = self.B
+ K1_plus1 = K1 + 1.0
+ B_from1 = 1.0 - B
- # f(D, t) * (k1 + 1)
- # TF(D, t) = -------------------------------------------
- # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+ # f(D, t) * (k1 + 1)
+ # TF(D, t) = -------------------------------------------
+ # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
- L = []
- docid2len = self._docweight
- for t in wids:
- d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
- idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
- result = self.family.IF.Bucket()
- for docid, f in d2f.items():
- lenweight = B_from1 + B * docid2len[docid] / meandoclen
- tf = f * K1_plus1 / (f + K1 * lenweight)
- result[docid] = tf * idf
- L.append((result, 1))
- return L
+ L = []
+ docid2len = self._docweight
+ for t in wids:
+ d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+ idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
+ result = self.family.IF.Bucket()
+ for docid, f in d2f.items():
+ lenweight = B_from1 + B * docid2len[docid] / meandoclen
+ tf = f * K1_plus1 / (f + K1 * lenweight)
+ result[docid] = tf * idf
+ L.append((result, 1))
+ return L
- # Note about the above: the result is tf * idf. tf is small -- it
- # can't be larger than k1+1 = 2.2. idf is formally unbounded, but
- # is less than 14 for a term that appears in only 1 of a million
- # documents. So the product is probably less than 32, or 5 bits
- # before the radix point. If we did the scaled-int business on
- # both of them, we'd be up to 25 bits. Add 64 of those and we'd
- # be in overflow territory. That's pretty unlikely, so we *could*
- # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
- # as an invariant weight across the whole result. But besides
- # skating near the edge, it's not a speed cure, since the computation
- # of tf would still be done at Python speed, and it's a lot more
- # work than just multiplying by idf.
+ # Note about the above: the result is tf * idf. tf is
+ # small -- it can't be larger than k1+1 = 2.2. idf is
+ # formally unbounded, but is less than 14 for a term that
+ # appears in only 1 of a million documents. So the
+ # product is probably less than 32, or 5 bits before the
+ # radix point. If we did the scaled-int business on both
+ # of them, we'd be up to 25 bits. Add 64 of those and
+ # we'd be in overflow territory. That's pretty unlikely,
+ # so we *could* just store scaled_int(tf) in
+ # result[docid], and use scaled_int(idf) as an invariant
+ # weight across the whole result. But besides skating
+ # near the edge, it's not a speed cure, since the
+ # computation of tf would still be done at Python speed,
+ # and it's a lot more work than just multiplying by idf.
+ else:
+ # The same function as _search_wids above, but with the inner scoring
+ # loop written in C (module okascore, function score()).
+ # Cautions: okascore hardcodes the values of K, B1, and the scaled_int
+ # function.
+ def _search_wids(self, wids):
+ if not wids:
+ return []
+ N = float(self.documentCount()) # total # of docs
+ try:
+ doclen = self._totaldoclen()
+ except TypeError:
+ # _totaldoclen has not yet been upgraded
+ doclen = self._totaldoclen
+ meandoclen = doclen / N
+ #K1 = self.K1
+ #B = self.B
+ #K1_plus1 = K1 + 1.0
+ #B_from1 = 1.0 - B
- # The same function as _search_wids above, but with the inner scoring
- # loop written in C (module okascore, function score()).
- # Cautions: okascore hardcodes the values of K, B1, and the scaled_int
- # function.
- def _search_wids_NOTYET(self, wids):
- if not wids:
- return []
- N = float(len(self._docweight)) # total # of docs
- try:
- doclen = self._totaldoclen()
- except TypeError:
- # _totaldoclen has not yet been upgraded
- doclen = self._totaldoclen
- meandoclen = doclen / N
- #K1 = self.K1
- #B = self.B
- #K1_plus1 = K1 + 1.0
- #B_from1 = 1.0 - B
+ # f(D, t) * (k1 + 1)
+ # TF(D, t) = -------------------------------------------
+ # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
- # f(D, t) * (k1 + 1)
- # TF(D, t) = -------------------------------------------
- # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+ L = []
+ docid2len = self._docweight
+ for t in wids:
+ d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+ idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
+ result = self.family.IF.Bucket()
+ score(result, d2f.items(), docid2len, idf, meandoclen)
+ L.append((result, 1))
+ return L
- L = []
- docid2len = self._docweight
- for t in wids:
- d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
- idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
- result = self.family.IF.Bucket()
- score(result, d2f.items(), docid2len, idf, meandoclen)
- L.append((result, 1))
- return L
-
def query_weight(self, terms):
# Get the wids.
wids = []
Added: zope.index/trunk/src/zope/index/text/okascore.c
===================================================================
--- zope.index/trunk/src/zope/index/text/okascore.c (rev 0)
+++ zope.index/trunk/src/zope/index/text/okascore.c 2009-06-09 23:13:53 UTC (rev 100775)
@@ -0,0 +1,131 @@
+/*****************************************************************************
+
+ Copyright (c) 2002 Zope Corporation and Contributors.
+ All Rights Reserved.
+
+ This software is subject to the provisions of the Zope Public License,
+ Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
+ THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
+ WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
+ FOR A PARTICULAR PURPOSE
+
+ ****************************************************************************/
+
+/* okascore.c
+ *
+ * The inner scoring loop of OkapiIndex._search_wids() coded in C.
+ *
+ * Example from an indexed Python-Dev archive, where "python" shows up in all
+ * but 2 of the 19,058 messages. With the Python scoring loop,
+ *
+ * query: python
+ * # results: 10 of 19056 in 534.77 ms
+ * query: python
+ * # results: 10 of 19056 in 277.52 ms
+ *
+ * The first timing is cold, the second timing from an immediate repeat of
+ * the same query. With the scoring loop here in C:
+ *
+ * query: python
+ * # results: 10 of 19056 in 380.74 ms -- 40% speedup
+ * query: python
+ * # results: 10 of 19056 in 118.96 ms -- 133% speedup
+ */
+
+#include "Python.h"
+
+#define K1 1.2
+#define B 0.75
+
+#ifndef PyTuple_CheckExact
+#define PyTuple_CheckExact PyTuple_Check
+#endif
+
+static PyObject *
+score(PyObject *self, PyObject *args)
+{
+ /* Believe it or not, floating these common subexpressions "by hand"
+ gets better code out of MSVC 6. */
+ const double B_FROM1 = 1.0 - B;
+ const double K1_PLUS1 = K1 + 1.0;
+
+ /* Inputs */
+ PyObject *result; /* IIBucket result, maps d to score */
+ PyObject *d2fitems; /* ._wordinfo[t].items(), maps d to f(d, t) */
+ PyObject *d2len; /* ._docweight, maps d to # words in d */
+ double idf; /* inverse doc frequency of t */
+ double meandoclen; /* average number of words in a doc */
+
+ int n, i;
+
+ if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
+ &idf, &meandoclen))
+ return NULL;
+
+ idf *= 1024.0; /* float out part of the scaled_int computation */
+ n = PyObject_Length(d2fitems);
+ for (i = 0; i < n; ++i) {
+ PyObject *d_and_f; /* d2f[i], a (d, f) pair */
+ PyObject *d;
+ double f;
+ PyObject *doclen; /* ._docweight[d] */
+ double lenweight;
+ double tf;
+ PyObject *scaled_int;
+ int status;
+
+ d_and_f = PySequence_GetItem(d2fitems, i);
+ if (d_and_f == NULL)
+ return NULL;
+ if (!(PyTuple_CheckExact(d_and_f) &&
+ PyTuple_GET_SIZE(d_and_f) == 2)) {
+ PyErr_SetString(PyExc_TypeError,
+ "d2fitems must produce 2-item tuples");
+ Py_DECREF(d_and_f);
+ return NULL;
+ }
+ d = PyTuple_GET_ITEM(d_and_f, 0);
+ f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));
+
+ doclen = PyObject_GetItem(d2len, d);
+ if (doclen == NULL) {
+ Py_DECREF(d_and_f);
+ return NULL;
+ }
+ lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;
+
+ tf = f * K1_PLUS1 / (f + K1 * lenweight);
+ scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
+ if (scaled_int == NULL)
+ status = -1;
+ else
+ status = PyObject_SetItem(result, d, scaled_int);
+ Py_DECREF(d_and_f);
+ Py_DECREF(doclen);
+ Py_XDECREF(scaled_int);
+ if (status < 0)
+ return NULL;
+ }
+ Py_INCREF(Py_None);
+ return Py_None;
+}
+
+static char score__doc__[] =
+"score(result, d2fitems, d2len, idf, meandoclen)\n"
+"\n"
+"Do the inner scoring loop for an Okapi index.\n";
+
+static PyMethodDef okascore_functions[] = {
+ {"score", score, METH_VARARGS, score__doc__},
+ {NULL}
+};
+
+void
+initokascore(void)
+{
+ PyObject *m;
+
+ m = Py_InitModule3("okascore", okascore_functions,
+ "inner scoring loop for Okapi rank");
+}
Modified: zope.index/trunk/src/zope/index/text/textindex.txt
===================================================================
--- zope.index/trunk/src/zope/index/text/textindex.txt 2009-06-09 22:56:13 UTC (rev 100774)
+++ zope.index/trunk/src/zope/index/text/textindex.txt 2009-06-09 23:13:53 UTC (rev 100775)
@@ -6,6 +6,7 @@
any arguments:
>>> from zope.index.text.textindex import TextIndex
+ >>> from zope.index.text.okapiindex import score
>>> index = TextIndex()
By default, it uses an "Okapi" inverted index and a lexicon with a
@@ -65,13 +66,18 @@
... """)
Then we can search using the apply method, which takes a search
-string:
+string. If we use the okascore.c module, we get different results
+because it uses different constants:
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()]
- [(1, '0.6153'), (2, '0.6734')]
+ >>> if score is not None: result = [(1, '787.6832'), (2, '839.0846')]
+ >>> if score is None: result = [(1, '0.6153'), (2, '0.6734')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()] == result
+ True
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()]
- [(1, '0.6153')]
+ >>> if score is not None: result = [(1, '787.7337')]
+ >>> if score is None: result = [(1, '0.6153')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()] == result
+ True
>>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown python').items()]
[]
@@ -79,27 +85,41 @@
>>> [(k, "%.4f" % v) for (k, v) in index.apply(u'dalmatian').items()]
[]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()]
- [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
+ >>> if score is not None: result = [(1, '333.0285'), (2, '333.0285'), (8, '454.7052')]
+ >>> if score is None: result = [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()] == result
+ True
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()]
- [(7, '0.6948')]
+ >>> from zope.index.text.okapiindex import score
+ >>> if score is not None: result = [(7, '787.7707')]
+ >>> if score is None: result = [(7, '0.6948')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()] == result
+ True
The outputs are mappings from document ids to integer scored. Items
with higher scores are more relevent.
-We can use unicode characters in search strings:
+We can use unicode characters in search strings. If we use the
+okascore.c module, we get different results because it uses different
+constants:
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()]
- [(4, '0.7427')]
+ >>> if score is not None: result = [(4, '787.7707')]
+ >>> if score is None: result = [(4, '0.7427')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()] == result
+ True
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()]
- [(5, '0.7179')]
+ >>> if score is not None: result = [(5, '787.7707')]
+ >>> if score is None: result = [(5, '0.7179')]
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()] == result
+ True
-We can use globbing in search strings:
+We can use globbing in search strings. If we use the okascore.c
+module, we get different results because it uses different constants:
- >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()]
- [(1, '2.179'), (2, '2.651'), (3, '2.041')]
+ >>> if score is not None: result = [(1, '2789.000'), (2, '3153.000'), (3, '3808.000')]
+ >>> if score is None: result = [(1, '2.179'), (2, '2.651'), (3, '2.041')]
+ >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()] == result
+ True
Text indexes support basic statistics:
@@ -115,10 +135,15 @@
>>> index2.index_doc(1, [])
>>> index2.index_doc(1, ["Zorro"])
- >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()]
- [(1, '0.4545')]
+If we use the okascore.c module, we get different results because it
+uses different constants:
+ >>> if score is not None: result = [(1, '787.5803')]
+ >>> if score is None: result = [(1, '0.4545')]
+ >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()] == result
+ True
+
Tracking Changes
================
More information about the Checkins
mailing list