[Checkins] SVN: zope.index/trunk/ - Fixed a broken optimization in okascore.c: it was passing a Python float to the PyInt_AS_LONG() macro.
Shane Hathaway
shane at hathawaymix.org
Sat Jul 25 06:26:02 EDT 2009
Log message for revision 102293:
- Fixed a broken optimization in okascore.c: it was passing a Python
float to the PyInt_AS_LONG() macro. This resulted in wrong scores,
especially on 64 bit platforms, where all scores typically ended up
being zero.
- Changed okascore.c to produce the same results as its Python
equivalent, reducing the brittleness of the text index tests.
Changed:
U zope.index/trunk/CHANGES.txt
U zope.index/trunk/src/zope/index/text/okascore.c
U zope.index/trunk/src/zope/index/text/textindex.txt
-=-
Modified: zope.index/trunk/CHANGES.txt
===================================================================
--- zope.index/trunk/CHANGES.txt 2009-07-25 07:48:34 UTC (rev 102292)
+++ zope.index/trunk/CHANGES.txt 2009-07-25 10:26:01 UTC (rev 102293)
@@ -4,8 +4,14 @@
3.5.3 (unreleased)
------------------
-- ...
+- Fixed a broken optimization in okascore.c: it was passing a Python
+ float to the PyInt_AS_LONG() macro. This resulted in wrong scores,
+ especially on 64 bit platforms, where all scores typically ended up
+ being zero.
+- Changed okascore.c to produce the same results as its Python
+ equivalent, reducing the brittleness of the text index tests.
+
3.5.2 (2009-06-09)
------------------
Modified: zope.index/trunk/src/zope/index/text/okascore.c
===================================================================
--- zope.index/trunk/src/zope/index/text/okascore.c 2009-07-25 07:48:34 UTC (rev 102292)
+++ zope.index/trunk/src/zope/index/text/okascore.c 2009-07-25 10:26:01 UTC (rev 102293)
@@ -63,7 +63,6 @@
&idf, &meandoclen))
return NULL;
- idf *= 1024.0; /* float out part of the scaled_int computation */
n = PyObject_Length(d2fitems);
for (i = 0; i < n; ++i) {
PyObject *d_and_f; /* d2f[i], a (d, f) pair */
@@ -72,7 +71,7 @@
PyObject *doclen; /* ._docweight[d] */
double lenweight;
double tf;
- PyObject *scaled_int;
+ PyObject *doc_score;
int status;
d_and_f = PySequence_GetItem(d2fitems, i);
@@ -93,17 +92,17 @@
Py_DECREF(d_and_f);
return NULL;
}
- lenweight = B_FROM1 + B * PyInt_AS_LONG(doclen) / meandoclen;
+ lenweight = B_FROM1 + B * PyInt_AsLong(doclen) / meandoclen;
tf = f * K1_PLUS1 / (f + K1 * lenweight);
- scaled_int = PyInt_FromLong((long)(tf * idf + 0.5));
- if (scaled_int == NULL)
+ doc_score = PyFloat_FromDouble(tf * idf);
+ if (doc_score == NULL)
status = -1;
else
- status = PyObject_SetItem(result, d, scaled_int);
+ status = PyObject_SetItem(result, d, doc_score);
Py_DECREF(d_and_f);
Py_DECREF(doclen);
- Py_XDECREF(scaled_int);
+ Py_XDECREF(doc_score);
if (status < 0)
return NULL;
}
Modified: zope.index/trunk/src/zope/index/text/textindex.txt
===================================================================
--- zope.index/trunk/src/zope/index/text/textindex.txt 2009-07-25 07:48:34 UTC (rev 102292)
+++ zope.index/trunk/src/zope/index/text/textindex.txt 2009-07-25 10:26:01 UTC (rev 102293)
@@ -6,7 +6,6 @@
any arguments:
>>> from zope.index.text.textindex import TextIndex
- >>> from zope.index.text.okapiindex import score
>>> index = TextIndex()
By default, it uses an "Okapi" inverted index and a lexicon with a
@@ -66,18 +65,13 @@
... """)
Then we can search using the apply method, which takes a search
-string. If we use the okascore.c module, we get different results
-because it uses different constants:
+string.
- >>> if score is not None: result = [(1, '787.6832'), (2, '839.0846')]
- >>> if score is None: result = [(1, '0.6153'), (2, '0.6734')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown fox').items()]
+ [(1, '0.6153'), (2, '0.6734')]
- >>> if score is not None: result = [(1, '787.7337')]
- >>> if score is None: result = [(1, '0.6153')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'quick fox').items()]
+ [(1, '0.6153')]
>>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown python').items()]
[]
@@ -85,41 +79,27 @@
>>> [(k, "%.4f" % v) for (k, v) in index.apply(u'dalmatian').items()]
[]
- >>> if score is not None: result = [(1, '333.0285'), (2, '333.0285'), (8, '454.7052')]
- >>> if score is None: result = [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'brown or python').items()]
+ [(1, '0.2602'), (2, '0.2529'), (8, '0.0934')]
- >>> from zope.index.text.okapiindex import score
- >>> if score is not None: result = [(7, '787.7707')]
- >>> if score is None: result = [(7, '0.6948')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u'butts').items()]
+ [(7, '0.6948')]
-The outputs are mappings from document ids to integer scored. Items
+The outputs are mappings from document ids to float scores. Items
with higher scores are more relevant.
-We can use unicode characters in search strings. If we use the
-okascore.c module, we get different results because it uses different
-constants:
+We can use unicode characters in search strings.
- >>> if score is not None: result = [(4, '787.7707')]
- >>> if score is None: result = [(4, '0.7427')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(u"Fran\xe7ois").items()]
+ [(4, '0.7427')]
- >>> if score is not None: result = [(5, '787.7707')]
- >>> if score is None: result = [(5, '0.7179')]
- >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()] == result
- True
+ >>> [(k, "%.4f" % v) for (k, v) in index.apply(word).items()]
+ [(5, '0.7179')]
-We can use globbing in search strings. If we use the okascore.c
-module, we get different results because it uses different constants:
+We can use globbing in search strings.
- >>> if score is not None: result = [(1, '2789.000'), (2, '3153.000'), (3, '3808.000')]
- >>> if score is None: result = [(1, '2.179'), (2, '2.651'), (3, '2.041')]
- >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()] == result
- True
+ >>> [(k, "%.3f" % v) for (k, v) in index.apply('fo*').items()]
+ [(1, '2.179'), (2, '2.651'), (3, '2.041')]
Text indexes support basic statistics:
@@ -134,16 +114,10 @@
>>> index2 = TextIndex()
>>> index2.index_doc(1, [])
>>> index2.index_doc(1, ["Zorro"])
+ >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()]
+ [(1, '0.4545')]
-If we use the okascore.c module, we get different results because it
-uses different constants:
- >>> if score is not None: result = [(1, '787.5803')]
- >>> if score is None: result = [(1, '0.4545')]
- >>> [(k, "%.4f" % v) for (k, v) in index2.apply("Zorro").items()] == result
- True
-
-
Tracking Changes
================
More information about the Checkins mailing list