[Zope-CVS] CVS: Products/ZCTextIndex - okascore.c:1.1 OkapiIndex.py:1.27 Setup:1.3

Mon, 20 May 2002 19:15:17 -0400

Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv26401

Modified Files:
	OkapiIndex.py Setup 
Added Files:
	okascore.c 
Log Message:
Since I did the work to write the inner Okapi scoring loop in C, may as
well check it in.  This yields an overall 133% speedup on a "hot" search
for 'python' in my python-dev archive (a word that appears in all but
2 documents).  For those who read the email, turned out it was a
significant speedup to iterate over an IIBTree's items rather than to
materialize the items into an explicit list first.

This is now within 20% of simply doing "IIBucket(the_IIBTree)" (i.e.,
no arithmetic at all), so there's no significant possibility remaining
for speeding the inner score loop.


=== Added File Products/ZCTextIndex/okascore.c ===
/*****************************************************************************

  Copyright (c) 2002 Zope Corporation and Contributors.
  All Rights Reserved.

  This software is subject to the provisions of the Zope Public License,
  Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
  THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
  WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
  FOR A PARTICULAR PURPOSE

 ****************************************************************************/

/*	okascore.c
 *
 *	The inner scoring loop of OkapiIndex._search_wids() coded in C.
 *
 * With the Python scoring loop,
 *
 *      query: python
 *      # results: 10 of 19056 in 534.77 ms
 *      query: python
 *      # results: 10 of 19056 in 277.52 ms
 *
 * The first timing is cold, the second timing from an immediate repeat of
 * the same query.  With the timing loop here in C:
 *
 *     query: python
 *     # results: 10 of 19056 in 380.74 ms  -- 40% speedup
 *     query: python
 *     # results: 10 of 19056 in 118.96 ms  -- 133% speedup
 */

#include "Python.h"

#define K1 1.2
#define B  0.75

static PyObject *
score(PyObject *self, PyObject *args)
{
	/* Believe it or not, floating these common subexpressions "by hand"
	   gets better code out of MSVC 6. */
	const double B_FROM1 = 1.0 - B;
	const double K1_PLUS1 = K1 + 1.0;

	/* Inputs */
	PyObject *result;	/* IIBucket result, maps d to score */
	PyObject *d2fitems;	/* ._wordinfo[t].items(), maps d to f(d, t) */
	PyObject *d2len;	/* ._docweight, maps d to # words in d */
	double idf;		/* inverse doc frequency of t */
	double meandoclen;	/* average number of words in a doc */

	int n, i;

	if (!PyArg_ParseTuple(args, "OOOdd:score", &result, &d2fitems, &d2len,
						   &idf, &meandoclen))
		return NULL;

	n = PyObject_Length(d2fitems);
	for (i = 0; i < n; ++i) {
		PyObject *d_and_f;	/* d2f[i], a (d, f) pair */
		PyObject *d;
		double f;
		PyObject *doclen;	/* ._docweight[d] */
		double lenweight;
		double tf;
		double score;
		PyObject *scaled_int;
		int status;

		d_and_f = PySequence_GetItem(d2fitems, i);
		if (d_and_f == NULL)
			return NULL;
		if (!(PyTuple_Check(d_and_f) &&
		      PyTuple_Size(d_and_f) == 2)) {
			PyErr_SetString(PyExc_TypeError,
				"d2fitems must produce 2-item tuples");
			Py_DECREF(d_and_f);
			return NULL;
		}
		d = PyTuple_GET_ITEM(d_and_f, 0);
		f = (double)PyInt_AsLong(PyTuple_GET_ITEM(d_and_f, 1));

		doclen = PyObject_GetItem(d2len, d);
		if (doclen == NULL) {
			Py_DECREF(d_and_f);
			return NULL;
		}
		lenweight = B_FROM1 + B * PyInt_AsLong(doclen) / meandoclen;

		tf = f * K1_PLUS1 / (f + K1 * lenweight);
		score = tf * idf;
		scaled_int = PyInt_FromLong((long)(score * 1024.0 + 0.5));
		status = PyObject_SetItem(result, d, scaled_int);
		Py_DECREF(d_and_f);
		Py_DECREF(doclen);
		Py_DECREF(scaled_int);
		if (status < 0)
			return NULL;
	}
	Py_INCREF(Py_None);
	return Py_None;
}

static char score__doc__[] =
"score(result, d2fitems, d2len, idf, meandoclen)\n"
"\n"
"Do the inner scoring loop for an Okapi index.\n";

static PyMethodDef okascore_functions[] = {
	{"score",	   score,	  METH_VARARGS, score__doc__},
	{NULL}
};

void
initokascore(void)
{
	PyObject *m;

	m = Py_InitModule3("okascore", okascore_functions,
			    "inner scoring loop for Okapi rank");
}


=== Products/ZCTextIndex/OkapiIndex.py 1.26 => 1.27 ===
     # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
     # but stored as a scaled_int.
+    # NOTE:  This is overridden below, by a function that computes the
+    # same thing but with the inner scoring loop in C.
     def _search_wids(self, wids):
         if not wids:
             return []
@@ -87,7 +89,6 @@
         L = []
         docid2len = self._docweight
         for t in wids:
-            assert self._wordinfo.has_key(t)  # caller responsible for OOV
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
             idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
             result = IIBucket()
@@ -110,6 +111,35 @@
         # skating near the edge, it's not a speed cure, since the computation
         # of tf would still be done at Python speed, and it's a lot more
         # work than just multiplying by idf.
+
+    # The same function as _search_wids above, but with the inner scoring
+    # loop written in C (module okascore, function score()).
+    # Cautions:  okascore hardcodes the values of K, B1, and the scaled_int
+    # function.
+    def _search_wids(self, wids):
+        from Products.ZCTextIndex.okascore import score
+        if not wids:
+            return []
+        N = float(len(self._docweight))  # total # of docs
+        meandoclen = self._totaldoclen / N
+        #K1 = self.K1
+        #B = self.B
+        #K1_plus1 = K1 + 1.0
+        #B_from1 = 1.0 - B
+
+        #                           f(D, t) * (k1 + 1)
+        #   TF(D, t) =  -------------------------------------------
+        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
+
+        L = []
+        docid2len = self._docweight
+        for t in wids:
+            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
+            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
+            result = IIBucket()
+            score(result, d2f.items(), docid2len, idf, meandoclen)
+            L.append((result, 1))
+        return L
 
     def query_weight(self, terms):
         # This method was inherited from the cosine measure, and doesn't


=== Products/ZCTextIndex/Setup 1.2 => 1.3 ===
 stopper stopper.c
+okascore okascore.c