[Zope-CVS] CVS: Products/ZCTextIndex/tests - testZCTextIndex.py:1.1.2.5

Jeremy Hylton jeremy@zope.com
Thu, 2 May 2002 23:01:57 -0400


Update of /cvs-repository/Products/ZCTextIndex/tests
In directory cvs.zope.org:/tmp/cvs-serv8978/tests

Modified Files:
      Tag: TextIndexDS9-branch
	testZCTextIndex.py 
Log Message:
Add testRanking() that verifies correctness of the cosine implementation.

The test computes the values from Table 4.8 of Managing Gigabytes
based on the running example in section 4.4 of that book.  The test
checks the final cosine value along with most of the intermediate
values.



=== Products/ZCTextIndex/tests/testZCTextIndex.py 1.1.2.4 => 1.1.2.5 ===
+from Products.ZCTextIndex.QueryParser import terms
 from Products.ZCTextIndex.tests \
      import testIndex, testQueryEngine, testQueryParser
+from Products.ZCTextIndex.Index import scaled_int
 
 import unittest
 
@@ -13,6 +15,10 @@
 # ZCTextIndex.  These tests run the individual module unit tests with
 # the fully integrated ZCTextIndex.
 
+def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
+    if abs(scaled1 - scaled2) > epsilon:
+        raise AssertionError, "%s != %s" % (scaled1, scaled2)
+
 class IndexTests(testIndex.IndexTest):
 
     def setUp(self):
@@ -31,6 +37,79 @@
                 wids = self.lexicon.termToWordIds(word)
                 self.assertEqual(wids, [])
         self.assertEqual(len(self.index._docwords[1]), 1)
+
+    def testRanking(self):
+        # A fairly involved test of the ranking calculations based on
+        # an example set of documents and queries in Managing
+        # Gigabytes, pp. 180-188.
+        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
+                      "pease", "porridge", "pot"]
+        self._ranking_index()
+        self._ranking_tf()
+        self._ranking_idf()
+        self._ranking_queries()
+
+    def _ranking_index(self):
+        docs = ["Pease porridge hot, pease porridge cold,",
+                "Pease porridge in the pot,",
+                "Nine days old.",
+                "In the pot cold, in the pot hot,",
+                "Pease porridge, pease porridge,",
+                "Eat the lot."]
+        for i in range(len(docs)):
+            self.zc_index.index_object(i + 1, testIndex.Indexable(docs[i]))
+
+    def _ranking_tf(self):
+        # matrix of term weights: row i holds the weights for docid i + 1,
+        # and the columns are indexes into self.words:
+        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
+               (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
+               (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
+               (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
+               (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
+               (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
+        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
+
+        for i in range(len(l_Wd)):
+            docid = i + 1
+            scaled_Wd = scaled_int(l_Wd[i])
+            eq(scaled_Wd, self.index._get_Wd(docid))
+            wdts = [scaled_int(t) for t in l_wdt[i]]
+            for j in range(len(wdts)):
+                wdt = self.index._get_wdt(docid, self.words[j])
+                eq(wdts[j], wdt)
+
+    def _ranking_idf(self):
+        word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
+        idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
+        for i in range(len(self.words)):
+            word = self.words[i]
+            eq(word_freqs[i], self.index._get_ft(word))
+            eq(scaled_int(idfs[i]), self.index._get_wt(word))
+
+    def _ranking_queries(self):
+        queries = ["eat", "porridge", "hot OR porridge",
+                   "eat OR nine OR day OR old OR porridge"]
+        wqs = [1.95, 1.10, 1.77, 3.55]
+        results = [[(6, 0.71)],
+                   [(1, 0.61), (2, 0.58), (5, 0.71)],
+                   [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
+                   [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
+        for i in range(len(queries)):
+            raw = queries[i]
+            q = self.zc_index.parser.parseQuery(raw)
+            wq = self.index.query_weight(terms(q))
+            eq(wq, scaled_int(wqs[i]))
+            r = self.zc_index.query(raw)
+            self.assertEqual(len(r), len(results[i]))
+            # convert the results to a dict for easy checking
+            d = {}
+            for doc, score in results[i]:
+                d[doc] = scaled_int(score)
+            for doc, score in r:
+                score = scaled_int(float(score) / wq)
+                self.assert_(0 <= score <= 256)
+                eq(d[doc], score)
 
 class QueryTests(testQueryEngine.TestQueryEngine,
                  testQueryParser.TestQueryParser):