[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.12

Tim Peters tim.one@comcast.net
Fri, 3 May 2002 00:55:11 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7200

Modified Files:
      Tag: TextIndexDS9-branch
	Index.py 
Log Message:
query_weight():  Truncating division in N / len(self._wordinfo[wid])
isn't what we want, so arranged to make it float division.  Also
made assorted literals floats to avoid the expense of runtime
promotion.

class Index comments:  I got some whitespace on sale, so used it to
incorporate some of Jeremy's checkin comments.


=== Products/ZCTextIndex/Index.py 1.1.2.11 => 1.1.2.12 ===
 # Copyright (c) 2001, 2002 Zope Corporation and Contributors.
 # All Rights Reserved.
-# 
+#
 # This software is subject to the provisions of the Zope Public License,
 # Version 2.0 (ZPL).  A copy of the ZPL should accompany this distribution.
 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
 # FOR A PARTICULAR PURPOSE
-# 
+#
 ##############################################################################
 """Text Index.
 
@@ -49,12 +49,27 @@
     # precomputes some values that are independent of the particular
     # query.
 
-    # The equation is:
-    # cosine = 1/W(d) * 1/W(q) + sum(for t in Q^D: w(d,t) * w(q,t))
-    # where w(d, t) = 1 + log f(d, t)
-    #       w(q, t) = log(1 + N/f(t))
-    #       W(d) = sqrt(sum(for t in D: w(d, t) ** 2))
-    #       W(q) = sqrt(sum(for t in Q: w(q, t) ** 2))
+    # The equation is
+    #
+    #                     sum(for t in I(d,q): w(d,t) * w(q,t))
+    #     cosine(d, q) =  -------------------------------------
+    #                                  W(d) * W(q)
+    #
+    # where
+    #    I(d, q) = the intersection of the terms in d and q.
+    #
+    #    w(d, t) = 1 + log f(d, t)
+    #        computed by doc_term_weight()
+    #
+    #    w(q, t) = log(1 + N/f(t))
+    #        computed by query_term_weight()
+    #
+    #    W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
+    #        computed by _get_frequencies(), and remembered in
+    #        self._docweight[d]
+    #
+    #    W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
+    #        computed by self.query_weight()
 
     def index_doc(self, docid, text, threshold=None):
         wids = self._lexicon.sourceToWordIds(text)
@@ -89,16 +104,17 @@
         wids = []
         for term in terms:
             wids += self._lexicon.termToWordIds(term)
-        N = len(self._docweight)
-        sum = 0
+        N = float(len(self._docweight))
+        sum = 0.
         for wid in wids:
-            wt = math.log(1 + N / len(self._wordinfo[wid]))
-            sum += wt ** 2
+            wt = math.log(1. + N / len(self._wordinfo[wid]))
+            sum += wt ** 2.
         return scaled_int(math.sqrt(sum))
 
     def _get_frequencies(self, wids):
         """Return individual doc-term weights and docweight."""
-        # computes w(d, t) for each term and W(d)
+        # computes w(d, t) for each term, and W(d)
+        # returns pair [(wid0, w(d, wid0)), (wid1, w(d, wid1)), ...], W(d)
         d = {}
         for wid in wids:
             d[wid] = d.get(wid, 0) + 1
@@ -150,12 +166,11 @@
     # implements w(d, t) = 1 + log f(d, t)
     return scaled_int(1 + math.log(count))
 
-def query_term_weight(term_count, num_terms):
+def query_term_weight(term_count, num_items):
     """Return the query-term weight for a term,
 
-    that appears term_count times in a collection with num_terms
-    unique terms.
+    that appears in term_count items in a collection with num_items
+    total items.
     """
     # implements w(q, t) = log(1 + N/f(t))
-    return scaled_int(math.log(1 + float(num_terms) / term_count))
-
+    return scaled_int(math.log(1 + float(num_items) / term_count))