[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.12
Tim Peters
tim.one@comcast.net
Fri, 3 May 2002 00:55:11 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7200
Modified Files:
Tag: TextIndexDS9-branch
Index.py
Log Message:
query_weight(): Truncating division in N / len(self._wordinfo[wid])
isn't what we want, so arranged to make it float division. Also
made assorted literals floats to avoid the expense of runtime
promotion.
class Index comments: I got some whitespace on sale, so used it to
incorporate some of Jeremy's checkin comments.
=== Products/ZCTextIndex/Index.py 1.1.2.11 => 1.1.2.12 ===
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
-#
+#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
-#
+#
##############################################################################
"""Text Index.
@@ -49,12 +49,27 @@
# precomputes some values that are independent of the particular
# query.
- # The equation is:
- # cosine = 1/W(d) * 1/W(q) + sum(for t in Q^D: w(d,t) * w(q,t))
- # where w(d, t) = 1 + log f(d, t)
- # w(q, t) = log(1 + N/f(t))
- # W(d) = sqrt(sum(for t in D: w(d, t) ** 2))
- # W(q) = sqrt(sum(for t in Q: w(q, t) ** 2))
+ # The equation is
+ #
+ #                sum(for t in I(d,q): w(d,t) * w(q,t))
+ # cosine(d, q) = -------------------------------------
+ #                             W(d) * W(q)
+ #
+ # where
+ # I(d, q) = the intersection of the terms in d and q.
+ #
+ # w(d, t) = 1 + log f(d, t)
+ # computed by doc_term_weight()
+ #
+ # w(q, t) = log(1 + N/f(t))
+ # computed by query_term_weight()
+ #
+ # W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
+ # computed by _get_frequencies(), and remembered in
+ # self._docweight[d]
+ #
+ # W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
+ # computed by self.query_weight()
def index_doc(self, docid, text, threshold=None):
wids = self._lexicon.sourceToWordIds(text)
@@ -89,16 +104,17 @@
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
- N = len(self._docweight)
- sum = 0
+ N = float(len(self._docweight))
+ sum = 0.
for wid in wids:
- wt = math.log(1 + N / len(self._wordinfo[wid]))
- sum += wt ** 2
+ wt = math.log(1. + N / len(self._wordinfo[wid]))
+ sum += wt ** 2.
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
"""Return individual doc-term weights and docweight."""
- # computes w(d, t) for each term and W(d)
+ # computes w(d, t) for each term, and W(d)
+ # returns pair [(wid0, w(d, wid0)), (wid1, w(d, wid1)), ...], W(d)
d = {}
for wid in wids:
d[wid] = d.get(wid, 0) + 1
@@ -150,12 +166,11 @@
# implements w(d, t) = 1 + log f(d, t)
return scaled_int(1 + math.log(count))
-def query_term_weight(term_count, num_terms):
+def query_term_weight(term_count, num_items):
"""Return the query-term weight for a term,
- that appears term_count times in a collection with num_terms
- unique terms.
+ that appears in term_count items in a collection with num_items
+ total items.
"""
# implements w(q, t) = log(1 + N/f(t))
- return scaled_int(math.log(1 + float(num_terms) / term_count))
-
+ return scaled_int(math.log(1 + float(num_items) / term_count))