[Zope-CVS] CVS: Products/ZCTextIndex - CosineIndex.py:1.11 OkapiIndex.py:1.17
Tim Peters
tim.one@comcast.net
Fri, 17 May 2002 01:50:47 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv4501
Modified Files:
CosineIndex.py OkapiIndex.py
Log Message:
Compute inverse doc frequency the same way everywhere.
=== Products/ZCTextIndex/CosineIndex.py 1.10 => 1.11 ===
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
-from Products.ZCTextIndex.BaseIndex import BaseIndex
+from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
@@ -77,7 +77,7 @@
# self._wordinfo[t] is a map from d to w(d, t).
#
# w(q, t) = log(1 + N/f(t))
- # computed by query_term_weight()
+ # computed by inverse_doc_frequency()
#
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in
@@ -110,7 +110,7 @@
for wid in wids:
assert self._wordinfo.has_key(wid) # caller responsible for OOV
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
- idf = query_term_weight(len(d2w), N) # this is an unscaled float
+ idf = inverse_doc_frequency(len(d2w), N) # this is an unscaled float
#print "idf = %.3f" % idf
if isinstance(d2w, DictType):
d2w = IIBucket(d2w)
@@ -237,12 +237,3 @@
"""Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count)
-
-def query_term_weight(term_count, num_items):
- """Return the query-term weight for a term,
-
- that appears in term_count items in a collection with num_items
- total items.
- """
- # implements w(q, t) = log(1 + N/f(t))
- return math.log(1.0 + float(num_items) / term_count)
=== Products/ZCTextIndex/OkapiIndex.py 1.16 => 1.17 ===
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
-from Products.ZCTextIndex.BaseIndex import BaseIndex
+from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion
@@ -211,15 +211,6 @@
new[k] = v
map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
-def inverse_doc_frequency(term_count, num_items):
- """Return the inverse doc frequency for a term,
-
- that appears in term_count items in a collection with num_items
- total items.
- """
- # implements IDF(q, t) = log(1 + N/f(t))
- return math.log(1.0 + float(num_items) / term_count)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.