[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.29
Tim Peters
tim.one@comcast.net
Fri, 3 May 2002 18:17:38 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv25562
Modified Files:
Tag: TextIndexDS9-branch
Index.py
Log Message:
In the maps storing (docid, weight) pairs, the weights have been scaled
ints representing 1.+log(count). This checkin changes the weights to
store the count directly instead when it's < 256. This requires
compensating hair upon both storing and loading weights. The payoff
is that virtually all counts are < 256, and a binary pickle can save
such a count in 1 byte, while a weight generally consumes 2 bytes.
Hmm. Because a pickle stores a typecode too, it's really a reduction
from 3 bytes to 2 for most weights. Maybe it's not worth the bother --
it does slow _get_frequencies by introducing irregularities.
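[Editorial aside: for concreteness, here is a small present-day sketch, not part of this
checkin, of where those byte counts come from; it uses only the standard pickle module,
with pickletools just to show the opcodes. In a binary pickle (protocol 1), an int in
[0, 256) is written as a BININT1 opcode plus one data byte, while a typical scaled
weight needs BININT2, i.e. an opcode plus two data bytes.

    import math
    import pickle
    import pickletools

    SCALE_FACTOR = 1024.0

    def doc_term_weight(count):
        # w(d, t) = 1 + log f(d, t), as in Index.py
        return 1.0 + math.log(count)

    raw_count = 3                                                   # f(d, t); virtually always < 256
    scaled_weight = int(round(doc_term_weight(3) * SCALE_FACTOR))   # about 2149

    for value in (raw_count, scaled_weight):
        data = pickle.dumps(value, 1)    # protocol 1 is the "binary pickle" format
        print(value, "->", len(data), "bytes, including a 1-byte STOP opcode")
        pickletools.dis(data)

Run standalone, this shows 2 bytes vs. 3 bytes per stored int once the trailing STOP
opcode is discounted -- the 1-byte saving per weight described above.]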
=== Products/ZCTextIndex/Index.py 1.1.2.28 => 1.1.2.29 ===
from Products.ZCTextIndex.IIndex import IIndex
-def scaled_int(f):
-     return int(round(f * 1024.))
+# Instead of storing floats, we generally store scaled ints. Binary pickles
+# can store those more efficiently. The default SCALE_FACTOR of 1024
+# is large enough to get about 3 decimal digits of fractional info, and
+# small enough so that scaled values should almost always fit in a signed
+# 16-bit int (we're generally storing logs, so a few bits before the radix
+# point goes a long way; on the flip side, for reasonably small numbers x
+# most of the info in log(x) is in the fractional bits, so we do want to
+# save a lot of those).
+SCALE_FACTOR = 1024.0
+
+def scaled_int(f, scale=SCALE_FACTOR):
+     return int(round(f * scale))
class Index:
    __implements__ = IIndex
@@ -60,7 +70,9 @@
#
# w(d, t) = 1 + log f(d, t)
# computed by doc_term_weight(); for a given word t,
- # self._wordinfo[t] is a map from d to w(d, t)
+ # self._wordinfo[t] is a map from d to w(d, t). To save space
+ # in binary pickles, we actually store f(d, t) if that's < 256.
+ # In that case, WDT[f(d, t)] retrieves the correct w(d, t).
#
# w(q, t) = log(1 + N/f(t))
# computed by query_term_weight()
@@ -95,6 +107,8 @@
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = query_term_weight(len(d2w), N) # this is an unscaled float
for docid, tf in d2w.items():
+     if tf < 256:
+         tf = WDT[tf]
    # scaled int * unscaled float / scaled int -> unscaled float
    w = tf * idf / self._docweight[docid]
    result[docid] = result.get(docid, 0) + scaled_int(w)
@@ -115,18 +129,23 @@
"""Return individual doc-term weights and docweight."""
# computes w(d, t) for each term, and W(d)
# return triple
- # [wid0, wid1, ...]
- # [w(d, wid0), w(d, wid1), ...],
+ # [wid0, wid1, ...].
+ # an encoded form of [w(d, wid0), w(d, wid1), ...],
# W(d)
d = {}
for wid in wids:
    d[wid] = d.get(wid, 0) + 1
Wsquares = 0.
- freqs = map(doc_term_weight, d.values())
- for f in freqs:
-     Wsquares += f * f
- freqs = map(scaled_int, freqs)
- return d.keys(), freqs, scaled_int(math.sqrt(Wsquares))
+ weights = []
+ push = weights.append
+ for count in d.values():
+     w = doc_term_weight(count)
+     Wsquares += w * w
+     if count < 256:
+         push(count) # 1 byte in a binary pickle
+     else:
+         push(scaled_int(w)) # 2 bytes in a binary pickle, and rare
+ return d.keys(), weights, scaled_int(math.sqrt(Wsquares))
DICT_CUTOFF = 10
@@ -176,7 +195,13 @@
def _get_wdt(self, d, t):
    wid, = self._lexicon.termToWordIds(t)
    map = self._wordinfo[wid]
-     return map.get(d, 0)
+     w = map.get(d, 0)
+     if w == 0:
+         return 0
+     elif w < 256:
+         return WDT[w]
+     else:
+         return w
def _get_Wd(self, d):
    return self._docweight[d]
@@ -203,3 +228,16 @@
"""
# implements w(q, t) = log(1 + N/f(t))
return math.log(1. + float(num_items) / term_count)
+
+# For 0 < i < 256, WDT[i] is the integer scaled value of doc_term_weight(i).
+# WDT[0] is None, because doc_term_weight(0) should never happen, and
+# returning None then should quickly lead to an exception.
+WDT = [scaled_int(doc_term_weight(i)) for i in range(1, 256)]
+del i
+WDT.insert(0, None)
+
+# Upon reading up a "w(d, t)" value, we need to know whether it's a true
+# scaled w(d,t), or a raw count that must be used to index WDT[] to get
+# the true scaled w(d,t). The key is that for raw counts >= 256, their
+# scaled w(d,t) values are greater than 255.
+assert scaled_int(doc_term_weight(256)) > 255
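
[Editorial aside: for readers piecing the hunks together, here is a self-contained
distillation of the storage convention. It is illustrative only; encode() and decode()
are hypothetical helpers, not functions in Index.py, though doc_term_weight, scaled_int
and WDT mirror the checkin.

    import math

    SCALE_FACTOR = 1024.0

    def scaled_int(f, scale=SCALE_FACTOR):
        return int(round(f * scale))

    def doc_term_weight(count):
        # w(d, t) = 1 + log f(d, t)
        return 1.0 + math.log(count)

    # WDT[i] is the scaled w(d, t) for a raw count i; WDT[0] is a deliberate trap.
    WDT = [None] + [scaled_int(doc_term_weight(i)) for i in range(1, 256)]

    def encode(count):
        # What _get_frequencies now stores in the docid -> weight maps.
        return count if count < 256 else scaled_int(doc_term_weight(count))

    def decode(stored):
        # What readers such as _get_wdt do to recover the scaled w(d, t).
        return WDT[stored] if stored < 256 else stored

    # The two cases can't collide: any count >= 256 scales to well above 255 ...
    assert scaled_int(doc_term_weight(256)) > 255
    # ... and logs grow so slowly that even huge counts stay inside a signed
    # 16-bit int, which is the point of SCALE_FACTOR = 1024.
    assert scaled_int(doc_term_weight(10**9)) < 2**15

    for count in (1, 7, 255, 256, 1000):
        assert decode(encode(count)) == scaled_int(doc_term_weight(count))

The conditional in encode() is the "compensating hair" the log message mentions:
_get_frequencies and every reader of the maps now special-case values below 256.]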