[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.8 CosineIndex.py:1.17 OkapiIndex.py:1.22
Tim Peters
tim.one@comcast.net
Fri, 17 May 2002 02:56:01 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv20114
Modified Files:
BaseIndex.py CosineIndex.py OkapiIndex.py
Log Message:
Factor out most of the code for indexing a doc into BaseIndex.index_doc.
The cosine index may take longer to construct now: both indexers'
_get_frequencies routines were changed to return the same kind of result
again (a wid->weight mapping plus a doc weight), whereas I had previously
fiddled the cosine indexer's _get_frequencies to do something weirder but
(probably) faster than this.
=== Products/ZCTextIndex/BaseIndex.py 1.7 => 1.8 ===
return WidCode.decode(self._docwords[docid])
- # Subclass must override.
+ # A subclass may wish to extend or override this.
def index_doc(self, docid, text):
+ # XXX If docid is already known, do something smart.
+ wids = self._lexicon.sourceToWordIds(text)
+ wid2weight, docweight = self._get_frequencies(wids)
+ for wid, weight in wid2weight.items():
+ self._add_wordinfo(wid, weight, docid)
+ self._docweight[docid] = docweight
+ self._docwords[docid] = WidCode.encode(wids)
+ return len(wids)
+
+ # Subclass must override.
+ def _get_frequencies(self, wids):
+ # Compute term frequencies and a doc weight, whatever those mean
+ # to an indexer.
+ # Return pair:
+ # {wid0: w(d, wid0), wid1: w(d, wid1), ...},
+ # docweight
+ # The wid->weight mappings are fed into _add_wordinfo, and docweight
+ # becomes the value of _docweight[docid].
raise NotImplementedError
# A subclass may wish to extend or override this.
=== Products/ZCTextIndex/CosineIndex.py 1.16 => 1.17 ===
# computed by self.query_weight()
- def index_doc(self, docid, text):
- wids = self._lexicon.sourceToWordIds(text)
- uniqwids, freqs, docweight = self._get_frequencies(wids)
- for i in range(len(uniqwids)):
- self._add_wordinfo(uniqwids[i], freqs[i], docid)
- self._docweight[docid] = docweight
- self._docwords[docid] = WidCode.encode(wids)
- return len(wids)
-
def _search_wids(self, wids):
if not wids:
return []
@@ -111,30 +102,22 @@
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
- """Return individual doc-term weights and docweight."""
- # Computes w(d, t) for each term, and W(d).
- # Return triple:
- # [wid0, wid1, ...],
- # [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
- # W(d)
- # The second list and W(d) are scaled_ints.
d = {}
+ dget = d.get
for wid in wids:
- d[wid] = d.get(wid, 0) + 1
+ d[wid] = dget(wid, 0) + 1
Wsquares = 0.0
- weights = []
- push = weights.append
- for count in d.values():
+ for wid, count in d.items():
w = doc_term_weight(count)
Wsquares += w * w
- push(w)
+ d[wid] = w
W = math.sqrt(Wsquares)
#print "W = %.3f" % W
- for i in xrange(len(weights)):
- #print i, ":", "%.3f" % weights[i],
- weights[i] = scaled_int(weights[i] / W)
- #print "->", weights[i]
- return d.keys(), weights, scaled_int(W)
+ for wid, weight in d.items():
+ #print wid, ":", "%.3f" % weight,
+ d[wid] = scaled_int(weight / W)
+ #print "->", d[wid]
+ return d, scaled_int(W)
# The rest are helper methods to support unit tests
=== Products/ZCTextIndex/OkapiIndex.py 1.21 => 1.22 ===
def index_doc(self, docid, text):
- wids = self._lexicon.sourceToWordIds(text)
- self._docweight[docid] = len(wids)
- self._totaldoclen += len(wids)
-
- wid2count = self._get_frequencies(wids)
- for wid, count in wid2count.items():
- self._add_wordinfo(wid, count, docid)
-
- self._docwords[docid] = WidCode.encode(wids)
- return len(wids)
+ count = BaseIndex.index_doc(self, docid, text)
+ self._totaldoclen += count
def unindex_doc(self, docid):
self._totaldoclen -= self._docweight[docid]
@@ -125,15 +117,11 @@
return 10 # arbitrary
def _get_frequencies(self, wids):
- """Return individual term frequencies."""
- # Computes f(d, t) for each term.
- # Returns a dict mapping wid to the number of times wid appears
- # in wids, {t -> f(d, t)}
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
- return d
+ return d, len(wids)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.