[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.4 CosineIndex.py:1.8 OkapiIndex.py:1.15
Tim Peters
tim.one@comcast.net
Fri, 17 May 2002 01:36:09 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv686
Modified Files:
BaseIndex.py CosineIndex.py OkapiIndex.py
Log Message:
Refactor/combine _docweight/_doclen.
=== Products/ZCTextIndex/BaseIndex.py 1.3 => 1.4 ===
# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
- # expect all indexers to use ._wordinfo to map wids to its notion
+ # expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
@@ -63,6 +63,12 @@
# this index if and only if _wordinfo.has_key(wid). Note that
# wid 0 must not be a key in _wordinfo.
self._wordinfo = IOBTree()
+
+ # docid -> weight
+ # Different indexers have different notions of doc weight, but we
+ # expect each indexer to use ._docweight to map docids to its
+ # notion of what a doc weight is.
+ self._docweight = IIBTree()
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
=== Products/ZCTextIndex/CosineIndex.py 1.7 => 1.8 ===
# t -> D -> w(d, t)/W(d)
+ # ._docweight for cosine is
# docid -> W(docid)
- self._docweight = IIBTree()
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
=== Products/ZCTextIndex/OkapiIndex.py 1.14 => 1.15 ===
# wid -> {docid -> frequency}; t -> D -> f(D, t)
+ # ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
- self._doclen = IIBTree()
- # sum(self._doclen.values()), the total # of words in all docs
+ # sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
- self._doclen[docid] = len(wids)
+ self._docweight[docid] = len(wids)
self._totaldoclen += len(wids)
wid2count = self._get_frequencies(wids)
@@ -92,8 +92,8 @@
del self._docwords[docid]
- count = self._doclen[docid]
- del self._doclen[docid]
+ count = self._docweight[docid]
+ del self._docweight[docid]
self._totaldoclen -= count
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
@@ -105,7 +105,7 @@
def _search_wids(self, wids):
if not wids:
return []
- N = float(len(self._doclen)) # total # of docs
+ N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
K1 = self.K1
B = self.B
@@ -117,7 +117,7 @@
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
- docid2len = self._doclen
+ docid2len = self._docweight
for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}