[Zope-CVS] CVS: Products/ZCTextIndex - OkapiIndex.py:1.11
Tim Peters
tim.one@comcast.net
Thu, 16 May 2002 22:25:49 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv3269
Modified Files:
OkapiIndex.py
Log Message:
wid 0 isn't the only kind of OOV word possible, so change the search
logic to deal with all cases. All the tests pass again.
=== Products/ZCTextIndex/OkapiIndex.py 1.10 => 1.11 ===
# wid -> {docid -> frequency}; t -> D -> f(D, t)
+ # There are two kinds of OOV words: wid 0 is explicitly OOV,
+ # and it's possible that the lexicon will return a non-zero wid
+ # for a word *we've* never seen (e.g., lexicons can be shared
+ # across indices, and a query can contain a word some other
+ # index knows about but we don't).
self._wordinfo = IOBTree()
# docid -> # of words in the doc
@@ -111,8 +116,7 @@
wids = self._lexicon.termToWordIds(term)
if not wids:
return None # All docs match
- if 0 in wids:
- wids = filter(None, wids)
+ wids = self._remove_oov_wids(wids)
return mass_weightedUnion(self._search_wids(wids))
def search_glob(self, pattern):
@@ -121,9 +125,12 @@
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
- if 0 in wids:
+ cleaned_wids = self._remove_oov_wids(wids)
+ if len(wids) != len(cleaned_wids):
+ # At least one wid was OOV: can't possibly find it.
return IIBTree()
- hits = mass_weightedIntersection(self._search_wids(wids))
+ scores = self._search_wids(cleaned_wids)
+ hits = mass_weightedIntersection(scores)
if not hits:
return hits
code = WidCode.encode(wids)
@@ -134,6 +141,9 @@
result[docid] = weight
return result
+ def _remove_oov_wids(self, wids):
+ return filter(self._wordinfo.has_key, wids)
+
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t.
@@ -157,6 +167,7 @@
L = []
docid2len = self._doclen
for t in wids:
+ assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()