[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.2 CosineIndex.py:1.6 OkapiIndex.py:1.13
Tim Peters
tim.one@comcast.net
Thu, 16 May 2002 23:00:14 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv12081
Modified Files:
BaseIndex.py CosineIndex.py OkapiIndex.py
Log Message:
Pushed the subclassing far enough to be useful. More is needed, but
I need a break.
=== Products/ZCTextIndex/BaseIndex.py 1.1 => 1.2 ===
self._lexicon = lexicon
+ # wid -> {docid -> weight}; t -> D -> w(D, t)
+ # Different indexers have different notions of term weight, but we
+ # expect all indexers to use ._wordinfo to map wids to their notion
+ # of a docid-to-weight map.
+ # There are two kinds of OOV words: wid 0 is explicitly OOV,
+ # and it's possible that the lexicon will return a non-zero wid
+ # for a word *we've* never seen (e.g., lexicons can be shared
+ # across indices, and a query can contain a word some other
+ # index knows about but we don't). A word is in-vocabulary for
+ # this index if and only if _wordinfo.has_key(wid). Note that
+ # wid 0 must not be a key in _wordinfo.
+ self._wordinfo = IOBTree()
+
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
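For illustration only, the in-vocabulary rule documented in the new comment
amounts to this (a throwaway sketch, not part of the patch; `index` stands
for any BaseIndex subclass):

    def is_in_vocabulary(index, wid):
        # wid 0 is explicitly OOV; any other wid is in-vocabulary for
        # this index only if _wordinfo has an entry for it.
        return wid != 0 and index._wordinfo.has_key(wid)

Since wid 0 must never be a key of _wordinfo, the has_key test alone is
sufficient, which is what _remove_oov_wids relies on below.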
@@ -63,27 +76,13 @@
"""Returns the wordids for a given docid"""
return WidCode.decode(self._docwords[docid])
+ # Subclass must override.
def index_doc(self, docid, text):
- wids = self._lexicon.sourceToWordIds(text)
- self._doclen[docid] = len(wids)
- self._totaldoclen += len(wids)
-
- wid2count = self._get_frequencies(wids)
- for wid, count in wid2count.items():
- self._add_wordinfo(wid, count, docid)
-
- self._docwords[docid] = WidCode.encode(wids)
- return len(wids)
+ raise NotImplementedError
+ # Subclass must override.
def unindex_doc(self, docid):
- for wid in WidCode.decode(self._docwords[docid]):
- self._del_wordinfo(wid, docid)
-
- del self._docwords[docid]
-
- count = self._doclen[docid]
- del self._doclen[docid]
- self._totaldoclen -= count
+ raise NotImplementedError
def search(self, term):
wids = self._lexicon.termToWordIds(term)
@@ -117,52 +116,13 @@
def _remove_oov_wids(self, wids):
return filter(self._wordinfo.has_key, wids)
+ # Subclass must override.
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
- # TF(D,t) * IDF(t) for every docid D containing t.
- # As currently written, the weights are always 1, and the IIBucket maps
- # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
- # but stored as a scaled_int.
+ # TF(D,t) * IDF(t) for every docid D containing t. wids must not
+ # contain any OOV words.
def _search_wids(self, wids):
- if not wids:
- return []
- N = float(len(self._doclen)) # total # of docs
- meandoclen = self._totaldoclen / N
- K1 = self.K1
- B = self.B
- K1_plus1 = K1 + 1.0
- B_from1 = 1.0 - B
-
- # f(D, t) * (k1 + 1)
- # TF(D, t) = -------------------------------------------
- # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
-
- L = []
- docid2len = self._doclen
- for t in wids:
- assert self._wordinfo.has_key(t) # caller responsible for OOV
- d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
- idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
- result = IIBucket()
- for docid, f in d2f.items():
- lenweight = B_from1 + B * docid2len[docid] / meandoclen
- tf = f * K1_plus1 / (f + K1 * lenweight)
- result[docid] = scaled_int(tf * idf)
- L.append((result, 1))
- return L
-
- # Note about the above: the result is tf * idf. tf is small -- it
- # can't be larger than k1+1 = 2.2. idf is formally unbounded, but
- # is less than 14 for a term that appears in only 1 of a million
- # documents. So the product is probably less than 32, or 5 bits
- # before the radix point. If we did the scaled-int business on
- # both of them, we'd be up to 25 bits. Add 64 of those and we'd
- # be in overflow territory. That's pretty unlikely, so we *could*
- # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
- # as an invariant weight across the whole result. But besides
- # skating near the edge, it's not a speed cure, since the computation
- # of tf would still be done at Python speed, and it's a lot more
- # work than just multiplying by idf.
+ raise NotImplementedError
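A hedged back-of-the-envelope check of the overflow note above, assuming
scaled_int uses a scale factor of 1024 (10 bits) -- an assumption made only
for the sake of the arithmetic, not something this diff states:

    tf_max = 2.2              # k1 + 1
    idf_rare = 13.8           # ~log(1 + 1e6), a 1-in-a-million term
    print tf_max * idf_rare                          # ~30: under 32, 5 bits
    print (tf_max * 1024) * (idf_rare * 1024)        # ~3.2e7: ~25 bits
    print 64 * (tf_max * 1024) * (idf_rare * 1024)   # ~2.0e9: nearing 2**31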
def query_weight(self, terms):
# This method was inherited from the cosine measure, and doesn't
@@ -246,182 +206,3 @@
"""
# implements IDF(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
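As a quick worked example of that IDF (the signature is reconstructed from
the return expression above; values are rounded):

    import math

    def inverse_doc_frequency(term_count, num_items):
        # IDF(q, t) = log(1 + N / f(t))
        return math.log(1.0 + float(num_items) / term_count)

    print inverse_doc_frequency(1, 1000000)        # ~13.8: very rare term
    print inverse_doc_frequency(1000000, 1000000)  # ~0.69: term in every doc,
                                                   # still greater than 0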
-
-"""
-"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
-It's based on probability arguments about how words are distributed in
-documents, not on an abstract vector space model. A long paper by its
-principal inventors gives an excellent overview of how it was derived:
-
- A probabilistic model of information retrieval: development and status
- K. Sparck Jones, S. Walker, S.E. Robertson
- http://citeseer.nj.nec.com/jones98probabilistic.html
-
-Spellings that ignore relevance information (which we don't have) are of this
-high-level form:
-
- score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
-
-where
-
- D a specific document
-
- Q a specific query
-
- t a term (word, atomic phrase, whatever)
-
- D&Q the terms common to D and Q
-
- TF(D, t) a measure of t's importance in D -- a kind of term frequency
- weight
-
- IDF(Q, t) a measure of t's importance in the query and in the set of
- documents as a whole -- a kind of inverse document frequency
- weight
-
-The IDF(Q, t) here is identical to the one used for our cosine measure.
-Since queries are expected to be short, it ignores Q entirely:
-
- IDF(Q, t) = log(1.0 + N / f(t))
-
-where
-
- N the total number of documents
- f(t) the number of documents in which t appears
-
-Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
-that becomes 0 for a term that's in every document, and, e.g., if someone
-is searching for "documentation" on python.org (a term that may well show
-up on every page, due to the top navigation bar), we still want to find the
-pages that use the word a lot (which is TF's job to find, not IDF's -- we
-just want to stop IDF from considering this t to be irrelevant).
-
-The TF(D, t) spellings are more interesting. With lots of variations, the
-most basic spelling is of the form
-
- f(D, t)
- TF(D, t) = ---------------
- f(D, t) + K(D)
-
-where
-
- f(D, t) the number of times t appears in D
- K(D) a measure of the length of D, normalized to mean doc length
-
-The functional *form* f/(f+K) is clever. It's a gross approximation to a
-mixture of two distinct Poisson distributions, based on the idea that t
-probably appears in D for one of two reasons:
-
-1. More or less at random.
-
-2. Because it's important to D's purpose in life ("eliteness" in papers).
-
-Note that f/(f+K) is always between 0 and 1. If f is very large compared to
-K, it approaches 1. If K is very large compared to f, it approaches 0. If
-t appears in D more or less "for random reasons", f is likely to be small,
-and so K will dominate unless it's a very small doc, and the ratio will be
-small. OTOH, if t appears a lot in D, f will dominate unless it's a very
-large doc, and the ratio will be close to 1.
-
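A throwaway numeric illustration of those limits (not index code; the values
are arbitrary):

    def tf_basic(f, K):
        # f/(f+K) -> 1 as f dominates, -> 0 as K dominates
        return f / (f + K)

    print tf_basic(100.0, 1.0)   # ~0.99: many occurrences, short doc
    print tf_basic(1.0, 100.0)   # ~0.01: one occurrence, long doc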
-We use a variation on that simple theme, a simplification of what's called
-BM25 in the literature (it was the 25th stab at a Best Match function from
-the Okapi group; "a simplification" means we're setting some of BM25's more
-esoteric free parameters to 0):
-
- f(D, t) * (k1 + 1)
- TF(D, t) = --------------------
- f(D, t) + k1 * K(D)
-
-where
-
- k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
- the usual default value. This constant adjusts the curve to
- look more like a theoretical 2-Poisson curve.
-
-Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
-an asymptote of k1+1 from below.
-
-Finally, we use
-
- K(D) = (1-b) + b * len(D)/E(len(D))
-
-where
-
- b is another free parameter, discussed below. We use 0.75.
-
- len(D) the length of D in words
-
- E(len(D)) the expected value of len(D) across the whole document set;
- or, IOW, the average document length
-
-b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
-of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
-10 times as often in document d2 as in document d1. If document d2 is
-also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
-
- f(d2, t) * (k1 + 1)
- TF(d2, t) = --------------------------------- =
- f(d2, t) + k1 * len(d2)/E(len(D))
-
- 10 * f(d1, t) * (k1 + 1)
- ----------------------------------------------- = TF(d1, t)
- 10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
-
-because the 10's cancel out. This is appropriate if we believe that a word
-appearing 10x more often in a doc 10x as long does so simply because the
-longer doc is more verbose. If we do believe that, the longer doc and the
-shorter doc are probably equally relevant. OTOH, it *could* be that the
-longer doc is talking about t in greater depth too, in which case it's
-probably more relevant than the shorter doc.
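A quick numeric check of that cancellation with b = 1 (the frequencies and
lengths below are made up purely for illustration):

    def tf_b1(f, doclen, meandoclen, k1=1.2):
        # with b = 1, K(D) = len(D)/E(len(D))
        return f * (k1 + 1.0) / (f + k1 * doclen / meandoclen)

    print tf_b1(2.0, 50.0, 100.0)     # d1
    print tf_b1(20.0, 500.0, 100.0)   # d2: 10x the occurrences and 10x the
                                      # length, yet the same score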
-
-At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
-completely, and a doc scores higher for having more occurrences of a word
-regardless of the doc's length.
-
-Reality is between these extremes, and probably varies by document and word
-too. Reports in the literature suggest that b=0.75 is a good compromise "in
-general", favoring the "verbosity hypothesis" end of the scale.
-
-Putting it all together, the final TF function is
-
- f(D, t) * (k1 + 1)
- TF(D, t) = --------------------------------------------
- f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
-
-with k1=1.2 and b=0.75.
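Spelled out in code with the defaults just named (this merely restates the
arithmetic this patch removes from BaseIndex._search_wids above; it is a
sketch, not part of the patch):

    K1 = 1.2
    B = 0.75

    def okapi_tf(f, doclen, meandoclen):
        lenweight = (1.0 - B) + B * doclen / meandoclen
        return f * (K1 + 1.0) / (f + K1 * lenweight)

    # A document of average length mentioning t three times:
    print okapi_tf(3.0, 100.0, 100.0)   # ~1.57, well below the k1+1 = 2.2 cap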
-
-
-Query Term Weighting
---------------------
-
-I'm ignoring the query adjustment part of Okapi BM25 because I expect our
-queries are very short. Full BM25 takes them into account by adding the
-following to every score(D, Q); it depends on the lengths of D and Q, but
-not on the specific words in Q, or even on whether they appear in D(!):
-
- E(len(D)) - len(D)
- k2 * len(Q) * -------------------
- E(len(D)) + len(D)
-
-Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
-len(D) & E(len(D)) were defined above. The Okapi group set k2 to 0 in TREC-9,
-so it apparently doesn't do much good (or may even hurt).
-
-Full BM25 *also* multiplies the following factor into IDF(Q, t):
-
- f(Q, t) * (k3 + 1)
- ------------------
- f(Q, t) + k3
-
-where k3 is yet another free parameter, and f(Q,t) is the number of times t
-appears in Q. Since we're using short "web style" queries, I expect f(Q,t)
-to always be 1, and then that quotient is
-
- 1 * (k3 + 1)
- ------------ = 1
- 1 + k3
-
-regardless of k3's value. So, in a trivial sense, we are incorporating
-this measure (and optimizing it by not bothering to multiply by 1 <wink>).
-"""
-
=== Products/ZCTextIndex/CosineIndex.py 1.5 => 1.6 ===
BaseIndex.__init__(self, lexicon)
- # wid -> { docid -> frequency }
- self._wordinfo = IOBTree()
+ # ._wordinfo for cosine is wid -> {docid -> weight};
+ # t -> D -> w(D, t)/W(D)
# docid -> W(docid)
self._docweight = IIBTree()
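For context, a hedged sketch of the normalization the new comment alludes to.
The actual per-term weight w(D, t) used by CosineIndex is not visible in this
diff, so the 1 + log f(D, t) below is only an assumed stand-in, and since
_docweight is an IIBTree the real code works with scaled integers rather than
the raw floats shown here:

    import math

    def cosine_normalize(term_freqs):
        # term_freqs: {wid -> f(D, wid)}
        weights = {}
        for wid, f in term_freqs.items():
            weights[wid] = 1.0 + math.log(f)   # assumed w(D, t)
        total = 0.0
        for w in weights.values():
            total += w * w
        W = math.sqrt(total)                   # W(D), the per-doc weight
        for wid in weights.keys():
            weights[wid] = weights[wid] / W    # stored shape: w(D, t)/W(D)
        return weights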
@@ -101,33 +101,6 @@
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
-
- def search(self, term):
- wids = self._lexicon.termToWordIds(term)
- if not wids:
- return None # All docs match
- if 0 in wids:
- wids = filter(None, wids)
- return mass_weightedUnion(self._search_wids(wids))
-
- def search_glob(self, pattern):
- wids = self._lexicon.globToWordIds(pattern)
- return mass_weightedUnion(self._search_wids(wids))
-
- def search_phrase(self, phrase):
- wids = self._lexicon.termToWordIds(phrase)
- if 0 in wids:
- return IIBTree()
- hits = mass_weightedIntersection(self._search_wids(wids))
- if not hits:
- return hits
- code = WidCode.encode(wids)
- result = IIBTree()
- for docid, weight in hits.items():
- docwords = self._docwords[docid]
- if docwords.find(code) >= 0:
- result[docid] = weight
- return result
def _search_wids(self, wids):
if not wids:
=== Products/ZCTextIndex/OkapiIndex.py 1.12 => 1.13 ===
BaseIndex.__init__(self, lexicon)
+ # ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
- # There are two kinds of OOV words: wid 0 is explicitly OOV,
- # and it's possible that the lexicon will return a non-zero wid
- # for a word *we've* never seen (e.g., lexicons can be shared
- # across indices, and a query can contain a word some other
- # index knows about but we don't).
- self._wordinfo = IOBTree()
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
@@ -100,38 +95,6 @@
count = self._doclen[docid]
del self._doclen[docid]
self._totaldoclen -= count
-
- def search(self, term):
- wids = self._lexicon.termToWordIds(term)
- if not wids:
- return None # All docs match
- wids = self._remove_oov_wids(wids)
- return mass_weightedUnion(self._search_wids(wids))
-
- def search_glob(self, pattern):
- wids = self._lexicon.globToWordIds(pattern)
- return mass_weightedUnion(self._search_wids(wids))
-
- def search_phrase(self, phrase):
- wids = self._lexicon.termToWordIds(phrase)
- cleaned_wids = self._remove_oov_wids(wids)
- if len(wids) != len(cleaned_wids):
- # At least one wid was OOV: can't possibly find it.
- return IIBTree()
- scores = self._search_wids(cleaned_wids)
- hits = mass_weightedIntersection(scores)
- if not hits:
- return hits
- code = WidCode.encode(wids)
- result = IIBTree()
- for docid, weight in hits.items():
- docwords = self._docwords[docid]
- if docwords.find(code) >= 0:
- result[docid] = weight
- return result
-
- def _remove_oov_wids(self, wids):
- return filter(self._wordinfo.has_key, wids)
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to