[Zope-CVS] CVS: Products/ZCTextIndex - BaseIndex.py:1.5 CosineIndex.py:1.13 IIndex.py:1.3 OkapiIndex.py:1.19
Tim Peters
tim.one@comcast.net
Fri, 17 May 2002 02:03:19 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv7484
Modified Files:
BaseIndex.py CosineIndex.py IIndex.py OkapiIndex.py
Log Message:
Combine _{add,del}wordinfo; make length() part of IIndex.
=== Products/ZCTextIndex/BaseIndex.py 1.4 => 1.5 ===
return 10 # arbitrary
- def _get_frequencies(self, wids):
- """Return individual term frequencies."""
- # Computes f(d, t) for each term.
- # Returns a dict mapping wid to the number of times wid appeares
- # in wids, {t -> f(d, t)}
- d = {}
- dget = d.get
- for wid in wids:
- d[wid] = dget(wid, 0) + 1
- return d
-
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
=== Products/ZCTextIndex/CosineIndex.py 1.12 => 1.13 ===
return d.keys(), weights, scaled_int(W)
- DICT_CUTOFF = 10
-
- def _add_wordinfo(self, wid, f, docid):
- # Store a wordinfo in a dict as long as there are less than
- # DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
-
- # The pickle of a dict is smaller than the pickle of an
- # IIBTree, substantially so for small mappings. Thus, we use
- # a dictionary until the mapping reaches DICT_CUTOFF elements.
-
- # The cutoff is chosen based on the implementation
- # characteristics of Python dictionaries. The dict hashtable
- # always has 2**N slots and is resized whenever it is 2/3s
- # full. A pickled dict with 10 elts is half the size of an
- # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
- # choose 10 as the cutoff for now.
-
- # The IIBTree has a smaller in-memory representation than a
- # dictionary, so pickle size isn't the only consideration when
- # choosing the threshold. The pickle of a 500-elt dict is 92%
- # of the size of the same IIBTree, but the dict uses more
- # space when it is live in memory. An IIBTree stores two C
- # arrays of ints, one for the keys and one for the values. It
- # holds upto 120 key-value pairs in a single bucket.
- try:
- map = self._wordinfo[wid]
- except KeyError:
- map = {}
- else:
- # _add_wordinfo() is called for each update. If the map
- # size exceeds the DICT_CUTOFF, convert to an IIBTree.
- if len(map) == self.DICT_CUTOFF:
- map = IIBTree(map)
- map[docid] = f
- self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
- def _del_wordinfo(self, wid, docid):
- try:
- map = self._wordinfo[wid]
- del map[docid]
- except KeyError:
- return
- if len(map) == 0:
- del self._wordinfo[wid]
- return
- if len(map) == self.DICT_CUTOFF:
- new = {}
- for k, v in map.items():
- new[k] = v
- map = new
- self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
def _add_undoinfo(self, docid, wids):
self._docwords[docid] = WidCode.encode(wids)
=== Products/ZCTextIndex/IIndex.py 1.2 => 1.3 ===
"""Interface for an Index."""
+ def length():
+ """Return the number of documents in the index."""
+
def search(term):
"""Execute a search on a single term given as a string.
=== Products/ZCTextIndex/OkapiIndex.py 1.18 => 1.19 ===
return d
- DICT_CUTOFF = 10
-
- def _add_wordinfo(self, wid, f, docid):
- # Store a wordinfo in a dict as long as there are less than
- # DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
-
- # The pickle of a dict is smaller than the pickle of an
- # IIBTree, substantially so for small mappings. Thus, we use
- # a dictionary until the mapping reaches DICT_CUTOFF elements.
-
- # The cutoff is chosen based on the implementation
- # characteristics of Python dictionaries. The dict hashtable
- # always has 2**N slots and is resized whenever it is 2/3s
- # full. A pickled dict with 10 elts is half the size of an
- # IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
- # choose 10 as the cutoff for now.
-
- # The IIBTree has a smaller in-memory representation than a
- # dictionary, so pickle size isn't the only consideration when
- # choosing the threshold. The pickle of a 500-elt dict is 92%
- # of the size of the same IIBTree, but the dict uses more
- # space when it is live in memory. An IIBTree stores two C
- # arrays of ints, one for the keys and one for the values. It
- # holds upto 120 key-value pairs in a single bucket.
- try:
- map = self._wordinfo[wid]
- except KeyError:
- map = {}
- else:
- # _add_wordinfo() is called for each update. If the map
- # size exceeds the DICT_CUTOFF, convert to an IIBTree.
- if len(map) == self.DICT_CUTOFF:
- map = IIBTree(map)
- map[docid] = f
- self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
- def _del_wordinfo(self, wid, docid):
- try:
- map = self._wordinfo[wid]
- del map[docid]
- except KeyError:
- return
- if len(map) == 0:
- del self._wordinfo[wid]
- return
- if len(map) == self.DICT_CUTOFF:
- new = {}
- for k, v in map.items():
- new[k] = v
- map = new
- self._wordinfo[wid] = map # Not redundant, because of Persistency!
-
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in