[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.2
Fred L. Drake, Jr.
fdrake@acm.org
Tue, 30 Apr 2002 17:43:24 -0400
Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv15726
Modified Files:
Tag: TextIndexDS9-branch
Index.py
Log Message:
Rename "doc-frequency" for a word to "word-frequency", with many local
variable renames.
Update calls to the lexicon to use the lexicon's final
interface.
Implement the search() method.
=== Products/ZCTextIndex/Index.py 1.1.2.1 => 1.1.2.2 ===
from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IISet
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
class Index:
@@ -30,7 +30,7 @@
self._lexicon = lexicon
self._fieldname = fieldname
- # wid -> ( doc-frequency, { docid -> frequency } )
+ # wid -> ( word-frequency, { docid -> frequency } )
self._wordinfo = IOBTree()
# docid -> W
@@ -41,7 +41,7 @@
self._docwords = IOBTree()
def index_object(self, docid, obj, threshold=None):
- wids = self._lexicon.textToWordIDs(self._get_object_text(obj))
+ wids = self._lexicon.sourceToWordIds(self._get_object_text(obj))
freqs, docweight = self._get_frequencies(wids)
uniqwids = []
for wid, f in freqs:
@@ -57,6 +57,20 @@
del self._docwords[docid]
del self._docweight[docid]
+ def search(self, term):
+ # XXX this can generate word IDs that are not in the index --
+ # potential denial of service
+ wids = self._lexicon.termToWordIds(term)
+ result = IIBucket()
+ N = len(self._docweight)
+ for wid in wids:
+ wordfreq, map = self._wordinfo[wid]
+ ft = len(map)
+ for docid, f in map.items():
+ w = f * invfreq(N, ft) / self._docweight[docid]
+ result[docid] = result.get(docid, 0) + w
+ return result
+
def _get_object_text(self, obj):
x = getattr(obj, self._fieldname)
if callable(x):
@@ -78,21 +92,24 @@
def _add_wordinfo(self, wid, f, docid):
try:
- olddocfreq, map = self._wordinfo[wid]
+ oldwordfreq, map = self._wordinfo[wid]
except KeyError:
- olddocfreq = 0
+ oldwordfreq = 0
map = IIBTree()
map[docid] = f
- self._wordinfo[wid] = olddocfreq + 1, map
+ self._wordinfo[wid] = oldwordfreq + 1, map
def _del_wordinfo(self, wid, docid):
- olddocfreq, map = self._wordinfo[wid]
- if olddocfreq == 1:
+ oldwordfreq, map = self._wordinfo[wid]
+ if oldwordfreq == 1:
del self._wordinfo[wid]
return
del map[docid]
- self._wordinfo[wid] = olddocfreq - 1, map
+ self._wordinfo[wid] = oldwordfreq - 1, map
def frequency(count):
return count
+
+def invfreq(N, ft):
+ return 1 + (N / ft)