[Zope-CVS] CVS: Products/ZCTextIndex - Index.py:1.1.2.2

Fred L. Drake, Jr. fdrake@acm.org
Tue, 30 Apr 2002 17:43:24 -0400


Update of /cvs-repository/Products/ZCTextIndex
In directory cvs.zope.org:/tmp/cvs-serv15726

Modified Files:
      Tag: TextIndexDS9-branch
	Index.py 
Log Message:
Rename "doc-frequency" for a word to "word-frequency", with many local
variable renames.

Update calls to the lexicon to use its final interface
(textToWordIDs() is now sourceToWordIds()).

Implement the search() method, along with the invfreq() helper it
uses to weight results.


=== Products/ZCTextIndex/Index.py 1.1.2.1 => 1.1.2.2 ===
 
 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IISet
+from BTrees.IIBTree import IIBTree, IIBucket, IISet
 
 
 class Index:
@@ -30,7 +30,7 @@
         self._lexicon = lexicon
         self._fieldname = fieldname
 
-        # wid -> ( doc-frequency, { docid -> frequency } )
+        # wid -> ( word-frequency, { docid -> frequency } )
         self._wordinfo = IOBTree()
 
         # docid -> W
@@ -41,7 +41,7 @@
         self._docwords = IOBTree()
 
     def index_object(self, docid, obj, threshold=None):
-        wids = self._lexicon.textToWordIDs(self._get_object_text(obj))
+        wids = self._lexicon.sourceToWordIds(self._get_object_text(obj))
         freqs, docweight = self._get_frequencies(wids)
         uniqwids = []
         for wid, f in freqs:
@@ -57,6 +57,20 @@
         del self._docwords[docid]
         del self._docweight[docid]
 
+    def search(self, term):
+        # XXX this can generate word IDs that are not in the index --
+        # potential denial of service
+        wids = self._lexicon.termToWordIds(term)
+        result = IIBucket()
+        N = len(self._docweight)
+        for wid in wids:
+            wordfreq, map = self._wordinfo[wid]
+            ft = len(map)
+            for docid, f in map.items():
+                w = f * invfreq(N, ft) / self._docweight[docid]
+                result[docid] = result.get(docid, 0) + w
+        return result
+
     def _get_object_text(self, obj):
         x = getattr(obj, self._fieldname)
         if callable(x):
@@ -78,21 +92,24 @@
 
     def _add_wordinfo(self, wid, f, docid):
         try:
-            olddocfreq, map = self._wordinfo[wid]
+            oldwordfreq, map = self._wordinfo[wid]
         except KeyError:
-            olddocfreq = 0
+            oldwordfreq = 0
             map = IIBTree()
         map[docid] = f
-        self._wordinfo[wid] = olddocfreq + 1, map
+        self._wordinfo[wid] = oldwordfreq + 1, map
 
     def _del_wordinfo(self, wid, docid):
-        olddocfreq, map = self._wordinfo[wid]
-        if olddocfreq == 1:
+        oldwordfreq, map = self._wordinfo[wid]
+        if oldwordfreq == 1:
             del self._wordinfo[wid]
             return
         del map[docid]
-        self._wordinfo[wid] = olddocfreq - 1, map
+        self._wordinfo[wid] = oldwordfreq - 1, map
 
 
 def frequency(count):
     return count
+
+def invfreq(N, ft):
+    return 1 + (N / ft)