[Zope-CVS] CVS: Products/FieldedTextIndex - index.py:1.10 test.py:1.8

Mon Jan 19 01:46:00 EST 2004

Update of /cvs-repository/Products/FieldedTextIndex
In directory cvs.zope.org:/tmp/cvs-serv8815

Modified Files:
	index.py test.py 
Log Message:
Refactor to support default weights and new weighting semantics. Drop operator support for queries.


=== Products/FieldedTextIndex/index.py 1.9 => 1.10 ===

--- Products/FieldedTextIndex/index.py:1.9	Sat Jan 17 00:20:35 2004
+++ Products/FieldedTextIndex/index.py	Mon Jan 19 01:45:54 2004
@@ -30,9 +30,9 @@
 from Products.ZCTextIndex.SetOps import mass_weightedIntersection
 from Products.ZCTextIndex.okascore import score
 from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket
+from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket, IISet
 from BTrees.IIBTree \
-    import weightedIntersection, weightedUnion, multiunion, difference
+    import weightedIntersection, multiunion, difference, intersection
 
 try:
     from Products.PluginIndexes.common.PluggableIndex \
@@ -57,13 +57,11 @@
     manage_main = DTMLFile('www/manageIndex', globals())
     manage_weights = DTMLFile('www/manageWeights', globals())
     
-    query_options = ['query', 'operator', 'fields', 'field_weights']
+    query_options = ['query', 'fields', 'field_weights']
     
     security = ClassSecurityInfo()
     security.declareObjectProtected(Permissions.manage_zcatalog_indexes)
     
-    _default_weights = {}
-    
     def __init__(
         self, id, source_name=None, lexicon=None, extra=None, caller=None, ):
         """Fielded text index constructor
@@ -99,6 +97,7 @@
         # number of fields (probably several hundred max) and it will be
         # changed infrequently
         self._fields = []
+        self._default_weights = {} # map weight => fieldids
     
     ## Index specific methods ##
     
@@ -120,20 +119,25 @@
         for convenience when setting from a form. Note that fields can be
         assigned weights before objects with those fields have been indexed.
         """
-        defaults = {}
-        for name, weight in weights.items():
-            fieldid = self._field_id(name)
-            weight = int(weight)
-            if weight != 1:
-                defaults[fieldid] = weight
-        self._default_weights = defaults
+        self._default_weights = self._mapWeights2Fields(weights)
+    
+    def _mapWeights2Fields(self, weights):
+        weights2fields = {}
+        for fieldid, fname in enumerate(self._fields):                
+            weight = weights.get(fname, 1)
+            if weight in weights2fields:
+                weights2fields[weight].insert(fieldid)
+            else:
+                weights2fields[weight] = IISet((fieldid,))
+        return weights2fields
     
     security.declareProtected(Permissions.search_zcatalog, 'getDefaultWeights')
     def getDefaultWeights(self):
         """Return a dictionary of the default weights for every field"""
         defaults = {}
-        for fieldid, fname in enumerate(self._fields):
-            defaults[fname] = self._default_weights.get(fieldid, 1)
+        for weight, fieldids in self._default_weights.items():
+            for fid in fieldids:
+                defaults[self._fields[fid]] = weight
         return defaults
     
     ## Pluggable Index API ##
@@ -214,7 +218,6 @@
                     wids_added = difference(new_wids, old_wids)
                     wids_removed = difference(old_wids, new_wids)
     '''
-                    
     
     def unindex_object(self, docid):
         """Remove docid from the index"""
@@ -273,41 +276,39 @@
         query_str = ' '.join(record.keys)
         if not query_str:
             return None
-        operator = record.get('operator', 'or')
         fields = record.get('fields')
-        field_weights = record.get('field_weights', {})
-        if field_weights and not fields:
-            fields = field_weights.keys()
+        field_weights = record.get('field_weights')
         
         if fields:                
             # Get the search field, omitting ones we don't know about
             # This avoids errors in legitimate app code searching a catalog
             # not yet populated with objects containing all expected fields
-            fieldids = []
-            weights = {}
+            fieldids = IISet()
             for fname in fields:
                 try:
                     fid = self._fields.index(fname)
                 except ValueError:
                     pass
                 else:
-                    fieldids.append(fid)
-                    weights[fid] = int(field_weights.get(fname, 1))
+                    fieldids.insert(fid)
             
             if not fieldids:
                 # There were no fields we have indexed
                 return IIBucket(), (self.source_name,)
-
-            # Because the parse tree insulates us from the actual search code
-            # pretty completely, we stuff fieldids in the REQUEST so that it
-            # can be accessed by the _search_wids method down the line.
-            # this is a hack, but it saves me from overriding the whole 
-            # QueryParser and ParseTree (less code == good)
-            self.REQUEST._fielded_text_index_options = (
-                fieldids, weights, operator)
         else:
-            # If no fields are specified, we search all fields
-            self.REQUEST._fielded_text_index_options = None
+            fieldids = None
+            
+        if field_weights is not None:
+            weights2fields = self._mapWeights2Fields(field_weights)
+        else:
+            weights2fields = self._default_weights
+                                
+        # Because the parse tree insulates us from the actual search code
+        # pretty completely, we stuff fieldids in the REQUEST so that it
+        # can be accessed by the _search_wids method down the line.
+        # this is a hack, but it saves me from overriding the whole 
+        # QueryParser and ParseTree (less code == good)
+        self.REQUEST._fielded_text_index_options = (fieldids, weights2fields)
             
         tree = QueryParser(self._lexicon).parseQuery(query_str)
         results = tree.executeQuery(self)
@@ -365,8 +366,13 @@
             # I expect this to rarely happen, so I am not concerned
             # about possible conflicts arising here
             self._fields.append(fname)
+            fieldid = self._fields.index(fname)
+            if 1 in self._default_weights:
+                self._default_weights[1].insert(fieldid)
+            else:
+                self._default_weights[1] = IISet((fieldid,))
             self._p_changed = 1
-            return self._fields.index(fname)
+            return fieldid
     
     def _mass_add_wordinfo(self, wid2weight, docid):
         # Override from BaseIndex
@@ -386,19 +392,8 @@
         REQUEST for the fieldids to search (see _apply_index above)"""
         if not wids:
             return []
-        options = getattr(self.REQUEST, '_fielded_text_index_options', None)
-        if options is not None:
-            fields, weights, operator = options
-        else:
-            fields = weights = None
-            operator = 'or'
-        if operator == 'or':
-            combineSets = weightedUnion
-        elif operator == 'and':
-            combineSets = weightedIntersection
-        else:
-            raise RuntimeError(
-                'Invalid operator "%s" for FieldedTextIndex query')
+        search_fields, weights2fields = getattr(
+            self.REQUEST, '_fielded_text_index_options', None)
         N = float(self.document_count())  # total # of docs
         doclen = self._totaldoclen()
         meandoclen = doclen / N
@@ -415,48 +410,45 @@
         docid2len = self._docweight
         for t in wids:
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-            if fields is not None:
-                try:
-                    fielddocs = self._wordfields[t]
-                except KeyError:
-                    continue 
-                docsets = []
-                if not weights and operator == 'or':
-                    # We have fields, but no field weights and the set op
-                    # is "or". We can use multiunion in this case which
-                    # is faster than "or"-ing each per field result
-                    # separately
-                    for fieldid in fields:
+            fielddocs = None
+            for weight, fields in weights2fields.items():
+                if search_fields is not None:
+                    fields4weight = intersection(fields, search_fields)
+                    if not fields4weight:
+                        continue # Not searching any fields with this weight
+                else:
+                    fields4weight = fields # searching all fields
+                if len(fields4weight) == len(self._fields):
+                    # We are searching all fields indexed and they all have the
+                    # same weight, so we don't need to weight them individually
+                    idf = inverse_doc_frequency(len(d2f), N)
+                    result = IIBucket()
+                    score(result, d2f.items(), docid2len, idf, meandoclen)
+                    L.append((result, weight))
+                else:
+                    if fielddocs is None:
+                        try:
+                            fielddocs = self._wordfields[t]
+                        except KeyError:
+                            # XXX I'm not sure _wordinfo should ever have a
+                            # XXX key that _wordfields doesn't. This fixes
+                            # XXX it for now at least
+                            continue
+                    docsets = []
+                    for fieldid in fields4weight:
                         try:
                             docsets.append(fielddocs[fieldid])
                         except KeyError:
                             pass # No docs for this field
-                    w, d2f = weightedIntersection(
+                    w, docs4weight = weightedIntersection(
                         d2f, multiunion(docsets), 1, 0)
-                    if not d2f:
+                    if not docs4weight:
                         continue
-                else:
-                    # We have field weights or the operator is "and"
-                    empty = IIBucket()
-                    r = None
-                    for fieldid in fields:
-                        try:
-                            docs4field = fielddocs[fieldid]
-                        except KeyError:
-                            pass # No docs for this field
-                        else:
-                            w, d2f4field = weightedIntersection(
-                                d2f, docs4field, 1, 0)
-                            idf = inverse_doc_frequency(len(d2f4field), N)
-                            result = IIBucket()
-                            score(result, d2f4field.items(), docid2len, 
-                                  idf, meandoclen)
-                            L.append((result, weights.get(fieldid, 1)))
-                    continue
-            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
-            result = IIBucket()
-            score(result, d2f.items(), docid2len, idf, meandoclen)
-            L.append((result, 1))
+                    idf = inverse_doc_frequency(len(docs4weight), N)
+                    result = IIBucket()
+                    score(result, docs4weight.items(), docid2len, 
+                          idf, meandoclen)
+                    L.append((result, weight))
         return L
 
     def search_phrase(self, phrase):


=== Products/FieldedTextIndex/test.py 1.7 => 1.8 ===
--- Products/FieldedTextIndex/test.py:1.7	Sat Jan 17 00:20:35 2004
+++ Products/FieldedTextIndex/test.py	Mon Jan 19 01:45:54 2004
@@ -214,30 +214,10 @@
     def test_query_empty_field(self):
         self.index_one(1)
         self.index_two(2)
-        import pdb; pdb.set_trace()
         self.index.unindex_object(1)
         results, used = self.index._apply_index(
             {'fields':{'query':'field', 'fields':['izzy']}})
-        self.assertEqual(list(results.keys()), [])        
-    
-    def test_query_op_defaults_to_or(self):
-        self.index_one(1)
-        self.index_two(2)
-        r1, used = self.index._apply_index(
-            {'fields':{'query':'field', 'fields':['yertle', 'clyde']}})
-        r2, used = self.index._apply_index(
-            {'fields':{'query':'field', 'fields':['yertle', 'clyde'],
-                       'operator':'or'}})
-        self.assertEqual(dict(r1), dict(r2))
-        
-    def xxxtest_and_query(self):
-        # XXX operator isn't working yet and may be removed
-        self.index_one(1, common='A common field separate from title')
-        self.index_two(2, common='Another common field')
-        results, used = self.index._apply_index(
-            {'fields':{'query':'title', 'fields':['title', 'common'], 
-                       'operator':'and'}})
-        self.assertEqual(list(results.keys()), [1])
+        self.assertEqual(list(results.keys()), [])
         
     def test_weighted_query_one_word(self):
         self.index_one(1)