[Zope-CVS] CVS: Products/FieldedTextIndex - CHANGES.txt:1.3 README.txt:1.5 index.py:1.8 test.py:1.6 version.txt:1.2

Mon Jan 12 01:09:43 EST 2004

Update of /cvs-repository/Products/FieldedTextIndex
In directory cvs.zope.org:/tmp/cvs-serv23320

Modified Files:
	CHANGES.txt README.txt index.py test.py version.txt 
Log Message:
Implement individual field weighting in queries


=== Products/FieldedTextIndex/CHANGES.txt 1.2 => 1.3 ===

--- Products/FieldedTextIndex/CHANGES.txt:1.2	Sat Dec 27 22:41:45 2003
+++ Products/FieldedTextIndex/CHANGES.txt	Mon Jan 12 01:09:42 2004
@@ -1,5 +1,11 @@
 FieldTextIndex Changelog
 
+  0.2
+
+    - Added support to weight results differently depending on the field
+      matched using the 'field_weights' query keyword. Thanks to
+      Jean-Francois Doyon for suggesting the feature.
+
   12/27/2003 - 0.1 Release
   
     - Initial Release


=== Products/FieldedTextIndex/README.txt 1.4 => 1.5 ===
--- Products/FieldedTextIndex/README.txt:1.4	Fri Dec 12 00:25:19 2003
+++ Products/FieldedTextIndex/README.txt	Mon Jan 12 01:09:42 2004
@@ -119,6 +119,50 @@
     This would return only objects where the query terms occurred in the
     fields 'Title' or 'Description'.
     
+  Specifying field weights (New in 0.2)
+  
+    It is also possible to weight individual fields differently in a query so
+    that hits on certain fields affect the relevance score more than others. In
+    practical terms, this allows you to make search hits on particular fields
+    push the corresponding objects higher in search results. It allows you to
+    make hits on certain fields more important than others.
+    
+    The 'field_weights' key in the query dictionary is used to specify the
+    weights to apply to each field. The value of 'field_weights' is a
+    dictionary with each field name and its integer weight as its respective
+    keys and values. The relevance scores of the intermediate query results
+    for each field are multiplied by that field's weight before being
+    combined with the results for other fields::
+    
+      result = catalog(dc_fields={"query":"Some search string",
+                                  "field_weights":{"Title":3,
+                                                   "Description":1,
+                                                   "Subject":2}})
+                                                   
+    This would return objects where the query is matched by the 'Title', 
+    'Description' or 'Subject' fields. Matches on 'Title' have their score
+    multiplied by 3. 'Description' and 'Subject' matches are multiplied by
+    1 and 2 respectively.
+    
+    You can specify 'field_weights' with or without also specifying 'fields'.
+    If you specify 'field_weights' without 'fields', then the search is
+    limited to the fields in 'field_weights'. If you also specify 'fields', then
+    all fields specified by it are searched, regardless of which fields appear
+    in 'field_weights'. Fields named in 'fields' but not 'field_weights'
+    are automatically assigned a weight value of one. Fields in 'field_weights'
+    that do not appear in 'fields' are ignored. This allows your application
+    to have 'field_weights' globally configured while still varying the fields
+    actually searched in different queries::
+    
+      result = catalog(dc_fields={"query":"Some search string",
+                                  "fields":["Title", "Description"],
+                                  "field_weights":{"Title":3,
+                                                   "Subject":2}})
+    
+    In this case, 'Title' is searched with a weight of 3 and 'Description'
+    a weight of 1 (the default). 'Subject' is not searched since it does not 
+    appear in 'fields'.
+
   Creating a query form
   
     Queries can also be generated directly from the web request like other


=== Products/FieldedTextIndex/index.py 1.7 => 1.8 ===
--- Products/FieldedTextIndex/index.py:1.7	Mon Dec 15 12:19:45 2003
+++ Products/FieldedTextIndex/index.py	Mon Jan 12 01:09:42 2004
@@ -31,7 +31,8 @@
 from Products.ZCTextIndex.okascore import score
 from BTrees.IOBTree import IOBTree
 from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket
-from BTrees.IIBTree import weightedIntersection, multiunion, difference
+from BTrees.IIBTree \
+    import weightedIntersection, weightedUnion, multiunion, difference
 
 try:
     from Products.PluginIndexes.common.PluggableIndex \
@@ -54,7 +55,7 @@
     
     manage_main = DTMLFile('www/manageIndex', globals())
     
-    query_options = ['query', 'fields']
+    query_options = ['query', 'operator', 'fields', 'field_weights']
     
     security = ClassSecurityInfo()
     security.declareObjectProtected(Permissions.manage_zcatalog_indexes)
@@ -244,18 +245,26 @@
         query_str = ' '.join(record.keys)
         if not query_str:
             return None
+        operator = record.get('operator', 'or')
         fields = record.get('fields')
+        field_weights = record.get('field_weights', {})
+        if field_weights and not fields:
+            fields = field_weights.keys()
         
         if fields:                
             # Get the search field, omitting ones we don't know about
             # This avoids errors in legitimate app code searching a catalog
             # not yet populated with objects containing all expected fields
             fieldids = []
+            weights = {}
             for fname in fields:
                 try:
-                    fieldids.append(self._fields.index(fname))
+                    fid = self._fields.index(fname)
                 except ValueError:
                     pass
+                else:
+                    fieldids.append(fid)
+                    weights[fid] = int(field_weights.get(fname, 1))
             
             if not fieldids:
                 # There were no fields we have indexed
@@ -266,14 +275,15 @@
             # can be accessed by the _search_wids method down the line.
             # this is a hack, but it saves me from overriding the whole 
             # QueryParser and ParseTree (less code == good)
-            self.REQUEST._fielded_text_index_search_fields = fieldids
+            self.REQUEST._fielded_text_index_options = (
+                fieldids, weights, operator)
         else:
             # If no fields are specified, we search all fields
-            self.REQUEST._fielded_text_index_search_fields = None
+            self.REQUEST._fielded_text_index_options = None
             
         tree = QueryParser(self._lexicon).parseQuery(query_str)
         results = tree.executeQuery(self)
-        del self.REQUEST._fielded_text_index_search_fields
+        del self.REQUEST._fielded_text_index_options
         return  results, (self.source_name,)
 
     def getIndexSourceNames(self):
@@ -348,8 +358,19 @@
         REQUEST for the fieldids to search (see _apply_index above)"""
         if not wids:
             return []
-        search_fields = getattr(
-            self.REQUEST, '_fielded_text_index_search_fields', None)
+        options = getattr(self.REQUEST, '_fielded_text_index_options', None)
+        if options is not None:
+            fields, weights, operator = options
+        else:
+            fields = weights = None
+            operator = 'or'
+        if operator == 'or':
+            combineSets = weightedUnion
+        elif operator == 'and':
+            combineSets = weightedIntersection
+        else:
+            raise RuntimeError(
+                'Invalid operator "%s" for FieldedTextIndex query')
         N = float(self.document_count())  # total # of docs
         doclen = self._totaldoclen()
         meandoclen = doclen / N
@@ -366,16 +387,40 @@
         docid2len = self._docweight
         for t in wids:
             d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
-            if search_fields is not None:
+            if fields is not None:
                 fielddocs = self._wordfields[t]
                 docsets = []
-                for fieldid in search_fields:
-                    try:
-                        docsets.append(fielddocs[fieldid])
-                    except KeyError:
-                        pass # No docs for this field
-                w, d2f = weightedIntersection(d2f, multiunion(docsets), 1, 0)
-                if not d2f:
+                if not weights and operator == 'or':
+                    # We have fields, but no field weights and the set op
+                    # is "or". We can use multiunion in this case which
+                    # is faster than "or"-ing each per field result
+                    # separately
+                    for fieldid in fields:
+                        try:
+                            docsets.append(fielddocs[fieldid])
+                        except KeyError:
+                            pass # No docs for this field
+                    w, d2f = weightedIntersection(
+                        d2f, multiunion(docsets), 1, 0)
+                    if not d2f:
+                        continue
+                else:
+                    # We have field weights or the operator is "and"
+                    empty = IIBucket()
+                    r = None
+                    for fieldid in fields:
+                        try:
+                            docs4field = fielddocs[fieldid]
+                        except KeyError:
+                            pass # No docs for this field
+                        else:
+                            w, d2f4field = weightedIntersection(
+                                d2f, docs4field, 1, 0)
+                            idf = inverse_doc_frequency(len(d2f4field), N)
+                            result = IIBucket()
+                            score(result, d2f4field.items(), docid2len, 
+                                  idf, meandoclen)
+                            L.append((result, weights.get(fieldid, 1)))
                     continue
             idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
             result = IIBucket()


=== Products/FieldedTextIndex/test.py 1.5 => 1.6 ===
--- Products/FieldedTextIndex/test.py:1.5	Mon Dec 15 12:19:45 2003
+++ Products/FieldedTextIndex/test.py	Mon Jan 12 01:09:42 2004
@@ -42,12 +42,12 @@
         self.test.REQUEST = SimpleItem()
         self.index = self.test.index
         
-    def index_one(self, DOCID):
-        doc = Doc(title="A title", izzy='A field named izzy')
+    def index_one(self, DOCID, **kw):
+        doc = Doc(title="A title", izzy='A field named izzy', **kw)
         return self.index.index_object(DOCID, doc)
 
-    def index_two(self, DOCID):
-        doc = Doc(title="A different title", clyde='The field of clyde')
+    def index_two(self, DOCID, **kw):
+        doc = Doc(title="A different title", clyde='The field of clyde', **kw)
         return self.index.index_object(DOCID, doc)
         
     def test_construct_with_extra(self):
@@ -211,6 +211,55 @@
         self.assertEqual(self.index._apply_index(
             {'shmields':{'query':'field'}}), None)
     
+    def test_query_op_defaults_to_or(self):
+        self.index_one(1)
+        self.index_two(2)
+        r1, used = self.index._apply_index(
+            {'fields':{'query':'field', 'fields':['yertle', 'clyde']}})
+        r2, used = self.index._apply_index(
+            {'fields':{'query':'field', 'fields':['yertle', 'clyde'],
+                       'operator':'or'}})
+        self.assertEqual(dict(r1), dict(r2))
+        
+    def xxxtest_and_query(self):
+        # XXX operator isn't working yet and may be removed
+        self.index_one(1, common='A common field separate from title')
+        self.index_two(2, common='Another common field')
+        results, used = self.index._apply_index(
+            {'fields':{'query':'title', 'fields':['title', 'common'], 
+                       'operator':'and'}})
+        self.assertEqual(list(results.keys()), [1])
+        
+    def test_weighted_query_one_word(self):
+        self.index_one(1)
+        self.index_two(2)
+        r1, used = self.index._apply_index(
+            {'fields':{'query':'title', 'fields':{'title':1}}})
+        r2, used = self.index._apply_index(
+            {'fields':{'query':'title', 'field_weights':{'title':2}}})
+        self.failUnless(r1[1]*2 == r2[1])
+        
+    def test_weights_default_to_one(self):
+        self.index_one(1)
+        self.index_two(2)
+        r1, used = self.index._apply_index(
+            {'fields':{'query':'title', 'fields':['title']}})
+        r2, used = self.index._apply_index(
+            {'fields':{'query':'title', 'field_weights':{'title':2}}})
+        self.failUnless(r1[1]*2 == r2[1])
+    
+    def test_weighted_query_multi_word(self):
+        self.index_one(1)
+        self.index_two(2)
+        results, used = self.index._apply_index(
+            {'fields':{'query':'different or field or izzy', 
+                       'fields':['title', 'clyde', 'izzy']}})
+        self.failUnless(results[2] < results[1])        
+        results, used = self.index._apply_index(
+            {'fields':{'query':'different or field or izzy', 
+                       'field_weights':{'title':2, 'clyde':1, 'izzy':1}}})
+        self.failUnless(results[2] > results[1])
+        
     def test_phrase_match_all_fields(self):
         self.index_one(1)
         self.index_two(2)


=== Products/FieldedTextIndex/version.txt 1.1.1.1 => 1.2 ===
--- Products/FieldedTextIndex/version.txt:1.1.1.1	Mon Dec  8 01:25:01 2003
+++ Products/FieldedTextIndex/version.txt	Mon Jan 12 01:09:42 2004
@@ -1 +1 @@
-0.1
+0.2a