[Zope-CVS] CVS: Products/FieldedTextIndex - CHANGES.txt:1.3
README.txt:1.5 index.py:1.8 test.py:1.6 version.txt:1.2
Casey Duncan
casey at zope.com
Mon Jan 12 01:09:43 EST 2004
Update of /cvs-repository/Products/FieldedTextIndex
In directory cvs.zope.org:/tmp/cvs-serv23320
Modified Files:
CHANGES.txt README.txt index.py test.py version.txt
Log Message:
Implement individual field weighting in queries
=== Products/FieldedTextIndex/CHANGES.txt 1.2 => 1.3 ===
--- Products/FieldedTextIndex/CHANGES.txt:1.2 Sat Dec 27 22:41:45 2003
+++ Products/FieldedTextIndex/CHANGES.txt Mon Jan 12 01:09:42 2004
@@ -1,5 +1,11 @@
FieldedTextIndex Changelog
+ 0.2
+
+ - Added support to weight results differently depending on the field
+ matched using the 'field_weights' query keyword. Thanks to
+ Jean-Francois Doyon for suggesting the feature.
+
12/27/2003 - 0.1 Release
- Initial Release
=== Products/FieldedTextIndex/README.txt 1.4 => 1.5 ===
--- Products/FieldedTextIndex/README.txt:1.4 Fri Dec 12 00:25:19 2003
+++ Products/FieldedTextIndex/README.txt Mon Jan 12 01:09:42 2004
@@ -119,6 +119,50 @@
This would return only objects where the query terms occurred in the
fields 'Title' or 'Description'.
+ Specifying field weights (New in 0.2)
+
+ It is also possible to weight individual fields differently in a query so
+ that hits on certain fields affect the relevance score more than others. In
+ practical terms, this allows you to make search hits on particular fields
+ push the corresponding objects higher in search results. It allows you to
+ make hits on certain fields more important than others.
+
+ The 'field_weights' key in the query dictionary is used to specify the
+ weights to apply to each field. The value of 'field_weights' is a
+ dictionary with each field name and its integer weight as its respective
+ keys and values. The relevance scores for the intermediate query results for
+ each field are multiplied by the weight before being combined with the
+ results for other fields::
+
+ result = catalog(dc_fields={"query":"Some search string",
+ "field_weights":{"Title":3,
+ "Description":1,
+ "Subject":2}})
+
+ This would return objects where the query is matched by the 'Title',
+ 'Description' or 'Subject' fields. Matches on 'Title' have their score
+ multiplied by 3. 'Description' and 'Subject' matches are multiplied by
+ 1 and 2 respectively.
+
+ You can specify 'field_weights' with or without also specifying 'fields'.
+ If you specify 'field_weights' without 'fields', then the search is
+ limited to the fields in 'field_weights'. If you also specify 'fields', then
+ all fields specified by it are searched, regardless of which fields appear
+ in 'field_weights'. Fields named in 'fields' but not 'field_weights'
+ are automatically assigned a weight value of one. Fields in 'field_weights'
+ that do not appear in 'fields' are ignored. This allows your application
+ to have 'field_weights' globally configured while still varying the fields
+ actually searched in different queries::
+
+ result = catalog(dc_fields={"query":"Some search string",
+ "fields":["Title", "Description"],
+ "field_weights":{"Title":3,
+ "Subject":2}})
+
+ In this case, 'Title' is searched with a weight of 3 and 'Description'
+ a weight of 1 (the default). 'Subject' is not searched since it does not
+ appear in 'fields'.
+
Creating a query form
Queries can also be generated directly from the web request like other
=== Products/FieldedTextIndex/index.py 1.7 => 1.8 ===
--- Products/FieldedTextIndex/index.py:1.7 Mon Dec 15 12:19:45 2003
+++ Products/FieldedTextIndex/index.py Mon Jan 12 01:09:42 2004
@@ -31,7 +31,8 @@
from Products.ZCTextIndex.okascore import score
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket
-from BTrees.IIBTree import weightedIntersection, multiunion, difference
+from BTrees.IIBTree \
+ import weightedIntersection, weightedUnion, multiunion, difference
try:
from Products.PluginIndexes.common.PluggableIndex \
@@ -54,7 +55,7 @@
manage_main = DTMLFile('www/manageIndex', globals())
- query_options = ['query', 'fields']
+ query_options = ['query', 'operator', 'fields', 'field_weights']
security = ClassSecurityInfo()
security.declareObjectProtected(Permissions.manage_zcatalog_indexes)
@@ -244,18 +245,26 @@
query_str = ' '.join(record.keys)
if not query_str:
return None
+ operator = record.get('operator', 'or')
fields = record.get('fields')
+ field_weights = record.get('field_weights', {})
+ if field_weights and not fields:
+ fields = field_weights.keys()
if fields:
# Get the search field, omitting ones we don't know about
# This avoids errors in legitimate app code searching a catalog
# not yet populated with objects containing all expected fields
fieldids = []
+ weights = {}
for fname in fields:
try:
- fieldids.append(self._fields.index(fname))
+ fid = self._fields.index(fname)
except ValueError:
pass
+ else:
+ fieldids.append(fid)
+ weights[fid] = int(field_weights.get(fname, 1))
if not fieldids:
# There were no fields we have indexed
@@ -266,14 +275,15 @@
# can be accessed by the _search_wids method down the line.
# this is a hack, but it saves me from overriding the whole
# QueryParser and ParseTree (less code == good)
- self.REQUEST._fielded_text_index_search_fields = fieldids
+ self.REQUEST._fielded_text_index_options = (
+ fieldids, weights, operator)
else:
# If no fields are specified, we search all fields
- self.REQUEST._fielded_text_index_search_fields = None
+ self.REQUEST._fielded_text_index_options = None
tree = QueryParser(self._lexicon).parseQuery(query_str)
results = tree.executeQuery(self)
- del self.REQUEST._fielded_text_index_search_fields
+ del self.REQUEST._fielded_text_index_options
return results, (self.source_name,)
def getIndexSourceNames(self):
@@ -348,8 +358,19 @@
REQUEST for the fieldids to search (see _apply_index above)"""
if not wids:
return []
- search_fields = getattr(
- self.REQUEST, '_fielded_text_index_search_fields', None)
+ options = getattr(self.REQUEST, '_fielded_text_index_options', None)
+ if options is not None:
+ fields, weights, operator = options
+ else:
+ fields = weights = None
+ operator = 'or'
+ if operator == 'or':
+ combineSets = weightedUnion
+ elif operator == 'and':
+ combineSets = weightedIntersection
+ else:
+ raise RuntimeError(
+ 'Invalid operator "%s" for FieldedTextIndex query')
N = float(self.document_count()) # total # of docs
doclen = self._totaldoclen()
meandoclen = doclen / N
@@ -366,16 +387,40 @@
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
- if search_fields is not None:
+ if fields is not None:
fielddocs = self._wordfields[t]
docsets = []
- for fieldid in search_fields:
- try:
- docsets.append(fielddocs[fieldid])
- except KeyError:
- pass # No docs for this field
- w, d2f = weightedIntersection(d2f, multiunion(docsets), 1, 0)
- if not d2f:
+ if not weights and operator == 'or':
+ # We have fields, but no field weights and the set op
+ # is "or". We can use multiunion in this case which
+ # is faster than "or"-ing each per field result
+ # separately
+ for fieldid in fields:
+ try:
+ docsets.append(fielddocs[fieldid])
+ except KeyError:
+ pass # No docs for this field
+ w, d2f = weightedIntersection(
+ d2f, multiunion(docsets), 1, 0)
+ if not d2f:
+ continue
+ else:
+ # We have field weights or the operator is "and"
+ empty = IIBucket()
+ r = None
+ for fieldid in fields:
+ try:
+ docs4field = fielddocs[fieldid]
+ except KeyError:
+ pass # No docs for this field
+ else:
+ w, d2f4field = weightedIntersection(
+ d2f, docs4field, 1, 0)
+ idf = inverse_doc_frequency(len(d2f4field), N)
+ result = IIBucket()
+ score(result, d2f4field.items(), docid2len,
+ idf, meandoclen)
+ L.append((result, weights.get(fieldid, 1)))
continue
idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
result = IIBucket()
=== Products/FieldedTextIndex/test.py 1.5 => 1.6 ===
--- Products/FieldedTextIndex/test.py:1.5 Mon Dec 15 12:19:45 2003
+++ Products/FieldedTextIndex/test.py Mon Jan 12 01:09:42 2004
@@ -42,12 +42,12 @@
self.test.REQUEST = SimpleItem()
self.index = self.test.index
- def index_one(self, DOCID):
- doc = Doc(title="A title", izzy='A field named izzy')
+ def index_one(self, DOCID, **kw):
+ doc = Doc(title="A title", izzy='A field named izzy', **kw)
return self.index.index_object(DOCID, doc)
- def index_two(self, DOCID):
- doc = Doc(title="A different title", clyde='The field of clyde')
+ def index_two(self, DOCID, **kw):
+ doc = Doc(title="A different title", clyde='The field of clyde', **kw)
return self.index.index_object(DOCID, doc)
def test_construct_with_extra(self):
@@ -211,6 +211,55 @@
self.assertEqual(self.index._apply_index(
{'shmields':{'query':'field'}}), None)
+ def test_query_op_defaults_to_or(self):
+ self.index_one(1)
+ self.index_two(2)
+ r1, used = self.index._apply_index(
+ {'fields':{'query':'field', 'fields':['yertle', 'clyde']}})
+ r2, used = self.index._apply_index(
+ {'fields':{'query':'field', 'fields':['yertle', 'clyde'],
+ 'operator':'or'}})
+ self.assertEqual(dict(r1), dict(r2))
+
+ def xxxtest_and_query(self):
+ # XXX operator isn't working yet and may be removed
+ self.index_one(1, common='A common field separate from title')
+ self.index_two(2, common='Another common field')
+ results, used = self.index._apply_index(
+ {'fields':{'query':'title', 'fields':['title', 'common'],
+ 'operator':'and'}})
+ self.assertEqual(list(results.keys()), [1])
+
+ def test_weighted_query_one_word(self):
+ self.index_one(1)
+ self.index_two(2)
+ r1, used = self.index._apply_index(
+ {'fields':{'query':'title', 'fields':{'title':1}}})
+ r2, used = self.index._apply_index(
+ {'fields':{'query':'title', 'field_weights':{'title':2}}})
+ self.failUnless(r1[1]*2 == r2[1])
+
+ def test_weights_default_to_one(self):
+ self.index_one(1)
+ self.index_two(2)
+ r1, used = self.index._apply_index(
+ {'fields':{'query':'title', 'fields':['title']}})
+ r2, used = self.index._apply_index(
+ {'fields':{'query':'title', 'field_weights':{'title':2}}})
+ self.failUnless(r1[1]*2 == r2[1])
+
+ def test_weighted_query_multi_word(self):
+ self.index_one(1)
+ self.index_two(2)
+ results, used = self.index._apply_index(
+ {'fields':{'query':'different or field or izzy',
+ 'fields':['title', 'clyde', 'izzy']}})
+ self.failUnless(results[2] < results[1])
+ results, used = self.index._apply_index(
+ {'fields':{'query':'different or field or izzy',
+ 'field_weights':{'title':2, 'clyde':1, 'izzy':1}}})
+ self.failUnless(results[2] > results[1])
+
def test_phrase_match_all_fields(self):
self.index_one(1)
self.index_two(2)
=== Products/FieldedTextIndex/version.txt 1.1.1.1 => 1.2 ===
--- Products/FieldedTextIndex/version.txt:1.1.1.1 Mon Dec 8 01:25:01 2003
+++ Products/FieldedTextIndex/version.txt Mon Jan 12 01:09:42 2004
@@ -1 +1 @@
-0.1
+0.2a
More information about the Zope-CVS
mailing list