[Zope-CVS] CVS: Products/FieldedTextIndex - index.py:1.10
test.py:1.8
Casey Duncan
casey at zope.com
Mon Jan 19 01:46:00 EST 2004
Update of /cvs-repository/Products/FieldedTextIndex
In directory cvs.zope.org:/tmp/cvs-serv8815
Modified Files:
index.py test.py
Log Message:
Refactor to support default weights and new weighting semantics. Drop operator support for queries.
=== Products/FieldedTextIndex/index.py 1.9 => 1.10 ===
--- Products/FieldedTextIndex/index.py:1.9 Sat Jan 17 00:20:35 2004
+++ Products/FieldedTextIndex/index.py Mon Jan 19 01:45:54 2004
@@ -30,9 +30,9 @@
from Products.ZCTextIndex.SetOps import mass_weightedIntersection
from Products.ZCTextIndex.okascore import score
from BTrees.IOBTree import IOBTree
-from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket
+from BTrees.IIBTree import IIBTree, IITreeSet, IIBucket, IISet
from BTrees.IIBTree \
- import weightedIntersection, weightedUnion, multiunion, difference
+ import weightedIntersection, multiunion, difference, intersection
try:
from Products.PluginIndexes.common.PluggableIndex \
@@ -57,13 +57,11 @@
manage_main = DTMLFile('www/manageIndex', globals())
manage_weights = DTMLFile('www/manageWeights', globals())
- query_options = ['query', 'operator', 'fields', 'field_weights']
+ query_options = ['query', 'fields', 'field_weights']
security = ClassSecurityInfo()
security.declareObjectProtected(Permissions.manage_zcatalog_indexes)
- _default_weights = {}
-
def __init__(
self, id, source_name=None, lexicon=None, extra=None, caller=None, ):
"""Fielded text index constructor
@@ -99,6 +97,7 @@
# number of fields (probably several hundred max) and it will be
# changed infrequently
self._fields = []
+ self._default_weights = {} # map weight => fieldids
## Index specific methods ##
@@ -120,20 +119,25 @@
for convenience when setting from a form. Note that fields can be
assigned weights before objects with those fields have been indexed.
"""
- defaults = {}
- for name, weight in weights.items():
- fieldid = self._field_id(name)
- weight = int(weight)
- if weight != 1:
- defaults[fieldid] = weight
- self._default_weights = defaults
+ self._default_weights = self._mapWeights2Fields(weights)
+
+ def _mapWeights2Fields(self, weights):
+ weights2fields = {}
+ for fieldid, fname in enumerate(self._fields):
+ weight = weights.get(fname, 1)
+ if weight in weights2fields:
+ weights2fields[weight].insert(fieldid)
+ else:
+ weights2fields[weight] = IISet((fieldid,))
+ return weights2fields
security.declareProtected(Permissions.search_zcatalog, 'getDefaultWeights')
def getDefaultWeights(self):
"""Return a dictionary of the default weights for every field"""
defaults = {}
- for fieldid, fname in enumerate(self._fields):
- defaults[fname] = self._default_weights.get(fieldid, 1)
+ for weight, fieldids in self._default_weights.items():
+ for fid in fieldids:
+ defaults[self._fields[fid]] = weight
return defaults
## Pluggable Index API ##
@@ -214,7 +218,6 @@
wids_added = difference(new_wids, old_wids)
wids_removed = difference(old_wids, new_wids)
'''
-
def unindex_object(self, docid):
"""Remove docid from the index"""
@@ -273,41 +276,39 @@
query_str = ' '.join(record.keys)
if not query_str:
return None
- operator = record.get('operator', 'or')
fields = record.get('fields')
- field_weights = record.get('field_weights', {})
- if field_weights and not fields:
- fields = field_weights.keys()
+ field_weights = record.get('field_weights')
if fields:
# Get the search field, omitting ones we don't know about
# This avoids errors in legitimate app code searching a catalog
# not yet populated with objects containing all expected fields
- fieldids = []
- weights = {}
+ fieldids = IISet()
for fname in fields:
try:
fid = self._fields.index(fname)
except ValueError:
pass
else:
- fieldids.append(fid)
- weights[fid] = int(field_weights.get(fname, 1))
+ fieldids.insert(fid)
if not fieldids:
# There were no fields we have indexed
return IIBucket(), (self.source_name,)
-
- # Because the parse tree insulates us from the actual search code
- # pretty completely, we stuff fieldids in the REQUEST so that it
- # can be accessed by the _search_wids method down the line.
- # this is a hack, but it saves me from overriding the whole
- # QueryParser and ParseTree (less code == good)
- self.REQUEST._fielded_text_index_options = (
- fieldids, weights, operator)
else:
- # If no fields are specified, we search all fields
- self.REQUEST._fielded_text_index_options = None
+ fieldids = None
+
+ if field_weights is not None:
+ weights2fields = self._mapWeights2Fields(field_weights)
+ else:
+ weights2fields = self._default_weights
+
+ # Because the parse tree insulates us from the actual search code
+ # pretty completely, we stuff fieldids in the REQUEST so that it
+ # can be accessed by the _search_wids method down the line.
+ # this is a hack, but it saves me from overriding the whole
+ # QueryParser and ParseTree (less code == good)
+ self.REQUEST._fielded_text_index_options = (fieldids, weights2fields)
tree = QueryParser(self._lexicon).parseQuery(query_str)
results = tree.executeQuery(self)
@@ -365,8 +366,13 @@
# I expect this to rarely happen, so I am not concerned
# about possible conflicts arising here
self._fields.append(fname)
+ fieldid = self._fields.index(fname)
+ if 1 in self._default_weights:
+ self._default_weights[1].insert(fieldid)
+ else:
+ self._default_weights[1] = IISet((fieldid,))
self._p_changed = 1
- return self._fields.index(fname)
+ return fieldid
def _mass_add_wordinfo(self, wid2weight, docid):
# Override from BaseIndex
@@ -386,19 +392,8 @@
REQUEST for the fieldids to search (see _apply_index above)"""
if not wids:
return []
- options = getattr(self.REQUEST, '_fielded_text_index_options', None)
- if options is not None:
- fields, weights, operator = options
- else:
- fields = weights = None
- operator = 'or'
- if operator == 'or':
- combineSets = weightedUnion
- elif operator == 'and':
- combineSets = weightedIntersection
- else:
- raise RuntimeError(
- 'Invalid operator "%s" for FieldedTextIndex query')
+ search_fields, weights2fields = getattr(
+ self.REQUEST, '_fielded_text_index_options', None)
N = float(self.document_count()) # total # of docs
doclen = self._totaldoclen()
meandoclen = doclen / N
@@ -415,48 +410,45 @@
docid2len = self._docweight
for t in wids:
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
- if fields is not None:
- try:
- fielddocs = self._wordfields[t]
- except KeyError:
- continue
- docsets = []
- if not weights and operator == 'or':
- # We have fields, but no field weights and the set op
- # is "or". We can use multiunion in this case which
- # is faster than "or"-ing each per field result
- # separately
- for fieldid in fields:
+ fielddocs = None
+ for weight, fields in weights2fields.items():
+ if search_fields is not None:
+ fields4weight = intersection(fields, search_fields)
+ if not fields4weight:
+ continue # Not searching any fields with this weight
+ else:
+ fields4weight = fields # searching all fields
+ if len(fields4weight) == len(self._fields):
+ # We are searching all fields indexed and they all have the
+ # same weight, so we don't need to weight them individually
+ idf = inverse_doc_frequency(len(d2f), N)
+ result = IIBucket()
+ score(result, d2f.items(), docid2len, idf, meandoclen)
+ L.append((result, weight))
+ else:
+ if fielddocs is None:
+ try:
+ fielddocs = self._wordfields[t]
+ except KeyError:
+ # XXX I'm not sure _wordinfo should ever have a
+ # XXX key that _wordfields doesn't. This fixes
+ # XXX it for now at least
+ continue
+ docsets = []
+ for fieldid in fields4weight:
try:
docsets.append(fielddocs[fieldid])
except KeyError:
pass # No docs for this field
- w, d2f = weightedIntersection(
+ w, docs4weight = weightedIntersection(
d2f, multiunion(docsets), 1, 0)
- if not d2f:
+ if not docs4weight:
continue
- else:
- # We have field weights or the operator is "and"
- empty = IIBucket()
- r = None
- for fieldid in fields:
- try:
- docs4field = fielddocs[fieldid]
- except KeyError:
- pass # No docs for this field
- else:
- w, d2f4field = weightedIntersection(
- d2f, docs4field, 1, 0)
- idf = inverse_doc_frequency(len(d2f4field), N)
- result = IIBucket()
- score(result, d2f4field.items(), docid2len,
- idf, meandoclen)
- L.append((result, weights.get(fieldid, 1)))
- continue
- idf = inverse_doc_frequency(len(d2f), N) # an unscaled float
- result = IIBucket()
- score(result, d2f.items(), docid2len, idf, meandoclen)
- L.append((result, 1))
+ idf = inverse_doc_frequency(len(docs4weight), N)
+ result = IIBucket()
+ score(result, docs4weight.items(), docid2len,
+ idf, meandoclen)
+ L.append((result, weight))
return L
def search_phrase(self, phrase):
=== Products/FieldedTextIndex/test.py 1.7 => 1.8 ===
--- Products/FieldedTextIndex/test.py:1.7 Sat Jan 17 00:20:35 2004
+++ Products/FieldedTextIndex/test.py Mon Jan 19 01:45:54 2004
@@ -214,30 +214,10 @@
def test_query_empty_field(self):
self.index_one(1)
self.index_two(2)
- import pdb; pdb.set_trace()
self.index.unindex_object(1)
results, used = self.index._apply_index(
{'fields':{'query':'field', 'fields':['izzy']}})
- self.assertEqual(list(results.keys()), [])
-
- def test_query_op_defaults_to_or(self):
- self.index_one(1)
- self.index_two(2)
- r1, used = self.index._apply_index(
- {'fields':{'query':'field', 'fields':['yertle', 'clyde']}})
- r2, used = self.index._apply_index(
- {'fields':{'query':'field', 'fields':['yertle', 'clyde'],
- 'operator':'or'}})
- self.assertEqual(dict(r1), dict(r2))
-
- def xxxtest_and_query(self):
- # XXX operator isn't working yet and may be removed
- self.index_one(1, common='A common field separate from title')
- self.index_two(2, common='Another common field')
- results, used = self.index._apply_index(
- {'fields':{'query':'title', 'fields':['title', 'common'],
- 'operator':'and'}})
- self.assertEqual(list(results.keys()), [1])
+ self.assertEqual(list(results.keys()), [])
def test_weighted_query_one_word(self):
self.index_one(1)
More information about the Zope-CVS
mailing list