[Zope-Checkins] CVS: Zope/lib/python/Products/ZCatalog - Catalog.py:1.98.6.2 IZCatalog.py:1.4.2.1 Lazy.py:1.7.6.1
Casey Duncan
casey@zope.com
Tue, 10 Dec 2002 10:55:30 -0500
Update of /cvs-repository/Zope/lib/python/Products/ZCatalog
In directory cvs.zope.org:/tmp/cvs-serv31233/lib/python/Products/ZCatalog
Modified Files:
Tag: Zope-2_6-branch
Catalog.py IZCatalog.py Lazy.py
Log Message:
Backport catalog sort speedups from the HEAD sans ZCatalog and PlugInIndex API changes
=== Zope/lib/python/Products/ZCatalog/Catalog.py 1.98.6.1 => 1.98.6.2 ===
--- Zope/lib/python/Products/ZCatalog/Catalog.py:1.98.6.1 Mon Nov 25 15:55:32 2002
+++ Zope/lib/python/Products/ZCatalog/Catalog.py Tue Dec 10 10:55:00 2002
@@ -20,7 +20,7 @@
from Missing import MV
from zLOG import LOG, ERROR
-from Lazy import LazyMap, LazyFilter, LazyCat
+from Lazy import LazyMap, LazyFilter, LazyCat, LazyValues
from CatalogBrains import AbstractCatalogBrain, NoBrainer
from BTrees.IIBTree import intersection, weightedIntersection, IISet
@@ -30,9 +30,10 @@
from Products.PluginIndexes.common.randid import randid
import time, sys, types
+from bisect import bisect
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
- """ An Object Catalog
+ """ An Object Catalog
An Object Catalog maintains a table of object metadata, and a
series of manageable indexes to quickly search for objects
@@ -425,12 +426,17 @@
## This is the Catalog search engine. Most of the heavy lifting happens below
- def _indexedSearch(self, request, sort_index, append, used):
- """
- Iterate through the indexes, applying the query to each one.
- """
- rs = None # resultset
- data = self.data
+ def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
+ """Iterate through the indexes, applying the query to each one. If
+ merge is true then return a lazy result set (sorted if appropriate)
+ otherwise return the raw (possibly scored) results for later merging.
+ Limit is used in conjuntion with sorting or scored results to inform
+ the catalog how many results you are really interested in. The catalog
+ can then use optimizations to save time and memory. The number of
+ results is not guaranteed to fall within the limit however, you should
+ still slice or batch the results as usual."""
+
+ rs = None # resultset
# Indexes fulfill a fairly large contract here. We hand each
# index the request mapping we are given (which may be composed
@@ -449,16 +455,6 @@
# Note that if the indexes find query arguments, but the end result
# is an empty sequence, we do nothing
- # If sort_index is None, this method should pass sequences of
- # catalog records to append(). The sequences will be concatenated
- # together to generate the result set.
- # If sort_index is not None, this method should instead pass pairs
- # to append(), each pair containing a sort key and a sequence of
- # catalog records.
- # In either case, the sequences may be lazified.
-
- if used is None:
- used = {}
for i in self.indexes.keys():
index = self.getIndex(i)
_apply_index = getattr(index, "_apply_index", None)
@@ -468,19 +464,17 @@
if r is not None:
r, u = r
- for name in u:
- used[name] = 1
w, rs = weightedIntersection(rs, r)
-
+
if rs is None:
# None of the indexes found anything to do with the request
# We take this to mean that the query was empty (an empty filter)
# and so we return everything in the catalog
if sort_index is None:
- rs = data.items()
- append(LazyMap(self.instantiate, rs, len(self)))
+ return LazyMap(self.instantiate, self.data.items(), len(self))
else:
- self._build_sorted_results(data, sort_index, append)
+ return self.sortResults(
+ self.data, sort_index, reverse, limit, merge)
elif rs:
# We got some results from the indexes.
# Sort and convert to sequences.
@@ -488,8 +482,11 @@
# having a 'values' means we have a data structure with
# scores. Build a new result set, sort it by score, reverse
# it, compute the normalized score, and Lazify it.
- rset = rs.byValue(0) # sort it by score
- max = float(rset[0][0])
+
+ # For now we cannot return raw scores for later merging :^(
+
+ rs = rs.byValue(0) # sort it by score
+ max = float(rs[0][0])
# Here we define our getter function inline so that
# we can conveniently store the max value as a default arg
@@ -506,52 +503,58 @@
r.data_record_score_ = score
r.data_record_normalized_score_ = int(100. * score / max)
return r
-
- # Lazify the results
- append(LazyMap(getScoredResult, rset))
+
+ return LazyMap(getScoredResult, rs, len(rs))
elif sort_index is None and not hasattr(rs, 'values'):
- # no scores? Just Lazify.
+ # no scores
if hasattr(rs, 'keys'):
rs = rs.keys()
- append(LazyMap(self.__getitem__, rs))
+ return LazyMap(self.__getitem__, rs, len(rs))
else:
# sort. If there are scores, then this block is not
# reached, therefore 'sort-on' does not happen in the
# context of a text index query. This should probably
# sort by relevance first, then the 'sort-on' attribute.
- self._build_sorted_results(rs,sort_index,append)
-
- #print 'ZCatalog search used', used.keys()
- return used
+ return self.sortResults(rs, sort_index, reverse, limit, merge)
+ else:
+ # Empty result set
+ return LazyCat(rs)
- def _build_sorted_results(self,rs,sort_index,append):
- # This function will .append pairs where the first item
- # in the pair is a sort key, and the second item in the
- # pair is a sequence of results which share the same
- # sort key. Later on the list to which these things
- # are .append()ed will be .sort()ed, and the first element
- # of each pair stripped.
+ def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
+ # Sort a result set using a sort index. Return a lazy
+ # result set in sorted order if merge is true; otherwise
+ # return a list of (sortkey, uid, getter_function) tuples.
#
# The two 'for' loops in here contribute a significant
# proportion of the time to perform an indexed search.
# Try to avoid all non-local attribute lookup inside
# those loops.
+ assert limit is None or limit > 0, 'Limit value must be 1 or greater'
_lazymap = LazyMap
_intersection = intersection
_self__getitem__ = self.__getitem__
+ _index_keyForDocument = sort_index.keyForDocument
_None = None
- if (len(rs) > (len(sort_index) * 4)):
+ _keyerror = KeyError
+ result = []
+ append = result.append
+ if hasattr(rs, 'keys'):
+ rs = rs.keys()
+ rlen = len(rs)
+
+ if limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1))):
# The result set is much larger than the sorted index,
# so iterate over the sorted index for speed.
+ # This is rarely exercised in practice...
+
+ length = 0
try:
intersection(rs, IISet(()))
except TypeError:
# rs is not an object in the IIBTree family.
# Try to turn rs into an IISet.
- if hasattr(rs, 'keys'):
- rs = rs.keys()
rs = IISet(rs)
for k, intset in sort_index.items():
@@ -564,31 +567,99 @@
if keys is not _None:
# Is this ever true?
intset = keys()
- append((k, _lazymap(_self__getitem__, intset)))
+ length += len(intset)
+ append((k, intset, _self__getitem__))
# Note that sort keys are unique.
- else:
- # Iterate over the result set.
- if hasattr(rs, 'keys'):
- rs = rs.keys()
- _sort_index_keyForDocument = sort_index.keyForDocument
- _keyerror = KeyError
+
+ if merge:
+ result.sort()
+ if reverse:
+ result.reverse()
+ result = LazyCat(LazyValues(result), length)
+ else:
+ return result
+ elif limit is None or (limit * 4 > rlen):
+ # Iterate over the result set, getting sort keys from the index
for did in rs:
try:
- key = _sort_index_keyForDocument(did)
+ key = _index_keyForDocument(did)
except _keyerror:
- # This document is not in the sort key index.
- # skip it.
+ # This document is not in the sort key index, skip it.
pass
else:
- # We want the sort keys to be unique so that
- # .sort()ing the list does not involve comparing
- # _lazymap objects, which is slow. To ensure
- # uniqueness the first element of each pair is
- # actually a tuple of:
- # (real sort key, some unique number)
- lm = _lazymap(_self__getitem__, [did])
- key = key, id(lm)
- append((key,lm))
+ append((key, did, _self__getitem__))
+ # The reference back to __getitem__ is used in case
+ # we do not merge now and need to intermingle the
+ # results with those of other catalogs while avoiding
+ # the cost of instantiating a LazyMap per result
+ if merge:
+ result.sort()
+ if reverse:
+ result.reverse()
+ if limit is not None:
+ result = result[:limit]
+ result = LazyValues(result)
+ else:
+ return result
+ elif reverse:
+ # Limit/sort results using the N-Best algorithm.
+ # This is faster for large sets than a full sort
+ # and uses far less memory.
+ keys = []
+ n = 0
+ worst = None
+ for did in rs:
+ try:
+ key = _index_keyForDocument(did)
+ except _keyerror:
+ # This document is not in the sort key index, skip it.
+ pass
+ else:
+ if n >= limit and key <= worst:
+ continue
+ i = bisect(keys, key)
+ keys.insert(i, key)
+ result.insert(i, (key, did, _self__getitem__))
+ if n == limit:
+ del keys[0], result[0]
+ else:
+ n += 1
+ worst = keys[0]
+ result.reverse()
+ if merge:
+ result = LazyValues(result)
+ else:
+ return result
+ elif not reverse:
+ # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
+ keys = []
+ n = 0
+ best = None
+ for did in rs:
+ try:
+ key = _index_keyForDocument(did)
+ except _keyerror:
+ # This document is not in the sort key index, skip it.
+ pass
+ else:
+ if n >= limit and key >= best:
+ continue
+ i = bisect(keys, key)
+ keys.insert(i, key)
+ result.insert(i, (key, did, _self__getitem__))
+ if n == limit:
+ del keys[-1], result[-1]
+ else:
+ n += 1
+ best = keys[-1]
+ if merge:
+ result = LazyValues(result)
+ else:
+ return result
+
+ result = LazyMap(self.__getitem__, result, len(result))
+ result.actual_result_count = rlen
+ return result
def _get_sort_attr(self, attr, kw):
"""Helper function to find sort-on or sort-order."""
@@ -626,32 +697,23 @@
else:
return None
-
def searchResults(self, REQUEST=None, used=None, _merge=1, **kw):
+ # The used argument is deprecated and is ignored
if REQUEST is None and not kw:
# Try to acquire request if we get no args for bw compat
REQUEST = getattr(self, 'REQUEST', None)
args = CatalogSearchArgumentsMap(REQUEST, kw)
sort_index = self._getSortIndex(args)
+ sort_limit = self._get_sort_attr('limit', args)
+ reverse = 0
+ if sort_index is not None:
+ order = self._get_sort_attr("order", args)
+ if (isinstance(order, types.StringType) and
+ order.lower() in ('reverse', 'descending')):
+ reverse = 1
# Perform searches with indexes and sort_index
- r = []
- used = self._indexedSearch(args, sort_index, r.append, used)
- if not _merge:
- # Postpone merging and sorting. This provides a way to
- # efficiently sort results merged from multiple queries
- # or multiple catalogs.
- return r
- else:
- has_sort_keys = 0
- reverse = 0
- if sort_index is not None:
- has_sort_keys = 1
- order = self._get_sort_attr("order", args)
- if (isinstance(order, types.StringType) and
- order.lower() in ('reverse', 'descending')):
- reverse = 1
- return mergeResults(r, has_sort_keys, reverse)
-
+ return self.search(args, sort_index, reverse, sort_limit, _merge)
+
__call__ = searchResults
@@ -694,37 +756,28 @@
return 0
else:
return 1
+
-
-def mergeResults(r, has_sort_keys, reverse):
+def mergeResults(results, has_sort_keys, reverse):
"""Sort/merge sub-results, generating a flat sequence.
-
- The contents of r depend on whether has_sort_keys is set.
- If not has_sort_keys, r contains sequences of records.
- If has_sort_keys, r contains pairs of (sort_key, sequence)
- and now we have to sort the results.
+
+ results is a list of result set sequences, either all with sort keys or all without
"""
- if not r:
- return LazyCat(r)
- elif len(r) == 1:
- if not has_sort_keys:
- return r[0]
- else:
- return r[0][1]
+ if not has_sort_keys:
+ return LazyCat(results)
else:
- if not has_sort_keys:
- # Note that the length of the final sequence is not
- # the same as the length of r, since r may contain
- # sequences of different sizes.
- return LazyCat(r)
+ # Concatenate the catalog results into one list and sort it
+ # Each result set is a list of tuples, each with three values:
+ # (sortkey, docid, catalog__getitem__)
+ if len(results) > 1:
+ all = []
+ for r in results:
+ all.extend(r)
+ elif len(results) == 1:
+ all = results[0]
else:
- r.sort()
- if reverse:
- r.reverse()
- size = 0
- tmp = []
- for i in r:
- elt = i[1]
- tmp.append(elt)
- size += len(elt)
- return LazyCat(tmp, size)
+ return []
+ all.sort()
+ if reverse:
+ all.reverse()
+ return LazyMap(lambda rec: rec[2](rec[1]), all, len(all))
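For reference, here is a minimal standalone sketch of the bounded-insertion
("N-Best") selection that the reverse branch of sortResults above performs when
a sort limit is given. The function name and the docids/key_for_document
arguments are illustrative stand-ins for the catalog's result set and the sort
index's keyForDocument; only the selection logic mirrors the committed code.

    from bisect import bisect

    def n_best(docids, key_for_document, limit):
        # Keep only the `limit` largest sort keys seen so far. `keys`
        # mirrors `result` so bisect can find the insertion point without
        # comparing the full (key, docid) tuples.
        keys = []
        result = []
        n = 0
        worst = None
        for did in docids:
            try:
                key = key_for_document(did)
            except KeyError:
                continue  # document has no sort key, skip it
            if n >= limit and key <= worst:
                continue  # cannot displace anything we already kept
            i = bisect(keys, key)
            keys.insert(i, key)
            result.insert(i, (key, did))
            if n == limit:
                del keys[0], result[0]  # drop the current smallest entry
            else:
                n += 1
            worst = keys[0]
        result.reverse()  # largest key first, matching reverse=1
        return result

With, say, 50,000 docids and a limit of 20 this touches each sort key once and
never holds more than limit + 1 entries at a time, which is why it beats a full
sort of the whole result set in both time and memory.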
=== Zope/lib/python/Products/ZCatalog/IZCatalog.py 1.4 => 1.4.2.1 ===
--- Zope/lib/python/Products/ZCatalog/IZCatalog.py:1.4 Thu Sep 5 17:22:39 2002
+++ Zope/lib/python/Products/ZCatalog/IZCatalog.py Tue Dec 10 10:55:00 2002
@@ -161,6 +161,10 @@
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
+
+ sort_limit -- An optimization hint to tell the catalog how many
+ results you are really interested in. See the limit argument
+ to the search method for more details.
There are some rules to consider when querying this method:
@@ -192,5 +196,30 @@
def __call__(REQUEST=None, **kw):
"""Search the catalog, the same way as 'searchResults'.
"""
-
+
+ def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
+ """Programmatic search interface, use for searching the catalog from
+ scripts.
+
+ query_request -- Dictionary containing catalog query. This uses the
+ same format as searchResults.
+
+ sort_index -- Name of sort index
+
+ reverse -- Boolean, reverse sort order (defaults to false)
+
+ limit -- Limit sorted result count to the n best records. This is an
+ optimization hint used in conjunction with a sort_index. If possible
+ ZCatalog will use a different sort algorithm that uses much less memory
+ and scales better than a full sort. The actual number of records
+ returned is not guaranteed to be <= limit. You still need to apply the
+ same batching to the results. Since the len() of the results will no
+ longer be the actual result count, you can use the
+ "actual_result_count" attribute of the lazy result object instead to
+ determine the size of the full result set.
+
+ merge -- Return merged, lazy results (like searchResults) or raw
+ results for later merging. This can be used to perform multiple
+ queries (even across catalogs) and merge and sort the combined results.
+ """
__doc__ = IZCatalog.__doc__ + __doc__
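To make the merge/limit contract above concrete, here is a hedged usage
sketch. It assumes a ZCatalog object exposing the search method documented in
this interface; the catalog names, index name and query are placeholders. Raw
(merge=0) results from two catalogs are combined with mergeResults from
Products.ZCatalog.Catalog.

    from Products.ZCatalog.Catalog import mergeResults

    query = {'meta_type': 'Document'}   # placeholder query

    # merge=0 returns raw (sortkey, docid, getter) tuples instead of a lazy,
    # already-merged result set, so several catalogs can be combined first.
    raw = []
    for catalog in (press_catalog, news_catalog):   # hypothetical catalogs
        raw.append(catalog.search(query, sort_index='modified',
                                  reverse=1, limit=20, merge=0))

    # Sort the combined tuples and lazily instantiate brains as they are read.
    merged = mergeResults(raw, has_sort_keys=1, reverse=1)
    for brain in merged[:20]:
        print brain.getPath()

Passing has_sort_keys=1 tells mergeResults that every tuple carries a sort key,
so it can sort the concatenated list once and wrap it in a single LazyMap.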
=== Zope/lib/python/Products/ZCatalog/Lazy.py 1.7 => 1.7.6.1 ===
--- Zope/lib/python/Products/ZCatalog/Lazy.py:1.7 Wed Aug 14 18:25:15 2002
+++ Zope/lib/python/Products/ZCatalog/Lazy.py Tue Dec 10 10:55:00 2002
@@ -238,3 +238,21 @@
raise IndexError, index
self._eindex=e
return data[i]
+
+class LazyValues(Lazy):
+ """Given a sequence of two tuples typically (key, value) act as
+ though we are just a list of the values lazily"""
+
+ def __init__(self, seq):
+ self._seq = seq
+
+ def __len__(self):
+ return len(self._seq)
+
+ def __getitem__(self, index):
+ return self._seq[index][1]
+
+ def __getslice__(self, start, end):
+ return self.__class__(self._seq[start:end])
+
+ slice = __getslice__
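A small illustrative snippet (the pairs below are made up) showing what
LazyValues does: it exposes only the second element of each tuple, and slicing
stays lazy because __getslice__ wraps the underlying slice in another
LazyValues.

    from Products.ZCatalog.Lazy import LazyValues

    pairs = [('alpha', 11), ('bravo', 22), ('charlie', 33)]
    values = LazyValues(pairs)

    print len(values)       # 3
    print values[1]         # 22
    print list(values[:2])  # [11, 22] -- the slice is itself a LazyValues

In sortResults this is what lets the sorted (sortkey, docid, getter) tuples be
exposed as a plain sequence of docids without building a new list.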