[Zope-Checkins] CVS: Zope/lib/python/Products/ZCatalog - Catalog.py:1.98.6.2 IZCatalog.py:1.4.2.1 Lazy.py:1.7.6.1
Casey Duncan
casey@zope.com
Tue, 10 Dec 2002 10:55:30 -0500
Update of /cvs-repository/Zope/lib/python/Products/ZCatalog
In directory cvs.zope.org:/tmp/cvs-serv31233/lib/python/Products/ZCatalog
Modified Files:
Tag: Zope-2_6-branch
Catalog.py IZCatalog.py Lazy.py
Log Message:
Backport catalog sort speedups from the HEAD sans ZCatalog and PlugInIndex API changes
=== Zope/lib/python/Products/ZCatalog/Catalog.py 1.98.6.1 => 1.98.6.2 ===
--- Zope/lib/python/Products/ZCatalog/Catalog.py:1.98.6.1 Mon Nov 25 15:55:32 2002
+++ Zope/lib/python/Products/ZCatalog/Catalog.py Tue Dec 10 10:55:00 2002
@@ -20,7 +20,7 @@
from Missing import MV
from zLOG import LOG, ERROR
-from Lazy import LazyMap, LazyFilter, LazyCat
+from Lazy import LazyMap, LazyFilter, LazyCat, LazyValues
from CatalogBrains import AbstractCatalogBrain, NoBrainer
from BTrees.IIBTree import intersection, weightedIntersection, IISet
@@ -30,9 +30,10 @@
from Products.PluginIndexes.common.randid import randid
import time, sys, types
+from bisect import bisect
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
- """ An Object Catalog
+ """ An Object Catalog
An Object Catalog maintains a table of object metadata, and a
series of manageable indexes to quickly search for objects
@@ -425,12 +426,17 @@
## This is the Catalog search engine. Most of the heavy lifting happens below
- def _indexedSearch(self, request, sort_index, append, used):
- """
- Iterate through the indexes, applying the query to each one.
- """
- rs = None # resultset
- data = self.data
+ def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
+ """Iterate through the indexes, applying the query to each one. If
+ merge is true then return a lazy result set (sorted if appropriate)
+ otherwise return the raw (possibly scored) results for later merging.
+ Limit is used in conjuntion with sorting or scored results to inform
+ the catalog how many results you are really interested in. The catalog
+ can then use optimizations to save time and memory. The number of
+ results is not guaranteed to fall within the limit however, you should
+ still slice or batch the results as usual."""
+
+ rs = None # resultset
# Indexes fulfill a fairly large contract here. We hand each
# index the request mapping we are given (which may be composed
@@ -449,16 +455,6 @@
# Note that if the indexes find query arguments, but the end result
# is an empty sequence, we do nothing
- # If sort_index is None, this method should pass sequences of
- # catalog records to append(). The sequences will be concatenated
- # together to generate the result set.
- # If sort_index is not None, this method should instead pass pairs
- # to append(), each pair containing a sort key and a sequence of
- # catalog records.
- # In either case, the sequences may be lazified.
-
- if used is None:
- used = {}
for i in self.indexes.keys():
index = self.getIndex(i)
_apply_index = getattr(index, "_apply_index", None)
@@ -468,19 +464,17 @@
if r is not None:
r, u = r
- for name in u:
- used[name] = 1
w, rs = weightedIntersection(rs, r)
-
+
if rs is None:
# None of the indexes found anything to do with the request
# We take this to mean that the query was empty (an empty filter)
# and so we return everything in the catalog
if sort_index is None:
- rs = data.items()
- append(LazyMap(self.instantiate, rs, len(self)))
+ return LazyMap(self.instantiate, self.data.items(), len(self))
else:
- self._build_sorted_results(data, sort_index, append)
+ return self.sortResults(
+ self.data, sort_index, reverse, limit, merge)
elif rs:
# We got some results from the indexes.
# Sort and convert to sequences.
@@ -488,8 +482,11 @@
# having a 'values' means we have a data structure with
# scores. Build a new result set, sort it by score, reverse
# it, compute the normalized score, and Lazify it.
- rset = rs.byValue(0) # sort it by score
- max = float(rset[0][0])
+
+ # For now we cannot return raw scores for later merging :^(
+
+ rs = rs.byValue(0) # sort it by score
+ max = float(rs[0][0])
# Here we define our getter function inline so that
# we can conveniently store the max value as a default arg
@@ -506,52 +503,58 @@
r.data_record_score_ = score
r.data_record_normalized_score_ = int(100. * score / max)
return r
-
- # Lazify the results
- append(LazyMap(getScoredResult, rset))
+
+ return LazyMap(getScoredResult, rs, len(rs))
elif sort_index is None and not hasattr(rs, 'values'):
- # no scores? Just Lazify.
+ # no scores
if hasattr(rs, 'keys'):
rs = rs.keys()
- append(LazyMap(self.__getitem__, rs))
+ return LazyMap(self.__getitem__, rs, len(rs))
else:
# sort. If there are scores, then this block is not
# reached, therefore 'sort-on' does not happen in the
# context of a text index query. This should probably
# sort by relevance first, then the 'sort-on' attribute.
- self._build_sorted_results(rs,sort_index,append)
-
- #print 'ZCatalog search used', used.keys()
- return used
+ return self.sortResults(rs, sort_index, reverse, limit, merge)
+ else:
+ # Empty result set
+ return LazyCat(rs)
- def _build_sorted_results(self,rs,sort_index,append):
- # This function will .append pairs where the first item
- # in the pair is a sort key, and the second item in the
- # pair is a sequence of results which share the same
- # sort key. Later on the list to which these things
- # are .append()ed will be .sort()ed, and the first element
- # of each pair stripped.
+ def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
+ # Sort a result set using a sort index. Return a lazy
+ # result set in sorted order if merge is true; otherwise
+ # return a list of (sortkey, uid, getter_function) tuples.
#
# The two 'for' loops in here contribute a significant
# proportion of the time to perform an indexed search.
# Try to avoid all non-local attribute lookup inside
# those loops.
+ assert limit is None or limit > 0, 'Limit value must be 1 or greater'
_lazymap = LazyMap
_intersection = intersection
_self__getitem__ = self.__getitem__
+ _index_keyForDocument = sort_index.keyForDocument
_None = None
- if (len(rs) > (len(sort_index) * 4)):
+ _keyerror = KeyError
+ result = []
+ append = result.append
+ if hasattr(rs, 'keys'):
+ rs = rs.keys()
+ rlen = len(rs)
+
+ if limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1))):
# The result set is much larger than the sorted index,
# so iterate over the sorted index for speed.
+ # This is rarely exercised in practice...
+
+ length = 0
try:
intersection(rs, IISet(()))
except TypeError:
# rs is not an object in the IIBTree family.
# Try to turn rs into an IISet.
- if hasattr(rs, 'keys'):
- rs = rs.keys()
rs = IISet(rs)
for k, intset in sort_index.items():
@@ -564,31 +567,99 @@
if keys is not _None:
# Is this ever true?
intset = keys()
- append((k, _lazymap(_self__getitem__, intset)))
+ length += len(intset)
+ append((k, intset, _self__getitem__))
# Note that sort keys are unique.
- else:
- # Iterate over the result set.
- if hasattr(rs, 'keys'):
- rs = rs.keys()
- _sort_index_keyForDocument = sort_index.keyForDocument
- _keyerror = KeyError
+
+ if merge:
+ result.sort()
+ if reverse:
+ result.reverse()
+ result = LazyCat(LazyValues(result), length)
+ else:
+ return result
+ elif limit is None or (limit * 4 > rlen):
+ # Iterate over the result set, getting sort keys from the index
for did in rs:
try:
- key = _sort_index_keyForDocument(did)
+ key = _index_keyForDocument(did)
except _keyerror:
- # This document is not in the sort key index.
- # skip it.
+ # This document is not in the sort key index, skip it.
pass
else:
- # We want the sort keys to be unique so that
- # .sort()ing the list does not involve comparing
- # _lazymap objects, which is slow. To ensure
- # uniqueness the first element of each pair is
- # actually a tuple of:
- # (real sort key, some unique number)
- lm = _lazymap(_self__getitem__, [did])
- key = key, id(lm)
- append((key,lm))
+ append((key, did, _self__getitem__))
+ # The reference back to __getitem__ is used in case
+ # we do not merge now and need to intermingle the
+ # results with those of other catalogs while avoiding
+ # the cost of instantiating a LazyMap per result
+ if merge:
+ result.sort()
+ if reverse:
+ result.reverse()
+ if limit is not None:
+ result = result[:limit]
+ result = LazyValues(result)
+ else:
+ return result
+ elif reverse:
+ # Limit/sort results using the N-Best algorithm.
+ # This is faster for large sets than a full sort
+ # and uses far less memory.
+ keys = []
+ n = 0
+ worst = None
+ for did in rs:
+ try:
+ key = _index_keyForDocument(did)
+ except _keyerror:
+ # This document is not in the sort key index, skip it.
+ pass
+ else:
+ if n >= limit and key <= worst:
+ continue
+ i = bisect(keys, key)
+ keys.insert(i, key)
+ result.insert(i, (key, did, _self__getitem__))
+ if n == limit:
+ del keys[0], result[0]
+ else:
+ n += 1
+ worst = keys[0]
+ result.reverse()
+ if merge:
+ result = LazyValues(result)
+ else:
+ return result
+ elif not reverse:
+ # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
+ keys = []
+ n = 0
+ best = None
+ for did in rs:
+ try:
+ key = _index_keyForDocument(did)
+ except _keyerror:
+ # This document is not in the sort key index, skip it.
+ pass
+ else:
+ if n >= limit and key >= best:
+ continue
+ i = bisect(keys, key)
+ keys.insert(i, key)
+ result.insert(i, (key, did, _self__getitem__))
+ if n == limit:
+ del keys[-1], result[-1]
+ else:
+ n += 1
+ best = keys[-1]
+ if merge:
+ result = LazyValues(result)
+ else:
+ return result
+
+ result = LazyMap(self.__getitem__, result, len(result))
+ result.actual_result_count = rlen
+ return result
def _get_sort_attr(self, attr, kw):
"""Helper function to find sort-on or sort-order."""
@@ -626,32 +697,23 @@
else:
return None
-
def searchResults(self, REQUEST=None, used=None, _merge=1, **kw):
+ # The used argument is deprecated and is ignored
if REQUEST is None and not kw:
# Try to acquire request if we get no args for bw compat
REQUEST = getattr(self, 'REQUEST', None)
args = CatalogSearchArgumentsMap(REQUEST, kw)
sort_index = self._getSortIndex(args)
+ sort_limit = self._get_sort_attr('limit', args)
+ reverse = 0
+ if sort_index is not None:
+ order = self._get_sort_attr("order", args)
+ if (isinstance(order, types.StringType) and
+ order.lower() in ('reverse', 'descending')):
+ reverse = 1
# Perform searches with indexes and sort_index
- r = []
- used = self._indexedSearch(args, sort_index, r.append, used)
- if not _merge:
- # Postpone merging and sorting. This provides a way to
- # efficiently sort results merged from multiple queries
- # or multiple catalogs.
- return r
- else:
- has_sort_keys = 0
- reverse = 0
- if sort_index is not None:
- has_sort_keys = 1
- order = self._get_sort_attr("order", args)
- if (isinstance(order, types.StringType) and
- order.lower() in ('reverse', 'descending')):
- reverse = 1
- return mergeResults(r, has_sort_keys, reverse)
-
+ return self.search(args, sort_index, reverse, sort_limit, _merge)
+
__call__ = searchResults
@@ -694,37 +756,28 @@
return 0
else:
return 1
+
-
-def mergeResults(r, has_sort_keys, reverse):
+def mergeResults(results, has_sort_keys, reverse):
"""Sort/merge sub-results, generating a flat sequence.
-
- The contents of r depend on whether has_sort_keys is set.
- If not has_sort_keys, r contains sequences of records.
- If has_sort_keys, r contains pairs of (sort_key, sequence)
- and now we have to sort the results.
+
+ results is a list of result set sequences, either all with sort keys or all without
"""
- if not r:
- return LazyCat(r)
- elif len(r) == 1:
- if not has_sort_keys:
- return r[0]
- else:
- return r[0][1]
+ if not has_sort_keys:
+ return LazyCat(results)
else:
- if not has_sort_keys:
- # Note that the length of the final sequence is not
- # the same as the length of r, since r may contain
- # sequences of different sizes.
- return LazyCat(r)
+ # Concatenate the catalog results into one list and sort it
+ # Each result set is a list of tuples, each with three values:
+ # (sortkey, docid, catalog__getitem__)
+ if len(results) > 1:
+ all = []
+ for r in results:
+ all.extend(r)
+ elif len(results) == 1:
+ all = results[0]
else:
- r.sort()
- if reverse:
- r.reverse()
- size = 0
- tmp = []
- for i in r:
- elt = i[1]
- tmp.append(elt)
- size += len(elt)
- return LazyCat(tmp, size)
+ return []
+ all.sort()
+ if reverse:
+ all.reverse()
+ return LazyMap(lambda rec: rec[2](rec[1]), all, len(all))
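For reference, here is a minimal standalone sketch of the bounded-insertion
("N-Best") selection that the reverse branch of sortResults above performs when
a sort limit is given. The function name and the docids/key_for_document
arguments are illustrative stand-ins for the catalog's result set and the sort
index's keyForDocument; only the selection logic mirrors the committed code.

    from bisect import bisect

    def n_best(docids, key_for_document, limit):
        # Keep only the `limit` largest sort keys seen so far. `keys`
        # mirrors `result` so bisect can find the insertion point without
        # comparing the full (key, docid) tuples.
        keys = []
        result = []
        n = 0
        worst = None
        for did in docids:
            try:
                key = key_for_document(did)
            except KeyError:
                continue  # document has no sort key, skip it
            if n >= limit and key <= worst:
                continue  # cannot displace anything we already kept
            i = bisect(keys, key)
            keys.insert(i, key)
            result.insert(i, (key, did))
            if n == limit:
                del keys[0], result[0]  # drop the current smallest entry
            else:
                n += 1
            worst = keys[0]
        result.reverse()  # largest key first, matching reverse=1
        return result

With, say, 50,000 docids and a limit of 20 this touches each sort key once and
never holds more than limit + 1 entries at a time, which is why it beats a full
sort of the whole result set in both time and memory.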
=== Zope/lib/python/Products/ZCatalog/IZCatalog.py 1.4 => 1.4.2.1 ===
--- Zope/lib/python/Products/ZCatalog/IZCatalog.py:1.4 Thu Sep 5 17:22:39 2002
+++ Zope/lib/python/Products/ZCatalog/IZCatalog.py Tue Dec 10 10:55:00 2002
@@ -161,6 +161,10 @@
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
+
+ sort_limit -- An optimization hint to tell the catalog how many
+ results you are really interested in. See the limit argument
+ to the search method for more details.
There are some rules to consider when querying this method:
@@ -192,5 +196,30 @@
def __call__(REQUEST=None, **kw):
"""Search the catalog, the same way as 'searchResults'.
"""
-
+
+ def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
+ """Programmatic search interface, use for searching the catalog from
+ scripts.
+
+ query_request -- Dictionary containing catalog query. This uses the
+ same format as searchResults.
+
+ sort_index -- Name of sort index
+
+ reverse -- Boolean, reverse sort order (defaults to false)
+
+ limit -- Limit sorted result count to the n best records. This is an
+ optimization hint used in conjunction with a sort_index. If possible
+ ZCatalog will use a different sort algorithm that uses much less memory
+ and scales better than a full sort. The actual number of records
+ returned is not guaranteed to be <= limit. You still need to apply the
+ same batching to the results. Since the len() of the results will no
+ longer be the actual result count, you can use the
+ "actual_result_count" attribute of the lazy result object instead to
+ determine the size of the full result set.
+
+ merge -- Return merged, lazy results (like searchResults) or raw
+ results for later merging. This can be used to perform multiple
+ queries (even across catalogs) and merge and sort the combined results.
+ """
__doc__ = IZCatalog.__doc__ + __doc__
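To make the merge/limit contract above concrete, here is a hedged usage
sketch. It assumes a ZCatalog object exposing the search method documented in
this interface; the catalog names, index name and query are placeholders. Raw
(merge=0) results from two catalogs are combined with mergeResults from
Products.ZCatalog.Catalog.

    from Products.ZCatalog.Catalog import mergeResults

    query = {'meta_type': 'Document'}   # placeholder query

    # merge=0 returns raw (sortkey, docid, getter) tuples instead of a lazy,
    # already-merged result set, so several catalogs can be combined first.
    raw = []
    for catalog in (press_catalog, news_catalog):   # hypothetical catalogs
        raw.append(catalog.search(query, sort_index='modified',
                                  reverse=1, limit=20, merge=0))

    # Sort the combined tuples and lazily instantiate brains as they are read.
    merged = mergeResults(raw, has_sort_keys=1, reverse=1)
    for brain in merged[:20]:
        print brain.getPath()

Passing has_sort_keys=1 tells mergeResults that every tuple carries a sort key,
so it can sort the concatenated list once and wrap it in a single LazyMap.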
=== Zope/lib/python/Products/ZCatalog/Lazy.py 1.7 => 1.7.6.1 ===
--- Zope/lib/python/Products/ZCatalog/Lazy.py:1.7 Wed Aug 14 18:25:15 2002
+++ Zope/lib/python/Products/ZCatalog/Lazy.py Tue Dec 10 10:55:00 2002
@@ -238,3 +238,21 @@
raise IndexError, index
self._eindex=e
return data[i]
+
+class LazyValues(Lazy):
+ """Given a sequence of two tuples typically (key, value) act as
+ though we are just a list of the values lazily"""
+
+ def __init__(self, seq):
+ self._seq = seq
+
+ def __len__(self):
+ return len(self._seq)
+
+ def __getitem__(self, index):
+ return self._seq[index][1]
+
+ def __getslice__(self, start, end):
+ return self.__class__(self._seq[start:end])
+
+ slice = __getslice__
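A small illustrative snippet (the pairs below are made up) showing what
LazyValues does: it exposes only the second element of each tuple, and slicing
stays lazy because __getslice__ wraps the underlying slice in another
LazyValues.

    from Products.ZCatalog.Lazy import LazyValues

    pairs = [('alpha', 11), ('bravo', 22), ('charlie', 33)]
    values = LazyValues(pairs)

    print len(values)       # 3
    print values[1]         # 22
    print list(values[:2])  # [11, 22] -- the slice is itself a LazyValues

In sortResults this is what lets the sorted (sortkey, docid, getter) tuples be
exposed as a plain sequence of docids without building a new list.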