Commit 2167f3f6 authored by Casey Duncan's avatar Casey Duncan

Major refactor of the catalog search engine centered around optimizing sort by...

Major refactor of the catalog search engine centered around optimizing sort by index operations. The resulting code greatly outperforms the previous version and uses less memory.

Exposed a new ZCatalog method "search" which has a better interface for programmatic searches. Updated documentation as well.

Implemented a sort limit option which allows you to inform the catalog that you are only interested in a certain number of results. In the common case this allows the ZCatalog machinery to use a different sorting algorithm (N-Best) which scales much better than a full sort.

Also more tightly integrated the merge option which allows you to tell the catalog that you would like raw and unsorted intermediate results returned rather than sorted and lazified results. This can be used to efficiently merge search results across multiple catalogs.
parent 0f4c1093
......@@ -20,7 +20,7 @@ import Record
from Missing import MV
from zLOG import LOG, ERROR
from Lazy import LazyMap, LazyFilter, LazyCat
from Lazy import LazyMap, LazyFilter, LazyCat, LazyValues
from CatalogBrains import AbstractCatalogBrain, NoBrainer
from BTrees.IIBTree import intersection, weightedIntersection, IISet
......@@ -30,6 +30,7 @@ import BTrees.Length
from Products.PluginIndexes.common.randid import randid
import time, sys, types
from bisect import bisect
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
""" An Object Catalog
......@@ -425,12 +426,18 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
## This is the Catalog search engine. Most of the heavy lifting happens below
def _indexedSearch(self, request, sort_index, append, used):
"""
Iterate through the indexes, applying the query to each one.
"""
def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
"""Iterate through the indexes, applying the query to each one. If
merge is true then return a lazy result set (sorted if appropriate)
otherwise return the raw (possibly scored) results for later merging.
Limit is used in conjunction with sorting or scored results to inform
the catalog how many results you are really interested in. The catalog
can then use optimizations to save time and memory. The number of
results is not guaranteed to fall within the limit, however; you should
still slice or batch the results as usual."""
rs = None # resultset
data = self.data
result_count = 0
# Indexes fulfill a fairly large contract here. We hand each
# index the request mapping we are given (which may be composed
......@@ -449,16 +456,6 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
# Note that if the indexes find query arguments, but the end result
# is an empty sequence, we do nothing
# If sort_index is None, this method should pass sequences of
# catalog records to append(). The sequences will be concatenated
# together to generate the result set.
# If sort_index is not None, this method should instead pass pairs
# to append(), each pair containing a sort key and a sequence of
# catalog records.
# In either case, the sequences may be lazified.
if used is None:
used = {}
for i in self.indexes.keys():
index = self.getIndex(i)
_apply_index = getattr(index, "_apply_index", None)
......@@ -468,8 +465,6 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
if r is not None:
r, u = r
for name in u:
used[name] = 1
w, rs = weightedIntersection(rs, r)
if rs is None:
......@@ -477,19 +472,28 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
# We take this to mean that the query was empty (an empty filter)
# and so we return everything in the catalog
if sort_index is None:
rs = data.items()
append(LazyMap(self.instantiate, rs, len(self)))
if merge:
return LazyMap(
self.instantiate, self.data.items(), len(self))
else:
return self.data.keys()
else:
self._build_sorted_results(data, sort_index, append)
result_count = len(self)
rs = self.sortResults(
self.data, sort_index, reverse, limit, merge)
elif rs:
result_count = len(rs)
# We got some results from the indexes.
# Sort and convert to sequences.
if sort_index is None and hasattr(rs, 'values'):
# having a 'values' means we have a data structure with
# scores. Build a new result set, sort it by score, reverse
# it, compute the normalized score, and Lazify it.
rset = rs.byValue(0) # sort it by score
max = float(rset[0][0])
if not merge:
return rs
rs = rs.byValue(0) # sort it by score
max = float(rs[0][0])
# Here we define our getter function inline so that
# we can conveniently store the max value as a default arg
......@@ -507,31 +511,31 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
r.data_record_normalized_score_ = int(100. * score / max)
return r
# Lazify the results
append(LazyMap(getScoredResult, rset))
return LazyMap(getScoredResult, rs, len(rs))
elif sort_index is None and not hasattr(rs, 'values'):
# no scores? Just Lazify.
# no scores
if hasattr(rs, 'keys'):
rs = rs.keys()
append(LazyMap(self.__getitem__, rs))
else:
# sort. If there are scores, then this block is not
# reached, therefore 'sort-on' does not happen in the
# context of a text index query. This should probably
# sort by relevance first, then the 'sort-on' attribute.
self._build_sorted_results(rs,sort_index,append)
#print 'ZCatalog search used', used.keys()
return used
def _build_sorted_results(self,rs,sort_index,append):
# This function will .append pairs where the first item
# in the pair is a sort key, and the second item in the
# pair is a sequence of results which share the same
# sort key. Later on the list to which these things
# are .append()ed will be .sort()ed, and the first element
# of each pair stripped.
rs = self.sortResults(
rs, sort_index, reverse, limit, merge)
if merge:
rs = LazyMap(self.__getitem__, rs, len(rs))
# Give the application an opportunity to see how many results
# there really were, regardless of the limit value
rs.actual_result_count = result_count
return rs
def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
# Sort a result set using a sort index. Return a lazy
# result set in sorted order if merge is true otherwise
# returns a list of (sortkey, results) tuples
#
# The two 'for' loops in here contribute a significant
# proportion of the time to perform an indexed search.
......@@ -540,10 +544,17 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
_lazymap = LazyMap
_intersection = intersection
_self__getitem__ = self.__getitem__
index_key_map = sort_index.documentToKeyMap()
_None = None
result = []
append = result.append
if (len(rs) > (len(sort_index) * 4)):
# The result set is much larger than the sorted index,
# so iterate over the sorted index for speed.
# This is almost never exercised in practice...
length = 0
try:
intersection(rs, IISet(()))
......@@ -564,31 +575,72 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
if keys is not _None:
# Is this ever true?
intset = keys()
append((k, _lazymap(_self__getitem__, intset)))
length += len(intset)
append((k, intset, _self__getitem__))
# Note that sort keys are unique.
if merge:
result.sort()
if reverse:
result.reverse()
return LazyCat(LazyValues(result), length)
else:
# Iterate over the result set.
return result
else:
# Iterate over the result set getting sort keys from the index
if hasattr(rs, 'keys'):
rs = rs.keys()
_sort_index_keyForDocument = sort_index.keyForDocument
_keyerror = KeyError
if limit is None:
for did in rs:
try:
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
pass
else:
append((key, did, _self__getitem__))
# The reference back to __getitem__ is used in case
# we do not merge now and need to intermingle the
# results with those of other catalogs while avoiding
# the cost of instantiating a LazyMap per result
if merge:
result.sort()
if reverse:
result.reverse()
return LazyValues(result)
else:
return result
else:
# Limit/sort results using N-Best algorithm
# This is faster for large sets than a full sort
# And uses far less memory
keys = []
n = 0
for did in rs:
try:
key = _sort_index_keyForDocument(did)
key = index_key_map[did]
except _keyerror:
# This document is not in the sort key index.
# skip it.
pass
else:
# We want the sort keys to be unique so that
# .sort()ing the list does not involve comparing
# _lazymap objects, which is slow. To ensure
# uniqueness the first element of each pair is
# actually a tuple of:
# (real sort key, some unique number)
lm = _lazymap(_self__getitem__, [did])
key = key, id(lm)
append((key,lm))
if n >= limit and key <= keys[0]:
continue
i = bisect(keys, key)
keys.insert(i, key)
result.insert(i, (key, did, _self__getitem__))
if n == limit:
del keys[0], result[0]
else:
n += 1
result.reverse()
if merge:
return LazyValues(result)
else:
return result
def _get_sort_attr(self, attr, kw):
"""Helper function to find sort-on or sort-order."""
......@@ -626,31 +678,22 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
else:
return None
def searchResults(self, REQUEST=None, used=None, _merge=1, **kw):
# The used argument is deprecated and is ignored
if REQUEST is None and not kw:
# Try to acquire request if we get no args for bw compat
REQUEST = getattr(self, 'REQUEST', None)
args = CatalogSearchArgumentsMap(REQUEST, kw)
sort_index = self._getSortIndex(args)
# Perform searches with indexes and sort_index
r = []
used = self._indexedSearch(args, sort_index, r.append, used)
if not _merge:
# Postpone merging and sorting. This provides a way to
# efficiently sort results merged from multiple queries
# or multiple catalogs.
return r
else:
has_sort_keys = 0
sort_limit = self._get_sort_attr('limit', args)
reverse = 0
if sort_index is not None:
has_sort_keys = 1
order = self._get_sort_attr("order", args)
if (isinstance(order, types.StringType) and
order.lower() in ('reverse', 'descending')):
reverse = 1
return mergeResults(r, has_sort_keys, reverse)
# Perform searches with indexes and sort_index
return self.search(args, sort_index, reverse, sort_limit, _merge)
__call__ = searchResults
......@@ -696,35 +739,33 @@ class CatalogSearchArgumentsMap:
return 1
def mergeResults(r, has_sort_keys, reverse):
def mergeResults(catalogs_results, has_sort_keys, reverse):
"""Sort/merge sub-results, generating a flat sequence.
catalog_results is a sequence of (catalog, r) tuples. This allows you to
merge and sort results efficiently from multiple catalogs.
The contents of r depend on whether has_sort_keys is set.
If not has_sort_keys, r contains sequences of records.
If has_sort_keys, r contains pairs of (sort_key, sequence)
and now we have to sort the results.
"""
if not r:
return LazyCat(r)
elif len(r) == 1:
if not has_sort_keys:
return r[0]
else:
return r[0][1]
else:
result = []
if not has_sort_keys:
# Note that the length of the final sequence is not
# the same as the length of r, since r may contain
# sequences of different sizes.
return LazyCat(r)
for catalog, r in catalogs_results:
if not r:
continue
result.append(LazyMap(catalog.__getitem__, r))
return LazyCat(result)
else:
r.sort()
# Concatenate the catalog results into one list and sort it
# Each result item consists of a list of tuples with three values:
# (sortkey, docid, catalog__getitem__)
for catalog, r in catalogs_results:
result.extend(r)
result.sort()
if reverse:
r.reverse()
size = 0
tmp = []
for i in r:
elt = i[1]
tmp.append(elt)
size += len(elt)
return LazyCat(tmp, size)
result.reverse()
return LazyMap(lambda rec: rec[2](rec[1]), result, len(result))
......@@ -12,7 +12,7 @@
#
##############################################################################
"""
$Id: IZCatalog.py,v 1.4 2002/09/05 21:22:39 shane Exp $
$Id: IZCatalog.py,v 1.5 2002/12/05 21:17:05 caseman Exp $
"""
from Interface import Interface
......@@ -162,6 +162,10 @@ class IZCatalog(Interface):
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
sort_limit -- An optimization hint to tell the catalog how many
results you are really interested in. See the limit argument
to the search method for more details.
There are some rules to consider when querying this method:
- an empty query mapping (or a bogus REQUEST) returns all
......@@ -193,4 +197,29 @@ class IZCatalog(Interface):
"""Search the catalog, the same way as 'searchResults'.
"""
def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
    """Programmatic search interface, use for searching the catalog from
    scripts.

    query_request -- Dictionary containing catalog query. This uses the
    same format as searchResults.

    sort_index -- Name of sort index

    reverse -- Boolean, reverse sort order (defaults to false)

    limit -- Limit sorted result count to the n best records. This is an
    optimization hint used in conjunction with a sort_index. If possible
    ZCatalog will use a different sort algorithm that uses much less memory
    and scales better than a full sort. The actual number of records
    returned is not guaranteed to be <= limit. You still need to apply the
    same batching to the results. Since the len() of the results will no
    longer be the actual result count, you can use the
    "actual_result_count" attribute of the lazy result object instead to
    determine the size of the full result set.

    merge -- Return merged, lazy results (like searchResults) or raw
    results for later merging. This can be used to perform multiple
    queries (even across catalogs) and merge and sort the combined results.
    """
__doc__ = IZCatalog.__doc__ + __doc__
......@@ -10,8 +10,8 @@
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__='''$Id: Lazy.py,v 1.7 2002/08/14 22:25:15 mj Exp $'''
__version__='$Revision: 1.7 $'[11:-2]
__doc__='''$Id: Lazy.py,v 1.8 2002/12/05 21:17:05 caseman Exp $'''
__version__='$Revision: 1.8 $'[11:-2]
class Lazy:
......@@ -238,3 +238,21 @@ class LazyMop(Lazy):
raise IndexError, index
self._eindex=e
return data[i]
class LazyValues(Lazy):
    """Lazy view over a sequence of two-tuples, exposing only the values.

    Given a sequence of (key, value) pairs, act as though we are just a
    lazy list of the values; the keys are never materialized for callers.
    """

    def __init__(self, seq):
        # Hold a reference to the underlying pair sequence; pairs are
        # unpacked lazily, one at a time, on item access.
        self._seq = seq

    def __len__(self):
        return len(self._seq)

    def __getitem__(self, index):
        # Return only the value half of the stored pair.
        pair = self._seq[index]
        return pair[1]

    def __getslice__(self, start, end):
        # Slicing produces another lazy view rather than a real list.
        return self.__class__(self._seq[start:end])

    slice = __getslice__
......@@ -137,7 +137,7 @@ class ZCatalog(Folder, Persistent, Implicit):
['searchResults', '__call__', 'uniqueValuesFor',
'getpath', 'schema', 'indexes', 'index_objects',
'all_meta_types', 'valid_roles', 'resolve_url',
'getobject'],
'getobject', 'search'],
['Anonymous', 'Manager']),
(manage_zcatalog_indexes,
......@@ -610,16 +610,35 @@ class ZCatalog(Folder, Persistent, Implicit):
return r
def searchResults(self, REQUEST=None, used=None, **kw):
"""Search the catalog according to the ZTables search interface.
"""Search the catalog
Search terms can be passed in the REQUEST or as keyword
arguments.
The used argument is now deprecated and ignored
"""
return self._catalog.searchResults(REQUEST, used, **kw)
__call__=searchResults
def search(
    self, query_request, sort_index=None, reverse=0, limit=None, merge=1):
    """Programmatic search interface, for searching the catalog from scripts.

    query_request -- mapping containing the catalog query
    sort_index -- name of the index to sort on; resolved here to the
        index object expected by the underlying catalog
    reverse -- true to reverse the sort order
    limit -- optimization hint: how many sorted results are wanted
    merge -- when true, return merged lazy results (like searchResults);
        when false, return raw results for later merging
    """
    # Resolve the index name to the actual index object, since the
    # low-level Catalog.search expects the object rather than the name.
    index = sort_index
    if index is not None:
        index = self._catalog.indexes[index]
    return self._catalog.search(query_request, index, reverse, limit, merge)
## this stuff is so the find machinery works
meta_types=() # Sub-object types that are specific to this object
......
......@@ -179,6 +179,10 @@ class ZCatalog:
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
sort_limit -- An optimization hint to tell the catalog how many
results you are really interested in. See the limit argument
to the search method for more details.
There are some rules to consider when querying this method:
- an empty query mapping (or a bogus REQUEST) returns all
......@@ -211,3 +215,26 @@ class ZCatalog:
"""
Search the catalog, the same way as 'searchResults'.
"""
def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
    """Programmatic search interface, use for searching the catalog from
    scripts.

    query_request -- Dictionary containing catalog query. This uses the
    same format as searchResults.

    sort_index -- Name of sort index

    reverse -- Boolean, reverse sort order (defaults to false)

    limit -- Limit sorted result count to the n best records. This is an
    optimization hint used in conjunction with a sort_index. If possible
    ZCatalog will use a different sort algorithm that uses much less memory
    and scales better than a full sort. The actual number of records
    returned is not guaranteed to be <= limit. You still need to apply the
    same batching to the results.

    merge -- Return merged, lazy results (like searchResults) or raw
    results for later merging. This can be used to perform multiple
    queries (even across catalogs) and merge and sort the combined results.
    """
......@@ -158,6 +158,13 @@ class TestZCatalog(unittest.TestCase):
data = self._catalog.getIndexDataForUID(testNum)
assert data['title'][0] == testNum
def testSearch(self):
query = {'title': ['5','6','7']}
sr = self._catalog.searchResults(query)
self.assertEqual(len(sr), 3)
sr = self._catalog.search(query)
self.assertEqual(len(sr), 3)
class TestCatalogObject(unittest.TestCase):
def setUp(self):
self._vocabulary = Vocabulary.Vocabulary('Vocabulary','Vocabulary',
......@@ -347,6 +354,11 @@ class TestCatalogObject(unittest.TestCase):
a = self._catalog(sort_on='att1')
self.assertEqual(len(a), self.upper)
def testSortLimit(self):
a = self._catalog(sort_on='num', sort_limit=10)
self.assertEqual(a[0].num, self.upper - 1)
self.assertEqual(a.actual_result_count, self.upper)
class objRS(ExtensionClass.Base):
def __init__(self,num):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment