fixes #6086 Merge 6086 to main. We now read in basement nodes if a full...

fixes #6086 Merge 6086 to main. We now read in basement nodes if a full keyrange resides in it so that we can answer more accurately. git-svn-id: file:///svn/toku/tokudb@54342 c7de825b-a66e-492c-adef-691d508d4ae1

fixes #6086 Merge 6086 to main. We now read in basement nodes if a full...
fixes #6086 Merge 6086 to main. We now read in basement nodes if a full keyrange resides in it so that we can answer more accurately. git-svn-id: file:///svn/toku/tokudb@54342 c7de825b-a66e-492c-adef-691d508d4ae1
1164029f · Yoni Fogel · 76fe5767 · 1164029f · 1164029f · 1164029f
Commit 1164029f authored Mar 13, 2013 by Yoni Fogel
9 changed files
--- a/buildheader/make_tdb.cc
+++ b/buildheader/make_tdb.cc
@@ -420,7 +420,9 @@ static void print_db_struct (void) {
    STRUCT_SETUP(DB, set_pagesize,   "int (*%s) (DB *, uint32_t)");
    STRUCT_SETUP(DB, stat,           "int (*%s) (DB *, void *, uint32_t)");
    STRUCT_SETUP(DB, verify,         "int (*%s) (DB *, const char *, const char *, FILE *, uint32_t)");
-    const char *extra[]={"int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)",
+    const char *extra[]={
+                         "int (*key_range64)(DB*, DB_TXN *, DBT *, uint64_t *less, uint64_t *equal, uint64_t *greater, int *is_exact)",
+                         "int (*keys_range64)(DB*, DB_TXN *, DBT *keyleft, DBT *keyright, uint64_t *less, uint64_t *left, uint64_t *between, uint64_t *right, uint64_t *greater, bool *middle_3_exact)",
 			 "int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)",
 			 "int (*pre_acquire_table_lock)(DB*, DB_TXN*)",
 			 "int (*pre_acquire_fileops_lock)(DB*, DB_TXN*)",

--- a/ft/ft-internal.h
+++ b/ft/ft-internal.h
@@ -51,9 +51,24 @@ enum ftnode_fetch_type {
    ftnode_fetch_none=1, // no partitions needed.  
    ftnode_fetch_subset, // some subset of partitions needed
    ftnode_fetch_prefetch, // this is part of a prefetch call
-    ftnode_fetch_all // every partition is needed
+    ftnode_fetch_all, // every partition is needed
+    ftnode_fetch_keymatch, // one child is needed if it holds both keys
 };

+static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) UU();  // NOTE(review): UU() presumably marks this helper as possibly unused -- confirm
+static bool is_valid_ftnode_fetch_type(enum ftnode_fetch_type type) {  // Returns true iff TYPE is one of the fetch types listed below.
+    switch (type) {
+        case ftnode_fetch_none:
+        case ftnode_fetch_subset:
+        case ftnode_fetch_prefetch:
+        case ftnode_fetch_all:
+        case ftnode_fetch_keymatch:  // a fetch type added to the enum must also be listed here to count as valid
+            return true;
+        default:  // any value not listed above is rejected
+            return false;
+    }
+}
+
 //
 // An extra parameter passed to cachetable functions 
 // That is used in all types of fetch callbacks.
@@ -730,6 +745,46 @@ static inline void fill_bfe_for_full_read(struct ftnode_fetch_extra *bfe, FT h)
    bfe->decompress_time = 0;
 }

+//
+// Helper function to fill a ftnode_fetch_extra for a "keymatch" read: the fetch
+// callbacks pick a single child of a leaf node only when LEFT and RIGHT both map
+// to that child, e.g. for keysrange when both keys are in the same basement node.
+// A null LEFT means -infinity; a null RIGHT means +infinity. Non-null keys are
+// stored via toku_copyref_dbt -- presumably by reference, so they must outlive bfe (confirm).
+//
+static inline void fill_bfe_for_keymatch(
+    struct ftnode_fetch_extra *bfe,
+    FT h,
+    DBT *left,
+    DBT *right,
+    bool disable_prefetching,
+    bool read_all_partitions
+    )
+{
+    paranoid_invariant(h->h->type == FT_CURRENT);
+    bfe->type = ftnode_fetch_keymatch;
+    bfe->h = h;
+    bfe->search = nullptr;  // no search object is attached to a keymatch fetch
+    toku_init_dbt(&bfe->range_lock_left_key);
+    toku_init_dbt(&bfe->range_lock_right_key);
+    if (left) {
+        toku_copyref_dbt(&bfe->range_lock_left_key, *left);
+    }
+
+    if (right) {
+        toku_copyref_dbt(&bfe->range_lock_right_key, *right);
+    }
+    bfe->left_is_neg_infty = left == nullptr;  // a missing left key means the range starts at -infinity
+    bfe->right_is_pos_infty = right == nullptr;  // a missing right key means the range ends at +infinity
+    bfe->child_to_read = -1;  // no child chosen yet; the fetch callbacks set it when both keys map to one child of a leaf
+    bfe->disable_prefetching = disable_prefetching;
+    bfe->read_all_partitions = read_all_partitions;
+    bfe->bytes_read = 0;  // the read statistics below all start at zero
+    bfe->io_time = 0;
+    bfe->deserialize_time = 0;
+    bfe->decompress_time = 0;
+}
+
 //
 // Helper function to fill a ftnode_fetch_extra with data
 // that will tell the fetch callback that some subset of the node

--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
@@ -596,20 +596,15 @@ next_dict_id(void) {
 bool
 toku_bfe_wants_child_available (struct ftnode_fetch_extra* bfe, int childnum)
 {
-    if (bfe->type == ftnode_fetch_all ||
-        (bfe->type == ftnode_fetch_subset && bfe->child_to_read == childnum))
-    {
-        return true;
-    }
-    else {
-        return false;
-    }
+    return bfe->type == ftnode_fetch_all ||  // true when BFE needs child CHILDNUM: a full fetch wants every child,
+        (bfe->child_to_read == childnum &&  // otherwise only the one child selected in child_to_read is wanted,
+         (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_keymatch));  // and only for subset or keymatch fetches
 }

 int
 toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
 {
-    paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch);
+    paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch || bfe->type == ftnode_fetch_keymatch);
    if (bfe->left_is_neg_infty) {
        return 0;
    } else if (bfe->range_lock_left_key.data == nullptr) {
@@ -622,7 +617,7 @@ toku_bfe_leftmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
 int
 toku_bfe_rightmost_child_wanted(struct ftnode_fetch_extra *bfe, FTNODE node)
 {
-    paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch);
+    paranoid_invariant(bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_prefetch || bfe->type == ftnode_fetch_keymatch);
    if (bfe->right_is_pos_infty) {
        return node->n_children - 1;
    } else if (bfe->range_lock_right_key.data == nullptr) {
@@ -875,7 +870,7 @@ toku_ft_status_update_pivot_fetch_reason(struct ftnode_fetch_extra *bfe)
        STATUS_INC(FT_NUM_PIVOTS_FETCHED_WRITE, 1);
        STATUS_INC(FT_BYTES_PIVOTS_FETCHED_WRITE, bfe->bytes_read);
        STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_WRITE, bfe->io_time);
-    } else if (bfe->type == ftnode_fetch_subset) {
+    } else if (bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_keymatch) {
        STATUS_INC(FT_NUM_PIVOTS_FETCHED_QUERY, 1);
        STATUS_INC(FT_BYTES_PIVOTS_FETCHED_QUERY, bfe->bytes_read);
        STATUS_INC(FT_TOKUTIME_PIVOTS_FETCHED_QUERY, bfe->io_time);
@@ -1146,8 +1141,24 @@ bool toku_ftnode_pf_req_callback(void* ftnode_pv, void* read_extraargs) {
                retval = true;
            }
        }
-    }
-    else {
+    } else if (bfe->type == ftnode_fetch_keymatch) {
+        // we do not take into account prefetching yet
+        // as of now, if we need a subset, the only thing
+        // we can possibly require is a single basement node
+        // we find out what basement node the query cares about
+        // and check if it is available
+        paranoid_invariant(bfe->h->compare_fun);
+        if (node->height == 0) {
+            int left_child = toku_bfe_leftmost_child_wanted(bfe, node);
+            int right_child = toku_bfe_rightmost_child_wanted(bfe, node);
+            if (left_child == right_child) {
+                bfe->child_to_read = left_child;
+                unsafe_touch_clock(node,bfe->child_to_read);
+                // child we want to read is not available, must set retval to true
+                retval = (BP_STATE(node, bfe->child_to_read) != PT_AVAIL);
+            }
+        }
+    } else {
        // we have a bug. The type should be known
        abort();
    }
@@ -1264,7 +1275,7 @@ int toku_ftnode_pf_callback(void* ftnode_pv, void* disk_data, void* read_extraar
    struct ftnode_fetch_extra *bfe = (struct ftnode_fetch_extra *) read_extraargs;
    // there must be a reason this is being called. If we get a garbage type or the type is ftnode_fetch_none,
    // then something went wrong
-    assert((bfe->type == ftnode_fetch_subset) || (bfe->type == ftnode_fetch_all) || (bfe->type == ftnode_fetch_prefetch));
+    assert((bfe->type == ftnode_fetch_subset) || (bfe->type == ftnode_fetch_all) || (bfe->type == ftnode_fetch_prefetch) || (bfe->type == ftnode_fetch_keymatch));
    // determine the range to prefetch
    int lc, rc;
    if (!bfe->disable_prefetching &&
@@ -5583,63 +5594,99 @@ keyrange_compare (OMTVALUE lev, void *extra) {
 }

 static void
-keyrange_in_leaf_partition (FT_HANDLE brt, FTNODE node, DBT *key, int child_number, uint64_t estimated_num_rows,
-                            uint64_t *less, uint64_t *equal, uint64_t *greater)
+keysrange_in_leaf_partition (FT_HANDLE brt, FTNODE node,
+                             DBT* key_left, DBT* key_right,
+                             int left_child_number, int right_child_number, uint64_t estimated_num_rows,
+                             uint64_t *less, uint64_t* equal_left, uint64_t* middle,
+                             uint64_t* equal_right, uint64_t* greater, bool* single_basement_node)
 // If the partition is in main memory then estimate the number
-// If KEY==NULL then use an arbitrary key (leftmost or zero)
+// Treat key_left == NULL as negative infinity (key_right may be non-NULL only when key_left is non-NULL).
+// Treat key_right == NULL as positive infinity. On return *less, *equal_left, *middle, *equal_right and *greater hold this partition's counts (exact if it is in memory, otherwise derived from estimated_num_rows) and *single_basement_node tells whether both child numbers are equal.
 {
    paranoid_invariant(node->height == 0); // we are in a leaf
-    if (BP_STATE(node, child_number) == PT_AVAIL) {
-        // If the partition is in main memory then get an exact count.
-        struct keyrange_compare_s s = {brt,key};
-        BASEMENTNODE bn = BLB(node, child_number);
+    paranoid_invariant(!(key_left == NULL && key_right != NULL));
+    paranoid_invariant(left_child_number <= right_child_number);
+    bool single_basement = left_child_number == right_child_number;  // both keys map to the same basement node
+    paranoid_invariant(!single_basement || (BP_STATE(node, left_child_number) == PT_AVAIL));
+    if (BP_STATE(node, left_child_number) == PT_AVAIL) {
+        int r;
+        // The partition is in main memory, so get an exact count.
+        struct keyrange_compare_s s_left = {brt, key_left};
+        BASEMENTNODE bn = BLB(node, left_child_number);
         OMTVALUE datav;
-        uint32_t idx = 0;
-        // if key is NULL then set r==-1 and idx==0.
-        int r = key ? toku_omt_find_zero(bn->buffer, keyrange_compare, &s, &datav, &idx) : -1;
-        if (r==0) {
-            *less    = idx;
-            *equal   = 1;
-            *greater = toku_omt_size(bn->buffer)-idx-1;
-        } else {
-            // If not found, then the idx says where it's between.
-            *less    = idx;
-            *equal   = 0;
-            *greater = toku_omt_size(bn->buffer)-idx;
-        }
+        uint32_t idx_left = 0;
+        // if key_left is NULL then set r==-1 and idx_left==0.
+        r = key_left ? toku_omt_find_zero(bn->buffer, keyrange_compare, &s_left, &datav, &idx_left) : -1;
+        *less = idx_left;
+        *equal_left = (r==0) ? 1 : 0;
+
+        uint32_t size = toku_omt_size(bn->buffer);
+        // NOTE: size is the exact row count of this basement node; estimated_num_rows is deliberately unused on this path.
+        uint32_t idx_right = size;  // default: no right key searched here, so everything after key_left counts as middle
+        r = -1;
+        if (single_basement && key_right) {
+            struct keyrange_compare_s s_right = {brt, key_right};
+            r = toku_omt_find_zero(bn->buffer, keyrange_compare, &s_right, &datav, &idx_right);
+        }
+        *middle = idx_right - idx_left - *equal_left;
+        *equal_right = (r==0) ? 1 : 0;
+        *greater = size - idx_right - *equal_right;
     } else {
-        *less    = estimated_num_rows / 2;
-        *equal   = 0;
-        *greater = *less;
+        paranoid_invariant(!single_basement);
+        uint64_t idx_left = estimated_num_rows / 2;  // keep 64 bits: the estimate is a uint64_t and must not be truncated
+        if (!key_left) {
+            //Both nullptr, assume key_left belongs before leftmost entry, key_right belongs after rightmost entry
+            idx_left = 0;
+            paranoid_invariant(!key_right);
+        }
+        // Assume idx_left points to where key_left belongs (but key_left is not there); every row from there on is counted as middle.
+        *less = idx_left;
+        *equal_left = 0;
+        *middle = estimated_num_rows - idx_left;
+        *equal_right = 0;
+        *greater = 0;
     }
+    *single_basement_node = single_basement;
 }

 static int
-toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
-                            DBT *key, uint64_t *less, uint64_t *equal, uint64_t *greater,
+toku_ft_keysrange_internal (FT_HANDLE brt, FTNODE node,
+                            DBT* key_left, DBT* key_right, bool may_find_right,
+                            uint64_t* less, uint64_t* equal_left, uint64_t* middle,
+                            uint64_t* equal_right, uint64_t* greater, bool* single_basement_node,
                            uint64_t estimated_num_rows,
-                            struct ftnode_fetch_extra *bfe, // set up to read a minimal read.
+                            struct ftnode_fetch_extra *min_bfe, // set up to read a minimal read.
+                            struct ftnode_fetch_extra *match_bfe, // set up to read a basement node iff both keys in it
                            struct unlockers *unlockers, ANCESTORS ancestors, struct pivot_bounds const * const bounds)
 // Implementation note: Assign values to less, equal, and greater, and then on the way out (returning up the stack) we add more values in.
 {
    int r = 0;
    // if KEY is NULL then use the leftmost key.
-    int child_number = key ? toku_ftnode_which_child (node, key, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : 0;
+    int left_child_number = key_left ? toku_ftnode_which_child (node, key_left, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : 0;
+    int right_child_number = node->n_children;  // Sentinel that does not equal left_child_number.
+    if (may_find_right) {
+        right_child_number = key_right ? toku_ftnode_which_child (node, key_right, &brt->ft->cmp_descriptor, brt->ft->compare_fun) : node->n_children - 1;
+    }
+
    uint64_t rows_per_child = estimated_num_rows / node->n_children;
    if (node->height == 0) {
+        keysrange_in_leaf_partition(brt, node, key_left, key_right, left_child_number, right_child_number,
+                                    rows_per_child, less, equal_left, middle, equal_right, greater, single_basement_node);

-        keyrange_in_leaf_partition(brt, node, key, child_number, rows_per_child, less, equal, greater);
-
-        *less    += rows_per_child * child_number;
-        *greater += rows_per_child * (node->n_children - child_number - 1);
-
+        *less    += rows_per_child * left_child_number;
+        if (*single_basement_node) {
+            *greater += rows_per_child * (node->n_children - left_child_number - 1);
+        } else {
+            *middle += rows_per_child * (node->n_children - left_child_number - 1);
+        }
    } else {
        // do the child.
-        struct ancestors next_ancestors = {node, child_number, ancestors};
-        BLOCKNUM childblocknum = BP_BLOCKNUM(node, child_number);
-        uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, child_number);
+        struct ancestors next_ancestors = {node, left_child_number, ancestors};
+        BLOCKNUM childblocknum = BP_BLOCKNUM(node, left_child_number);
+        uint32_t fullhash = compute_child_fullhash(brt->ft->cf, node, left_child_number);
        FTNODE childnode;
        bool msgs_applied = false;
+        bool child_may_find_right = may_find_right && left_child_number == right_child_number;
        r = toku_pin_ftnode_batched(
            brt,
            childblocknum,
@@ -5647,7 +5694,7 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
            unlockers,
            &next_ancestors,
            bounds,
-            bfe,
+            child_may_find_right ? match_bfe : min_bfe,
            PL_READ, // may_modify_node is false, because node guaranteed to not change
            false,
            &childnode,
@@ -5659,15 +5706,20 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,

            struct unlock_ftnode_extra unlock_extra   = {brt,childnode,false};
            struct unlockers next_unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, unlockers};
-            const struct pivot_bounds next_bounds = next_pivot_keys(node, child_number, bounds);
+            const struct pivot_bounds next_bounds = next_pivot_keys(node, left_child_number, bounds);

-            r = toku_ft_keyrange_internal(brt, childnode, key, less, equal, greater, rows_per_child,
-                                           bfe, &next_unlockers, &next_ancestors, &next_bounds);
+            r = toku_ft_keysrange_internal(brt, childnode, key_left, key_right, child_may_find_right,
+                                           less, equal_left, middle, equal_right, greater, single_basement_node,
+                                           rows_per_child, min_bfe, match_bfe, &next_unlockers, &next_ancestors, &next_bounds);
            if (r != TOKUDB_TRY_AGAIN) {
                assert_zero(r);

-                *less    += rows_per_child * child_number;
-                *greater += rows_per_child * (node->n_children - child_number - 1);
+                *less    += rows_per_child * left_child_number;
+                if (*single_basement_node) {
+                    *greater += rows_per_child * (node->n_children - left_child_number - 1);
+                } else {
+                    *middle += rows_per_child * (node->n_children - left_child_number - 1);
+                }

                assert(unlockers->locked);
                toku_unpin_ftnode_read_only(brt->ft, childnode);
@@ -5677,20 +5729,39 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
    return r;
 }

-void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal_p, uint64_t *greater_p)
-// Effect: Return an estimate  of the number of keys to the left, the number equal, and the number to the right of the key.
+void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p)
+// Effect: Return an estimate  of the number of keys to the left, the number equal (to left key), number between keys, number equal to right key, and the number to the right of both keys.
 //   The values are an estimate.
-//   If you perform a keyrange on two keys that are in the same in-memory and uncompressed basement,
-//   you can use the keys_right numbers (or the keys_left) numbers to get an exact number keys in the range,
-//   if the basement does not change between the keyrange queries.
-//   TODO 4184: What to do with a NULL key?
+//   If you perform a keyrange on two keys that are in the same basement, equal_left, middle, and equal_right will be exact.
+//   4184: What to do with a NULL key?
+//   key_left==NULL is treated as -infinity
+//   key_right==NULL is treated as +infinity
 //   If KEY is NULL then the system picks an arbitrary key and returns it.
+//   key_right can be non-null only if key_left is non-null;
 {
-    struct ftnode_fetch_extra bfe;
-    fill_bfe_for_min_read(&bfe, brt->ft);  // read pivot keys but not message buffers
+    if (!key_left && key_right) {
+        // Simplify internals by only supporting key_right != null when key_left != null
+        // If key_right != null and key_left == null, then swap them and fix up numbers.
+        uint64_t less = 0, equal_left = 0, middle = 0, equal_right = 0, greater = 0;
+        toku_ft_keysrange(brt, key_right, nullptr, &less, &equal_left, &middle, &equal_right, &greater, middle_3_exact_p);
+        *less_p = 0;
+        *equal_left_p = 0;
+        *middle_p = less;
+        *equal_right_p = equal_left;
+        *greater_p = middle;
+        invariant_zero(equal_right);
+        invariant_zero(greater);
+        return;
+    }
+    paranoid_invariant(!(!key_left && key_right));
+    struct ftnode_fetch_extra min_bfe;
+    struct ftnode_fetch_extra match_bfe;
+    fill_bfe_for_min_read(&min_bfe, brt->ft);  // read pivot keys but not message buffers
+    fill_bfe_for_keymatch(&match_bfe, brt->ft, key_left, key_right, false, false);  // read basement node only if both keys in it.
 try_again:
    {
-        uint64_t less = 0, equal = 0, greater = 0;
+        uint64_t less = 0, equal_left = 0, middle = 0, equal_right = 0, greater = 0;
+        bool single_basement_node = false;
        FTNODE node = NULL;
        {
            uint32_t fullhash;
@@ -5700,7 +5771,7 @@ void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal
                brt->ft,
                root_key,
                fullhash,
-                &bfe,
+                &match_bfe,
                PL_READ, // may_modify_node, cannot change root during keyrange
                0,
                NULL,
@@ -5712,25 +5783,81 @@ void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal
        struct unlockers unlockers = {true, unlock_ftnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL};

        {
+            int r;
            int64_t numrows = brt->ft->in_memory_stats.numrows;
            if (numrows < 0)
                numrows = 0;  // prevent appearance of a negative number
-            int r = toku_ft_keyrange_internal (brt, node, key,
-                                                &less, &equal, &greater,
-                                                numrows,
-                                                &bfe, &unlockers, (ANCESTORS)NULL, &infinite_bounds);
+            r = toku_ft_keysrange_internal (brt, node, key_left, key_right, true,
+                                            &less, &equal_left, &middle, &equal_right, &greater,
+                                            &single_basement_node, numrows,
+                                            &min_bfe, &match_bfe, &unlockers, (ANCESTORS)NULL, &infinite_bounds);
            assert(r == 0 || r == TOKUDB_TRY_AGAIN);
            if (r == TOKUDB_TRY_AGAIN) {
                assert(!unlockers.locked);
                goto try_again;
            }
+            // May need to do a second query.
+            if (!single_basement_node && key_right != nullptr) {
+                // "greater" is stored in "middle"
+                invariant_zero(equal_right);
+                invariant_zero(greater);
+                uint64_t less2 = 0, equal_left2 = 0, middle2 = 0, equal_right2 = 0, greater2 = 0;
+                bool ignore;
+                r = toku_ft_keysrange_internal (brt, node, key_right, nullptr, false,
+                                                &less2, &equal_left2, &middle2, &equal_right2, &greater2,
+                                                &ignore, numrows,
+                                                &min_bfe, &match_bfe, &unlockers, (ANCESTORS)nullptr, &infinite_bounds);
+                assert(r == 0 || r == TOKUDB_TRY_AGAIN);
+                if (r == TOKUDB_TRY_AGAIN) {
+                    assert(!unlockers.locked);
+                    goto try_again;
+                }
+                invariant_zero(equal_right2);
+                invariant_zero(greater2);
+                // Update numbers.
+                // less is already correct.
+                // equal_left is already correct.
+
+                // "middle" currently holds everything greater than left_key in first query
+                // 'middle2' currently holds everything greater than right_key in second query
+                // 'equal_left2' is how many match right_key
+
+                // Prevent underflow.
+                if (middle >= equal_left2 + middle2) {
+                    middle -= equal_left2 + middle2;
+                } else {
+                    middle = 0;
+                }
+                equal_right = equal_left2;
+                greater = middle2;
+            }
        }
        assert(unlockers.locked);
        toku_unpin_ftnode_read_only(brt->ft, node);
-        *less_p    = less;
-        *equal_p   = equal;
-        *greater_p = greater;
-    }
+        if (!key_right) {
+            paranoid_invariant_zero(equal_right);
+            paranoid_invariant_zero(greater);
+        }
+        if (!key_left) {
+            paranoid_invariant_zero(less);
+            paranoid_invariant_zero(equal_left);
+        }
+        *less_p        = less;
+        *equal_left_p  = equal_left;
+        *middle_p      = middle;
+        *equal_right_p = equal_right;
+        *greater_p     = greater;
+        *middle_3_exact_p = single_basement_node;
+    }
+}
+
+// Test-only wrapper for the old one-key range function: forwards to toku_ft_keysrange with key_right == nullptr (+infinity).
+void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less,  uint64_t *equal,  uint64_t *greater) {
+    uint64_t zero_equal_right, zero_greater;  // outputs of the two-key API that must come back as zero when there is no right key
+    bool ignore;  // the exactness flag is not reported by the one-key API
+    toku_ft_keysrange(brt, key, nullptr, less, equal, greater, &zero_equal_right, &zero_greater, &ignore);  // GREATER is passed as the "middle" output: with no right key, rows past KEY are counted there
+    invariant_zero(zero_equal_right);
+    invariant_zero(zero_greater);
+}

 void toku_ft_handle_stat64 (FT_HANDLE brt, TOKUTXN UU(txn), struct ftstat64_s *s) {

--- a/ft/ft-ops.h
+++ b/ft/ft-ops.h
@@ -205,6 +205,7 @@ enum ft_flags {
 };

 void toku_ft_keyrange(FT_HANDLE brt, DBT *key, uint64_t *less,  uint64_t *equal,  uint64_t *greater);
+void toku_ft_keysrange(FT_HANDLE brt, DBT* key_left, DBT* key_right, uint64_t *less_p, uint64_t* equal_left_p, uint64_t* middle_p, uint64_t* equal_right_p, uint64_t* greater_p, bool* middle_3_exact_p);

 struct ftstat64_s {
    uint64_t nkeys; /* estimate how many unique keys (even when flattened this may be an estimate)     */

--- a/ft/ft_node-serialize.cc
+++ b/ft/ft_node-serialize.cc
@@ -1372,6 +1372,20 @@ update_bfe_using_ftnode(FTNODE node, struct ftnode_fetch_extra *bfe)
            node,
            bfe->search
            );
+    } else if (bfe->type == ftnode_fetch_keymatch) {
+        // we do not take into account prefetching yet
+        // as of now, if we need a subset, the only thing
+        // we can possibly require is a single basement node
+        // we find out what basement node the query cares about
+        // and check if it is available
+        paranoid_invariant(bfe->h->compare_fun);
+        if (node->height == 0) {
+            int left_child = toku_bfe_leftmost_child_wanted(bfe, node);
+            int right_child = toku_bfe_rightmost_child_wanted(bfe, node);
+            if (left_child == right_child) {
+                bfe->child_to_read = left_child;
+            }
+        }
    }
 }

@@ -1688,7 +1702,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
    // rbuf, so we might be able to store the compressed data for some
    // objects.
    // We can proceed to deserialize the individual subblocks.
-    paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
+    paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));

    // setup the memory of the partitions
    // for partitions being decompressed, create either FIFO or basement node
@@ -2323,7 +2337,7 @@ deserialize_ftnode_from_rbuf(

    // now that the node info has been deserialized, we can proceed to deserialize
    // the individual sub blocks
-    paranoid_invariant(bfe->type == ftnode_fetch_none || bfe->type == ftnode_fetch_subset || bfe->type == ftnode_fetch_all || bfe->type == ftnode_fetch_prefetch);
+    paranoid_invariant(is_valid_ftnode_fetch_type(bfe->type));

    // setup the memory of the partitions
    // for partitions being decompressed, create either FIFO or basement node

--- a/ft/tests/keyrange.cc
+++ b/ft/tests/keyrange.cc
@@ -66,6 +66,49 @@ static void maybe_reopen (enum memory_state ms, uint64_t limit) {
    assert(0);
 }

+static void verify_keysrange(enum memory_state UU(ms), uint64_t limit,  // Test helper: asserts that the counts returned by a two-key range query are plausible, and exact where claimed.
+        uint64_t intkey1,
+        uint64_t intkey2,
+        uint64_t less,
+        uint64_t equal1,
+        uint64_t middle,
+        uint64_t equal2,
+        uint64_t greater,
+        bool middle3exact) {
+    uint64_t max_item = limit * 2 - 1;  // NOTE(review): the arithmetic below assumes the tree holds exactly the odd keys 1..2*limit-1 -- confirm against the insert loop
+    uint64_t perfect_total = limit;
+    uint64_t perfect_less = intkey1 / 2;
+    uint64_t perfect_equal1 = intkey1 % 2 == 1;
+    uint64_t perfect_equal2 = intkey2 % 2 == 1 && intkey2 <= max_item;
+    uint64_t perfect_greater = intkey2 >= max_item ? 0 : (max_item + 1 - intkey2) / 2;
+    uint64_t perfect_middle = perfect_total - perfect_less - perfect_equal1 - perfect_equal2 - perfect_greater;  // whatever remains of the total lies between the two keys
+
+    uint64_t total = less + equal1 + middle + equal2 + greater;
+    assert(total > 0);
+    assert(total < 2 * perfect_total);  // the reported total must stay within a factor of two of the expected row count
+    assert(total > perfect_total / 2);
+
+    assert(equal1 == perfect_equal1 || (equal1 == 0 && !middle3exact));  // an equality count is either right, or zero when the answer is only an estimate
+    assert(equal2 == perfect_equal2 || (equal2 == 0 && !middle3exact));
+
+    // As of 2013-02-25 this is accurate with fiddle ~= total/50.
+    // Set to 1/10th to prevent flakiness.
+    uint64_t fiddle = perfect_total / 10;
+    assert(less + fiddle > perfect_less);  // each estimate must be within +/- fiddle of its expected value
+    assert(less < perfect_less + fiddle);
+
+    assert(middle + fiddle > perfect_middle);
+    assert(middle < perfect_middle + fiddle);
+
+    assert(greater + fiddle > perfect_greater);
+    assert(greater < perfect_greater + fiddle);
+
+    if (middle3exact) {  // a result flagged exact must match the expected middle count exactly
+        assert(middle == perfect_middle);
+    }
+}
+
+
 static void test_keyrange (enum memory_state ms, uint64_t limit) {
    open_ft_and_ct(true);

@@ -123,7 +166,9 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
 #endif
 	    } else {
 		// after reopen, none of the basements are in memory
-		assert(equal == 0);
+		// However, "both" keys can be in the same basement (specifically the last basement node in the tree)
+                // Without trying to figure out how many are in the last basement node, we expect at least the first half not to be in the last basement node.
+                assert(i > limit / 2 || equal == 0);
 #if 0
 		if (i<10) {
 		    assert(less==0);
@@ -189,6 +234,80 @@ static void test_keyrange (enum memory_state ms, uint64_t limit) {
 #endif
    }

+    maybe_reopen(ms, limit);
+
+    {
+        uint64_t totalqueries = 0;
+        uint64_t num_middle3_exact = 0;
+        for (uint64_t i=0; i < 2*limit; i++) {
+	    char key[100];
+	    char keyplus4[100];
+	    char keyplus5[100];
+            uint64_t intkey = i;
+
+	    snprintf(key, 100, "%08" PRIu64 "", intkey);
+	    snprintf(keyplus4, 100, "%08" PRIu64 "", intkey+4);
+	    snprintf(keyplus5, 100, "%08" PRIu64 "", intkey+5);
+
+	    DBT k;
+	    DBT k2;
+	    DBT k3;
+            toku_fill_dbt(&k, key, 1+strlen(key));
+            toku_fill_dbt(&k2, keyplus4, 1+strlen(keyplus4));
+            toku_fill_dbt(&k3, keyplus5, 1+strlen(keyplus5));
+	    uint64_t less,equal1,middle,equal2,greater;
+            bool middle3exact;
+	    toku_ft_keysrange(t, &k, &k2, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
+            if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
+                //TODO(yoni): when reading basement nodes is implemented, get rid of this hack
+                middle3exact = false;
+            }
+            totalqueries++;
+            num_middle3_exact += middle3exact;
+            if (verbose > 1) {
+                printf("Rkey2 %" PRIu64 "/%" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %s\n",
+                       intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
+            }
+            verify_keysrange(ms, limit, intkey, intkey+4,
+                    less, equal1, middle, equal2, greater, middle3exact);
+
+	    toku_ft_keysrange(t, &k, &k3, &less, &equal1, &middle, &equal2, &greater, &middle3exact);
+            if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
+                //TODO(yoni): when reading basement nodes is implemented, get rid of this hack
+                middle3exact = false;
+            }
+            totalqueries++;
+            num_middle3_exact += middle3exact;
+            if (verbose > 1) {
+                printf("Rkey3 %" PRIu64 "/%" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %" PRIu64
+                       " %s\n",
+                       intkey, 2*limit, less, equal1, middle, equal2, greater, middle3exact ? "true" : "false");
+            }
+            verify_keysrange(ms, limit, intkey, intkey+5,
+                    less, equal1, middle, equal2, greater, middle3exact);
+        }
+        assert(num_middle3_exact <= totalqueries);
+        if (ms == CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
+            //TODO(yoni): when reading basement nodes is implemented, get rid of this hack
+            assert(num_middle3_exact == 0);
+        } else {
+            // About 85% of the time, the key for an int (and +4 or +5) is in the
+            // same basement node.  Check >= 70% so this isn't very flaky.
+            assert(num_middle3_exact > totalqueries * 7 / 10);
+        }
+    }
+
    close_ft_and_ct();
 }


--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -695,12 +695,8 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
  declare_custom_tests(keyrange.tdb)
  add_ydb_test_aux(keyrange-get0.tdb keyrange.tdb --get 0)
  add_ydb_test_aux(keyrange-get1.tdb keyrange.tdb --get 1)
-  if (0)
-    add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1)
-    add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1)
-  else ()
-    message(WARNING "TODO(leif): re-enable keyrange tests, see #5666")
-  endif ()
+  add_ydb_test_aux(keyrange-random-get0.tdb keyrange.tdb --get 0 --random_keys 1)  # random-key variants run unconditionally again; they were parked under if (0) pending #5666
+  add_ydb_test_aux(keyrange-random-get1.tdb keyrange.tdb --get 1 --random_keys 1)
  add_ydb_test_aux(keyrange-loader-get0.tdb keyrange.tdb --get 0 --loader 1)
  add_ydb_test_aux(keyrange-loader-get1.tdb keyrange.tdb --get 1 --loader 1)


--- a/src/tests/keyrange.cc
+++ b/src/tests/keyrange.cc
@@ -60,7 +60,7 @@ run_test(void) {

    size_t key_size = 9;
    size_t val_size = 9;
-    size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4; // xid + key + key_len + val + val)len
+    size_t est_row_size_with_overhead = 8 + key_size + 4 + val_size + 4 + 5; // bytes per row: xid (8) + key + key_len (4) + val + val_len (4) + mvcc overhead (5)
    size_t rows_per_basement = db_basement_size / est_row_size_with_overhead;

    int r;
@@ -72,7 +72,8 @@ run_test(void) {
    r = env->open(env, envdir, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);

    r = db_create(&db, env, 0); CKERR(r);
-    r = db->set_pagesize(db, db_page_size);
+    r = db->set_pagesize(db, db_page_size); CKERR(r);  // result is now checked like the neighbouring calls
+    r = db->set_readpagesize(db, db_basement_size); CKERR(r);  // NOTE(review): presumably makes basement nodes db_basement_size bytes so the rows_per_basement estimate matches the tree -- confirm
    r = env->txn_begin(env, 0, &txn, 0); CKERR(r);
    r = db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
    r = txn->commit(txn, 0);    CKERR(r);
@@ -145,7 +146,11 @@ run_test(void) {

    if (0) goto skipit; // debug: just write the tree

+    bool last_basement;  // declared without an initializer, presumably because "goto skipit" above may not jump past an initialization in C++
+    last_basement = false;  // set to true once a row near the end of the tree reports equal == 1 (see the equal checks in the loop)
    // verify key_range for keys that exist in the tree
+    uint64_t random_fudge;
+    random_fudge = random_keys ? rows_per_basement + nrows / 10 : 0;  // extra slack added to the less/greater bounds when random_keys is set
    for (uint64_t i=0; i<nrows; i++) {
 	char key[100];
 	snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
@@ -160,15 +165,38 @@ run_test(void) {
        assert(0 < less + equal + greater);
        if (use_loader) {
            assert(less + equal + greater <= nrows);
-            assert(get_all ? equal == 1 : equal == 0);
+            // equal must be 1 with get_all, for the last row, and for every row
+            // after the first one in the final 2*rows_per_basement rows that
+            // reported 1; it must be 0 for rows before that tail.
+            // The tail test is written as an addition so that it cannot wrap
+            // around when nrows < 2*rows_per_basement.
+            if (get_all || last_basement) {
+                assert(equal == 1);
+            } else if (i + rows_per_basement * 2 < nrows) {
+                assert(equal == 0);
+            } else if (i == nrows - 1) {
+                assert(equal == 1);
+            } else if (equal == 1) {
+                last_basement = true;
+            }
            assert(less <= max64(i, i + rows_per_basement/2));
            assert(greater <= nrows - less);
        } else {
            assert(less + equal + greater <= nrows + nrows / 8);
-            assert(get_all ? equal == 1 : equal == 0);
-            uint64_t est_i = max64(i, i + rows_per_basement/2);
-            assert(less <= est_i + est_i / 1);
-            assert(greater <= nrows - i + rows_per_basement/2);
+            // Same equal checks as the loader branch; the tail test uses an
+            // addition so it cannot wrap when nrows < 2*rows_per_basement.
+            if (get_all || last_basement) {
+                assert(equal == 1);
+            } else if (i + rows_per_basement * 2 < nrows) {
+                assert(equal == 0);
+            } else if (i == nrows - 1) {
+                assert(equal == 1);
+            } else if (equal == 1) {
+                last_basement = true;
+            }
+            uint64_t est_i = i * 2 + rows_per_basement;
+            assert(less <= est_i + random_fudge);
+            assert(greater <= nrows - i + rows_per_basement + random_fudge);
 	}
    }

@@ -193,9 +214,9 @@ run_test(void) {
        } else {
            assert(less + equal + greater <= nrows + nrows / 8);
            assert(equal == 0);
-            uint64_t est_i = max64(i, i + rows_per_basement/2);
-            assert(less <= est_i + est_i / 1);
-            assert(greater <= nrows - i + rows_per_basement/2);
+            uint64_t est_i = i * 2 + rows_per_basement;  // loose upper bound for less: twice the row index plus one basement of slack
+            assert(less <= est_i + random_fudge);  // random_fudge is nonzero only when random_keys is set
+            assert(greater <= nrows - i + rows_per_basement + random_fudge);  // rows from i onward, plus the same slack
        }
    }


--- a/src/ydb_db.cc
+++ b/src/ydb_db.cc
@@ -642,17 +642,46 @@ toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
 }

+// Estimate how the keys of db are distributed around two endpoints by
+// forwarding keyleft/keyright and every out-pointer to toku_ft_keysrange()
+// on db->i->ft_handle.  The five counts (less, left, between, right,
+// greater) and *middle_3_exact are written through those pointers.
+// txn is handed to HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN but is not passed
+// down to the ft layer.
+// Returns 0 after the ft call.  NOTE(review): the HANDLE_* macros are
+// defined elsewhere and presumably return an error early -- confirm.
 static int 
-toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, uint64_t* less, uint64_t* equal, uint64_t* greater, int* is_exact) {
+toku_db_keys_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* keyleft, DBT* keyright, uint64_t* less, uint64_t* left, uint64_t* between, uint64_t *right, uint64_t *greater, bool* middle_3_exact) {
    HANDLE_PANICKED_DB(db);
    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

-    // note that toku_ft_keyrange does not have a txn param
-    // this will be fixed later
-    // temporarily, because the caller, locked_db_keyrange, 
-    // has the ydb lock, we are ok
-    toku_ft_keyrange(db->i->ft_handle, key, less, equal, greater);
-    // temporarily set is_exact to 0 because ft_keyrange does not have this parameter
-    *is_exact = 0;
+    // note that we ignore the txn param.  It would be more complicated to support it.
+    // TODO(yoni): Maybe add support for txns later?  How would we do this?  ydb lock comment about db_keyrange64 is obsolete.
+    toku_ft_keysrange(db->i->ft_handle, keyleft, keyright, less, left, between, right, greater, middle_3_exact);
+    return 0;
+}
+
+// Single-key variant built on toku_db_keys_range64(): key is passed as the
+// left endpoint and NULL as the right endpoint (treated as positive
+// infinity, per the invariants below).  On success *less_p, *equal_p and
+// *greater_p receive the less/left/between counts and *is_exact is always
+// set to false.
+// Returns the status of toku_db_keys_range64() so that a failure there is
+// reported to the caller instead of being masked as success with the
+// output parameters left unwritten.
+static int 
+toku_db_key_range64(DB* db, DB_TXN* txn, DBT* key, uint64_t* less_p, uint64_t* equal_p, uint64_t* greater_p, int* is_exact) {
+    uint64_t less, equal_left, middle, equal_right, greater;
+    bool ignore;
+    int r = toku_db_keys_range64(db, txn, key, NULL, &less, &equal_left, &middle, &equal_right, &greater, &ignore);
+    if (r == 0) {
+        *less_p = less;
+        *equal_p = equal_left;
+        *greater_p = middle;
+        paranoid_invariant_zero(greater);  // no keys are greater than positive infinity
+        paranoid_invariant_zero(equal_right);  // no keys are equal to positive infinity
+        // toku_ft_keysrange does not know when all 3 are exact, so set is_exact to false
+        *is_exact = false;
+    }
-    return 0;
+    return r;
 }

@@ -928,6 +941,7 @@ toku_db_create(DB ** db, DB_ENV * env, uint32_t flags) {
    USDB(pre_acquire_table_lock);
    USDB(pre_acquire_fileops_lock);
    USDB(key_range64);
+    USDB(keys_range64);  // NOTE(review): presumably installs toku_db_keys_range64 as the handle's keys_range64 method (USDB is defined elsewhere) -- confirm
    USDB(hot_optimize);
    USDB(stat64);
    USDB(get_fractal_tree_info64);