Commit 5943bb41 authored by John Esmet's avatar John Esmet Committed by Yoni Fogel

fixes #5771 merge the single txnid optimization to main. single threaded write performance is up 20-50% in mysql and multithreaded performance is largely unchanged.


git-svn-id: file:///svn/toku/tokudb@51108 c7de825b-a66e-492c-adef-691d508d4ae1
parent f50474f9
......@@ -29,6 +29,7 @@ void concurrent_tree::locked_keyrange::prepare(concurrent_tree *tree) {
treenode *const root = &tree->m_root;
m_tree = tree;
m_subtree = root;
m_range = keyrange::get_infinite_range();
root->mutex_lock();
}
......@@ -83,12 +84,6 @@ void concurrent_tree::locked_keyrange::remove(const keyrange &range) {
}
}
uint64_t concurrent_tree::locked_keyrange::remove_all(uint64_t *mem_released) {
// in practice, remove_all is only okay on the root node, because you
// want to remove all of the elements in the tree, not some subtree.
//
// so we lazily enforce that you are removing from a non-empty root.
invariant(m_subtree->is_root());
invariant(!m_subtree->is_empty());
return m_subtree->recursive_remove(mem_released);
void concurrent_tree::locked_keyrange::remove_all(void) {
m_subtree->recursive_remove();
}
......@@ -36,11 +36,14 @@ class concurrent_tree {
// effect: prepare to acquire a locked keyrange over the given
// concurrent_tree, preventing other threads from preparing
// until this thread either does acquire() or release().
// rationale: this provides the user with a serialization point
// for descending / modifying the the tree.
// note: operations performed on a prepared keyrange are equivalent
// to ones performed on an acquired keyrange over -inf, +inf.
// rationale: this provides the user with a serialization point for descending
// or modifying the tree. it also provides a convenient way of
// doing serializable operations on the tree.
// There are two valid sequences of calls:
// - prepare, acquire, release
// - prepare, release
// - prepare, acquire, [operations], release
// - prepare, [operations], release
void prepare(concurrent_tree *tree);
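For illustration, a minimal sketch of both valid call sequences, assuming the toku locktree sources are on the include path and that `cmp`, `left`, and `right` are an already-created comparator and a pair of DBT keys (these names, and the TXNID 1001, are illustrative and not part of this diff):
#include "concurrent_tree.h"
using namespace toku;
// sketch only: exercises both documented sequences on a locked_keyrange
static void locked_keyrange_sequences(comparator *cmp, const DBT *left, const DBT *right) {
    concurrent_tree tree;
    tree.create(cmp);
    keyrange range;
    range.create(left, right);
    concurrent_tree::locked_keyrange lkr;
    // sequence 1: prepare, acquire, [operations], release
    lkr.prepare(&tree);
    lkr.acquire(range);
    lkr.insert(range, 1001);
    lkr.release();
    // sequence 2: prepare, [operations], release
    // a prepared-but-unacquired keyrange behaves as if acquired over
    // (-inf, +inf), so remove_all() here empties the whole tree.
    lkr.prepare(&tree);
    lkr.remove_all();
    lkr.release();
    range.destroy();
    tree.destroy();
}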
// requires: the locked keyrange was prepare()'d
......@@ -68,12 +71,9 @@ class concurrent_tree {
// rationale: caller is responsible for only removing existing ranges
void remove(const keyrange &range);
// effect: removes everything within this locked keyrange
// requires: the locked keyrange is over -inf, +inf
// returns: the number of nodes removed
// returns: *mem_released updated to the cumulative size of all keyranges destroyed
// rationale: we'd like a fast O(n) way of removing everything from the tree
uint64_t remove_all(uint64_t *mem_released);
// effect: removes all of the keys represented by this locked keyrange
// rationale: we'd like a fast way to empty out a tree
void remove_all(void);
private:
// the concurrent tree this locked keyrange is for
......
......@@ -11,12 +11,14 @@
#include <portability/toku_time.h>
#include "locktree.h"
#include "range_buffer.h"
// including the concurrent_tree here expands the templates
// and "defines" the implementation, so we do it here in
// the locktree source file instead of the header.
#include "concurrent_tree.h"
namespace toku {
// A locktree represents the set of row locks owned by all transactions
......@@ -41,7 +43,10 @@ void locktree::create(manager::memory_tracker *mem_tracker, DICTIONARY_ID dict_i
m_userdata = nullptr;
XCALLOC(m_rangetree);
m_rangetree->create(m_cmp);
reset_single_txnid_optimization(TXNID_NONE);
m_sto_txnid = TXNID_NONE;
m_sto_buffer.create();
m_sto_score = STO_SCORE_THRESHOLD;
m_lock_request_info.pending_lock_requests.create();
m_lock_request_info.mutex = TOKU_MUTEX_INITIALIZER;
......@@ -63,6 +68,7 @@ void locktree::destroy(void) {
m_rangetree->destroy();
toku_free(m_cmp);
toku_free(m_rangetree);
m_sto_buffer.destroy();
m_lock_request_info.pending_lock_requests.destroy();
}
......@@ -78,7 +84,7 @@ struct row_lock {
// caller does not own the range inside the returned row locks,
// so remove from the tree with care using them as keys.
static void iterate_and_get_overlapping_row_locks(
const concurrent_tree::locked_keyrange &lkr,
const concurrent_tree::locked_keyrange *lkr,
GrowableArray<row_lock> *row_locks) {
struct copy_fn_obj {
GrowableArray<row_lock> *row_locks;
......@@ -89,7 +95,7 @@ static void iterate_and_get_overlapping_row_locks(
}
} copy_fn;
copy_fn.row_locks = row_locks;
lkr.iterate(&copy_fn);
lkr->iterate(&copy_fn);
}
// given a txnid and a set of overlapping row locks, determine
......@@ -112,75 +118,140 @@ static bool determine_conflicting_txnids(const GrowableArray<row_lock> &row_lock
return conflicts_exist;
}
// given a row lock, what is the effective memory size?
// that is, how much memory does it take when stored in a tree?
static uint64_t effective_row_lock_memory_size(const row_lock &lock) {
// how much memory does a row lock take up in a concurrent tree?
static uint64_t row_lock_size_in_tree(const row_lock &lock) {
const uint64_t overhead = concurrent_tree::get_insertion_memory_overhead();
return lock.range.get_memory_size() + overhead;
}
// remove all row locks from the given lkr, then notify the memory tracker.
static void remove_all_row_locks(concurrent_tree::locked_keyrange *lkr,
locktree::manager::memory_tracker *mem_tracker) {
uint64_t mem_released = 0;
uint64_t num_removed = lkr->remove_all(&mem_released);
mem_released += num_removed * concurrent_tree::get_insertion_memory_overhead();
mem_tracker->note_mem_released(mem_released);
}
// remove and destroy the given row lock from the locked keyrange,
// then notify the memory tracker of the newly freed lock.
static void remove_row_lock(concurrent_tree::locked_keyrange *lkr,
static void remove_row_lock_from_tree(concurrent_tree::locked_keyrange *lkr,
const row_lock &lock, locktree::manager::memory_tracker *mem_tracker) {
const uint64_t mem_released = effective_row_lock_memory_size(lock);
const uint64_t mem_released = row_lock_size_in_tree(lock);
lkr->remove(lock.range);
mem_tracker->note_mem_released(mem_released);
}
// insert a row lock into the locked keyrange, then notify
// the memory tracker of this newly acquired lock.
static void insert_row_lock(concurrent_tree::locked_keyrange *lkr,
static void insert_row_lock_into_tree(concurrent_tree::locked_keyrange *lkr,
const row_lock &lock, locktree::manager::memory_tracker *mem_tracker) {
uint64_t mem_used = effective_row_lock_memory_size(lock);
uint64_t mem_used = row_lock_size_in_tree(lock);
lkr->insert(lock.range, lock.txnid);
mem_tracker->note_mem_used(mem_used);
}
void locktree::update_single_txnid_optimization(TXNID txnid) {
if (m_rangetree->is_empty()) {
// the optimization becomes possible for this txnid if the
// tree was previously empy before we touched it. the idea
// here is that if we are still the only one to have touched
// it by the time we commit, the optimization holds.
reset_single_txnid_optimization(txnid);
} else {
// the tree is not empty, so some txnid must have touched it.
invariant(m_single_txnid != TXNID_NONE);
// the optimization is not possible if the txnid has changed
if (m_single_txnid_optimization_possible && m_single_txnid != txnid) {
m_single_txnid_optimization_possible = false;
void locktree::sto_begin(TXNID txnid) {
invariant(m_sto_txnid == TXNID_NONE);
invariant(m_sto_buffer.is_empty());
m_sto_txnid = txnid;
}
void locktree::sto_append(const DBT *left_key, const DBT *right_key) {
uint64_t buffer_mem, delta;
keyrange range;
range.create(left_key, right_key);
buffer_mem = m_sto_buffer.get_num_bytes();
m_sto_buffer.append(left_key, right_key);
delta = m_sto_buffer.get_num_bytes() - buffer_mem;
m_mem_tracker->note_mem_used(delta);
}
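As a side note on the bookkeeping above, sto_append() charges the memory tracker only for the bytes this particular append adds to the buffer. A minimal sketch of that delta-accounting pattern, assuming the locktree sources; the helper name is hypothetical:
#include "range_buffer.h"
using namespace toku;
// sketch only: measure the buffer before and after the append and return
// the difference, which is what the locktree reports via note_mem_used()
static uint64_t append_and_measure_delta(range_buffer *buf,
                                         const DBT *left_key, const DBT *right_key) {
    uint64_t before = buf->get_num_bytes();
    buf->append(left_key, right_key);
    return buf->get_num_bytes() - before;
}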
void locktree::sto_end(void) {
uint64_t num_bytes = m_sto_buffer.get_num_bytes();
m_mem_tracker->note_mem_released(num_bytes);
m_sto_buffer.destroy();
m_sto_buffer.create();
m_sto_txnid = TXNID_NONE;
}
void locktree::sto_end_early(void *prepared_lkr) {
sto_migrate_buffer_ranges_to_tree(prepared_lkr);
sto_end();
m_sto_score = 0;
}
void locktree::sto_migrate_buffer_ranges_to_tree(void *prepared_lkr) {
// There should be something to migrate, and nothing in the rangetree.
invariant(!m_sto_buffer.is_empty());
invariant(m_rangetree->is_empty());
concurrent_tree sto_rangetree;
concurrent_tree::locked_keyrange sto_lkr;
sto_rangetree.create(m_cmp);
// insert all of the ranges from the single txnid buffer into a new rangetree
range_buffer::iterator iter;
range_buffer::iterator::record rec;
iter.create(&m_sto_buffer);
while (iter.current(&rec)) {
sto_lkr.prepare(&sto_rangetree);
int r = acquire_lock_consolidated(&sto_lkr,
m_sto_txnid, rec.get_left_key(), rec.get_right_key(), nullptr);
invariant_zero(r);
sto_lkr.release();
iter.next();
}
// Iterate the newly created rangetree and insert each range into the
// locktree's rangetree, on behalf of the old single txnid.
struct migrate_fn_obj {
concurrent_tree::locked_keyrange *dst_lkr;
bool fn(const keyrange &range, TXNID txnid) {
dst_lkr->insert(range, txnid);
return true;
}
} migrate_fn;
migrate_fn.dst_lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
sto_lkr.prepare(&sto_rangetree);
sto_lkr.iterate(&migrate_fn);
sto_lkr.remove_all();
sto_lkr.release();
sto_rangetree.destroy();
invariant(!m_rangetree->is_empty());
}
// acquire a lock in the given key range, inclusive. if successful,
// return 0. otherwise, populate the conflicts txnid_set with the set of
// transactions that conflict with this request.
int locktree::acquire_lock(bool is_write_request, TXNID txnid,
bool locktree::sto_try_acquire(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key) {
if (m_rangetree->is_empty() && m_sto_buffer.is_empty() && m_sto_score >= STO_SCORE_THRESHOLD) {
// We can do the optimization because the rangetree is empty, and
// we know it's worth trying because the sto score is high enough.
sto_begin(txnid);
} else if (m_sto_txnid != TXNID_NONE) {
// We are currently doing the optimization. Check if we need to cancel
// it because a new txnid appeared, or if the current single txnid has
// taken too many locks already.
if (m_sto_txnid != txnid || m_sto_buffer.get_num_ranges() > STO_BUFFER_MAX_SIZE) {
sto_end_early(prepared_lkr);
}
}
// At this point the sto txnid is properly set. If it is valid, then
// this txnid can append its lock to the sto buffer successfully.
if (m_sto_txnid != TXNID_NONE) {
invariant(m_sto_txnid == txnid);
sto_append(left_key, right_key);
return true;
} else {
invariant(m_sto_buffer.is_empty());
return false;
}
}
// try to acquire a lock and consolidate it with existing locks if possible
// param: lkr, a prepared locked keyrange
// return: 0 on success, DB_LOCK_NOTGRANTED if conflicting locks exist.
int locktree::acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key, txnid_set *conflicts) {
int r = 0;
concurrent_tree::locked_keyrange *lkr;
keyrange requested_range;
requested_range.create(left_key, right_key);
// we are only supporting write locks for simplicity
invariant(is_write_request);
// acquire and prepare a locked keyrange over the requested range.
// prepare is a serialzation point, so we take the opportunity to
// update the single txnid optimization bits.
concurrent_tree::locked_keyrange lkr;
lkr.prepare(m_rangetree);
update_single_txnid_optimization(txnid);
lkr.acquire(requested_range);
lkr = static_cast<concurrent_tree::locked_keyrange *>(prepared_lkr);
lkr->acquire(requested_range);
// copy out the set of overlapping row locks.
GrowableArray<row_lock> overlapping_row_locks;
......@@ -189,7 +260,8 @@ int locktree::acquire_lock(bool is_write_request, TXNID txnid,
size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
// if any overlapping row locks conflict with this request, bail out.
bool conflicts_exist = determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
bool conflicts_exist = determine_conflicting_txnids(
overlapping_row_locks, txnid, conflicts);
if (!conflicts_exist) {
// there are no conflicts, so all of the overlaps are for the requesting txnid.
// so, we must consolidate all existing overlapping ranges and the requested
......@@ -198,23 +270,43 @@ int locktree::acquire_lock(bool is_write_request, TXNID txnid,
row_lock overlapping_lock = overlapping_row_locks.fetch_unchecked(i);
invariant(overlapping_lock.txnid == txnid);
requested_range.extend(m_cmp, overlapping_lock.range);
remove_row_lock(&lkr, overlapping_lock, m_mem_tracker);
remove_row_lock_from_tree(lkr, overlapping_lock, m_mem_tracker);
}
row_lock new_lock = { .range = requested_range, .txnid = txnid };
insert_row_lock(&lkr, new_lock, m_mem_tracker);
insert_row_lock_into_tree(lkr, new_lock, m_mem_tracker);
} else {
r = DB_LOCK_NOTGRANTED;
}
lkr.release();
overlapping_row_locks.deinit();
requested_range.destroy();
overlapping_row_locks.deinit();
return r;
}
// if there were conflicts, the lock is not granted.
if (conflicts_exist) {
return DB_LOCK_NOTGRANTED;
} else {
return 0;
// acquire a lock in the given key range, inclusive. if successful,
// return 0. otherwise, populate the conflicts txnid_set with the set of
// transactions that conflict with this request.
int locktree::acquire_lock(bool is_write_request, TXNID txnid,
const DBT *left_key, const DBT *right_key, txnid_set *conflicts) {
int r = 0;
// we are only supporting write locks for simplicity
invariant(is_write_request);
// acquire and prepare a locked keyrange over the requested range.
// prepare is a serialization point, so we take the opportunity to
// try the single txnid optimization first.
concurrent_tree::locked_keyrange lkr;
lkr.prepare(m_rangetree);
bool acquired = sto_try_acquire(&lkr, txnid, left_key, right_key);
if (!acquired) {
r = acquire_lock_consolidated(&lkr, txnid, left_key, right_key, conflicts);
}
lkr.release();
return r;
}
int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,
......@@ -252,7 +344,7 @@ void locktree::get_conflicts(bool is_write_request, TXNID txnid,
// copy out the set of overlapping row locks and determine the conflicts
GrowableArray<row_lock> overlapping_row_locks;
overlapping_row_locks.init();
iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
// we don't care if conflicts exist. we just want the conflicts set populated.
(void) determine_conflicting_txnids(overlapping_row_locks, txnid, conflicts);
......@@ -300,7 +392,7 @@ void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
// copy out the set of overlapping row locks.
GrowableArray<row_lock> overlapping_row_locks;
overlapping_row_locks.init();
iterate_and_get_overlapping_row_locks(lkr, &overlapping_row_locks);
iterate_and_get_overlapping_row_locks(&lkr, &overlapping_row_locks);
size_t num_overlapping_row_locks = overlapping_row_locks.get_size();
for (size_t i = 0; i < num_overlapping_row_locks; i++) {
......@@ -308,7 +400,7 @@ void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
// If this isn't our lock, that's ok, just don't remove it.
// See rationale above.
if (lock.txnid == txnid) {
remove_row_lock(&lkr, lock, m_mem_tracker);
remove_row_lock_from_tree(&lkr, lock, m_mem_tracker);
}
}
......@@ -317,41 +409,27 @@ void locktree::remove_overlapping_locks_for_txnid(TXNID txnid,
release_range.destroy();
}
// reset the optimization bit to possible for the given txnid
void locktree::reset_single_txnid_optimization(TXNID txnid) {
m_single_txnid = txnid;
m_single_txnid_optimization_possible = true;
inline bool locktree::sto_txnid_is_valid_unsafe(void) const {
return m_sto_txnid != TXNID_NONE;
}
inline bool locktree::unsafe_read_single_txnid_optimization_possible(void) const {
return m_single_txnid_optimization_possible;
inline int locktree::sto_get_score_unsafe(void) const {
return m_sto_score;
}
bool locktree::try_single_txnid_release_optimization(TXNID txnid) {
bool locktree::sto_try_release(TXNID txnid) {
bool released = false;
if (unsafe_read_single_txnid_optimization_possible()) {
if (sto_txnid_is_valid_unsafe()) {
// check the bit again with a prepared locked keyrange,
// which protects the optimization bits and rangetree data
concurrent_tree::locked_keyrange lkr;
lkr.prepare(m_rangetree);
if (m_single_txnid_optimization_possible) {
if (m_sto_txnid != TXNID_NONE) {
// this txnid better be the single txnid on this locktree,
// or else we are in big trouble (meaning the logic is broken)
invariant(m_single_txnid == txnid);
// acquire a locked range on -inf, +inf. this is just for
// readability's sake, since the prepared lkr already has
// the root locked, but the API says to do this so we do.
keyrange infinite_range = keyrange::get_infinite_range();
lkr.acquire(infinite_range);
// knowing that only our row locks exist in the locktree
// and that we have the entire thing locked, remove everything.
remove_all_row_locks(&lkr, m_mem_tracker);
// reset the optimization back to possible, with no txnid
// we set txnid to TXNID_NONE for invariant purposes.
reset_single_txnid_optimization(TXNID_NONE);
invariant(m_sto_txnid == txnid);
invariant(m_rangetree->is_empty());
sto_end();
released = true;
}
lkr.release();
......@@ -364,7 +442,7 @@ bool locktree::try_single_txnid_release_optimization(TXNID txnid) {
void locktree::release_locks(TXNID txnid, const range_buffer *ranges) {
// try the single txn optimization. if it worked, then all of the
// locks are already released, otherwise we need to do it here.
bool released = try_single_txnid_release_optimization(txnid);
bool released = sto_try_release(txnid);
if (!released) {
range_buffer::iterator iter;
range_buffer::iterator::record rec;
......@@ -375,6 +453,13 @@ void locktree::release_locks(TXNID txnid, const range_buffer *ranges) {
remove_overlapping_locks_for_txnid(txnid, left_key, right_key);
iter.next();
}
// Increase the sto score slightly. Eventually it will hit
// the threshold and we'll try the optimization again. This
// is how a previously multithreaded system transitions into
// a single threaded system that benefits from the optimization.
if (sto_get_score_unsafe() < STO_SCORE_THRESHOLD) {
toku_sync_fetch_and_add(&m_sto_score, 1);
}
}
}
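To make the score mechanism above concrete, here is a small standalone model. It is not the locktree code itself (which uses toku_sync_fetch_and_add and the real rangetree state); it only shows that after a forced migration resets the score to zero, it takes STO_SCORE_THRESHOLD releases before the optimization becomes eligible again:
#include <cassert>
static const int STO_SCORE_THRESHOLD = 100;   // value chosen by this commit
struct sto_model {
    int score = STO_SCORE_THRESHOLD;
    bool eligible(bool rangetree_empty) const {
        return rangetree_empty && score >= STO_SCORE_THRESHOLD;
    }
    void on_migration() { score = 0; }                              // optimization was cancelled
    void on_release()   { if (score < STO_SCORE_THRESHOLD) score++; }
};
int main(void) {
    sto_model m;
    m.on_migration();                       // a second txnid forced a migration
    int releases = 0;
    while (!m.eligible(true)) {             // count releases until we retry
        m.on_release();
        releases++;
    }
    assert(releases == STO_SCORE_THRESHOLD);
    return 0;
}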
......@@ -409,12 +494,12 @@ static int extract_first_n_row_locks(concurrent_tree::locked_keyrange *lkr,
// now that the ranges have been copied out, complete
// the extraction by removing the ranges from the tree.
// use remove_row_lock() so we properly track the
// use remove_row_lock_from_tree() so we properly track the
// amount of memory and number of locks freed.
int num_extracted = extract_fn.num_extracted;
invariant(num_extracted <= num_to_extract);
for (int i = 0; i < num_extracted; i++) {
remove_row_lock(lkr, row_locks[i], mem_tracker);
remove_row_lock_from_tree(lkr, row_locks[i], mem_tracker);
}
return num_extracted;
......@@ -437,6 +522,13 @@ void locktree::escalate(void) {
lkr.prepare(m_rangetree);
lkr.acquire(infinite_range);
// if we're in the single txnid optimization, simply call it off.
// if you have to run escalation, you probably don't care about
// the optimization anyway, and this makes things easier.
if (m_sto_txnid != TXNID_NONE) {
sto_end_early(&lkr);
}
// extract and remove batches of row locks from the locktree
int num_extracted;
static const int num_row_locks_per_batch = 128;
......@@ -487,7 +579,7 @@ void locktree::escalate(void) {
size_t new_num_locks = escalated_locks.get_size();
for (size_t i = 0; i < new_num_locks; i++) {
row_lock lock = escalated_locks.fetch_unchecked(i);
insert_row_lock(&lkr, lock, m_mem_tracker);
insert_row_lock_into_tree(&lkr, lock, m_mem_tracker);
lock.range.destroy();
}
......
......@@ -168,6 +168,12 @@ class locktree {
};
ENSURE_POD(memory_tracker);
// effect: calls the private function run_escalation(), only ok to
// do for tests.
// rationale: to get better stress test coverage, we want a way to
// deterministically trigger lock escalation.
void run_escalation_for_test(void);
private:
static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
static const uint64_t DEFAULT_LOCK_WAIT_TIME = 0;
......@@ -240,54 +246,148 @@ class locktree {
struct lt_lock_request_info m_lock_request_info;
// the following is an optimization for locktrees that contain
// locks for only a single txnid. in this case, we can just
// delete everything from the locktree when that txnid unlocks.
// The following fields and members prefixed with "sto_" are for
// the single txnid optimization, intended to speed up the case
// when only one transaction is using the locktree. If we know
// the locktree has only one transaction, then acquiring locks
// takes O(1) work and releasing all locks takes O(1) work.
//
// How do we know that the locktree only has a single txnid?
// What do we do if it does?
//
// When a txn with txnid T requests a lock:
// - If the tree is empty, the optimization is possible. Set the single
// txnid to T, and insert the lock range into the buffer.
// - If the tree is not empty, check if the single txnid is T. If so,
// append the lock range to the buffer. Otherwise, migrate all of
// the locks in the buffer into the rangetree on behalf of txnid T,
// and invalidate the single txnid.
//
// When a txn with txnid T releases its locks:
// - If the single txnid is valid, it must be for T. Destroy the buffer.
// - If it's not valid, release locks the normal way in the rangetree.
//
// To carry out the optimization we need to record a single txnid
// and a range buffer for each locktree, each protected by the root
// lock of the locktree's rangetree. The root lock for a rangetree
// is grabbed by preparing a locked keyrange on the rangetree.
TXNID m_sto_txnid;
range_buffer m_sto_buffer;
// The single txnid optimization speeds up the case when only one
// transaction is using the locktree. But it has the potential to
// hurt the case when more than one txnid exists.
//
// how do we know that this locktree only has a single txnid?
// There are two things we need to do to make the optimization only
// optimize the case we care about, and not hurt the general case.
//
// when a txn requests a lock:
// - if the tree is empty, set the single txnid to that txn, set
// the optimization to true.
// - if the tree is not empty, then some txnid has inserted into
// the tree before and its txnid is m_single_txnid. set the bit
// to false if that txnid is different than the one about to insert.
// - if the txnid never changes (ie: only one txnid inserts into
// a locktree) then the bit stays true and the optimization happens.
// Bound the worst-case latency for lock migration when the
// optimization stops working:
// - Idea: Stop the optimization and migrate immediately if we notice
// the single txnid has taken many locks in the range buffer.
// - Implementation: Enforce a max size on the single txnid range buffer.
// - Analysis: Choosing the perfect max value, M, is difficult to do
// without some feedback from the field. Intuition tells us that M should
// not be so small that the optimization is worthless, and it should not
// be so big that it's unreasonable to have to wait behind a thread doing
// the work of converting M buffer locks into rangetree locks.
//
// when a txn releases its locks
// - check the optimization bit. if it is set, take the locktree's mutex
// and then check it agian. if it is still set, then perform the optimizaiton.
// - if the bit was not set, then carry out release locks as usual.
// Prevent concurrent-transaction workloads from trying the optimization
// in vain:
// - Idea: Don't even bother trying the optimization if we think the
// system is in a concurrent-transaction state.
// - Implementation: Do something even simpler than detecting whether the
// system is in a concurrent-transaction state. Just keep a "score" value
// and some threshold. If at any time the locktree is eligible for the
// optimization, only do it if the score is at this threshold. When you
// actually do the optimization but someone has to migrate locks in the buffer
// (expensive), then reset the score back to zero. Each time a txn
// releases locks, the score is incremented by 1.
// - Analysis: If you let the threshold be "C", then at most 1 / C txns will
// do the optimization in a concurrent-transaction system. Similarly, it
// takes at most C txns to start using the single txnid optimization, which
// is good when the system transitions from multithreaded to single threaded.
//
// the single txnid and the optimizable possible bit are both protected
// by the root lock on the concurrent tree. the way this is implemented
// is by a locked keyrange function called prepare(), which grabs
// the root lock and returns. once acquire/release() is called, the root
// lock is unlocked if necessary. so prepare() acts as a serialization
// point where we can safely read and modify these bits.
TXNID m_single_txnid;
bool m_single_txnid_optimization_possible;
// effect: If the single txnid is possible, assert that it
// is for the given txnid and then release all of
// the locks in the locktree.
// returns: True if locks were released, false otherwise
bool try_single_txnid_release_optimization(TXNID txnid);
// effect: Checks if the single txnid bit is set and, if so,
// sets it to false iff the given txnid differs
// from the current known single txnid.
void update_single_txnid_optimization(TXNID txnid);
// effect: Sets the single txnid bit to be true for the given txnid
void reset_single_txnid_optimization(TXNID txnid);
// STO_BUFFER_MAX_SIZE:
//
// We choose the max value to be 1 million since most transactions take fewer
// than 1 million locks and we can create a rangetree of 1 million elements in
// less than a second. So we can be pretty confident that this threshold
// enables the optimization almost always, and prevents super pathological
// latency issues for the first lock taken by a second thread.
//
// STO_SCORE_THRESHOLD:
//
// A simple first guess at a good value for the score threshold is 100.
// By our analysis, we'd end up doing the optimization in vain for
// around 1% of all transactions, which seems reasonable. Further,
// if the system goes single threaded, it ought to be pretty quick
// for 100 transactions to go by, so we won't have to wait long before
// we start doing the single txnid optimization again.
static const int STO_BUFFER_MAX_SIZE = 1 * 1024 * 1024;
static const int STO_SCORE_THRESHOLD = 100;
int m_sto_score;
// effect: begins the single txnid optimization, setting m_sto_txnid
// to the given txnid.
// requires: m_sto_txnid is invalid
void sto_begin(TXNID txnid);
// effect: append a range to the sto buffer
// requires: m_sto_txnid is valid
void sto_append(const DBT *left_key, const DBT *right_key);
// effect: ends the single txnid optimization, releasing any memory
// stored in the sto buffer, notifying the tracker, and
// invalidating m_sto_txnid.
// requires: m_sto_txnid is valid
void sto_end(void);
// params: prepared_lkr is a void * to a prepared locked keyrange. see below.
// effect: ends the single txnid optimization early, migrating buffer locks
// into the rangetree, calling sto_end(), and then setting the
// sto_score back to zero.
// requires: m_sto_txnid is valid
void sto_end_early(void *prepared_lkr);
// params: prepared_lkr is a void * to a prepared locked keyrange. we can't use
// the real type because the compiler won't allow us to forward declare
// concurrent_tree::locked_keyrange without including concurrent_tree.h,
// which we cannot do here because it is a template implementation.
// requires: the prepared locked keyrange is for the locktree's rangetree
// requires: m_sto_txnid is valid
// effect: migrates each lock in the single txnid buffer into the locktree's
// rangetree, notifying the memory tracker as necessary.
void sto_migrate_buffer_ranges_to_tree(void *prepared_lkr);
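The void * indirection described above can be sketched in isolation. The names below (`widget`, `helper`) are hypothetical and only illustrate the pattern of keeping a template type out of a header by passing an opaque pointer and casting it back where the template implementation is visible:
#include <vector>
// --- header view: the heavy template type is never named here ---
class widget {
public:
    void do_work(void *opaque_helper);    // really a helper<int> *
};
// --- source view: the template is available, so restore the real type ---
template <typename T>
struct helper {
    std::vector<T> items;
};
void widget::do_work(void *opaque_helper) {
    helper<int> *h = static_cast<helper<int> *>(opaque_helper);
    h->items.push_back(42);
}
int main(void) {
    helper<int> h;
    widget w;
    w.do_work(&h);    // h.items now holds one element
    return 0;
}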
// effect: If m_sto_txnid is valid, then release the txnid's locks
// by ending the optimization.
// requires: If m_sto_txnid is valid, it is equal to the given txnid
// returns: True if locks were released for this txnid
bool sto_try_release(TXNID txnid);
// params: prepared_lkr is a void * to a prepared locked keyrange. see above.
// requires: the prepared locked keyrange is for the locktree's rangetree
// effect: If m_sto_txnid is valid and equal to the given txnid, then
// append a range onto the buffer. Otherwise, if m_sto_txnid is valid
// but not equal to this txnid, then migrate the buffer's locks
// into the rangetree and end the optimization, setting the score
// back to zero.
// returns: true if the lock was acquired for this txnid
bool sto_try_acquire(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key);
// Effect:
// Provides a hook for a helgrind suppression.
// Returns:
// m_single_txnid_optimization_possible
bool unsafe_read_single_txnid_optimization_possible(void) const;
// true if m_sto_txnid is not TXNID_NONE
bool sto_txnid_is_valid_unsafe(void) const;
// Effect:
// Provides a hook for a helgrind suppression.
// Returns:
// m_sto_score
int sto_get_score_unsafe(void) const;
// effect: Creates a locktree that uses the given memory tracker
// to report memory usage and honor memory constraints.
......@@ -299,12 +399,15 @@ class locktree {
void remove_overlapping_locks_for_txnid(TXNID txnid,
const DBT *left_key, const DBT *right_key);
int try_acquire_lock(bool is_write_request, TXNID txnid,
int acquire_lock_consolidated(void *prepared_lkr, TXNID txnid,
const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
int acquire_lock(bool is_write_request, TXNID txnid,
const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
int try_acquire_lock(bool is_write_request, TXNID txnid,
const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
void escalate();
friend class locktree_unit_test;
......
......@@ -196,6 +196,11 @@ void locktree::manager::release_lt(locktree *lt) {
}
}
// test-only version of lock escalation
void locktree::manager::run_escalation_for_test(void) {
run_escalation();
}
// effect: escalates the locks in each locktree
// requires: manager's mutex is held
void locktree::manager::run_escalation(void) {
......
......@@ -116,6 +116,7 @@ void range_buffer::create(void) {
m_buf = nullptr;
m_buf_size = 0;
m_buf_current = 0;
m_num_ranges = 0;
}
void range_buffer::append(const DBT *left_key, const DBT *right_key) {
......@@ -125,6 +126,19 @@ void range_buffer::append(const DBT *left_key, const DBT *right_key) {
} else {
append_range(left_key, right_key);
}
m_num_ranges++;
}
bool range_buffer::is_empty(void) const {
return m_buf == nullptr;
}
uint64_t range_buffer::get_num_bytes(void) const {
return m_buf_current;
}
int range_buffer::get_num_ranges(void) const {
return m_num_ranges;
}
void range_buffer::destroy(void) {
......
......@@ -98,12 +98,22 @@ class range_buffer {
// if the keys are equal, then only one copy is stored.
void append(const DBT *left_key, const DBT *right_key);
// is this range buffer empty?
bool is_empty(void) const;
// how many bytes are stored in this range buffer?
uint64_t get_num_bytes(void) const;
// how many ranges are stored in this range buffer?
int get_num_ranges(void) const;
void destroy(void);
private:
char *m_buf;
size_t m_buf_size;
size_t m_buf_current;
int m_num_ranges;
void append_range(const DBT *left_key, const DBT *right_key);
......
......@@ -18,13 +18,6 @@ void concurrent_tree_unit_test::test_lkr_remove_all(void) {
const uint64_t min = 0;
const uint64_t max = 20;
// determine how much memory should be released
keyrange example_keyrange;
example_keyrange.create(get_dbt(0), get_dbt(0));
const uint64_t num_elements = max + 1;
const uint64_t expected_mem_released =
example_keyrange.get_memory_size() * num_elements;
// remove_all should work regardless of how the
// data was inserted into the tree, so we test it
// on a tree whose elements were populated starting
......@@ -44,10 +37,7 @@ void concurrent_tree_unit_test::test_lkr_remove_all(void) {
// remove_all() from the locked keyrange and assert that
// the number of elements and memory removed is correct.
uint64_t mem_released = 0;
uint64_t num_removed = lkr.remove_all(&mem_released);
invariant(num_removed == num_elements);
invariant(mem_released == expected_mem_released);
lkr.remove_all();
invariant(lkr.m_subtree->is_empty());
invariant(tree.is_empty());
......
......@@ -51,7 +51,7 @@ void lock_request_unit_test::test_start_pending(void) {
invariant(compare_dbts(nullptr, &request.m_right_key_copy, one) == 0);
// release the range lock for txnid b
lt->remove_overlapping_locks_for_txnid(txnid_b, zero, two);
locktree_unit_test::locktree_test_release_lock(lt, txnid_b, zero, two);
// now retry the lock requests.
// it should transition the request to successfully complete.
......@@ -60,7 +60,7 @@ void lock_request_unit_test::test_start_pending(void) {
invariant(request.m_state == lock_request::state::COMPLETE);
invariant(request.m_complete_r == 0);
lt->remove_overlapping_locks_for_txnid(txnid_a, one, one);
locktree_unit_test::locktree_test_release_lock(lt, txnid_a, one, one);
request.destroy();
mgr.release_lt(lt);
......
......@@ -8,6 +8,7 @@
#define TOKU_LOCK_REQUEST_UNIT_TEST_H
#include "test.h"
#include "locktree_unit_test.h"
#include "lock_request.h"
......@@ -37,7 +38,7 @@ class lock_request_unit_test {
// lt->release_locks(), not individually using lt->remove_overlapping_locks_for_txnid).
void release_lock_and_retry_requests(locktree *lt,
TXNID txnid, const DBT *left_key, const DBT * right_key) {
lt->remove_overlapping_locks_for_txnid(txnid, left_key, right_key);
locktree_unit_test::locktree_test_release_lock(lt, txnid, left_key, right_key);
lock_request::retry_all_lock_requests(lt);
}
};
......
......@@ -77,10 +77,9 @@ void locktree_unit_test::test_conflicts(void) {
#undef ACQUIRE_LOCK
}
invariant(num_row_locks(lt) == 2);
lt->remove_overlapping_locks_for_txnid(txnid_a, one, one);
lt->remove_overlapping_locks_for_txnid(txnid_a, three, four);
invariant(num_row_locks(lt) == 0);
invariant(no_row_locks(lt));
}
mgr.release_lt(lt);
......
......@@ -29,6 +29,16 @@ void locktree_unit_test::test_overlapping_relock(void) {
int r;
TXNID txnid_a = 1001;
// because of the single txnid optimization, there is no consolidation
// of read or write ranges until there are at least two txnids in
// the locktree. so here we add some arbitrary txnid to get a point
// lock [100, 100] so that the test below can expect to actually
// do something. at the end of the test, we release 100, 100.
const TXNID the_other_txnid = 9999;
const DBT *hundred = get_dbt(100);
r = lt->acquire_write_lock(the_other_txnid, hundred, hundred, nullptr);
invariant(r == 0);
for (int test_run = 0; test_run < 2; test_run++) {
// test_run == 0 means test with read lock
// test_run == 1 means test with write lock
......@@ -40,19 +50,22 @@ void locktree_unit_test::test_overlapping_relock(void) {
// ensure only [1,2] exists in the tree
r = ACQUIRE_LOCK(txnid_a, one, one, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_a, two, two, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 2);
r = ACQUIRE_LOCK(txnid_a, one, two, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
struct verify_fn_obj {
bool saw_the_other;
TXNID expected_txnid;
keyrange *expected_range;
comparator *cmp;
bool fn(const keyrange &range, TXNID txnid) {
if (txnid == the_other_txnid) {
invariant(!saw_the_other);
saw_the_other = true;
return true;
}
invariant(txnid == expected_txnid);
keyrange::comparison c = range.compare(cmp, *expected_range);
invariant(c == keyrange::comparison::EQUALS);
......@@ -61,55 +74,52 @@ void locktree_unit_test::test_overlapping_relock(void) {
} verify_fn;
verify_fn.cmp = lt->m_cmp;
#define do_verify() \
do { verify_fn.saw_the_other = false; locktree_iterate<verify_fn_obj>(lt, &verify_fn); } while (0)
keyrange range;
range.create(one, two);
verify_fn.expected_txnid = txnid_a;
verify_fn.expected_range = &range;
locktree_iterate<verify_fn_obj>(lt, &verify_fn);
do_verify();
// unlocking [1,1] should remove the only range,
// the other unlocks should do nothing.
invariant(num_row_locks(lt) == 1);
lt->remove_overlapping_locks_for_txnid(txnid_a, one, one);
invariant(num_row_locks(lt) == 0);
lt->remove_overlapping_locks_for_txnid(txnid_a, two, two);
invariant(num_row_locks(lt) == 0);
lt->remove_overlapping_locks_for_txnid(txnid_a, one, two);
invariant(num_row_locks(lt) == 0);
// try overlapping from the right
r = ACQUIRE_LOCK(txnid_a, one, three, nullptr);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_a, two, five, nullptr);
invariant(num_row_locks(lt) == 1);
verify_fn.expected_txnid = txnid_a;
range.create(one, five);
verify_fn.expected_range = &range;
locktree_iterate<verify_fn_obj>(lt, &verify_fn);
do_verify();
// now overlap from the left
r = ACQUIRE_LOCK(txnid_a, zero, four, nullptr);
invariant(num_row_locks(lt) == 1);
verify_fn.expected_txnid = txnid_a;
range.create(zero, five);
verify_fn.expected_range = &range;
locktree_iterate<verify_fn_obj>(lt, &verify_fn);
do_verify();
// now relock in a range that is already dominated
r = ACQUIRE_LOCK(txnid_a, five, five, nullptr);
invariant(num_row_locks(lt) == 1);
verify_fn.expected_txnid = txnid_a;
range.create(zero, five);
verify_fn.expected_range = &range;
locktree_iterate<verify_fn_obj>(lt, &verify_fn);
do_verify();
// release one of the locks we acquired. this should clean up the whole range.
lt->remove_overlapping_locks_for_txnid(txnid_a, zero, four);
invariant(num_row_locks(lt) == 0);
#undef ACQUIRE_LOCK
}
// remove the other txnid's lock now
lt->remove_overlapping_locks_for_txnid(the_other_txnid, hundred, hundred);
mgr.release_lt(lt);
mgr.destroy();
}
......
......@@ -36,60 +36,35 @@ void locktree_unit_test::test_simple_lock(void) {
// four txns, four points
r = ACQUIRE_LOCK(txnid_a, one, one, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_b, two, two, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 2);
r = ACQUIRE_LOCK(txnid_c, three, three, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 3);
r = ACQUIRE_LOCK(txnid_d, four, four, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 4);
lt->remove_overlapping_locks_for_txnid(txnid_a, one, one);
invariant(num_row_locks(lt) == 3);
lt->remove_overlapping_locks_for_txnid(txnid_b, two, two);
invariant(num_row_locks(lt) == 2);
lt->remove_overlapping_locks_for_txnid(txnid_c, three, three);
invariant(num_row_locks(lt) == 1);
lt->remove_overlapping_locks_for_txnid(txnid_d, four, four);
invariant(num_row_locks(lt) == 0);
locktree_test_release_lock(lt, txnid_a, one, one);
locktree_test_release_lock(lt, txnid_b, two, two);
locktree_test_release_lock(lt, txnid_c, three, three);
locktree_test_release_lock(lt, txnid_d, four, four);
invariant(no_row_locks(lt));
// two txns, two ranges
r = ACQUIRE_LOCK(txnid_c, one, two, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_b, three, four, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 2);
lt->remove_overlapping_locks_for_txnid(txnid_c, one, two);
invariant(num_row_locks(lt) == 1);
lt->remove_overlapping_locks_for_txnid(txnid_b, three, four);
invariant(num_row_locks(lt) == 0);
// one txn, one range, one point
r = ACQUIRE_LOCK(txnid_a, two, three, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_a, four, four, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 2);
lt->remove_overlapping_locks_for_txnid(txnid_a, two, three);
invariant(num_row_locks(lt) == 1);
lt->remove_overlapping_locks_for_txnid(txnid_a, four, four);
invariant(num_row_locks(lt) == 0);
locktree_test_release_lock(lt, txnid_c, one, two);
locktree_test_release_lock(lt, txnid_b, three, four);
invariant(no_row_locks(lt));
// two txns, one range, one point
r = ACQUIRE_LOCK(txnid_c, three, four, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 1);
r = ACQUIRE_LOCK(txnid_d, one, one, nullptr);
invariant(r == 0);
invariant(num_row_locks(lt) == 2);
lt->remove_overlapping_locks_for_txnid(txnid_c, three, four);
invariant(num_row_locks(lt) == 1);
lt->remove_overlapping_locks_for_txnid(txnid_d, one, one);
invariant(num_row_locks(lt) == 0);
locktree_test_release_lock(lt, txnid_c, three, four);
locktree_test_release_lock(lt, txnid_d, one, one);
invariant(no_row_locks(lt));
#undef ACQUIRE_LOCK
}
......@@ -124,7 +99,7 @@ void locktree_unit_test::test_simple_lock(void) {
for (int64_t i = 0; i < num_locks; i++) {
k.data = (void *) &keys[i];
lt->remove_overlapping_locks_for_txnid(txnid_a, &k, &k);
locktree_test_release_lock(lt, txnid_a, &k, &k);
}
toku_free(keys);
......
......@@ -58,20 +58,10 @@ void locktree_unit_test::test_single_txnid_optimization(void) {
lock_and_append_point_for_txnid_a(zero);
maybe_point_locks_for_txnid_b(2);
// txnid b does not take a lock on iteration 3
if (where != 3) {
invariant(num_row_locks(lt) == 4);
} else {
invariant(num_row_locks(lt) == 3);
}
lt->release_locks(txnid_a, &buffer);
// txnid b does not take a lock on iteration 3
if (where != 3) {
invariant(num_row_locks(lt) == 1);
struct verify_fn_obj {
TXNID expected_txnid;
keyrange *expected_range;
......
......@@ -56,20 +56,19 @@ class locktree_unit_test {
ltr.release();
}
static size_t num_row_locks(const locktree *lt) {
struct count_fn_obj {
size_t count;
bool fn(const keyrange &range, TXNID txnid) {
(void) range;
(void) txnid;
count++;
return true;
static bool no_row_locks(const locktree *lt) {
return lt->m_rangetree->is_empty() && lt->m_sto_buffer.is_empty();
}
} count_fn;
count_fn.count = 0;
locktree_iterate<count_fn_obj>(lt, &count_fn);
return count_fn.count;
static void locktree_test_release_lock(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key) {
range_buffer buffer;
buffer.create();
buffer.append(left_key, right_key);
lt->release_locks(txnid, &buffer);
buffer.destroy();
}
friend class lock_request_unit_test;
};
} /* namespace toku */
......
......@@ -266,31 +266,23 @@ treenode *treenode::remove_root_of_subtree() {
return this;
}
uint64_t treenode::recursive_remove(uint64_t *mem_released) {
// remove left and right subtrees
uint64_t nodes_removed = 0;
void treenode::recursive_remove(void) {
treenode *left = m_left_child.ptr;
if (left) {
nodes_removed += left->recursive_remove(mem_released);
left->recursive_remove();
}
m_left_child.set(nullptr);
treenode *right = m_right_child.ptr;
if (right) {
nodes_removed += right->recursive_remove(mem_released);
right->recursive_remove();
}
m_right_child.set(nullptr);
// note the amount of memory to-be released by this node
if (mem_released) {
*mem_released += m_range.get_memory_size();
}
// we do not take locks on the way down, so we know non-root nodes
// are unlocked here and the caller is required to pass a locked
// root, so this free is correct.
treenode::free(this);
return nodes_removed + 1;
}
treenode *treenode::remove(const keyrange &range) {
......
......@@ -86,10 +86,7 @@ class treenode {
// effect: removes this node and all of its children, recursively
// requires: every node at and below this node is unlocked
// returns: the number of nodes removed
// returns: *mem_released is the total amount of keyrange memory released.
// mem_released does not account for treenode insertion overhead.
uint64_t recursive_remove(uint64_t *mem_released);
void recursive_remove(void);
private:
......
......@@ -343,7 +343,9 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
# libtokudb.so.
# We link the test with util directly so that the test code itself can use
# some of those things (i.e. kibbutz in the threaded tests).
target_link_libraries(${base}.tdb util ${LIBTOKUDB} ${LIBTOKUPORTABILITY})
# We link the locktree so that threaded stress tests can call some
# functions (ie: lock escalation) directly.
target_link_libraries(${base}.tdb util locktree ${LIBTOKUDB} ${LIBTOKUPORTABILITY})
set_property(TARGET ${base}.tdb APPEND PROPERTY
COMPILE_DEFINITIONS "ENVDIR=\"dir.${bin}\";USE_TDB;IS_TDB=1;TOKUDB=1")
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "dir.${bin}")
......@@ -378,7 +380,7 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
function(add_custom_executable prefix binary source)
add_executable(${prefix}_${binary} ${source})
target_link_libraries(${prefix}_${binary} util ${LIBTOKUDB} ${LIBTOKUPORTABILITY})
target_link_libraries(${prefix}_${binary} util locktree ${LIBTOKUDB} ${LIBTOKUPORTABILITY})
set_target_properties(${prefix}_${binary} PROPERTIES
COMPILE_DEFINITIONS "ENVDIR=\"dir.${prefix}_${source}.tdb\";USE_TDB;IS_TDB=1;TOKUDB=1")
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "dir.${prefix}_${source}.tdb")
......
......@@ -40,9 +40,18 @@ stress_table(DB_ENV *env, DB **dbp, struct cli_args *cli_args) {
myargs[0].operation_extra = &soe[0];
myargs[0].operation = scan_op;
// make the lock escalation thread.
// it should sleep somewhere between 10 and 20
// seconds between each escalation.
struct lock_escalation_op_extra eoe;
eoe.min_sleep_time_micros = 10UL * (1000 * 1000);
eoe.max_sleep_time_micros = 20UL * (1000 * 1000);
myargs[1].operation_extra = &eoe;
myargs[1].operation = lock_escalation_op;
// make the threads that update the db
struct update_op_args uoe = get_update_op_args(cli_args, NULL);
for (int i = 1; i < 1 + cli_args->num_update_threads; ++i) {
for (int i = 2; i < 2 + cli_args->num_update_threads; ++i) {
myargs[i].operation_extra = &uoe;
myargs[i].operation = update_op;
myargs[i].do_prepare = false;
......@@ -50,7 +59,7 @@ stress_table(DB_ENV *env, DB **dbp, struct cli_args *cli_args) {
// doing sequential updates. the rest of the threads
// will take point write locks on update as usual.
// this ensures both ranges and points are stressed.
myargs[i].prelock_updates = i < 4 ? true : false;
myargs[i].prelock_updates = i < 5 ? true : false;
}
run_workers(myargs, num_threads, cli_args->num_seconds, false, cli_args);
......
......@@ -34,6 +34,8 @@
#include <util/rwlock.h>
#include <util/kibbutz.h>
#include <src/ydb-internal.h>
#include <ft/ybt.h>
using namespace toku;
......@@ -888,6 +890,27 @@ static int UU() verify_op(DB_TXN* UU(txn), ARG UU(arg), void* UU(operation_extra
return r;
}
struct lock_escalation_op_extra {
// sleep somewhere between these times before running escalation.
// this will add some chaos into the mix.
uint64_t min_sleep_time_micros;
uint64_t max_sleep_time_micros;
};
static int UU() lock_escalation_op(DB_TXN *UU(txn), ARG arg, void* operation_extra, void *UU(stats_extra)) {
struct lock_escalation_op_extra *CAST_FROM_VOIDP(extra, operation_extra);
if (extra->max_sleep_time_micros > 0) {
invariant(extra->max_sleep_time_micros >= extra->min_sleep_time_micros);
uint64_t extra_sleep_time = (extra->max_sleep_time_micros - extra->min_sleep_time_micros) + 1;
uint64_t sleep_time = extra->min_sleep_time_micros + (myrandom_r(arg->random_data) % extra_sleep_time);
usleep(sleep_time);
}
if (!arg->cli->nolocktree) {
toku_env_run_lock_escalation_for_test(arg->env);
}
return 0;
}
static int UU() scan_op(DB_TXN *txn, ARG UU(arg), void* operation_extra, void *UU(stats_extra)) {
struct scan_op_extra* CAST_FROM_VOIDP(extra, operation_extra);
for (int i = 0; run_test && i < arg->cli->num_DBs; i++) {
......
......@@ -95,6 +95,12 @@ struct __toku_db_env_internal {
int tmpdir_lockfd;
};
// test-only environment function for running lock escalation
static inline void toku_env_run_lock_escalation_for_test(DB_ENV *env) {
toku::locktree::manager *mgr = &env->i->ltm;
mgr->run_escalation_for_test();
}
// Common error handling macros and panic detection
#define MAYBE_RETURN_ERROR(cond, status) if (cond) return status;
#define HANDLE_PANICKED_ENV(env) if (toku_env_is_panicked(env)) { sleep(1); return EINVAL; }
......