#141 identify a big transaction and have it run lock escalation

c8993dec · Rich Prohaska · 96a2a6fc · c8993dec · c8993dec · c8993dec
Commit c8993dec authored Jan 13, 2014 by Rich Prohaska
25 changed files
--- a/locktree/lock_request.cc
+++ b/locktree/lock_request.cc
@@ -125,9 +125,7 @@ void lock_request::destroy(void) {
 }

 // set the lock request parameters. this API allows a lock request to be reused.
-void lock_request::set(locktree *lt, TXNID txnid,
-        const DBT *left_key, const DBT *right_key,
-        lock_request::type lock_type) {
+void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, lock_request::type lock_type, bool big_txn) {
    invariant(m_state != state::PENDING);
    m_lt = lt;
    m_txnid = txnid;
@@ -138,6 +136,7 @@ void lock_request::set(locktree *lt, TXNID txnid,
    m_type = lock_type;
    m_state = state::INITIALIZED;
    m_info = lt->get_lock_request_info();
+    m_big_txn = big_txn;
 }

 // get rid of any stored left and right key copies and
@@ -207,10 +206,10 @@ int lock_request::start(void) {
    txnid_set conflicts;
    conflicts.create();
    if (m_type == type::WRITE) {
-        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts);
+        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    } else {
        invariant(m_type == type::READ);
-        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts);
+        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
    }

    // if the lock is not granted, save it to the set of lock requests
@@ -310,9 +309,9 @@ int lock_request::retry(void) {

    invariant(m_state == state::PENDING);
    if (m_type == type::WRITE) {
-        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, nullptr);
+        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, nullptr, m_big_txn);
    } else {
-        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, nullptr);
+        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, nullptr, m_big_txn);
    }

    // if the acquisition succeeded then remove ourselves from the

--- a/locktree/lock_request.h
+++ b/locktree/lock_request.h
@@ -133,8 +133,7 @@ public:

    // effect: Resets the lock request parameters, allowing it to be reused.
    // requires: Lock request was already created at some point
-    void set(locktree *lt, TXNID txnid,
-            const DBT *left_key, const DBT *right_key, type lock_type);
+    void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, type lock_type, bool big_txn);

    // effect: Tries to acquire a lock described by this lock request.
    // returns: The return code of locktree::acquire_[write,read]_lock()
@@ -198,6 +197,8 @@ private:

    toku_cond_t m_wait_cond;

+    bool m_big_txn;
+
    // the lock request info state stored in the
    // locktree that this lock request is for.
    struct locktree::lt_lock_request_info *m_info;

--- a/locktree/locktree.cc
+++ b/locktree/locktree.cc
@@ -119,6 +119,7 @@ namespace toku {
 void locktree::create(manager::memory_tracker *mem_tracker, DICTIONARY_ID dict_id,
        DESCRIPTOR desc, ft_compare_func cmp) {
    m_mem_tracker = mem_tracker;
+    m_mgr = mem_tracker->get_manager();
    m_dict_id = dict_id;

    // the only reason m_cmp is malloc'd here is to prevent gdb from printing
@@ -410,8 +411,8 @@ int locktree::acquire_lock(bool is_write_request, TXNID txnid,
 }

 int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,
-        const DBT *left_key, const DBT *right_key, txnid_set *conflicts) {
-    int r = m_mem_tracker->check_current_lock_constraints();
+        const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn) {
+    int r = m_mgr->check_current_lock_constraints(big_txn);
    if (r == 0) {
        r = acquire_lock(is_write_request, txnid, left_key, right_key, conflicts);
    }
@@ -420,13 +421,13 @@ int locktree::try_acquire_lock(bool is_write_request, TXNID txnid,

 // the locktree silently upgrades read locks to write locks for simplicity
 int locktree::acquire_read_lock(TXNID txnid,
-        const DBT *left_key, const DBT *right_key, txnid_set *conflicts) {
-    return acquire_write_lock(txnid, left_key, right_key, conflicts);
+        const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn) {
+    return acquire_write_lock(txnid, left_key, right_key, conflicts, big_txn);
 }

 int locktree::acquire_write_lock(TXNID txnid,
-        const DBT *left_key, const DBT *right_key, txnid_set *conflicts) {
-    return try_acquire_lock(true, txnid, left_key, right_key, conflicts);
+        const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn) {
+    return try_acquire_lock(true, txnid, left_key, right_key, conflicts, big_txn);
 }

 void locktree::get_conflicts(bool is_write_request, TXNID txnid,

--- a/locktree/locktree.h
+++ b/locktree/locktree.h
@@ -105,6 +105,11 @@ PATENT RIGHTS GRANT:
 #include "wfg.h"
 #include "range_buffer.h"

+#define TOKU_LOCKTREE_ESCALATOR_LAMBDA 0
+#if TOKU_LOCKTREE_ESCALATOR_LAMBDA
+#include <functional>
+#endif
+
 enum {
    LTM_SIZE_CURRENT = 0,
    LTM_SIZE_LIMIT,
@@ -164,15 +169,13 @@ public:
    //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
    // note: Read locks cannot be shared between txnids, as one would expect.
    //       This is for simplicity since read locks are rare in MySQL.
-    int acquire_read_lock(TXNID txnid,
-            const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
+    int acquire_read_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);

    // effect: Attempts to grant a write lock for the range of keys between [left_key, right_key].
    // returns: If the lock cannot be granted, return DB_LOCK_NOTGRANTED, and populate the
    //          given conflicts set with the txnids that hold conflicting locks in the range.
    //          If the locktree cannot create more locks, return TOKUDB_OUT_OF_LOCKS.
-    int acquire_write_lock(TXNID txnid,
-            const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
+    int acquire_write_lock(TXNID txnid, const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);

    // effect: populate the conflicts set with the txnids that would preventing
    //         the given txnid from getting a lock on [left_key, right_key]
@@ -215,6 +218,25 @@ public:
    // since the lock_request object is opaque 
    struct lt_lock_request_info *get_lock_request_info(void);

+    class manager;
+
+    // the escalator coordinates escalation on a set of locktrees for a bunch of threads
+    class escalator {
+    public:
+        void create(void);
+        void destroy(void);
+#if TOKU_LOCKTREE_ESCALATOR_LAMBDA
+        void run(manager *mgr, std::function<void (void)> escalate_locktrees_fun);
+#else
+        void run(manager *mgr, void (*escalate_locktrees_fun)(void *extra), void *extra);
+#endif
+    private:
+        toku_mutex_t m_escalator_mutex;
+        toku_cond_t m_escalator_done;
+        bool m_escalator_running;
+    };
+    ENSURE_POD(escalator);
+
    // The locktree manager manages a set of locktrees,
    // one for each open dictionary. Locktrees are accessed through
    // the manager, and when they are no longer needed, they can
@@ -265,6 +287,7 @@ public:
        class memory_tracker {
        public:
            void set_manager(manager *mgr);
+            manager *get_manager(void);

            // effect: Determines if too many locks or too much memory is being used,
            //         Runs escalation on the manager if so.
@@ -273,6 +296,8 @@ public:
            //          enough resources for a new lock.
            int check_current_lock_constraints(void);

+            bool over_big_threshold(void);
+
            void note_mem_used(uint64_t mem_used);

            void note_mem_released(uint64_t mem_freed);
@@ -297,6 +322,7 @@ public:
        // rationale: to get better stress test coverage, we want a way to
        //            deterministicly trigger lock escalation.
        void run_escalation_for_test(void);
+        void run_escalation(void);

        void get_status(LTM_STATUS status);

@@ -311,6 +337,20 @@ public:
                                                     void *extra);
        int iterate_pending_lock_requests(lock_request_iterate_callback cb, void *extra);

+        int check_current_lock_constraints(bool big_txn);
+
+        // Escalate locktrees touched by a txn
+        void escalate_lock_trees_for_txn(TXNID, locktree *lt);
+
+        // Escalate all locktrees
+        void escalate_all_locktrees(void);
+
+        // Escalate a set of locktrees
+        void escalate_locktrees(locktree **locktrees, int num_locktrees);
+
+        // Add time t to the escalator's wait time statistics
+        void add_escalator_wait_time(uint64_t t);
+
    private:
        static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
        static const uint64_t DEFAULT_LOCK_WAIT_TIME = 0;
@@ -357,23 +397,14 @@ public:
        // requires: Manager's mutex is held
        void locktree_map_remove(locktree *lt);

-        // effect: Runs escalation on all locktrees.
-        void run_escalation(void);
-
        static int find_by_dict_id(locktree *const &lt, const DICTIONARY_ID &dict_id);

        void escalator_init(void);

        void escalator_destroy(void);

-        // effect: Add time t to the escalator's wait time statistics
-        void add_escalator_wait_time(uint64_t t);
-
-        // effect: escalate's the locks in each locktree
-        // requires: manager's mutex is held
-        void escalate_all_locktrees(void);
-
        // statistics about lock escalation.
+        toku_mutex_t m_escalation_mutex;
        uint64_t m_escalation_count;
        tokutime_t m_escalation_time;
        uint64_t m_escalation_latest_result;
@@ -382,25 +413,16 @@ public:
        uint64_t m_long_wait_escalation_count;
        uint64_t m_long_wait_escalation_time;

-        toku_mutex_t m_escalator_mutex;
-#define DO_ESCALATOR_THREAD 0
-#if DO_ESCALATOR_THREAD
-        toku_cond_t m_escalator_work;    // signal the escalator to run
-        toku_cond_t m_escalator_done;    // signal that escalation is done
-        bool m_escalator_killed;
-        toku_pthread_t m_escalator_id;
-#endif
+        escalator m_escalator;

        friend class manager_unit_test;
-
-    public:
-        void escalator_work(void);
    };
    ENSURE_POD(manager);

    manager::memory_tracker *get_mem_tracker(void) const;

 private:
+    manager *m_mgr;
    manager::memory_tracker *m_mem_tracker;

    DICTIONARY_ID m_dict_id;
@@ -418,7 +440,6 @@ private:

    uint32_t m_reference_count;

-    // the locktree stores locks in a concurrent, non-overlapping rangetree
    concurrent_tree *m_rangetree;

    void *m_userdata;
@@ -590,7 +611,7 @@ private:
            const DBT *left_key, const DBT *right_key, txnid_set *conflicts);

    int try_acquire_lock(bool is_write_request, TXNID txnid,
-            const DBT *left_key, const DBT *right_key, txnid_set *conflicts);
+            const DBT *left_key, const DBT *right_key, txnid_set *conflicts, bool big_txn);

    void escalate(manager::lt_escalate_cb after_escalate_callback, void *extra);


--- a/locktree/manager.cc
+++ b/locktree/manager.cc
--- a/locktree/tests/lock_request_get_set_keys.cc
+++ b/locktree/tests/lock_request_get_set_keys.cc
@@ -111,20 +111,20 @@ void lock_request_unit_test::test_get_set_keys(void) {

    // request should not copy dbts for neg/pos inf, so get_left
    // and get_right should return the same pointer given
-    request.set(null_lt, txnid_a, neg_inf, pos_inf, lock_request::type::WRITE);
+    request.set(null_lt, txnid_a, neg_inf, pos_inf, lock_request::type::WRITE, false);
    invariant(request.get_left_key() == neg_inf);
    invariant(request.get_right_key() == pos_inf);

    // request should make copies of non-infinity-valued keys.
-    request.set(null_lt, txnid_a, neg_inf, one, lock_request::type::WRITE);
+    request.set(null_lt, txnid_a, neg_inf, one, lock_request::type::WRITE, false);
    invariant(request.get_left_key() == neg_inf);
    invariant(request.get_right_key() == one);

-    request.set(null_lt, txnid_a, two, pos_inf, lock_request::type::WRITE);
+    request.set(null_lt, txnid_a, two, pos_inf, lock_request::type::WRITE, false);
    invariant(request.get_left_key() == two);
    invariant(request.get_right_key() == pos_inf);

-    request.set(null_lt, txnid_a, one, two, lock_request::type::WRITE);
+    request.set(null_lt, txnid_a, one, two, lock_request::type::WRITE, false);
    invariant(request.get_left_key() == one);
    invariant(request.get_right_key() == two);


--- a/locktree/tests/lock_request_start_deadlock.cc
+++ b/locktree/tests/lock_request_start_deadlock.cc
@@ -119,30 +119,30 @@ void lock_request_unit_test::test_start_deadlock(void) {
    const DBT *two = get_dbt(2);

    // start and succeed 1,1 for A and 2,2 for B.
-    request_a.set(lt, txnid_a, one, one, lock_request::type::WRITE);
+    request_a.set(lt, txnid_a, one, one, lock_request::type::WRITE, false);
    r = request_a.start();
    invariant_zero(r);
-    request_b.set(lt, txnid_b, two, two, lock_request::type::WRITE);
+    request_b.set(lt, txnid_b, two, two, lock_request::type::WRITE, false);
    r = request_b.start();
    invariant_zero(r);

    // txnid A should not be granted a lock on 2,2, so it goes pending.
-    request_a.set(lt, txnid_a, two, two, lock_request::type::WRITE);
+    request_a.set(lt, txnid_a, two, two, lock_request::type::WRITE, false);
    r = request_a.start();
    invariant(r == DB_LOCK_NOTGRANTED);

    // if txnid B wants a lock on 1,1 it should deadlock with A
-    request_b.set(lt, txnid_b, one, one, lock_request::type::WRITE);
+    request_b.set(lt, txnid_b, one, one, lock_request::type::WRITE, false);
    r = request_b.start();
    invariant(r == DB_LOCK_DEADLOCK);

    // txnid C should not deadlock on either of these - it should just time out.
-    request_c.set(lt, txnid_c, one, one, lock_request::type::WRITE);
+    request_c.set(lt, txnid_c, one, one, lock_request::type::WRITE, false);
    r = request_c.start();
    invariant(r == DB_LOCK_NOTGRANTED);
    r = request_c.wait(lock_wait_time);
    invariant(r == DB_LOCK_NOTGRANTED);
-    request_c.set(lt, txnid_c, two, two, lock_request::type::WRITE);
+    request_c.set(lt, txnid_c, two, two, lock_request::type::WRITE, false);
    r = request_c.start();
    invariant(r == DB_LOCK_NOTGRANTED);
    r = request_c.wait(lock_wait_time);

--- a/locktree/tests/lock_request_start_pending.cc
+++ b/locktree/tests/lock_request_start_pending.cc
@@ -113,7 +113,7 @@ void lock_request_unit_test::test_start_pending(void) {
    const DBT *two = get_dbt(2);

    // take a range lock using txnid b
-    r = lt->acquire_write_lock(txnid_b, zero, two, nullptr);
+    r = lt->acquire_write_lock(txnid_b, zero, two, nullptr, false);
    invariant_zero(r);

    locktree::lt_lock_request_info *info = lt->get_lock_request_info();
@@ -121,7 +121,7 @@ void lock_request_unit_test::test_start_pending(void) {
    // start a lock request for 1,1
    // it should fail. the request should be stored and in the pending state.
    request.create();
-    request.set(lt, txnid_a, one, one, lock_request::type::WRITE);
+    request.set(lt, txnid_a, one, one, lock_request::type::WRITE, false);
    r = request.start();
    invariant(r == DB_LOCK_NOTGRANTED);
    invariant(info->pending_lock_requests.size() == 1);

--- a/locktree/tests/lock_request_wait_time_callback.cc
+++ b/locktree/tests/lock_request_wait_time_callback.cc
@@ -123,12 +123,12 @@ void lock_request_unit_test::test_wait_time_callback(void) {
    const DBT *two = get_dbt(2);

    // a locks 'one'
-    request_a.set(lt, txnid_a, one, one, lock_request::type::WRITE);
+    request_a.set(lt, txnid_a, one, one, lock_request::type::WRITE, false);
    r = request_a.start();
    assert_zero(r);

    // b tries to lock 'one'
-    request_b.set(lt, txnid_b, one, two, lock_request::type::WRITE);
+    request_b.set(lt, txnid_b, one, two, lock_request::type::WRITE, false);
    r = request_b.start();
    assert(r == DB_LOCK_NOTGRANTED);
    assert(my_calls == 0);

--- a/locktree/tests/locktree_conflicts.cc
+++ b/locktree/tests/locktree_conflicts.cc
@@ -125,8 +125,8 @@ void locktree_unit_test::test_conflicts(void) {
        // test_run == 0 means test with read lock
        // test_run == 1 means test with write lock
 #define ACQUIRE_LOCK(txn, left, right, conflicts) \
-        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts) \
-                      : lt->acquire_write_lock(txn, left, right, conflicts)
+        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts, false) \
+            : lt->acquire_write_lock(txn, left, right, conflicts, false)

        // acquire some locks for txnid_a
        r = ACQUIRE_LOCK(txnid_a, one, one, nullptr);
@@ -142,8 +142,8 @@ void locktree_unit_test::test_conflicts(void) {
        // if test_run == 0, then read locks exist. only test write locks.
 #define ACQUIRE_LOCK(txn, left, right, conflicts) \
        sub_test_run == 0 && test_run == 1 ? \
-            lt->acquire_read_lock(txn, left, right, conflicts) \
-          : lt->acquire_write_lock(txn, left, right, conflicts)
+            lt->acquire_read_lock(txn, left, right, conflicts, false) \
+          : lt->acquire_write_lock(txn, left, right, conflicts, false)
            // try to get point write locks for txnid_b, should fail
            r = ACQUIRE_LOCK(txnid_b, one, one, nullptr);
            invariant(r == DB_LOCK_NOTGRANTED);

--- a/locktree/tests/locktree_escalation_1big7lt_1small.cc
+++ b/locktree/tests/locktree_escalation_1big7lt_1small.cc
--- a/locktree/tests/locktree_escalation_2big_1lt.cc
+++ b/locktree/tests/locktree_escalation_2big_1lt.cc
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation, and provided that the
+  following conditions are met:
+
+      * Redistributions of source code must retain this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below).
+
+      * Redistributions in binary form must reproduce this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below) in the documentation and/or other materials
+        provided with the distribution.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+
+COPYRIGHT NOTICE:
+
+  TokuDB, Tokutek Fractal Tree Indexing Library.
+  Copyright (C) 2007-2013 Tokutek, Inc.
+
+DISCLAIMER:
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+UNIVERSITY PATENT NOTICE:
+
+  The technology is licensed by the Massachusetts Institute of
+  Technology, Rutgers State University of New Jersey, and the Research
+  Foundation of State University of New York at Stony Brook under
+  United States of America Serial No. 11/760379 and to the patents
+  and/or patent applications resulting from it.
+
+PATENT MARKING NOTICE:
+
+  This software is covered by US Patent No. 8,185,551.
+  This software is covered by US Patent No. 8,489,638.
+
+PATENT RIGHTS GRANT:
+
+  "THIS IMPLEMENTATION" means the copyrightable works distributed by
+  Tokutek as part of the Fractal Tree project.
+  "PATENT CLAIMS" means the claims of patents that are owned or
+  licensable by Tokutek, both currently or in the future; and that in
+  the absence of this license would be infringed by THIS
+  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
+
+  "PATENT CHALLENGE" shall mean a challenge to the validity,
+  patentability, enforceability and/or non-infringement of any of the
+  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
+
+  Tokutek hereby grants to you, for the term and geographical scope of
+  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
+  irrevocable (except as stated in this section) patent license to
+  make, have made, use, offer to sell, sell, import, transfer, and
+  otherwise run, modify, and propagate the contents of THIS
+  IMPLEMENTATION, where such license applies only to the PATENT
+  CLAIMS.  This grant does not include claims that would be infringed
+  only as a consequence of further modifications of THIS
+  IMPLEMENTATION.  If you or your agent or licensee institute or order
+  or agree to the institution of patent litigation against any entity
+  (including a cross-claim or counterclaim in a lawsuit) alleging that
+  THIS IMPLEMENTATION constitutes direct or contributory patent
+  infringement, or inducement of patent infringement, then any rights
+  granted to you under this License shall terminate as of the date
+  such litigation is filed.  If you or your agent or exclusive
+  licensee institute or order or agree to the institution of a PATENT
+  CHALLENGE, then Tokutek may terminate any rights granted to you
+  under this License.
+*/
+
+#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
+#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+
+#include <stdio.h>
+#include "locktree.h"
+#include "test.h"
+
+// One client locks 1,2,3...
+// The other client locks -1,-2,-3...
+// Eventually lock escalation runs.
+
+using namespace toku;
+
+static int verbose = 0;
+static int killed = 0;
+
+static void locktree_release_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k) {
+    range_buffer buffer;
+    buffer.create();
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    buffer.append(&left, &right);
+    lt->release_locks(txn_id, &buffer);
+    buffer.destroy();
+}
+
+// grab a write range lock on int64 keys bounded by left_k and right_k
+static int locktree_write_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k, bool big_txn) {
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    return lt->acquire_write_lock(txn_id, &left, &right, nullptr, big_txn);
+}
+
+static void run_big_txn(locktree::manager *mgr UU(), locktree *lt, TXNID txn_id, int64_t start_i) {
+    fprintf(stderr, "%u run_big_txn %p %" PRIu64 " %" PRId64 "\n", toku_os_gettid(), lt, txn_id, start_i);
+    int64_t last_i = -1;
+    for (int64_t i = start_i; !killed; i++) {
+        if (0)
+            printf("%u %" PRId64 "\n", toku_os_gettid(), i);
+        uint64_t t_start = toku_current_time_microsec();
+        int r = locktree_write_lock(lt, txn_id, i, i, true);
+        if (r != 0)
+            break;
+        last_i = i;
+        uint64_t t_end = toku_current_time_microsec();
+        uint64_t t_duration = t_end - t_start;
+        if (t_duration > 100000) {
+            printf("%u %s %" PRId64 " %" PRIu64 "\n", toku_os_gettid(), __FUNCTION__, i, t_duration);
+        }
+        toku_pthread_yield();
+    }
+    if (last_i != -1)
+        locktree_release_lock(lt, txn_id, start_i, last_i); // release the range start_i .. last_i
+}
+
+struct arg {
+    locktree::manager *mgr;
+    locktree *lt;
+    TXNID txn_id;
+    int64_t start_i;
+};
+
+static void *big_f(void *_arg) {
+    struct arg *arg = (struct arg *) _arg;
+    run_big_txn(arg->mgr, arg->lt, arg->txn_id, arg->start_i);
+    return arg;
+}
+
+static void e_callback(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra) {
+    if (verbose)
+        printf("%u %s %" PRIu64 " %p %d %p\n", toku_os_gettid(), __FUNCTION__, txnid, lt, buffer.get_num_ranges(), extra);
+}
+
+static uint64_t get_escalation_count(locktree::manager &mgr) {
+    LTM_STATUS_S ltm_status;
+    mgr.get_status(&ltm_status);
+
+    TOKU_ENGINE_STATUS_ROW key_status = NULL;
+    // lookup keyname in status
+    for (int i = 0; ; i++) {
+        TOKU_ENGINE_STATUS_ROW status = &ltm_status.status[i];
+        if (status->keyname == NULL)
+            break;
+        if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+            key_status = status;
+            break;
+        }
+    }
+    assert(key_status);
+    return key_status->value.num;
+}
+
+int main(int argc, const char *argv[]) {
+    const int n_big = 2;
+    int n_lt = 1;
+    uint64_t stalls = 1;
+    uint64_t max_lock_memory = 1000000;
+    
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
+            verbose++;
+            continue;
+        }
+        if (strcmp(argv[i], "--stalls") == 0 && i+1 < argc) {
+            stalls = atoll(argv[++i]);
+            continue;
+        }
+        if (strcmp(argv[i], "--n_lt") == 0 && i+1 < argc) {
+            n_lt = atoi(argv[++i]);
+            continue;
+        }
+        if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) {
+            max_lock_memory = atoll(argv[++i]);
+            continue;
+        }
+    }
+
+    int r;
+
+    // create a manager
+    locktree::manager mgr;
+    mgr.create(nullptr, nullptr, e_callback, nullptr);
+    mgr.set_max_lock_memory(max_lock_memory);
+
+    // create lock trees
+    DESCRIPTOR desc[n_lt];
+    DICTIONARY_ID dict_id[n_lt];
+    locktree *lt[n_big];
+    for (int i = 0; i < n_lt; i++) {
+        desc[i] = nullptr;
+        dict_id[i] = { (uint64_t)i };
+        lt[i] = mgr.get_lt(dict_id[i], desc[i], compare_dbts, nullptr);
+        assert(lt[i]);
+    }
+
+    // create the worker threads
+    struct arg big_arg[n_big];
+    pthread_t big_ids[n_big];
+    for (int i = 0; i < n_big; i++) {
+        big_arg[i] = { &mgr, lt[i % n_lt], (TXNID)(1000+i), i == 0 ? 1 : -1000000000 };
+        r = toku_pthread_create(&big_ids[i], nullptr, big_f, &big_arg[i]);
+        assert(r == 0);
+    }
+
+    // wait for some escalations to occur
+    while (get_escalation_count(mgr) < stalls) {
+        sleep(1);
+    }
+    killed = 1;
+
+    // cleanup
+    for (int i = 0; i < n_big; i++) {
+        void *ret;
+        r = toku_pthread_join(big_ids[i], &ret);
+        assert(r == 0);
+    }
+    for (int i = 0; i < n_lt ; i++) {
+        mgr.release_lt(lt[i]);
+    }
+    mgr.destroy();
+
+    return 0;
+}
--- a/locktree/tests/locktree_escalation_2big_2lt.cc
+++ b/locktree/tests/locktree_escalation_2big_2lt.cc
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation, and provided that the
+  following conditions are met:
+
+      * Redistributions of source code must retain this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below).
+
+      * Redistributions in binary form must reproduce this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below) in the documentation and/or other materials
+        provided with the distribution.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+
+COPYRIGHT NOTICE:
+
+  TokuDB, Tokutek Fractal Tree Indexing Library.
+  Copyright (C) 2007-2013 Tokutek, Inc.
+
+DISCLAIMER:
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+UNIVERSITY PATENT NOTICE:
+
+  The technology is licensed by the Massachusetts Institute of
+  Technology, Rutgers State University of New Jersey, and the Research
+  Foundation of State University of New York at Stony Brook under
+  United States of America Serial No. 11/760379 and to the patents
+  and/or patent applications resulting from it.
+
+PATENT MARKING NOTICE:
+
+  This software is covered by US Patent No. 8,185,551.
+  This software is covered by US Patent No. 8,489,638.
+
+PATENT RIGHTS GRANT:
+
+  "THIS IMPLEMENTATION" means the copyrightable works distributed by
+  Tokutek as part of the Fractal Tree project.
+  "PATENT CLAIMS" means the claims of patents that are owned or
+  licensable by Tokutek, both currently or in the future; and that in
+  the absence of this license would be infringed by THIS
+  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
+
+  "PATENT CHALLENGE" shall mean a challenge to the validity,
+  patentability, enforceability and/or non-infringement of any of the
+  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
+
+  Tokutek hereby grants to you, for the term and geographical scope of
+  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
+  irrevocable (except as stated in this section) patent license to
+  make, have made, use, offer to sell, sell, import, transfer, and
+  otherwise run, modify, and propagate the contents of THIS
+  IMPLEMENTATION, where such license applies only to the PATENT
+  CLAIMS.  This grant does not include claims that would be infringed
+  only as a consequence of further modifications of THIS
+  IMPLEMENTATION.  If you or your agent or licensee institute or order
+  or agree to the institution of patent litigation against any entity
+  (including a cross-claim or counterclaim in a lawsuit) alleging that
+  THIS IMPLEMENTATION constitutes direct or contributory patent
+  infringement, or inducement of patent infringement, then any rights
+  granted to you under this License shall terminate as of the date
+  such litigation is filed.  If you or your agent or exclusive
+  licensee institute or order or agree to the institution of a PATENT
+  CHALLENGE, then Tokutek may terminate any rights granted to you
+  under this License.
+*/
+
+#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
+#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+
+#include <stdio.h>
+#include "locktree.h"
+#include "test.h"
+
+// One client locks 1,2,3...
+// The other client locks -1,-2,-3...
+// Eventually lock escalation runs.
+
+using namespace toku;
+
+static int verbose = 0;
+static int killed = 0;
+
+static void locktree_release_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k) {
+    range_buffer buffer;
+    buffer.create();
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    buffer.append(&left, &right);
+    lt->release_locks(txn_id, &buffer);
+    buffer.destroy();
+}
+
+// grab a write range lock on int64 keys bounded by left_k and right_k
+static int locktree_write_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k, bool big_txn) {
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    return lt->acquire_write_lock(txn_id, &left, &right, nullptr, big_txn);
+}
+
+static void run_big_txn(locktree::manager *mgr UU(), locktree *lt, TXNID txn_id, int64_t start_i) {
+    fprintf(stderr, "%u run_big_txn %p %" PRIu64 " %" PRId64 "\n", toku_os_gettid(), lt, txn_id, start_i);
+    int64_t last_i = -1;
+    for (int64_t i = start_i; !killed; i++) {
+        if (0)
+            printf("%u %" PRId64 "\n", toku_os_gettid(), i);
+        uint64_t t_start = toku_current_time_microsec();
+        int r = locktree_write_lock(lt, txn_id, i, i, true);
+        if (r != 0)
+            break;
+        last_i = i;
+        uint64_t t_end = toku_current_time_microsec();
+        uint64_t t_duration = t_end - t_start;
+        if (t_duration > 100000) {
+            printf("%u %s %" PRId64 " %" PRIu64 "\n", toku_os_gettid(), __FUNCTION__, i, t_duration);
+        }
+        toku_pthread_yield();
+    }
+    if (last_i != -1)
+        locktree_release_lock(lt, txn_id, start_i, last_i); // release the range start_i .. last_i
+}
+
+struct arg {
+    locktree::manager *mgr;
+    locktree *lt;
+    TXNID txn_id;
+    int64_t start_i;
+};
+
+static void *big_f(void *_arg) {
+    struct arg *arg = (struct arg *) _arg;
+    run_big_txn(arg->mgr, arg->lt, arg->txn_id, arg->start_i);
+    return arg;
+}
+
+static void e_callback(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra) {
+    if (verbose)
+        printf("%u %s %" PRIu64 " %p %d %p\n", toku_os_gettid(), __FUNCTION__, txnid, lt, buffer.get_num_ranges(), extra);
+}
+
+static uint64_t get_escalation_count(locktree::manager &mgr) {
+    LTM_STATUS_S ltm_status;
+    mgr.get_status(&ltm_status);
+
+    TOKU_ENGINE_STATUS_ROW key_status = NULL;
+    // lookup keyname in status
+    for (int i = 0; ; i++) {
+        TOKU_ENGINE_STATUS_ROW status = &ltm_status.status[i];
+        if (status->keyname == NULL)
+            break;
+        if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+            key_status = status;
+            break;
+        }
+    }
+    assert(key_status);
+    return key_status->value.num;
+}
+
+int main(int argc, const char *argv[]) {
+    const int n_big = 2;
+    int n_lt = 2;
+    uint64_t stalls = 1;
+    uint64_t max_lock_memory = 1000000;
+    
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
+            verbose++;
+            continue;
+        }
+        if (strcmp(argv[i], "--stalls") == 0 && i+1 < argc) {
+            stalls = atoll(argv[++i]);
+            continue;
+        }
+        if (strcmp(argv[i], "--n_lt") == 0 && i+1 < argc) {
+            n_lt = atoi(argv[++i]);
+            continue;
+        }
+        if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) {
+            max_lock_memory = atoll(argv[++i]);
+            continue;
+        }
+    }
+
+    int r;
+
+    // create a manager
+    locktree::manager mgr;
+    mgr.create(nullptr, nullptr, e_callback, nullptr);
+    mgr.set_max_lock_memory(max_lock_memory);
+
+    // create lock trees
+    DESCRIPTOR desc[n_lt];
+    DICTIONARY_ID dict_id[n_lt];
+    locktree *lt[n_big];
+    for (int i = 0; i < n_lt; i++) {
+        desc[i] = nullptr;
+        dict_id[i] = { (uint64_t)i };
+        lt[i] = mgr.get_lt(dict_id[i], desc[i], compare_dbts, nullptr);
+        assert(lt[i]);
+    }
+
+    // create the worker threads
+    struct arg big_arg[n_big];
+    pthread_t big_ids[n_big];
+    for (int i = 0; i < n_big; i++) {
+        big_arg[i] = { &mgr, lt[i % n_lt], (TXNID)(1000+i), i == 0 ? 1 : -1000000000 };
+        r = toku_pthread_create(&big_ids[i], nullptr, big_f, &big_arg[i]);
+        assert(r == 0);
+    }
+
+    // wait for some escalations to occur
+    while (get_escalation_count(mgr) < stalls) {
+        sleep(1);
+    }
+    killed = 1;
+
+    // cleanup
+    for (int i = 0; i < n_big; i++) {
+        void *ret;
+        r = toku_pthread_join(big_ids[i], &ret);
+        assert(r == 0);
+    }
+    for (int i = 0; i < n_lt ; i++) {
+        mgr.release_lt(lt[i]);
+    }
+    mgr.destroy();
+
+    return 0;
+}
--- a/locktree/tests/locktree_escalation_impossible.cc
+++ b/locktree/tests/locktree_escalation_impossible.cc
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*
+COPYING CONDITIONS NOTICE:
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation, and provided that the
+  following conditions are met:
+
+      * Redistributions of source code must retain this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below).
+
+      * Redistributions in binary form must reproduce this COPYING
+        CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
+        DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
+        PATENT MARKING NOTICE (below), and the PATENT RIGHTS
+        GRANT (below) in the documentation and/or other materials
+        provided with the distribution.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA.
+
+COPYRIGHT NOTICE:
+
+  TokuDB, Tokutek Fractal Tree Indexing Library.
+  Copyright (C) 2007-2013 Tokutek, Inc.
+
+DISCLAIMER:
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+UNIVERSITY PATENT NOTICE:
+
+  The technology is licensed by the Massachusetts Institute of
+  Technology, Rutgers State University of New Jersey, and the Research
+  Foundation of State University of New York at Stony Brook under
+  United States of America Serial No. 11/760379 and to the patents
+  and/or patent applications resulting from it.
+
+PATENT MARKING NOTICE:
+
+  This software is covered by US Patent No. 8,185,551.
+  This software is covered by US Patent No. 8,489,638.
+
+PATENT RIGHTS GRANT:
+
+  "THIS IMPLEMENTATION" means the copyrightable works distributed by
+  Tokutek as part of the Fractal Tree project.
+
+  "PATENT CLAIMS" means the claims of patents that are owned or
+  licensable by Tokutek, both currently or in the future; and that in
+  the absence of this license would be infringed by THIS
+  IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
+
+  "PATENT CHALLENGE" shall mean a challenge to the validity,
+  patentability, enforceability and/or non-infringement of any of the
+  PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
+
+  Tokutek hereby grants to you, for the term and geographical scope of
+  the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
+  irrevocable (except as stated in this section) patent license to
+  make, have made, use, offer to sell, sell, import, transfer, and
+  otherwise run, modify, and propagate the contents of THIS
+  IMPLEMENTATION, where such license applies only to the PATENT
+  CLAIMS.  This grant does not include claims that would be infringed
+  only as a consequence of further modifications of THIS
+  IMPLEMENTATION.  If you or your agent or licensee institute or order
+  or agree to the institution of patent litigation against any entity
+  (including a cross-claim or counterclaim in a lawsuit) alleging that
+  THIS IMPLEMENTATION constitutes direct or contributory patent
+  infringement, or inducement of patent infringement, then any rights
+  granted to you under this License shall terminate as of the date
+  such litigation is filed.  If you or your agent or exclusive
+  licensee institute or order or agree to the institution of a PATENT
+  CHALLENGE, then Tokutek may terminate any rights granted to you
+  under this License.
+*/
+
+#ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
+#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+
+#include <stdio.h>
+#include "locktree.h"
+#include "test.h"
+
+// Two big txn's grab alternating locks in a single lock tree.
+// Eventually lock escalation runs.
+// Since the locks can not be consolidated, the out of locks error should be returned.
+
+using namespace toku;
+
+static int verbose = 0;
+
+static inline void locktree_release_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k) {
+    range_buffer buffer;
+    buffer.create();
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    buffer.append(&left, &right);
+    lt->release_locks(txn_id, &buffer);
+    buffer.destroy();
+}
+
+// grab a write range lock on int64 keys bounded by left_k and right_k
+static int locktree_write_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k, bool big_txn) {
+    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
+    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
+    return lt->acquire_write_lock(txn_id, &left, &right, nullptr, big_txn);
+}
+
+static void e_callback(TXNID txnid, const locktree *lt, const range_buffer &buffer, void *extra) {
+    if (verbose)
+        printf("%u %s %" PRIu64 " %p %d %p\n", toku_os_gettid(), __FUNCTION__, txnid, lt, buffer.get_num_ranges(), extra);
+}
+
+static uint64_t get_escalation_count(locktree::manager &mgr) {
+    LTM_STATUS_S ltm_status;
+    mgr.get_status(&ltm_status);
+
+    TOKU_ENGINE_STATUS_ROW key_status = NULL;
+    // lookup keyname in status
+    for (int i = 0; ; i++) {
+        TOKU_ENGINE_STATUS_ROW status = &ltm_status.status[i];
+        if (status->keyname == NULL)
+            break;
+        if (strcmp(status->keyname, "LTM_ESCALATION_COUNT") == 0) {
+            key_status = status;
+            break;
+        }
+    }
+    assert(key_status);
+    return key_status->value.num;
+}
+
+int main(int argc, const char *argv[]) {
+    uint64_t max_lock_memory = 1000000;
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
+            verbose++;
+            continue;
+        }
+        if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) {
+            max_lock_memory = atoll(argv[++i]);
+            continue;
+        }
+    }
+
+    int r;
+
+    // create a manager
+    locktree::manager mgr;
+    mgr.create(nullptr, nullptr, e_callback, nullptr);
+    mgr.set_max_lock_memory(max_lock_memory);
+
+    const TXNID txn_a = 10;
+    const TXNID txn_b = 100;
+
+    // create lock trees
+    DESCRIPTOR desc = nullptr;
+    DICTIONARY_ID dict_id = { 1 };
+    locktree *lt = mgr.get_lt(dict_id, desc, compare_dbts, nullptr);
+
+    int64_t last_i = -1;
+    for (int64_t i = 0; ; i++) {
+        if (verbose)
+            printf("%" PRId64 "\n", i);
+        int64_t k = 2*i;
+        r = locktree_write_lock(lt, txn_a, k, k, true);
+        if (r != 0) {
+            assert(r == TOKUDB_OUT_OF_LOCKS);
+            break;
+        }
+        last_i = i;
+        r = locktree_write_lock(lt, txn_b, k+1, k+1, true);
+        if (r != 0) {
+            assert(r == TOKUDB_OUT_OF_LOCKS);
+            break;
+        }
+    }
+
+    // wait for an escalation to occur
+    assert(get_escalation_count(mgr) > 0);
+
+    if (last_i != -1) {
+        locktree_release_lock(lt, txn_a, 0, 2*last_i);
+        locktree_release_lock(lt, txn_b, 0, 2*last_i+1);
+    }
+
+    mgr.release_lt(lt);
+    mgr.destroy();
+
+    return 0;
+}
--- a/locktree/tests/locktree_escalation_stalls.cc
+++ b/locktree/tests/locktree_escalation_stalls.cc
@@ -89,6 +89,14 @@ PATENT RIGHTS GRANT:
 #ident "Copyright (c) 2007-2013 Tokutek Inc.  All rights reserved."
 #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

+// This test verifies that small txn's do not get stalled for a long time by lock escalation.
+// Two lock trees are used by the test: a big lock tree and a small lock tree.
+// One big txn grabs lots of write locks on the big lock tree. 
+// Several small txn's grab a single write lock on the small lock tree.
+// None of the locks conflict.
+// Eventually, the locks for the big txn consume all of the lock tree memory, so lock escalation runs.
+// The test measures the lock acquisition time and makes sure that the small txn's are not blocked for 
+
 // locktree_escalation_stalls -v --stalls 10
 // verify that only big txn's get tagged with > 1 second stalls

@@ -112,17 +120,17 @@ static void locktree_release_lock(locktree *lt, TXNID txn_id, int64_t left_k, in
 }

 // grab a write range lock on int64 keys bounded by left_k and right_k
-static int locktree_write_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k) {
+static int locktree_write_lock(locktree *lt, TXNID txn_id, int64_t left_k, int64_t right_k, bool big_txn) {
    DBT left; toku_fill_dbt(&left, &left_k, sizeof left_k);
    DBT right; toku_fill_dbt(&right, &right_k, sizeof right_k);
-    return lt->acquire_write_lock(txn_id, &left, &right, nullptr);
+    return lt->acquire_write_lock(txn_id, &left, &right, nullptr, big_txn);
 }

 static void run_big_txn(locktree::manager *mgr UU(), locktree *lt, TXNID txn_id) {
    int64_t last_i = -1;
    for (int64_t i = 0; !killed; i++) {
        uint64_t t_start = toku_current_time_microsec();
-        int r = locktree_write_lock(lt, txn_id, i, i);
+        int r = locktree_write_lock(lt, txn_id, i, i, true);
        assert(r == 0);
        last_i = i;
        uint64_t t_end = toku_current_time_microsec();
@@ -139,7 +147,7 @@ static void run_big_txn(locktree::manager *mgr UU(), locktree *lt, TXNID txn_id)
 static void run_small_txn(locktree::manager *mgr UU(), locktree *lt, TXNID txn_id, int64_t k) {
    for (int64_t i = 0; !killed; i++) {
        uint64_t t_start = toku_current_time_microsec();
-        int r = locktree_write_lock(lt, txn_id, k, k);
+        int r = locktree_write_lock(lt, txn_id, k, k, false);
        assert(r == 0);
        uint64_t t_end = toku_current_time_microsec();
        uint64_t t_duration = t_end - t_start;
@@ -196,6 +204,7 @@ static uint64_t get_escalation_count(locktree::manager &mgr) {

 int main(int argc, const char *argv[]) {
    uint64_t stalls = 0;
+    uint64_t max_lock_memory = 1000000000;
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
            verbose++;
@@ -205,6 +214,10 @@ int main(int argc, const char *argv[]) {
            stalls = atoll(argv[++i]);
            continue;
        }
+        if (strcmp(argv[i], "--max_lock_memory") == 0 && i+1 < argc) {
+            max_lock_memory = atoll(argv[++i]);
+            continue;
+        }
    }

    int r;
@@ -212,7 +225,7 @@ int main(int argc, const char *argv[]) {
    // create a manager
    locktree::manager mgr;
    mgr.create(nullptr, nullptr, e_callback, nullptr);
-    mgr.set_max_lock_memory(1000000000);
+    mgr.set_max_lock_memory(max_lock_memory);

    // create lock trees
    DESCRIPTOR desc_0 = nullptr;

--- a/locktree/tests/locktree_infinity.cc
+++ b/locktree/tests/locktree_infinity.cc
@@ -112,54 +112,54 @@ void locktree_unit_test::test_infinity(void) {
    const DBT max_int = max_dbt();

    // txn A will lock -inf, 5.
-    r = lt->acquire_write_lock(txnid_a, toku_dbt_negative_infinity(), five, nullptr);
+    r = lt->acquire_write_lock(txnid_a, toku_dbt_negative_infinity(), five, nullptr, false);
    invariant(r == 0);
    // txn B will fail to get any lock <= 5, even min_int
-    r = lt->acquire_write_lock(txnid_b, five, five, nullptr);
+    r = lt->acquire_write_lock(txnid_b, five, five, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, zero, one, nullptr);
+    r = lt->acquire_write_lock(txnid_b, zero, one, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &min_int, &min_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, &min_int, &min_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), &min_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), &min_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);

    lt->remove_overlapping_locks_for_txnid(txnid_a, toku_dbt_negative_infinity(), five);

    // txn A will lock 1, +inf
-    r = lt->acquire_write_lock(txnid_a, one, toku_dbt_positive_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_a, one, toku_dbt_positive_infinity(), nullptr, false);
    invariant(r == 0);
    // txn B will fail to get any lock >= 1, even max_int
-    r = lt->acquire_write_lock(txnid_b, one, one, nullptr);
+    r = lt->acquire_write_lock(txnid_b, one, one, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, two, five, nullptr);
+    r = lt->acquire_write_lock(txnid_b, two, five, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &max_int, &max_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, &max_int, &max_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &max_int, toku_dbt_positive_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_b, &max_int, toku_dbt_positive_infinity(), nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);

    lt->remove_overlapping_locks_for_txnid(txnid_a, toku_dbt_negative_infinity(), five);

    // txn A will lock -inf, +inf
-    r = lt->acquire_write_lock(txnid_a, toku_dbt_negative_infinity(), toku_dbt_positive_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_a, toku_dbt_negative_infinity(), toku_dbt_positive_infinity(), nullptr, false);
    invariant(r == 0);
    // txn B will fail to get any lock
-    r = lt->acquire_write_lock(txnid_b, zero, one, nullptr);
+    r = lt->acquire_write_lock(txnid_b, zero, one, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, two, five, nullptr);
+    r = lt->acquire_write_lock(txnid_b, two, five, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &min_int, &min_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, &min_int, &min_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &min_int, &max_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, &min_int, &max_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, &max_int, &max_int, nullptr);
+    r = lt->acquire_write_lock(txnid_b, &max_int, &max_int, nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), toku_dbt_negative_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), toku_dbt_negative_infinity(), nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), toku_dbt_positive_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_b, toku_dbt_negative_infinity(), toku_dbt_positive_infinity(), nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);
-    r = lt->acquire_write_lock(txnid_b, toku_dbt_positive_infinity(), toku_dbt_positive_infinity(), nullptr);
+    r = lt->acquire_write_lock(txnid_b, toku_dbt_positive_infinity(), toku_dbt_positive_infinity(), nullptr, false);
    invariant(r == DB_LOCK_NOTGRANTED);

    lt->remove_overlapping_locks_for_txnid(txnid_a, toku_dbt_negative_infinity(), toku_dbt_positive_infinity());

--- a/locktree/tests/locktree_overlapping_relock.cc
+++ b/locktree/tests/locktree_overlapping_relock.cc
@@ -121,15 +121,15 @@ void locktree_unit_test::test_overlapping_relock(void) {
    // do something. at the end of the test, we release 100, 100.
    const TXNID the_other_txnid = 9999;
    const DBT *hundred = get_dbt(100);
-    r = lt->acquire_write_lock(the_other_txnid, hundred, hundred, nullptr);
+    r = lt->acquire_write_lock(the_other_txnid, hundred, hundred, nullptr, false);
    invariant(r == 0);

    for (int test_run = 0; test_run < 2; test_run++) {
        // test_run == 0 means test with read lock
        // test_run == 1 means test with write lock
 #define ACQUIRE_LOCK(txn, left, right, conflicts) \
-        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts) \
-                      : lt->acquire_write_lock(txn, left, right, conflicts)
+        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts, false) \
+            : lt->acquire_write_lock(txn, left, right, conflicts, false)

        // lock [1,1] and [2,2]. then lock [1,2].
        // ensure only [1,2] exists in the tree

--- a/locktree/tests/locktree_simple_lock.cc
+++ b/locktree/tests/locktree_simple_lock.cc
@@ -115,8 +115,8 @@ void locktree_unit_test::test_simple_lock(void) {
        // test_run == 0 means test with read lock
        // test_run == 1 means test with write lock
 #define ACQUIRE_LOCK(txn, left, right, conflicts) \
-        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts) \
-                      : lt->acquire_write_lock(txn, left, right, conflicts)
+        test_run == 0 ? lt->acquire_read_lock(txn, left, right, conflicts, false) \
+            : lt->acquire_write_lock(txn, left, right, conflicts, false)

        // four txns, four points
        r = ACQUIRE_LOCK(txnid_a, one, one, nullptr);
@@ -178,7 +178,7 @@ void locktree_unit_test::test_simple_lock(void) {

    for (int64_t i = 0; i < num_locks; i++) {
        k.data = (void *) &keys[i];
-        r = lt->acquire_read_lock(txnid_a, &k, &k, nullptr);
+        r = lt->acquire_read_lock(txnid_a, &k, &k, nullptr, false);
        invariant(r == 0);
    }


--- a/locktree/tests/locktree_single_txnid_optimization.cc
+++ b/locktree/tests/locktree_single_txnid_optimization.cc
@@ -124,13 +124,13 @@ void locktree_unit_test::test_single_txnid_optimization(void) {
        buffer.create();

 #define lock_and_append_point_for_txnid_a(key) \
-        r = lt->acquire_write_lock(txnid_a, key, key, nullptr); \
+        r = lt->acquire_write_lock(txnid_a, key, key, nullptr, false);   \
        invariant_zero(r); \
        buffer.append(key, key);

 #define maybe_point_locks_for_txnid_b(i) \
        if (where == i) { \
-            r = lt->acquire_write_lock(txnid_b, one, one, nullptr); \
+            r = lt->acquire_write_lock(txnid_b, one, one, nullptr, false);    \
            invariant_zero(r); \
        }


--- a/locktree/tests/manager_status.cc
+++ b/locktree/tests/manager_status.cc
@@ -130,13 +130,13 @@ void manager_unit_test::test_status(void) {
    const DBT *one = get_dbt(1);

    // txn a write locks one
-    r = lt->acquire_write_lock(txnid_a, one, one, nullptr);
+    r = lt->acquire_write_lock(txnid_a, one, one, nullptr, false);
    assert(r == 0);

    // txn b tries to write lock one, conflicts, waits, and fails to lock one
    lock_request request_b;
    request_b.create();
-    request_b.set(lt, txnid_b, one, one, lock_request::type::WRITE);
+    request_b.set(lt, txnid_b, one, one, lock_request::type::WRITE, false);
    r = request_b.start();
    assert(r == DB_LOCK_NOTGRANTED);
    r = request_b.wait(1000);

--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -106,6 +106,7 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
      loader-stress-del
      loader-stress-test
      loader-tpch-load
+      locktree_escalation_stalls
      lock-pressure
      manyfiles
      maxsize-for-loader

--- a/src/tests/locktree_escalation_stalls.cc
+++ b/src/tests/locktree_escalation_stalls.cc
--- a/src/ydb_row_lock.cc
+++ b/src/ydb_row_lock.cc
@@ -247,7 +247,7 @@ int toku_db_start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT
        toku::lock_request::type lock_type, toku::lock_request *request) {
    DB_TXN *txn_anc = txn_oldest_ancester(txn);
    TXNID txn_anc_id = txn_anc->id64(txn_anc);
-    request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type);
+    request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type, toku_is_big_txn(txn_anc));

    const int r = request->start();
    if (r == 0) {
@@ -297,7 +297,7 @@ void toku_db_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) {
    // This lock request must succeed, so we do not want to wait
    toku::lock_request request;
    request.create();
-    request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE);
+    request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc));
    int r = request.start();
    invariant_zero(r);
    db_txn_note_row_lock(db, txn_anc, key, key);

--- a/src/ydb_txn.cc
+++ b/src/ydb_txn.cc
@@ -323,7 +323,7 @@ static int locked_txn_commit_with_progress(DB_TXN *txn, uint32_t flags,
    if (!toku_txn_is_read_only(tokutxn)) {
        // A readonly transaction does no logging, and therefore does not need the MO lock.
        holds_mo_lock = true;
-        if (toku_txn_has_spilled_rollback(tokutxn)) {
+        if (toku_is_big_tokutxn(tokutxn)) {
            low_priority = true;
            toku_low_priority_multi_operation_client_lock();
        } else {
@@ -351,7 +351,7 @@ static int locked_txn_abort_with_progress(DB_TXN *txn,
    if (!toku_txn_is_read_only(tokutxn)) {
        // A readonly transaction does no logging, and therefore does not need the MO lock.
        holds_mo_lock = true;
-        if (toku_txn_has_spilled_rollback(tokutxn)) {
+        if (toku_is_big_tokutxn(tokutxn)) {
            low_priority = true;
            toku_low_priority_multi_operation_client_lock();
        } else {
@@ -602,3 +602,11 @@ void toku_keep_prepared_txn_callback (DB_ENV *env, TOKUTXN tokutxn) {
 void toku_increase_last_xid(DB_ENV *env, uint64_t increment) {
    toku_txn_manager_increase_last_xid(toku_logger_get_txn_manager(env->i->logger), increment);
 }
+
+bool toku_is_big_txn(DB_TXN *txn) {
+    return toku_is_big_tokutxn(db_txn_struct_i(txn)->tokutxn);
+}
+
+bool toku_is_big_tokutxn(TOKUTXN tokutxn) {
+    return toku_txn_has_spilled_rollback(tokutxn);
+}
--- a/src/ydb_txn.h
+++ b/src/ydb_txn.h
@@ -107,6 +107,9 @@ int locked_txn_abort(DB_TXN *txn);

 void toku_keep_prepared_txn_callback(DB_ENV *env, TOKUTXN tokutxn);

+bool toku_is_big_txn(DB_TXN *txn);
+bool toku_is_big_tokutxn(TOKUTXN tokutxn);
+
 // Test-only function
 extern "C" void toku_increase_last_xid(DB_ENV *env, uint64_t increment) __attribute__((__visibility__("default")));