Addresses #1125: Merged nested transactions from the temporary merge branch into main.

Current tests fail (not regressions; they already fail as of revision 13461):
 * {{{x1.tdbrun}}}
 * {{{test_log(2,3,4,5,6,7,8,9,10).recover}}}
 * {{{test-recover(1,2,3).tdbrun}}}
 * {{{test1324.tdbrun}}}

ULE_DEBUG is disabled (defined to 0). It can be re-enabled for test purposes (set to 1).

refs [t:1125]

Merging into the temp branch (tokudb.main_13461+1125):
{{{svn merge --accept=postpone -r 12527:13461 ../tokudb.1125 ./}}}

Merging into main:
{{{svn merge --accept=postpone -r13462:13463 ../tokudb.main_13461+1125/ ./}}}

git-svn-id: file:///svn/toku/tokudb@13464 c7de825b-a66e-492c-adef-691d508d4ae1

Addresses #1125: Merged nested transactions from the temporary merge branch into main.

Current tests fail (not regressions; they already fail as of revision 13461):
 * {{{x1.tdbrun}}}
 * {{{test_log(2,3,4,5,6,7,8,9,10).recover}}}
 * {{{test-recover(1,2,3).tdbrun}}}
 * {{{test1324.tdbrun}}}

ULE_DEBUG is disabled (defined to 0). It can be re-enabled for test purposes (set to 1).

refs [t:1125]

Merging into the temp branch (tokudb.main_13461+1125):
{{{svn merge --accept=postpone -r 12527:13461 ../tokudb.1125 ./}}}

Merging into main:
{{{svn merge --accept=postpone -r13462:13463 ../tokudb.main_13461+1125/ ./}}}

git-svn-id: file:///svn/toku/tokudb@13464 c7de825b-a66e-492c-adef-691d508d4ae1
b6c03d45 · Yoni Fogel · 19e7b929 · b6c03d45 · b6c03d45 · b6c03d45
Commit b6c03d45 authored Jul 22, 2009 by Yoni Fogel
48 changed files
--- a/db-benchmark-test/Makefile
+++ b/db-benchmark-test/Makefile
@@ -182,9 +182,20 @@ QUIET_BENCH_ARG=
 	rm -rf $@
 	$(BENCH_TIME) ./$< --env $@ $(QUIET_BENCH_ARG) $(EXTRA_BENCH_ARGS)

+DB_TYPES = no-txn txn abort child child_abort child_abortfirst txn1 abort1 child1 child-abort1 child_abortfirst1
+
 no-txn.benchmark.dir:            EXTRA_BENCH_ARGS=
 txn.benchmark.dir:               EXTRA_BENCH_ARGS= -x --singlex --prelock
 abort.benchmark.dir:             EXTRA_BENCH_ARGS= -x --singlex --prelock --abort
+child.benchmark.dir:             EXTRA_BENCH_ARGS= -x --singlex --prelock         --singlex-child
+child-abort.benchmark.dir:       EXTRA_BENCH_ARGS= -x --singlex --prelock --abort --singlex-child
+child-abortfirst.benchmark.dir:  EXTRA_BENCH_ARGS= -x --singlex --prelock --abort --singlex-child --finish-child-first
+txn1.benchmark.dir:              EXTRA_BENCH_ARGS= -x --singlex --prelock                                              --insert1first
+abort1.benchmark.dir:            EXTRA_BENCH_ARGS= -x --singlex --prelock --abort                                      --insert1first
+child1.benchmark.dir:            EXTRA_BENCH_ARGS= -x --singlex --prelock         --singlex-child                      --insert1first
+child-abort1.benchmark.dir:      EXTRA_BENCH_ARGS= -x --singlex --prelock --abort --singlex-child                      --insert1first
+child-abortfirst1.benchmark.dir: EXTRA_BENCH_ARGS= -x --singlex --prelock --abort --singlex-child --finish-child-first --insert1first
+

 QUIET_SCAN_ARG=
 SCANSCAN_ARGS=--lwc --prelock --prelockflag --cachesize 268435456 # scanscan default, cache of windows (256MB)
@@ -210,6 +221,7 @@ SCANSCAN_TDB = scanscan-tokudb$(BINSUF)
 %.flattenedscan.dir: QUIET_SCAN_ARGS=-q
 %.flattenedscan.dir: BENCH_TIME=
 %.flattenedscan.dir: $(SCANSCAN_TDB) %.benchmark.dir
+	rm -rf $@
 	cp -R $*.benchmark.dir $@
 	./$< --env $@ $(SCANSCAN_ARGS) $(QUIET_SCAN_ARG)

@@ -217,3 +229,5 @@ SCANSCAN_TDB = scanscan-tokudb$(BINSUF)
 %.flattenedscan: $(SCANSCAN_TDB) %.flattenedscan.dir
 	$(SCAN_TIME) ./$< --env $@.dir $(SCANSCAN_ARGS)$(QUIET_SCAN_ARG)

+.SECONDARY: $(patsubst %,%.flattenedscan.dir, $(DB_TYPES))
+
--- a/db-benchmark-test/db-benchmark-test.c
+++ b/db-benchmark-test/db-benchmark-test.c
@@ -45,6 +45,8 @@ int prelock  = 0;
 int prelockflag = 0;
 int items_per_transaction = DEFAULT_ITEMS_PER_TRANSACTION;
 int items_per_iteration   = DEFAULT_ITEMS_TO_INSERT_PER_ITERATION;
+int finish_child_first = 0;  // Commit or abort child first (before doing so to the parent).  No effect if child does not exist.
+int singlex_child = 0;  // Do a single transaction, but do all work with a child
 int singlex = 0;  // Do a single transaction
 int singlex_create = 0;  // Create the db using the single transaction (only valid if singlex)
 int insert1first = 0;  // insert 1 before doing the rest
@@ -79,6 +81,7 @@ char *dbname;

 DB_ENV *dbenv;
 DB *db;
+DB_TXN *parenttid=0;
 DB_TXN *tid=0;


@@ -152,26 +155,36 @@ static void benchmark_setup (void) {
        if (do_transactions) {
            r=tid->commit(tid, 0);
            assert(r==0);
+            tid = NULL;
            r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
        }
        insert(-1);
        if (singlex) {
            r=tid->commit(tid, 0);
            assert(r==0);
+            tid = NULL;
            r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
        }
    }
    else if (singlex && !singlex_create) {
        r=tid->commit(tid, 0);
        assert(r==0);
+        tid = NULL;
        r=dbenv->txn_begin(dbenv, 0, &tid, 0); CKERR(r);
    }
    if (do_transactions) {
-	if (singlex) do_prelock(db, tid);
+	if (singlex)
+            do_prelock(db, tid);
        else {
            r=tid->commit(tid, 0);
            assert(r==0);
+            tid = NULL;
+        }
    }
+    if (singlex_child) {
+        parenttid = tid;
+        tid = NULL;
+        r=dbenv->txn_begin(dbenv, parenttid, &tid, 0); CKERR(r);
    }

 }
@@ -187,15 +200,34 @@ static void benchmark_shutdown (void) {
 #endif
    if (do_transactions && singlex && !insert1first && (singlex_create || prelock)) {
 #if defined(TOKUDB)
+        //There should be a single 'truncate' in the rolltmp instead of many 'insert' entries.
 	struct txn_stat *s;
 	r = tid->txn_stat(tid, &s);
 	assert(r==0);
-	assert(s->rolltmp_raw_count < 100);
+        //TODO: #1125 Always do the test after performance testing is done.
+        if (singlex_child) fprintf(stderr, "SKIPPED 'small rolltmp' test for child txn\n");
+        else
+            assert(s->rolltmp_raw_count < 100);  // gross test, not worth investigating details
 	os_free(s);
 	//system("ls -l bench.tokudb");
 #endif
+    }
+    if (do_transactions && singlex) {
+        if (!singlex_child || finish_child_first) {
+            assert(tid);
            r = (do_abort ? tid->abort(tid) : tid->commit(tid, 0));    assert(r==0);
+            tid = NULL; 
+        }
+        if (singlex_child) {
+            assert(parenttid);
+            r = (do_abort ? parenttid->abort(parenttid) : parenttid->commit(parenttid, 0));    assert(r==0);
+            parenttid = NULL;
        }
+        else
+            assert(!parenttid);
+    }
+    assert(!tid);
+    assert(!parenttid);

    r = db->close(db, 0);
    assert(r == 0);
@@ -240,6 +272,7 @@ static void insert (long long v) {
 	if (n_insertions_since_txn_began>=items_per_transaction && !singlex) {
 	    n_insertions_since_txn_began=0;
 	    r = tid->commit(tid, 0); assert(r==0);
+            tid = NULL;
 	    r=dbenv->txn_begin(dbenv, 0, &tid, 0); assert(r==0);
            do_prelock(db, tid);
 	    n_insertions_since_txn_began=0;
@@ -265,7 +298,7 @@ static void serial_insert_from (long long from) {
    }
    if (do_transactions && !singlex) {
 	int  r= tid->commit(tid, 0);             assert(r==0);
-	tid=0;
+	tid=NULL;
    }
 }

@@ -284,7 +317,7 @@ static void random_insert_below (long long below) {
    }
    if (do_transactions && !singlex) {
 	int  r= tid->commit(tid, 0);             assert(r==0);
-	tid=0;
+	tid=NULL;
    }
 }

@@ -328,6 +361,8 @@ static int print_usage (const char *argv0) {
    fprintf(stderr, "    --compressibility C   creates data that should compress by about a factor C.   Default C is large.   C is an float.\n");
    fprintf(stderr, "    --xcount N            how many insertions per transaction (default=%d)\n", DEFAULT_ITEMS_PER_TRANSACTION);
    fprintf(stderr, "    --singlex             (implies -x) Run the whole job as a single transaction.  (Default don't run as a single transaction.)\n");
+    fprintf(stderr, "    --singlex-child       (implies -x) Run the whole job as a single transaction, do all work a child of that transaction.\n");
+    fprintf(stderr, "    --finish-child-first  Commit/abort child before doing so to parent (no effect if no child).\n");
    fprintf(stderr, "    --singlex-create      (implies --singlex)  Create the file using the single transaction (Default is to use a different transaction to create.)\n");
    fprintf(stderr, "    --check_small_rolltmp (Only valid in --singlex mode)  Verify that very little data was saved in the rollback logs.\n");
    fprintf(stderr, "    --prelock             Prelock the database.\n");
@@ -405,6 +440,12 @@ int main (int argc, const char *argv[]) {
 	    do_transactions = 1;
 	    singlex = 1;
 	    singlex_create = 1;
+	} else if (strcmp(arg, "--finish-child-first") == 0) {
+	    finish_child_first = 1;
+	} else if (strcmp(arg, "--singlex-child") == 0) {
+	    do_transactions = 1;
+	    singlex = 1;
+	    singlex_child = 1;
 	} else if (strcmp(arg, "--singlex") == 0) {
 	    do_transactions = 1;
 	    singlex = 1;

--- a/include/tdb-internal.h
+++ b/include/tdb-internal.h
@@ -14,9 +14,9 @@ struct simple_dbt {
 struct __toku_db_txn_internal {
    //TXNID txnid64; /* A sixty-four bit txn id. */
    struct tokutxn *tokutxn;
-    struct __toku_lth *lth;
+    struct __toku_lth *lth;  //Hash table holding list of dictionaries this txn has touched
    u_int32_t flags;
-    DB_TXN *child, *next, *prev;
+    DB_TXN *child;
 };

 struct __toku_dbc_internal {

--- a/newbrt/Makefile
+++ b/newbrt/Makefile
@@ -42,10 +42,12 @@ BRT_SOURCES = \
  brt-serialize \
  brt-verify \
  brt \
+  brt_msg \
  brt-test-helpers \
  cachetable \
  checkpoint \
  fifo \
+  fifo_msg \
  fingerprint \
  key \
  leafentry \
@@ -60,11 +62,13 @@ BRT_SOURCES = \
  recover \
  roll \
  rollback \
+  ule \
  threadpool \
  toku_worker \
  trace_mem \
  txn \
  x1764 \
+  xids \
  ybt \
 # keep this line so I can have a \ on the previous line


--- a/newbrt/brt-internal.h
+++ b/newbrt/brt-internal.h
@@ -26,10 +26,8 @@ typedef void *OMTVALUE;
 enum { TREE_FANOUT = BRT_FANOUT };
 enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
 enum { OMT_ITEM_OVERHEAD = 0 }; /* No overhead for the OMT item.  The PMA needed to know the idx, but the OMT doesn't. */
-enum { BRT_CMD_OVERHEAD = (1     // the type
-			   + 8)  // the xid
+enum { BRT_CMD_OVERHEAD = (1)     // the type
 };
-enum { LE_OVERHEAD_BOUND = 9 }; // the type and xid

 enum { BRT_DEFAULT_NODE_SIZE = 1 << 22 };

@@ -246,7 +244,7 @@ static const BRTNODE null_brtnode=0;

 //extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
 //extern u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp);
-extern u_int32_t toku_calc_fingerprint_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen);
+extern u_int32_t toku_calc_fingerprint_cmd (u_int32_t type, XIDS xids, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen);
 extern u_int32_t toku_calc_fingerprint_cmdstruct (BRT_CMD cmd);

 // How long is the pivot key?
@@ -322,6 +320,7 @@ enum brt_layout_version_e {
    BRT_LAYOUT_VERSION_8 = 8,   // Diff from 7 to 8:  Use murmur instead of crc32.  We are going to make a simplification and stop supporting version 7 and before.  Current As of Beta 1.0.6
    BRT_LAYOUT_VERSION_9 = 9,   // Diff from 8 to 9:  Variable-sized blocks and compression.
    BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
+    BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned).  BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
    BRT_ANTEULTIMATE_VERSION,   // the version after the most recent version
    BRT_LAYOUT_VERSION   = BRT_ANTEULTIMATE_VERSION-1 // A hack so I don't have to change this line.
 };

--- a/newbrt/brt-serialize.c
+++ b/newbrt/brt-serialize.c
@@ -188,10 +188,11 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
        assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1);
 	for (i=0; i< n_buffers; i++) {
 	    FIFO_ITERATE(BNC_BUFFER(node,i),
-			 key __attribute__((__unused__)), keylen,
+			 key, keylen,
 			 data __attribute__((__unused__)), datalen,
-			 type __attribute__((__unused__)), xid __attribute__((__unused__)),
-			 (hsize+=BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+keylen+datalen));
+			 type __attribute__((__unused__)), xids,
+			 (hsize+=BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+keylen+datalen+
+                          xids_get_serialize_size(xids)));
 	}
 	assert(hsize==node->u.n.n_bytes_in_buffers);
 	assert(csize==node->u.n.totalchildkeylens);
@@ -201,7 +202,7 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
 	toku_omt_iterate(node->u.l.buffer,
 			 addupsize,
 			 &hsize);
-	assert(hsize<=node->u.l.n_bytes_in_buffer);
+	assert(hsize==node->u.l.n_bytes_in_buffer);
 	hsize+=4; /* add n entries in buffer table. */
 	hsize+=3*8; /* add the three leaf stats, but no exact bit. */
 	return size+hsize;
@@ -226,12 +227,12 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) {
 	result+=4; /* n_entries in buffer table. */
 	result+=3*8; /* the three leaf stats. */
 	result+=node->u.l.n_bytes_in_buffer;
+    }
    if (toku_memory_check) {
        unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
        if (result!=slowresult) printf("%s:%d result=%u slowresult=%u\n", __FILE__, __LINE__, result, slowresult);
        assert(result==slowresult);
    }
-    }
    return result;
 }

@@ -408,14 +409,14 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
 	    for (i=0; i< n_buffers; i++) {
 		//printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i]));
 		wbuf_int(&w, toku_fifo_n_entries(BNC_BUFFER(node,i)));
-		FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid,
+		FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xids,
 				  {
 				      assert(type>=0 && type<256);
 				      wbuf_char(&w, (unsigned char)type);
-				      wbuf_TXNID(&w, xid);
+				      wbuf_xids(&w, xids);
 				      wbuf_bytes(&w, key, keylen);
 				      wbuf_bytes(&w, data, datalen);
-				      check_local_fingerprint+=node->rand4fingerprint*toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
+				      check_local_fingerprint+=node->rand4fingerprint*toku_calc_fingerprint_cmd(type, xids, key, keylen, data, datalen);
 				  });
 	    }
 	    //printf("%s:%d check_local_fingerprint=%8x\n", __FILE__, __LINE__, check_local_fingerprint);
@@ -736,7 +737,7 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
    result->layout_version    = rbuf_int(&rc);
    {
 	switch (result->layout_version) {
-	case BRT_LAYOUT_VERSION_10: goto ok_layout_version;
+	case BRT_LAYOUT_VERSION: goto ok_layout_version;
 	    // Don't support older versions.
 	}
 	r=toku_db_badformat();
@@ -826,19 +827,21 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
 		    bytevec val; ITEMLEN vallen;
 		    //toku_verify_counts(result);
                    int type = rbuf_char(&rc);
-		    TXNID xid  = rbuf_ulonglong(&rc);
+                    XIDS xids;
+                    xids_create_from_buffer(&rc, &xids);
 		    rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
 		    rbuf_bytes(&rc, &val, &vallen);
-		    check_local_fingerprint += result->rand4fingerprint * toku_calc_fingerprint_cmd(type, xid, key, keylen, val, vallen);
+		    check_local_fingerprint += result->rand4fingerprint * toku_calc_fingerprint_cmd(type, xids, key, keylen, val, vallen);
 		    //printf("Found %s,%s\n", (char*)key, (char*)val);
 		    {
-			r=toku_fifo_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xid); /* Copies the data into the hash table. */
+			r=toku_fifo_enq(BNC_BUFFER(result, cnum), key, keylen, val, vallen, type, xids); /* Copies the data into the hash table. */
 			if (r!=0) { goto died_12; }
 		    }
-		    diff =  keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
+		    diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
 		    result->u.n.n_bytes_in_buffers += diff;
 		    BNC_NBYTESINBUF(result,cnum)   += diff;
 		    //printf("Inserted\n");
+                    xids_destroy(&xids);
 		}
 	    }
 	    if (check_local_fingerprint != result->local_fingerprint) {
@@ -977,6 +980,7 @@ serialize_brt_header_min_size (u_int32_t version) {
    u_int32_t size;
    switch(version) {
        case BRT_LAYOUT_VERSION_10:
+        case BRT_LAYOUT_VERSION_11:
            size = (+8 // "tokudata"
                    +4 // version
                    +4 // size
@@ -1231,7 +1235,7 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
    list_init(&h->zombie_brts);
    //version MUST be in network order on disk regardless of disk order
    h->layout_version = rbuf_network_int(&rc);
-    assert(h->layout_version==BRT_LAYOUT_VERSION_10);
+    assert(h->layout_version==BRT_LAYOUT_VERSION);

    //Size MUST be in network order regardless of disk order.
    u_int32_t size = rbuf_network_int(&rc);
@@ -1311,8 +1315,9 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *
        if (r==0) {
            //Version MUST be in network order regardless of disk order.
            version = rbuf_network_int(rb);
-            if (version < BRT_LAYOUT_VERSION_10) r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
-            if (version > BRT_LAYOUT_VERSION_10) r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
+            //TODO: #1125 Possibly support transparent upgrade.  If so, it should be < ...10
+            if (version < BRT_LAYOUT_VERSION) r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
+            if (version > BRT_LAYOUT_VERSION) r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
        }
        u_int32_t size;
        if (r==0) {

--- a/newbrt/brt-test-helpers.c
+++ b/newbrt/brt-test-helpers.c
@@ -3,6 +3,7 @@
 #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

 #include "includes.h"
+#include "ule.h"

 int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum) {
    BRTNODE node;
@@ -74,15 +75,21 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke
    toku_verify_counts(node);
    assert(node->height==0);

-    u_int32_t lesize, disksize;
+    size_t lesize, disksize;
    LEAFENTRY leafentry;
-    r = le_committed(keylen, key, vallen, val, &lesize, &disksize, &leafentry, node->u.l.buffer, &node->u.l.buffer_mempool, 0);
-
    OMTVALUE storeddatav;
    u_int32_t idx;
    DBT keydbt,valdbt;
-    BRT_CMD_S cmd = {BRT_INSERT, 0, .u.id={toku_fill_dbt(&keydbt, key, keylen),
+    BRT_CMD_S cmd = {BRT_INSERT, xids_get_root_xids(),
+                     .u.id={toku_fill_dbt(&keydbt, key, keylen),
                            toku_fill_dbt(&valdbt, val, vallen)}};
+    //Generate a leafentry (committed insert key,val)
+    r = apply_msg_to_leafentry(&cmd, NULL, //No old leafentry
+                               &lesize, &disksize, &leafentry, 
+                               node->u.l.buffer, &node->u.l.buffer_mempool, 0);
+    assert(r==0);
+
+
    struct cmd_leafval_heaviside_extra be = {brt, &cmd, node->flags & TOKU_DB_DUPSORT};
    r = toku_omt_find_zero(node->u.l.buffer, toku_cmd_leafval_heaviside, &be, &storeddatav, &idx, NULL);

@@ -127,12 +134,13 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_cmd_t
 				       toku_fill_dbt(&v, val, vallen),
 				       brt);

-    r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, (TXNID)0);
+    XIDS xids_0 = xids_get_root_xids();
+    r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, xids_0);
    assert(r==0);
-    u_int32_t fdelta = node->rand4fingerprint * toku_calc_fingerprint_cmd(cmdtype, (TXNID)0, key, keylen, val, vallen);
+    u_int32_t fdelta = node->rand4fingerprint * toku_calc_fingerprint_cmd(cmdtype, xids_0, key, keylen, val, vallen);
    node->local_fingerprint += fdelta;
    *subtree_fingerprint += fdelta;
-    int sizediff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
+    int sizediff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids_0);
    node->u.n.n_bytes_in_buffers += sizediff;
    BNC_NBYTESINBUF(node, childnum) += sizediff;
    node->dirty = 1;

--- a/newbrt/brt-verify.c
+++ b/newbrt/brt-verify.c
@@ -50,12 +50,12 @@ static int compare_pairs (BRT brt, struct kv_pair *a, struct kv_pair *b) {
 static int compare_leafentries (BRT brt, LEAFENTRY a, LEAFENTRY b) {
    DBT x,y;
    int cmp = brt->compare_fun(brt->db,
-			       toku_fill_dbt(&x, le_any_key(a), le_any_keylen(a)),
-			       toku_fill_dbt(&y, le_any_key(b), le_any_keylen(b)));
+			       toku_fill_dbt(&x, le_key(a), le_keylen(a)),
+			       toku_fill_dbt(&y, le_key(b), le_keylen(b)));
    if (cmp==0 && (brt->flags & TOKU_DB_DUPSORT)) {
 	cmp = brt->dup_compare(brt->db,
-			       toku_fill_dbt(&x, le_any_val(a), le_any_vallen(a)),
-			       toku_fill_dbt(&y, le_any_val(b), le_any_vallen(b)));
+			       toku_fill_dbt(&x, le_innermost_inserted_val(a), le_innermost_inserted_vallen(a)),
+			       toku_fill_dbt(&y, le_innermost_inserted_val(b), le_innermost_inserted_vallen(b)));
    }
    return cmp;
 }
@@ -75,7 +75,7 @@ static void verify_pair (bytevec key, unsigned int keylen,
                         bytevec data __attribute__((__unused__)),
                         unsigned int datalen __attribute__((__unused__)),
                         int type __attribute__((__unused__)),
-                         TXNID xid __attribute__((__unused__)),
+                         XIDS xids __attribute__((__unused__)),
                         void *arg) {
    struct verify_pair_arg *vparg = (struct verify_pair_arg *)arg;
    BRT brt = vparg->brt;

--- a/newbrt/brt.c
+++ b/newbrt/brt.c
@@ -181,6 +181,10 @@ message are not gorged.  (But they may be hungry or too fat or too thin.)
 #include "includes.h"
 #include "leaflock.h"
 #include "checkpoint.h"
+// Access to nested transaction logic
+#include "ule.h"
+#include "xids.h"
+#include "roll.h"

 // We invalidate all the OMTCURSORS any time we push into the root of the BRT for that OMT.
 // We keep a counter on each brt header, but if the brt header is evicted from the cachetable
@@ -269,12 +273,12 @@ fill_leafnode_estimates (OMTVALUE val, u_int32_t UU(idx), void *vs)
 {
    LEAFENTRY le = val;
    struct fill_leafnode_estimates_state *s = vs;
-    s->e->dsize += le_any_keylen(le) + le_any_vallen(le);
+    s->e->dsize += le_keylen(le) + le_innermost_inserted_vallen(le);
    s->e->ndata++;
    if ((s->prevval == NULL) ||
 	(0 == (s->node->flags & TOKU_DB_DUPSORT)) ||
-	(le_any_keylen(le) != le_any_keylen(s->prevval)) ||
-	(memcmp(le_any_key(le), le_any_key(s->prevval), le_any_keylen(le))!=0)) { // really should use comparison function
+	(le_keylen(le) != le_keylen(s->prevval)) ||
+	(memcmp(le_key(le), le_key(s->prevval), le_keylen(le))!=0)) { // really should use comparison function
 	s->e->nkeys++;
    }
    s->prevval = le;
@@ -351,8 +355,8 @@ verify_local_fingerprint_nonleaf (BRTNODE node)
        int i;
        if (node->height==0) return;
        for (i=0; i<node->u.n.n_children; i++)
-            FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid,
-                         fp += toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
+            FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xids,
+                         fp += toku_calc_fingerprint_cmd(type, xids, key, keylen, data, datalen);
                         );
        fp *= node->rand4fingerprint;
        assert(fp==node->local_fingerprint);
@@ -478,7 +482,7 @@ int toku_brtnode_fetch_callback (CACHEFILE cachefile, BLOCKNUM nodename, u_int32
 }

 static int
-leafval_heaviside_le_committed (u_int32_t klen, void *kval,
+leafval_heaviside_le (u_int32_t klen, void *kval,
                      u_int32_t dlen, void *dval,
                      struct cmd_leafval_heaviside_extra *be) {
    BRT t = be->t;
@@ -495,36 +499,18 @@ leafval_heaviside_le_committed (u_int32_t klen, void *kval,
    }
 }

-static int
-leafval_heaviside_le_both (TXNID xid __attribute__((__unused__)),
-                        u_int32_t klen, void *kval,
-                        u_int32_t clen __attribute__((__unused__)), void *cval __attribute__((__unused__)),
-                        u_int32_t plen, void *pval,
-                        struct cmd_leafval_heaviside_extra *be) {
-    return leafval_heaviside_le_committed(klen, kval, plen, pval, be);
-}
-
-static int
-leafval_heaviside_le_provdel (TXNID xid __attribute__((__unused__)),
-                           u_int32_t klen, void *kval,
-                           u_int32_t clen, void *cval,
-                           struct cmd_leafval_heaviside_extra *be) {
-    return leafval_heaviside_le_committed(klen, kval, clen, cval, be);
-}
-
-static int
-leafval_heaviside_le_provpair (TXNID xid __attribute__((__unused__)),
-                            u_int32_t klen, void *kval,
-                            u_int32_t plen, void *pval,
-                            struct cmd_leafval_heaviside_extra *be) {
-    return leafval_heaviside_le_committed(klen, kval, plen, pval, be);
-}
-
-int toku_cmd_leafval_heaviside (OMTVALUE lev, void *extra) {
+//TODO: #1125 optimize
+int
+toku_cmd_leafval_heaviside (OMTVALUE lev, void *extra) {
    LEAFENTRY le=lev;
    struct cmd_leafval_heaviside_extra *be = extra;
-    LESWITCHCALL(le, leafval_heaviside, be);
-    abort(); return 0; // make certain compilers happy
+    u_int32_t keylen;
+    void*     key = le_key_and_len(le, &keylen);
+    u_int32_t vallen;
+    void*     val = le_innermost_inserted_val_and_len(le, &vallen);
+    return leafval_heaviside_le(keylen, key,
+                                vallen, val,
+                                be);
 }

 // If you pass in data==0 then it only compares the key, not the data (even if is a DUPSORT database)
@@ -846,6 +832,9 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
                    }
                }
            }
+//TODO: #1125 REMOVE DEBUG
+            assert(             sumsofar <= toku_mempool_get_size(&B   ->u.l.buffer_mempool));
+            assert(sumlesizes - sumsofar <= toku_mempool_get_size(&node->u.l.buffer_mempool));
        }
        // Now we know where we are going to break it
        OMT old_omt = node->u.l.buffer;
@@ -867,8 +856,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
 		    if (t->flags & TOKU_DB_DUPSORT) key_is_unique=TRUE;
 		    else if (prevle==NULL)          key_is_unique=TRUE;
 		    else if (t->compare_fun(t->db,
-					    toku_fill_dbt(&xdbt, le_any_key(prevle), le_any_keylen(prevle)),
-					    toku_fill_dbt(&ydbt, le_any_key(oldle),   le_any_keylen(oldle)))
+					    toku_fill_dbt(&xdbt, le_key(prevle), le_keylen(prevle)),
+					    toku_fill_dbt(&ydbt, le_key(oldle),   le_keylen(oldle)))
 			     ==0) {
 			key_is_unique=FALSE;
 		    } else {
@@ -877,8 +866,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
 		}
 		if (key_is_unique) diff_est.nkeys++;
 		diff_est.ndata++;
-		diff_est.dsize += le_any_keylen(oldle) + le_any_vallen(oldle);
-		//printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(oldle)+ le_any_vallen(oldle), diff_est.dsize);
+		diff_est.dsize += le_keylen(oldle) + le_innermost_inserted_vallen(oldle);
+		//printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_keylen(oldle)+ le_innermost_inserted_vallen(oldle), diff_est.dsize);
                diff_fp += toku_le_crc(oldle);
                diff_size += OMT_ITEM_OVERHEAD + leafentry_disksize(oldle);
                memcpy(newle, oldle, leafentry_memsize(oldle));
@@ -920,11 +909,11 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
        assert(r==0); // that fetch should have worked.
        LEAFENTRY le=lev;
        if (node->flags&TOKU_DB_DUPSORT) {
-            splitk->size = le_any_keylen(le)+le_any_vallen(le);
-            splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), le_any_val(le), le_any_vallen(le));
+            splitk->size = le_keylen(le)+le_innermost_inserted_vallen(le);
+            splitk->data = kv_pair_malloc(le_key(le), le_keylen(le), le_innermost_inserted_val(le), le_innermost_inserted_vallen(le));
        } else {
-            splitk->size = le_any_keylen(le);
-            splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), 0, 0);
+            splitk->size = le_keylen(le);
+            splitk->data = kv_pair_malloc(le_key(le), le_keylen(le), 0, 0);
        }
        splitk->flags=0;
    }
@@ -1004,15 +993,15 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
                bytevec key, data;
                unsigned int keylen, datalen;
                u_int32_t type;
-                TXNID xid;
-                int fr = toku_fifo_peek(from_htab, &key, &keylen, &data, &datalen, &type, &xid);
+                XIDS xids;
+                int fr = toku_fifo_peek(from_htab, &key, &keylen, &data, &datalen, &type, &xids);
                if (fr!=0) break;
-                int n_bytes_moved = keylen+datalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
+                int n_bytes_moved = keylen+datalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
                u_int32_t old_from_fingerprint = node->local_fingerprint;
-                u_int32_t delta = toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
+                u_int32_t delta = toku_calc_fingerprint_cmd(type, xids, key, keylen, data, datalen);
                u_int32_t new_from_fingerprint = old_from_fingerprint - node->rand4fingerprint*delta;
 		B->local_fingerprint += B->rand4fingerprint*delta;
-                int r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, xid);
+                int r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, xids);
                if (r!=0) return r;
                toku_fifo_deq(from_htab);
                // key and data will no longer be valid
@@ -1248,6 +1237,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react)
    }
 }

+//TODO: Rename this function
 static int
 should_compare_both_keys (BRTNODE node, BRT_CMD cmd)
 // Effect: Return nonzero if we need to compare both the key and the value.
@@ -1269,245 +1259,15 @@ should_compare_both_keys (BRTNODE node, BRT_CMD cmd)
    abort(); return 0;
 }

-static int apply_cmd_to_le_committed (u_int32_t klen, void *kval,
-                                      u_int32_t dlen, void *dval,
-                                      BRT_CMD cmd,
-                                      u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data,
-                                      OMT omt, struct mempool *mp, void **maybe_free) {
-    //assert(cmd->u.id.key->size == klen);
-    //assert(memcmp(cmd->u.id.key->data, kval, klen)==0);
-    switch (cmd->type) {
-    case BRT_INSERT:
-        return le_both(cmd->xid,
-                       klen, kval,
-                       dlen, dval, 
-                       cmd->u.id.val->size, cmd->u.id.val->data,
-                       newlen, disksize, new_data,
-                       omt, mp, maybe_free);
-    case BRT_DELETE_ANY:
-    case BRT_DELETE_BOTH:
-        return le_provdel(cmd->xid,
-                          klen, kval,
-                          dlen, dval,
-                          newlen, disksize, new_data,
-                          omt, mp, maybe_free);
-    case BRT_ABORT_BOTH:
-    case BRT_ABORT_ANY:
-    case BRT_COMMIT_BOTH:
-    case BRT_COMMIT_ANY:
-        // Just return the original committed record
-        return le_committed(klen, kval, dlen, dval,
-                            newlen, disksize, new_data,
-                            omt, mp, maybe_free);
-    case BRT_NONE: break;
-    }
-    abort(); return 0;
-}
-
-static int apply_cmd_to_le_both (TXNID xid,
-                                 u_int32_t klen, void *kval,
-                                 u_int32_t clen, void *cval,
-                                 u_int32_t plen, void *pval,
-                                 BRT_CMD cmd,
-                                 u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data,
-                                 OMT omt, struct mempool *mp, void *maybe_free) {
-    u_int32_t prev_len;
-    void     *prev_val;
-    if (xid==cmd->xid) {
-        // The xids match, so throw away the provisional value.
-        prev_len = clen;  prev_val = cval;
-    } else {
-        // If the xids don't match, then we are moving the provisional value to committed status.
-        prev_len = plen;  prev_val = pval;
-    }
-    // keep the committed value for rollback.
-    //assert(cmd->u.id.key->size == klen);
-    //assert(memcmp(cmd->u.id.key->data, kval, klen)==0);
-    switch (cmd->type) {
-    case BRT_INSERT:
-        return le_both(cmd->xid,
-                       klen, kval,
-                       prev_len, prev_val,
-                       cmd->u.id.val->size, cmd->u.id.val->data,
-                       newlen, disksize, new_data,
-                       omt, mp, maybe_free);
-    case BRT_DELETE_ANY:
-    case BRT_DELETE_BOTH:
-        return le_provdel(cmd->xid,
-                          klen, kval,
-                          prev_len, prev_val,
-                          newlen, disksize, new_data,
-                          omt, mp, maybe_free);
-    case BRT_ABORT_BOTH:
-    case BRT_ABORT_ANY:
-        // I don't see how you could have an abort where the xids don't match.  But do it anyway.
-        return le_committed(klen, kval,
-                            prev_len, prev_val,
-                            newlen, disksize, new_data,
-                            omt, mp, maybe_free);
-    case BRT_COMMIT_BOTH:
-    case BRT_COMMIT_ANY:
-        // In the future we won't even have these commit messages.
-        return le_committed(klen, kval,
-                            plen, pval,
-                            newlen, disksize, new_data,
-                            omt, mp, maybe_free);
-    case BRT_NONE: break;
-    }
-    abort(); return 0;
-}
-
-static int apply_cmd_to_le_provdel (TXNID xid,
-                                    u_int32_t klen, void *kval,
-                                    u_int32_t clen, void *cval,
-                                    BRT_CMD cmd,
-                                    u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data,
-                                    OMT omt, struct mempool *mp, void *maybe_free)
-{
-    // keep the committed value for rollback
-    //assert(cmd->u.id.key->size == klen);
-    //assert(memcmp(cmd->u.id.key->data, kval, klen)==0);
-    switch (cmd->type) {
-    case BRT_INSERT:
-        if (cmd->xid == xid) {
-            return le_both(cmd->xid,
-                           klen, kval,
-                           clen, cval,
-                           cmd->u.id.val->size, cmd->u.id.val->data,
-                           newlen, disksize, new_data,
-                           omt, mp, maybe_free);
-        } else {
-            // It's an insert, but the committed value is deleted (since the xids don't match, we assume the delete took effect)
-            return le_provpair(cmd->xid,
-                               klen, kval,
-                               cmd->u.id.val->size, cmd->u.id.val->data,
-                               newlen, disksize, new_data,
-                               omt, mp, maybe_free);
-        }
-    case BRT_DELETE_ANY:
-    case BRT_DELETE_BOTH:
-        if (cmd->xid == xid) {
-            // A delete of a delete could conceivably return the identical value, saving a malloc and a free, but to simplify things we just reallocate it
-            // because othewise we have to notice not to free() the olditem.
-            return le_provdel(cmd->xid,
-                              klen, kval,
-                              clen, cval,
-                              newlen, disksize, new_data,
-                              omt, mp, maybe_free);
-        } else {
-            // The commited value is deleted, and we are deleting, so treat as a delete.
-            *new_data = 0;
-            return 0;
-        }
-    case BRT_ABORT_BOTH:
-    case BRT_ABORT_ANY:
-        // I don't see how the xids could not match...
-        return le_committed(klen, kval,
-                            clen, cval,
-                            newlen, disksize, new_data,
-                            omt, mp, maybe_free);
-    case BRT_COMMIT_BOTH:
-    case BRT_COMMIT_ANY:
-        *new_data = 0;
-        return 0;
-    case BRT_NONE: break;
-    }
-    abort(); return 0;
-}
-
-static int apply_cmd_to_le_provpair (TXNID xid,
-                                     u_int32_t klen, void *kval,
-                                     u_int32_t plen , void *pval,
-                                     BRT_CMD cmd,
-                                     u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data,
-                                     OMT omt, struct mempool *mp, void **maybe_free) {
-    //assert(cmd->u.id.key->size == klen);
-    //assert(memcmp(cmd->u.id.key->data, kval, klen)==0);
-    switch (cmd->type) {
-    case BRT_INSERT:
-        if (cmd->xid == xid) {
-            // it's still a provpair (the old prov value is lost)
-            return le_provpair(cmd->xid,
-                               klen, kval,
-                               cmd->u.id.val->size, cmd->u.id.val->data,
-                               newlen, disksize, new_data,
-                               omt, mp, maybe_free);
-        } else {
-            // the old prov was actually committed.
-            return le_both(cmd->xid,
-                           klen, kval,
-                           plen, pval,
-                           cmd->u.id.val->size, cmd->u.id.val->data,
-                           newlen, disksize, new_data,
-                           omt, mp, maybe_free);
-        }
-    case BRT_DELETE_BOTH:
-    case BRT_DELETE_ANY:
-        if (cmd->xid == xid) {
-            // A delete of a provisional pair is nothign
-            *new_data = 0;
-            return 0;
-        } else {
-            // The prov pair is actually a committed value.
-            return le_provdel(cmd->xid,
-                              klen, kval,
-                              plen, pval,
-                              newlen, disksize, new_data,
-                              omt, mp, maybe_free);
-        }
-    case BRT_ABORT_BOTH:
-    case BRT_ABORT_ANY:
-        // An abort of a provisional pair is nothing.
-        *new_data = 0;
-        return 0;
-    case BRT_COMMIT_ANY:
-    case BRT_COMMIT_BOTH:
-        return le_committed(klen, kval,
-                            plen, pval,
-                            newlen, disksize, new_data,
-                            omt, mp, maybe_free);
-    case BRT_NONE: break;
-    }
-    abort(); return 0;
-}
-
+//TODO: #1125 remove scaffolding
 static int
-apply_cmd_to_leaf (BRT_CMD cmd,
+apply_cmd_to_leaf(BRT_CMD cmd,
 		   void *stored_data, // NULL if there was no stored data.
-                   u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data,
-                   OMT omt, struct mempool *mp, void **maybe_free)
-{
-    if (stored_data==0) {
-        switch (cmd->type) {
-        case BRT_INSERT:
-            {
-                LEAFENTRY le;
-                int r = le_provpair(cmd->xid,
-                                    cmd->u.id.key->size, cmd->u.id.key->data,
-                                    cmd->u.id.val->size, cmd->u.id.val->data,
-                                    newlen, disksize, &le,
+		   size_t *newlen, size_t *disksize, LEAFENTRY *new_data,
+		   OMT omt, struct mempool *mp, void **maybe_free) {
+    int r = apply_msg_to_leafentry(cmd, stored_data, newlen, disksize, new_data,
                                       omt, mp, maybe_free);
-                if (r==0) *new_data=le;
    return r;
-            }
-        case BRT_DELETE_BOTH:
-        case BRT_DELETE_ANY:
-        case BRT_ABORT_BOTH:
-        case BRT_ABORT_ANY:
-        case BRT_COMMIT_BOTH:
-        case BRT_COMMIT_ANY:
-            *new_data = 0;
-            return 0; // Don't have to insert anything.
-        case BRT_NONE:
-            break;
-        }
-        abort();
-    } else {
-        LESWITCHCALL(stored_data, apply_cmd_to, cmd,
-                     newlen, disksize, new_data,
-                     omt, mp, maybe_free);
-    }
-    abort(); return 0;
 }

 static int
@@ -1517,9 +1277,9 @@ other_key_matches (BRTNODE node, u_int32_t idx, LEAFENTRY le)
    int r = toku_omt_fetch(node->u.l.buffer, idx, &other_lev, (OMTCURSOR)NULL);
    assert(r==0);
    LEAFENTRY other_le = other_lev;
-    u_int32_t other_keylen = le_any_keylen(other_le);
-    if (other_keylen == le_any_keylen(le)
-	&& memcmp(le_any_key(other_le), le_any_key(le), other_keylen)==0)   // really should use comparison function
+    u_int32_t other_keylen = le_keylen(other_le);
+    if (other_keylen == le_keylen(le)
+	&& memcmp(le_key(other_le), le_key(le), other_keylen)==0)   // really should use comparison function
 	return 1;
    else
 	return 0;
@@ -1550,7 +1310,7 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
 {
    // brt_leaf_check_leaf_stats(node);

-    u_int32_t newlen=0, newdisksize=0;
+    size_t newlen=0, newdisksize=0;
    LEAFENTRY new_le=0;
    void *maybe_free = 0;
    // This function may call mempool_malloc_dont_release() to allocate more space.
@@ -1572,18 +1332,18 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,

 	// If we are replacing a leafentry, then the counts on the estimates remain unchanged, but the size might change
 	{
-	    u_int32_t oldlen = le_any_vallen(le);
+	    u_int32_t oldlen = le_innermost_inserted_vallen(le);
 	    assert(node->u.l.leaf_stats.dsize >= oldlen);
 	    assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
 	    node->u.l.leaf_stats.dsize -= oldlen;
-	    node->u.l.leaf_stats.dsize += le_any_vallen(new_le); // add it in two pieces to avoid ugly overflow
+	    node->u.l.leaf_stats.dsize += le_innermost_inserted_vallen(new_le); // add it in two pieces to avoid ugly overflow
 	    assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
 	}

        node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
        node->local_fingerprint     -= node->rand4fingerprint * toku_le_crc(le);
        
-	//printf("%s:%d Added %u-%u got %lu\n", __FILE__, __LINE__, le_any_keylen(new_le), le_any_vallen(le), node->u.l.leaf_stats.dsize);
+	//printf("%s:%d Added %u-%u got %lu\n", __FILE__, __LINE__, le_keylen(new_le), le_innermost_inserted_vallen(le), node->u.l.leaf_stats.dsize);
 	// the ndata and nkeys remains unchanged

        u_int32_t size = leafentry_memsize(le);
@@ -1610,7 +1370,7 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
            node->local_fingerprint     -= node->rand4fingerprint * toku_le_crc(le);

 	    {
-		u_int32_t oldlen = le_any_vallen(le) + le_any_keylen(le);
+		u_int32_t oldlen = le_innermost_inserted_vallen(le) + le_keylen(le);
 		assert(node->u.l.leaf_stats.dsize >= oldlen);
 		node->u.l.leaf_stats.dsize -= oldlen;
 	    }
@@ -1628,7 +1388,7 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
            node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize;
            node->local_fingerprint += node->rand4fingerprint*toku_le_crc(new_le);

-	    node->u.l.leaf_stats.dsize += le_any_vallen(new_le) + le_any_keylen(new_le);
+	    node->u.l.leaf_stats.dsize += le_innermost_inserted_vallen(new_le) + le_keylen(new_le);
 	    assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
 	    node->u.l.leaf_stats.ndata ++;
 	    // Look at the key to the left and the one to the right.  If both are different then increment nkeys.
@@ -1736,7 +1496,7 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
    case BRT_DELETE_ANY:
    case BRT_ABORT_ANY:
    case BRT_COMMIT_ANY:
-        // Delete all the matches
+        // Apply to all the matches

        r = toku_omt_find_zero(node->u.l.buffer, toku_cmd_leafval_heaviside, &be,
                               &storeddatav, &idx, NULL);
@@ -1745,26 +1505,36 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
        storeddata=storeddatav;

        while (1) {
-            int   vallen   = le_any_vallen(storeddata);
-            void *save_val = toku_memdup(le_any_val(storeddata), vallen);
+            u_int32_t num_leafentries_before = toku_omt_size(node->u.l.buffer);

            r = brt_leaf_apply_cmd_once(node, cmd, idx, storeddata);
            if (r!=0) return r;
+            node->dirty = 1;

-            // Now we must find the next one.
-            DBT valdbt;
-            BRT_CMD_S ncmd = { cmd->type, cmd->xid, .u.id={cmd->u.id.key, toku_fill_dbt(&valdbt, save_val, vallen)}};
-            struct cmd_leafval_heaviside_extra nbe = {t, &ncmd, 1};
-            r = toku_omt_find(node->u.l.buffer, toku_cmd_leafval_heaviside,  &nbe, +1,
-                              &storeddatav, &idx, NULL);
-            
-            toku_free(save_val);
-            if (r!=0) break;
+            { 
+                // Now we must find the next leafentry. 
+                u_int32_t num_leafentries_after = toku_omt_size(node->u.l.buffer); 
+                //idx is the index of the leafentry we just modified.
+                //If the leafentry was deleted, we will have one less leafentry in 
+                //the omt than we started with and the next leafentry will be at the 
+                //same index as the deleted one. Otherwise, the next leafentry will 
+                //be at the next index (+1). 
+                assert(num_leafentries_before   == num_leafentries_after || 
+                       num_leafentries_before-1 == num_leafentries_after); 
+                if (num_leafentries_after==num_leafentries_before) idx++; //Not deleted, advance index.
+
+                assert(idx <= num_leafentries_after);
+                if (idx == num_leafentries_after) break; //Reached the end of the leaf
+                r = toku_omt_fetch(node->u.l.buffer, idx, &storeddatav, NULL); 
+                assert(r==0);
+            } 
            storeddata=storeddatav;
            {   // Continue only if the next record that we found has the same key.
                DBT adbt;
+                u_int32_t keylen;
+                void *keyp = le_key_and_len(storeddata, &keylen);
                if (t->compare_fun(t->db,
-                                   toku_fill_dbt(&adbt, le_any_key(storeddata), le_any_keylen(storeddata)),
+                                   toku_fill_dbt(&adbt, keyp, keylen),
                                   cmd->u.id.key) != 0)
                    break;
            }
@@ -1822,13 +1592,14 @@ static int brt_nonleaf_cmd_once_to_child (BRT t, BRTNODE node, unsigned int chil
 put_in_fifo:

    {
+        //TODO: Determine if we like direct access to type, to key, to val.
        int type = cmd->type;
        DBT *k = cmd->u.id.key;
        DBT *v = cmd->u.id.val;

-	node->local_fingerprint += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, cmd->xid, k->data, k->size, v->data, v->size);
-        int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
-        int r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid);
+	node->local_fingerprint += node->rand4fingerprint * toku_calc_fingerprint_cmd(type, cmd->xids, k->data, k->size, v->data, v->size);
+        int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(cmd->xids);
+        int r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xids);
        assert(r==0);
        node->u.n.n_bytes_in_buffers += diff;
        BNC_NBYTESINBUF(node, childnum) += diff;
@@ -1908,6 +1679,7 @@ static int brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd,

    verify_local_fingerprint_nonleaf(node);
    /* find the right subtree */
+    //TODO: accesses key, val directly
    unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, cmd->u.id.val, t);

    int r = brt_nonleaf_cmd_once_to_child (t, node, childnum, cmd, re_array, did_io);
@@ -1931,6 +1703,7 @@ brt_nonleaf_cmd_many (BRT t, BRTNODE node, BRT_CMD cmd,
        if (delidx == 0 || sendchild[delidx-1] != i) sendchild[delidx++] = i;
    unsigned int i;
    for (i = 0; i+1 < (unsigned int)node->u.n.n_children; i++) {
+        //TODO: Is touching key directly
        int cmp = brt_compare_pivot(t, cmd->u.id.key, 0, node->u.n.childkeys[i]);
        if (cmp > 0) {
            continue;
@@ -1975,6 +1748,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,

    verify_local_fingerprint_nonleaf(node);

+    //TODO: Accessing type directly
    switch (cmd->type) {
    case BRT_INSERT:
    case BRT_DELETE_BOTH:
@@ -2016,13 +1790,13 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) {
 	    int idx = toku_omt_size(a->u.l.buffer);
            int r = toku_omt_insert_at(omta, new_le, idx);
            assert(r==0);
-            a->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size;
+            a->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size; //This should be disksize
            a->local_fingerprint     += a->rand4fingerprint * le_crc;

 	    a->u.l.leaf_stats.ndata++;
 	    maybe_bump_nkeys(a, idx, new_le, +1);
-	    a->u.l.leaf_stats.dsize+= le_any_keylen(le) + le_any_vallen(le);
-	    //printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+le_any_vallen(le), a->u.l.leaf_stats.dsize);
+	    a->u.l.leaf_stats.dsize+= le_keylen(le) + le_innermost_inserted_vallen(le);
+	    //printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_keylen(le)+le_innermost_inserted_vallen(le), a->u.l.leaf_stats.dsize);
        }
        {
 	    maybe_bump_nkeys(b, 0, le, -1);
@@ -2032,8 +1806,8 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) {
            b->local_fingerprint     -= b->rand4fingerprint * le_crc;

 	    b->u.l.leaf_stats.ndata--;
-	    b->u.l.leaf_stats.dsize-= le_any_keylen(le) + le_any_vallen(le);
-	    //printf("%s:%d Subed %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+le_any_vallen(le), b->u.l.leaf_stats.dsize);
+	    b->u.l.leaf_stats.dsize-= le_keylen(le) + le_innermost_inserted_vallen(le);
+	    //printf("%s:%d Subed %u got %lu\n", __FILE__, __LINE__, le_keylen(le)+le_innermost_inserted_vallen(le), b->u.l.leaf_stats.dsize);
 	    assert(b->u.l.leaf_stats.ndata < 1U<<31);
 	    assert(b->u.l.leaf_stats.nkeys < 1U<<31);
 	    assert(b->u.l.leaf_stats.dsize < 1U<<31);
@@ -2075,8 +1849,8 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
            to  ->local_fingerprint     += to->rand4fingerprint * le_crc;

 	    to->u.l.leaf_stats.ndata++;
-	    to->u.l.leaf_stats.dsize+= le_any_keylen(le) + le_any_vallen(le);
-	    //printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+ le_any_vallen(le), to->u.l.leaf_stats.dsize);
+	    to->u.l.leaf_stats.dsize+= le_keylen(le) + le_innermost_inserted_vallen(le);
+	    //printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_keylen(le)+ le_innermost_inserted_vallen(le), to->u.l.leaf_stats.dsize);
        }
        {
 	    maybe_bump_nkeys(from, from_idx, le, -1);
@@ -2086,10 +1860,10 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
            from->local_fingerprint     -= from->rand4fingerprint * le_crc;

 	    from->u.l.leaf_stats.ndata--;
-	    from->u.l.leaf_stats.dsize-= le_any_keylen(le) + le_any_vallen(le);
+	    from->u.l.leaf_stats.dsize-= le_keylen(le) + le_innermost_inserted_vallen(le);
 	    assert(from->u.l.leaf_stats.ndata < 1U<<31);
 	    assert(from->u.l.leaf_stats.nkeys < 1U<<31);
-	    //printf("%s:%d Removed %u  get %lu\n", __FILE__, __LINE__, le_any_keylen(le)+ le_any_vallen(le), from->u.l.leaf_stats.dsize);
+	    //printf("%s:%d Removed %u  get %lu\n", __FILE__, __LINE__, le_keylen(le)+ le_innermost_inserted_vallen(le), from->u.l.leaf_stats.dsize);

            toku_mempool_mfree(&from->u.l.buffer_mempool, 0, le_size);
        }
@@ -2099,9 +1873,9 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
    {
        LEAFENTRY le = fetch_from_buf(a->u.l.buffer, toku_omt_size(a->u.l.buffer)-1);
        if (a->flags&TOKU_DB_DUPSORT) {
-            *splitk = kv_pair_malloc(le_any_key(le), le_any_keylen(le), le_any_val(le), le_any_vallen(le));
+            *splitk = kv_pair_malloc(le_key(le), le_keylen(le), le_innermost_inserted_val(le), le_innermost_inserted_vallen(le));
        } else {
-            *splitk = kv_pair_malloc(le_any_key(le), le_any_keylen(le), 0, 0);
+            *splitk = kv_pair_malloc(le_key(le), le_keylen(le), 0, 0);
        }
    }
    a->dirty = 1; // make them dirty even if nothing actually happened.
@@ -2459,16 +2233,17 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
        //printf("%s:%d Try random_pick, weight=%d \n", __FILE__, __LINE__, BNC_NBYTESINBUF(node, childnum));
        assert(toku_fifo_n_entries(BNC_BUFFER(node,childnum))>0);
        u_int32_t type;
-        TXNID xid;
-        while(0==toku_fifo_peek(BNC_BUFFER(node,childnum), &key, &keylen, &val, &vallen, &type, &xid)) {
+        XIDS xids;
+        while(0==toku_fifo_peek(BNC_BUFFER(node,childnum), &key, &keylen, &val, &vallen, &type, &xids)) {
            DBT hk,hv;

-            BRT_CMD_S brtcmd = { (enum brt_cmd_type)type, xid, .u.id= {toku_fill_dbt(&hk, key, keylen),
+            //TODO: Factor out (into a function) conversion of fifo_entry to message
+            BRT_CMD_S brtcmd = { (enum brt_cmd_type)type, xids, .u.id= {toku_fill_dbt(&hk, key, keylen),
                                                                       toku_fill_dbt(&hv, val, vallen)} };

-            int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD);
+            int n_bytes_removed = (hk.size + hv.size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
            u_int32_t old_from_fingerprint = node->local_fingerprint;
-            u_int32_t delta = toku_calc_fingerprint_cmd(type, xid, key, keylen, val, vallen);
+            u_int32_t delta = toku_calc_fingerprint_cmd(type, xids, key, keylen, val, vallen);
            u_int32_t new_from_fingerprint = old_from_fingerprint - node->rand4fingerprint*delta;

            //printf("%s:%d random_picked\n", __FILE__, __LINE__);
@@ -2668,19 +2443,26 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn)
 // Effect: Insert the key-val pair into brt.
 {
    int r;
+    XIDS message_xids;
    TXNID xid = toku_txn_get_txnid(txn);
    if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
        BYTESTRING keybs  = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
        int need_data = (brt->flags&TOKU_DB_DUPSORT)!=0; // dupsorts don't need the data part
        if (need_data) {
            BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
-            r = toku_logger_save_rollback_cmdinsertboth(txn, xid, toku_cachefile_filenum(brt->cf), keybs, databs);
+            r = toku_logger_save_rollback_cmdinsertboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs);
        } else {
-            r = toku_logger_save_rollback_cmdinsert    (txn, xid, toku_cachefile_filenum(brt->cf), keybs);
+            r = toku_logger_save_rollback_cmdinsert    (txn, toku_cachefile_filenum(brt->cf), keybs);
        }
        if (r!=0) return r;
        r = toku_txn_note_brt(txn, brt);
        if (r!=0) return r;
+        message_xids = toku_txn_get_xids(txn);
+    }
+    else {
+        //Treat this insert as a commit-immediately insert.
+        //It will never be given an abort message (will be truncated on abort).
+        message_xids = xids_get_root_xids();
    }
    TOKULOGGER logger = toku_txn_logger(txn);
    if (logger) {
@@ -2690,7 +2472,7 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn)
        if (r!=0) return r;
    }

-    BRT_CMD_S brtcmd = { BRT_INSERT, xid, .u.id={key,val}};
+    BRT_CMD_S brtcmd = { BRT_INSERT, message_xids, .u.id={key,val}};
    r = toku_brt_root_put_cmd(brt, &brtcmd, logger);
    if (r!=0) return r;
    return r;
@@ -2698,13 +2480,20 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn)

 int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
    int r;
+    XIDS message_xids;
    TXNID xid = toku_txn_get_txnid(txn);
    if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
        BYTESTRING keybs  = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
-        r = toku_logger_save_rollback_cmddelete(txn, xid, toku_cachefile_filenum(brt->cf), keybs);
+        r = toku_logger_save_rollback_cmddelete(txn, toku_cachefile_filenum(brt->cf), keybs);
        if (r!=0) return r;
        r = toku_txn_note_brt(txn, brt);
        if (r!=0) return r;
+        message_xids = toku_txn_get_xids(txn);
+    }
+    else {
+        //Treat this delete as a commit-immediately delete.
+        //It will never be given an abort message (will be truncated on abort).
+        message_xids = xids_get_root_xids();
    }
    TOKULOGGER logger = toku_txn_logger(txn);
    if (logger) {
@@ -2714,7 +2503,7 @@ int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
        if (r!=0) return r;
    }
    DBT val;
-    BRT_CMD_S brtcmd = { BRT_DELETE_ANY, xid, .u.id={key, toku_init_dbt(&val)}};
+    BRT_CMD_S brtcmd = { BRT_DELETE_ANY, message_xids, .u.id={key, toku_init_dbt(&val)}};
    r = toku_brt_root_put_cmd(brt, &brtcmd, logger);
    return r;
 }
@@ -3397,12 +3186,10 @@ static inline void load_dbts_from_omt(BRT_CURSOR c, DBT *key, DBT *val) {
    int r = toku_omt_cursor_current(c->omtcursor, &le);
    assert(r==0);
    if (key) {
-        key->data = le_latest_key(le);
-        key->size = le_latest_keylen(le);
+        key->data = le_latest_key_and_len(le, &key->size);
    }
    if (val) {
-        val->data = le_latest_val(le);
-        val->size = le_latest_vallen(le);
+        val->data = le_latest_val_and_len(le, &val->size);
    }
 }

@@ -3501,7 +3288,7 @@ brt_cursor_not_set(BRT_CURSOR cursor) {
 }

 static int
-pair_leafval_heaviside_le_committed (u_int32_t klen, void *kval,
+pair_leafval_heaviside_le (u_int32_t klen, void *kval,
                           u_int32_t dlen, void *dval,
                           brt_search_t *search) {
    DBT x,y;
@@ -3518,36 +3305,19 @@ pair_leafval_heaviside_le_committed (u_int32_t klen, void *kval,


 static int
-pair_leafval_heaviside_le_both (TXNID xid __attribute__((__unused__)),
-                             u_int32_t klen, void *kval,
-                             u_int32_t clen __attribute__((__unused__)), void *cval __attribute__((__unused__)),
-                             u_int32_t plen, void *pval,
-                             brt_search_t *search) {
-    return pair_leafval_heaviside_le_committed(klen, kval, plen, pval, search);
-}
-
-static int
-pair_leafval_heaviside_le_provdel (TXNID xid __attribute__((__unused__)),
-                                u_int32_t klen, void *kval,
-                                u_int32_t clen, void *cval,
-                                brt_search_t *be) {
-    return pair_leafval_heaviside_le_committed(klen, kval, clen, cval, be);
-}
+heaviside_from_search_t (OMTVALUE lev, void *extra) {
+    LEAFENTRY le=lev;
+    brt_search_t *search = extra;
+    u_int32_t keylen;
+    void* key = le_key_and_len(le, &keylen);
+    u_int32_t vallen;
+    void* val = le_innermost_inserted_val_and_len(le, &vallen);

-static int
-pair_leafval_heaviside_le_provpair (TXNID xid __attribute__((__unused__)),
-                                 u_int32_t klen, void *kval,
-                                 u_int32_t plen, void *pval,
-                                 brt_search_t *be) {
-    return pair_leafval_heaviside_le_committed(klen, kval, plen, pval, be);
+    return pair_leafval_heaviside_le (keylen, key,
+                                      vallen, val,
+                                      search);
 }

-static int heaviside_from_search_t (OMTVALUE lev, void *extra) {
-    LEAFENTRY leafval=lev;
-    brt_search_t *search = extra;
-    LESWITCHCALL(leafval, pair_leafval_heaviside, search);
-    abort(); return 0;
-}

 // This is the only function that associates a brt cursor (and its contained
 // omt cursor) with a brt node (and its associated omt).  This is different
@@ -3577,7 +3347,7 @@ brt_cursor_update(BRT_CURSOR brtcursor) {

 // This is a bottom layer of the search functions.
 static int
-brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_CALLBACK_FUNCTION getf, void *getf_v, enum reactivity *re, BOOL *doprefetch, BRT_CURSOR brtcursor)
+brt_search_leaf_node(BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_CALLBACK_FUNCTION getf, void *getf_v, enum reactivity *re, BOOL *doprefetch, BRT_CURSOR brtcursor)
 {
    // Now we have to convert from brt_search_t to the heaviside function with a direction.  What a pain...

@@ -3601,43 +3371,27 @@ brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_STRADD

    LEAFENTRY le = datav;
    if (le_is_provdel(le)) {
-        TXNID xid = le_any_xid(le);
-        TOKUTXN txn = 0;
-        toku_txn_find_by_xid(brt, xid, &txn);
-
        // Provisionally deleted stuff is gone.
        // So we need to scan in the direction to see if we can find something
        while (1) {
-            // see if the transaction is alive
-            TXNID newxid = le_any_xid(le);
-            if (newxid != xid) {
-                xid = newxid;
-                txn = 0;
-                toku_txn_find_by_xid(brt, xid, &txn);
-            }
-
+#if !TOKU_DO_COMMIT_CMD_DELETE || !TOKU_DO_COMMIT_CMD_DELETE_BOTH 
+//Implicit promotions on delete are not necessary since we have explicit commits for deletes.
+//However, if we ever turn that off, we need to deal with the implicit commits.
+// TODO: (if this error gets hit.. deal with this)
+#error Need to add implicit promotions on deletes
+#endif
            switch (search->direction) {
            case BRT_SEARCH_LEFT:
-                if (txn) {
-                    // printf("xid %llu -> %p\n", (unsigned long long) xid, txn);
                idx++;
-                } else {
-                    // apply a commit message for this leafentry to the node
-                    // printf("apply commit_both %llu\n", (unsigned long long) xid);
-                    DBT key, val;
-                    BRT_CMD_S brtcmd = { BRT_COMMIT_BOTH, xid, .u.id= {toku_fill_dbt(&key, le_latest_key(le), le_latest_keylen(le)),
-                                                                       toku_fill_dbt(&val, le_latest_val(le), le_latest_vallen(le))} };
-                    r = brt_leaf_apply_cmd_once(node, &brtcmd, idx, le);
-                    assert(r == 0);
-                }
                if (idx>=toku_omt_size(node->u.l.buffer)) return DB_NOTFOUND;
                break;
            case BRT_SEARCH_RIGHT:
                if (idx==0) return DB_NOTFOUND;
                idx--;
                break;
+            default:
+                assert(FALSE);
            }
-            if (idx>=toku_omt_size(node->u.l.buffer)) continue;
            r = toku_omt_fetch(node->u.l.buffer, idx, &datav, NULL);
            assert(r==0); // we just validated the index
            le = datav;
@@ -3646,10 +3400,10 @@ brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_STRADD
    }
 got_a_good_value:
    {
-        u_int32_t keylen = le_latest_keylen(le);
-        bytevec   key    = le_latest_key(le);
-        u_int32_t vallen = le_latest_vallen(le);
-        bytevec   val    = le_latest_val(le);
+        u_int32_t keylen;
+        bytevec   key    = le_latest_key_and_len(le, &keylen);
+        u_int32_t vallen;
+        bytevec   val    = le_latest_val_and_len(le, &vallen);

        assert(brtcursor->current_in_omt == FALSE);
        r = getf(keylen, key,
@@ -3816,7 +3570,7 @@ brt_search_node (BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_STRADDLE_C
    if (node->height > 0)
        return brt_search_nonleaf_node(brt, node, search, getf, getf_v, re, doprefetch, brtcursor);
    else {
-        return brt_search_leaf_node(brt, node, search, getf, getf_v, re, doprefetch, brtcursor);
+        return brt_search_leaf_node(node, search, getf, getf_v, re, doprefetch, brtcursor);
    }
 }

@@ -4056,10 +3810,10 @@ brt_cursor_shortcut (BRT_CURSOR cursor, int direction, u_int32_t limit, BRT_GET_
            assert(r==0);

            if (!le_is_provdel(le)) {
-                u_int32_t keylen = le_latest_keylen(le);
-                bytevec   key    = le_latest_key(le);
-                u_int32_t vallen = le_latest_vallen(le);
-                bytevec   val    = le_latest_val(le);
+                u_int32_t keylen;
+                bytevec   key    = le_latest_key_and_len(le, &keylen);
+                u_int32_t vallen;
+                bytevec   val    = le_latest_val_and_len(le, &vallen);

                r = getf(keylen, key, vallen, val, getf_v);
                if (r==0) {
@@ -4543,14 +4297,21 @@ toku_brt_lookup (BRT brt, DBT *k, DBT *v, BRT_GET_CALLBACK_FUNCTION getf, void *
 int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
    //{ unsigned i; printf("del %p keylen=%d key={", brt->db, key->size); for(i=0; i<key->size; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", val->size); for(i=0; i<val->size; i++) printf("%d,", ((char*)val->data)[i]); printf("}\n"); }
    int r;
+    XIDS message_xids;
    TXNID xid = toku_txn_get_txnid(txn);
    if (txn && (brt->h->txnid_that_created_or_locked_when_empty != xid)) {
        BYTESTRING keybs  = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
        BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
-        r = toku_logger_save_rollback_cmddeleteboth(txn, xid, toku_cachefile_filenum(brt->cf), keybs, databs);
+        r = toku_logger_save_rollback_cmddeleteboth(txn, toku_cachefile_filenum(brt->cf), keybs, databs);
        if (r!=0) return r;
        r = toku_txn_note_brt(txn, brt);
        if (r!=0) return r;
+        message_xids = toku_txn_get_xids(txn);
+    }
+    else {
+        //Treat this delete as a commit-immediately delete.
+        //It will never be given an abort message (will be truncated on abort).
+        message_xids = xids_get_root_xids();
    }
    TOKULOGGER logger = toku_txn_logger(txn);
    if (logger) {
@@ -4560,7 +4321,7 @@ int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
        if (r!=0) return r;
    }

-    BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, xid, .u.id={key,val}};
+    BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, message_xids, .u.id={key,val}};
    r = toku_brt_root_put_cmd(brt, &brtcmd, logger);
    return r;
 }
@@ -4754,10 +4515,10 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo
 			    e->ndata, e->nkeys, e->dsize, (int)e->exact);
 		}
 		fprintf(file, "\n");
-                FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid,
+                FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xids,
                                  {
                                      data=data; datalen=datalen; keylen=keylen;
-                                      fprintf(file, "%*s xid=%"PRIu64" %u (type=%d)\n", depth+2, "", xid, (unsigned)toku_dtoh32(*(int*)key), type);
+                                      fprintf(file, "%*s xid=%"PRIu64" %u (type=%d)\n", depth+2, "", xids_get_innermost_xid(xids), (unsigned)toku_dtoh32(*(int*)key), type);
                                      //assert(strlen((char*)key)+1==keylen);
                                      //assert(strlen((char*)data)+1==datalen);
                                  });
@@ -4794,7 +4555,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo
            print_leafentry(file, v);
            fprintf(file, "\n");
        }
-        //             printf(" (%d)%u ", len, *(int*)le_any_key(data)));
+        //             printf(" (%d)%u ", len, *(int*)le_key(data)));
        fprintf(file, "\n");
    }
    r = toku_cachetable_unpin(brt->cf, blocknum, fullhash, CACHETABLE_CLEAN, 0);

--- a/newbrt/brt_msg.c
+++ b/newbrt/brt_msg.c
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+#include <toku_portability.h>
+#include "brttypes.h"
+#include "xids.h"
+#include "fifo_msg.h"
+#include "brt_msg.h"
+
+//BRT_MSG internals are in host order
+//XIDS are not 'internal' to BRT_MSG
+
+// Populate an ephemeral brt message from caller-supplied pieces.
+// The key, val and xids pointers are stored as given; nothing is copied.
+void
+brt_msg_from_dbts(BRT_MSG brt_msg, DBT *key, DBT *val, XIDS xids, brt_msg_type type) {
+    brt_msg->type     = type;
+    brt_msg->xids     = xids;
+    brt_msg->u.id.key = key;
+    brt_msg->u.id.val = val;
+}
+
+//No conversion (from disk to host) is necessary
+//Accessor functions for fifo return host order bytes.
+#if 0
+// Disabled: would fill a BRT_MSG from a stored FIFO_MSG via the fifo_msg accessors.
+// NOTE(review): the flat keylen/vallen/key/val fields assigned here differ from the
+// u.id.key / u.id.val DBT pointers used by the accessors in this file -- confirm the
+// intended message layout before re-enabling.
+void
+brt_msg_from_fifo_msg(BRT_MSG brt_msg, FIFO_MSG fifo_msg) {
+    brt_msg->keylen = fifo_msg_get_keylen(fifo_msg);
+    brt_msg->vallen = fifo_msg_get_vallen(fifo_msg);
+    brt_msg->key    = fifo_msg_get_key(fifo_msg);
+    brt_msg->val    = fifo_msg_get_val(fifo_msg);
+    brt_msg->xids   = fifo_msg_get_xids(fifo_msg);
+    brt_msg->type   = fifo_msg_get_type(fifo_msg);
+}
+#endif
+
+// Length of the message key, taken from the size field of the key DBT.
+u_int32_t
+brt_msg_get_keylen(BRT_MSG brt_msg) {
+    return brt_msg->u.id.key->size;
+}
+
+// Length of the message value, taken from the size field of the val DBT.
+u_int32_t
+brt_msg_get_vallen(BRT_MSG brt_msg) {
+    return brt_msg->u.id.val->size;
+}
+
+// The xids pointer stored in the message (not a copy).
+XIDS
+brt_msg_get_xids(BRT_MSG brt_msg) {
+    return brt_msg->xids;
+}
+
+// Pointer to the key bytes, i.e. the data field of the key DBT.
+void *
+brt_msg_get_key(BRT_MSG brt_msg) {
+    return brt_msg->u.id.key->data;
+}
+
+// Pointer to the value bytes, i.e. the data field of the val DBT.
+void *
+brt_msg_get_val(BRT_MSG brt_msg) {
+    return brt_msg->u.id.val->data;
+}
+
+// The message type recorded in the message.
+brt_msg_type
+brt_msg_get_type(BRT_MSG brt_msg) {
+    return brt_msg->type;
+}
+
--- a/newbrt/brt_msg.h
+++ b/newbrt/brt_msg.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+/* The purpose of this file is to provide access to the brt_msg,
+ * which is the ephemeral version of the fifo_msg.
+ */
+
+
+
+
+#ifndef BRT_MSG_H
+#define BRT_MSG_H
+
+// BRT_MSG, XIDS, FIFO_MSG and brt_msg_type are declared in brttypes.h;
+// include it here so this header does not depend on the includer's ordering.
+#include "brttypes.h"
+
+// Accessors: each returns the corresponding field of the message.
+// Key/value lengths and pointers come from the message's key and val DBTs.
+u_int32_t brt_msg_get_keylen(BRT_MSG brt_msg);
+
+u_int32_t brt_msg_get_vallen(BRT_MSG brt_msg);
+
+XIDS brt_msg_get_xids(BRT_MSG brt_msg);
+
+void * brt_msg_get_key(BRT_MSG brt_msg);
+
+void * brt_msg_get_val(BRT_MSG brt_msg);
+
+brt_msg_type brt_msg_get_type(BRT_MSG brt_msg);
+
+// NOTE(review): the definition of this function in brt_msg.c is currently
+// compiled out with #if 0, so any call will fail at link time.
+void brt_msg_from_fifo_msg(BRT_MSG brt_msg, FIFO_MSG fifo_msg);
+
+// Fill brt_msg from the given DBTs, xids and type (pointers are stored, not copied).
+void brt_msg_from_dbts(BRT_MSG brt_msg, DBT *key, DBT *val, XIDS xids, brt_msg_type type);
+
+#endif
+
--- a/newbrt/brtdump.c
+++ b/newbrt/brtdump.c
@@ -100,7 +100,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
 	    printf("   child %d: %" PRId64 "\n", i, BNC_BLOCKNUM(n, i).b);
 	    printf("   buffer contains %u bytes (%d items)\n", BNC_NBYTESINBUF(n, i), toku_fifo_n_entries(BNC_BUFFER(n,i)));
 	    if (dump_data) {
-		FIFO_ITERATE(BNC_BUFFER(n,i), key, keylen, data, datalen, typ, xid,
+		FIFO_ITERATE(BNC_BUFFER(n,i), key, keylen, data, datalen, typ, xids,
 			     {
 				 printf("    TYPE=");
 				 switch ((enum brt_cmd_type)typ) {
@@ -115,7 +115,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
 				 }
 				 printf("HUH?");
 			     ok:
-				 printf(" xid=%"PRIu64" ", xid);
+				 printf(" xid=%"PRIu64" ", xids_get_innermost_xid(xids));
 				 print_item(key, keylen);
 				 if (datalen>0) {
 				     printf(" ");

--- a/newbrt/brttypes.h
+++ b/newbrt/brttypes.h
@@ -90,10 +90,12 @@ enum brt_cmd_type {
    BRT_COMMIT_BOTH = 7
 };

+typedef struct xids_t *XIDS;
+typedef struct fifo_msg_t *FIFO_MSG;
 /* tree commands */
 struct brt_cmd {
    enum brt_cmd_type type;
-    TXNID xid;
+    XIDS         xids;
    union {
        /* insert or delete */
        struct brt_cmd_insert_delete {
@@ -104,17 +106,15 @@ struct brt_cmd {
 };
 typedef struct brt_cmd BRT_CMD_S, *BRT_CMD;

-#if !defined(__cplusplus)
-static inline
-BRT_CMD_S
-build_brt_cmd (enum brt_cmd_type type, TXNID xid, DBT *key, DBT *val) {
-    BRT_CMD_S result = {type, xid, .u.id={key,val}};
-    return result;
-}
-#endif
+// TODO: replace brt_cmd_type when ready
+typedef enum brt_cmd_type brt_msg_type;
+// Message sent into brt to implement command (insert, delete, etc.)
+// This structure supports nested transactions, and obsoletes brt_cmd.
+typedef struct brt_cmd BRT_MSG_S, *BRT_MSG;

-#define UU(x) x __attribute__((__unused__))

-typedef struct leafentry *LEAFENTRY;
+
+#define UU(x) x __attribute__((__unused__))

 #endif
+
--- a/newbrt/fifo.c
+++ b/newbrt/fifo.c
@@ -3,6 +3,7 @@
 #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

 #include "includes.h"
+#include "xids.h"

 struct fifo {
    int n_items_in_fifo;
@@ -22,7 +23,9 @@ static void fifo_init(struct fifo *fifo) {
 }

+// Total bytes occupied by one fifo entry: the fixed struct fifo_entry header,
+// the key and value bytes, and the xids.  struct fifo_entry already embeds one
+// XIDS_S, so sizeof(XIDS_S) is subtracted to avoid counting it twice.
 static int fifo_entry_size(struct fifo_entry *entry) {
-    return sizeof (struct fifo_entry) + entry->keylen + entry->vallen;
+    return sizeof (struct fifo_entry) + entry->keylen + entry->vallen
+                  + xids_get_size(&entry->xids_s)
+                  - sizeof(XIDS_S); //Prevent double counting from fifo_entry+xids_get_size
 }

 static struct fifo_entry *fifo_peek(struct fifo *fifo) {
@@ -59,8 +62,11 @@ static int next_power_of_two (int n) {
    return r;
 }

-int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, TXNID xid) {
-    int need_space_here = sizeof(struct fifo_entry) + keylen + datalen;
+int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *data, unsigned int datalen, int type, XIDS xids) {
+    int need_space_here = sizeof(struct fifo_entry)
+                          + keylen + datalen
+                          + xids_get_size(xids)
+                          - sizeof(XIDS_S); //Prevent double counting
    int need_space_total = fifo->memory_used+need_space_here;
    if (fifo->memory == NULL) {
 	fifo->memory_size = next_power_of_two(need_space_total);
@@ -88,30 +94,32 @@ int toku_fifo_enq(FIFO fifo, const void *key, unsigned int keylen, const void *d
    }
    struct fifo_entry *entry = (struct fifo_entry *)(fifo->memory + fifo->memory_start + fifo->memory_used);
    entry->type = (unsigned char)type;
-    entry->xid  = xid;
+    xids_cpy(&entry->xids_s, xids);
    entry->keylen = keylen;
-    memcpy(entry->key, key, keylen);
+    unsigned char *e_key = xids_get_end_of_array(&entry->xids_s);
+    memcpy(e_key, key, keylen);
    entry->vallen = datalen;
-    memcpy(entry->key + keylen, data, datalen);
+    memcpy(e_key + keylen, data, datalen);
    fifo->n_items_in_fifo++;
    fifo->memory_used += need_space_here;
    return 0;
 }

+// Convenience wrapper: enqueue a BRT_CMD by unpacking its key DBT, val DBT,
+// type and xids into a toku_fifo_enq() call.  Returns toku_fifo_enq()'s result.
 int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_CMD cmd) {
-    return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->xid);
+    return toku_fifo_enq(fifo, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size, cmd->type, cmd->xids);
 }

 /* peek at the head (the oldest entry) of the fifo */
+// Fills the out-parameters from the entry returned by fifo_peek().
+// *key, *data and *xids point into the entry itself; nothing is copied.
+// Returns -1 when fifo_peek() yields no entry, 0 otherwise.
-int toku_fifo_peek(FIFO fifo, bytevec *key, unsigned int *keylen, bytevec *data, unsigned int *datalen, u_int32_t *type, TXNID *xid) {
+int toku_fifo_peek(FIFO fifo, bytevec *key, unsigned int *keylen, bytevec *data, unsigned int *datalen, u_int32_t *type, XIDS *xids) {
     struct fifo_entry *entry = fifo_peek(fifo);
     if (entry == 0) return -1;
-    *key = entry->key;
+    // The key bytes start right after the entry's xids array; the value follows the key.
+    unsigned char *e_key = xids_get_end_of_array(&entry->xids_s);
+    *key = e_key;
     *keylen = entry->keylen;
-    *data = entry->key + entry->keylen;
+    *data = e_key + entry->keylen;
     *datalen = entry->vallen;
     *type = entry->type;
-    *xid  = entry->xid;
+    *xids  = &entry->xids_s;
     return 0;
 }

@@ -120,7 +128,7 @@ int toku_fifo_peek_cmdstruct (FIFO fifo, BRT_CMD cmd, DBT*key, DBT*data) {
    u_int32_t type;
    bytevec keyb,datab;
    unsigned int keylen,datalen;
-    int r = toku_fifo_peek(fifo, &keyb, &keylen, &datab, &datalen, &type, &cmd->xid);
+    int r = toku_fifo_peek(fifo, &keyb, &keylen, &datab, &datalen, &type, &cmd->xids);
    if (r!=0) return r;
    cmd->type=(enum brt_cmd_type)type;
    toku_fill_dbt(key, keyb, keylen);
@@ -151,10 +159,10 @@ struct fifo_entry * toku_fifo_iterate_internal_get_entry(FIFO fifo, int off) {
    return (struct fifo_entry *)(fifo->memory + off);
 }

-void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, TXNID xid, void*), void *arg) {
+// Calls f once for every entry visited by FIFO_ITERATE, passing that entry's
+// key, key length, data, data length, type and xids, plus the caller's arg.
+void toku_fifo_iterate (FIFO fifo, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, XIDS xids, void*), void *arg) {
     FIFO_ITERATE(fifo,
-		 key, keylen, data, datalen, type, xid,
-		 f(key,keylen,data,datalen,type,xid, arg));
+		 key, keylen, data, datalen, type, xids,
+		 f(key,keylen,data,datalen,type,xids, arg));
 }

 unsigned long toku_fifo_memory_size(FIFO fifo) {

--- a/newbrt/fifo.h
+++ b/newbrt/fifo.h
@@ -5,13 +5,14 @@
 #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

 #include "brttypes.h"
+#include "xids-internal.h"
+#include "xids.h"

 struct fifo_entry {
     unsigned int keylen;
     unsigned int vallen;
     unsigned char type;
-    TXNID xid;
-    unsigned char key[];
+    // Embedded xids.  The key bytes, then the value bytes, are stored starting at
+    // xids_get_end_of_array(&xids_s) (see toku_fifo_enq and FIFO_ITERATE), so the
+    // entry's real size is computed with xids_get_size() rather than sizeof.
+    XIDS_S        xids_s;
 };

 typedef struct fifo *FIFO;
@@ -20,8 +21,8 @@ int toku_fifo_create(FIFO *);
 void toku_fifo_free(FIFO *);
 int toku_fifo_n_entries(FIFO);
 int toku_fifo_enq_cmdstruct (FIFO fifo, const BRT_CMD cmd);
-int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, TXNID xid);
-int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, TXNID *xid);
+int toku_fifo_enq (FIFO, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type, XIDS xids);
+int toku_fifo_peek (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, XIDS *xids);
 int toku_fifo_peek_cmdstruct (FIFO, BRT_CMD, DBT*, DBT*); // fill in the BRT_CMD, using the two DBTs for the DBT part.
 int toku_fifo_deq(FIFO);

@@ -30,20 +31,20 @@ unsigned long toku_fifo_memory_size(FIFO); // return how much memory the fifo us
 //These two are problematic, since I don't want to malloc() the bytevecs, but dequeueing the fifo frees the memory.
 //int toku_fifo_peek_deq (FIFO, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, u_int32_t *type, TXNID *xid);
 //int toku_fifo_peek_deq_cmdstruct (FIFO, BRT_CMD, DBT*, DBT*); // fill in the BRT_CMD, using the two DBTs for the DBT part.
-void toku_fifo_iterate (FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, TXNID xid, void*), void*);
+void toku_fifo_iterate (FIFO, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, XIDS xids, void*), void*);

-#define FIFO_ITERATE(fifo,keyvar,keylenvar,datavar,datalenvar,typevar,xidvar,body) do {    \
+// Iterate over every entry of fifo, running body once per entry.  The macro
+// declares, for use in body: keyvar/keylenvar (key bytes and length),
+// datavar/datalenvar (value bytes and length), typevar (entry type) and
+// xidsvar (pointer to the entry's embedded xids).  The key starts at the end of
+// the xids array and the value follows the key.
+// The data pointer is computed through a byte pointer because arithmetic
+// directly on a void pointer is a GNU extension, not ISO C.
+#define FIFO_ITERATE(fifo,keyvar,keylenvar,datavar,datalenvar,typevar,xidsvar,body) do {    \
   int fifo_iterate_off;                                                                    \
   for (fifo_iterate_off = toku_fifo_iterate_internal_start(fifo);                          \
        toku_fifo_iterate_internal_has_more(fifo, fifo_iterate_off);			   \
        fifo_iterate_off = toku_fifo_iterate_internal_next(fifo, fifo_iterate_off)) {       \
       struct fifo_entry *e = toku_fifo_iterate_internal_get_entry(fifo, fifo_iterate_off); \
-      bytevec keyvar = e->key;                                                             \
       ITEMLEN keylenvar = e->keylen;                                                       \
-      bytevec datavar = e->key + e->keylen;                                           \
       ITEMLEN datalenvar = e->vallen;                                                 \
       int     typevar = e->type;                                                      \
-      TXNID   xidvar = e->xid;                                                        \
+      XIDS    xidsvar = &e->xids_s;                                                   \
+      bytevec keyvar  = xids_get_end_of_array(xidsvar);                               \
+      bytevec datavar = (const unsigned char *)keyvar + e->keylen;                    \
       body;                                                                           \
   } } while (0)


--- a/newbrt/fifo_msg.c
+++ b/newbrt/fifo_msg.c
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+/* Purpose of this file is to define and handle the fifo_msg, which
+ * is the stored format of a brt_msg.
+ *
+ * Note, when translating from fifo_msg to brt_msg, the brt_msg
+ * will be created with a pointer into the xids in the fifo_msg.
+ * (The xids will not be embedded in the brt_msg.)  This means
+ * that a valid xids struct must be embedded in the fifo_msg.
+ *
+ * NOTE: fifo_msg is stored in memory and on disk in same format.
+ *       fifo_msg is stored in same byte order both in-memory
+ *       and on-disk.  Accessors are responsible for transposition
+ *       to host order.
+ */
+
+
+#include <string.h>
+
+#include <toku_portability.h>
+#include "brttypes.h"
+#include "xids.h"
+#include "xids-internal.h"
+#include "brt_msg.h"
+#include "fifo_msg.h"
+#include <toku_htod.h>
+
+
+// xids_and_key_and_val field is XIDS_S followed by key
+// followed by value.
+
+// Stored form of a brt message: a fixed header followed by one variable-length
+// byte region holding the xids, then the key bytes, then the value bytes.
+struct fifo_msg_t {
+    u_int32_t keylen;   // key length; written with toku_htod32, read back with toku_dtoh32
+    u_int32_t vallen;   // value length; same byte-order handling as keylen
+    u_int8_t  type;     // message type, stored in a single byte
+    // u_int8_t pad[7]; // force 64-bit alignment if needed ???
+    // NOTE(review): without the pad above, the embedded xids may not start on an
+    // aligned boundary -- confirm XIDS_S tolerates that.
+    u_int8_t  xids_and_key_and_val[];	// undifferentiated bytes
+};
+
+
+// Key length of the stored message, converted with toku_dtoh32.
+u_int32_t
+fifo_msg_get_keylen(FIFO_MSG fifo_msg) {
+    return toku_dtoh32(fifo_msg->keylen);
+}
+
+// Value length of the stored message, converted with toku_dtoh32.
+u_int32_t
+fifo_msg_get_vallen(FIFO_MSG fifo_msg) {
+    return toku_dtoh32(fifo_msg->vallen);
+}
+
+// The xids embedded at the very start of the message's trailing byte region.
+// The returned pointer aliases the message; it is not a copy.
+XIDS
+fifo_msg_get_xids(FIFO_MSG fifo_msg) {
+    return (XIDS) fifo_msg->xids_and_key_and_val;
+}
+
+
+// Size, as reported by xids_get_size(), of the xids embedded in the message.
+static u_int32_t
+fifo_msg_get_xids_size(FIFO_MSG fifo_msg) {
+    return xids_get_size(fifo_msg_get_xids(fifo_msg));
+}
+
+
+// Pointer to the key bytes, which begin immediately after the embedded xids
+// at the head of xids_and_key_and_val.
+void *
+fifo_msg_get_key(FIFO_MSG fifo_msg) {
+    u_int32_t xidslen = fifo_msg_get_xids_size(fifo_msg);
+    // Offset through the u_int8_t array itself: arithmetic on a void pointer is a
+    // GNU extension, not ISO C.
+    u_int8_t *key = fifo_msg->xids_and_key_and_val + xidslen;
+    return key;
+}
+
+// Pointer to the value bytes, which begin immediately after the key bytes.
+void *
+fifo_msg_get_val(FIFO_MSG fifo_msg) {
+    // Use a byte pointer for the offset: arithmetic on a void pointer is a GNU
+    // extension, not ISO C.
+    u_int8_t *key    = fifo_msg_get_key(fifo_msg);
+    u_int32_t keylen = fifo_msg_get_keylen(fifo_msg);
+    return key + keylen;
+}
+
+// Message type recorded in the stored message's one-byte type field.
+brt_msg_type
+fifo_msg_get_type(FIFO_MSG fifo_msg) {
+    return fifo_msg->type;
+}
+
+
+// Total size of a stored fifo msg: the fixed fifo_msg_t header plus the
+// key, value and xids payloads.
+u_int32_t
+fifo_msg_get_size(FIFO_MSG fifo_msg) {
+    u_int32_t payload = fifo_msg_get_keylen(fifo_msg)
+                      + fifo_msg_get_vallen(fifo_msg)
+                      + fifo_msg_get_xids_size(fifo_msg);
+    return payload + sizeof(*fifo_msg);
+}
+
+// Bytes needed to store the given brt_msg as a fifo_msg: the fixed
+// fifo_msg_t header plus the key, value and xids payloads.
+u_int32_t
+fifo_msg_get_size_required(BRT_MSG brt_msg) {
+    u_int32_t payload = brt_msg_get_keylen(brt_msg)
+                      + brt_msg_get_vallen(brt_msg)
+                      + xids_get_size(brt_msg_get_xids(brt_msg));
+    return payload + sizeof(struct fifo_msg_t);
+}
+
+// Serialize brt_msg into the caller-provided fifo_msg storage: header fields
+// first, then the xids, key bytes and value bytes.
+// NOTE(review): presumably the storage must hold at least
+// fifo_msg_get_size_required(brt_msg) bytes -- nothing here checks it.
+void
+fifo_msg_from_brt_msg(FIFO_MSG fifo_msg, BRT_MSG brt_msg) {
+    u_int32_t keylen_host = brt_msg_get_keylen(brt_msg);
+    u_int32_t vallen_host = brt_msg_get_vallen(brt_msg);
+    fifo_msg->type   = brt_msg_get_type(brt_msg);
+    // Lengths are stored via toku_htod32; the fifo_msg accessors read them back
+    // with toku_dtoh32.
+    fifo_msg->keylen = toku_htod32(keylen_host);
+    fifo_msg->vallen = toku_htod32(vallen_host);
+    //Copy XIDS
+    // This copy must come before the key/val copies: fifo_msg_get_key() finds the
+    // key by reading the size of the xids already stored in fifo_msg.
+    XIDS xids         = brt_msg_get_xids(brt_msg);
+    XIDS xids_target  = fifo_msg_get_xids(fifo_msg);
+    u_int32_t xidslen = xids_get_size(xids);
+    memcpy(xids_target, xids, xidslen);
+    //Copy Key
+    void *key         = brt_msg_get_key(brt_msg);
+    void *key_target  = fifo_msg_get_key(fifo_msg);
+    memcpy(key_target, key, keylen_host);
+    //Copy Val
+    // fifo_msg_get_val() also relies on the keylen stored above.
+    void *val         = brt_msg_get_val(brt_msg);
+    void *val_target  = fifo_msg_get_val(fifo_msg);
+    memcpy(val_target, val, vallen_host);
+}
+
--- a/newbrt/fifo_msg.h
+++ b/newbrt/fifo_msg.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+/* The purpose of this file is to provide access to the fifo_msg,
+ * which is the stored representation of the brt_msg.
+ * 
+ * NOTE: Accessor functions return all values in host byte order.
+ */
+
+
+
+
+#ifndef FIFO_MSG_H
+#define FIFO_MSG_H
+
+// FIFO_MSG, BRT_MSG, XIDS and brt_msg_type are declared in brttypes.h;
+// include it here so this header does not depend on the includer's ordering.
+#include "brttypes.h"
+
+u_int32_t fifo_msg_get_keylen(FIFO_MSG fifo_msg);
+
+u_int32_t fifo_msg_get_vallen(FIFO_MSG fifo_msg);
+
+// Returns a pointer to the xids embedded in fifo_msg (not a copy).
+XIDS fifo_msg_get_xids(FIFO_MSG fifo_msg);
+
+// Key bytes follow the embedded xids; value bytes follow the key.
+void * fifo_msg_get_key(FIFO_MSG fifo_msg);
+
+void * fifo_msg_get_val(FIFO_MSG fifo_msg);
+
+brt_msg_type fifo_msg_get_type(FIFO_MSG fifo_msg);
+
+// Total size of a stored fifo_msg (header + key + value + xids).
+u_int32_t fifo_msg_get_size(FIFO_MSG fifo_msg);
+
+// Return number of bytes required for a fifo_msg created from
+// the given brt_msg
+u_int32_t fifo_msg_get_size_required(BRT_MSG brt_msg);
+
+// Fill the caller-provided fifo_msg storage from brt_msg.
+// Defined in fifo_msg.c.
+void fifo_msg_from_brt_msg(FIFO_MSG fifo_msg, BRT_MSG brt_msg);
+
+#endif
+
--- a/newbrt/fingerprint.c
+++ b/newbrt/fingerprint.c
@@ -28,15 +28,12 @@ u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp) {
 }
 #endif

-u_int32_t toku_calc_fingerprint_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) {
+// Fingerprint of one command: an x1764 state is fed the type (truncated to one
+// byte), then the xids, then the key/value pair, and the finished value is returned.
+u_int32_t toku_calc_fingerprint_cmd (u_int32_t type, XIDS xids, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) {
     unsigned char type_c = (unsigned char)type;
-    unsigned int a = toku_htod32(xid>>32);
-    unsigned int b = toku_htod32(xid&0xffffffff);
     struct x1764 mm;
     x1764_init(&mm);
     x1764_add(&mm, &type_c, 1);
-    x1764_add(&mm, &a, 4);
-    x1764_add(&mm, &b, 4);
+    toku_calc_more_murmur_xids(&mm, xids);
     toku_calc_more_murmur_kvpair(&mm, key, keylen, val, vallen);
     return x1764_finish(&mm);
 }
--- a/newbrt/leafentry.c
+++ b/newbrt/leafentry.c
@@ -8,8 +8,10 @@ u_int32_t toku_le_crc(LEAFENTRY v) {
    return x1764_memory(v, leafentry_memsize(v));
 }

+
+//TODO: #1125 delete function
 static void *
-le_malloc(OMT omt, struct mempool *mp, size_t size, void **maybe_free)
+le10_malloc(OMT omt, struct mempool *mp, size_t size, void **maybe_free)
 {
    if (omt)
 	return mempool_malloc_from_omt(omt, mp, size, maybe_free);
@@ -17,11 +19,12 @@ le_malloc(OMT omt, struct mempool *mp, size_t size, void **maybe_free)
 	return toku_malloc(size);
 }

+//Constructors for version 10 leafentries, possibly needed for upgrades.
 int
-le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result,
+le10_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result,
 	      OMT omt, struct mempool *mp, void **maybe_free) {
    size_t size = 9+klen+dlen;
-    unsigned char *lec=le_malloc(omt, mp, size, maybe_free);
+    unsigned char *lec=le10_malloc(omt, mp, size, maybe_free);
    assert(lec);
    lec[0] = LE_COMMITTED;
    putint(lec+1, klen);
@@ -34,11 +37,12 @@ le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t
    return 0;
 }

-int le_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval, u_int32_t plen, void* pval,
+int
+le10_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval, u_int32_t plen, void* pval,
 	     u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result,
 	     OMT omt, struct mempool *mp, void **maybe_free) {
    size_t size = 1+8+4*3+klen+clen+plen;
-    unsigned char *lec=le_malloc(omt, mp, size, maybe_free);
+    unsigned char *lec=le10_malloc(omt, mp, size, maybe_free);
    assert(lec);
    lec[0] = LE_BOTH;
    putint64(lec+1,          xid);
@@ -56,11 +60,11 @@ int le_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval,
 }

 int
-le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
+le10_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
 	    u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
 	    OMT omt, struct mempool *mp, void **maybe_free) {
    size_t size = 1 + 8 + 2*4 + klen + dlen;
-    unsigned char *lec= le_malloc(omt, mp, size, maybe_free);
+    unsigned char *lec= le10_malloc(omt, mp, size, maybe_free);
    assert(lec);
    lec[0] = LE_PROVDEL;
    putint64(lec+1,          xid);
@@ -75,10 +79,10 @@ le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
 }

 int
-le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
+le10_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
 	     OMT omt, struct mempool *mp, void **maybe_free) {
    size_t size = 1 + 8 + 2*4 + klen + plen;
-    unsigned char *lec= le_malloc(omt, mp, size, maybe_free);
+    unsigned char *lec= le10_malloc(omt, mp, size, maybe_free);
    assert(lec);
    lec[0] = LE_PROVPAIR;
    putint64(lec+1,          xid);
@@ -92,370 +96,87 @@ le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval,
    return 0;
 }

-static u_int32_t memsize_le_committed (u_int32_t keylen, void *key __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t memsize_le10_committed (u_int32_t keylen, void *key __attribute__((__unused__)),
 				       u_int32_t vallen, void *val __attribute__((__unused__))) {
    return 1+ 2*4 + keylen + vallen;
 }

-static u_int32_t memsize_le_both (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t memsize_le10_both (TXNID txnid __attribute__((__unused__)),
 				  u_int32_t klen, void *kval __attribute__((__unused__)),
 				  u_int32_t clen, void *cval __attribute__((__unused__)),
 				  u_int32_t plen, void *pval __attribute__((__unused__))) {
    return 1 + 8 + 4*3 + klen + clen + plen;
 }

-static u_int32_t memsize_le_provdel (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t memsize_le10_provdel (TXNID txnid __attribute__((__unused__)),
 				     u_int32_t klen, void *kval __attribute__((__unused__)),
 				     u_int32_t clen, void *cval __attribute__((__unused__))) {
    return 1 + 8 + 4*2 + klen + clen;
 }

-static u_int32_t memsize_le_provpair (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t memsize_le10_provpair (TXNID txnid __attribute__((__unused__)),
 				     u_int32_t klen, void *kval __attribute__((__unused__)),
 				     u_int32_t plen, void *pval __attribute__((__unused__))) {
    return 1 + 8 + 4*2 + klen + plen;
 }

-u_int32_t leafentry_memsize (LEAFENTRY le) {
+u_int32_t leafentry_memsize_10 (LEAFENTRY le) {
    LESWITCHCALL(le, memsize);
    abort(); return 0;  // make certain compilers happy
 }

-static u_int32_t disksize_le_committed (u_int32_t keylen, void *key __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t disksize_le10_committed (u_int32_t keylen, void *key __attribute__((__unused__)),
 				       u_int32_t vallen, void *val __attribute__((__unused__))) {
    return 1 + 4 + 4 + keylen + vallen;
 }

-static u_int32_t disksize_le_both (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t disksize_le10_both (TXNID txnid __attribute__((__unused__)),
 				  u_int32_t klen, void *kval __attribute__((__unused__)),
 				  u_int32_t clen, void *cval __attribute__((__unused__)),
 				  u_int32_t plen, void *pval __attribute__((__unused__))) {
    return 1 + 8 + 4*3 + klen + clen + plen;
 }

-static u_int32_t disksize_le_provdel (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t disksize_le10_provdel (TXNID txnid __attribute__((__unused__)),
 				     u_int32_t klen, void *kval __attribute__((__unused__)),
 				     u_int32_t clen, void *cval __attribute__((__unused__))) {
    return 1 + 8 + 4 + 4 + klen + clen;
 }

-static u_int32_t disksize_le_provpair (TXNID txnid __attribute__((__unused__)),
+//TODO: #1125 FUNCTION NEEDED for upgrading?
+static u_int32_t disksize_le10_provpair (TXNID txnid __attribute__((__unused__)),
 				       u_int32_t klen, void *kval __attribute__((__unused__)),
 				       u_int32_t plen, void *pval __attribute__((__unused__))) {
    return 1 + 8 + 4 + 4 + klen + plen;
 }


+//TODO: #1125 FUNCTION NEEDED for upgrading?
 static u_int32_t
-leafentry_disksize_internal (LEAFENTRY le) {
+le10_disksize_internal (LEAFENTRY le) {
    LESWITCHCALL(le, disksize);
    abort(); return 0;  // make certain compilers happy
 }

-u_int32_t leafentry_disksize (LEAFENTRY le) {
-    u_int32_t d = leafentry_disksize_internal(le);
+//TODO: #1125 FUNCTION NEEDED for upgrading?  Returns the value computed by le10_disksize_internal(le).
+u_int32_t le10_disksize (LEAFENTRY le) {
+    u_int32_t d = le10_disksize_internal(le);
 #if 0
    // this computation is currently identical to the _disksize_internal
-    u_int32_t m = leafentry_memsize(le);
+    u_int32_t m = leafentry_memsize_10(le);
    assert(m==d);
 #endif
    return d;
 }

-u_int32_t toku_logsizeof_LEAFENTRY (LEAFENTRY le) {
-    return leafentry_disksize(le);
-}
-
-int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, struct x1764 *checksum, u_int32_t *len) {
-    u_int8_t state;
-    int r = toku_fread_u_int8_t (f, &state, checksum, len); if (r!=0) return r;
-    TXNID xid;
-    BYTESTRING a,b,c;
-    u_int32_t memsize, disksize;
-    switch ((enum le_state)state) {
-    case LE_COMMITTED:
-	r = toku_fread_BYTESTRING(f, &a, checksum, len);  if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &b, checksum, len);  if (r!=0) return r;
-	r = le_committed(a.len, a.data, b.len, b.data,
-			 &memsize, &disksize, le,
-			 0, 0, 0);
-	toku_free_BYTESTRING(a);
-	toku_free_BYTESTRING(b);
-	return r;
-    case LE_BOTH:
-	r = toku_fread_TXNID(f, &xid, checksum, len);     if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &a, checksum, len);  if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &b, checksum, len);  if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &c, checksum, len);  if (r!=0) return r;
-	r = le_both(xid, a.len, a.data, b.len, b.data, c.len, c.data,
-		    &memsize, &disksize, le,
-		    0, 0, 0);
-	toku_free_BYTESTRING(a);
-	toku_free_BYTESTRING(b);
-	toku_free_BYTESTRING(c);
-	return r;
-    case LE_PROVDEL:
-	r = toku_fread_TXNID(f, &xid, checksum, len);     if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &a, checksum, len);  if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &b, checksum, len);  if (r!=0) return r;
-	r = le_provdel(xid, a.len, a.data, b.len, b.data,
-		       &memsize, &disksize, le,
-		       0, 0, 0);
-	toku_free_BYTESTRING(a);
-	toku_free_BYTESTRING(b);
-	return r;
-    case LE_PROVPAIR:
-	r = toku_fread_TXNID(f, &xid, checksum, len);     if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &a, checksum, len);  if (r!=0) return r;
-	r = toku_fread_BYTESTRING(f, &b, checksum, len);  if (r!=0) return r;
-	r = le_provpair(xid, a.len, a.data, b.len, b.data,
-			&memsize, &disksize, le,
-			0, 0, 0);
-	toku_free_BYTESTRING(a);
-	toku_free_BYTESTRING(b);
-	return r;
-    }
-    return DB_BADFORMAT;
-}
-
-static int print_le_committed (u_int32_t keylen, void *key, u_int32_t vallen, void *val, FILE *outf) {
-    fprintf(outf, "{C: ");
-    toku_print_BYTESTRING(outf, keylen, key);
-    toku_print_BYTESTRING(outf, vallen, val);
-    fprintf(outf, "}");
-    return 0;
-}
-
-static int print_le_both (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, FILE *outf) {
-    fprintf(outf, "{B: ");
-    fprintf(outf, " xid=%" PRIu64, xid);
-    fprintf(outf, " key=");
-    toku_print_BYTESTRING(outf, klen, kval);
-    toku_print_BYTESTRING(outf, clen, cval);
-    fprintf(outf, " provisional=");
-    toku_print_BYTESTRING(outf, plen, pval);
-    fprintf(outf, "}");
-    return 0;
-}
-
-static int print_le_provdel (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, FILE *outf) {
-    fprintf(outf, "{D: ");
-    fprintf(outf, " xid=%" PRIu64, xid);
-    fprintf(outf, " key=");
-    toku_print_BYTESTRING(outf, klen, kval);
-    fprintf(outf, " committed=");
-    toku_print_BYTESTRING(outf, clen, cval);
-    fprintf(outf, "}");
-    return 0;
-}
-
-static int print_le_provpair (TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, FILE *outf) {
-    fprintf(outf, "{P: ");
-    fprintf(outf, " xid=%" PRIu64, xid);
-    fprintf(outf, " key=");
-    toku_print_BYTESTRING(outf, klen, kval);
-    fprintf(outf, " provisional=");
-    toku_print_BYTESTRING(outf, plen, pval);
-    fprintf(outf, "}");
-    return 0;
-}
-
-int print_leafentry (FILE *outf, LEAFENTRY v) {
-    if (!v) { printf("NULL"); return 0; }
-    LESWITCHCALL(v, print, outf);
-    abort(); return 0;  // make certain compilers happy
-}
-
-int toku_logprint_LEAFENTRY (FILE *outf, FILE *inf, const char *fieldname, struct x1764 *checksum, u_int32_t *len, const char *format __attribute__((__unused__))) {
-    LEAFENTRY v;
-    int r = toku_fread_LEAFENTRY(inf, &v, checksum, len);
-    if (r!=0) return r;
-    fprintf(outf, " %s=", fieldname);
-    print_leafentry(outf, v);
-    toku_free(v);
-    return 0;
-}
-
 void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le) {
    wbuf_literal_bytes(w, le, leafentry_disksize(le));
 }

-void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *lep) {
-    LEAFENTRY le = (LEAFENTRY)(&r->buf[r->ndone]);
-    u_int32_t siz = leafentry_disksize(le);
-    bytevec bytes;
-    rbuf_literal_bytes(r, &bytes, siz);
-    *lep = toku_memdup(le, siz);
-    assert(*lep);
-    *resultsize = siz;
-    *disksize   = siz;
-    return;
-}
-
-// LEAFENTRUse toku_free()
-void toku_free_LEAFENTRY(LEAFENTRY le) {
-    toku_free(le);
-}
-
-
-int le_is_provdel(LEAFENTRY le) {
-    return get_le_state(le)==LE_PROVDEL;
-}
-
-void* latest_key_le_committed (u_int32_t UU(keylen), void *key, u_int32_t UU(vallen), void *UU(val)) {
-    return key;
-}
-void* latest_key_le_both (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) {
-    return kval;
-}
-void* latest_key_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return 0; // for provisional delete, there is no *latest* key, so return NULL
-}
-void* latest_key_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(plen), void *UU(pval)) {
-    return kval;
-}
-void* le_latest_key (LEAFENTRY le) {
-    LESWITCHCALL(le, latest_key);
-    abort(); return 0;  // make certain compilers happy
-}
-
-u_int32_t latest_keylen_le_committed (u_int32_t keylen, void *UU(key), u_int32_t UU(vallen), void *UU(val)) {
-    return keylen;
-}
-u_int32_t latest_keylen_le_both (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) {
-    return klen;
-}
-u_int32_t latest_keylen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return 0; // for provisional delete, there is no *latest* key, so return 0.  What else can we do?
-}
-u_int32_t latest_keylen_le_provpair (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(plen), void *UU(pval)) {
-    return klen;
-}
-u_int32_t le_latest_keylen (LEAFENTRY le) {
-    LESWITCHCALL(le, latest_keylen);
-    abort(); return 0;  // make certain compilers happy
-}
-
-void* latest_val_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t UU(vallen), void *UU(val)) {
-    return val;
-}
-void* latest_val_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *pval) {
-    return pval;
-}
-void* latest_val_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return 0; // for provisional delete, there is no *latest* key, so return NULL
-}
-void* latest_val_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(plen), void *pval) {
-    return pval;
-}
-void* le_latest_val (LEAFENTRY le) {
-    LESWITCHCALL(le, latest_val);
-    abort(); return 0;  // make certain compilers happy
-}
-
-u_int32_t latest_vallen_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t vallen, void *UU(val)) {
-    return vallen;
-}
-u_int32_t latest_vallen_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t plen, void *UU(pval)) {
-    return plen;
-}
-u_int32_t latest_vallen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return 0; // for provisional delete, there is no *latest* key, so return 0.  What else can we do?
-}
-u_int32_t latest_vallen_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t plen, void *UU(pval)) {
-    return plen;
-}
-u_int32_t le_latest_vallen (LEAFENTRY le) {
-    LESWITCHCALL(le, latest_vallen);
-    abort(); return 0;  // make certain compilers happy
-}
-
-void* any_key_le_committed (u_int32_t UU(keylen), void *key, u_int32_t UU(vallen), void *UU(val)) {
-    return key;
-}
-void* any_key_le_both (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) {
-    return kval;
-}
-void* any_key_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(clen), void *UU(cval)) {
-    return kval;
-}
-void* any_key_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *kval, u_int32_t UU(plen), void *UU(pval)) {
-    return kval;
-}
-void* le_any_key (LEAFENTRY le) {
-    LESWITCHCALL(le, any_key);
-    abort(); return 0;  // make certain compilers happy
-}
-
-u_int32_t any_keylen_le_committed (u_int32_t keylen, void *UU(key), u_int32_t UU(vallen), void *UU(val)) {
-    return keylen;
-}
-u_int32_t any_keylen_le_both (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) {
-    return klen;
-}
-u_int32_t any_keylen_le_provdel (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return klen;
-}
-u_int32_t any_keylen_le_provpair (TXNID UU(xid), u_int32_t klen, void *UU(kval), u_int32_t UU(plen), void *UU(pval)) {
-    return klen;
-}
-u_int32_t le_any_keylen (LEAFENTRY le) {
-    LESWITCHCALL(le, any_keylen);
-    abort(); return 0;  // make certain compilers happy
-}
-
-void* any_val_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t UU(vallen), void *UU(val)) {
-    return val;
-}
-void* any_val_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *pval) {
-    return pval;
-}
-void* any_val_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *cval) {
-    return cval;
-}
-void* any_val_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(plen), void *pval) {
-    return pval;
-}
-void* le_any_val (LEAFENTRY le) {
-    LESWITCHCALL(le, any_val);
-    abort(); return 0;  // make certain compilers happy
-}
-
-u_int32_t any_vallen_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t vallen, void *UU(val)) {
-    return vallen;
-}
-u_int32_t any_vallen_le_both (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t plen, void *UU(pval)) {
-    return plen;
-}
-u_int32_t any_vallen_le_provdel (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t clen, void *UU(cval)) {
-    return clen; // for provisional delete, there is no *any* key, so return 0.  What else can we do?
-}
-u_int32_t any_vallen_le_provpair (TXNID UU(xid), u_int32_t UU(klen), void *UU(kval), u_int32_t plen, void *UU(pval)) {
-    return plen;
-}
-u_int32_t le_any_vallen (LEAFENTRY le) {
-    LESWITCHCALL(le, any_vallen);
-    abort(); return 0;  // make certain compilers happy
-}
-
-
-u_int64_t any_xid_le_committed (u_int32_t UU(keylen), void *UU(key), u_int32_t UU(vallen), void *UU(val)) {
-    return 0;
-}
-
-u_int64_t any_xid_le_both (TXNID xid, u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval), u_int32_t UU(plen), void *UU(pval)) {
-    return xid;
-}
-
-u_int64_t any_xid_le_provdel (TXNID xid, u_int32_t UU(klen), void *UU(kval), u_int32_t UU(clen), void *UU(cval)) {
-    return xid;
-}
-
-u_int64_t any_xid_le_provpair (TXNID xid, u_int32_t UU(klen), void *UU(kval), u_int32_t UU(plen), void *UU(pval)) {
-    return xid;
-}
-
-u_int64_t le_any_xid (LEAFENTRY le) {
-    LESWITCHCALL(le, any_xid);
-    abort(); return 0;  // make certain compilers happy
-}
--- a/newbrt/leafentry.h
+++ b/newbrt/leafentry.h
@@ -34,17 +34,67 @@
 #include "rbuf.h"
 #include "x1764.h"

+#if 0
+    Memory format of packed nodup leaf entry
+    CONSTANTS:
+        num_uxrs
+        keylen
+    Run-time-constants
+        voffset of val/vallen??? (for le_any_val).  This must be small if it is interpreted as voffset = realoffset_of_val - keylen.
+            GOOD performance optimization.
+            ALSO good for simplicity (no need to scan the packed version).
+        key[]
+    variable length
+        
+        
+    Memory format of packed dup leaf entry
+    CONSTANTS:
+        num_uxrs
+        keylen
+        vallen
+    Run-time-constants
+        key[]
+        val[]
+#endif
+#if TOKU_WINDOWS
+#pragma pack(push, 1)
+#endif
+struct __attribute__ ((__packed__)) leafentry {
+    u_int8_t  num_xrs;
+    u_int32_t keylen;
+    u_int32_t innermost_inserted_vallen;
+    union {
+        struct __attribute__ ((__packed__)) leafentry_committed {
+            u_int8_t key_val[0];     //Actual key, then actual val
+        } comm;
+        struct __attribute__ ((__packed__)) leafentry_provisional {
+            u_int8_t innermost_type;
+            TXNID    xid_outermost_uncommitted;
+            u_int8_t key_val_xrs[];  //Actual key,
+                                     //then actual innermost inserted val,
+                                     //then transaction records.
+        } prov;
+    } u;
+};
+#if TOKU_WINDOWS
+#pragma pack(pop)
+#endif
+
+typedef struct leafentry *LEAFENTRY;
+
+
 u_int32_t toku_le_crc(LEAFENTRY v);

-int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result,
+//TODO: #1125 The next four are probably not necessary once testing of the new structure is done (except possibly for test-leafentry.c, which would be renamed to test-leafentry10.c).
+int le10_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result,
 		  OMT, struct mempool *, void **maybe_free);
-int le_both (TXNID xid, u_int32_t cklen, void* ckval, u_int32_t cdlen, void* cdval, u_int32_t pdlen, void* pdval,
+int le10_both (TXNID xid, u_int32_t cklen, void* ckval, u_int32_t cdlen, void* cdval, u_int32_t pdlen, void* pdval,
 	     u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
 	     OMT, struct mempool *, void **maybe_free);
-int le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
+int le10_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
 		u_int32_t *resultsize, u_int32_t *memsize, LEAFENTRY *result,
 		OMT, struct mempool *, void **maybe_free);
-int le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
+int le10_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t plen, void* pval, u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result,
 		 OMT omt, struct mempool *mp, void **maybe_free);

 enum le_state { LE_COMMITTED=1, // A committed pair.
@@ -52,8 +102,6 @@ enum le_state { LE_COMMITTED=1, // A committed pair.
 		LE_PROVDEL,     // A committed pair that has been provisionally deleted
 		LE_PROVPAIR };  // No committed value, but a provisional pair.

-u_int32_t leafentry_memsize (LEAFENTRY);
-
 static inline enum le_state get_le_state(LEAFENTRY le) {
    return (enum le_state)*(unsigned char *)le;
 }
@@ -93,7 +141,7 @@ static inline u_int64_t getint64 (unsigned char *p) {
    unsigned char* __kvaladdr = 4      + __klenaddr;                                                                 \
    unsigned char* __clenaddr = __klen + __kvaladdr;   u_int32_t __clen = getint(__clenaddr);                        \
    unsigned char* __cvaladdr = 4 + __clenaddr;                                                                      \
-    return funname ## _le_committed(__klen, __kvaladdr, __clen, __cvaladdr
+    return funname ## _le10_committed(__klen, __kvaladdr, __clen, __cvaladdr

 #define DO_LE_BOTH(funname,le)  case LE_BOTH: {                         \
    unsigned char* __xidaddr  = 1+(unsigned char*)le;  u_int64_t __xid  = getint64(__xidaddr);                       \
@@ -103,7 +151,7 @@ static inline u_int64_t getint64 (unsigned char *p) {
    unsigned char* __cvaladdr = 4 + __clenaddr;                                                                      \
    unsigned char* __plenaddr = __clen + __cvaladdr;   u_int32_t __plen = getint(__plenaddr);                        \
    unsigned char* __pvaladdr = 4 + __plenaddr;                                                                      \
-    return funname ## _le_both(__xid, __klen, __kvaladdr, __clen, __cvaladdr, __plen, __pvaladdr
+    return funname ## _le10_both(__xid, __klen, __kvaladdr, __clen, __cvaladdr, __plen, __pvaladdr

 #define DO_LE_PROVDEL(funname,le )  case LE_PROVDEL:  {                                                              \
    unsigned char* __xidaddr  = 1+(unsigned char*)le;  u_int64_t __xid  = getint64(__xidaddr);                       \
@@ -111,7 +159,7 @@ static inline u_int64_t getint64 (unsigned char *p) {
    unsigned char* __kvaladdr = 4 + __klenaddr;                                                                      \
    unsigned char* __dlenaddr = __klen + __kvaladdr;   u_int32_t __dlen = getint(__dlenaddr);                        \
    unsigned char* __dvaladdr = 4 + __dlenaddr;                                                                      \
-    return funname ## _le_provdel(__xid, __klen, __kvaladdr, __dlen, __dvaladdr
+    return funname ## _le10_provdel(__xid, __klen, __kvaladdr, __dlen, __dvaladdr

 #define DO_LE_PROVPAIR(funname,le)   case LE_PROVPAIR:  {                                                            \
    unsigned char* __xidaddr  = 1+(unsigned char*)le;  u_int64_t __xid  = getint64(__xidaddr);                       \
@@ -119,7 +167,7 @@ static inline u_int64_t getint64 (unsigned char *p) {
    unsigned char* __kvaladdr = 4 + __klenaddr;                                                                      \
    unsigned char* __plenaddr = __klen + __kvaladdr;   u_int32_t __plen = getint(__plenaddr);                        \
    unsigned char* __pvaladdr = 4 + __plenaddr;                                                                      \
-    return funname ## _le_provpair(__xid, __klen, __kvaladdr, __plen, __pvaladdr
+    return funname ## _le10_provpair(__xid, __klen, __kvaladdr, __plen, __pvaladdr

 #ifdef __ICL
 #define LESWITCHCALL(le,funname, ...) do {        \
@@ -139,80 +187,57 @@ static inline u_int64_t getint64 (unsigned char *p) {
  } abort(); } while (0)
 #endif

-u_int32_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
-u_int32_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY.  The size of a leafentry on disk.
-u_int32_t toku_logsizeof_LEAFENTRY(LEAFENTRY le);
+size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
+size_t leafentry_disksize (LEAFENTRY le); // The size of a leafentry on disk.
 void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
-void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *le);
-int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, struct x1764 *, u_int32_t *len); // read a leafentry from a log
-int toku_logprint_LEAFENTRY(FILE *outf, FILE *inf, const char *fieldname, struct x1764 *, u_int32_t *len, const char *format); // read a leafentry from a log and then print it in human-readable form.
-void toku_free_LEAFENTRY(LEAFENTRY le);
 int print_leafentry (FILE *outf, LEAFENTRY v); // Print a leafentry out in human-readable form.

 int le_is_provdel(LEAFENTRY le); // Return true if it is a provisional delete.
 void*     le_latest_key (LEAFENTRY le); // Return the latest key (return NULL for provisional deletes)
 u_int32_t le_latest_keylen (LEAFENTRY le); // Return the latest keylen.
+void* le_latest_key_and_len (LEAFENTRY le, u_int32_t *len);
 void*     le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes)
 u_int32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen.  Returns 0 for provisional deletes.
-
- // Return any key or value (even if it's only provisional)
-void* le_any_key (LEAFENTRY le);
-u_int32_t le_any_keylen (LEAFENTRY le);
-void* le_any_val (LEAFENTRY le);
-u_int32_t le_any_vallen (LEAFENTRY le);
-u_int64_t le_any_xid (LEAFENTRY le);
-
-void *latest_key_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-void *latest_key_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-void *latest_key_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-void *latest_key_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int32_t latest_keylen_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int32_t latest_keylen_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int32_t latest_keylen_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int32_t latest_keylen_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-void *latest_val_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-void *latest_val_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-void *latest_val_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-void *latest_val_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int32_t latest_vallen_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int32_t latest_vallen_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int32_t latest_vallen_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int32_t latest_vallen_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int64_t latest_xid_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int64_t latest_xid_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int64_t latest_xid_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int64_t latest_xid_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-//
-
-void *any_key_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-void *any_key_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-void *any_key_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-void *any_key_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int32_t any_keylen_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int32_t any_keylen_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int32_t any_keylen_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int32_t any_keylen_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-void *any_val_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-void *any_val_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-void *any_val_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-void *any_val_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int32_t any_vallen_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int32_t any_vallen_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int32_t any_vallen_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int32_t any_vallen_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
-
-u_int64_t any_xid_le_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val);
-u_int64_t any_xid_le_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval);
-u_int64_t any_xid_le_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval);
-u_int64_t any_xid_le_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval);
+void* le_latest_val_and_len (LEAFENTRY le, u_int32_t *len);
+
+ // Return any key or value (even if it's only provisional).
+void* le_key (LEAFENTRY le);
+u_int32_t le_keylen (LEAFENTRY le);
+void* le_key_and_len (LEAFENTRY le, u_int32_t *len);
+
+u_int64_t le_outermost_uncommitted_xid (LEAFENTRY le);
+
+ // Return any key or value (even if it's only provisional).  If more than one exists, choose the innermost (newest).
+void* le_innermost_inserted_val (LEAFENTRY le);
+u_int32_t le_innermost_inserted_vallen (LEAFENTRY le);
+void* le_innermost_inserted_val_and_len (LEAFENTRY le, u_int32_t *len);
+
+void le_full_promotion(LEAFENTRY le, size_t *new_leafentry_memorysize, size_t *new_leafentry_disksize);
+//Effect: Fully promotes le.  Returns the new memory/disk sizes through new_leafentry_memorysize and new_leafentry_disksize.
+//        Reuses the memory of le.
+//        Memory size is guaranteed to decrease.
+//           result of leafentry_memsize() changes
+//        Pointer to le is reused.
+//           No need to update omt if it just points to the leafentry.
+//        Does not change results of:
+//           le_is_provdel()
+//           le_latest_keylen()
+//           le_latest_vallen()
+//           le_keylen()
+//           le_innermost_inserted_vallen()
+//        le_outermost_uncommitted_xid will return 0 after this.
+//        Changes results of following pointer functions, but memcmp of old/new answers would say they're the same.
+//          Note: You would have to memdup the old answers before calling le_full_promotion, if you want to run the comparison
+//           le_latest_key()
+//           le_latest_val()
+//           le_key()
+//           le_innermost_inserted_val()
+//        le_outermost_uncommitted_xid will return 0 after this
+//        key/val pointers will change, but data pointed to by them will be the same
+//           as before
+//Requires: le is not a provdel
+//Requires: le is not marked committed
+//Requires: The outermost uncommitted xid in le has actually committed (le was not yet updated to reflect that)

 #endif

--- a/newbrt/log-internal.h
+++ b/newbrt/log-internal.h
@@ -105,6 +105,7 @@ struct tokutxn {
    toku_off_t      rollentry_filesize;   // How many bytes are in the rollentry file (this is the uncompressed bytes.  If the file is compressed it may actually be smaller (or even larger with header information))
    u_int64_t  rollentry_raw_count;  // the total count of every byte in the transaction and all its children.
    OMT        open_brts; // a collection of the brts that we touched.  Indexed by filenum.
+    XIDS       xids;      //Represents the xid list
 };

 int toku_logger_finish (TOKULOGGER logger, struct logbytes *logbytes, struct wbuf *wbuf, int do_fsync);

--- a/newbrt/log.h
+++ b/newbrt/log.h
@@ -32,7 +32,6 @@ struct roll_entry;
 #include "recover.h"
 #include "txn.h"

-// needed by logformat.c
 static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) {
    target->len = val.len;
    target->data = toku_memdup(val.data, (size_t)val.len);

--- a/newbrt/logformat.c
+++ b/newbrt/logformat.c
@@ -44,26 +44,26 @@ struct logtype {
 int logformat_version_number = 0;

 const struct logtype rollbacks[] = {
-    {"fcreate", 'F', FA{{"TXNID", "xid", 0},
+    {"fcreate", 'F', FA{{"TXNID", "txnid", 0},
                        {"FILENUM", "filenum", 0},
 			{"BYTESTRING", "fname", 0},
 			NULLFIELD}},
    // cmdinsert is used to insert a key-value pair into a NODUP DB.  For rollback we don't need the data.
-    {"cmdinsert", 'i', FA{{"TXNID", "xid", 0},
+    {"cmdinsert", 'i', FA{
 			  {"FILENUM", "filenum", 0},
 			  {"BYTESTRING", "key", 0},
 			  NULLFIELD}},
-    {"cmdinsertboth", 'I', FA{{"TXNID", "xid", 0},
+    {"cmdinsertboth", 'I', FA{
 			  {"FILENUM", "filenum", 0},
 			  {"BYTESTRING", "key", 0},
 			  {"BYTESTRING", "data", 0},
 			  NULLFIELD}},
-    {"cmddeleteboth", 'D', FA{{"TXNID", "xid", 0},
+    {"cmddeleteboth", 'D', FA{
 			      {"FILENUM", "filenum", 0},
 			      {"BYTESTRING", "key", 0},
 			      {"BYTESTRING", "data", 0},
 			      NULLFIELD}},
-    {"cmddelete", 'd', FA{{"TXNID", "xid", 0},
+    {"cmddelete", 'd', FA{
 			  {"FILENUM", "filenum", 0},
 			  {"BYTESTRING", "key", 0},
 			  NULLFIELD}},

--- a/newbrt/omt-with-o1-cursors/omt.c
+++ b/newbrt/omt-with-o1-cursors/omt.c
-#ident "Copyright (c) 2007 Tokutek Inc.  All rights reserved."
-
-#include <errno.h>
-#include <sys/types.h>
-#include <stdint.h>
-
-typedef void *OMTVALUE;
-#include "omt.h"
-#include "../newbrt/memory.h"
-#include "../newbrt/toku_assert.h"
-#include "../include/db.h"
-#include "../newbrt/brttypes.h"
-
-typedef u_int32_t node_idx;
-static const node_idx NODE_NULL = UINT32_MAX;
-
-typedef struct omt_node *OMT_NODE;
-struct omt_node {
-    u_int32_t weight; /* Size of subtree rooted at this node (including this one). */
-    node_idx  left;   /* Index of left  subtree. */
-    node_idx  right;  /* Index of right subtree. */
-    OMTVALUE  value;  /* The value stored in the node. */
-};
-
-struct omt {
-    node_idx   root;
-
-    u_int32_t  node_capacity;
-    OMT_NODE   nodes;
-    node_idx   free_idx;
-
-    u_int32_t  tmparray_size;
-    node_idx*  tmparray;
-
-    OMTCURSOR  associated; // the OMTs associated with this.
-};
-
-struct omt_cursor {
-    OMT omt;   // The omt this cursor is associated with.  NULL if not present.
-    u_int32_t max_pathlen; //Max (root to leaf) path length;
-    u_int32_t pathlen;     //Length of current path
-    node_idx *path;
-    OMTCURSOR next,prev; // circular linked list of all OMTCURSORs associated with omt.
-};
-
-//Initial max size of root-to-leaf path
-static const u_int32_t TOKU_OMTCURSOR_INITIAL_SIZE = 4;
-
-static int omt_create_internal(OMT *omtp, u_int32_t num_starting_nodes) {
-    if (num_starting_nodes < 2) num_starting_nodes = 2;
-    OMT MALLOC(result);
-    if (result==NULL) return errno;
-    result->root=NODE_NULL;
-    result->node_capacity = num_starting_nodes*2;
-    MALLOC_N(result->node_capacity, result->nodes);
-    if (result->nodes==NULL) {
-        toku_free(result);
-        return errno;
-    }
-    result->tmparray_size = num_starting_nodes*2;
-    MALLOC_N(result->tmparray_size, result->tmparray);
-    if (result->tmparray==NULL) {
-        toku_free(result->nodes);
-        toku_free(result);
-        return errno;
-    }
-    result->free_idx = 0;
-    result->associated = NULL;
-    *omtp = result;
-    return 0;
-}
-
-int toku_omt_create (OMT *omtp) {
-    return omt_create_internal(omtp, 2);
-}
-
-int toku_omt_cursor_create (OMTCURSOR *omtcp) {
-    OMTCURSOR MALLOC(c);
-    if (c==NULL) return errno;
-    c->omt = NULL;
-    c->next = c->prev = NULL;
-    c->max_pathlen    = TOKU_OMTCURSOR_INITIAL_SIZE;
-    c->pathlen        = 0;
-    MALLOC_N(c->max_pathlen, c->path); 
-    if (c->path==NULL) {
-        toku_free(c);
-        return errno;
-    }
-    *omtcp = c;
-    return 0;
-}
-
-void toku_omt_cursor_invalidate (OMTCURSOR c) {
-    if (c==NULL || c->omt==NULL) return;
-    if (c->next == c) {
-	// It's the last one.
-	c->omt->associated = NULL;
-    } else {
-	OMTCURSOR next = c->next;
-	OMTCURSOR prev = c->prev;
-	if (c->omt->associated == c) {
-	    c->omt->associated = next;
-	}
-	next->prev = prev;
-	prev->next = next;
-    }
-    c->next = c->prev = NULL;
-    c->omt = NULL;
-}
-
-void toku_omt_cursor_destroy (OMTCURSOR *p) {
-    toku_omt_cursor_invalidate(*p);
-    toku_free((*p)->path);
-    toku_free(*p);
-    *p = NULL;
-}
-
-static void invalidate_cursors (OMT omt) {
-    OMTCURSOR assoced;
-    while ((assoced = omt->associated)) {
-	toku_omt_cursor_invalidate(assoced);
-    }
-}
-
-static void associate (OMT omt, OMTCURSOR c)
-{
-    if (c->omt==omt) return;
-    toku_omt_cursor_invalidate(c);
-    if (omt->associated==NULL) {
-	c->prev = c;
-	c->next = c;
-	omt->associated = c;
-    } else {
-	c->prev = omt->associated->prev;
-	c->next = omt->associated;
-	omt->associated->prev->next = c;
-	omt->associated->prev = c;
-    }
-    c->omt = omt;
-}
-
-void toku_omt_destroy(OMT *omtp) {
-    OMT omt=*omtp;
-    invalidate_cursors(omt);
-    toku_free(omt->nodes);
-    toku_free(omt->tmparray);
-    toku_free(omt);
-    *omtp=NULL;
-}
-
-static inline u_int32_t nweight(OMT omt, node_idx idx) {
-    if (idx==NODE_NULL) return 0;
-    else return (omt->nodes+idx)->weight;
-}
-
-u_int32_t toku_omt_size(OMT V) {
-    return nweight(V, V->root);
-}
-
-static inline node_idx omt_node_malloc(OMT omt) {
-    assert(omt->free_idx < omt->node_capacity);
-    return omt->free_idx++;
-}
-
-static inline void omt_node_free(OMT omt, node_idx idx) {
-    assert(idx < omt->node_capacity);
-}
-
-static inline void fill_array_with_subtree_values(OMT omt, OMTVALUE *array, node_idx tree_idx) {
-    if (tree_idx==NODE_NULL) return;
-    OMT_NODE tree = omt->nodes+tree_idx;
-    fill_array_with_subtree_values(omt, array, tree->left);
-    array[nweight(omt, tree->left)] = tree->value;
-    fill_array_with_subtree_values(omt, array+nweight(omt, tree->left)+1, tree->right); 
-}
-
-// Example:  numvalues=4,  halfway=2,  left side is values of size 2
-//                                     right side is values+3 of size 1
-//           numvalues=3,  halfway=1,  left side is values of size 1
-//                                     right side is values+2 of size 1
-//           numvalues=2,  halfway=1,  left side is values of size 1
-//                                     right side is values+2 of size 0
-//           numvalues=1,  halfway=0,  left side is values of size 0
-//                                     right side is values of size 0.
-static inline void create_from_sorted_array_internal(OMT omt, node_idx *n_idxp,
-                                                     OMTVALUE *values, u_int32_t numvalues) {
-    if (numvalues==0) {
-        *n_idxp = NODE_NULL;
-    } else {
-        u_int32_t halfway = numvalues/2;
-        node_idx newidx   = omt_node_malloc(omt);
-        OMT_NODE newnode  = omt->nodes+newidx;
-        newnode->weight   = numvalues;
-        newnode->value    = values[halfway]; 
-        create_from_sorted_array_internal(omt, &newnode->left,  values,           halfway);
-        create_from_sorted_array_internal(omt, &newnode->right, values+halfway+1, numvalues-(halfway+1));
-        *n_idxp = newidx;
-    }
-}
-
-int toku_omt_create_from_sorted_array(OMT *omtp, OMTVALUE *values, u_int32_t numvalues) {
-    OMT omt = NULL;
-    int r;
-    if ((r = omt_create_internal(&omt, numvalues))) return r;
-    create_from_sorted_array_internal(omt, &omt->root, values, numvalues);
-    *omtp=omt;
-    return 0;
-}
-
-enum build_choice { MAYBE_REBUILD, JUST_RESIZE };
-
-static inline int maybe_resize_and_rebuild(OMT omt, u_int32_t n, enum build_choice choice) {
-    node_idx *new_tmparray = NULL;
-    OMT_NODE  new_nodes    = NULL;
-    OMTVALUE *tmp_values   = NULL;
-    int r = ENOSYS;
-    u_int32_t new_size = n<=2 ? 4 : 2*n;
-
-    if (omt->tmparray_size<n ||
-        (omt->tmparray_size/2 >= new_size)) {
-        /* Malloc and free instead of realloc (saves the memcpy). */
-        MALLOC_N(new_size, new_tmparray);
-        if (new_tmparray==NULL) { r = errno; goto cleanup; }
-    }
-    /* Rebuild/realloc the nodes array iff any of the following:
-     *  The array is smaller than the number of elements we want.
-     *  We are increasing the number of elements and there is no free space.
-     *  The array is too large. */
-    u_int32_t num_nodes = nweight(omt, omt->root);
-    if ((omt->node_capacity/2 >= new_size) ||
-        (omt->free_idx>=omt->node_capacity && num_nodes<n) ||
-        (omt->node_capacity<n)) {
-        if (choice==MAYBE_REBUILD) {
-            MALLOC_N(num_nodes, tmp_values);
-            if (tmp_values==NULL) { r = errno; goto cleanup;}
-        }
-        MALLOC_N(new_size, new_nodes);
-        if (new_nodes==NULL)  { r = errno; goto cleanup; }
-    }
-
-    /* Nothing can fail now.  Atomically update both sizes. */
-    if (new_tmparray) {
-       toku_free(omt->tmparray); 
-       omt->tmparray      = new_tmparray;
-       omt->tmparray_size = new_size;
-    }
-    if (new_nodes) {
-        /* Rebuild the tree in the new array, leftshifted, in preorder */
-        if (choice==MAYBE_REBUILD) {
-            fill_array_with_subtree_values(omt, tmp_values, omt->root);
-        }
-        toku_free(omt->nodes);
-        omt->nodes         = new_nodes;
-        omt->node_capacity = new_size;
-        omt->free_idx      = 0; /* Allocating from mempool starts over. */
-        omt->root          = NODE_NULL;
-        if (choice==MAYBE_REBUILD) {
-            create_from_sorted_array_internal(omt, &omt->root, tmp_values, num_nodes);
-        }
-    }
-    r = 0;
-cleanup:
-    if (r!=0) {
-        if (new_tmparray) toku_free(new_tmparray);
-        if (new_nodes)    toku_free(new_nodes);
-    }
-    if (tmp_values)       toku_free(tmp_values);
-    return r;
-}
-
-static inline void fill_array_with_subtree_idxs(OMT omt, node_idx *array, node_idx tree_idx) {
-    if (tree_idx==NODE_NULL) return;
-    OMT_NODE tree = omt->nodes+tree_idx;
-    fill_array_with_subtree_idxs(omt, array, tree->left);
-    array[nweight(omt, tree->left)] = tree_idx;
-    fill_array_with_subtree_idxs(omt, array+nweight(omt, tree->left)+1, tree->right); 
-}
-
-/* Reuses existing OMT_NODE structures (used for rebalancing). */
-static inline void rebuild_subtree_from_idxs(OMT omt, node_idx *n_idxp, node_idx *idxs,
-                                             u_int32_t numvalues) {
-    if (numvalues==0) {
-        *n_idxp=NODE_NULL;
-    } else {
-        u_int32_t halfway = numvalues/2;
-        node_idx newidx   = idxs[halfway];
-        OMT_NODE newnode  = omt->nodes+newidx;
-        newnode->weight   = numvalues;
-        // value is already in there.
-        rebuild_subtree_from_idxs(omt, &newnode->left,  idxs,           halfway);
-        rebuild_subtree_from_idxs(omt, &newnode->right, idxs+halfway+1, numvalues-(halfway+1));
-        *n_idxp = newidx;
-    }
-}
-
-static inline void rebalance(OMT omt, node_idx *n_idxp) {
-    node_idx idx = *n_idxp;
-    OMT_NODE n   = omt->nodes+idx;
-    fill_array_with_subtree_idxs(omt, omt->tmparray, idx);
-    rebuild_subtree_from_idxs(omt, n_idxp, omt->tmparray, n->weight);
-}
-
-static inline BOOL will_need_rebalance(OMT omt, node_idx n_idx, int leftmod, int rightmod) {
-    if (n_idx==NODE_NULL) return FALSE;
-    OMT_NODE n = omt->nodes+n_idx;
-    // one of the 1's is for the root.
-    // the other is to take ceil(n/2)
-    u_int32_t weight_left  = nweight(omt, n->left)  + leftmod;
-    u_int32_t weight_right = nweight(omt, n->right) + rightmod;
-    return ((1+weight_left < (1+1+weight_right)/2)
-            ||
-            (1+weight_right < (1+1+weight_left)/2));
-} 
-
-static inline void insert_internal(OMT omt, node_idx *n_idxp, OMTVALUE value, u_int32_t index, node_idx **rebalance_idx) {
-    if (*n_idxp==NODE_NULL) {
-        assert(index==0);
-        node_idx newidx  = omt_node_malloc(omt);
-        OMT_NODE newnode = omt->nodes+newidx;
-        newnode->weight  = 1;
-        newnode->left    = NODE_NULL;
-        newnode->right   = NODE_NULL;
-        newnode->value   = value;
-        *n_idxp = newidx;
-    } else {
-        node_idx idx = *n_idxp;
-        OMT_NODE n   = omt->nodes+idx;
-        n->weight++;
-        if (index <= nweight(omt, n->left)) {
-            if (*rebalance_idx==NULL && will_need_rebalance(omt, idx, 1, 0)) {
-                *rebalance_idx = n_idxp;
-            }
-            insert_internal(omt, &n->left,  value, index, rebalance_idx);
-        } else {
-            if (*rebalance_idx==NULL && will_need_rebalance(omt, idx, 0, 1)) {
-                *rebalance_idx = n_idxp;
-            }
-            u_int32_t sub_index = index-nweight(omt, n->left)-1;
-            insert_internal(omt, &n->right, value, sub_index, rebalance_idx);
-        }
-    }
-}
-
-int toku_omt_insert_at(OMT omt, OMTVALUE value, u_int32_t index) {
-    int r;
-    invalidate_cursors(omt);
-    if (index>nweight(omt, omt->root)) return EINVAL;
-    if ((r=maybe_resize_and_rebuild(omt, 1+nweight(omt, omt->root), MAYBE_REBUILD))) return r;
-    node_idx* rebalance_idx = NULL;
-    insert_internal(omt, &omt->root, value, index, &rebalance_idx);
-    if (rebalance_idx) rebalance(omt, rebalance_idx);
-    return 0;
-}
-
-static inline void set_at_internal(OMT omt, node_idx n_idx, OMTVALUE v, u_int32_t index) {
-    assert(n_idx!=NODE_NULL);
-    OMT_NODE n = omt->nodes+n_idx;
-    if (index<nweight(omt, n->left))
-	set_at_internal(omt, n->left, v, index);
-    else if (index==nweight(omt, n->left)) {
-	n->value = v;
-    } else {
-	set_at_internal(omt, n->right, v, index-nweight(omt, n->left)-1);
-    }
-}
-
-int toku_omt_set_at (OMT omt, OMTVALUE value, u_int32_t index) {
-    if (index>=nweight(omt, omt->root)) return EINVAL;
-    set_at_internal(omt, omt->root, value, index);
-    return 0;
-}
-
-static inline void delete_internal(OMT omt, node_idx *n_idxp, u_int32_t index, OMTVALUE *vp, node_idx **rebalance_idx) {
-    assert(*n_idxp!=NODE_NULL);
-    OMT_NODE n = omt->nodes+*n_idxp;
-    if (index < nweight(omt, n->left)) {
-        n->weight--;
-        if (*rebalance_idx==NULL && will_need_rebalance(omt, *n_idxp, -1, 0)) {
-            *rebalance_idx = n_idxp;
-        }
-        delete_internal(omt, &n->left, index, vp, rebalance_idx);
-    } else if (index == nweight(omt, n->left)) {
-        if (n->left==NODE_NULL) {
-            u_int32_t idx = *n_idxp;
-            *n_idxp = n->right;
-            *vp     = n->value;
-            omt_node_free(omt, idx);
-        } else if (n->right==NODE_NULL) {
-            u_int32_t idx = *n_idxp;
-            *n_idxp = n->left;
-            *vp     = n->value;
-            omt_node_free(omt, idx);
-        } else {
-            OMTVALUE zv;
-            // delete the successor of index, get the value, and store it here.
-            if (*rebalance_idx==NULL && will_need_rebalance(omt, *n_idxp, 0, -1)) {
-                *rebalance_idx = n_idxp;
-            }
-            delete_internal(omt, &n->right, 0, &zv, rebalance_idx);
-            n->value = zv;
-            n->weight--;
-        }
-    } else {
-        n->weight--;
-        if (*rebalance_idx==NULL && will_need_rebalance(omt, *n_idxp, 0, -1)) {
-            *rebalance_idx = n_idxp;
-        }
-        delete_internal(omt, &n->right, index-nweight(omt, n->left)-1, vp, rebalance_idx);
-    }
-}
-
-int toku_omt_delete_at(OMT omt, u_int32_t index) {
-    OMTVALUE v;
-    int r;
-    invalidate_cursors(omt);
-    if (index>=nweight(omt, omt->root)) return EINVAL;
-    if ((r=maybe_resize_and_rebuild(omt, -1+nweight(omt, omt->root), MAYBE_REBUILD))) return r;
-    node_idx* rebalance_idx = NULL;
-    delete_internal(omt, &omt->root, index, &v, &rebalance_idx);
-    if (rebalance_idx) rebalance(omt, rebalance_idx);
-    return 0;
-}
-
-static inline void omtcursor_stack_pop(OMTCURSOR c) {
-    assert(c->pathlen);
-    c->pathlen--;
-}
-
-static inline int omtcursor_stack_push(OMTCURSOR c, node_idx idx) {
-    if (c->max_pathlen-1<=c->pathlen) {
-        //Increase max_pathlen
-        u_int32_t new_max = c->max_pathlen*2;
-        node_idx *tmp_path = toku_realloc(c->path, new_max*sizeof(*c->path));
-        if (tmp_path==NULL) return errno;
-        c->path        = tmp_path;
-        c->max_pathlen = new_max;
-    }
-    c->path[c->pathlen++] = idx;
-    return 0;
-}
-
-static inline node_idx omtcursor_stack_peek(OMTCURSOR c) {
-    return c->path[c->pathlen-1];
-}
-
-static inline int fetch_internal(OMT V, node_idx idx, u_int32_t i, OMTVALUE *v, OMTCURSOR c) {
-    OMT_NODE n = V->nodes+idx;
-    int r;
-    if (c!=NULL && (r=omtcursor_stack_push(c, idx))) return r;
-    if (i < nweight(V, n->left)) {
-        return fetch_internal(V, n->left,  i, v, c);
-    } else if (i == nweight(V, n->left)) {
-        *v = n->value;
-        return 0;
-    } else {
-        return fetch_internal(V, n->right, i-nweight(V, n->left)-1, v, c);
-    }
-}
-
-int toku_omt_fetch(OMT V, u_int32_t i, OMTVALUE *v, OMTCURSOR c) {
-    if (i>=nweight(V, V->root)) return EINVAL;
-    if (c) associate(V,c);
-    int r = fetch_internal(V, V->root, i, v, c);
-    if (c && r!=0) {
-        toku_omt_cursor_invalidate(c);
-    }
-    return r;
-}
-
-static inline int iterate_internal(OMT omt, u_int32_t left, u_int32_t right,
-                                   node_idx n_idx, u_int32_t idx,
-                                   int (*f)(OMTVALUE, u_int32_t, void*), void*v) {
-    int r;
-    if (n_idx==NODE_NULL) return 0;
-    OMT_NODE n = omt->nodes+n_idx;
-    u_int32_t idx_root = idx+nweight(omt,n->left);
-    if (left< idx_root && (r=iterate_internal(omt, left, right, n->left, idx, f, v))) return r;
-    if (left<=idx_root && idx_root<right && (r=f(n->value, idx_root, v))) return r;
-    if (idx_root+1<right) return iterate_internal(omt, left, right, n->right, idx_root+1, f, v);
-    return 0;
-}
-
-int toku_omt_iterate(OMT omt, int (*f)(OMTVALUE, u_int32_t, void*), void*v) {
-    return iterate_internal(omt, 0, nweight(omt, omt->root), omt->root, 0, f, v);
-}
-
-int toku_omt_iterate_on_range(OMT omt, u_int32_t left, u_int32_t right, int (*f)(OMTVALUE, u_int32_t, void*), void*v) {
-    return iterate_internal(omt, left, right, omt->root, 0, f, v);
-}
-
-int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v, u_int32_t *index) {
-    int r;
-    u_int32_t idx;
-
-    invalidate_cursors(omt);
-
-    r = toku_omt_find_zero(omt, h, v, NULL, &idx, NULL);
-    if (r==0) {
-        if (index) *index = idx;
-        return DB_KEYEXIST;
-    }
-    if (r!=DB_NOTFOUND) return r;
-
-    if ((r = toku_omt_insert_at(omt, value, idx))) return r;
-    if (index) *index = idx;
-
-    return 0;
-}
-
-static inline int find_internal_zero(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
-// requires: index!=NULL
-{
-    int r;
-    if (n_idx==NODE_NULL) {
-	*index = 0;
-	return DB_NOTFOUND;
-    }
-    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
-    OMT_NODE n = omt->nodes+n_idx;
-    int hv = h(n->value, extra);
-    if (hv<0) {
-        r = find_internal_zero(omt, n->right, h, extra, value, index, c);
-        *index += nweight(omt, n->left)+1;
-        return r;
-    } else if (hv>0) {
-        return find_internal_zero(omt, n->left, h, extra, value, index, c);
-    } else {
-        r =  find_internal_zero(omt, n->left, h, extra, value, index, c);
-        if (r==DB_NOTFOUND) {
-            *index = nweight(omt, n->left);
-            if (value) *value = n->value;
-            if (c!=NULL) {
-                //Truncate the saved cursor path at n_idx.
-                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
-            }
-            r = 0;
-        }
-        return r;
-    }
-}
-
-int toku_omt_find_zero(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
-    //Index can be modified before a cursor error, so we must use a temp.
-    u_int32_t tmp_index;
-    if (c) associate(V,c);
-    int r = find_internal_zero(V, V->root, h, extra, value, &tmp_index, c);
-    if (c && r!=0) {
-	toku_omt_cursor_invalidate(c);
-    }
-    if ((r==0 || r==DB_NOTFOUND) && index!=NULL) *index = tmp_index;
-    return r;
-}
-
-//  If direction <0 then find the largest  i such that h(V_i,extra)<0.
-static inline int find_internal_minus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
-// requires: index!=NULL
-{
-    int r;
-    if (n_idx==NODE_NULL) return DB_NOTFOUND;
-    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
-    OMT_NODE n = omt->nodes+n_idx;
-    int hv = h(n->value, extra);
-    if (hv<0) {
-        r = find_internal_minus(omt, n->right, h, extra, value, index, c);
-        if (r==0) *index += nweight(omt, n->left)+1;
-        else if (r==DB_NOTFOUND) {
-            *index = nweight(omt, n->left);
-            if (value!=NULL) *value = n->value;
-            if (c!=NULL) {
-                //Truncate the saved cursor path at n_idx.
-                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
-            }
-            r = 0;
-        }
-        return r;
-    } else {
-        return find_internal_minus(omt, n->left, h, extra, value, index, c);
-    }
-}
-
-//  If direction >0 then find the smallest i such that h(V_i,extra)>0.
-static inline int find_internal_plus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
-// requires: index!=NULL
-{
-    int r;
-    if (n_idx==NODE_NULL) return DB_NOTFOUND;
-    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
-    OMT_NODE n = omt->nodes+n_idx;
-    int hv = h(n->value, extra);
-    if (hv>0) {
-        r = find_internal_plus(omt, n->left, h, extra, value, index, c);
-        if (r==DB_NOTFOUND) {
-            *index = nweight(omt, n->left);
-            if (value!=NULL) *value = n->value;
-            if (c!=NULL) {
-                //Truncate the saved cursor path at n_idx.
-                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
-            }
-            r = 0;
-        }
-        return r;
-    } else {
-        r = find_internal_plus(omt, n->right, h, extra, value, index, c);
-        if (r==0) *index += nweight(omt, n->left)+1;
-        return r;
-    }
-}
-
-int toku_omt_find(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
-    u_int32_t tmp_index;
-    int r;
-    if (index==NULL) index=&tmp_index;
-    if (c) associate(V,c);
-    if (direction==0) {
-	abort();
-    } else if (direction<0) {
-        r = find_internal_minus(V, V->root, h, extra, value, index, c);
-    } else {
-        r = find_internal_plus( V, V->root, h, extra, value, index, c);
-    }
-    if (c && r!=0) {
-	toku_omt_cursor_invalidate(c);
-    }
-    return r;
-}
-
-int toku_omt_split_at(OMT omt, OMT *newomtp, u_int32_t index) {
-    int r                = ENOSYS;
-    OMT newomt           = NULL;
-    OMTVALUE *tmp_values = NULL;
-    invalidate_cursors(omt);
-    if (index>nweight(omt, omt->root)) { r = EINVAL; goto cleanup; }
-    u_int32_t newsize = nweight(omt, omt->root)-index;
-    if ((r = omt_create_internal(&newomt, newsize))) goto cleanup;
-    MALLOC_N(nweight(omt, omt->root), tmp_values);
-    if (tmp_values==NULL) { r = errno; goto cleanup; }
-    fill_array_with_subtree_values(omt, tmp_values, omt->root);
-    // Modify omt's array at the last possible moment, since after this nothing can fail.
-    if ((r = maybe_resize_and_rebuild(omt, index, TRUE))) goto cleanup;
-    create_from_sorted_array_internal(omt,    &omt->root,    tmp_values,       index);
-    create_from_sorted_array_internal(newomt, &newomt->root, tmp_values+index, newsize);
-    *newomtp = newomt;
-    r = 0;
-cleanup:
-    if (r!=0) {
-        if (newomt) toku_omt_destroy(&newomt);
-    }
-    if (tmp_values) toku_free(tmp_values);
-    return r;
-}
-    
-int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomtp) {
-    int r                = ENOSYS;
-    OMT newomt           = NULL;
-    OMTVALUE *tmp_values = NULL;
-    invalidate_cursors(leftomt);
-    invalidate_cursors(rightomt);
-    u_int32_t newsize = toku_omt_size(leftomt)+toku_omt_size(rightomt);
-    if ((r = omt_create_internal(&newomt, newsize))) goto cleanup;
-    MALLOC_N(newsize, tmp_values);
-    if (tmp_values==NULL) { r = errno; goto cleanup; }
-
-    fill_array_with_subtree_values(leftomt,  tmp_values,                        leftomt->root);
-    fill_array_with_subtree_values(rightomt, tmp_values+toku_omt_size(leftomt), rightomt->root);
-    create_from_sorted_array_internal(newomt, &newomt->root, tmp_values, newsize);
-    toku_omt_destroy(&leftomt);
-    toku_omt_destroy(&rightomt);
-    *newomtp = newomt;
-    r = 0;
-cleanup:
-    if (r!=0) {
-        if (newomt) toku_omt_destroy(&newomt);
-    }
-    if (tmp_values) toku_free(tmp_values);
-    return r;
-}
-
-void toku_omt_clear(OMT omt) {
-    invalidate_cursors(omt);
-    omt->free_idx = 0;
-    omt->root     = NODE_NULL;
-}
-
-unsigned long toku_omt_memory_size (OMT omt) {
-    return sizeof(*omt)+omt->node_capacity*sizeof(omt->nodes[0]) + omt->tmparray_size*sizeof(omt->tmparray[0]);
-}
-
-int toku_omt_cursor_is_valid (OMTCURSOR c) {
-    return c->omt!=NULL;
-}
-
-static inline void omtcursor_current_internal(OMTCURSOR c, OMTVALUE *v) {
-    *v = c->omt->nodes[omtcursor_stack_peek(c)].value;
-}
-
-static inline int omtcursor_next_internal(OMTCURSOR c) {
-    OMT_NODE current = c->omt->nodes+omtcursor_stack_peek(c);
-    if (current->right!=NODE_NULL) {
-        //Enter into subtree
-        if (omtcursor_stack_push(c, current->right)) return EINVAL;
-        current = c->omt->nodes+current->right;
-        while (current->left!=NODE_NULL) {
-            if (omtcursor_stack_push(c, current->left)) return EINVAL;
-            current = c->omt->nodes+current->left;
-        }
-        return 0;
-    }
-    else {
-        //Pop the stack till we remove a left child.
-        node_idx parent_idx = omtcursor_stack_peek(c);
-        node_idx child_idx;
-        while (c->pathlen>=2) {
-            child_idx  = parent_idx;
-            omtcursor_stack_pop(c);
-            parent_idx = omtcursor_stack_peek(c);
-            if (c->omt->nodes[parent_idx].left==child_idx) return 0;
-        }
-        return EINVAL;
-    }
-}
-
-int toku_omt_cursor_next (OMTCURSOR c, OMTVALUE *v) {
-    if (c->omt == NULL) return EINVAL;
-    int r = omtcursor_next_internal(c);
-    if (r!=0) toku_omt_cursor_invalidate(c);
-    else omtcursor_current_internal(c, v);
-    return r;
-}
-
-static inline int omtcursor_prev_internal(OMTCURSOR c) {
-    OMT_NODE current = c->omt->nodes+omtcursor_stack_peek(c);
-    if (current->left!=NODE_NULL) {
-        //Enter into subtree
-        if (omtcursor_stack_push(c, current->left)) return EINVAL;
-        current = c->omt->nodes+current->left;
-        while (current->right!=NODE_NULL) {
-            if (omtcursor_stack_push(c, current->right)) return EINVAL;
-            current = c->omt->nodes+current->right;
-        }
-        return 0;
-    }
-    else {
-        //Pop the stack till we remove a right child.
-        node_idx parent_idx = omtcursor_stack_peek(c);
-        node_idx child_idx;
-        while (c->pathlen>=2) {
-            child_idx  = parent_idx;
-            omtcursor_stack_pop(c);
-            parent_idx = omtcursor_stack_peek(c);
-            if (c->omt->nodes[parent_idx].right==child_idx) return 0;
-        }
-        return EINVAL;
-    }
-}
-
-int toku_omt_cursor_prev (OMTCURSOR c, OMTVALUE *v) {
-    if (c->omt == NULL) return EINVAL;
-    int r = omtcursor_prev_internal(c);
-    if (r!=0) toku_omt_cursor_invalidate(c);
-    else omtcursor_current_internal(c, v);
-    return r;
-}
-
-int toku_omt_cursor_current (OMTCURSOR c, OMTVALUE *v) {
-    if (c->omt == NULL) return EINVAL;
-    omtcursor_current_internal(c, v);
-    return 0;
-}
-
--- a/newbrt/recover.c
+++ b/newbrt/recover.c
@@ -239,11 +239,17 @@ toku_recover_enq_insert (LSN lsn __attribute__((__unused__)), FILENUM filenum, T
    struct brt_cmd cmd;
    DBT keydbt, valdbt;
    cmd.type=BRT_INSERT;
-    cmd.xid =xid;
+    //TODO: #1125 and recovery:  Remove this hack.
+    //            For now, assume xid is a root txn and build the message XIDS as a child of the root XIDS (not yet enough info to construct the full XIDS for the message).
+    XIDS root = xids_get_root_xids();
+    r = xids_create_child(root, &cmd.xids, xid);
+    assert(r==0);
    cmd.u.id.key = toku_fill_dbt(&keydbt, key.data, key.len);
    cmd.u.id.val = toku_fill_dbt(&valdbt, val.data, val.len);
    r = toku_brt_root_put_cmd(pair->brt, &cmd, null_tokulogger);
    assert(r==0);
+    xids_destroy(&cmd.xids);
+    xids_destroy(&root);
    toku_free(key.data);
    toku_free(val.data);
 }
@@ -265,11 +271,17 @@ toku_recover_enq_delete_both (LSN lsn __attribute__((__unused__)), FILENUM filen
    struct brt_cmd cmd;
    DBT keydbt, valdbt;
    cmd.type = BRT_DELETE_BOTH;
-    cmd.xid =xid;
+    //TODO: #1125 and recovery:  Remove this hack.
+    //            For now, assume xid is a root txn and build the message XIDS as a child of the root XIDS (not yet enough info to construct the full XIDS for the message).
+    XIDS root = xids_get_root_xids();
+    r = xids_create_child(root, &cmd.xids, xid);
+    assert(r==0);
    cmd.u.id.key = toku_fill_dbt(&keydbt, key.data, key.len);
    cmd.u.id.val = toku_fill_dbt(&valdbt, val.data, val.len);
    r = toku_brt_root_put_cmd(pair->brt, &cmd, null_tokulogger);
    assert(r==0);
+    xids_destroy(&cmd.xids);
+    xids_destroy(&root);
    toku_free(key.data);
    toku_free(val.data);
 }
@@ -291,11 +303,17 @@ toku_recover_enq_delete_any (LSN lsn __attribute__((__unused__)), FILENUM filenu
    struct brt_cmd cmd;
    DBT keydbt, valdbt;
    cmd.type = BRT_DELETE_ANY;
-    cmd.xid = xid;
+    //TODO: #1125 and recovery:  Remove this hack.
+    //            For now, assume xid is a root txn and build the message XIDS as a child of the root XIDS (not yet enough info to construct the full XIDS for the message).
+    XIDS root = xids_get_root_xids();
+    r = xids_create_child(root, &cmd.xids, xid);
+    assert(r==0);
    cmd.u.id.key = toku_fill_dbt(&keydbt, key.data, key.len);
    cmd.u.id.val = toku_fill_dbt(&valdbt, val.data, val.len);
    r = toku_brt_root_put_cmd(pair->brt, &cmd, null_tokulogger);
    assert(r==0);
+    xids_destroy(&cmd.xids);
+    xids_destroy(&root);
    toku_free(key.data);
    toku_free(val.data);
 }

--- a/newbrt/roll.c
+++ b/newbrt/roll.c
@@ -7,12 +7,8 @@

 #include "includes.h"
 #include "checkpoint.h"
-
-// these flags control whether or not we send commit messages for
-// various operations
-#define TOKU_DO_COMMIT_CMD_INSERT 0
-#define TOKU_DO_COMMIT_CMD_DELETE 1
-#define TOKU_DO_COMMIT_CMD_DELETE_BOTH 1
+#include "xids.h"
+#include "roll.h"

 int
 toku_commit_fcreate (TXNID UU(xid),
@@ -63,14 +59,15 @@ static int find_brt_from_filenum (OMTVALUE v, void *filenumvp) {
    return 0;
 }

-static int do_insertion (enum brt_cmd_type type, TXNID xid, FILENUM filenum, BYTESTRING key, BYTESTRING *data,TOKUTXN txn) {
+static int do_insertion (enum brt_cmd_type type, FILENUM filenum, BYTESTRING key, BYTESTRING *data,TOKUTXN txn) {
    CACHEFILE cf;
    //printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data);
    int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
    assert(r==0);

    DBT key_dbt,data_dbt;
-    BRT_CMD_S brtcmd = { type, xid,
+    XIDS xids = toku_txn_get_xids(txn);
+    BRT_CMD_S brtcmd = { type, xids,
 			 .u.id={toku_fill_dbt(&key_dbt,  key.data,  key.len),
 				data
 				? toku_fill_dbt(&data_dbt, data->data, data->len)
@@ -93,18 +90,17 @@ static int do_nothing_with_filenum(TOKUTXN txn, FILENUM filenum) {
 }


-int toku_commit_cmdinsert (TXNID xid, FILENUM filenum, BYTESTRING key, TOKUTXN txn, YIELDF UU(yield), void *UU(yieldv)) {
+int toku_commit_cmdinsert (FILENUM filenum, BYTESTRING key, TOKUTXN txn, YIELDF UU(yield), void *UU(yieldv)) {
 #if TOKU_DO_COMMIT_CMD_INSERT
-    return do_insertion (BRT_COMMIT_ANY, xid, filenum, key, 0, txn);
+    return do_insertion (BRT_COMMIT_ANY, filenum, key, 0, txn);
 #else
-    xid = xid; key = key;
+    key = key;
    return do_nothing_with_filenum(txn, filenum);
 #endif
 }

 int
-toku_commit_cmdinsertboth (TXNID      xid,
-			   FILENUM    filenum,
+toku_commit_cmdinsertboth (FILENUM    filenum,
 			   BYTESTRING key,
 			   BYTESTRING data,
 			   TOKUTXN    txn,
@@ -112,39 +108,36 @@ toku_commit_cmdinsertboth (TXNID      xid,
 			   void *     UU(yieldv))
 {
 #if TOKU_DO_COMMIT_CMD_INSERT
-    return do_insertion (BRT_COMMIT_BOTH, xid, filenum, key, &data, txn);
+    return do_insertion (BRT_COMMIT_BOTH, filenum, key, &data, txn);
 #else
-    xid = xid; key = key; data = data;
+    key = key; data = data;
    return do_nothing_with_filenum(txn, filenum);
 #endif
 }

 int
-toku_rollback_cmdinsert (TXNID      xid,
-			 FILENUM    filenum,
+toku_rollback_cmdinsert (FILENUM    filenum,
 			 BYTESTRING key,
 			 TOKUTXN    txn,
 			 YIELDF     UU(yield),
 			 void *     UU(yieldv))
 {
-    return do_insertion (BRT_ABORT_ANY, xid, filenum, key, 0, txn);
+    return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn);
 }

 int
-toku_rollback_cmdinsertboth (TXNID      xid,
-			     FILENUM    filenum,
+toku_rollback_cmdinsertboth (FILENUM    filenum,
 			     BYTESTRING key,
 			     BYTESTRING data,
 			     TOKUTXN    txn,
 			     YIELDF     UU(yield),
 			     void *     UU(yieldv))
 {
-    return do_insertion (BRT_ABORT_BOTH, xid, filenum, key, &data, txn);
+    return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn);
 }

 int
-toku_commit_cmddeleteboth (TXNID      xid,
-			   FILENUM    filenum,
+toku_commit_cmddeleteboth (FILENUM    filenum,
 			   BYTESTRING key,
 			   BYTESTRING data,
 			   TOKUTXN    txn,
@@ -152,7 +145,7 @@ toku_commit_cmddeleteboth (TXNID      xid,
 			   void *     UU(yieldv))
 {
 #if TOKU_DO_COMMIT_CMD_DELETE_BOTH
-    return do_insertion (BRT_COMMIT_BOTH, xid, filenum, key, &data, txn);
+    return do_insertion (BRT_COMMIT_BOTH, filenum, key, &data, txn);
 #else
-    xid = xid; key = key; data = data;
+    key = key; data = data;
    return do_nothing_with_filenum(txn, filenum);
@@ -160,27 +153,25 @@ toku_commit_cmddeleteboth (TXNID      xid,
 }

 int
-toku_rollback_cmddeleteboth (TXNID      xid,
-			     FILENUM    filenum,
+toku_rollback_cmddeleteboth (FILENUM    filenum,
 			     BYTESTRING key,
 			     BYTESTRING data,
 			     TOKUTXN    txn,
 			     YIELDF     UU(yield),
 			     void *     UU(yieldv))
 {
-    return do_insertion (BRT_ABORT_BOTH, xid, filenum, key, &data, txn);
+    return do_insertion (BRT_ABORT_BOTH, filenum, key, &data, txn);
 }

 int
-toku_commit_cmddelete (TXNID xid,
-		       FILENUM filenum,
+toku_commit_cmddelete (FILENUM filenum,
 		       BYTESTRING key,
 		       TOKUTXN txn,
 		       YIELDF     UU(yield),
 		       void *     UU(yieldv))
 {
 #if TOKU_DO_COMMIT_CMD_DELETE
-    return do_insertion (BRT_COMMIT_ANY, xid, filenum, key, 0, txn);
+    return do_insertion (BRT_COMMIT_ANY, filenum, key, 0, txn);
 #else
-    xid = xid; key = key;
+    key = key;
    return do_nothing_with_filenum(txn, filenum);
@@ -188,14 +179,13 @@ toku_commit_cmddelete (TXNID xid,
 }

 int
-toku_rollback_cmddelete (TXNID      xid,
-			 FILENUM    filenum,
+toku_rollback_cmddelete (FILENUM    filenum,
 			 BYTESTRING key,
 			 TOKUTXN    txn,
 			 YIELDF     UU(yield),
 			 void *     UU(yieldv))
 {
-    return do_insertion (BRT_ABORT_ANY, xid, filenum, key, 0, txn);
+    return do_insertion (BRT_ABORT_ANY, filenum, key, 0, txn);
 }

 int

--- a/newbrt/roll.h
+++ b/newbrt/roll.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "$Id: roll.c 12588 2009-06-09 00:05:02Z yfogel $"
+#ident "Copyright (c) 2007, 2008, 2009 Tokutek Inc.  All rights reserved."
+#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+
+#ifndef TOKUDB_ROLL_H
+#define TOKUDB_ROLL_H
+// these flags control whether or not we send commit messages for
+// various operations
+
+// When a transaction is committed, should we send a BRT_COMMIT message
+// for each BRT_INSERT message sent earlier by the transaction?
+#define TOKU_DO_COMMIT_CMD_INSERT 0
+
+// When a transaction is committed, should we send a BRT_COMMIT message
+// for each BRT_DELETE_ANY message sent earlier by the transaction?
+#define TOKU_DO_COMMIT_CMD_DELETE 1
+
+// When a transaction is committed, should we send a BRT_COMMIT message
+// for each BRT_DELETE_BOTH message sent earlier by the transaction?
+#define TOKU_DO_COMMIT_CMD_DELETE_BOTH 1
+
+#endif
+
--- a/newbrt/rollback.c
+++ b/newbrt/rollback.c
@@ -32,6 +32,7 @@ void toku_rollback_txn_close (TOKUTXN txn) {

    list_remove(&txn->live_txns_link);
    note_txn_closing(txn);
+    xids_destroy(&txn->xids);
    toku_free(txn);
    return;
 }

--- a/newbrt/tests/Makefile
+++ b/newbrt/tests/Makefile
@@ -94,7 +94,8 @@ REGRESSION_TESTS_RAW = \
 	test-brt-overflow \
 	test-del-inorder \
 	test-inc-split \
-	test-leafentry \
+	test-leafentry10 \
+	test-leafentry-nested \
 	test_oexcl \
 	test_toku_malloc_plain_free \
 	threadpool-test \

--- a/newbrt/tests/brt-serialize-test.c
+++ b/newbrt/tests/brt-serialize-test.c
@@ -46,12 +46,25 @@ static void test_serialize(void) {
    BNC_SUBTREE_ESTIMATES(&sn, 1).exact = (BOOL)(random()%2 != 0);
    r = toku_fifo_create(&BNC_BUFFER(&sn,0)); assert(r==0);
    r = toku_fifo_create(&BNC_BUFFER(&sn,1)); assert(r==0);
-    r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, (TXNID)0);   assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, (TXNID)0, "a", 2, "aval", 5);
-    r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, (TXNID)123); assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, (TXNID)123,  "b", 2, "bval", 5);
-    r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, (TXNID)234); assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, (TXNID)234, "x", 2, "xval", 5);
-    BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
-    BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
-    sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
+    //Create XIDS
+    XIDS xids_0 = xids_get_root_xids();
+    XIDS xids_123;
+    XIDS xids_234;
+    r = xids_create_child(xids_0, &xids_123, (TXNID)123);
+    CKERR(r);
+    r = xids_create_child(xids_123, &xids_234, (TXNID)234);
+    CKERR(r);
+
+    r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, xids_0);   assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, xids_0, "a", 2, "aval", 5);
+    r = toku_fifo_enq(BNC_BUFFER(&sn,0), "b", 2, "bval", 5, BRT_NONE, xids_123); assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, xids_123,  "b", 2, "bval", 5);
+    r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, xids_234); assert(r==0);    sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, xids_234, "x", 2, "xval", 5);
+    BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123);
+    BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234);
+    sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123) + xids_get_serialize_size(xids_234);
+    //Cleanup:
+    xids_destroy(&xids_0);
+    xids_destroy(&xids_123);
+    xids_destroy(&xids_234);

    struct brt *XMALLOC(brt);
    struct brt_header *XCALLOC(brt_h);

--- a/newbrt/tests/fifo-test.c
+++ b/newbrt/tests/fifo-test.c
@@ -46,18 +46,26 @@ test_fifo_enq (int n) {
    for (i=0; i<n; i++) {
        buildkey(i);
        buildval(i);
-        r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, (TXNID)i); assert(r == 0);
+        XIDS xids;
+        if (i==0)
+            xids = xids_get_root_xids();
+        else {
+            r = xids_create_child(xids_get_root_xids(), &xids, (TXNID)i);
+            assert(r==0);
+        }
+        r = toku_fifo_enq(f, thekey, thekeylen, theval, thevallen, i, xids); assert(r == 0);
+        xids_destroy(&xids);
    }

    i = 0;
-    FIFO_ITERATE(f, key, keylen, val, vallen, type, xid, {
+    FIFO_ITERATE(f, key, keylen, val, vallen, type, xids, {
        if (verbose) printf("checkit %d %d\n", i, type);
        buildkey(i);
        buildval(i);
        assert((int) keylen == thekeylen); assert(memcmp(key, thekey, keylen) == 0);
        assert((int) vallen == thevallen); assert(memcmp(val, theval, vallen) == 0);
        assert(i % 256 == type);
-	assert((TXNID)i==xid);
+	assert((TXNID)i==xids_get_innermost_xid(xids));
        i += 1;
    });
    assert(i == n);

--- a/newbrt/tests/test-leafentry-nested.c
+++ b/newbrt/tests/test-leafentry-nested.c
+#include <toku_portability.h>
+#include <string.h>
+
+#include "test.h"
+#include "brttypes.h"
+#include "includes.h"
+#include "ule.h"
+
+enum {MAX_SIZE = 256};
+
+//Assert that two unpacked leafentries agree: same key, same number of
+//transaction records and, record by record, the same type and xid.
+//Values are compared only for XR_INSERT records.
+static void
+verify_ule_equal(ULE a, ULE b) {
+    u_int32_t idx;
+
+    assert(a->num_uxrs > 0);
+    assert(a->num_uxrs <= MAX_TRANSACTION_RECORDS);
+    assert(a->num_uxrs == b->num_uxrs);
+
+    assert(a->keylen   == b->keylen);
+    assert(memcmp(a->keyp, b->keyp, a->keylen) == 0);
+
+    for (idx = 0; idx < a->num_uxrs; idx++) {
+        assert(a->uxrs[idx].type == b->uxrs[idx].type);
+        assert(a->uxrs[idx].xid  == b->uxrs[idx].xid);
+        if (a->uxrs[idx].type != XR_INSERT)
+            continue; //Only insert records have their value compared.
+        assert(a->uxrs[idx].vallen == b->uxrs[idx].vallen);
+        assert(memcmp(a->uxrs[idx].valp, b->uxrs[idx].valp, a->uxrs[idx].vallen) == 0);
+    }
+}
+
+//Fill the first 'length' bytes of buf with the low byte of successive
+//random() results.  'length' must be strictly less than MAX_SIZE.
+static void
+fillrandom(u_int8_t buf[MAX_SIZE], u_int32_t length) {
+    assert(length < MAX_SIZE);
+    u_int8_t *cursor = buf;
+    u_int8_t *limit  = buf + length;
+    while (cursor < limit) {
+        *cursor++ = random() & 0xFF;
+    }
+}
+
+//Assert that 'field' lies exactly 'expected_offset' bytes past the start of 'le'.
+static void
+test_le_offset_is(LEAFENTRY le, void *field, size_t expected_offset) {
+    size_t base  = (size_t) le;
+    size_t where = (size_t) field;
+    //The field may not precede the leafentry it belongs to.
+    assert(where >= base);
+    assert(where - base == expected_offset);
+}
+
+//Fixed offsets in a packed leafentry.
+//Each constant is the previous offset plus the byte width the test expects
+//for the previous field; test_le_fixed_offsets checks them against LEAFENTRY.
+enum {
+    LE_OFFSET_NUM      = 0,
+    LE_OFFSET_KEYLEN   = 1+LE_OFFSET_NUM,
+    LE_OFFSET_VALLEN   = 4+LE_OFFSET_KEYLEN, //Vallen of innermost insert record
+    LE_OFFSET_VARIABLE = 4+LE_OFFSET_VALLEN  //Base for the committed/provisional offsets below
+};
+
+//Check the fields shared by every packed leafentry against the expected offsets.
+static void
+test_le_fixed_offsets (void) {
+    LEAFENTRY XMALLOC(le);
+    void *num_field    = &le->num_xrs;
+    void *keylen_field = &le->keylen;
+    void *vallen_field = &le->innermost_inserted_vallen;
+    test_le_offset_is(le, num_field,    LE_OFFSET_NUM);
+    test_le_offset_is(le, keylen_field, LE_OFFSET_KEYLEN);
+    test_le_offset_is(le, vallen_field, LE_OFFSET_VALLEN);
+    toku_free(le);
+}
+
+//Fixed offsets in a leafentry with no uncommitted transaction records.
+//(Note, there is no type required.)
+//The key is expected immediately after the fixed fields.
+enum {
+    LE_COMMITTED_OFFSET_KEY    = LE_OFFSET_VARIABLE
+};
+
+//Check where the key/value payload of a committed-only leafentry starts.
+static void
+test_le_committed_offsets (void) {
+    LEAFENTRY XMALLOC(le);
+    void *payload = &le->u.comm.key_val;
+    test_le_offset_is(le, payload, LE_COMMITTED_OFFSET_KEY);
+    toku_free(le);
+}
+
+//Fixed offsets in a leafentry with uncommitted transaction records.
+//As above, each offset is the previous one plus the expected width of the
+//previous field; test_le_provisional_offsets checks them against LEAFENTRY.
+enum {
+    LE_PROVISIONAL_OFFSET_TYPE   = LE_OFFSET_VARIABLE, //Type of innermost record
+    LE_PROVISIONAL_OFFSET_XID    = 1+LE_PROVISIONAL_OFFSET_TYPE, //XID of outermost noncommitted record
+    LE_PROVISIONAL_OFFSET_KEY    = 8+LE_PROVISIONAL_OFFSET_XID
+};
+
+//Check the fields specific to a leafentry that carries provisional records.
+static void
+test_le_provisional_offsets (void) {
+    LEAFENTRY XMALLOC(le);
+    void *type_field = &le->u.prov.innermost_type;
+    void *xid_field  = &le->u.prov.xid_outermost_uncommitted;
+    void *payload    = &le->u.prov.key_val_xrs;
+    test_le_offset_is(le, type_field, LE_PROVISIONAL_OFFSET_TYPE);
+    test_le_offset_is(le, xid_field,  LE_PROVISIONAL_OFFSET_XID);
+    test_le_offset_is(le, payload,    LE_PROVISIONAL_OFFSET_KEY);
+    toku_free(le);
+}
+
+//A leafentry is represented by a packed struct, so the compiler's layout
+//must match the required memory format byte for byte.
+//Run every offset check: shared fields first, then the committed and the
+//provisional variants.
+static void
+test_le_offsets (void)
+{
+    test_le_fixed_offsets();
+    test_le_committed_offsets();
+    test_le_provisional_offsets();
+}
+
+//Pack 'ule' and require that le_pack succeeds without producing a leafentry.
+static void
+test_ule_packs_to_nothing (ULE ule) {
+    LEAFENTRY packed;
+    size_t packed_memsize;
+    size_t packed_disksize;
+    int r;
+
+    r = le_pack(ule,
+                &packed_memsize, &packed_disksize,
+                &packed, NULL, NULL, NULL);
+    assert(r==0);
+    assert(packed==NULL);
+}
+
+//A leafentry must contain at least one 'insert' (all deletes means the leafentry
+//should not exist).
+//Verify that 'le_pack' of any set of all deletes ends up not creating a leafentry.
+//The stack grows by one delete record per iteration; some earlier records are
+//turned into placeholders along the way and stay that way for later iterations.
+static void
+test_le_empty_packs_to_nothing (void) {
+    ULE_S ule;
+
+    int key = random(); //Arbitrary number
+    //Set up defaults.
+    ule.keylen         = sizeof(key);
+    ule.keyp           = &key;
+    ule.uxrs[0].type   = XR_DELETE;
+    ule.uxrs[0].xid    = 0;
+    //Delete records carry no value; give the fields determinate contents.
+    ule.uxrs[0].vallen = 0;
+    ule.uxrs[0].valp   = NULL;
+    u_int8_t num_xrs;
+    for (num_xrs = 1; num_xrs < MAX_TRANSACTION_RECORDS; num_xrs++) {
+        if (num_xrs > 1) {
+            ule.uxrs[num_xrs-1].type   = XR_DELETE;
+            ule.uxrs[num_xrs-1].xid    = ule.uxrs[num_xrs-2].xid + (random() % 32 + 1); //Arbitrary number, xids must be strictly increasing
+            ule.uxrs[num_xrs-1].vallen = 0;
+            ule.uxrs[num_xrs-1].valp   = NULL;
+        }
+        ule.num_uxrs = num_xrs;
+        test_ule_packs_to_nothing(&ule);
+        if (num_xrs > 2 && num_xrs % 4) {
+            //Set some of them to placeholders instead of deletes
+            //(never the innermost record, which stays a delete).
+            ule.uxrs[num_xrs-2].type = XR_PLACEHOLDER;
+        }
+        test_ule_packs_to_nothing(&ule);
+    }
+}
+
+//Check every leafentry accessor on 'le' against the values expected from the
+//unpacked leafentry 'ule' it was packed from.
+//  le                       packed leafentry under test (must not be NULL)
+//  ule                      the unpacked form; its innermost record must not be
+//                           a placeholder and it must contain at least one insert
+//  pre_calculated_memsize   memsize reported by le_pack
+//  pre_calculated_disksize  disksize reported by le_pack
+//Every mismatch is reported through assert.
+static void
+le_verify_accessors(LEAFENTRY le, ULE ule,
+                    size_t pre_calculated_memsize,
+                    size_t pre_calculated_disksize) {
+    assert(le);
+    assert(ule->num_uxrs > 0);
+    assert(ule->num_uxrs <= MAX_TRANSACTION_RECORDS);
+    assert(ule->uxrs[ule->num_uxrs-1].type != XR_PLACEHOLDER);
+    //Extract expected values from ULE
+    size_t memsize  = le_memsize_from_ule(ule);
+    //NOTE(review): disksize is derived with the memsize function; the asserts
+    //below require the two sizes to be equal anyway -- confirm this is intended.
+    size_t disksize = le_memsize_from_ule(ule);
+
+    //When the innermost record is a delete, the 'latest' key and value are absent.
+    void *latest_key        = ule->uxrs[ule->num_uxrs-1].type == XR_DELETE ? NULL : ule->keyp;
+    u_int32_t latest_keylen = ule->uxrs[ule->num_uxrs-1].type == XR_DELETE ? 0    : ule->keylen;
+    void *key               = ule->keyp;
+    u_int32_t keylen        = ule->keylen;
+    void *latest_val        = ule->uxrs[ule->num_uxrs-1].type == XR_DELETE ? NULL : ule->uxrs[ule->num_uxrs-1].valp;
+    u_int32_t latest_vallen = ule->uxrs[ule->num_uxrs-1].type == XR_DELETE ? 0    : ule->uxrs[ule->num_uxrs-1].vallen;
+    void *innermost_inserted_val        = NULL;
+    u_int32_t innermost_inserted_vallen = 0;
+    {
+        //Walk from the innermost record outwards to the first insert.
+        int found_insert = 0;
+        int i;
+        for (i = ule->num_uxrs - 1; i >= 0; i--) {
+            if (ule->uxrs[i].type == XR_INSERT) {
+                innermost_inserted_val    = ule->uxrs[i].valp;
+                innermost_inserted_vallen = ule->uxrs[i].vallen;
+                found_insert = 1;
+                break;
+            }
+        }
+        assert(found_insert);
+    }
+    TXNID outermost_uncommitted_xid = ule->num_uxrs == 1 ? 0 : ule->uxrs[1].xid;
+    int   is_provdel = ule->uxrs[ule->num_uxrs-1].type == XR_DELETE;
+
+    //Verify all accessors
+    assert(memsize  == pre_calculated_memsize);
+    assert(disksize == pre_calculated_disksize);
+    assert(memsize  == disksize);
+    assert(memsize  == leafentry_memsize(le));
+    assert(disksize == leafentry_disksize(le));
+    {
+        u_int32_t test_keylen;
+        void*     test_keyp = le_latest_key_and_len(le, &test_keylen);
+        //The leafentry must hold its own copy, not alias the ULE's buffer.
+        if (latest_key != NULL) assert(test_keyp != latest_key);
+        assert(test_keylen == latest_keylen);
+        //latest_key is NULL for a provisional delete; never hand NULL to memcmp.
+        if (test_keylen > 0) assert(memcmp(test_keyp, latest_key, test_keylen) == 0);
+        assert(le_latest_key(le)    == test_keyp);
+        assert(le_latest_keylen(le) == test_keylen);
+    }
+    {
+        u_int32_t test_keylen;
+        void*     test_keyp = le_key_and_len(le, &test_keylen);
+        if (key != NULL) assert(test_keyp != key);
+        assert(test_keylen == keylen);
+        assert(memcmp(test_keyp, key, test_keylen) == 0);
+        assert(le_key(le)    == test_keyp);
+        assert(le_keylen(le) == test_keylen);
+    }
+    {
+        u_int32_t test_vallen;
+        void*     test_valp = le_latest_val_and_len(le, &test_vallen);
+        if (latest_val != NULL) assert(test_valp != latest_val);
+        assert(test_vallen == latest_vallen);
+        //latest_val is NULL for a provisional delete; never hand NULL to memcmp.
+        if (test_vallen > 0) assert(memcmp(test_valp, latest_val, test_vallen) == 0);
+        assert(le_latest_val(le)    == test_valp);
+        assert(le_latest_vallen(le) == test_vallen);
+    }
+    {
+        u_int32_t test_vallen;
+        void*     test_valp = le_innermost_inserted_val_and_len(le, &test_vallen);
+        if (innermost_inserted_val != NULL) assert(test_valp != innermost_inserted_val);
+        assert(test_vallen == innermost_inserted_vallen);
+        assert(memcmp(test_valp, innermost_inserted_val, test_vallen) == 0);
+        assert(le_innermost_inserted_val(le)    == test_valp);
+        assert(le_innermost_inserted_vallen(le) == test_vallen);
+    }
+    {
+        assert(le_outermost_uncommitted_xid(le) == outermost_uncommitted_xid);
+    }
+    {
+        //Only the truth value is compared, not the exact integer returned.
+        assert((le_is_provdel(le)==0) == (is_provdel==0));
+    }
+}
+
+
+
+//Round-trip test for leafentries holding a single committed insert.
+//For a range of key and value sizes: pack a ULE, check the accessors on the
+//result, unpack it again and compare with the original ULE, then re-pack the
+//unpacked form and require a byte-identical leafentry.
+static void
+test_le_pack_committed (void) {
+    ULE_S ule;
+
+    u_int8_t key[MAX_SIZE];
+    u_int8_t val[MAX_SIZE];
+    u_int32_t keysize;
+    u_int32_t valsize;
+    //Sizes start at 0 and advance by a random stride of 1..MAX_SIZE.
+    for (keysize = 0; keysize < MAX_SIZE; keysize += (random() % MAX_SIZE) + 1) {
+        for (valsize = 0; valsize < MAX_SIZE; valsize += (random() % MAX_SIZE) + 1) {
+            fillrandom(key, keysize);
+            fillrandom(val, valsize);
+
+            //One record only: a committed insert with xid 0.
+            ule.num_uxrs       = 1;
+            ule.keylen         = keysize;
+            ule.keyp           = key;
+            ule.uxrs[0].type   = XR_INSERT;
+            ule.uxrs[0].xid    = 0;
+            ule.uxrs[0].valp   = val;
+            ule.uxrs[0].vallen = valsize;
+
+            size_t memsize;
+            size_t disksize;
+            LEAFENTRY le;
+            int r = le_pack(&ule,
+                            &memsize, &disksize,
+                            &le, NULL, NULL, NULL);
+            assert(r==0);
+            assert(le!=NULL);
+            le_verify_accessors(le, &ule, memsize, disksize);
+            //pack followed by unpack must give back the same ULE
+            ULE_S tmp_ule;
+            le_unpack(&tmp_ule, le);
+            verify_ule_equal(&ule, &tmp_ule);
+            //unpack followed by pack must give back the same bytes
+            LEAFENTRY tmp_le;
+            size_t    tmp_memsize;
+            size_t    tmp_disksize;
+            r = le_pack(&tmp_ule,
+                        &tmp_memsize, &tmp_disksize,
+                        &tmp_le, NULL, NULL, NULL);
+            assert(r==0);
+            assert(tmp_memsize == memsize);
+            assert(tmp_disksize == disksize);
+            assert(memcmp(le, tmp_le, memsize) == 0);
+
+            toku_free(tmp_le);
+            toku_free(le);
+        }
+    }
+}
+
+//Round-trip test for leafentries with provisional records.
+//The ULE is built as: one committed record of 'committed_type' at index 0,
+//then 'num_placeholders' placeholder records, then one innermost record of
+//'prov_type'.  Each ULE is packed, checked through the accessors, unpacked
+//and compared, and re-packed to require a byte-identical leafentry.
+static void
+test_le_pack_uncommitted (u_int8_t committed_type, u_int8_t prov_type, int num_placeholders) {
+    ULE_S ule;
+
+    u_int8_t key[MAX_SIZE];
+    u_int8_t cval[MAX_SIZE];
+    u_int8_t pval[MAX_SIZE];
+    u_int32_t keysize;
+    u_int32_t cvalsize;
+    u_int32_t pvalsize;
+    //Sizes start at 0 and advance by a random stride of 1..MAX_SIZE.
+    for (keysize = 0; keysize < MAX_SIZE; keysize += (random() % MAX_SIZE) + 1) {
+        for (cvalsize = 0; cvalsize < MAX_SIZE; cvalsize += (random() % MAX_SIZE) + 1) {
+            pvalsize = (cvalsize + random()) % MAX_SIZE;
+            fillrandom(key, keysize);
+            //Value buffers are filled only for records that are inserts.
+            if (committed_type == XR_INSERT)
+                fillrandom(cval, cvalsize);
+            if (prov_type == XR_INSERT)
+                fillrandom(pval, pvalsize);
+            ule.uxrs[0].type   = committed_type;
+            ule.uxrs[0].xid    = 0;
+            ule.uxrs[0].vallen = cvalsize;
+            ule.uxrs[0].valp   = cval;
+            ule.keylen         = keysize;
+            ule.keyp           = key;
+            ule.num_uxrs       = 2 + num_placeholders;
+
+            u_int8_t idx;
+            for (idx = 1; idx <= num_placeholders; idx++) {
+                ule.uxrs[idx].type = XR_PLACEHOLDER;
+                ule.uxrs[idx].xid  = ule.uxrs[idx-1].xid + (random() % 32 + 1); //Arbitrary number, xids must be strictly increasing
+            }
+            //idx now indexes the innermost (provisional) record.
+            ule.uxrs[idx].xid  = ule.uxrs[idx-1].xid + (random() % 32 + 1); //Arbitrary number, xids must be strictly increasing
+            ule.uxrs[idx].type   = prov_type;
+            ule.uxrs[idx].vallen = pvalsize;
+            ule.uxrs[idx].valp   = pval;
+
+            size_t memsize;
+            size_t disksize;
+            LEAFENTRY le;
+            int r = le_pack(&ule,
+                            &memsize, &disksize,
+                            &le, NULL, NULL, NULL);
+            assert(r==0);
+            assert(le!=NULL);
+            le_verify_accessors(le, &ule, memsize, disksize);
+            //pack followed by unpack must give back the same ULE
+            ULE_S tmp_ule;
+            le_unpack(&tmp_ule, le);
+            verify_ule_equal(&ule, &tmp_ule);
+            //unpack followed by pack must give back the same bytes
+            LEAFENTRY tmp_le;
+            size_t    tmp_memsize;
+            size_t    tmp_disksize;
+            r = le_pack(&tmp_ule,
+                        &tmp_memsize, &tmp_disksize,
+                        &tmp_le, NULL, NULL, NULL);
+            assert(r==0);
+            assert(tmp_memsize == memsize);
+            assert(tmp_disksize == disksize);
+            assert(memcmp(le, tmp_le, memsize) == 0);
+
+            toku_free(tmp_le);
+            toku_free(le);
+        }
+    }
+}
+
+//Committed delete followed by a provisional insert.
+static void
+test_le_pack_provpair (int num_placeholders) {
+    const u_int8_t committed   = XR_DELETE;
+    const u_int8_t provisional = XR_INSERT;
+    test_le_pack_uncommitted(committed, provisional, num_placeholders);
+}
+
+//Committed insert followed by a provisional delete.
+static void
+test_le_pack_provdel (int num_placeholders) {
+    const u_int8_t committed   = XR_INSERT;
+    const u_int8_t provisional = XR_DELETE;
+    test_le_pack_uncommitted(committed, provisional, num_placeholders);
+}
+
+//Committed insert followed by a provisional insert.
+static void
+test_le_pack_both (int num_placeholders) {
+    const u_int8_t committed   = XR_INSERT;
+    const u_int8_t provisional = XR_INSERT;
+    test_le_pack_uncommitted(committed, provisional, num_placeholders);
+}
+
+//Test of PACK
+//  Committed leafentry
+//      delete -> nothing (le_empty_packs_to_nothing)
+//      insert
+//          key/val of differing lengths and content
+//  Uncommitted
+//      committed delete
+//          then placeholder*, delete (le_empty_packs_to_nothing)
+//          then placeholder*, insert
+//      committed insert
+//          then placeholder*, delete
+//          then placeholder*, insert
+//
+//  placeholder* stands for 0, 1 or 2 placeholders
+static void
+test_le_pack (void) {
+    int num_placeholders = 0;
+
+    test_le_empty_packs_to_nothing();
+    test_le_pack_committed();
+    while (num_placeholders < 3) {
+        test_le_pack_provpair(num_placeholders);
+        test_le_pack_provdel(num_placeholders);
+        test_le_pack_both(num_placeholders);
+        num_placeholders++;
+    }
+}
+
+//TODO: #1125 tests:
+//      Will probably have to expose ULE_S definition
+//            - Check memsize function is correct
+//             - Assert == disksize (almost useless, but go ahead)
+//            - Check standard accessors
+//             - le_latest_key_and_len
+//             - le_latest_key 
+//             - le_latest_keylen
+//             - le_latest_val_and_len
+//             - le_latest_val 
+//             - le_latest_vallen
+//             - le_key_and_len
+//             - le_key 
+//             - le_keylen
+//             - le_innermost_inserted_val_and_len
+//             - le_innermost_inserted_val 
+//             - le_innermost_inserted_vallen
+//            - Check le_outermost_uncommitted_xid
+//            - Check le_is_provdel
+//            - Check unpack+pack memcmps equal
+//            - Check exact memory expected (including size) for various leafentry types.
+//            - Check apply_msg logic
+//             - Known start, known expected.. various types.
+//            - Go through test-leafentry10.c
+//             - Verify we have tests for all analogous stuff.
+//
+//  PACK
+//  UNPACK
+//      verify pack+unpack is no-op
+//      verify unpack+pack is no-op
+//  accessors
+//  Test apply_msg logic
+//      i.e. start with LE, apply message
+//          in parallel, construct the expected ULE manually, and pack that
+//          Compare the two results
+//  Test full_promote
+
+//Entry point: seed the generator, then run the layout and the pack tests.
+int
+test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__)))
+{
+    const unsigned int seed = 7; //Arbitrary seed.
+    srandom(seed);
+    test_le_offsets();
+    test_le_pack();
+    return 0;
+}
--- a/newbrt/tests/test-leafentry.c
+++ b/newbrt/tests/test-leafentry.c
@@ -19,7 +19,7 @@ static void test_leafentry_1 (void) {
    LEAFENTRY l;
    int r;
    u_int32_t msize, dsize;
-    r = le_committed(4, "abc", 3, "xy", &msize, &dsize, &l, 0, 0, 0);
+    r = le10_committed(4, "abc", 3, "xy", &msize, &dsize, &l, 0, 0, 0);
    assert(r==0);
    char expect[] = {LE_COMMITTED,
                     UINT32TOCHAR(4), 
@@ -36,7 +36,7 @@ static void test_leafentry_2 (void) {
    LEAFENTRY l;
    int r;
    u_int32_t msize, dsize;
-    r = le_both(0x0123456789abcdef0LL, 3, "ab", 4, "xyz", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
+    r = le10_both(0x0123456789abcdef0LL, 3, "ab", 4, "xyz", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
    assert(r==0);
    char expect[] = {LE_BOTH,
                     UINT64TOCHAR(0x0123456789abcdef0LL),
@@ -53,7 +53,7 @@ static void test_leafentry_3 (void) {
    LEAFENTRY l;
    int r;
    u_int32_t msize, dsize;
-    r = le_provdel(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
+    r = le10_provdel(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
    assert(r==0);
    char expect[] = {LE_PROVDEL,
                     UINT64TOCHAR(0x0123456789abcdef0LL),
@@ -69,7 +69,7 @@ static void test_leafentry_4 (void) {
    LEAFENTRY l;
    int r;
    u_int32_t msize, dsize;
-    r = le_provpair(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
+    r = le10_provpair(0x0123456789abcdef0LL, 3, "ab", 5, "lmno", &msize, &dsize, &l, 0, 0, 0);
    assert(r==0);
    char expect[] = {LE_PROVPAIR,
                     UINT64TOCHAR(0x0123456789abcdef0LL),
@@ -101,7 +101,7 @@ static void test_leafentry_3long (void) {
    LEAFENTRY l;
    int r;
    u_int32_t msize, dsize;
-    r = le_provdel(0x0123456789abcdef0LL, 301, zeros, 1025, zeros, &msize, &dsize, &l, 0, 0, 0);
+    r = le10_provdel(0x0123456789abcdef0LL, 301, zeros, 1025, zeros, &msize, &dsize, &l, 0, 0, 0);
    assert(r==0);
    assert(sizeof(expect_3long)==msize);
    assert(msize==dsize);

--- a/newbrt/txn.c
+++ b/newbrt/txn.c
@@ -20,11 +20,19 @@ int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER log
    }
    r = toku_omt_create(&result->open_brts);
    if (r!=0) {
+died1:
 	toku_logger_panic(logger, r);
 	toku_free(result);
 	return r;
    }
    result->txnid64 = result->first_lsn.lsn;
+    XIDS parent_xids;
+    if (parent_tokutxn==NULL)
+        parent_xids = xids_get_root_xids();
+    else
+        parent_xids = parent_tokutxn->xids;
+    if ((r=xids_create_child(parent_xids, &result->xids, result->txnid64)))
+        goto died1;
    result->logger = logger;
    result->parent = parent_tokutxn;
    result->oldest_logentry = result->newest_logentry = 0;
@@ -70,3 +78,9 @@ void toku_txn_close_txn(TOKUTXN txn) {
    toku_rollback_txn_close(txn);
    return;
 }
+
+XIDS toku_txn_get_xids (TOKUTXN txn) {
+    // No transaction: hand back the root xids instead of a txn's own.
+    return txn ? txn->xids : xids_get_root_xids();
+}
+
--- a/newbrt/txn.h
+++ b/newbrt/txn.h
@@ -9,5 +9,6 @@ int toku_txn_begin_txn (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TOKULOGGER log
 int toku_txn_commit_txn (TOKUTXN txn, int nosync, YIELDF yield, void*yieldv);
 int toku_txn_abort_txn(TOKUTXN txn, YIELDF yield, void*yieldv);
 void toku_txn_close_txn(TOKUTXN txn);
+XIDS toku_txn_get_xids (TOKUTXN);

 #endif //TOKUTXN_H
--- a/newbrt/ule.c
+++ b/newbrt/ule.c
+/* -*- mode: C; c-basic-offset: 4 -*- */
+
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+// Purpose of this file is to handle all modifications and queries to the database
+// at the level of leafentry.  
+// 
+// ule = Unpacked Leaf Entry
+//
+// This design unpacks the leafentry into a convenient format, performs all work
+// on the unpacked form, then repacks the leafentry into its compact format.
+//
+// See design documentation for nested transactions at
+// TokuWiki/Imp/TransactionsOverview.
+
+#include <toku_portability.h>
+#include "brttypes.h"
+#include "brt-internal.h"
+
+// Sorry:
+#include "mempool.h"
+#include "omt.h"
+
+
+#include "leafentry.h"
+#include "xids.h"
+#include "brt_msg.h"
+#include "ule.h"
+
+///////////////////////////////////////////////////////////////////////////////////
+//
+// Question: Can any software outside this file modify or read a leafentry?  
+// If so, is it worthwhile to put it all here?
+//
+// There are two entries, one each for modification and query:
+//   apply_msg_to_leafentry()        performs all inserts/deletes/aborts
+//   do_implicit_promotions_query()  
+//
+//
+//
+//
+
+// Template transaction record: a committed delete (type XR_DELETE, xid 0, no value).
+// This is what we use to initialize uxrs[0] in a new unpacked leafentry.
+const UXR_S committed_delete = {
+    .type   = XR_DELETE,
+    .vallen = 0,
+    .xid    = 0,
+    .valp   = NULL
+};  // static allocation of uxr with type set to committed delete and xid = 0
+
+// Local functions:
+
+static void msg_init_empty_ule(ULE ule, BRT_MSG msg);
+static void msg_modify_ule(ULE ule, BRT_MSG msg);
+static void ule_init_empty_ule(ULE ule, u_int32_t keylen, void * keyp);
+static void ule_do_implicit_promotions(ULE ule, XIDS xids);
+static void ule_promote_innermost_to_index(ULE ule, u_int8_t index);
+static void ule_apply_insert(ULE ule, XIDS xids, u_int32_t vallen, void * valp);
+static void ule_apply_delete(ULE ule, XIDS xids);
+static void ule_prepare_for_new_uxr(ULE ule, XIDS xids);
+static void ule_apply_abort(ULE ule, XIDS xids);
+static void ule_apply_commit(ULE ule, XIDS xids);
+static void ule_push_insert_uxr(ULE ule, TXNID xid, u_int32_t vallen, void * valp);
+static void ule_push_delete_uxr(ULE ule, TXNID xid);
+static void ule_push_placeholder_uxr(ULE ule, TXNID xid);
+static UXR ule_get_innermost_uxr(ULE ule);
+static UXR ule_get_first_empty_uxr(ULE ule);
+static void ule_remove_innermost_uxr(ULE ule);
+static TXNID ule_get_innermost_xid(ULE ule);
+static TXNID ule_get_xid(ULE ule, u_int8_t index);
+static void ule_remove_innermost_placeholders(ULE ule);
+static void ule_add_placeholders(ULE ule, XIDS xids);
+static inline BOOL uxr_type_is_insert(u_int8_t type);
+static inline BOOL uxr_type_is_delete(u_int8_t type);
+static inline BOOL uxr_type_is_placeholder(u_int8_t type);
+static inline BOOL uxr_is_insert(UXR uxr);
+static inline BOOL uxr_is_delete(UXR uxr);
+static inline BOOL uxr_is_placeholder(UXR uxr);
+
+
+///////////// TEMP TEMP TEMP TEMP
+///////////// scaffolding/upgrading begins here  
+///////////// Some of this code may be used to upgrade an old database to our new version.
+
+//
+// le_unpack_le_* functions are throwaway code as part of phase 1 (temp
+// scaffolding)
+//
+#if 0
+static void le_unpack_le10_committed(u_int32_t klen, void *kval, u_int32_t vallen, void *val, ULE ule) {
+    //Committed value
+    ule_init_empty_ule(ule, klen, kval);
+    ule_remove_innermost_uxr(ule);	// pop committed delete
+    ule_push_insert_uxr(ule, 0, vallen, val);
+}
+
+static void le_unpack_le10_both(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, ULE ule) {
+    //committed value and provisional insert
+    ule_init_empty_ule(ule, klen, kval);
+    ule_remove_innermost_uxr(ule);	// pop committed delete
+    ule_push_insert_uxr(ule, 0, clen, cval);	// push committed 
+    ule_push_insert_uxr(ule, xid, plen, pval);	// push provisional
+}
+
+static void le_unpack_le10_provdel(TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, ULE ule) {
+    //committed value and provisional delete
+    ule_init_empty_ule(ule, klen, kval);
+    ule_remove_innermost_uxr(ule);	// pop committed delete
+    ule_push_insert_uxr(ule, 0, clen, cval);	// push committed
+    ule_push_delete_uxr(ule, xid);		// push provisional
+}
+
+static void le_unpack_le10_provpair(TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, ULE ule) {
+    //committed delete and provisional insert
+    ule_init_empty_ule(ule, klen, kval);
+    ule_push_insert_uxr(ule, xid, plen, pval);	// push provisional
+}
+
+//Used to unpack a version 10 record to ule, which can be packed to version 11.
+static void UU()
+le_unpack_from_version_10(ULE ule, LEAFENTRY le) {
+    LESWITCHCALL(le, le_unpack, ule);
+}
+#endif
+
+// Allocate 'size' bytes for a leafentry: from the mempool (through the omt)
+// when an omt is supplied, otherwise directly with toku_malloc().
+static void *
+le_malloc(OMT omt, struct mempool *mp, size_t size, void **maybe_free)
+{
+    if (!omt) {
+        return toku_malloc(size);
+    }
+    return mempool_malloc_from_omt(omt, mp, size, maybe_free);
+}
+
+///////////// end scaffolding/upgrade
+///////////// ENDTEMP ENDTEMP ENDTEMP ENDTEMP
+
+
+
+
+
+
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// This is the big enchilada.  (Bring Tums.)  Note that this level of abstraction 
+// has no knowledge of the inner structure of either leafentry or msg.  It makes
+// calls into the next lower layer (msg_xxx) which handles messages.
+//
+// NOTE: This is the only function (at least in this body of code) that modifies
+//        a leafentry.
+//
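+// Returns whatever le_pack() returns: 0 on success, or ENOMEM if the new
+// leafentry cannot be allocated.
+// If no insert records remain, le_pack() sets *new_leafentry_p to NULL (and
+// returns 0); that is how the caller learns the leafentry is to be destroyed.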
+// 
+// Temporarily declared as static until we are ready to remove wrapper apply_cmd_to_leaf().
+// 
+int 
+apply_msg_to_leafentry(BRT_MSG   msg,		// message to apply to leafentry
+		       LEAFENTRY old_leafentry, // NULL if there was no stored data.
+		       size_t *new_leafentry_memorysize, 
+		       size_t *new_leafentry_disksize, 
+		       LEAFENTRY *new_leafentry_p,
+		       OMT omt, 
+		       struct mempool *mp, 
+		       void **maybe_free) {
+    ULE_S workspace;
+
+    // Step 1: obtain an unpacked leafentry, either from the stored one or
+    // (when nothing is stored) an empty one built from the message.
+    if (old_leafentry != NULL) {
+        le_unpack(&workspace, old_leafentry);
+    }
+    else {
+        msg_init_empty_ule(&workspace, msg);
+    }
+
+    // Step 2: apply the message to the unpacked form.
+    msg_modify_ule(&workspace, msg);
+
+    // Step 3: repack; the result of packing is the result of this function.
+    return le_pack(&workspace,
+                   new_leafentry_memorysize,
+                   new_leafentry_disksize,
+                   new_leafentry_p,
+                   omt, mp, maybe_free);
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// This layer of abstraction (msg_xxx)
+// knows the accessors of msg, but not of leafentry or unpacked leaf entry.
+// It makes calls into the lower layer (le_xxx) which handles leafentries.
+
+
+
+// Initialize the ule from the key carried by msg, via ule_init_empty_ule().
+// 
+static void 
+msg_init_empty_ule(ULE ule, BRT_MSG msg) {   
+    u_int32_t msg_keylen = brt_msg_get_keylen(msg);
+    void     *msg_key    = brt_msg_get_key(msg);
+
+    ule_init_empty_ule(ule, msg_keylen, msg_key);
+}
+
+
+// Apply msg to the unpacked leafentry held in our private workspace:
+// first perform implicit promotions for the message's xids, then dispatch on
+// the message type (insert / delete / abort / commit).
+//
+static void 
+msg_modify_ule(ULE ule, BRT_MSG msg) {
+    XIDS msg_xids = brt_msg_get_xids(msg);
+    assert(xids_get_num_xids(msg_xids) < MAX_TRANSACTION_RECORDS);
+    ule_do_implicit_promotions(ule, msg_xids);
+
+    switch (brt_msg_get_type(msg)) {
+    case BRT_INSERT: {
+        u_int32_t new_vallen = brt_msg_get_vallen(msg);
+        void     *new_valp   = brt_msg_get_val(msg);
+        ule_apply_insert(ule, msg_xids, new_vallen, new_valp);
+        break;
+    }
+    case BRT_DELETE_ANY:
+    case BRT_DELETE_BOTH:
+        ule_apply_delete(ule, msg_xids);
+        break;
+    case BRT_ABORT_ANY:
+    case BRT_ABORT_BOTH:
+        ule_apply_abort(ule, msg_xids);
+        break;
+    case BRT_COMMIT_ANY:
+    case BRT_COMMIT_BOTH:
+        ule_apply_commit(ule, msg_xids);
+        break;
+    default:
+        assert(FALSE /* illegal BRT_MSG.type */);
+        break;
+    }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+// This layer of abstraction (le_xxx) understands the structure of the leafentry
+// and of the unpacked leafentry.  It is the only layer that understands the
+// structure of leafentry.  It has no knowledge of any other data structures.
+//
+// There are two formats for a packed leaf entry, indicated by the number of 
+// transaction records:
+// 
+// No uncommitted transactions: 
+//  num = 1  (one byte)
+//  keylen   (4 bytes)
+//  vallen   (4 bytes)
+//  key      (keylen bytes)
+//  val      (vallen bytes)
+// 
+// At least one uncommitted transaction (maybe a committed value as well):
+// 
+//  num > 1
+//  keylen
+//  vallen of innermost insert
+//  type of innermost transaction record
+//  xid of outermost uncommitted transaction 
+//  key
+//  val of innermost insert
+//  records excluding extracted data above
+//   first (innermost) record is missing the type (above)
+//   innermost insert record is missing the vallen and val
+//   outermost uncommitted record is missing xid
+//   outermost record (always committed) is missing xid (implied 0)
+//    default record:
+//      type = XR_INSERT  or  type = XR_PLACEHOLDER or XR_DELETE
+//      xid                   xid
+//      vallen
+//      val
+//     
+//
+
+#if 0
+#if TOKU_WINDOWS
+#pragma pack(push, 1)
+#endif
+//TODO: #1125 Add tests to verify ALL offsets (to verify we used 'pack' right).
+//            May need to add extra __attribute__((__packed__)) attributes within the definition
+struct __attribute__ ((__packed__)) leafentry {
+    u_int8_t  num_xrs;
+    u_int32_t keylen;
+    u_int32_t innermost_inserted_vallen;
+    union {
+        struct leafentry_committed {
+            u_int8_t key_val[0];     //Actual key, then actual val
+        } comm;
+        struct leafentry_provisional {
+            u_int8_t innermost_type;
+            TXNID    xid_outermost_uncommitted;
+            u_int8_t key_val_xrs[];  //Actual key,
+                                     //then actual innermost inserted val,
+                                     //then transaction records.
+        } prov;
+    } u;
+};
+#if TOKU_WINDOWS
+#pragma pack(pop)
+#endif
+#endif
+
+
+// Purpose of le_unpack() is to populate our private workspace (ule) with the
+// contents of the given packed leafentry (le).
+//
+// The key and value pointers stored in ule point directly into le (nothing is
+// copied), so le must stay valid and unmodified for as long as ule is in use.
+//
+// Multi-byte fields inside the transaction records sit at arbitrary byte
+// offsets, so they are read with memcpy() rather than through a cast pointer
+// (a misaligned access is undefined behavior).
+void
+le_unpack(ULE ule, LEAFENTRY le) {
+    //Read num_uxrs
+    ule->num_uxrs = le->num_xrs;
+    assert(ule->num_uxrs > 0);
+
+    //Read the keylen
+    ule->keylen = toku_dtoh32(le->keylen);
+
+    //Read the vallen of innermost insert
+    u_int32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen);
+
+    u_int8_t *p;
+    if (ule->num_uxrs == 1) {
+        //Unpack a 'committed leafentry' (No uncommitted transactions exist)
+        ule->keyp           = le->u.comm.key_val;
+        ule->uxrs[0].type   = XR_INSERT; //Must be or the leafentry would not exist
+        ule->uxrs[0].vallen = vallen_of_innermost_insert;
+        ule->uxrs[0].valp   = &le->u.comm.key_val[ule->keylen];
+        ule->uxrs[0].xid    = 0;          //Required.
+
+        //Set p to immediately after leafentry
+        p = &le->u.comm.key_val[ule->keylen + vallen_of_innermost_insert];
+    }
+    else {
+        //Unpack a 'provisional leafentry' (Uncommitted transactions exist)
+
+        //Read in type.
+        u_int8_t innermost_type = le->u.prov.innermost_type;
+        assert(!uxr_type_is_placeholder(innermost_type));
+
+        //Read in xid
+        TXNID xid_outermost_uncommitted = toku_dtoh64(le->u.prov.xid_outermost_uncommitted);
+
+        //Read pointer to key
+        ule->keyp = le->u.prov.key_val_xrs;
+
+        //Read pointer to innermost inserted val (immediately after key)
+        u_int8_t *valp_of_innermost_insert = &le->u.prov.key_val_xrs[ule->keylen];
+
+        //Point p to immediately after 'header'
+        p = &le->u.prov.key_val_xrs[ule->keylen + vallen_of_innermost_insert];
+
+        BOOL found_innermost_insert = FALSE;
+        int i; //Index in ULE.uxrs[]
+        //Loop inner to outer
+        for (i = ule->num_uxrs - 1; i >= 0; i--) {
+            UXR uxr = &ule->uxrs[i];
+
+            //Innermost's type is in header.
+            if (i < ule->num_uxrs - 1) {
+                //Not innermost, so load the type.
+                uxr->type = *p;
+                p += 1;
+            }
+            else {
+                //Innermost, load the type previously read from header
+                uxr->type = innermost_type;
+            }
+
+            //Committed txn id is implicit (0).  (i==0)
+            //Outermost uncommitted txnid is stored in header. (i==1)
+            if (i > 1) {
+                //Not committed nor outermost uncommitted, so load the xid.
+                //p is not necessarily aligned: copy the bytes out.
+                TXNID xid_disk;
+                memcpy(&xid_disk, p, sizeof(xid_disk));
+                uxr->xid = toku_dtoh64(xid_disk);
+                p += 8;
+            }
+            else if (i == 1) {
+                //Outermost uncommitted, load the xid previously read from header
+                uxr->xid = xid_outermost_uncommitted;
+            }
+            else {
+                // i == 0, committed entry
+                uxr->xid = 0;
+            }
+
+            if (uxr_is_insert(uxr)) {
+                if (found_innermost_insert) {
+                    //Not the innermost insert.  Load vallen/valp
+                    //p is not necessarily aligned: copy the bytes out.
+                    u_int32_t vallen_disk;
+                    memcpy(&vallen_disk, p, sizeof(vallen_disk));
+                    uxr->vallen = toku_dtoh32(vallen_disk);
+                    p += 4;
+
+                    uxr->valp = p;
+                    p += uxr->vallen;
+                }
+                else {
+                    //Innermost insert, load the vallen/valp previously read from header
+                    uxr->vallen = vallen_of_innermost_insert;
+                    uxr->valp   = valp_of_innermost_insert;
+                    found_innermost_insert = TRUE;
+                }
+            }
+            else {
+                //Deletes and placeholders carry no value; give the fields
+                //defined contents instead of leaving them uninitialized.
+                uxr->vallen = 0;
+                uxr->valp   = NULL;
+            }
+        }
+        assert(found_innermost_insert);
+    }
+#if ULE_DEBUG
+    size_t memsize = le_memsize_from_ule(ule);
+    assert(p == ((u_int8_t*)le) + memsize);
+#endif
+}
+
+// Purpose is to return a newly allocated leaf entry in packed format, or
+// return null if leaf entry should be destroyed (if no transaction records 
+// are for inserts).
+// Transaction records in packed le are stored inner to outer (first xr is innermost),
+// with some information extracted out of the transaction records into the header.
+// Transaction records in ule are stored outer to inner (uxr[0] is outermost).
+//
+// Returns 0 on success.  This includes the "destroy" case, in which
+// *new_leafentry_p is NULL and both sizes are reported as 0.
+// Returns ENOMEM if the new leafentry cannot be allocated; no output
+// parameter is written in that case.
+//
+// Multi-byte fields inside the transaction records land at arbitrary byte
+// offsets, so they are written with memcpy() rather than through a cast
+// pointer (a misaligned access is undefined behavior).
+int
+le_pack(ULE ule,                            // data to be packed into new leafentry
+	size_t *new_leafentry_memorysize, 
+	size_t *new_leafentry_disksize, 
+	LEAFENTRY * const new_leafentry_p,   // this is what this function creates
+	OMT omt, 
+	struct mempool *mp, 
+	void **maybe_free) {
+    int rval;
+    u_int8_t  index_of_innermost_insert = 0;
+    void     *valp_innermost_insert     = NULL;
+    u_int32_t vallen_innermost_insert   = 0;
+    {
+        //If there are no 'insert' entries, return NO leafentry.
+        //Loop inner to outer searching for innermost insert.
+        //uxrs[0] is outermost (committed)
+        int i;
+        for (i = ule->num_uxrs - 1; i >= 0; i--) {
+            if (uxr_is_insert(&ule->uxrs[i])) {
+                index_of_innermost_insert = i;
+                vallen_innermost_insert   = ule->uxrs[i].vallen;
+                valp_innermost_insert     = ule->uxrs[i].valp;
+                goto found_insert;
+            }
+        }
+        //Nothing to store: report "no leafentry" with well-defined sizes.
+        *new_leafentry_p          = NULL;
+        *new_leafentry_memorysize = 0;
+        *new_leafentry_disksize   = 0;
+        rval = 0;
+        goto cleanup;
+    }
+found_insert:;
+    size_t memsize = le_memsize_from_ule(ule);
+    LEAFENTRY new_leafentry = le_malloc(omt, mp, memsize, maybe_free);
+    if (new_leafentry==NULL) {
+        rval = ENOMEM;
+        goto cleanup;
+    }
+    //Universal data
+    new_leafentry->num_xrs = ule->num_uxrs;
+    new_leafentry->keylen  = toku_htod32(ule->keylen);
+    new_leafentry->innermost_inserted_vallen  = toku_htod32(vallen_innermost_insert);
+
+    u_int8_t *p;
+    //Type (committed/provisional) specific data
+    if (ule->num_uxrs == 1) {
+        //Pack a 'committed leafentry' (No uncommitted transactions exist)
+
+        //Store actual key.
+        memcpy(new_leafentry->u.comm.key_val, ule->keyp, ule->keylen);
+
+        //Store actual val of innermost insert immediately after actual key
+        memcpy(&new_leafentry->u.comm.key_val[ule->keylen],
+               valp_innermost_insert,
+               vallen_innermost_insert);
+
+        //Set p to after leafentry
+        p = &new_leafentry->u.comm.key_val[ule->keylen + vallen_innermost_insert];
+    }
+    else {
+        //Pack a 'provisional leafentry' (Uncommitted transactions exist)
+        //Store the type of the innermost transaction record
+        new_leafentry->u.prov.innermost_type = ule_get_innermost_uxr(ule)->type;
+
+        //uxrs[0] is the committed, uxrs[1] is the outermost non-committed
+        //Store the outermost non-committed xid
+        new_leafentry->u.prov.xid_outermost_uncommitted = toku_htod64(ule->uxrs[1].xid);
+
+        //Store actual key.
+        memcpy(new_leafentry->u.prov.key_val_xrs, ule->keyp, ule->keylen);
+
+        //Store actual val of innermost insert immediately after actual key
+        memcpy(&new_leafentry->u.prov.key_val_xrs[ule->keylen],
+               valp_innermost_insert,
+               vallen_innermost_insert);
+
+        //Set p to after 'header'
+        p = &new_leafentry->u.prov.key_val_xrs[ule->keylen + vallen_innermost_insert];
+
+        int i;  //index into ULE
+        //Loop inner to outer
+        for (i = ule->num_uxrs - 1; i >= 0; i--) {
+            UXR uxr = &ule->uxrs[i];
+
+            //Innermost's type is in header.
+            if (i < ule->num_uxrs - 1) {
+                //Not innermost, so record the type.
+                *p = uxr->type;
+                p += 1;
+            }
+
+            //Committed txn id is implicit (0).  (i==0)
+            //Outermost uncommitted txnid is stored in header. (i==1)
+            if (i > 1) {
+                //Not committed nor outermost uncommitted, so record the xid.
+                //p is not necessarily aligned: copy the bytes in.
+                TXNID xid_disk = toku_htod64(uxr->xid);
+                memcpy(p, &xid_disk, sizeof(xid_disk));
+                p += 8;
+            }
+
+            //Innermost insert's length and value are stored in header.
+            if (uxr_is_insert(uxr) && i != index_of_innermost_insert) {
+                //Is an insert, and not the innermost insert, so store length/val
+                //p is not necessarily aligned: copy the bytes in.
+                u_int32_t vallen_disk = toku_htod32(uxr->vallen);
+                memcpy(p, &vallen_disk, sizeof(vallen_disk));
+                p += 4;
+
+                memcpy(p, uxr->valp, uxr->vallen); //Store actual val
+                p += uxr->vallen;
+            }
+        }
+    }
+    //p points to first unused byte after packed leafentry
+
+    size_t bytes_written = (size_t)p - (size_t)new_leafentry;
+    assert(bytes_written == memsize);
+#if ULE_DEBUG
+    if (omt) { //Disable recursive debugging.
+        size_t memsize_verify = leafentry_memsize(new_leafentry);
+        assert(memsize_verify == memsize);
+
+        ULE_S ule_tmp;
+        le_unpack(&ule_tmp, new_leafentry);
+
+        memsize_verify = le_memsize_from_ule(&ule_tmp);
+        assert(memsize_verify == memsize);
+        //Debugging code inside le_unpack will repack and verify it is the same.
+
+        LEAFENTRY le_copy;
+
+        int r_tmp = le_pack(&ule_tmp, &memsize_verify, &memsize_verify,
+                            &le_copy, NULL, NULL, NULL);
+        assert(r_tmp==0);
+        assert(memsize_verify == memsize);
+
+        assert(memcmp(new_leafentry, le_copy, memsize)==0);
+        toku_free(le_copy);
+    }
+#endif
+
+    *new_leafentry_p = (LEAFENTRY)new_leafentry;
+    *new_leafentry_memorysize = memsize;
+    *new_leafentry_disksize   = memsize;
+    rval = 0;
+cleanup:
+    return rval;
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// Following functions provide convenient access to a packed leafentry.
+
+//Compute the number of bytes the packed form of ule occupies.
+//Requires:
+//  Leafentry that ule represents should not be destroyed (is not just all deletes)
+size_t
+le_memsize_from_ule (ULE ule) {
+    assert(ule->num_uxrs);
+
+    if (ule->num_uxrs == 1) {
+        //Committed form: fixed header, then key, then val.
+        assert(uxr_is_insert(&ule->uxrs[0]));
+        size_t committed_size = 1                    //num_uxrs
+                               +4                    //keylen
+                               +4                    //vallen
+                               +ule->keylen          //actual key
+                               +ule->uxrs[0].vallen; //actual val
+        return committed_size;
+    }
+
+    //Provisional form: everything except the values of the inserts ...
+    size_t total = 1                    //num_uxrs
+                  +4                    //keylen
+                  +ule->keylen          //actual key
+                  +1*ule->num_uxrs      //types
+                  +8*(ule->num_uxrs-1); //txnids
+    //... plus a length and a value for every insert record.
+    u_int8_t idx;
+    for (idx = 0; idx < ule->num_uxrs; idx++) {
+        UXR record = &ule->uxrs[idx];
+        if (!uxr_is_insert(record))
+            continue;
+        total += 4;              //vallen
+        total += record->vallen; //actual val
+    }
+    return total;
+}
+
+// Size in bytes of a committed (num_xrs == 1) packed leafentry with the given
+// key and value lengths.  'le' is only used inside sizeof, so it is never
+// evaluated.  The length arguments are parenthesized so that an expression
+// argument cannot rebind against the surrounding '+'.
+#define LE_COMMITTED_MEMSIZE(le, keylen, vallen)             \
+    (sizeof((le)->num_xrs)                   /* num_uxrs */  \
+    +sizeof((le)->keylen)                    /* keylen */    \
+    +sizeof((le)->innermost_inserted_vallen) /* vallen */    \
+    +(keylen)                               /* actual key */ \
+    +(vallen))                              /* actual val */
+
+// Compute the size in bytes of a packed leafentry by walking its transaction
+// records, without unpacking it into a ULE.
+//
+// Multi-byte fields inside the transaction records sit at arbitrary byte
+// offsets, so the value length is read with memcpy() rather than through a
+// cast pointer (a misaligned access is undefined behavior).
+size_t
+leafentry_memsize (LEAFENTRY le) {
+    size_t rval = 0;
+
+    //Read num_uxrs
+    u_int8_t num_uxrs = le->num_xrs;
+    assert(num_uxrs > 0);
+
+    //Read the keylen
+    u_int32_t keylen = toku_dtoh32(le->keylen);
+
+    //Read the vallen of innermost insert
+    u_int32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen);
+
+    if (num_uxrs == 1) {
+        //Committed version (no uncommitted records)
+        rval = LE_COMMITTED_MEMSIZE(le, keylen, vallen_of_innermost_insert);
+    }
+    else {
+        //A 'provisional leafentry' (Uncommitted transactions exist)
+        //Read in type.
+        u_int8_t innermost_type = le->u.prov.innermost_type;
+        assert(!uxr_type_is_placeholder(innermost_type));
+        //Set p to immediately after key,val (beginning of transaction records)
+        u_int8_t *p = &le->u.prov.key_val_xrs[keylen + vallen_of_innermost_insert];
+
+        BOOL found_innermost_insert = FALSE;
+        int i; //would be index in ULE.uxrs[] were we to unpack
+        //Loop inner to outer
+        UXR_S current_uxr;
+        UXR uxr = &current_uxr;
+        for (i = num_uxrs - 1; i >= 0; i--) {
+            //Innermost's type is in header.
+            if (i < num_uxrs - 1) {
+                //Not innermost, so load the type.
+                uxr->type = *p;
+                p += 1;
+            }
+            else {
+                //Innermost, load the type previously read from header
+                uxr->type = innermost_type;
+            }
+
+            //Committed txn id is implicit (0).  (i==0)
+            //Outermost uncommitted txnid is stored in header. (i==1)
+            if (i > 1) {
+                //Not committed nor outermost uncommitted, so skip over the xid.
+                p += 8;
+            }
+
+            if (uxr_is_insert(uxr)) {
+                if (found_innermost_insert) {
+                    //Not the innermost insert.  Load vallen and skip over the val.
+                    //p is not necessarily aligned: copy the bytes out.
+                    u_int32_t vallen_disk;
+                    memcpy(&vallen_disk, p, sizeof(vallen_disk));
+                    uxr->vallen = toku_dtoh32(vallen_disk);
+                    p += 4;
+                    p += uxr->vallen;
+                }
+                else
+                    found_innermost_insert = TRUE;
+            }
+        }
+        assert(found_innermost_insert);
+        rval = (size_t)p - (size_t)le;
+    }
+#if ULE_DEBUG
+    ULE_S ule;
+    le_unpack(&ule, le);
+    size_t slow_rval = le_memsize_from_ule(&ule);
+    assert(slow_rval == rval);
+#endif
+    return rval;
+}
+
+// The on-disk size of a leafentry is reported as its in-memory size.
+size_t
+leafentry_disksize (LEAFENTRY le) {
+    size_t disksize = leafentry_memsize(le);
+    return disksize;
+}
+
+
+// le is normally immutable.  This is the only exception.
+//
+// Converts a provisional leafentry (num_xrs > 1) in place into a committed
+// leafentry that holds the same key and the innermost inserted value:
+// num_xrs is set to 1 and the key and value are slid down over the
+// provisional-only header fields.  The resulting size is reported through
+// both new_leafentry_memorysize and new_leafentry_disksize.
+//
+// When ULE_DEBUG is enabled, the result is cross-checked against a leafentry
+// produced by applying an ordinary commit message to the original le.
+void
+le_full_promotion(LEAFENTRY le,
+                  size_t *new_leafentry_memorysize, 
+                  size_t *new_leafentry_disksize) {
+#if ULE_DEBUG
+    // Create a new le ("slow_le") using normal commit message for comparison.
+    // Creation of slow_le must be done first, because le is being modified.
+    assert(le);
+    assert(le->num_xrs > 1); //Not committed
+    assert(!le_is_provdel(le));
+    assert(le_outermost_uncommitted_xid(le) != 0);
+
+    size_t old_memsize = leafentry_memsize(le);
+    u_int32_t old_keylen;
+    u_int32_t old_vallen;
+    void *old_key = le_key_and_len(le, &old_keylen);
+    void *old_val = le_innermost_inserted_val_and_len(le, &old_vallen);
+
+    assert(old_key    == le_latest_key(le));
+    assert(old_keylen == le_latest_keylen(le));
+    assert(old_val    == le_latest_val(le));
+    assert(old_vallen == le_latest_vallen(le));
+
+    //Save copies for verification.
+    old_key = toku_memdup(old_key, old_keylen);
+    assert(old_key);
+    old_val = toku_memdup(old_val, old_vallen);
+    assert(old_val);
+
+    BRT_MSG_S slow_full_promotion_msg = {
+        .type = BRT_COMMIT_ANY,
+        .xids = xids_get_root_xids(),
+        .u.id = {
+            .key = NULL,
+            .val = NULL,
+        }
+    };
+    size_t slow_new_memsize;
+    size_t slow_new_disksize;
+    LEAFENTRY slow_le;
+    int r_apply = apply_msg_to_leafentry(&slow_full_promotion_msg,
+                                         le,
+                                         &slow_new_memsize, &slow_new_disksize,
+                                         &slow_le,
+                                         NULL, NULL, NULL);
+    assert(r_apply == 0);
+    assert(slow_new_memsize == slow_new_disksize);
+    assert(slow_new_memsize < old_memsize);
+    assert(slow_le);
+#endif
+
+    //Save keylen for later use.
+    u_int32_t keylen = le_keylen(le);
+    //Save innermost inserted vallen for later use.
+    u_int32_t vallen = le_innermost_inserted_vallen(le);
+
+    //Set as committed.
+    le->num_xrs = 1;
+
+    //Keylen is unchanged but we need to extract it.
+    //Innermost inserted vallen is unchanged but we need to extract it.
+
+    //Move key and value using memmove. memcpy does not support overlapping memory.
+
+    //Move the key
+    memmove(le->u.comm.key_val,          le->u.prov.key_val_xrs, keylen);
+
+    //Move the val
+    memmove(&le->u.comm.key_val[keylen], &le->u.prov.key_val_xrs[keylen], vallen);
+
+    size_t new_memsize = LE_COMMITTED_MEMSIZE(le, keylen, vallen);
+    *new_leafentry_memorysize = new_memsize;
+    *new_leafentry_disksize   = new_memsize;
+
+#if ULE_DEBUG
+    // now compare le with "slow_le" created via normal commit message.
+    assert(*new_leafentry_memorysize == slow_new_memsize);  //Size same
+    assert(*new_leafentry_disksize   == slow_new_disksize); //Size same
+    assert(memcmp(le, slow_le, slow_new_memsize) == 0);     //Bitwise the same.
+    assert(!le_is_provdel(le));
+    assert(le_outermost_uncommitted_xid(le) == 0);
+
+    //Verify key(len), val(len) unchanged.
+    u_int32_t new_keylen;
+    u_int32_t new_vallen;
+    void *new_key = le_key_and_len(le, &new_keylen);
+    void *new_val = le_innermost_inserted_val_and_len(le, &new_vallen);
+    assert(new_key    == le_latest_key(le));
+    assert(new_keylen == le_latest_keylen(le));
+    assert(new_val    == le_latest_val(le));
+    assert(new_vallen == le_latest_vallen(le));
+
+    assert(new_keylen == old_keylen);
+    assert(new_vallen == old_vallen);
+    //Compare (memcmp), do not copy: memcpy returns its destination pointer,
+    //so asserting on it would always fail and would clobber the data under test.
+    assert(memcmp(new_key, old_key, old_keylen) == 0);
+    assert(memcmp(new_val, old_val, old_vallen) == 0);
+
+    toku_free(slow_le);
+    toku_free(old_key);
+    toku_free(old_val);
+#endif
+}
+
+
+// Nonzero iff the leafentry is provisional (num_xrs != 1) and its innermost
+// transaction record is a delete.
+int le_is_provdel(LEAFENTRY le) {
+    u_int8_t num_xrs = le->num_xrs;
+    int rval = 0;
+    if (num_xrs != 1) {
+        rval = uxr_type_is_delete(le->u.prov.innermost_type);
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    assert(num_xrs);
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    int slow_rval = uxr_is_delete(innermost);
+    assert((rval==0) == (slow_rval==0));
+#endif
+    return rval;
+}
+
+//Provisionally deleted leafentry: return NULL and set *len to 0.
+//Otherwise:                       return the key and set *len to keylen.
+void*
+le_latest_key_and_len (LEAFENTRY le, u_int32_t *len) {
+    u_int8_t num_xrs = le->num_xrs;
+    void *keyp;
+    if (num_xrs != 1 && uxr_type_is_delete(le->u.prov.innermost_type)) {
+        //Innermost record is a delete: there is no latest key.
+        keyp = NULL;
+        *len = 0;
+    }
+    else {
+        *len = toku_dtoh32(le->keylen);
+        if (num_xrs == 1)
+            keyp = le->u.comm.key_val;
+        else
+            keyp = le->u.prov.key_val_xrs;
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    assert(num_xrs);
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    void     *slow_keyp = NULL;
+    u_int32_t slow_len  = 0;
+    if (uxr_is_insert(innermost)) {
+        slow_keyp = ule.keyp;
+        slow_len  = ule.keylen;
+    }
+    assert(slow_keyp == le_latest_key(le));
+    assert(slow_len  == le_latest_keylen(le));
+    assert(keyp==slow_keyp);
+    assert(*len==slow_len);
+#endif
+    return keyp;
+}
+
+//Return the key, or NULL when the leafentry is provisionally deleted.
+void*
+le_latest_key (LEAFENTRY le) {
+    u_int8_t num_xrs = le->num_xrs;
+    void *rval;
+    if (num_xrs == 1) {
+        rval = le->u.comm.key_val;
+    }
+    else if (uxr_type_is_delete(le->u.prov.innermost_type)) {
+        rval = NULL;
+    }
+    else {
+        rval = le->u.prov.key_val_xrs;
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    assert(num_xrs);
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    void *slow_rval = NULL;
+    if (uxr_is_insert(innermost))
+        slow_rval = ule.keyp;
+    assert(rval==slow_rval);
+#endif
+    return rval;
+}
+
+//Return the key length, or 0 when the leafentry is provisionally deleted.
+u_int32_t
+le_latest_keylen (LEAFENTRY le) {
+    u_int8_t num_xrs = le->num_xrs;
+    u_int32_t rval;
+    if (num_xrs > 1 && uxr_type_is_delete(le->u.prov.innermost_type)) {
+        rval = 0;
+    }
+    else {
+        rval = toku_dtoh32(le->keylen);
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    assert(num_xrs);
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    u_int32_t slow_rval = 0;
+    if (uxr_is_insert(innermost))
+        slow_rval = ule.keylen;
+    assert(rval==slow_rval);
+#endif
+    return rval;
+}
+
+//Provisionally deleted leafentry: return NULL and set *len to 0.
+//Otherwise: return the innermost inserted value and set *len to its length.
+void*
+le_latest_val_and_len (LEAFENTRY le, u_int32_t *len) {
+    u_int8_t num_xrs = le->num_xrs;
+    u_int32_t keylen = toku_dtoh32(le->keylen);
+    void *valp;
+    if (num_xrs != 1 && uxr_type_is_delete(le->u.prov.innermost_type)) {
+        //Innermost record is a delete: there is no latest value.
+        valp = NULL;
+        *len = 0;
+    }
+    else {
+        //The value is stored immediately after the key.
+        *len = toku_dtoh32(le->innermost_inserted_vallen);
+        if (num_xrs == 1)
+            valp = &le->u.comm.key_val[keylen];
+        else
+            valp = &le->u.prov.key_val_xrs[keylen];
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    assert(num_xrs);
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    void     *slow_valp = NULL;
+    u_int32_t slow_len  = 0;
+    if (uxr_is_insert(innermost)) {
+        slow_valp = innermost->valp;
+        slow_len  = innermost->vallen;
+    }
+    assert(slow_valp == le_latest_val(le));
+    assert(slow_len == le_latest_vallen(le));
+    assert(valp==slow_valp);
+    assert(*len==slow_len);
+#endif
+    return valp;
+}
+
+//Return the innermost inserted value, or NULL when the leafentry is
+//provisionally deleted.
+void*
+le_latest_val (LEAFENTRY le) {
+    u_int8_t num_xrs = le->num_xrs;
+    u_int32_t keylen = toku_dtoh32(le->keylen);
+    void *rval;
+    if (num_xrs == 1) {
+        rval = &le->u.comm.key_val[keylen];
+    }
+    else if (uxr_type_is_delete(le->u.prov.innermost_type)) {
+        rval = NULL;
+    }
+    else {
+        rval = &le->u.prov.key_val_xrs[keylen];
+    }
+#if ULE_DEBUG
+    //Cross-check against the fully unpacked form.
+    ULE_S ule;
+    le_unpack(&ule, le);
+    UXR innermost = ule_get_innermost_uxr(&ule);
+    void *slow_rval = NULL;
+    if (uxr_is_insert(innermost))
+        slow_rval = innermost->valp;
+    assert(rval==slow_rval);
+#endif
+    return rval;
+}
+
+// Length of the value seen through the innermost transaction record: the
+// stored innermost-inserted value length, or 0 when the entry holds more than
+// one record and the innermost record type is a delete.
+u_int32_t
+le_latest_vallen (LEAFENTRY le) {
+    u_int8_t  record_count = le->num_xrs;
+    u_int32_t stored_len   = toku_dtoh32(le->innermost_inserted_vallen);
+    u_int32_t latest_len;
+    if (record_count > 1 && uxr_type_is_delete(le->u.prov.innermost_type))
+        latest_len = 0;
+    else
+        latest_len = stored_len;
+#if ULE_DEBUG
+    // Cross-check the fast path against a full unpack of the leafentry.
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    UXR innermost = ule_get_innermost_uxr(&unpacked);
+    u_int32_t expected_len = uxr_is_insert(innermost) ? innermost->vallen : 0;
+    assert(latest_len==expected_len);
+#endif
+    return latest_len;
+}
+
+// Return the key pointer and store the key length in *len, whatever the type
+// of the innermost transaction record.
+void*
+le_key_and_len (LEAFENTRY le, u_int32_t *len) {
+    u_int8_t record_count = le->num_xrs;
+    *len = toku_dtoh32(le->keylen);
+    // The key sits at the start of whichever key/value array this entry uses.
+    void *key = (record_count == 1) ? (void*)le->u.comm.key_val
+                                    : (void*)le->u.prov.key_val_xrs;
+#if ULE_DEBUG
+    // Cross-check against a full unpack and against the single-result accessors.
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    void     *expected_keyp = unpacked.keyp;
+    u_int32_t expected_len  = unpacked.keylen;
+    assert(expected_keyp == le_key(le));
+    assert(expected_len == le_keylen(le));
+    assert(key==expected_keyp);
+    assert(*len==expected_len);
+#endif
+    return key;
+}
+
+
+// Return a pointer to the key, whatever the type of the innermost
+// transaction record.
+void*
+le_key (LEAFENTRY le) {
+    u_int8_t record_count = le->num_xrs;
+    // The key sits at the start of whichever key/value array this entry uses.
+    void *key = (record_count == 1) ? (void*)le->u.comm.key_val
+                                    : (void*)le->u.prov.key_val_xrs;
+#if ULE_DEBUG
+    // Cross-check the fast path against a full unpack of the leafentry.
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    assert(key==unpacked.keyp);
+#endif
+    return key;
+}
+
+// Return the stored key length, converted to host byte order.
+u_int32_t
+le_keylen (LEAFENTRY le) {
+    u_int32_t key_bytes = toku_dtoh32(le->keylen);
+#if ULE_DEBUG
+    // Cross-check the fast path against a full unpack of the leafentry.
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    assert(key_bytes==unpacked.keylen);
+#endif
+    return key_bytes;
+}
+
+// Return a pointer to the innermost inserted value and store its length in
+// *len.  Unlike le_latest_val_and_len, the type of the innermost record is not
+// consulted.  The value is located keylen bytes into the key/value array.
+void*
+le_innermost_inserted_val_and_len (LEAFENTRY le, u_int32_t *len) {
+    u_int8_t  record_count = le->num_xrs;
+    void     *inserted_val;
+    u_int32_t key_bytes    = toku_dtoh32(le->keylen);
+    *len = toku_dtoh32(le->innermost_inserted_vallen);
+    if (record_count != 1)
+        inserted_val = &le->u.prov.key_val_xrs[key_bytes];
+    else
+        inserted_val = &le->u.comm.key_val[key_bytes];
+#if ULE_DEBUG
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    // Walk from the innermost record outward until an insert is found.
+    u_int8_t remaining = unpacked.num_uxrs;
+    while (remaining > 0 && !uxr_is_insert(&unpacked.uxrs[remaining-1]))
+        remaining--;
+    assert(remaining > 0);
+    UXR inserted = &unpacked.uxrs[remaining-1];
+    void     *expected_valp = inserted->valp;
+    u_int32_t expected_len  = inserted->vallen;
+    assert(expected_valp == le_innermost_inserted_val(le));
+    assert(expected_len == le_innermost_inserted_vallen(le));
+    assert(inserted_val==expected_valp);
+    assert(*len==expected_len);
+#endif
+    return inserted_val;
+}
+
+
+// Return a pointer to the innermost inserted value; the type of the innermost
+// record is not consulted.
+void*
+le_innermost_inserted_val (LEAFENTRY le) {
+    u_int8_t  record_count = le->num_xrs;
+    void     *inserted_val;
+    u_int32_t key_bytes    = toku_dtoh32(le->keylen);
+    if (record_count != 1)
+        inserted_val = &le->u.prov.key_val_xrs[key_bytes];
+    else
+        inserted_val = &le->u.comm.key_val[key_bytes];
+#if ULE_DEBUG
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    // Walk from the innermost record outward until an insert is found.
+    u_int8_t remaining = unpacked.num_uxrs;
+    while (remaining > 0 && !uxr_is_insert(&unpacked.uxrs[remaining-1]))
+        remaining--;
+    assert(remaining > 0);
+    void *expected = unpacked.uxrs[remaining-1].valp;
+    assert(inserted_val==expected);
+#endif
+    return inserted_val;
+}
+
+// Return the stored length of the innermost inserted value, converted to
+// host byte order.
+u_int32_t
+le_innermost_inserted_vallen (LEAFENTRY le) {
+    u_int32_t val_bytes = toku_dtoh32(le->innermost_inserted_vallen);
+#if ULE_DEBUG
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    // Walk from the innermost record outward until an insert is found.
+    u_int8_t remaining = unpacked.num_uxrs;
+    while (remaining > 0 && !uxr_is_insert(&unpacked.uxrs[remaining-1]))
+        remaining--;
+    assert(remaining > 0);
+    u_int32_t expected = unpacked.uxrs[remaining-1].vallen;
+    assert(val_bytes==expected);
+#endif
+    return val_bytes;
+}
+
+// Return the xid of the outermost uncommitted transaction recorded in the
+// leafentry, or 0 when the entry holds exactly one record.
+u_int64_t 
+le_outermost_uncommitted_xid (LEAFENTRY le) {
+    u_int8_t record_count = le->num_xrs;
+    TXNID outermost = 0;
+    if (record_count != 1)
+        outermost = toku_dtoh64(le->u.prov.xid_outermost_uncommitted);
+#if ULE_DEBUG
+    // In the unpacked form the outermost uncommitted record is uxrs[1].
+    assert(record_count);
+    ULE_S unpacked;
+    le_unpack(&unpacked, le);
+    TXNID expected = (unpacked.num_uxrs > 1) ? unpacked.uxrs[1].xid : 0;
+    assert(outermost==expected);
+#endif
+    return outermost;
+}
+
+
+//Optimization not required.  This is a debug only function.
+//Print a leafentry out in human-readable format.
+//  outf: stream that receives all output.
+//  le:   leafentry to print; may be NULL, in which case "NULL" is printed.
+//Output is "{key=<key>" followed by one line per transaction record, outermost
+//first and indented one extra column per nesting level, then "}".
+//Each record prints as P (placeholder), D (delete) or I (insert, with value).
+//Always returns 0.
+int
+print_leafentry (FILE *outf, LEAFENTRY le) {
+    // Test for NULL before unpacking, so a NULL leafentry never reaches
+    // le_unpack, and report it on outf like all other output.
+    if (!le) { fprintf(outf, "NULL"); return 0; }
+    ULE_S ule;
+    le_unpack(&ule, le);
+    assert(ule.num_uxrs > 0);
+    fprintf(outf, "{key=");
+    toku_print_BYTESTRING(outf, ule.keylen, ule.keyp);
+    u_int8_t i;
+    for (i = 0; i < ule.num_uxrs; i++) {
+        fprintf(outf, "\n%*s", i+1, " "); //Nested indenting
+        UXR uxr = &ule.uxrs[i];
+
+        if (uxr_is_placeholder(uxr))
+            fprintf(outf, "P: xid=%016" PRIx64, uxr->xid);
+        else if (uxr_is_delete(uxr))
+            fprintf(outf, "D: xid=%016" PRIx64, uxr->xid);
+        else {
+            assert(uxr_is_insert(uxr));
+            fprintf(outf, "I: xid=%016" PRIx64 " val=", uxr->xid);
+            toku_print_BYTESTRING(outf, uxr->vallen, uxr->valp);
+        }
+    }
+    fprintf(outf, "}");
+    return 0;
+}
+
+/////////////////////////////////////////////////////////////////////////////////
+// This layer of abstraction (ule_xxx) knows the structure of the unpacked
+// leafentry and no other structure.
+//
+
+// ule constructor: set up ule as an entry for the given key that holds
+// exactly one transaction record, a copy of committed_delete.
+// Note that transaction 0 is explicit in the ule.
+static void 
+ule_init_empty_ule(ULE ule, u_int32_t keylen, void * keyp) {
+    ule->uxrs[0]  = committed_delete;
+    ule->num_uxrs = 1;
+    ule->keyp     = keyp;
+    ule->keylen   = keylen;
+}
+
+// Return the smaller of two 8-bit unsigned values.
+static inline u_int8_t 
+min_u8(u_int8_t a, u_int8_t b) {
+    if (a < b)
+        return a;
+    return b;
+}
+
+///////////////////
+// Implicit promotion logic:
+//
+// If the leafentry has already been promoted, there is nothing to do.
+// We have two transaction stacks (one from message, one from leaf entry).
+// We want to implicitly promote transactions newer than (but not including) 
+// the innermost common ancestor (ICA) of the two stacks of transaction ids.  We 
+// know that this is the right thing to do because each transaction with an id
+// greater (later) than the ICA must have been either committed or aborted.
+// If it was aborted then we would have seen an abort message and removed the
+// xid from the stack of transaction records.  So any transaction still on the 
+// leaf entry stack must have been successfully promoted.
+// 
+// After finding the ICA, promote the transactions later than the ICA by copying
+// the value and type from the innermost transaction record of the leafentry to
+// the transaction record of the ICA, keeping the transaction id of the ICA.
+// Outermost xid is zero for both ule and xids<>
+//
+// Find the innermost common ancestor (ICA) of the leafentry's record stack and
+// the message's xid stack, then promote the leafentry's innermost record onto
+// the ICA's record when the ICA is not already the innermost record.
+static void 
+ule_do_implicit_promotions(ULE ule, XIDS xids) {
+    //Optimization for (most) common case.
+    //Nothing to promote when the leafentry holds a single record.
+    if (ule->num_uxrs <= 1)
+        return;
+
+    //Deepest index present in both stacks; the ICA unless a mismatch is found.
+    u_int8_t last_shared = min_u8(ule->num_uxrs, xids_get_num_xids(xids)) - 1;
+    u_int8_t ica         = last_shared;
+    u_int8_t depth       = 1;   //xids at index 0 are defined to be equal.
+    while (depth <= last_shared) {
+        TXNID msg_xid = xids_get_xid(xids, depth);
+        TXNID ule_xid = ule_get_xid(ule, depth);
+        if (msg_xid != ule_xid) {
+            //The ICA is the last level at which both stacks agreed.
+            ica = depth - 1;
+            break;
+        }
+        depth++;
+    }
+
+    //If the ICA is already the innermost uxr, there is nothing to promote.
+    if (ica < ule->num_uxrs - 1)
+        ule_promote_innermost_to_index(ule, ica);
+}
+
+// Promote the innermost transaction record to position `index`: the record at
+// `index` takes over the innermost record's type (and value, for an insert)
+// while keeping its own xid, and every record inside `index` is discarded.
+// `index` must be strictly outside the innermost record, and the innermost
+// record must not be a placeholder (both asserted).
+static void 
+ule_promote_innermost_to_index(ULE ule, u_int8_t index) {
+    assert(ule->num_uxrs - 1 > index);
+    UXR source = ule_get_innermost_uxr(ule);
+    assert(!uxr_is_placeholder(source));
+    TXNID kept_xid = ule->uxrs[index].xid;
+    //Truncate the stack so the next push lands on slot `index`.
+    ule->num_uxrs = index;
+    if (!uxr_is_delete(source))
+        ule_push_insert_uxr(ule, kept_xid, source->vallen, source->valp);
+    else
+        ule_push_delete_uxr(ule, kept_xid);
+}
+
+///////////////////
+//  All ule_apply_xxx operations are done after implicit promotions,
+//  so the innermost transaction record in the leafentry is the ICA.
+//
+
+
+// Apply an insert message to this leafentry: prepare the record stack for a
+// new record, then push an insert carrying the message's innermost xid
+// (the transaction doing the insert) and the given value.
+static void 
+ule_apply_insert(ULE ule, XIDS xids, u_int32_t vallen, void * valp) {
+    ule_prepare_for_new_uxr(ule, xids);
+    ule_push_insert_uxr(ule, xids_get_innermost_xid(xids), vallen, valp);
+}
+
+// Apply a delete message to this leafentry: prepare the record stack for a
+// new record, then push a delete carrying the message's innermost xid
+// (the transaction doing the delete).
+static void 
+ule_apply_delete(ULE ule, XIDS xids) {
+    ule_prepare_for_new_uxr(ule, xids);
+    ule_push_delete_uxr(ule, xids_get_innermost_xid(xids));
+}
+
+// Make the record stack ready to receive a new record from the message's
+// innermost transaction.
+// If that transaction already owns the innermost record, discard that record
+// (whatever the transaction did earlier is superseded).
+// Otherwise add placeholders if necessary: the transaction may be nested within
+// outer transactions that are newer than the newest (innermost) transaction in
+// the leafentry, and those outer transactions are recorded with placeholders.
+static void 
+ule_prepare_for_new_uxr(ULE ule, XIDS xids) {
+    TXNID writer_xid = xids_get_innermost_xid(xids);
+    TXNID top_xid    = ule_get_innermost_xid(ule);
+    if (top_xid != writer_xid) {
+        ule_add_placeholders(ule, xids);
+        return;
+    }
+    ule_remove_innermost_uxr(ule);
+}
+
+// Apply an abort message to this leafentry.
+// The aborting transaction is the innermost xid of the message; it must not be
+// the root transaction (asserted).
+// If it does not own the innermost record, the leafentry is left unchanged.
+// If it does, undo its work: pop its record, then pop any placeholders that
+// became innermost as a result (a placeholder may not be the innermost record).
+static void 
+ule_apply_abort(ULE ule, XIDS xids) {
+    TXNID aborting_xid = xids_get_innermost_xid(xids);
+    assert(aborting_xid!=0);
+    BOOL owns_innermost = (ule_get_innermost_xid(ule) == aborting_xid);
+    if (owns_innermost) {
+        assert(ule->num_uxrs>1);
+        ule_remove_innermost_uxr(ule);
+        ule_remove_innermost_placeholders(ule);
+    }
+    assert(ule->num_uxrs > 0);
+}
+
+// Apply a commit message to this leafentry.
+// The committing transaction is the innermost xid of the message; it must not
+// be the root transaction (asserted).
+// If it does not own the innermost record, the leafentry is left unchanged.
+// If it does, promote its record one level out, onto the second-innermost
+// record (which keeps its own xid).
+void ule_apply_commit(ULE ule, XIDS xids) {
+    TXNID committing_xid = xids_get_innermost_xid(xids);
+    assert(committing_xid!=0);
+    if (ule_get_innermost_xid(ule) != committing_xid)
+        return;
+    //uxrs[num_uxrs-1] belongs to the committing transaction,
+    //uxrs[num_uxrs-2] is the record it is promoted onto.
+    assert(ule->num_uxrs > 1);
+    ule_promote_innermost_to_index(ule, ule->num_uxrs-2);
+}
+
+///////////////////
+// Helper functions called from the functions above:
+//
+
+// Push an insert record for transaction `xid` onto the top of the record
+// stack.  Only the pointer `valp` is stored; the value bytes are not copied.
+static void 
+ule_push_insert_uxr(ULE ule, TXNID xid, u_int32_t vallen, void * valp) {
+    UXR slot     = ule_get_first_empty_uxr(ule);
+    slot->type   = XR_INSERT;
+    slot->xid    = xid;
+    slot->valp   = valp;
+    slot->vallen = vallen;
+    ule->num_uxrs += 1;
+}
+
+// Purpose is to record a delete for this transaction by pushing a delete
+// record with the given xid onto the top of the record stack.
+// A delete carries no value, so vallen/valp are cleared rather than left
+// holding whatever a previous occupant of the slot stored.
+// No special handling is done here for the root transaction (xid 0): the
+// record is pushed exactly as for any other xid.
+static void 
+ule_push_delete_uxr(ULE ule, TXNID xid) {
+    UXR uxr     = ule_get_first_empty_uxr(ule);
+    uxr->xid    = xid;
+    uxr->vallen = 0;
+    uxr->valp   = NULL;
+    uxr->type   = XR_DELETE;
+    ule->num_uxrs++;
+}
+
+// Purpose is to push a placeholder on the top of the leafentry's transaction stack.
+// A placeholder carries no value, so vallen/valp are cleared rather than left
+// holding whatever a previous occupant of the slot stored.
+static void 
+ule_push_placeholder_uxr(ULE ule, TXNID xid) {
+    UXR uxr     = ule_get_first_empty_uxr(ule);
+    uxr->xid    = xid;
+    uxr->vallen = 0;
+    uxr->valp   = NULL;
+    uxr->type   = XR_PLACEHOLDER;
+    ule->num_uxrs++;
+}
+
+// Return the innermost transaction record, uxrs[num_uxrs-1].
+// The ule must hold at least one record (asserted).
+static UXR 
+ule_get_innermost_uxr(ULE ule) {
+    assert(ule->num_uxrs > 0);
+    return &ule->uxrs[ule->num_uxrs - 1];
+}
+
+// Return the first unused transaction record slot, uxrs[num_uxrs].
+// The ule must not already be full (asserted against MAX_TRANSACTION_RECORDS).
+static UXR 
+ule_get_first_empty_uxr(ULE ule) {
+    assert(ule->num_uxrs < MAX_TRANSACTION_RECORDS);
+    return &ule->uxrs[ule->num_uxrs];
+}
+
+// Pop the leafentry's record stack, undoing whatever the innermost
+// transaction did.  The stack may become empty: the committed delete can be
+// removed at the first insert.
+static void 
+ule_remove_innermost_uxr(ULE ule) {
+    assert(ule->num_uxrs > 0);
+    ule->num_uxrs -= 1;
+}
+
+// Return the xid of the innermost transaction record.
+static TXNID 
+ule_get_innermost_xid(ULE ule) {
+    return ule_get_xid(ule, ule->num_uxrs - 1);
+}
+
+// Return the xid of the transaction record at `index` (0 is outermost).
+// `index` must refer to a valid record (asserted).
+static TXNID 
+ule_get_xid(ULE ule, u_int8_t index) {
+    assert(index < ule->num_uxrs);
+    return ule->uxrs[index].xid;
+}
+
+// Pop placeholders off the top of the record stack until the innermost record
+// is not a placeholder.  Calling it again changes nothing (idempotent).
+// A placeholder makes no logical sense as the innermost recorded transaction,
+// so placeholders on top of the stack are not legal.
+static void 
+ule_remove_innermost_placeholders(ULE ule) {
+    UXR top;
+    for (top = ule_get_innermost_uxr(ule);
+         uxr_is_placeholder(top);
+         top = ule_get_innermost_uxr(ule)) {
+        // outermost is committed, cannot be placeholder
+        assert(ule->num_uxrs > 1);
+        ule_remove_innermost_uxr(ule);
+    }
+}
+
+
+// Push a placeholder for every transaction in the message's xid stack that
+// lies strictly between the leafentry's innermost record (the ICA) and the
+// message's innermost transaction.  Pushes nothing when the ICA is the
+// message's innermost transaction.
+// Note: this temporarily leaves the stack in an illegal state (placeholders on
+// top); the caller is expected to push an insert or delete afterwards.
+static void 
+ule_add_placeholders(ULE ule, XIDS xids) {
+    //Placeholders can be placed on top of the committed uxr.
+    assert(ule->num_uxrs > 0);
+    TXNID ica_xid  = ule_get_innermost_xid(ule);     // xid of ica
+    TXNID this_xid = xids_get_innermost_xid(xids);   // xid of this transaction
+    if (ica_xid == this_xid)
+        return;                                      // this transaction is the ICA
+    // Start at the transaction just inside the ICA and stop at this transaction.
+    u_int8_t depth = xids_find_index_of_xid(xids, ica_xid) + 1;
+    TXNID    pending_xid;
+    for (pending_xid = xids_get_xid(xids, depth);
+         pending_xid != this_xid;
+         pending_xid = xids_get_xid(xids, depth)) {
+        ule_push_placeholder_uxr(ule, pending_xid);
+        depth++;
+    }
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//  This layer of abstraction (uxr_xxx) understands uxr and nothing else.
+//
+
+// True when `type` is the insert record type.
+static inline BOOL
+uxr_type_is_insert(u_int8_t type) {
+    return (BOOL)(type == XR_INSERT);
+}
+
+// True when the transaction record is an insert.
+static inline BOOL
+uxr_is_insert(UXR uxr) {
+    u_int8_t record_type = uxr->type;
+    return uxr_type_is_insert(record_type);
+}
+
+// True when `type` is the delete record type.
+static inline BOOL
+uxr_type_is_delete(u_int8_t type) {
+    return (BOOL)(type == XR_DELETE);
+}
+
+// True when the transaction record is a delete.
+static inline BOOL
+uxr_is_delete(UXR uxr) {
+    u_int8_t record_type = uxr->type;
+    return uxr_type_is_delete(record_type);
+}
+
+// True when `type` is the placeholder record type.
+static inline BOOL
+uxr_type_is_placeholder(u_int8_t type) {
+    return (BOOL)(type == XR_PLACEHOLDER);
+}
+
+// True when the transaction record is a placeholder.
+static inline BOOL
+uxr_is_placeholder(UXR uxr) {
+    u_int8_t record_type = uxr->type;
+    return uxr_type_is_placeholder(record_type);
+}
+
+
+
+
+
+#ifdef IMPLICIT_PROMOTION_ON_QUERY
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Functions here are responsible for implicit promotions on queries.
+// 
+// Purpose is to promote any transactions in this leafentry by detecting if 
+// transactions that have modified it have been committed.
+// During a query, the read lock for the leaf entry is not necessarily taken.
+// (We use a locking regime that tests the lock after the read.)
+// If a transaction unrelated to the transaction issuing the query is writing 
+// to this leafentry (possible because we didn't take the read lock), then that 
+// unrelated transaction is alive and there should be no implicit promotion.
+// So any implicit promotions done during the query must be based solely on 
+// whether the transactions whose xids are recorded in the leafentry are still
+// open.  (An open transaction is one that has not committed or aborted.)
+// Our logic is:
+// If the innermost transaction in the leafentry is definitely open, then no 
+// implicit promotions are necessary (or possible).  This is a fast test.
+// Otherwise, scan from inner to outer to find the innermost uncommitted
+// transaction.  Then promote the innermost transaction to the transaction
+// record of the innermost open (uncommitted) transaction.
+// Transaction id of zero is always considered open for this purpose.
+leafentry do_implicit_promotions_on_query(le) { // NOTE(review): pseudo-code sketch (untyped parameter, undeclared locals); only reaches the compiler if IMPLICIT_PROMOTION_ON_QUERY is defined
+    innermost_xid = le_get_innermost_xid(le);
+    // if innermost transaction still open, nothing to promote
+    if (!transaction_open(innermost_xid)) {
+        ule = unpack(le);
+        // scan outward starting with next outer transaction
+        for (index = ule->num_uxrs - 2; index > 0; index--) {
+            xid = ule_get_xid(ule, index);
+            if (transaction_open(xid)) break;
+        }
+        promote_innermost_to_index(ule, index); // the loop leaves index == 0 when none of the scanned transactions was open
+        le = le_pack(ule);
+    }
+    return le;
+}
+
+
+// Examine list of open transactions, return true if transaction is still open.
+// Transaction zero is always open.
+//
+// NOTE: Old code already does implicit promotion of provdel on query,
+//       and that code uses some equivalent of transaction_open().
+//
+
+bool transaction_open(TXNID xid) { // NOTE(review): stub inside the IMPLICIT_PROMOTION_ON_QUERY sketch; rval is not declared
+    rval = TRUE; // default answer: the transaction is open
+    if (xid != 0) {
+        //TODO: Logic (look the xid up among the open transactions)
+    }
+    return rval; // always TRUE until the lookup above is written
+}
+
+#endif
+
+
+
--- a/newbrt/ule.h
+++ b/newbrt/ule.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+/* Purpose of this file is to provide the world with everything necessary
+ * to use the nested transaction logic and nothing else.  No internal
+ * requirements of the nested transaction logic belongs here.
+ */
+
+#ifndef ULE_H
+#define ULE_H
+
+//Set to 1 to make the le_xxx accessors cross-check their results against a full le_unpack (much slower); 0 disables the checks.
+#define ULE_DEBUG 0
+
+/////////////////////////////////////////////////////////////////////////////////
+// Following data structures are the unpacked format of a leafentry. 
+//   * ule is the unpacked leaf entry, that contains an array of unpacked
+//     transaction records
+//   * uxr is the unpacked transaction record
+//
+
+
+//Types of transaction records (the values stored in the type field of a UXR_S).
+enum {XR_INSERT      = 1,    // the transaction inserted a value (vallen/valp are set)
+      XR_DELETE      = 2,    // the transaction deleted the key
+      XR_PLACEHOLDER = 3};   // stands in for an outer transaction of a nested transaction that modified the entry
+
+typedef struct {     // unpacked transaction record
+    u_int8_t   type;     // delete/insert/placeholder (one of the XR_* constants)
+    u_int32_t  vallen;   // number of bytes in value (set when an insert record is pushed)
+    void *     valp;     // pointer to value; the ule code stores only this pointer and does not copy the bytes
+    TXNID      xid;      // transaction id
+    // Note: when packing ule into a new leafentry, will need
+    //       to copy actual data from valp to new leafentry
+} UXR_S, *UXR;
+
+// Unpacked Leaf Entry is of fixed size because it's just on the 
+// stack and we care about ease of access more than the memory footprint.
+typedef struct {     // unpacked leaf entry
+    u_int8_t   num_uxrs;   // how many of uxrs[] are valid
+    u_int32_t  keylen;     // number of bytes in key
+    void *     keyp;       // pointer to key
+    UXR_S      uxrs[MAX_TRANSACTION_RECORDS];    // uxrs[0] is outermost, uxrs[num_uxrs-1] is innermost
+} ULE_S, *ULE;
+
+int apply_msg_to_leafentry(BRT_MSG   msg,
+			   LEAFENTRY old_leafentry, // NULL if there was no stored data.
+			   size_t *new_leafentry_memorysize, 
+			   size_t *new_leafentry_disksize, 
+			   LEAFENTRY *new_leafentry_p,
+			   OMT omt, 
+			   struct mempool *mp, 
+			   void **maybe_free);
+
+//////////////////////////////////////////////////////////////////////////////////////
+//Functions exported for test purposes only (used internally for non-test purposes).
+void le_unpack(ULE ule,  LEAFENTRY le);
+int le_pack(ULE ule,                            // data to be packed into new leafentry
+	size_t *new_leafentry_memorysize, 
+	size_t *new_leafentry_disksize, 
+	LEAFENTRY * const new_leafentry_p,   // this is what this function creates
+	OMT omt, 
+	struct mempool *mp, 
+	void **maybe_free);
+
+
+size_t le_memsize_from_ule (ULE ule);
+
+#endif  // ULE_H
+
--- a/newbrt/xids-internal.h
+++ b/newbrt/xids-internal.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+
+#ifndef XIDS_INTERNAL_H
+#define XIDS_INTERNAL_H
+
+// Variable size list of transaction ids (known in design doc as xids<>).
+// ids[0] is the outermost stored transaction (the root, xid 0, is implicit and not stored).
+// ids[num_stored_xids - 1] is the innermost transaction.
+// Should only be accessed by accessor functions xids_xxx, not directly.
+typedef struct xids_t {
+    u_int8_t  num_stored_xids;    // maximum value of MAX_TRANSACTION_RECORDS - 1 ...
+				    // ... because transaction 0 is implicit
+    TXNID     ids[];              // flexible array of num_stored_xids entries, kept in disk byte order
+} XIDS_S;
+
+#endif
--- a/newbrt/xids.c
+++ b/newbrt/xids.c
+/* -*- mode: C; c-basic-offset: 4 -*- */
+
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+/* Purpose of this file is to implement xids list of nested transactions
+ * ids.
+ *
+ * See design documentation for nested transactions at
+ * TokuWiki/Imp/TransactionsOverview.
+ *
+ * NOTE: xids are always stored in disk byte order.  
+ *       Accessors are responsible for transposing bytes to 
+ *       host order.
+ */
+
+
+#include <errno.h>
+#include <string.h>
+
+#include <toku_portability.h>
+#include "brttypes.h"
+#include "xids.h"
+#include "xids-internal.h"
+#include "toku_assert.h"
+#include "memory.h"
+#include <toku_htod.h>
+
+
+/////////////////////////////////////////////////////////////////////////////////
+//  This layer of abstraction (xids_xxx) understands xids<> and nothing else.
+//  It contains all the functions that understand xids<>
+//
+//  xids<> do not store the implicit transaction id of 0 at index 0.
+//  The accessor functions make the id of 0 explicit at index 0.
+//  The number of xids physically stored in the xids array is in
+//  the variable num_stored_xids.
+//
+// The xids struct is immutable.  The caller gets an initial version of XIDS
+// by calling xids_get_root_xids(), which returns the constant struct
+// representing the root transaction (id 0).  When a transaction begins, 
+// a new XIDS is created with the id of the current transaction appended to
+// the list.
+// 
+//
+
+
+// This is the xids list for a transactionless environment.
+// It is also the initial state of any xids list created for
+// nested transactions.
+
+
+// Return the xids list of the root transaction: a single shared, statically
+// allocated list with no stored xids.  The const qualifier is cast away to fit
+// the XIDS type; xids_destroy recognizes this object and does not free it.
+XIDS
+xids_get_root_xids(void) {
+    static const struct xids_t root_xids = {
+        .num_stored_xids = 0
+    };
+
+    return (XIDS)&root_xids;
+} 
+
+
+// xids is immutable.  Build the list for a new child transaction as a copy of
+// the parent's list with this_xid appended as the new innermost xid.
+// this_xid must be greater than the parent's innermost xid (asserted).
+// Returns 0 and stores the new list in *xids_p on success, EINVAL when the
+// child would reach MAX_TRANSACTION_RECORDS xids (root included), and ENOMEM
+// when allocation fails; *xids_p is untouched on failure.
+int
+xids_create_child(XIDS   parent_xids,		// xids list for parent transaction
+		  XIDS * xids_p,		// xids list created
+		  TXNID  this_xid) {		// xid of this transaction (new innermost)
+    assert(parent_xids);
+    assert(this_xid > xids_get_innermost_xid(parent_xids));
+    u_int8_t parent_stored = parent_xids->num_stored_xids;
+    u_int8_t child_stored  = parent_stored + 1;
+    u_int8_t child_total   = child_stored + 1;   // +1 for the implicit root xid
+    assert(child_total > 0);
+    assert(child_total <= MAX_TRANSACTION_RECORDS);
+    if (child_total == MAX_TRANSACTION_RECORDS)
+        return EINVAL;
+
+    XIDS child = toku_malloc(sizeof(*child) + child_stored*sizeof(child->ids[0]));
+    if (!child)
+        return ENOMEM;
+
+    child->num_stored_xids = child_stored;
+    // The parent's ids are already in disk byte order, so copy them verbatim.
+    memcpy(child->ids,
+           parent_xids->ids,
+           parent_stored*sizeof(parent_xids->ids[0]));
+    child->ids[child_stored-1] = toku_htod64(this_xid);
+    *xids_p = child;
+    return 0;
+}
+
+// Build an xids list from its serialized form: one byte holding the number of
+// stored xids, followed by that many TXNIDs (the layout wbuf_xids writes).
+// The list is allocated with toku_xmalloc and returned through *xids_p.
+// The xids must be strictly increasing from outermost to innermost (asserted).
+void
+xids_create_from_buffer(struct rbuf *rb,		// buffer to read the serialized xids list from
+		        XIDS * xids_p) {		// xids list created
+    u_int8_t num_stored_xids = rbuf_char(rb);
+    u_int8_t num_xids        = num_stored_xids + 1;
+    assert(num_xids > 0);
+    assert(num_xids < MAX_TRANSACTION_RECORDS);
+    XIDS xids = toku_xmalloc(sizeof(*xids) + num_stored_xids*sizeof(xids->ids[0]));
+    xids->num_stored_xids = num_stored_xids;
+    u_int8_t index;
+    for (index = 0; index < xids->num_stored_xids; index++) {
+        rbuf_TXNID(rb, &xids->ids[index]);
+        // ids[] is kept in disk byte order throughout this file, so convert
+        // to host order before comparing magnitudes.
+        if (index > 0)
+            assert(toku_dtoh64(xids->ids[index]) > toku_dtoh64(xids->ids[index-1]));
+    }
+    *xids_p = xids;
+}
+
+
+// Release an xids list and set the caller's pointer to NULL.  The shared root
+// list returned by xids_get_root_xids() is never freed.
+void
+xids_destroy(XIDS *xids_p) {
+    XIDS doomed = *xids_p;
+    if (doomed != xids_get_root_xids())
+        toku_free(doomed);
+    *xids_p = NULL;
+}
+
+
+
+
+// Return the xid at the requested position, in host byte order.
+// Index 0 is the implicit root transaction: it is not stored and its xid is
+// always 0 (so this also works when the xids array is empty).
+// Any other index must be less than xids_get_num_xids(xids) (asserted).
+TXNID 
+xids_get_xid(XIDS xids, u_int8_t index) {
+    if (index == 0)
+        return 0;
+    assert(index < xids_get_num_xids(xids));
+    // Stored entries are shifted by one because the root is not stored.
+    TXNID disk_xid = xids->ids[index-1];
+    return toku_dtoh64(disk_xid);
+}
+
+// Return the index of target_xid within xids (index 0 is the root, xid 0).
+// The caller must guarantee that target_xid IS in the list; the search walks
+// from outer to inner and asserts that it has not yet passed the target.
+u_int8_t 
+xids_find_index_of_xid(XIDS xids, TXNID target_xid) {
+    u_int8_t position = 0;
+    TXNID    candidate;
+    for (candidate = xids_get_xid(xids, position);
+         candidate != target_xid;
+         candidate = xids_get_xid(xids, position)) {   // Next inner txnid in xids.
+        assert(candidate < target_xid);
+        position++;
+    }
+    return position;
+}
+
+// Return the number of xids in the list as seen through the accessors:
+// the stored xids plus one for the root xid 0, which the accessors make explicit.
+u_int8_t 
+xids_get_num_xids(XIDS xids) {
+    return xids->num_stored_xids+1;
+}
+
+// Return the innermost xid: the last entry of the list, or 0 (the root) when
+// no xids are stored.
+TXNID 
+xids_get_innermost_xid(XIDS xids) {
+    u_int8_t innermost_index = xids_get_num_xids(xids)-1;
+    return xids_get_xid(xids, innermost_index);
+}
+
+// Copy the whole source list (header plus stored xids) into target.
+// target must have room for xids_get_size(source) bytes.
+void
+xids_cpy(XIDS target, XIDS source) {
+    memcpy(target, source, (size_t)xids_get_size(source));
+}
+
+// Return the in-memory size of the list in bytes: the struct header plus one
+// TXNID per stored xid.
+u_int32_t 
+xids_get_size(XIDS xids){
+    u_int8_t stored = xids->num_stored_xids;
+    return sizeof(*xids) + stored * sizeof(xids->ids[0]);
+}
+
+// Return the serialized size of the list in bytes: one byte for the count of
+// stored xids plus eight bytes per stored xid.
+u_int32_t 
+xids_get_serialize_size(XIDS xids){
+    u_int8_t stored = xids->num_stored_xids;
+    return 1 + 8 * stored;
+}
+
+// Feed the xids list into the running x1764 checksum: first the one-byte count
+// of stored xids, then every xid obtained through xids_get_xid (host byte
+// order), starting with the implicit root xid at index 0.
+void
+toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids) {
+    x1764_add(mm, &xids->num_stored_xids, 1);
+    u_int8_t total = xids_get_num_xids(xids);
+    u_int8_t position = 0;
+    while (position < total) {
+        TXNID xid = xids_get_xid(xids, position);
+        x1764_add(mm, &xid, 8);
+        position++;
+    }
+}
+
+// Return the address one past the last stored xid, as a byte pointer.
+unsigned char *
+xids_get_end_of_array(XIDS xids) {
+    return (unsigned char*)&xids->ids[xids->num_stored_xids];
+}
+
+// Serialize the list into wb: one byte holding the number of stored xids,
+// followed by each stored xid from outermost to innermost.  The implicit root
+// xid is not written.
+void wbuf_xids(struct wbuf *wb, XIDS xids) {
+    wbuf_char(wb, (unsigned char)xids->num_stored_xids);
+    u_int8_t position = 0;
+    while (position < xids->num_stored_xids) {
+        wbuf_TXNID(wb, xids->ids[position]);
+        position++;
+    }
+}
+
--- a/newbrt/xids.h
+++ b/newbrt/xids.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+/* Purpose of this file is to provide the world with everything necessary
+ * to use the xids and nothing else.  
+ * Internal requirements of the xids logic do not belong here.
+ *
+ * xids is (abstractly) an immutable list of nested transaction ids, accessed only
+ * via the functions in this file.  
+ *
+ * See design documentation for nested transactions at
+ * TokuWiki/Imp/TransactionsOverview.
+ */
+
+#ifndef XIDS_H
+#define XIDS_H
+
+#include "x1764.h"
+
+#include "rbuf.h"
+#include "wbuf.h"
+/* The number of transaction ids stored in the xids structure is 
+ * represented by an 8-bit value.  The value 255 is reserved. 
+ * The constant MAX_NESTED_TRANSACTIONS is one less because
+ * one slot in the packed leaf entry is used for the implicit
+ * root transaction (id 0).
+ * MAX_TRANSACTION_RECORDS counts that root slot as well, so it is
+ * MAX_NESTED_TRANSACTIONS + 1.
+ */
+enum {MAX_NESTED_TRANSACTIONS = 253};
+enum {MAX_TRANSACTION_RECORDS = MAX_NESTED_TRANSACTIONS + 1};	
+
+
+//Retrieve an XIDS representing the root transaction.
+XIDS xids_get_root_xids(void);
+
+void xids_cpy(XIDS target, XIDS source);
+
+//Creates an XIDS representing this transaction.
+//You must pass in an XIDS representing the parent of this transaction.
+int  xids_create_child(XIDS parent_xids, XIDS *xids_p, TXNID this_xid);
+void xids_create_from_buffer(struct rbuf *rb, XIDS * xids_p);
+
+void xids_destroy(XIDS *xids_p);
+
+TXNID xids_get_xid(XIDS xids, u_int8_t index);
+
+u_int8_t xids_find_index_of_xid(XIDS xids, TXNID target_xid);
+
+u_int8_t xids_get_num_xids(XIDS xids);
+
+TXNID xids_get_innermost_xid(XIDS xids);
+
+// return size in bytes
+u_int32_t xids_get_size(XIDS xids);
+
+u_int32_t xids_get_serialize_size(XIDS xids);
+
+void toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids);
+
+unsigned char *xids_get_end_of_array(XIDS xids);
+
+void wbuf_xids(struct wbuf *wb, XIDS xids);
+
+#endif 
--- a/src/tests/Makefile
+++ b/src/tests/Makefile
@@ -137,8 +137,6 @@ TDB_TESTS_THAT_SHOULD_FAIL= \
 	test_groupcommit_count \
 	test944 \
 	test_truncate_txn_abort \
-	test_txn_nested_abort3 \
-	test_txn_nested_abort4 \
 #\ ends prev line
 ifneq ($(OS_CHOICE),windows)
    TDB_TESTS_THAT_SHOULD_FAIL+= \

--- a/src/tests/test_thread_stack.c
+++ b/src/tests/test_thread_stack.c
@@ -111,7 +111,12 @@ test_main(int argc, char *argv[]) {
    DB_ENV *env;

    r = db_env_create(&env, 0); assert(r == 0);
-    r = env->set_cachesize(env, 0, 8000000, 1); assert(r == 0);
+    //r = env->set_cachesize(env, 0, 8000000, 1); assert(r == 0); //Prior to nested transactions
+    //This ran incredibly slowly with nested transactions.  I believe it makes sense to do the following:
+    //A node is 4MiB.  Nodes can become overfull.  If you can't have two nodes in memory, you thrash,
+    //so support 2 nodes plus a bit of wiggle room.
+    //r = env->set_cachesize(env, 0, (8<<20) + (1<<8), 1); assert(r == 0); //As of [13075] this is enough to hold the 2 nodes/run fast
+    r = env->set_cachesize(env, 0, (9<<20), 1); assert(r == 0);
    r = env->open(env, ENVDIR, DB_CREATE + DB_THREAD + DB_PRIVATE + DB_INIT_MPOOL + DB_INIT_LOCK, S_IRWXU+S_IRWXG+S_IRWXO); assert(r == 0);

    DB *db;

--- a/src/ydb-internal.h
+++ b/src/ydb-internal.h
@@ -92,6 +92,21 @@ void toku_ydb_unlock(void);
 /** Handle a panicked database: return EINVAL if the database env is panicked */
 #define HANDLE_PANICKED_DB(db) HANDLE_PANICKED_ENV(db->dbenv)

+/** Handle a transaction that has a child: return EINVAL if the transaction tries to do any work.
+    Only commit/abort/prelock (which are used by handlerton) are allowed when a child exists.
+    A NULL txn passes the check: the condition tests (txn) before reading its child pointer.  */
+#define HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, txn) \
+        RAISE_COND_EXCEPTION(((txn) && db_txn_struct_i(txn)->child), \
+                             toku_ydb_do_error((env),                \
+                                               EINVAL,               \
+                                               "%s: Transaction cannot do work when child exists", __FUNCTION__))
+/** Same check for a DB handle: the environment is taken from (db)->dbenv. */
+#define HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn) \
+        HANDLE_ILLEGAL_WORKING_PARENT_TXN((db)->dbenv, txn)
+/** Same check for a cursor: uses the cursor's DB, (c)->dbp, and the txn stored in dbc_struct_i(c)->txn. */
+#define HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c)   \
+        HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN((c)->dbp, dbc_struct_i(c)->txn)
+
 /* */
 void toku_ydb_error_all_cases(const DB_ENV * env, 
                              int error, 

--- a/src/ydb.c
+++ b/src/ydb.c
@@ -1035,23 +1035,19 @@ static int toku_txn_commit(DB_TXN * txn, u_int32_t flags) {
    HANDLE_PANICKED_ENV(txn->mgrp);
    //Recursively kill off children
    int r_child_first = 0;
-    while (db_txn_struct_i(txn)->child) {
+    if (db_txn_struct_i(txn)->child) {
+        //commit of child sets the child pointer to NULL
        int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, flags);
        if (!r_child_first) r_child_first = r_child;
        //In a panicked env, the child may not be removed from the list.
        HANDLE_PANICKED_ENV(txn->mgrp);
    }
+    assert(!db_txn_struct_i(txn)->child);
    //Remove from parent
    if (txn->parent) {
-        if (db_txn_struct_i(txn->parent)->child==txn) db_txn_struct_i(txn->parent)->child=db_txn_struct_i(txn)->next;
-        if (db_txn_struct_i(txn->parent)->child==txn) {
+        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child=NULL;
    }
-        else {
-	    db_txn_struct_i(db_txn_struct_i(txn)->next)->prev = db_txn_struct_i(txn)->prev;
-            db_txn_struct_i(db_txn_struct_i(txn)->prev)->next = db_txn_struct_i(txn)->next;
-        }
-    }
    //toku_ydb_notef("flags=%d\n", flags);
    int nosync = (flags & DB_TXN_NOSYNC)!=0 || (db_txn_struct_i(txn)->flags&DB_TXN_NOSYNC);
    flags &= ~DB_TXN_NOSYNC;
@@ -1092,25 +1088,21 @@ static u_int32_t toku_txn_id(DB_TXN * txn) {

 static int toku_txn_abort(DB_TXN * txn) {
    HANDLE_PANICKED_ENV(txn->mgrp);
-    //Recursively kill off children
+    //Recursively kill off children (abort or commit are both correct)
    int r_child_first = 0;
-    while (db_txn_struct_i(txn)->child) {
-        int r_child = toku_txn_abort(db_txn_struct_i(txn)->child);
+    if (db_txn_struct_i(txn)->child) {
+        //commit of child sets the child pointer to NULL
+        int r_child = toku_txn_commit(db_txn_struct_i(txn)->child, DB_TXN_NOSYNC);
        if (!r_child_first) r_child_first = r_child;
        //In a panicked env, the child may not be removed from the list.
        HANDLE_PANICKED_ENV(txn->mgrp);
    }
+    assert(!db_txn_struct_i(txn)->child);
    //Remove from parent
    if (txn->parent) {
-        if (db_txn_struct_i(txn->parent)->child==txn) db_txn_struct_i(txn->parent)->child=db_txn_struct_i(txn)->next;
-        if (db_txn_struct_i(txn->parent)->child==txn) {
+        assert(db_txn_struct_i(txn->parent)->child == txn);
        db_txn_struct_i(txn->parent)->child=NULL;
    }
-        else {
-            db_txn_struct_i(db_txn_struct_i(txn)->next)->prev = db_txn_struct_i(txn)->prev;
-            db_txn_struct_i(db_txn_struct_i(txn)->prev)->next = db_txn_struct_i(txn)->next;
-        }
-    }
    //int r = toku_logger_abort(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL);
    int r = toku_txn_abort_txn(db_txn_struct_i(txn)->tokutxn, ydb_yield, NULL);
    int r2 = toku_txn_release_locks(txn);
@@ -1159,6 +1151,7 @@ static int locked_txn_abort(DB_TXN *txn) {

 static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t flags) {
    HANDLE_PANICKED_ENV(env);
+    HANDLE_ILLEGAL_WORKING_PARENT_TXN(env, stxn); //Cannot create child while child already exists.
    if (!toku_logger_is_open(env->i->logger)) return toku_ydb_do_error(env, EINVAL, "Environment does not have logging enabled\n");
    if (!(env->i->open_flags & DB_INIT_TXN))  return toku_ydb_do_error(env, EINVAL, "Environment does not have transactions enabled\n");
    u_int32_t txn_flags = 0;
@@ -1217,17 +1210,8 @@ static int toku_txn_begin(DB_ENV *env, DB_TXN * stxn, DB_TXN ** txn, u_int32_t f
        return r;
    //Add to the list of children for the parent.
    if (result->parent) {
-        if (!db_txn_struct_i(result->parent)->child) {
+        assert(!db_txn_struct_i(result->parent)->child);
        db_txn_struct_i(result->parent)->child = result;
-            db_txn_struct_i(result)->next = result;
-            db_txn_struct_i(result)->prev = result;
-        }
-        else {
-            db_txn_struct_i(result)->prev = db_txn_struct_i(db_txn_struct_i(result->parent)->child)->prev;
-            db_txn_struct_i(result)->next = db_txn_struct_i(result->parent)->child;
-            db_txn_struct_i(db_txn_struct_i(db_txn_struct_i(result->parent)->child)->prev)->next = result;
-            db_txn_struct_i(db_txn_struct_i(result->parent)->child)->prev = result;
-        }
    }
    *txn = result;
    return 0;
@@ -1418,7 +1402,9 @@ c_db_is_nodup(DBC *c) {

 static int
 toku_c_get(DBC* c, DBT* key, DBT* val, u_int32_t flag) {
-    //OW!! SCALDING!
+    //This function exists for legacy (test compatibility) purposes/parity with bdb.
+    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    u_int32_t main_flag       = get_main_cursor_flag(flag);
    u_int32_t remaining_flags = get_nonmain_cursor_flags(flag);
@@ -1693,6 +1679,7 @@ static int c_del_callback(DBT const *key, DBT const *val, void *extra);
 static int
 toku_c_del(DBC * c, u_int32_t flags) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    u_int32_t unchecked_flags = flags;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
@@ -1748,6 +1735,7 @@ static int c_getf_first_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, by
 static int
 toku_c_getf_first(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S context; //Describes the context of this query.
    query_context_init(&context, c, flag, f, extra); 
@@ -1802,6 +1790,7 @@ static int c_getf_last_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, byt
 static int
 toku_c_getf_last(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S context; //Describes the context of this query.
    query_context_init(&context, c, flag, f, extra); 
@@ -1857,6 +1846,7 @@ static int
 toku_c_getf_next(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (c_db_is_nodup(c))             r = toku_c_getf_next_nodup(c, flag, f, extra);
    else if (toku_c_uninitialized(c)) r = toku_c_getf_first(c, flag, f, extra);
    else {
@@ -1912,6 +1902,7 @@ static int
 toku_c_getf_next_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) r = toku_c_getf_first(c, flag, f, extra);
    else {
        QUERY_CONTEXT_S context; //Describes the context of this query.
@@ -1929,6 +1920,7 @@ static int c_getf_next_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen,
 static int
 toku_c_getf_next_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) return EINVAL;

    QUERY_CONTEXT_S context; //Describes the context of this query.
@@ -1983,6 +1975,7 @@ static int
 toku_c_getf_prev(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (c_db_is_nodup(c))             r = toku_c_getf_prev_nodup(c, flag, f, extra);
    else if (toku_c_uninitialized(c)) r = toku_c_getf_last(c, flag, f, extra);
    else {
@@ -2038,6 +2031,7 @@ static int
 toku_c_getf_prev_nodup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    int r;
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) r = toku_c_getf_last(c, flag, f, extra);
    else {
        QUERY_CONTEXT_S context; //Describes the context of this query.
@@ -2055,6 +2049,7 @@ static int c_getf_prev_dup_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen,
 static int
 toku_c_getf_prev_dup(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    if (toku_c_uninitialized(c)) return EINVAL;

    QUERY_CONTEXT_S context; //Describes the context of this query.
@@ -2108,6 +2103,7 @@ static int c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen,
 static int
 toku_c_getf_current(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S context; //Describes the context of this query.
    query_context_init(&context, c, flag, f, extra); 
@@ -2143,6 +2139,7 @@ c_getf_current_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, bytevec val
 static int
 toku_c_getf_current_binding(DBC *c, u_int32_t flag, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_S context; //Describes the context of this query.
    query_context_init(&context, c, flag, f, extra); 
@@ -2158,6 +2155,7 @@ static int c_getf_set_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen, byte
 static int
 toku_c_getf_set(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query.
    query_context_with_input_init(&context, c, flag, key, NULL, f, extra); 
@@ -2215,6 +2213,7 @@ static int c_getf_set_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen
 static int
 toku_c_getf_set_range(DBC *c, u_int32_t flag, DBT *key, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query.
    query_context_with_input_init(&context, c, flag, key, NULL, f, extra); 
@@ -2273,6 +2272,7 @@ static int c_getf_get_both_callback(ITEMLEN keylen, bytevec key, ITEMLEN vallen,
 static int
 toku_c_getf_get_both(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);

    QUERY_CONTEXT_WITH_INPUT_S context; //Describes the context of this query.
    query_context_with_input_init(&context, c, flag, key, val, f, extra); 
@@ -2323,6 +2323,7 @@ static int c_getf_get_both_range_callback(ITEMLEN keylen, bytevec key, ITEMLEN v
 static int
 toku_c_getf_get_both_range(DBC *c, u_int32_t flag, DBT *key, DBT *val, YDB_CALLBACK_FUNCTION f, void *extra) {
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    int r;
    if (c_db_is_nodup(c)) r = toku_c_getf_get_both(c, flag, key, val, f, extra);
    else {
@@ -2417,6 +2418,7 @@ toku_c_getf_heaviside(DBC *c, u_int32_t flag,
                      int direction) {
    int r;
    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    HEAVI_WRAPPER_S wrapper;
    heavi_wrapper_init(&wrapper, h, extra_h, direction);
    QUERY_CONTEXT_HEAVISIDE_S context; //Describes the context of this query.
@@ -2557,6 +2559,8 @@ tmp_cleanup:
 }

 static int toku_c_close(DBC * c) {
+    HANDLE_PANICKED_DB(c->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(c);
    int r = toku_brt_cursor_close(dbc_struct_i(c)->c);
    toku_sdbt_cleanup(&dbc_struct_i(c)->skey_s);
    toku_sdbt_cleanup(&dbc_struct_i(c)->sval_s);
@@ -2576,6 +2580,8 @@ static inline int keyeq(DBC *c, DBT *a, DBT *b) {
 // pointed to by the brt cursor.  
 static int 
 toku_c_count(DBC *cursor, db_recno_t *count, u_int32_t flags) {
+    HANDLE_PANICKED_DB(cursor->dbp);
+    HANDLE_CURSOR_ILLEGAL_WORKING_PARENT_TXN(cursor);
    int r;
    DBC *count_cursor = 0;
    DBT currentkey;
@@ -2651,6 +2657,7 @@ db_getf_get_both(DB *db, DB_TXN *txn, u_int32_t flags, DBT *key, DBT *val, YDB_C
 static int
 toku_db_del(DB *db, DB_TXN *txn, DBT *key, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    u_int32_t unchecked_flags = flags;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    BOOL error_if_missing = (BOOL)(!(flags&DB_DELETE_ANY));
@@ -2701,6 +2708,7 @@ static int locked_c_del(DBC * c, u_int32_t flags) {

 static int toku_db_cursor(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int is_temporary_cursor) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    if (flags != 0)
        return EINVAL;
    DBC *MALLOC(result);
@@ -2753,6 +2761,7 @@ static int toku_db_cursor(DB * db, DB_TXN * txn, DBC ** c, u_int32_t flags, int
 static int
 toku_db_delboth(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    u_int32_t unchecked_flags = flags;
    //DB_DELETE_ANY means delete regardless of whether it exists in the db.
    BOOL error_if_missing = (BOOL)(!(flags&DB_DELETE_ANY));
@@ -2788,6 +2797,7 @@ static inline int db_thread_need_flags(DBT *dbt) {

 static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r;

    if ((db->i->open_flags & DB_THREAD) && db_thread_need_flags(data))
@@ -2810,6 +2820,7 @@ static int toku_db_get (DB * db, DB_TXN * txn, DBT * key, DBT * data, u_int32_t
 #if 0
 static int toku_db_key_range(DB * db, DB_TXN * txn, DBT * dbt, DB_KEY_RANGE * kr, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    txn=txn; dbt=dbt; kr=kr; flags=flags;
    toku_ydb_barf();
    abort();
@@ -2988,6 +2999,7 @@ cleanup:

 static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, u_int32_t flags, int mode) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    if (dbname!=NULL) 
        return multiple_db_open(db, txn, fname, dbname, dbtype, flags, mode);

@@ -3187,6 +3199,7 @@ db_put_check_overwrite_constraint(DB *db, DB_TXN *txn, DBT *key, DBT *UU(val),
 static int
 toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r;

    u_int32_t lock_flags = get_prelocked_flags(flags, txn);
@@ -3389,6 +3402,7 @@ static int toku_db_set_pagesize(DB *db, u_int32_t pagesize) {

 static int toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    return toku_brt_stat64(db->i->brt, db_txn_struct_i(txn)->tokutxn, &s->bt_nkeys, &s->bt_ndata, &s->bt_dsize, &s->bt_fsize);
 }
 static int locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
@@ -3401,6 +3415,7 @@ static int locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {

 static int toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);

    // note that toku_brt_keyrange does not have a txn param
    // this will be fixed later
@@ -3549,6 +3564,7 @@ static int locked_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
 // effect: remove all of the rows from a database
 static int toku_db_truncate(DB *db, DB_TXN *txn, u_int32_t *row_count, u_int32_t flags) {
    HANDLE_PANICKED_DB(db);
+    HANDLE_DB_ILLEGAL_WORKING_PARENT_TXN(db, txn);
    int r;

    u_int32_t unhandled_flags = flags;

--- a/test.wiki
+++ b/test.wiki
+ * One
+ * two
+ 1. FOO
+  a. sock
+  a. pizza
+ 2. elephant
+
+[[Include(source:toku/tokudb.1125/test2.wiki,wiki)]]
--- a/test2.wiki
+++ b/test2.wiki
+1. these
+1. lines
+1. from
+1. test2.wiki
--- a/toku_include/Makefile.include
+++ b/toku_include/Makefile.include
@@ -184,6 +184,9 @@ VGRIND=valgrind --quiet --error-exitcode=1 --leak-check=full --show-reachable=ye
 ifeq ($(DB_ATTACH),1)
 	VGRIND+=--db-attach=yes
 endif
+ifeq ($(TRACK_ORIGINS),1)
+	VGRIND+=--track-origins=yes
+endif

 HGRIND=valgrind --quiet --tool=helgrind --error-exitcode=1


--- a/toku_include/toku_htod.h
+++ b/toku_include/toku_htod.h
@@ -47,6 +47,16 @@ static const int64_t toku_byte_order_host = 0x0102030405060708LL;
 #endif

 #if DISK_BYTE_ORDER == HOST_BYTE_ORDER
+static inline uint64_t
+toku_dtoh64(uint64_t i) { /* 64-bit variant for the DISK_BYTE_ORDER == HOST_BYTE_ORDER branch */
+    return i; /* byte orders match in this branch, so the value is returned unchanged */
+}
+
+static inline uint64_t
+toku_htod64(uint64_t i) { /* 64-bit variant for the DISK_BYTE_ORDER == HOST_BYTE_ORDER branch */
+    return i; /* byte orders match in this branch, so the value is returned unchanged */
+}
+
 static inline uint32_t
 toku_dtoh32(uint32_t i) {
    return i;