Commit 87da689b authored by Yoni Fogel

Refs Tokutek/ft-index#46 Add dmt (dynamic OMT)

Use dmt to replace omt in bn_data class for storing leafentries.
Optimization for serial inserts and mempool
parent 71eb7b8e
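
The core layout change: the old omt stored pointers to klpairs that packed each key and its leafentry together in one mempool; the new dmt stores each klpair (a 4-byte leafentry offset plus the key bytes) inline as a variable-size element, leaving only the leafentries in the mempool. A rough sketch of the two layouts (struct names here are illustrative; the real fields appear in the bndata.h hunk below):

    struct old_klpair { uint32_t keylen;    uint8_t key_le[0]; };  // key, then leafentry, packed in the mempool
    struct new_klpair { uint32_t le_offset; uint8_t key_le[0]; };  // key inline in the dmt; leafentry at le_offset in the mempool
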
@@ -31,6 +31,7 @@ set(FT_SOURCES
checkpoint
compress
dbufio
dmt-wrapper
fifo
ft
ft-cachetable-wrappers
@@ -90,46 +90,154 @@ PATENT RIGHTS GRANT:
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <bndata.h>
#include <ft-ops.h>
static uint32_t klpair_size(KLPAIR klpair){
return sizeof(*klpair) + klpair->keylen + leafentry_memsize(get_le_from_klpair(klpair));
}
static uint32_t klpair_disksize(KLPAIR klpair){
return sizeof(*klpair) + klpair->keylen + leafentry_disksize(get_le_from_klpair(klpair));
using namespace toku;
uint32_t bn_data::klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const {
return sizeof(*klpair) + keylen_from_klpair_len(klpair_len) + leafentry_disksize(get_le_from_klpair(klpair));
}
void bn_data::init_zero() {
toku_mempool_zero(&m_buffer_mempool);
m_disksize_of_keys = 0;
}
void bn_data::initialize_empty() {
toku_mempool_zero(&m_buffer_mempool);
m_buffer.create_no_array();
init_zero();
m_buffer.create();
}
void bn_data::add_key(uint32_t keylen) {
m_disksize_of_keys += sizeof(keylen) + keylen;
}
void bn_data::add_keys(uint32_t n_keys, uint32_t combined_keylen) {
invariant(n_keys * sizeof(uint32_t) <= combined_keylen);
m_disksize_of_keys += combined_keylen;
}
void bn_data::remove_key(uint32_t keylen) {
m_disksize_of_keys -= sizeof(keylen) + keylen;
}
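// Accounting example (illustrative): three 8-byte keys contribute
// 3 * (sizeof(uint32_t) + 8) = 36 bytes to m_disksize_of_keys, whether recorded
// as three add_key(8) calls or as one add_keys(3, 36) call; combined_keylen
// already includes the 4 bytes of per-key overhead, which is what the
// invariant above checks.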
void bn_data::initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version UU(),
uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
uint32_t fixed_key_length) {
paranoid_invariant(version >= FT_LAYOUT_VERSION_25); // Support was added @25
uint32_t ndone_before = rb->ndone;
init_zero();
invariant(all_keys_same_length); // Until otherwise supported.
bytevec keys_src;
rbuf_literal_bytes(rb, &keys_src, key_data_size);
//Generate dmt
this->m_buffer.create_from_sorted_memory_of_fixed_size_elements(
keys_src, num_entries, key_data_size, fixed_key_length);
toku_mempool_construct(&this->m_buffer_mempool, val_data_size);
bytevec vals_src;
rbuf_literal_bytes(rb, &vals_src, val_data_size);
if (num_entries > 0) {
void *vals_dest = toku_mempool_malloc(&this->m_buffer_mempool, val_data_size, 1);
paranoid_invariant_notnull(vals_dest);
memcpy(vals_dest, vals_src, val_data_size);
}
add_keys(num_entries, num_entries * fixed_key_length);
toku_note_deserialized_basement_node(all_keys_same_length);
invariant(rb->ndone - ndone_before == data_size);
}
// static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned int n_bytes) {
void bn_data::prepare_to_serialize(void) {
if (m_buffer.is_value_length_fixed()) {
m_buffer.prepare_for_serialize();
omt_compress_kvspace(0, nullptr, true); // Gets it ready for easy serialization.
}
}
void bn_data::serialize_header(struct wbuf *wb) const {
bool fixed = m_buffer.is_value_length_fixed();
//key_data_size
wbuf_nocrc_uint(wb, m_disksize_of_keys);
//val_data_size
wbuf_nocrc_uint(wb, toku_mempool_get_used_space(&m_buffer_mempool));
//fixed_key_length
wbuf_nocrc_uint(wb, m_buffer.get_fixed_length());
// all_keys_same_length
wbuf_nocrc_uint8_t(wb, fixed);
// keys_vals_separate
wbuf_nocrc_uint8_t(wb, fixed);
}
void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size) {
void bn_data::serialize_rest(struct wbuf *wb) const {
//Write keys
invariant(m_buffer.is_value_length_fixed()); //Assumes prepare_to_serialize was called
m_buffer.serialize_values(m_disksize_of_keys, wb);
//Write leafentries
paranoid_invariant(toku_mempool_get_frag_size(&m_buffer_mempool) == 0); //Just ran omt_compress_kvspace
uint32_t val_data_size = toku_mempool_get_used_space(&m_buffer_mempool);
wbuf_nocrc_literal_bytes(wb, toku_mempool_get_base(&m_buffer_mempool), val_data_size);
}
bool bn_data::need_to_serialize_each_leafentry_with_key(void) const {
return !m_buffer.is_value_length_fixed();
}
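// For reference, a sketch of the version-25 serialized layout that
// serialize_header/serialize_rest produce and initialize_from_data consumes:
//   uint32_t key_data_size;        // == m_disksize_of_keys
//   uint32_t val_data_size;        // bytes of leafentries in the mempool
//   uint32_t fixed_key_length;     // 0 unless all keys have the same length
//   uint8_t  all_keys_same_length;
//   uint8_t  keys_vals_separate;   // 14 header bytes total == HEADER_LENGTH
// followed by key_data_size bytes of klpair data, then val_data_size bytes
// of leafentries.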
void bn_data::initialize_from_data(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version) {
uint32_t key_data_size = data_size; // overallocate if < version 25
uint32_t val_data_size = data_size; // overallocate if < version 25
bool all_keys_same_length = false;
bool keys_vals_separate = false;
uint32_t fixed_key_length = 0;
if (version >= FT_LAYOUT_VERSION_25) {
uint32_t ndone_before = rb->ndone;
key_data_size = rbuf_int(rb);
val_data_size = rbuf_int(rb);
fixed_key_length = rbuf_int(rb); // 0 if !all_keys_same_length
all_keys_same_length = rbuf_char(rb);
keys_vals_separate = rbuf_char(rb);
invariant(all_keys_same_length == keys_vals_separate); // Until we support this
uint32_t header_size = rb->ndone - ndone_before;
data_size -= header_size;
invariant(header_size == HEADER_LENGTH);
if (keys_vals_separate) {
initialize_from_separate_keys_and_vals(num_entries, rb, data_size, version,
key_data_size, val_data_size, all_keys_same_length,
fixed_key_length);
return;
}
}
bytevec bytes;
rbuf_literal_bytes(rb, &bytes, data_size);
const unsigned char *CAST_FROM_VOIDP(buf, bytes);
if (data_size == 0) {
invariant_zero(num_entries);
}
KLPAIR *XMALLOC_N(num_entries, array); // create array of pointers to leafentries
init_zero();
klpair_dmt_t::builder dmt_builder;
dmt_builder.create(num_entries, key_data_size);
unsigned char *newmem = NULL;
// add the same wiggle room that toku_mempool_construct would: 25% extra
uint32_t allocated_bytes = data_size + data_size/4;
CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes));
unsigned char* curr_src_pos = buf;
uint32_t allocated_bytes_vals = val_data_size + val_data_size/4;
CAST_FROM_VOIDP(newmem, toku_xmalloc(allocated_bytes_vals));
const unsigned char* curr_src_pos = buf;
unsigned char* curr_dest_pos = newmem;
for (uint32_t i = 0; i < num_entries; i++) {
KLPAIR curr_kl = (KLPAIR)curr_dest_pos;
array[i] = curr_kl;
uint8_t curr_type = curr_src_pos[0];
curr_src_pos++;
// first thing we do is lay out the key,
// to do so, we must extract it from the leafentry
// and write it in
uint32_t keylen = 0;
void* keyp = NULL;
const void* keyp = NULL;
keylen = *(uint32_t *)curr_src_pos;
curr_src_pos += sizeof(uint32_t);
uint32_t clean_vallen = 0;
@@ -150,12 +258,10 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
keyp = curr_src_pos;
curr_src_pos += keylen;
}
// now that we have the keylen and the key, we can copy it
// into the destination
*(uint32_t *)curr_dest_pos = keylen;
curr_dest_pos += sizeof(keylen);
memcpy(curr_dest_pos, keyp, keylen);
curr_dest_pos += keylen;
uint32_t le_offset = curr_dest_pos - newmem;
dmt_builder.insert_sorted(toku::dmt_functor<klpair_struct>(keylen, le_offset, keyp));
add_key(keylen);
// now curr_dest_pos is pointing to where the leafentry should be packed
curr_dest_pos[0] = curr_type;
curr_dest_pos++;
@@ -173,31 +279,56 @@ void bn_data::initialize_from_data(uint32_t num_entries, unsigned char *buf, uin
*(uint8_t *)curr_dest_pos = num_pxrs;
curr_dest_pos += sizeof(num_pxrs);
// now we need to pack the rest of the data
uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, curr_src_pos);
uint32_t num_rest_bytes = leafentry_rest_memsize(num_pxrs, num_cxrs, const_cast<uint8_t*>(curr_src_pos));
memcpy(curr_dest_pos, curr_src_pos, num_rest_bytes);
curr_dest_pos += num_rest_bytes;
curr_src_pos += num_rest_bytes;
}
}
uint32_t num_bytes_read UU() = (uint32_t)(curr_src_pos - buf);
dmt_builder.build_and_destroy(&this->m_buffer);
toku_note_deserialized_basement_node(m_buffer.is_value_length_fixed());
#if TOKU_DEBUG_PARANOID
uint32_t num_bytes_read = (uint32_t)(curr_src_pos - buf);
paranoid_invariant( num_bytes_read == data_size);
uint32_t num_bytes_written = curr_dest_pos - newmem;
paranoid_invariant( num_bytes_written == data_size);
toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(num_bytes_written), allocated_bytes);
// destroy old omt that was created by toku_create_empty_bn(), so we can create a new one
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&array, num_entries, num_entries);
uint32_t num_bytes_written = curr_dest_pos - newmem + m_disksize_of_keys;
paranoid_invariant( num_bytes_written == data_size);
#endif
toku_mempool_init(&m_buffer_mempool, newmem, (size_t)(curr_dest_pos - newmem), allocated_bytes_vals);
paranoid_invariant(get_disk_size() == data_size);
if (version < FT_LAYOUT_VERSION_25) {
//Maybe shrink mempool. Unnecessary after version 25
size_t used = toku_mempool_get_used_space(&m_buffer_mempool);
size_t max_allowed = used + used / 4;
size_t allocated = toku_mempool_get_size(&m_buffer_mempool);
size_t footprint = toku_mempool_footprint(&m_buffer_mempool);
if (allocated > max_allowed && footprint > max_allowed) {
// Reallocate smaller mempool to save memory
invariant_zero(toku_mempool_get_frag_size(&m_buffer_mempool));
struct mempool new_mp;
toku_mempool_construct(&new_mp, used);
void * newbase = toku_mempool_malloc(&new_mp, used, 1);
invariant_notnull(newbase);
memcpy(newbase, toku_mempool_get_base(&m_buffer_mempool), used);
toku_mempool_destroy(&m_buffer_mempool);
m_buffer_mempool = new_mp;
}
}
}
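// Worked example of the pre-version-25 shrink above (illustrative numbers):
// with used = 1000000, max_allowed = 1250000; allocated = 2000000 and
// footprint = 1600000 both exceed max_allowed, so the leafentries are copied
// into a fresh mempool constructed for 1000000 bytes (plus the 25% slack
// toku_mempool_construct adds), reclaiming the over-allocation.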
uint64_t bn_data::get_memory_size() {
uint64_t retval = 0;
//TODO: Maybe ask for memory_size instead of mempool_footprint (either this todo or the next)
// include fragmentation overhead but do not include space in the
// mempool that has not yet been allocated for leaf entries
size_t poolsize = toku_mempool_footprint(&m_buffer_mempool);
invariant(poolsize >= get_disk_size());
retval += poolsize;
// This one includes not-yet-allocated space for nodes (just like the old constant-key omt)
//TODO: Maybe ask for mempool_footprint instead of memory_size.
retval += m_buffer.memory_size();
invariant(retval >= get_disk_size());
return retval;
}
@@ -207,42 +338,46 @@ void bn_data::delete_leafentry (
uint32_t old_le_size
)
{
remove_key(keylen);
m_buffer.delete_at(idx);
toku_mempool_mfree(&m_buffer_mempool, 0, old_le_size + keylen + sizeof(keylen)); // Must pass 0, since le is no good any more.
toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size); // Must pass nullptr, since le is no good any more.
}
/* mempool support */
struct omt_compressor_state {
struct mempool *new_kvspace;
KLPAIR *newvals;
class bn_data *bd;
};
static int move_it (const KLPAIR &klpair, const uint32_t idx, struct omt_compressor_state * const oc) {
uint32_t size = klpair_size(klpair);
KLPAIR CAST_FROM_VOIDP(newdata, toku_mempool_malloc(oc->new_kvspace, size, 1));
static int move_it (const uint32_t, klpair_struct *klpair, const uint32_t idx UU(), struct omt_compressor_state * const oc) {
LEAFENTRY old_le = oc->bd->get_le_from_klpair(klpair);
uint32_t size = leafentry_memsize(old_le);
void* newdata = toku_mempool_malloc(oc->new_kvspace, size, 1);
paranoid_invariant_notnull(newdata); // we do this on a fresh mempool, so nothing bad should happen
memcpy(newdata, klpair, size);
oc->newvals[idx] = newdata;
memcpy(newdata, old_le, size);
klpair->le_offset = toku_mempool_get_offset_from_pointer_and_base(oc->new_kvspace, newdata);
return 0;
}
// Compress things, and grow the mempool if needed.
void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free) {
void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress) {
uint32_t total_size_needed = toku_mempool_get_used_space(&m_buffer_mempool) + added_size;
// set the new mempool size to be twice the space we actually need.
// On top of the 25% that is padded within toku_mempool_construct (which we
// should consider getting rid of), that should be good enough.
if (!force_compress && toku_mempool_get_frag_size(&m_buffer_mempool) == 0) {
// Skip iterate, just realloc.
toku_mempool_realloc_larger(&m_buffer_mempool, 2*total_size_needed);
if (maybe_free) {
*maybe_free = nullptr;
}
return;
}
struct mempool new_kvspace;
toku_mempool_construct(&new_kvspace, 2*total_size_needed);
uint32_t numvals = omt_size();
KLPAIR *XMALLOC_N(numvals, newvals);
struct omt_compressor_state oc = { &new_kvspace, newvals };
m_buffer.iterate_on_range< decltype(oc), move_it >(0, omt_size(), &oc);
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&newvals, numvals, numvals);
struct omt_compressor_state oc = { &new_kvspace, this};
m_buffer.iterate_ptr< decltype(oc), move_it >(&oc);
if (maybe_free) {
*maybe_free = m_buffer_mempool.base;
@@ -256,38 +391,47 @@ void bn_data::omt_compress_kvspace(size_t added_size, void **maybe_free) {
// from the OMT (whose items refer to items in the old mempool) into the new mempool.
// If MAYBE_FREE is NULL then free the old mempool's space.
// Otherwise, store the old mempool's space in maybe_free.
KLPAIR bn_data::mempool_malloc_from_omt(size_t size, void **maybe_free) {
LEAFENTRY bn_data::mempool_malloc_and_update_omt(size_t size, void **maybe_free) {
void *v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
if (v == NULL) {
omt_compress_kvspace(size, maybe_free);
omt_compress_kvspace(size, maybe_free, false);
v = toku_mempool_malloc(&m_buffer_mempool, size, 1);
paranoid_invariant_notnull(v);
}
return (KLPAIR)v;
return (LEAFENTRY)v;
}
//TODO: probably not free the "maybe_free" right away?
void bn_data::get_space_for_overwrite(
uint32_t idx,
const void* keyp,
uint32_t keylen,
const void* keyp UU(),
uint32_t keylen UU(),
uint32_t old_le_size,
uint32_t new_size,
LEAFENTRY* new_le_space
)
{
void* maybe_free = nullptr;
uint32_t size_alloc = new_size + keylen + sizeof(keylen);
KLPAIR new_kl = mempool_malloc_from_omt(
size_alloc,
LEAFENTRY new_le = mempool_malloc_and_update_omt(
new_size,
&maybe_free
);
uint32_t size_freed = old_le_size + keylen + sizeof(keylen);
toku_mempool_mfree(&m_buffer_mempool, nullptr, size_freed); // Must pass nullptr, since le is no good any more.
new_kl->keylen = keylen;
memcpy(new_kl->key_le, keyp, keylen);
m_buffer.set_at(new_kl, idx);
*new_le_space = get_le_from_klpair(new_kl);
toku_mempool_mfree(&m_buffer_mempool, nullptr, old_le_size); // Must pass nullptr, since le is no good any more.
KLPAIR klp = nullptr;
uint32_t klpair_len; //TODO: maybe delete klpair_len
int r = m_buffer.fetch(idx, &klpair_len, &klp);
invariant_zero(r);
paranoid_invariant(klp!=nullptr);
// Key never changes.
paranoid_invariant(keylen_from_klpair_len(klpair_len) == keylen);
paranoid_invariant(!memcmp(klp->key_le, keyp, keylen)); // TODO: can keyp be pointing to the old space? If so this could fail
size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);
paranoid_invariant(new_le_offset <= UINT32_MAX - new_size); // Not using > 4GB
klp->le_offset = new_le_offset;
paranoid_invariant(new_le == get_le_from_klpair(klp));
*new_le_space = new_le;
// free at end, so that the keyp and keylen
// passed in are still valid
if (maybe_free) {
@@ -304,16 +448,19 @@ void bn_data::get_space_for_insert(
LEAFENTRY* new_le_space
)
{
add_key(keylen);
void* maybe_free = nullptr;
uint32_t size_alloc = size + keylen + sizeof(keylen);
KLPAIR new_kl = mempool_malloc_from_omt(
size_alloc,
LEAFENTRY new_le = mempool_malloc_and_update_omt(
size,
&maybe_free
);
new_kl->keylen = keylen;
memcpy(new_kl->key_le, keyp, keylen);
m_buffer.insert_at(new_kl, idx);
*new_le_space = get_le_from_klpair(new_kl);
size_t new_le_offset = toku_mempool_get_offset_from_pointer_and_base(&this->m_buffer_mempool, new_le);
toku::dmt_functor<klpair_struct> kl(keylen, new_le_offset, keyp);
m_buffer.insert_at(kl, idx);
*new_le_space = new_le;
// free at end, so that the keyp and keylen
// passed in are still valid (you never know if
// it was part of the old mempool, this is just
@@ -330,41 +477,50 @@ void bn_data::move_leafentries_to(
)
//Effect: move leafentries in the range [lbi, ube) from this bn_data to the newly created dest_bd
{
//TODO: improve speed: maybe use dmt_builder for one or both, or implement some version of optimized split_at?
paranoid_invariant(lbi < ube);
paranoid_invariant(ube <= omt_size());
KLPAIR *XMALLOC_N(ube-lbi, newklpointers); // create new omt
dest_bd->initialize_empty();
size_t mpsize = toku_mempool_get_used_space(&m_buffer_mempool); // overkill, but safe
struct mempool *dest_mp = &dest_bd->m_buffer_mempool;
struct mempool *src_mp = &m_buffer_mempool;
toku_mempool_construct(dest_mp, mpsize);
uint32_t i = 0;
for (i = lbi; i < ube; i++) {
KLPAIR curr_kl;
m_buffer.fetch(i, &curr_kl);
size_t kl_size = klpair_size(curr_kl);
KLPAIR new_kl = NULL;
CAST_FROM_VOIDP(new_kl, toku_mempool_malloc(dest_mp, kl_size, 1));
memcpy(new_kl, curr_kl, kl_size);
newklpointers[i-lbi] = new_kl;
toku_mempool_mfree(src_mp, curr_kl, kl_size);
for (uint32_t i = lbi; i < ube; i++) {
KLPAIR curr_kl = nullptr;
uint32_t curr_kl_len;
int r = m_buffer.fetch(i, &curr_kl_len, &curr_kl);
invariant_zero(r);
LEAFENTRY old_le = get_le_from_klpair(curr_kl);
size_t le_size = leafentry_memsize(old_le);
void* new_le = toku_mempool_malloc(dest_mp, le_size, 1);
memcpy(new_le, old_le, le_size);
size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(dest_mp, new_le);
dest_bd->m_buffer.insert_at(dmt_functor<klpair_struct>(keylen_from_klpair_len(curr_kl_len), le_offset, curr_kl->key_le), i-lbi);
this->remove_key(keylen_from_klpair_len(curr_kl_len));
dest_bd->add_key(keylen_from_klpair_len(curr_kl_len));
toku_mempool_mfree(src_mp, old_le, le_size);
}
dest_bd->m_buffer.create_steal_sorted_array(&newklpointers, ube-lbi, ube-lbi);
// now remove the elements from src_omt
for (i=ube-1; i >= lbi; i--) {
for (uint32_t i=ube-1; i >= lbi; i--) {
m_buffer.delete_at(i);
}
}
uint64_t bn_data::get_disk_size() {
return toku_mempool_get_used_space(&m_buffer_mempool);
return m_disksize_of_keys +
toku_mempool_get_used_space(&m_buffer_mempool);
}
void bn_data::verify_mempool(void) {
// TODO: implement something
// TODO: check 7.0 code and see if there was anything there?
}
uint32_t bn_data::omt_size(void) const {
@@ -375,6 +531,7 @@ void bn_data::destroy(void) {
// The buffer may have been freed already, in some cases.
m_buffer.destroy();
toku_mempool_destroy(&m_buffer_mempool);
m_disksize_of_keys = 0;
}
//TODO: Splitting key/val requires changing this
@@ -384,31 +541,39 @@ void bn_data::replace_contents_with_clone_of_sorted_array(
uint32_t* old_keylens,
LEAFENTRY* old_les,
size_t *le_sizes,
size_t mempool_size
size_t total_key_size,
size_t total_le_size
)
{
toku_mempool_construct(&m_buffer_mempool, mempool_size);
KLPAIR *XMALLOC_N(num_les, le_array);
toku_mempool_construct(&m_buffer_mempool, total_le_size);
m_buffer.destroy();
m_disksize_of_keys = 0;
klpair_dmt_t::builder dmt_builder;
dmt_builder.create(num_les, total_key_size);
//TODO: speed this up with some form of mass create dmt
for (uint32_t idx = 0; idx < num_les; idx++) {
KLPAIR new_kl = (KLPAIR)toku_mempool_malloc(
&m_buffer_mempool,
le_sizes[idx] + old_keylens[idx] + sizeof(uint32_t),
1); // point to new location
new_kl->keylen = old_keylens[idx];
memcpy(new_kl->key_le, old_key_ptrs[idx], new_kl->keylen);
memcpy(get_le_from_klpair(new_kl), old_les[idx], le_sizes[idx]);
CAST_FROM_VOIDP(le_array[idx], new_kl);
void* new_le = toku_mempool_malloc(&m_buffer_mempool, le_sizes[idx], 1);
memcpy(new_le, old_les[idx], le_sizes[idx]);
size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(&m_buffer_mempool, new_le);
dmt_builder.insert_sorted(dmt_functor<klpair_struct>(old_keylens[idx], le_offset, old_key_ptrs[idx]));
add_key(old_keylens[idx]);
}
//TODO: Splitting key/val requires changing this; keys are stored in old omt.. cannot delete it yet?
m_buffer.destroy();
m_buffer.create_steal_sorted_array(&le_array, num_les, num_les);
dmt_builder.build_and_destroy(&this->m_buffer);
}
LEAFENTRY bn_data::get_le_from_klpair(const klpair_struct *klpair) const {
void * ptr = toku_mempool_get_pointer_from_base_and_offset(&this->m_buffer_mempool, klpair->le_offset);
LEAFENTRY CAST_FROM_VOIDP(le, ptr);
return le;
}
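// Sketch of the offset/pointer round trip (mp stands in for the node's
// mempool): the two helpers invert each other, and offsets, unlike raw
// pointers, stay valid when the mempool is cloned or reallocated.
//   size_t off = toku_mempool_get_offset_from_pointer_and_base(&mp, le);
//   LEAFENTRY le2 = (LEAFENTRY) toku_mempool_get_pointer_from_base_and_offset(&mp, off);
//   // le2 == le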
// get info about a single leafentry by index
int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
int r = m_buffer.fetch(idx, nullptr, &klpair);
if (r == 0) {
*le = get_le_from_klpair(klpair);
}
@@ -417,9 +582,10 @@ int bn_data::fetch_le(uint32_t idx, LEAFENTRY *le) {
int bn_data::fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*len = klpair->keylen;
*len = keylen_from_klpair_len(klpair_len);
*key = klpair->key_le;
*le = get_le_from_klpair(klpair);
}
@@ -428,47 +594,28 @@ int bn_data::fetch_klpair(uint32_t idx, LEAFENTRY *le, uint32_t *len, void** key
int bn_data::fetch_klpair_disksize(uint32_t idx, size_t *size) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*size = klpair_disksize(klpair);
*size = klpair_disksize(klpair_len, klpair);
}
return r;
}
int bn_data::fetch_le_key_and_len(uint32_t idx, uint32_t *len, void** key) {
KLPAIR klpair = NULL;
int r = m_buffer.fetch(idx, &klpair);
uint32_t klpair_len;
int r = m_buffer.fetch(idx, &klpair_len, &klpair);
if (r == 0) {
*len = klpair->keylen;
*len = keylen_from_klpair_len(klpair_len);
*key = klpair->key_le;
}
return r;
}
struct mp_pair {
void* orig_base;
void* new_base;
klpair_omt_t* omt;
};
static int fix_mp_offset(const KLPAIR &klpair, const uint32_t idx, struct mp_pair * const p) {
char* old_value = (char *) klpair;
char *new_value = old_value - (char *)p->orig_base + (char *)p->new_base;
p->omt->set_at((KLPAIR)new_value, idx);
return 0;
}
void bn_data::clone(bn_data* orig_bn_data) {
toku_mempool_clone(&orig_bn_data->m_buffer_mempool, &m_buffer_mempool);
m_buffer.clone(orig_bn_data->m_buffer);
struct mp_pair p;
p.orig_base = toku_mempool_get_base(&orig_bn_data->m_buffer_mempool);
p.new_base = toku_mempool_get_base(&m_buffer_mempool);
p.omt = &m_buffer;
int r = m_buffer.iterate_on_range<decltype(p), fix_mp_offset>(0, omt_size(), &p);
invariant_zero(r);
this->m_disksize_of_keys = orig_bn_data->m_disksize_of_keys;
}
@@ -91,9 +91,10 @@ PATENT RIGHTS GRANT:
#pragma once
#include <util/omt.h>
#include "leafentry.h"
#include <util/mempool.h>
#include "wbuf.h"
#include <util/dmt.h>
#include "leafentry.h"
#if 0 //for implementation
static int
@@ -110,50 +111,80 @@ UU() verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
#endif
struct klpair_struct {
uint32_t keylen;
uint32_t le_offset; //Offset of leafentry (in leafentry mempool)
uint8_t key_le[0]; // key, followed by le
};
typedef struct klpair_struct *KLPAIR;
static inline LEAFENTRY get_le_from_klpair(KLPAIR klpair){
uint32_t keylen = klpair->keylen;
LEAFENTRY le = (LEAFENTRY)(klpair->key_le + keylen);
return le;
static constexpr uint32_t keylen_from_klpair_len(const uint32_t klpair_len) {
return klpair_len - __builtin_offsetof(klpair_struct, key_le);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
static int wrappy_fun_find(const KLPAIR &klpair, const omtcmp_t &extra) {
//TODO: kill this function when we split, and/or use toku_fill_dbt
typedef struct klpair_struct KLPAIR_S, *KLPAIR;
static_assert(__builtin_offsetof(klpair_struct, key_le) == 1*sizeof(uint32_t), "klpair alignment issues");
static_assert(__builtin_offsetof(klpair_struct, key_le) == sizeof(klpair_struct), "klpair size issues");
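// Size example: a klpair holding a 10-byte key occupies
// klpair_len = sizeof(klpair_struct) + 10 = 14 bytes in the dmt (the 4-byte
// le_offset followed by the key), so keylen_from_klpair_len(14) == 10; the
// leafentry itself lives in the basement node's mempool at le_offset.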
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
static int wrappy_fun_find(const uint32_t klpair_len, const klpair_struct &klpair, const dmtcmp_t &extra) {
DBT kdbt;
kdbt.data = klpair->key_le;
kdbt.size = klpair->keylen;
kdbt.data = const_cast<void*>(reinterpret_cast<const void*>(klpair.key_le));
kdbt.size = keylen_from_klpair_len(klpair_len);
return h(kdbt, extra);
}
template<typename inner_iterate_extra_t>
struct wrapped_iterate_extra_t {
public:
inner_iterate_extra_t *inner;
const class bn_data * bd;
};
template<typename iterate_extra_t,
int (*h)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
static int wrappy_fun_iterate(const KLPAIR &klpair, const uint32_t idx, iterate_extra_t *const extra) {
uint32_t keylen = klpair->keylen;
void* key = klpair->key_le;
LEAFENTRY le = get_le_from_klpair(klpair);
return h(key, keylen, le, idx, extra);
static int wrappy_fun_iterate(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, wrapped_iterate_extra_t<iterate_extra_t> *const extra) {
const void* key = &klpair.key_le;
LEAFENTRY le = extra->bd->get_le_from_klpair(&klpair);
return h(key, keylen_from_klpair_len(klpair_len), le, idx, extra->inner);
}
namespace toku {
template<>
class dmt_functor<klpair_struct> {
public:
size_t get_dmtdatain_t_size(void) const {
return sizeof(klpair_struct) + this->keylen;
}
void write_dmtdata_t_to(klpair_struct *const dest) const {
dest->le_offset = this->le_offset;
memcpy(dest->key_le, this->keyp, this->keylen);
}
dmt_functor(uint32_t _keylen, uint32_t _le_offset, const void* _keyp)
: keylen(_keylen), le_offset(_le_offset), keyp(_keyp) {}
dmt_functor(const uint32_t klpair_len, klpair_struct *const src)
: keylen(keylen_from_klpair_len(klpair_len)), le_offset(src->le_offset), keyp(src->key_le) {}
private:
const uint32_t keylen;
const uint32_t le_offset;
const void* keyp;
};
}
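// Usage sketch, mirroring bn_data::get_space_for_insert above: the dmt asks
// the functor for the element's size, then has it write the klpair into
// dmt-owned memory, so the key bytes are copied exactly once:
//   toku::dmt_functor<klpair_struct> kl(keylen, new_le_offset, keyp);
//   m_buffer.insert_at(kl, idx); // calls get_dmtdatain_t_size(), then write_dmtdata_t_to()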
typedef toku::omt<KLPAIR> klpair_omt_t;
typedef toku::dmt<KLPAIR_S, KLPAIR> klpair_dmt_t;
// This class stores the data associated with a basement node
class bn_data {
public:
void init_zero(void);
void initialize_empty(void);
void initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size);
void initialize_from_data(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version);
// globals
uint64_t get_memory_size(void);
uint64_t get_disk_size(void);
void verify_mempool(void);
// Interact with "omt"
// Interact with "dmt"
uint32_t omt_size(void) const;
template<typename iterate_extra_t,
@@ -165,14 +196,16 @@ class bn_data {
template<typename iterate_extra_t,
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
int omt_iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
return m_buffer.iterate_on_range< iterate_extra_t, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, iterate_extra);
wrapped_iterate_extra_t<iterate_extra_t> wrapped_extra = { iterate_extra, this };
return m_buffer.iterate_on_range< wrapped_iterate_extra_t<iterate_extra_t>, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, &wrapped_extra);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find_zero(const omtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find_zero(const dmtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find_zero< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, &klpair, idxp);
uint32_t klpair_len;
int r = m_buffer.find_zero< dmtcmp_t, wrappy_fun_find<dmtcmp_t, h> >(extra, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
@@ -180,20 +213,21 @@ class bn_data {
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
paranoid_invariant_null(keylen);
}
}
return r;
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find(const omtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find(const dmtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, direction, &klpair, idxp);
uint32_t klpair_len;
int r = m_buffer.find< dmtcmp_t, wrappy_fun_find<dmtcmp_t, h> >(extra, direction, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
@@ -201,7 +235,7 @@ class bn_data {
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
@@ -218,9 +252,9 @@ class bn_data {
// Interact with another bn_data
void move_leafentries_to(BN_DATA dest_bd,
uint32_t lbi, //lower bound inclusive
uint32_t ube //upper bound exclusive
);
uint32_t lbi, //lower bound inclusive
uint32_t ube //upper bound exclusive
);
void destroy(void);
@@ -232,7 +266,8 @@ class bn_data {
uint32_t* old_keylens,
LEAFENTRY* old_les,
size_t *le_sizes,
size_t mempool_size
size_t total_key_size,
size_t total_le_size
);
void clone(bn_data* orig_bn_data);
@@ -243,14 +278,39 @@
);
void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_size, uint32_t new_size, LEAFENTRY* new_le_space);
void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space);
LEAFENTRY get_le_from_klpair(const klpair_struct *klpair) const;
void prepare_to_serialize(void);
void serialize_header(struct wbuf *wb) const;
void serialize_rest(struct wbuf *wb) const;
bool need_to_serialize_each_leafentry_with_key(void) const;
static const uint32_t HEADER_LENGTH = 0
+ sizeof(uint32_t) // key_data_size
+ sizeof(uint32_t) // val_data_size
+ sizeof(uint32_t) // fixed_key_length
+ sizeof(uint8_t) // all_keys_same_length
+ sizeof(uint8_t) // keys_vals_separate
+ 0;
private:
// Private functions
KLPAIR mempool_malloc_from_omt(size_t size, void **maybe_free);
void omt_compress_kvspace(size_t added_size, void **maybe_free);
LEAFENTRY mempool_malloc_and_update_omt(size_t size, void **maybe_free);
void omt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress);
void add_key(uint32_t keylen);
void add_keys(uint32_t n_keys, uint32_t combined_keylen);
void remove_key(uint32_t keylen);
klpair_omt_t m_buffer; // pointers to individual leaf entries
klpair_dmt_t m_buffer; // keys with leafentry offsets; the leafentries themselves live in the mempool
struct mempool m_buffer_mempool; // storage for all leaf entries
friend class bndata_bugfix_test;
uint32_t klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const;
size_t m_disksize_of_keys;
void initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version,
uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
uint32_t fixed_key_length);
};
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <memory.h>
#include <string.h>
#include <db.h>
#include <util/mempool.h>
#include "dmt-wrapper.h"
namespace toku {
template<>
class dmt_functor<DMTVALUE> {
public:
size_t get_dmtdatain_t_size(void) const {
return sizeof(DMTVALUE);
}
void write_dmtdata_t_to(DMTVALUE *const dest) const {
*dest = value;
}
dmt_functor(DMTVALUE _value)
: value(_value) {}
dmt_functor(const uint32_t size UU(), DMTVALUE *const src)
: value(*src) {
paranoid_invariant(size == sizeof(DMTVALUE));
}
private:
const DMTVALUE value;
};
}
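// Because every DMTVALUE is pointer-sized, get_dmtdatain_t_size() is constant
// here, so this wrapper effectively stores fixed-length elements. A round-trip
// sketch of what the dmt does with the functor (locals are illustrative):
//   DMTVALUE slot;
//   toku::dmt_functor<DMTVALUE> in(v);
//   in.write_dmtdata_t_to(&slot);                             // store
//   toku::dmt_functor<DMTVALUE> out(sizeof(DMTVALUE), &slot); // reload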
int
toku_dmt_create_steal_sorted_array(DMT *dmtp, DMTVALUE **valuesp, uint32_t numvalues, uint32_t capacity) {
//TODO: implement using create_steal_sorted_array when it exists
(void)capacity;
toku_dmt_create_from_sorted_array(dmtp, *valuesp, numvalues);
toku_free(*valuesp);
*valuesp = nullptr;
// DMT XMALLOC(dmt);
//dmt->create_steal_sorted_array(valuesp, numvalues, capacity);
// *dmtp = dmt;
return 0;
}
//TODO: Put all dmt API functions here.
int toku_dmt_create (DMT *dmtp) {
DMT XMALLOC(dmt);
dmt->create();
*dmtp = dmt;
return 0;
}
void toku_dmt_destroy(DMT *dmtp) {
DMT dmt=*dmtp;
dmt->destroy();
toku_free(dmt);
*dmtp=NULL;
}
uint32_t toku_dmt_size(DMT V) {
return V->size();
}
int toku_dmt_create_from_sorted_array(DMT *dmtp, DMTVALUE *values, uint32_t numvalues) {
//TODO: implement using create_from_sorted_array when it exists
DMT XMALLOC(dmt);
dmt->create();
for (uint32_t i = 0; i < numvalues; i++) {
toku_dmt_insert_at(dmt, values[i], i);
}
//dmt->create_from_sorted_array(values, numvalues);
*dmtp=dmt;
return 0;
}
int toku_dmt_insert_at(DMT dmt, DMTVALUE value, uint32_t index) {
toku::dmt_functor<DMTVALUE> functor(value);
return dmt->insert_at(functor, index);
}
int toku_dmt_set_at (DMT dmt, DMTVALUE value, uint32_t index) {
int r = dmt->delete_at(index);
if (r!=0) return r;
return toku_dmt_insert_at(dmt, value, index);
}
int toku_dmt_delete_at(DMT dmt, uint32_t index) {
return dmt->delete_at(index);
}
int toku_dmt_fetch(DMT dmt, uint32_t i, DMTVALUE *v) {
uint32_t size;
return dmt->fetch(i, &size, v);
}
struct functor {
int (*f)(DMTVALUE, uint32_t, void *);
void *v;
};
static_assert(std::is_pod<functor>::value, "not POD");
int call_functor(const uint32_t size, const DMTVALUE &v, uint32_t idx, functor *const ftor);
int call_functor(const uint32_t size, const DMTVALUE &v, uint32_t idx, functor *const ftor) {
invariant(size == sizeof(DMTVALUE));
return ftor->f(const_cast<DMTVALUE>(v), idx, ftor->v);
}
int toku_dmt_iterate(DMT dmt, int (*f)(DMTVALUE, uint32_t, void*), void*v) {
struct functor ftor = { .f = f, .v = v };
return dmt->iterate<functor, call_functor>(&ftor);
}
int toku_dmt_iterate_on_range(DMT dmt, uint32_t left, uint32_t right, int (*f)(DMTVALUE, uint32_t, void*), void*v) {
struct functor ftor = { .f = f, .v = v };
return dmt->iterate_on_range<functor, call_functor>(left, right, &ftor);
}
struct heftor {
int (*h)(DMTVALUE, void *v);
void *v;
};
static_assert(std::is_pod<heftor>::value, "not POD");
int call_heftor(const uint32_t size, const DMTVALUE &v, const heftor &htor);
int call_heftor(const uint32_t size, const DMTVALUE &v, const heftor &htor) {
invariant(size == sizeof(DMTVALUE));
return htor.h(const_cast<DMTVALUE>(v), htor.v);
}
int toku_dmt_insert(DMT dmt, DMTVALUE value, int(*h)(DMTVALUE, void*v), void *v, uint32_t *index) {
struct heftor htor = { .h = h, .v = v };
toku::dmt_functor<DMTVALUE> functor(value);
return dmt->insert<heftor, call_heftor>(functor, htor, index);
}
int toku_dmt_find_zero(DMT V, int (*h)(DMTVALUE, void*extra), void*extra, DMTVALUE *value, uint32_t *index) {
struct heftor htor = { .h = h, .v = extra };
uint32_t ignore;
return V->find_zero<heftor, call_heftor>(htor, &ignore, value, index);
}
int toku_dmt_find(DMT V, int (*h)(DMTVALUE, void*extra), void*extra, int direction, DMTVALUE *value, uint32_t *index) {
struct heftor htor = { .h = h, .v = extra };
uint32_t ignore;
return V->find<heftor, call_heftor>(htor, direction, &ignore, value, index);
}
int toku_dmt_split_at(DMT dmt, DMT *newdmtp, uint32_t index) {
//TODO: use real split_at when it exists
if (index > dmt->size()) { return EINVAL; }
DMT XMALLOC(newdmt);
newdmt->create();
int r;
for (uint32_t i = index; i < dmt->size(); i++) {
DMTVALUE v;
r = toku_dmt_fetch(dmt, i, &v);
invariant_zero(r);
r = toku_dmt_insert_at(newdmt, v, i-index);
invariant_zero(r);
}
if (dmt->size() > 0) {
for (uint32_t i = dmt->size(); i > index; i--) {
r = toku_dmt_delete_at(dmt, i-1);
invariant_zero(r);
}
}
r = 0;
#if 0
int r = dmt->split_at(newdmt, index);
#endif
if (r != 0) {
toku_free(newdmt);
} else {
*newdmtp = newdmt;
}
return r;
}
int toku_dmt_merge(DMT leftdmt, DMT rightdmt, DMT *newdmtp) {
//TODO: use real merge when it exists
DMT XMALLOC(newdmt);
newdmt->create();
int r;
for (uint32_t i = 0; i < leftdmt->size(); i++) {
DMTVALUE v;
r = toku_dmt_fetch(leftdmt, i, &v);
invariant_zero(r);
r = toku_dmt_insert_at(newdmt, v, i);
invariant_zero(r);
}
uint32_t offset = leftdmt->size();
for (uint32_t i = 0; i < rightdmt->size(); i++) {
DMTVALUE v;
r = toku_dmt_fetch(rightdmt, i, &v);
invariant_zero(r);
r = toku_dmt_insert_at(newdmt, v, i+offset);
invariant_zero(r);
}
leftdmt->destroy();
rightdmt->destroy();
// newdmt->merge(leftdmt, rightdmt);
toku_free(leftdmt);
toku_free(rightdmt);
*newdmtp = newdmt;
return 0;
}
int toku_dmt_clone_noptr(DMT *dest, DMT src) {
DMT XMALLOC(dmt);
dmt->clone(*src);
*dest = dmt;
return 0;
}
void toku_dmt_clear(DMT dmt) {
dmt->clear();
}
size_t toku_dmt_memory_size (DMT dmt) {
return dmt->memory_size();
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#if !defined(TOKU_DMT_WRAPPER_H)
#define TOKU_DMT_WRAPPER_H
#ident "$Id$"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
// Dynamic Order Maintenance Tree (DMT)
//
// Maintains a collection of totally ordered values, where each value has an integer weight.
// The DMT is a mutable datatype.
//
// The Abstraction:
//
// A DMT is a vector of values, $V$, where $|V|$ is the length of the vector.
// The vector is numbered from $0$ to $|V|-1$.
// Each value has a weight. The weight of the $i$th element is denoted $w(V_i)$.
//
// We can create a new DMT, which is the empty vector.
//
// We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
// $|V'|=1+|V|$ and
//
// V'_j = V_j      if $j<i$
//        x        if $j=i$
//        V_{j-1}  if $j>i$.
//
// We can specify $i$ using a kind of function instead of as an integer.
// Let $b$ be a function mapping from values to nonzero integers, such that
// the signum of $b$ is monotonically increasing.
// We can specify $i$ as the minimum integer such that $b(V_i)>0$.
//
// We look up a value using its index, or using a Heaviside function.
// For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
// When looking up values, we can look up
// $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.)
// (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
// $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
// $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.)
//
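// For example (a sketch; assumes the DMTVALUEs point at ints), a Heaviside
// function that locates the first value >= x:
//
//   static int h(DMTVALUE v, void *extra) {
//     int x = *(int *) extra;
//     int val = *(int *) v;
//     return (val < x) ? -1 : (val > x) ? +1 : 0;
//   }
//
// toku_dmt_find_zero with this h returns the first value equal to x (with a
// return of 0), or DB_NOTFOUND with the index set to where x would be inserted.
//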
// When looking up a value using a Heaviside function, we get the value and its index.
//
// We can also split a DMT into two DMTs, splitting the weight of the values evenly.
// Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
// The resulting two DMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
// All of the values from the original DMT go into one of the new DMTs.
// If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
// the new left DMT or the new right DMT is larger.
//
// Performance:
// Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
// The memory required is O(|V|).
//
// The programming API:
//typedef struct value *DMTVALUE; // A slight improvement over using void*.
#include <util/dmt.h>
typedef void *DMTVALUE;
typedef toku::dmt<DMTVALUE> *DMT;
int toku_dmt_create (DMT *dmtp);
// Effect: Create an empty DMT. Stores it in *dmtp.
// Requires: dmtp != NULL
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *dmtp)
// Performance: constant time.
int toku_dmt_create_from_sorted_array(DMT *dmtp, DMTVALUE *values, uint32_t numvalues);
// Effect: Create a DMT containing values. The number of values is in numvalues.
// Stores the new DMT in *dmtp.
// Requires: dmtp != NULL
// Requires: values != NULL
// Requires: values is sorted
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *dmtp)
// Performance: time=O(numvalues)
// Rationale: Normally inserting N values takes O(N lg N) amortized time.
// If the N values are known in advance, are sorted, and
// the structure is empty, we can batch insert them much faster.
int toku_dmt_create_steal_sorted_array(DMT *dmtp, DMTVALUE **valuesp, uint32_t numvalues, uint32_t steal_capacity);
// Effect: Create a DMT containing values. The number of values is in numvalues.
// On success the DMT takes ownership of *valuesp array, and sets valuesp=NULL.
// Requires: dmtp != NULL
// Requires: valuesp != NULL
// Requires: *valuesp is sorted
// Requires: *valuesp was allocated with toku_malloc
// Requires: Capacity of the *valuesp array is <= steal_capacity
// Requires: On success, *valuesp may not be accessed again by the caller.
// Returns:
// 0 success
// ENOMEM out of memory (and doesn't modify *dmtp)
// EINVAL *valuesp == NULL or numvalues > capacity
// Performance: time=O(1)
// Rationale: toku_dmt_create_from_sorted_array takes O(numvalues) time.
// By taking ownership of the array, we save a malloc and memcpy,
// and possibly a free (if the caller is done with the array).
void toku_dmt_destroy(DMT *dmtp);
// Effect: Destroy a DMT, freeing all its memory.
// Does not free the DMTVALUEs stored in the DMT.
// Those values may be freed before or after calling toku_dmt_destroy.
// Also sets *dmtp=NULL.
// Requires: dmtp != NULL
// Requires: *dmtp != NULL
// Rationale: The usage is to do something like
// toku_dmt_destroy(&s->dmt);
// and now s->dmt will have a NULL pointer instead of a dangling freed pointer.
// Rationale: Returns no values since free() cannot fail.
// Rationale: Does not free the DMTVALUEs to reduce complexity.
// Performance: time=O(toku_dmt_size(*dmtp))
uint32_t toku_dmt_size(DMT V);
// Effect: return |V|.
// Requires: V != NULL
// Performance: time=O(1)
int toku_dmt_iterate_on_range(DMT dmt, uint32_t left, uint32_t right, int (*f)(DMTVALUE, uint32_t, void*), void*v);
// Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
// The second argument passed to f is the index of the value.
// The third argument passed to f is v.
// The indices run from 0 (inclusive) to toku_dmt_size(dmt) (exclusive).
// We will iterate only over [left,right)
//
// Requires: dmt != NULL
// left <= right
// Requires: f != NULL
// Returns:
// If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_dmt_iterate.
// If f always returns zero, then toku_dmt_iterate returns 0.
// Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in dmt.
// Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
int toku_dmt_iterate(DMT dmt, int (*f)(DMTVALUE, uint32_t, void*), void*v);
// Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
// The second argument passed to f is the index of the value.
// The third argument passed to f is v.
// The indices run from 0 (inclusive) to toku_dmt_size(dmt) (exclusive).
// Requires: dmt != NULL
// Requires: f != NULL
// Returns:
// If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_dmt_iterate.
// If f always returns zero, then toku_dmt_iterate returns 0.
// Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in dmt.
// Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
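// Example caller (sketch): summing int values stored in a dmt.
//   static int sum_cb(DMTVALUE v, uint32_t idx, void *extra) {
//     (void) idx;
//     *(int *) extra += *(int *) v;
//     return 0; // a nonzero return would stop the iteration early
//   }
//   int total = 0;
//   toku_dmt_iterate(dmt, sum_cb, &total);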
int toku_dmt_insert_at(DMT dmt, DMTVALUE value, uint32_t idx);
// Effect: Increases indexes of all items at slot >= index by 1.
// Insert value into the position at index.
//
// Returns:
// 0 success
// EINVAL if index>toku_dmt_size(dmt)
// ENOMEM
// On error, dmt is unchanged.
// Performance: time=O(\log N) amortized time.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
int toku_dmt_set_at (DMT dmt, DMTVALUE value, uint32_t idx);
// Effect: Replaces the item at index with value.
// Returns:
// 0 success
// EINVAL if index>=toku_dmt_size(dmt)
// On error, dmt is unchanged.
// Performance: time=O(\log N)
// Rationale: The BRT needs to be able to replace a value with another copy of the same value (allocated in a different location)
int toku_dmt_insert(DMT dmt, DMTVALUE value, int(*h)(DMTVALUE, void*v), void *v, uint32_t *idx);
// Effect: Insert value into the DMT.
// If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
// Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
// If no such i exists, then let i be |V|
// Then this has the same effect as
// dmt_insert_at(tree, value, i);
// If index!=NULL then i is stored in *index
// Requires: The signum of h must be monotonically increasing.
// Returns:
// 0 success
// DB_KEYEXIST the key is present (h was equal to zero for some value)
// ENOMEM
// On nonzero return, dmt is unchanged.
// On nonzero non-DB_KEYEXIST return, *index is unchanged.
// Performance: time=O(\log N) amortized.
// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
int toku_dmt_delete_at(DMT dmt, uint32_t idx);
// Effect: Delete the item in slot index.
// Decreases indexes of all items at slot >= index by 1.
// Returns
// 0 success
// EINVAL if index>=toku_dmt_size(dmt)
// On error, dmt is unchanged.
// Rationale: To delete an item, first find its index using toku_dmt_find, then delete it.
// Performance: time=O(\log N) amortized.
int toku_dmt_fetch (DMT V, uint32_t i, DMTVALUE *v);
// Effect: Set *v=V_i
// Requires: v != NULL
// Returns
// 0 success
// EINVAL if i>=toku_dmt_size(V)
// On nonzero return, *v is unchanged.
// Performance: time=O(\log N)
int toku_dmt_find_zero(DMT V, int (*h)(DMTVALUE, void*extra), void*extra, DMTVALUE *value, uint32_t *idx);
// Effect: Find the smallest i such that h(V_i, extra)>=0
// If there is such an i and h(V_i,extra)==0 then set *index=i and return 0.
// If there is such an i and h(V_i,extra)>0 then set *index=i and return DB_NOTFOUND.
// If there is no such i then set *index=toku_dmt_size(V) and return DB_NOTFOUND.
// Requires: index!=NULL
int toku_dmt_find(DMT V, int (*h)(DMTVALUE, void*extra), void*extra, int direction, DMTVALUE *value, uint32_t *idx);
// Effect:
// If direction >0 then find the smallest i such that h(V_i,extra)>0.
// If direction <0 then find the largest i such that h(V_i,extra)<0.
// (Direction may not be equal to zero.)
// If value!=NULL then store V_i in *value
// If index!=NULL then store i in *index.
// Requires: The signum of h is monotonically increasing.
// Returns
// 0 success
// DB_NOTFOUND no such value is found.
// On nonzero return, *value and *index are unchanged.
// Performance: time=O(\log N)
// Rationale:
// Here's how to use the find function to find various things
// Cases for find:
// find first value:          ( h(v)=+1, direction=+1 )
// find last value            ( h(v)=-1, direction=-1 )
// find first X               ( h(v)=(v< x) ? -1 : 1    direction=+1 )
// find last X                ( h(v)=(v<=x) ? -1 : 1    direction=-1 )
// find X or successor to X   ( same as find first X. )
//
// Rationale: To help understand heaviside functions and the behavior of find:
// There are 7 kinds of heaviside functions.
// The signum of h must be monotonically increasing.
// Given a function of the following form, A is the element
// returned for direction<0, B is the element returned
// for direction>0, C is the element returned for
// direction==0 (see find_zero) (with a return of 0), and D is the element
// returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
// If any of A, B, or C are not found, then asking for the
// associated direction will return DB_NOTFOUND.
// See find_zero for more information.
//
// Let the following represent the signum of the heaviside function.
//
//    -...-
//        A
//         D
//
//    +...+
//    B
//    D
//
//    0...0
//    C
//
//    -...-0...0
//        AC
//
//    0...0+...+
//    C    B
//
//    -...-+...+
//        AB
//         D
//
//    -...-0...0+...+
//        AC    B
int toku_dmt_split_at(DMT dmt, DMT *newdmt, uint32_t idx);
// Effect: Create a new DMT, storing it in *newdmt.
// The values to the right of index (starting at index) are moved to *newdmt.
// Requires: dmt != NULL
// Requires: newdmt != NULL
// Returns
// 0 success,
// EINVAL if index > toku_dmt_size(dmt)
// ENOMEM
// On nonzero return, dmt and *newdmt are unmodified.
// Performance: time=O(n)
// Rationale: We don't need a split-evenly operation. We need to split so that the items'
//  total sizes are even, and to support other similar splitting criteria. Splitting evenly
//  by count is easy: call toku_dmt_size() and divide by two (see the sketch below).
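// Editor's sketch of the easy split-evenly-by-count case mentioned above:
//
//    DMT right = NULL;
//    int r = toku_dmt_split_at(dmt, &right, toku_dmt_size(dmt) / 2);
//    // On success, dmt keeps the left half and right holds the rest.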
int toku_dmt_merge(DMT leftdmt, DMT rightdmt, DMT *newdmt);
// Effect: Appends leftdmt and rightdmt to produce a new dmt.
// Sets *newdmt to the new dmt.
// On success, leftdmt and rightdmt are destroyed.
// Returns 0 on success
// ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.
int toku_dmt_clone_noptr(DMT *dest, DMT src);
// Effect: Creates a copy of a dmt.
//  Sets *dest to the clone.
//  Each element is assumed to be stored directly in the dmt; that is, the DMTVALUEs are not pointers but data, so no extra memory allocation is required.
// Returns 0 on success
// ENOMEM on out of memory.
// On error, nothing is modified.
// Performance: time between O(n) and O(n log n), depending on how long it
//  takes to traverse src.
void toku_dmt_clear(DMT dmt);
// Effect: Set the tree to be empty.
// Note: Will not reallocate or resize any memory, since returning void precludes calling malloc.
// Performance: time=O(1)
size_t toku_dmt_memory_size (DMT dmt);
// Effect: Return the size (in bytes) of the dmt, as it resides in main memory. Don't include any of the DMTVALUES.
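// End-to-end usage sketch (editor's example, using only calls documented in
// this header):
//
//    DMT dmt;
//    int r = toku_dmt_create(&dmt);          // 0 on success
//    r = toku_dmt_insert_at(dmt, value, 0);  // insert value at slot 0
//    DMTVALUE v;
//    r = toku_dmt_fetch(dmt, 0, &v);         // now v == value
//    r = toku_dmt_delete_at(dmt, 0);         // dmt is empty again
//    toku_dmt_destroy(&dmt);                 // dmt == NULL afterwards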
#endif /* #ifndef TOKU_DMT_WRAPPER_H */
......@@ -1178,6 +1178,8 @@ typedef enum {
FT_PRO_NUM_STOP_LOCK_CHILD,
FT_PRO_NUM_STOP_CHILD_INMEM,
FT_PRO_NUM_DIDNT_WANT_PROMOTE,
FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
FT_STATUS_NUM_ROWS
} ft_status_entry;
......
......@@ -363,6 +363,8 @@ status_init(void)
STATUS_INIT(FT_PRO_NUM_STOP_LOCK_CHILD, PROMOTION_STOPPED_CHILD_LOCKED_OR_NOT_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was locked or not at all in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_PRO_NUM_STOP_CHILD_INMEM, PROMOTION_STOPPED_CHILD_NOT_FULLY_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was not fully in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
ft_status.initialized = true;
}
......@@ -389,6 +391,14 @@ toku_ft_get_status(FT_STATUS s) {
} \
} while (0)
void toku_note_deserialized_basement_node(bool fixed_key_size) {
if (fixed_key_size) {
STATUS_INC(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 1);
} else {
STATUS_INC(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, 1);
}
}
bool is_entire_node_in_memory(FTNODE node) {
for (int i = 0; i < node->n_children; i++) {
if(BP_STATE(node,i) != PT_AVAIL) {
......@@ -595,6 +605,7 @@ ftnode_memory_size (FTNODE node)
int n_children = node->n_children;
retval += sizeof(*node);
retval += (n_children)*(sizeof(node->bp[0]));
retval += (n_children > 0 ? n_children-1 : 0)*(sizeof(node->childkeys[0]));
retval += node->totalchildkeylens;
// now calculate the sizes of the partitions
......@@ -1722,6 +1733,8 @@ toku_ft_bn_apply_cmd_once (
&new_le,
&numbytes_delta
);
// at this point, we cannot trust cmd->u.id.key to be valid.
// The dmt may have realloced its mempool and freed the one containing key.
newsize = new_le ? (leafentry_memsize(new_le) + key_storage_size) : 0;
if (le && new_le) {
......@@ -1986,6 +1999,7 @@ toku_ft_bn_apply_cmd (
int deleted = 0;
if (!le_is_clean(storeddata)) { //If already clean, nothing to do.
toku_ft_bn_apply_cmd_once(bn, cmd, idx, storeddata, oldest_referenced_xid_known, gc_info, workdone, stats_to_update);
// at this point, we cannot trust cmd->u.id.key to be valid.
uint32_t new_omt_size = bn->data_buffer.omt_size();
if (new_omt_size != omt_size) {
paranoid_invariant(new_omt_size+1 == omt_size);
......
......@@ -351,6 +351,8 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);
extern bool garbage_collection_debug;
void toku_note_deserialized_basement_node(bool fixed_key_size);
// This is a poor place to put global options like these.
void toku_ft_set_direct_io(bool direct_io_on);
void toku_ft_set_compress_buffers_before_eviction(bool compress_buffers);
......
......@@ -118,7 +118,7 @@ enum ft_layout_version_e {
FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
FT_LAYOUT_VERSION_23 = 23, // Ming: Fix upgrade path #5902
FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs
FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry
FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry, basements store key/vals separately on disk
FT_NEXT_VERSION, // the version after the current version
FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported
......
......@@ -320,7 +320,7 @@ serialize_ftnode_partition_size (FTNODE node, int i)
result += toku_bnc_nbytesinbuf(BNC(node, i));
}
else {
result += 4; // n_entries in buffer table
result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header
result += BLB_NBYTESINDATA(node, i);
}
result += 4; // checksum
......@@ -380,10 +380,16 @@ serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) {
wbuf_nocrc_char(&wb, ch);
wbuf_nocrc_uint(&wb, bd->omt_size());
//
// iterate over leafentries and place them into the buffer
//
bd->omt_iterate<struct wbuf, wbufwriteleafentry>(&wb);
bd->prepare_to_serialize();
bd->serialize_header(&wb);
if (bd->need_to_serialize_each_leafentry_with_key()) {
//
// iterate over leafentries and place them into the buffer
//
bd->omt_iterate<struct wbuf, wbufwriteleafentry>(&wb);
} else {
bd->serialize_rest(&wb);
}
}
uint32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
wbuf_nocrc_int(&wb, end_to_end_checksum);
......@@ -592,9 +598,14 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
// Create an array that will store the size of each basement.
// This is the sum of the leaf sizes of all the leaves in that basement.
// We don't know how many basements there will be, so we use num_le as the upper bound.
toku::scoped_malloc bn_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_sizes = reinterpret_cast<size_t *>(bn_sizes_buf.get());
bn_sizes[0] = 0;
// Sum of all le sizes in a single basement
toku::scoped_calloc bn_le_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_le_sizes = reinterpret_cast<size_t *>(bn_le_sizes_buf.get());
// Sum of all key sizes in a single basement
toku::scoped_calloc bn_key_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_key_sizes = reinterpret_cast<size_t *>(bn_key_sizes_buf.get());
// TODO 4050: All these arrays should be combined into a single array of some bn_info struct (pivot, msize, num_les).
// Each entry is the number of leafentries in this basement. (Again, num_le is an overkill upper bound.)
......@@ -611,17 +622,20 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
for (uint32_t i = 0; i < num_le; i++) {
uint32_t curr_le_size = leafentry_disksize((LEAFENTRY) leafpointers[i]);
le_sizes[i] = curr_le_size;
if ((bn_size_so_far + curr_le_size > basementnodesize) && (num_le_in_curr_bn != 0)) {
if ((bn_size_so_far + curr_le_size + sizeof(uint32_t) + key_sizes[i] > basementnodesize) && (num_le_in_curr_bn != 0)) {
// cap off the current basement node to end with the element before i
new_pivots[curr_pivot] = i-1;
curr_pivot++;
num_le_in_curr_bn = 0;
bn_size_so_far = 0;
bn_le_sizes[curr_pivot] = 0;
bn_key_sizes[curr_pivot] = 0;
}
num_le_in_curr_bn++;
num_les_this_bn[curr_pivot] = num_le_in_curr_bn;
bn_le_sizes[curr_pivot] += curr_le_size;
bn_key_sizes[curr_pivot] += sizeof(uint32_t) + key_sizes[i]; // uint32_t le_offset
bn_size_so_far += curr_le_size + sizeof(uint32_t) + key_sizes[i];
bn_sizes[curr_pivot] = bn_size_so_far;
}
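// Editor's worked example of the accounting above: with basementnodesize =
// 128*1024, a 100-byte leafentry with a 16-byte key is charged
// 100 + 4 + 16 = 120 bytes (the 4 being the uint32_t key offset), so about
// 128*1024/120 ~= 1092 such entries fit before a new pivot is cut.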
// curr_pivot is now the total number of pivot keys in the leaf node
int num_pivots = curr_pivot;
......@@ -688,9 +702,6 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
uint32_t num_les_to_copy = num_les_this_bn[i];
invariant(num_les_to_copy == num_in_bn);
// construct mempool for this basement
size_t size_this_bn = bn_sizes[i];
BN_DATA bd = BLB_DATA(node, i);
bd->replace_contents_with_clone_of_sorted_array(
num_les_to_copy,
......@@ -698,7 +709,8 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
&key_sizes[baseindex_this_bn],
&leafpointers[baseindex_this_bn],
&le_sizes[baseindex_this_bn],
size_this_bn
bn_key_sizes[i], // Total key sizes
bn_le_sizes[i] // total le sizes
);
BP_STATE(node,i) = PT_AVAIL;
......@@ -1546,10 +1558,9 @@ deserialize_ftnode_partition(
uint32_t num_entries = rbuf_int(&rb);
// we are now at the first byte of first leafentry
data_size -= rb.ndone; // remaining bytes of leafentry data
BASEMENTNODE bn = BLB(node, childnum);
bn->data_buffer.initialize_from_data(num_entries, &rb.buf[rb.ndone], data_size);
rb.ndone += data_size;
bn->data_buffer.initialize_from_data(num_entries, &rb, data_size, node->layout_version_read_from_disk);
}
assert(rb.ndone == rb.size);
exit:
......@@ -2101,8 +2112,7 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
if (has_end_to_end_checksum) {
data_size -= sizeof(uint32_t);
}
bn->data_buffer.initialize_from_data(n_in_buf, &rb->buf[rb->ndone], data_size);
rb->ndone += data_size;
bn->data_buffer.initialize_from_data(n_in_buf, rb, data_size, node->layout_version_read_from_disk);
}
// Whatever this is must be less than the MSNs of every message above
......
......@@ -98,6 +98,7 @@ struct memarena {
char *buf;
size_t buf_used, buf_size;
size_t size_of_other_bufs; // the buf_size of all the other bufs.
size_t footprint_of_other_bufs; // the footprint of all the other bufs.
char **other_bufs;
int n_other_bufs;
};
......@@ -108,6 +109,7 @@ MEMARENA memarena_create_presized (size_t initial_size) {
result->buf_used = 0;
result->other_bufs = NULL;
result->size_of_other_bufs = 0;
result->footprint_of_other_bufs = 0;
result->n_other_bufs = 0;
XMALLOC_N(result->buf_size, result->buf);
return result;
......@@ -128,6 +130,7 @@ void memarena_clear (MEMARENA ma) {
// But reuse the main buffer
ma->buf_used = 0;
ma->size_of_other_bufs = 0;
ma->footprint_of_other_bufs = 0;
}
static size_t
......@@ -151,6 +154,7 @@ void* malloc_in_memarena (MEMARENA ma, size_t size) {
ma->other_bufs[old_n]=ma->buf;
ma->n_other_bufs = old_n+1;
ma->size_of_other_bufs += ma->buf_size;
ma->footprint_of_other_bufs += toku_memory_footprint(ma->buf, ma->buf_used);
}
// Make a new one
{
......@@ -217,7 +221,9 @@ void memarena_move_buffers(MEMARENA dest, MEMARENA source) {
#endif
dest ->size_of_other_bufs += source->size_of_other_bufs + source->buf_size;
dest ->footprint_of_other_bufs += source->footprint_of_other_bufs + toku_memory_footprint(source->buf, source->buf_used);
source->size_of_other_bufs = 0;
source->footprint_of_other_bufs = 0;
assert(other_bufs);
dest->other_bufs = other_bufs;
......@@ -247,3 +253,11 @@ memarena_total_size_in_use (MEMARENA m)
{
return m->size_of_other_bufs + m->buf_used;
}
size_t
memarena_total_footprint (MEMARENA m)
{
return m->footprint_of_other_bufs + toku_memory_footprint(m->buf, m->buf_used) +
sizeof(*m) +
m->n_other_bufs * sizeof(*m->other_bufs);
}
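// Editor's sketch (hypothetical helper, not in the original source): the
// footprint also charges malloc overhead, the arena struct, and the
// other-bufs array, so it is normally at least the in-use size.
static void memarena_print_stats(MEMARENA ma) {
    printf("in use: %zu bytes, footprint: %zu bytes\n",
           memarena_total_size_in_use(ma),
           memarena_total_footprint(ma));
}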
......@@ -129,5 +129,6 @@ size_t memarena_total_memory_size (MEMARENA);
size_t memarena_total_size_in_use (MEMARENA);
size_t memarena_total_footprint (MEMARENA);
#endif
......@@ -146,7 +146,7 @@ PAIR_ATTR
rollback_memory_size(ROLLBACK_LOG_NODE log) {
size_t size = sizeof(*log);
if (log->rollentry_arena) {
size += memarena_total_memory_size(log->rollentry_arena);
size += memarena_total_footprint(log->rollentry_arena);
}
return make_rollback_pair_attr(size);
}
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#include "test.h"
#include "dmt-wrapper.h"
#include <util/dmt.h>
typedef DMTVALUE TESTVALUE;
static void
parse_args (int argc, const char *argv[]) {
const char *argv0=argv[0];
while (argc>1) {
int resultcode=0;
if (strcmp(argv[1], "-v")==0) {
verbose++;
} else if (strcmp(argv[1], "-q")==0) {
verbose = 0;
} else if (strcmp(argv[1], "-h")==0) {
do_usage:
fprintf(stderr, "Usage:\n%s [-v|-h]\n", argv0);
exit(resultcode);
} else {
resultcode=1;
goto do_usage;
}
argc--;
argv++;
}
}
/* End ".h like" stuff. */
struct value {
uint32_t number;
};
#define V(x) ((struct value *)(x))
enum rand_type {
TEST_RANDOM,
TEST_SORTED,
TEST_IDENTITY
};
enum close_when_done {
CLOSE_WHEN_DONE,
KEEP_WHEN_DONE
};
enum create_type {
STEAL_ARRAY,
BATCH_INSERT,
INSERT_AT,
INSERT_AT_ALMOST_RANDOM,
};
/* Globals */
DMT global_dmt;
TESTVALUE* values = NULL;
struct value* nums = NULL;
uint32_t length;
static void
cleanup_globals (void) {
assert(values);
toku_free(values);
values = NULL;
assert(nums);
toku_free(nums);
nums = NULL;
}
const unsigned int random_seed = 0xFEADACBA;
static void
init_init_values (unsigned int seed, uint32_t num_elements) {
srandom(seed);
cleanup_globals();
MALLOC_N(num_elements, values);
assert(values);
MALLOC_N(num_elements, nums);
assert(nums);
length = num_elements;
}
static void
init_identity_values (unsigned int seed, uint32_t num_elements) {
uint32_t i;
init_init_values(seed, num_elements);
for (i = 0; i < length; i++) {
nums[i].number = i;
values[i] = (TESTVALUE)&nums[i];
}
}
static void
init_distinct_sorted_values (unsigned int seed, uint32_t num_elements) {
uint32_t i;
init_init_values(seed, num_elements);
uint32_t number = 0;
for (i = 0; i < length; i++) {
number += (uint32_t)(random() % 32) + 1;
nums[i].number = number;
values[i] = (TESTVALUE)&nums[i];
}
}
static void
init_distinct_random_values (unsigned int seed, uint32_t num_elements) {
init_distinct_sorted_values(seed, num_elements);
uint32_t i;
uint32_t choice;
uint32_t choices;
struct value temp;
for (i = 0; i < length - 1; i++) {
choices = length - i;
choice = random() % choices;
if (choice != i) {
temp = nums[i];
nums[i] = nums[choice];
nums[choice] = temp;
}
}
}
static void
init_globals (void) {
MALLOC_N(1, values);
assert(values);
MALLOC_N(1, nums);
assert(nums);
length = 1;
}
static void
test_close (enum close_when_done do_close) {
if (do_close == KEEP_WHEN_DONE) return;
assert(do_close == CLOSE_WHEN_DONE);
toku_dmt_destroy(&global_dmt);
assert(global_dmt==NULL);
}
static void
test_create (enum close_when_done do_close) {
int r;
global_dmt = NULL;
r = toku_dmt_create(&global_dmt);
CKERR(r);
assert(global_dmt!=NULL);
test_close(do_close);
}
static void
test_create_size (enum close_when_done do_close) {
test_create(KEEP_WHEN_DONE);
assert(toku_dmt_size(global_dmt) == 0);
test_close(do_close);
}
static void
test_create_insert_at_almost_random (enum close_when_done do_close) {
uint32_t i;
int r;
uint32_t size = 0;
test_create(KEEP_WHEN_DONE);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
CKERR2(r, EINVAL);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
CKERR2(r, EINVAL);
for (i = 0; i < length/2; i++) {
assert(size==toku_dmt_size(global_dmt));
r = toku_dmt_insert_at(global_dmt, values[i], i);
CKERR(r);
assert(++size==toku_dmt_size(global_dmt));
r = toku_dmt_insert_at(global_dmt, values[length-1-i], i+1);
CKERR(r);
assert(++size==toku_dmt_size(global_dmt));
}
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
CKERR2(r, EINVAL);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
CKERR2(r, EINVAL);
assert(size==toku_dmt_size(global_dmt));
test_close(do_close);
}
static void
test_create_insert_at_sequential (enum close_when_done do_close) {
uint32_t i;
int r;
uint32_t size = 0;
test_create(KEEP_WHEN_DONE);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
CKERR2(r, EINVAL);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
CKERR2(r, EINVAL);
for (i = 0; i < length; i++) {
assert(size==toku_dmt_size(global_dmt));
r = toku_dmt_insert_at(global_dmt, values[i], i);
CKERR(r);
assert(++size==toku_dmt_size(global_dmt));
}
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+1);
CKERR2(r, EINVAL);
r = toku_dmt_insert_at(global_dmt, values[0], toku_dmt_size(global_dmt)+2);
CKERR2(r, EINVAL);
assert(size==toku_dmt_size(global_dmt));
test_close(do_close);
}
static void
test_create_from_sorted_array (enum create_type create_choice, enum close_when_done do_close) {
int r;
global_dmt = NULL;
if (create_choice == BATCH_INSERT) {
r = toku_dmt_create_from_sorted_array(&global_dmt, values, length);
CKERR(r);
}
else if (create_choice == STEAL_ARRAY) {
TESTVALUE* MALLOC_N(length, values_copy);
memcpy(values_copy, values, length*sizeof(*values));
r = toku_dmt_create_steal_sorted_array(&global_dmt, &values_copy, length, length);
CKERR(r);
assert(values_copy==NULL);
}
else if (create_choice == INSERT_AT) {
test_create_insert_at_sequential(KEEP_WHEN_DONE);
}
else if (create_choice == INSERT_AT_ALMOST_RANDOM) {
test_create_insert_at_almost_random(KEEP_WHEN_DONE);
}
else assert(false);
assert(global_dmt!=NULL);
test_close(do_close);
}
static void
test_create_from_sorted_array_size (enum create_type create_choice, enum close_when_done do_close) {
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
assert(toku_dmt_size(global_dmt)==length);
test_close(do_close);
}
static void
test_fetch_verify (DMT dmtree, TESTVALUE* val, uint32_t len ) {
uint32_t i;
int r;
TESTVALUE v = (TESTVALUE)&i;
TESTVALUE oldv = v;
assert(len == toku_dmt_size(dmtree));
for (i = 0; i < len; i++) {
assert(oldv!=val[i]);
v = NULL;
r = toku_dmt_fetch(dmtree, i, &v);
CKERR(r);
assert(v != NULL);
assert(v != oldv);
assert(v == val[i]);
assert(V(v)->number == V(val[i])->number);
v = oldv;
}
for (i = len; i < len*2; i++) {
v = oldv;
r = toku_dmt_fetch(dmtree, i, &v);
CKERR2(r, EINVAL);
assert(v == oldv);
}
}
static void
test_create_fetch_verify (enum create_type create_choice, enum close_when_done do_close) {
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
test_fetch_verify(global_dmt, values, length);
test_close(do_close);
}
static int iterate_helper_error_return = 1;
static int
iterate_helper (TESTVALUE v, uint32_t idx, void* extra) {
if (extra == NULL) return iterate_helper_error_return;
TESTVALUE* vals = (TESTVALUE *)extra;
assert(v != NULL);
assert(v == vals[idx]);
assert(V(v)->number == V(vals[idx])->number);
return 0;
}
static void
test_iterate_verify (DMT dmtree, TESTVALUE* vals, uint32_t len) {
int r;
iterate_helper_error_return = 0;
r = toku_dmt_iterate(dmtree, iterate_helper, (void*)vals);
CKERR(r);
iterate_helper_error_return = 0xFEEDABBA;
r = toku_dmt_iterate(dmtree, iterate_helper, NULL);
if (!len) {
CKERR2(r, 0);
}
else {
CKERR2(r, iterate_helper_error_return);
}
}
static void
test_create_iterate_verify (enum create_type create_choice, enum close_when_done do_close) {
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
test_iterate_verify(global_dmt, values, length);
test_close(do_close);
}
static void
permute_array (uint32_t* arr, uint32_t len) {
//
// create a permutation of 0...size-1
//
uint32_t i = 0;
for (i = 0; i < len; i++) {
arr[i] = i;
}
for (i = 0; i < len - 1; i++) {
uint32_t choices = len - i;
uint32_t choice = random() % choices;
if (choice != i) {
uint32_t temp = arr[i];
arr[i] = arr[choice];
arr[choice] = temp;
}
}
}
static void
test_create_set_at (enum create_type create_choice, enum close_when_done do_close) {
uint32_t i = 0;
struct value* old_nums = NULL;
MALLOC_N(length, old_nums);
assert(old_nums);
uint32_t* perm = NULL;
MALLOC_N(length, perm);
assert(perm);
TESTVALUE* old_values = NULL;
MALLOC_N(length, old_values);
assert(old_values);
permute_array(perm, length);
//
// These are going to be the new values
//
for (i = 0; i < length; i++) {
old_nums[i] = nums[i];
old_values[i] = &old_nums[i];
values[i] = &old_nums[i];
}
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
int r;
r = toku_dmt_set_at (global_dmt, values[0], length);
CKERR2(r,EINVAL);
r = toku_dmt_set_at (global_dmt, values[0], length+1);
CKERR2(r,EINVAL);
for (i = 0; i < length; i++) {
uint32_t choice = perm[i];
values[choice] = &nums[choice];
nums[choice].number = (uint32_t)random();
r = toku_dmt_set_at (global_dmt, values[choice], choice);
CKERR(r);
test_iterate_verify(global_dmt, values, length);
test_fetch_verify(global_dmt, values, length);
}
r = toku_dmt_set_at (global_dmt, values[0], length);
CKERR2(r,EINVAL);
r = toku_dmt_set_at (global_dmt, values[0], length+1);
CKERR2(r,EINVAL);
toku_free(perm);
toku_free(old_values);
toku_free(old_nums);
test_close(do_close);
}
static int
insert_helper (TESTVALUE value, void* extra_insert) {
TESTVALUE to_insert = (DMTVALUE)extra_insert;
assert(to_insert);
if (V(value)->number < V(to_insert)->number) return -1;
if (V(value)->number > V(to_insert)->number) return +1;
return 0;
}
static void
test_create_insert (enum close_when_done do_close) {
uint32_t i = 0;
uint32_t* perm = NULL;
MALLOC_N(length, perm);
assert(perm);
permute_array(perm, length);
test_create(KEEP_WHEN_DONE);
int r;
uint32_t size = length;
length = 0;
while (length < size) {
uint32_t choice = perm[length];
TESTVALUE to_insert = &nums[choice];
uint32_t idx = UINT32_MAX;
assert(length==toku_dmt_size(global_dmt));
r = toku_dmt_insert(global_dmt, to_insert, insert_helper, to_insert, &idx);
CKERR(r);
assert(idx <= length);
if (idx > 0) {
assert(V(to_insert)->number > V(values[idx-1])->number);
}
if (idx < length) {
assert(V(to_insert)->number < V(values[idx])->number);
}
length++;
assert(length==toku_dmt_size(global_dmt));
/* Make room */
for (i = length-1; i > idx; i--) {
values[i] = values[i-1];
}
values[idx] = to_insert;
test_fetch_verify(global_dmt, values, length);
test_iterate_verify(global_dmt, values, length);
idx = UINT32_MAX;
r = toku_dmt_insert(global_dmt, to_insert, insert_helper, to_insert, &idx);
CKERR2(r, DB_KEYEXIST);
assert(idx < length);
assert(V(values[idx])->number == V(to_insert)->number);
assert(length==toku_dmt_size(global_dmt));
test_iterate_verify(global_dmt, values, length);
test_fetch_verify(global_dmt, values, length);
}
toku_free(perm);
test_close(do_close);
}
static void
test_create_delete_at (enum create_type create_choice, enum close_when_done do_close) {
uint32_t i = 0;
int r = ENOSYS;
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
assert(length == toku_dmt_size(global_dmt));
r = toku_dmt_delete_at(global_dmt, length);
CKERR2(r,EINVAL);
assert(length == toku_dmt_size(global_dmt));
r = toku_dmt_delete_at(global_dmt, length+1);
CKERR2(r,EINVAL);
while (length > 0) {
assert(length == toku_dmt_size(global_dmt));
uint32_t index_to_delete = random()%length;
r = toku_dmt_delete_at(global_dmt, index_to_delete);
CKERR(r);
for (i = index_to_delete+1; i < length; i++) {
values[i-1] = values[i];
}
length--;
test_fetch_verify(global_dmt, values, length);
test_iterate_verify(global_dmt, values, length);
}
assert(length == 0);
assert(length == toku_dmt_size(global_dmt));
r = toku_dmt_delete_at(global_dmt, length);
CKERR2(r, EINVAL);
assert(length == toku_dmt_size(global_dmt));
r = toku_dmt_delete_at(global_dmt, length+1);
CKERR2(r, EINVAL);
test_close(do_close);
}
static void
test_split_merge (enum create_type create_choice, enum close_when_done do_close) {
int r = ENOSYS;
uint32_t i = 0;
DMT left_split = NULL;
DMT right_split = NULL;
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
for (i = 0; i <= length; i++) {
r = toku_dmt_split_at(global_dmt, &right_split, length+1);
CKERR2(r,EINVAL);
r = toku_dmt_split_at(global_dmt, &right_split, length+2);
CKERR2(r,EINVAL);
//
// test successful split
//
r = toku_dmt_split_at(global_dmt, &right_split, i);
CKERR(r);
left_split = global_dmt;
global_dmt = NULL;
assert(toku_dmt_size(left_split) == i);
assert(toku_dmt_size(right_split) == length - i);
test_fetch_verify(left_split, values, i);
test_iterate_verify(left_split, values, i);
test_fetch_verify(right_split, &values[i], length - i);
test_iterate_verify(right_split, &values[i], length - i);
//
// verify that new global_dmt's cannot do bad splits
//
r = toku_dmt_split_at(left_split, &global_dmt, i+1);
CKERR2(r,EINVAL);
assert(toku_dmt_size(left_split) == i);
assert(toku_dmt_size(right_split) == length - i);
r = toku_dmt_split_at(left_split, &global_dmt, i+2);
CKERR2(r,EINVAL);
assert(toku_dmt_size(left_split) == i);
assert(toku_dmt_size(right_split) == length - i);
r = toku_dmt_split_at(right_split, &global_dmt, length - i + 1);
CKERR2(r,EINVAL);
assert(toku_dmt_size(left_split) == i);
assert(toku_dmt_size(right_split) == length - i);
r = toku_dmt_split_at(right_split, &global_dmt, length - i + 2);
CKERR2(r,EINVAL);
assert(toku_dmt_size(left_split) == i);
assert(toku_dmt_size(right_split) == length - i);
//
// test merge
//
r = toku_dmt_merge(left_split,right_split,&global_dmt);
CKERR(r);
left_split = NULL;
right_split = NULL;
assert(toku_dmt_size(global_dmt) == length);
test_fetch_verify(global_dmt, values, length);
test_iterate_verify(global_dmt, values, length);
}
test_close(do_close);
}
static void
init_values (enum rand_type rand_choice) {
const uint32_t test_size = 100;
if (rand_choice == TEST_RANDOM) {
init_distinct_random_values(random_seed, test_size);
}
else if (rand_choice == TEST_SORTED) {
init_distinct_sorted_values(random_seed, test_size);
}
else if (rand_choice == TEST_IDENTITY) {
init_identity_values( random_seed, test_size);
}
else assert(false);
}
static void
test_create_array (enum create_type create_choice, enum rand_type rand_choice) {
/* ********************************************************************** */
init_values(rand_choice);
test_create_from_sorted_array( create_choice, CLOSE_WHEN_DONE);
test_create_from_sorted_array_size(create_choice, CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_create_fetch_verify( create_choice, CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_create_iterate_verify( create_choice, CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_create_set_at( create_choice, CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_create_delete_at( create_choice, CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_create_insert( CLOSE_WHEN_DONE);
/* ********************************************************************** */
init_values(rand_choice);
test_split_merge( create_choice, CLOSE_WHEN_DONE);
}
typedef struct {
uint32_t first_zero;
uint32_t first_pos;
} h_extra;
static int
test_heaviside (DMTVALUE v_dmt, void* x) {
TESTVALUE v = (DMTVALUE) v_dmt;
h_extra* extra = (h_extra*)x;
assert(v && x);
assert(extra->first_zero <= extra->first_pos);
uint32_t value = V(v)->number;
if (value < extra->first_zero) return -1;
if (value < extra->first_pos) return 0;
return 1;
}
static void
heavy_extra (h_extra* extra, uint32_t first_zero, uint32_t first_pos) {
extra->first_zero = first_zero;
extra->first_pos = first_pos;
}
static void
test_find_dir (int dir, void* extra, int (*h)(DMTVALUE, void*),
int r_expect, bool idx_will_change, uint32_t idx_expect,
uint32_t number_expect, bool UU(cursor_valid)) {
uint32_t idx = UINT32_MAX;
uint32_t old_idx = idx;
TESTVALUE dmt_val;
int r;
/* Verify we can pass NULL value. */
dmt_val = NULL;
idx = old_idx;
if (dir == 0) {
r = toku_dmt_find_zero(global_dmt, h, extra, NULL, &idx);
}
else {
r = toku_dmt_find( global_dmt, h, extra, dir, NULL, &idx);
}
CKERR2(r, r_expect);
if (idx_will_change) {
assert(idx == idx_expect);
}
else {
assert(idx == old_idx);
}
assert(dmt_val == NULL);
/* Verify we can pass NULL idx. */
dmt_val = NULL;
idx = old_idx;
if (dir == 0) {
r = toku_dmt_find_zero(global_dmt, h, extra, &dmt_val, 0);
}
else {
r = toku_dmt_find( global_dmt, h, extra, dir, &dmt_val, 0);
}
CKERR2(r, r_expect);
assert(idx == old_idx);
if (r == DB_NOTFOUND) {
assert(dmt_val == NULL);
}
else {
assert(V(dmt_val)->number == number_expect);
}
/* Verify we can pass NULL both. */
dmt_val = NULL;
idx = old_idx;
if (dir == 0) {
r = toku_dmt_find_zero(global_dmt, h, extra, NULL, 0);
}
else {
r = toku_dmt_find( global_dmt, h, extra, dir, NULL, 0);
}
CKERR2(r, r_expect);
assert(idx == old_idx);
assert(dmt_val == NULL);
}
static void
test_find (enum create_type create_choice, enum close_when_done do_close) {
h_extra extra;
init_identity_values(random_seed, 100);
test_create_from_sorted_array(create_choice, KEEP_WHEN_DONE);
/*
-...-
A
*/
heavy_extra(&extra, length, length);
test_find_dir(-1, &extra, test_heaviside, 0, true, length-1, length-1, true);
test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, length, length, false);
/*
+...+
B
*/
heavy_extra(&extra, 0, 0);
test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(+1, &extra, test_heaviside, 0, true, 0, 0, true);
test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, 0, 0, false);
/*
0...0
C
*/
heavy_extra(&extra, 0, length);
test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(0, &extra, test_heaviside, 0, true, 0, 0, true);
/*
-...-0...0
AC
*/
heavy_extra(&extra, length/2, length);
test_find_dir(-1, &extra, test_heaviside, 0, true, length/2-1, length/2-1, true);
test_find_dir(+1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(0, &extra, test_heaviside, 0, true, length/2, length/2, true);
/*
0...0+...+
C B
*/
heavy_extra(&extra, 0, length/2);
test_find_dir(-1, &extra, test_heaviside, DB_NOTFOUND, false, 0, 0, false);
test_find_dir(+1, &extra, test_heaviside, 0, true, length/2, length/2, true);
test_find_dir(0, &extra, test_heaviside, 0, true, 0, 0, true);
/*
-...-+...+
AB
*/
heavy_extra(&extra, length/2, length/2);
test_find_dir(-1, &extra, test_heaviside, 0, true, length/2-1, length/2-1, true);
test_find_dir(+1, &extra, test_heaviside, 0, true, length/2, length/2, true);
test_find_dir(0, &extra, test_heaviside, DB_NOTFOUND, true, length/2, length/2, false);
/*
-...-0...0+...+
AC B
*/
heavy_extra(&extra, length/3, 2*length/3);
test_find_dir(-1, &extra, test_heaviside, 0, true, length/3-1, length/3-1, true);
test_find_dir(+1, &extra, test_heaviside, 0, true, 2*length/3, 2*length/3, true);
test_find_dir(0, &extra, test_heaviside, 0, true, length/3, length/3, true);
/* Cleanup */
test_close(do_close);
}
static void
runtests_create_choice (enum create_type create_choice) {
test_create_array(create_choice, TEST_SORTED);
test_create_array(create_choice, TEST_RANDOM);
test_create_array(create_choice, TEST_IDENTITY);
test_find( create_choice, CLOSE_WHEN_DONE);
}
static void
test_clone(uint32_t nelts)
// Test that each clone operation gives the right data back. If nelts is
// zero, also tests that you still get a valid DMT back and that the way
// to deallocate it still works.
{
DMT src = NULL, dest = NULL;
int r;
r = toku_dmt_create(&src);
assert_zero(r);
for (long i = 0; i < nelts; ++i) {
r = toku_dmt_insert_at(src, (DMTVALUE) i, i);
assert_zero(r);
}
r = toku_dmt_clone_noptr(&dest, src);
assert_zero(r);
assert(dest != NULL);
assert(toku_dmt_size(dest) == nelts);
for (long i = 0; i < nelts; ++i) {
DMTVALUE v;
long l;
r = toku_dmt_fetch(dest, i, &v);
assert_zero(r);
l = (long) v;
assert(l == i);
}
toku_dmt_destroy(&dest);
toku_dmt_destroy(&src);
}
int
test_main(int argc, const char *argv[]) {
parse_args(argc, argv);
init_globals();
test_create( CLOSE_WHEN_DONE);
test_create_size( CLOSE_WHEN_DONE);
runtests_create_choice(BATCH_INSERT);
runtests_create_choice(STEAL_ARRAY);
runtests_create_choice(INSERT_AT);
runtests_create_choice(INSERT_AT_ALMOST_RANDOM);
test_clone(0);
test_clone(1);
test_clone(1000);
test_clone(10000);
cleanup_globals();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#include "test.h"
#include <util/dmt.h>
static void
parse_args (int argc, const char *argv[]) {
const char *argv0=argv[0];
while (argc>1) {
int resultcode=0;
if (strcmp(argv[1], "-v")==0) {
verbose++;
} else if (strcmp(argv[1], "-q")==0) {
verbose = 0;
} else if (strcmp(argv[1], "-h")==0) {
do_usage:
fprintf(stderr, "Usage:\n%s [-v|-h]\n", argv0);
exit(resultcode);
} else {
resultcode=1;
goto do_usage;
}
argc--;
argv++;
}
}
/* End ".h like" stuff. */
struct value {
uint32_t number;
};
#define V(x) ((struct value *)(x))
const uint32_t MAXNUM = 1024;
const uint32_t MAXLEN = 32;
char data[MAXNUM][MAXLEN];
struct val_type {
char c[MAXLEN];
};
namespace toku {
template<>
class dmt_functor<val_type> {
public:
size_t get_dmtdatain_t_size(void) const {
size_t len = strlen(v.c);
invariant(len < sizeof(val_type));
return len + 1;
}
void write_dmtdata_t_to(val_type *const dest) const {
strcpy(dest->c, v.c);
}
dmt_functor(const char* c) {
invariant(strlen(c) < sizeof(val_type));
strcpy(v.c, c);
}
dmt_functor(const uint32_t klpair_len, val_type *const src) {
invariant(strlen(src->c) < sizeof(val_type));
strcpy(v.c, src->c);
invariant(klpair_len == get_dmtdatain_t_size());
}
private:
val_type v;
};
}
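// Editor's note: the three members above appear to form the functor protocol
// the dmt expects for its values: get_dmtdatain_t_size() reports the
// serialized size, write_dmtdata_t_to() writes the value into dmt-owned
// memory, and the (klpair_len, src) constructor rebuilds a functor from a
// stored value. A minimal sketch of going through it:
//
//    toku::dmt_functor<val_type> f("abc"); // 4 bytes including the NUL
//    // builder.insert_sorted(f);          // as in test_builder_fixed below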
/* Globals */
typedef toku::dmt<val_type, val_type*> vdmt;
typedef toku::dmt_functor<val_type> vfunctor;
const unsigned int random_seed = 0xFEADACBA;
///////////////
static void fail_one_verify(uint32_t len, uint32_t num, vdmt *v) {
val_type* fetched_data;
int count = 0;
for (uint32_t i = 0; i < num; i++) {
uint32_t fetched_len;
int r = v->fetch(i-count, &fetched_len, &fetched_data);
if (r != 0 || fetched_len != len || strcmp(fetched_data->c, data[i])) {
count++;
continue;
}
}
invariant(count == 1);
}
static void verify(uint32_t len, uint32_t num, vdmt *v) {
val_type* fetched_data;
for (uint32_t i = 0; i < num; i++) {
uint32_t fetched_len;
int r = v->fetch(i, &fetched_len, &fetched_data);
CKERR(r);
invariant(fetched_len == len);
invariant(!strcmp(fetched_data->c, data[i]));
}
}
static void test_builder_fixed(uint32_t len, uint32_t num) {
srandom(random_seed);
assert(len > 1);
assert(len <= MAXLEN);
assert(num <= MAXNUM);
for (uint32_t i = 0; i < num; i++) {
for (uint32_t j = 0; j < len-1; j++) {
data[i][j] = random() % 255 + 1; //This way it can't be 0 and get treated as a NUL terminator
}
data[i][len-1] = '\0'; //cap it
}
vdmt::builder builder;
builder.create(num, num * len);
for (uint32_t i = 0; i < num; i++) {
vfunctor vfun(data[i]);
builder.insert_sorted(vfun);
}
invariant(builder.is_value_length_fixed());
vdmt v;
builder.build_and_destroy(&v);
invariant(v.is_value_length_fixed());
invariant(v.get_fixed_length() == len);
invariant(v.size() == num);
verify(len, num, &v);
for (uint32_t change = 0; change < num; change++) {
vdmt v2;
v2.clone(v);
v2.delete_at(change);
fail_one_verify(len, num, &v2);
vfunctor vfun(data[change]);
v2.insert_at(vfun, change);
verify(len, num, &v2);
v2.destroy();
}
v.destroy();
}
static void test_builder_variable(uint32_t len, uint32_t len2, uint32_t num) {
srandom(random_seed);
assert(len > 1);
assert(len <= MAXLEN);
assert(num <= MAXNUM);
assert(num > 3);
uint32_t which2 = random() % num;
for (uint32_t i = 0; i < num; i++) {
uint32_t thislen = i == which2 ? len2 : len;
for (uint32_t j = 0; j < thislen-1; j++) {
data[i][j] = random() % 255 + 1; //This way it can't be 0 and get treated as a NUL terminator
}
data[i][thislen-1] = '\0'; //cap it
}
vdmt::builder builder;
builder.create(num, (num-1) * len + len2);
for (uint32_t i = 0; i < num; i++) {
vfunctor vfun(data[i]);
builder.insert_sorted(vfun);
}
invariant(!builder.is_value_length_fixed());
vdmt v;
builder.build_and_destroy(&v);
invariant(!v.is_value_length_fixed());
invariant(v.size() == num);
val_type* fetched_data;
for (uint32_t i = 0; i < num; i++) {
uint32_t fetched_len;
int r = v.fetch(i, &fetched_len, &fetched_data);
CKERR(r);
if (i == which2) {
invariant(fetched_len == len2);
invariant(!strcmp(fetched_data->c, data[i]));
} else {
invariant(fetched_len == len);
invariant(!strcmp(fetched_data->c, data[i]));
}
}
v.destroy();
}
static void test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(uint32_t len, uint32_t num) {
srandom(random_seed);
assert(len > 1);
assert(len <= MAXLEN);
assert(num <= MAXNUM);
assert(num > 4);
for (uint32_t i = 0; i < num; i++) {
for (uint32_t j = 0; j < len-1; j++) {
data[i][j] = random() % 255 + 1; //This way it can't be 0 and get treated as a NUL terminator
}
data[i][len-1] = '\0'; //cap it
}
char *flat = (char*)toku_xmalloc(len * num);
char *p = flat;
for (uint32_t i = 0; i < num; i++) {
memcpy(p, data[i], len);
p += len;
}
vdmt v;
v.create_from_sorted_memory_of_fixed_size_elements(flat, num, len*num, len);
invariant(v.is_value_length_fixed());
invariant(v.get_fixed_length() == len);
invariant(v.size() == num);
val_type* fetched_data;
for (uint32_t i = 0; i < num; i++) {
uint32_t fetched_len;
int r = v.fetch(i, &fetched_len, &fetched_data);
CKERR(r);
invariant(fetched_len == len);
invariant(!strcmp(fetched_data->c, data[i]));
}
char *serialized_flat = (char*)toku_xmalloc(len*num);
struct wbuf wb;
wbuf_nocrc_init(&wb, serialized_flat, len*num);
v.prepare_for_serialize();
v.serialize_values(len*num, &wb);
invariant(!memcmp(serialized_flat, flat, len*num));
//Currently converting to the dynamic (tree) form treats the entire thing as NOT fixed length.
//Possible additional perf optimization here.
uint32_t which = (random() % (num-2)) + 1; // Not last, not first
invariant(which > 0 && which < num-1);
v.delete_at(which);
memmove(flat + which*len, flat+(which+1)*len, (num-which-1) * len);
v.prepare_for_serialize();
wbuf_nocrc_init(&wb, serialized_flat, len*(num-1));
v.serialize_values(len*(num-1), &wb);
invariant(!memcmp(serialized_flat, flat, len*(num-1)));
toku_free(flat);
toku_free(serialized_flat);
v.destroy();
}
int
test_main(int argc, const char *argv[]) {
parse_args(argc, argv);
// Do test with size divisible by 4 and not
test_builder_fixed(4, 100);
test_builder_fixed(5, 100);
// Do test with zero, one, or both sizes divisible
test_builder_variable(4, 8, 100);
test_builder_variable(4, 5, 100);
test_builder_variable(5, 8, 100);
test_builder_variable(5, 10, 100);
test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(4, 100);
test_create_from_sorted_memory_of_fixed_sized_elements__and__serialize(5, 100);
/*TODO
* insert
* insert_at
* delete_at
* iterate
* iterate_on_range
* verify
* iterate_ptr
* find_zero
* find
*/
return 0;
}
......@@ -127,7 +127,7 @@ long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
}
static void
test_serialize_leaf(int valsize, int nelts, double entropy) {
test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
// struct ft_handle source_ft;
struct ftnode *sn, *dn;
......@@ -214,32 +214,63 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
assert(size == 100);
}
struct timeval total_start;
struct timeval total_end;
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
struct timeval t[2];
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd = NULL;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, brt->ft, false);
assert(r==0);
gettimeofday(&t[1], NULL);
for (int i = 0; i < ser_runs; i++) {
gettimeofday(&t[0], NULL);
ndd = NULL;
sn->dirty = 1;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, brt->ft, false);
assert(r==0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
total_end.tv_sec += t[1].tv_sec;
total_end.tv_usec += t[1].tv_usec;
toku_free(ndd);
}
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("serialize leaf: %0.05lf\n", dt);
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= ser_runs;
printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
//reset
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("deserialize leaf: %0.05lf\n", dt);
printf("io time %lf decompress time %lf deserialize time %lf\n",
tokutime_to_seconds(bfe.io_time),
tokutime_to_seconds(bfe.decompress_time),
tokutime_to_seconds(bfe.deserialize_time)
for (int i = 0; i < deser_runs; i++) {
fill_bfe_for_full_read(&bfe, brt_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
total_end.tv_sec += t[1].tv_sec;
total_end.tv_usec += t[1].tv_usec;
toku_ftnode_free(&dn);
toku_free(ndd2);
}
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= deser_runs;
printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
toku_ftnode_free(&dn);
toku_ftnode_free(&sn);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
......@@ -247,14 +278,12 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
toku_free(brt_h->h);
toku_free(brt_h);
toku_free(brt);
toku_free(ndd);
toku_free(ndd2);
r = close(fd); assert(r != -1);
}
static void
test_serialize_nonleaf(int valsize, int nelts, double entropy) {
test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
......@@ -353,7 +382,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
gettimeofday(&t[1], NULL);
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("serialize nonleaf: %0.05lf\n", dt);
dt *= 1000;
printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt_h);
......@@ -363,11 +393,13 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
assert(r==0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("deserialize nonleaf: %0.05lf\n", dt);
printf("io time %lf decompress time %lf deserialize time %lf\n",
tokutime_to_seconds(bfe.io_time),
tokutime_to_seconds(bfe.decompress_time),
tokutime_to_seconds(bfe.deserialize_time)
dt *= 1000;
printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
toku_ftnode_free(&dn);
......@@ -394,19 +426,32 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
long valsize, nelts;
const int DEFAULT_RUNS = 5;
long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
double entropy = 0.3;
if (argc != 3) {
fprintf(stderr, "Usage: %s <valsize> <nelts>\n", argv[0]);
if (argc != 3 && argc != 5) {
fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
return 2;
}
valsize = strtol(argv[1], NULL, 0);
nelts = strtol(argv[2], NULL, 0);
if (argc == 5) {
ser_runs = strtol(argv[3], NULL, 0);
deser_runs = strtol(argv[4], NULL, 0);
}
if (ser_runs <= 0) {
ser_runs = DEFAULT_RUNS;
}
if (deser_runs <= 0) {
deser_runs = DEFAULT_RUNS;
}
initialize_dummymsn();
test_serialize_leaf(valsize, nelts, entropy);
test_serialize_nonleaf(valsize, nelts, entropy);
test_serialize_leaf(valsize, nelts, entropy, ser_runs, deser_runs);
test_serialize_nonleaf(valsize, nelts, entropy, ser_runs, deser_runs);
return 0;
}
......@@ -98,11 +98,6 @@ PATENT RIGHTS GRANT:
#endif
static size_t
calc_le_size(int keylen, int vallen) {
return LE_CLEAN_MEMSIZE(vallen) + keylen + sizeof(uint32_t);
}
static void
le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const char *val, int valsize)
{
LEAFENTRY r = NULL;
......@@ -118,31 +113,34 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const cha
r->type = LE_CLEAN;
r->u.clean.vallen = valsize;
memcpy(r->u.clean.val, val, valsize);
return size_needed + keysize + sizeof(uint32_t);
}
static KLPAIR
le_fastmalloc(struct mempool * mp, const char *key, int keylen, const char *val, int vallen)
{
KLPAIR kl;
size_t le_size = calc_le_size(keylen, vallen);
CAST_FROM_VOIDP(kl, toku_mempool_malloc(mp, le_size, 1));
resource_assert(kl);
kl->keylen = keylen;
memcpy(kl->key_le, key, keylen);
LEAFENTRY le = get_le_from_klpair(kl);
le->type = LE_CLEAN;
le->u.clean.vallen = vallen;
memcpy(le->u.clean.val, val, vallen);
return kl;
}
class test_key_le_pair {
public:
uint32_t keylen;
char* keyp;
LEAFENTRY le;
static KLPAIR
le_malloc(struct mempool * mp, const char *key, const char *val)
{
int keylen = strlen(key) + 1;
int vallen = strlen(val) + 1;
return le_fastmalloc(mp, key, keylen, val, vallen);
}
test_key_le_pair() : keylen(), keyp(), le() {}
void init(const char *_keyp, const char *_val) {
init(_keyp, strlen(_keyp) + 1, _val, strlen(_val) + 1);
}
void init(const char * _keyp, uint32_t _keylen, const char*_val, uint32_t _vallen) {
keylen = _keylen;
CAST_FROM_VOIDP(le, toku_malloc(LE_CLEAN_MEMSIZE(_vallen)));
le->type = LE_CLEAN;
le->u.clean.vallen = _vallen;
memcpy(le->u.clean.val, _val, _vallen);
CAST_FROM_VOIDP(keyp, toku_xmemdup(_keyp, keylen));
}
~test_key_le_pair() {
toku_free(le);
toku_free(keyp);
}
};
struct check_leafentries_struct {
int nelts;
......@@ -290,7 +288,6 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
BP_STATE(&sn,1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn());
set_BLB(&sn, 1, toku_create_empty_bn());
KLPAIR elts[3];
le_add_to_bn(BLB_DATA(&sn, 0), 0, "a", 2, "aval", 5);
le_add_to_bn(BLB_DATA(&sn, 0), 1, "b", 2, "bval", 5);
le_add_to_bn(BLB_DATA(&sn, 1), 0, "x", 2, "xval", 5);
......@@ -346,11 +343,10 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
// Create a dummy mempool and put the leaves there. Ugh.
struct mempool dummy_mp;
toku_mempool_construct(&dummy_mp, 1024);
elts[0] = le_malloc(&dummy_mp, "a", "aval");
elts[1] = le_malloc(&dummy_mp, "b", "bval");
elts[2] = le_malloc(&dummy_mp, "x", "xval");
test_key_le_pair elts[3];
elts[0].init("a", "aval");
elts[1].init("b", "bval");
elts[2].init("x", "xval");
const uint32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1)));
uint32_t last_i = 0;
......@@ -366,17 +362,16 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
uint32_t curr_keylen;
void* curr_key;
BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(elts[last_i])));
assert(memcmp(curr_le, get_le_from_klpair(elts[last_i]), leafentry_memsize(curr_le)) == 0);
assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
if (bn < npartitions-1) {
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i]->key_le)) <= 0);
assert(strcmp((char*)dn->childkeys[bn].data, elts[last_i].keyp) <= 0);
}
// TODO for later, get a key comparison here as well
last_i++;
}
}
toku_mempool_destroy(&dummy_mp);
assert(last_i == 3);
}
toku_ftnode_free(&dn);
......@@ -485,18 +480,14 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
// Create a dummy mempool and put the leaves there. Ugh.
struct mempool dummy_mp;
size_t le_size = calc_le_size(keylens, vallens);
size_t mpsize = nrows * le_size;
toku_mempool_construct(&dummy_mp, mpsize);
KLPAIR les[nrows];
test_key_le_pair *les = new test_key_le_pair[nrows];
{
char key[keylens], val[vallens];
key[keylens-1] = '\0';
for (uint32_t i = 0; i < nrows; ++i) {
char c = 'a' + i;
memset(key, c, keylens-1);
les[i] = le_fastmalloc(&dummy_mp, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
}
}
const uint32_t npartitions = dn->n_children;
......@@ -514,17 +505,17 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
uint32_t curr_keylen;
void* curr_key;
BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
if (bn < npartitions-1) {
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i]->key_le)) <= 0);
assert(strcmp((char*)dn->childkeys[bn].data, les[last_i].keyp) <= 0);
}
// TODO for later, get a key comparison here as well
last_i++;
}
}
toku_mempool_destroy(&dummy_mp);
assert(last_i == nrows);
delete[] les;
}
toku_ftnode_free(&dn);
......@@ -552,7 +543,6 @@ static void
test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
int r;
struct ftnode sn, *dn;
const int keylens = sizeof(int), vallens = sizeof(int);
const uint32_t nrows = 196*1024;
int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
......@@ -566,17 +556,18 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
sn.dirty = 1;
sn.oldest_referenced_xid_known = TXNID_NONE;
MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children-1, sn.childkeys);
XMALLOC_N(sn.n_children, sn.bp);
XMALLOC_N(sn.n_children-1, sn.childkeys);
sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL;
set_BLB(&sn, i, toku_create_empty_bn());
}
size_t total_size = 0;
for (uint32_t i = 0; i < nrows; ++i) {
uint32_t key = i;
uint32_t val = i;
le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
total_size += le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
}
FT_HANDLE XMALLOC(brt);
......@@ -624,15 +615,11 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
// Create a dummy mempool and put the leaves there. Ugh.
struct mempool dummy_mp;
size_t le_size = calc_le_size(keylens, vallens);
size_t mpsize = nrows * le_size;
toku_mempool_construct(&dummy_mp, mpsize);
KLPAIR les[nrows];
test_key_le_pair *les = new test_key_le_pair[nrows];
{
int key = 0, val = 0;
for (uint32_t i = 0; i < nrows; ++i, key++, val++) {
les[i] = le_fastmalloc(&dummy_mp, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
}
}
const uint32_t npartitions = dn->n_children;
......@@ -650,11 +637,11 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
uint32_t curr_keylen;
void* curr_key;
BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
if (bn < npartitions-1) {
uint32_t *CAST_FROM_VOIDP(pivot, dn->childkeys[bn].data);
void* tmp = les[last_i]->key_le;
void* tmp = les[last_i].keyp;
uint32_t *CAST_FROM_VOIDP(item, tmp);
assert(*pivot >= *item);
}
......@@ -664,8 +651,8 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
// don't check soft_copy_is_up_to_date or seqinsert
assert(BLB_DATA(dn, bn)->get_disk_size() < 128*1024); // BN_MAX_SIZE, apt to change
}
toku_mempool_destroy(&dummy_mp);
assert(last_i == nrows);
delete[] les;
}
toku_ftnode_free(&dn);
......@@ -772,11 +759,7 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
// Create a dummy mempool and put the leaves there. Ugh.
struct mempool dummy_mp;
size_t le_size = calc_le_size(key_size, val_size);
size_t mpsize = nrows * le_size;
toku_mempool_construct(&dummy_mp, mpsize);
KLPAIR les[nrows];
test_key_le_pair *les = new test_key_le_pair[nrows];
{
char key[key_size], val[val_size];
key[key_size-1] = '\0';
......@@ -785,7 +768,7 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
char c = 'a' + i;
memset(key, c, key_size-1);
memset(val, c, val_size-1);
les[i] = le_fastmalloc(&dummy_mp, key, key_size, val, val_size);
les[i].init(key, key_size, val, val_size);
}
}
const uint32_t npartitions = dn->n_children;
......@@ -804,18 +787,18 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
uint32_t curr_keylen;
void* curr_key;
BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(les[last_i])));
assert(memcmp(curr_le, get_le_from_klpair(les[last_i]), leafentry_memsize(curr_le)) == 0);
assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
if (bn < npartitions-1) {
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i]->key_le)) <= 0);
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(les[last_i].keyp)) <= 0);
}
// TODO for later, get a key comparison here as well
last_i++;
}
// don't check soft_copy_is_up_to_date or seqinsert
}
toku_mempool_destroy(&dummy_mp);
assert(last_i == 7);
delete[] les;
}
toku_ftnode_free(&dn);
......@@ -871,7 +854,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
set_BLB(&sn, i, toku_create_empty_bn());
BLB_SEQINSERT(&sn, i) = 0;
}
KLPAIR elts[3];
le_add_to_bn(BLB_DATA(&sn, 1), 0, "a", 2, "aval", 5);
le_add_to_bn(BLB_DATA(&sn, 3), 0, "b", 2, "bval", 5);
le_add_to_bn(BLB_DATA(&sn, 5), 0, "x", 2, "xval", 5);
......@@ -921,13 +903,13 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
assert(dn->height == 0);
assert(dn->n_children>0);
{
test_key_le_pair elts[3];
// Man, this is way too ugly. This entire test suite needs to be refactored.
// Create a dummy mempool and put the leaves there. Ugh.
struct mempool dummy_mp;
toku_mempool_construct(&dummy_mp, 1024);
elts[0] = le_malloc(&dummy_mp, "a", "aval");
elts[1] = le_malloc(&dummy_mp, "b", "bval");
elts[2] = le_malloc(&dummy_mp, "x", "xval");
elts[0].init("a", "aval");
elts[1].init("b", "bval");
elts[2].init("x", "xval");
const uint32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1)));
uint32_t last_i = 0;
......@@ -942,17 +924,16 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
uint32_t curr_keylen;
void* curr_key;
BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
assert(leafentry_memsize(curr_le) == leafentry_memsize(get_le_from_klpair(elts[last_i])));
assert(memcmp(curr_le, get_le_from_klpair(elts[last_i]), leafentry_memsize(curr_le)) == 0);
assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
if (bn < npartitions-1) {
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i]->key_le)) <= 0);
assert(strcmp((char*)dn->childkeys[bn].data, (char*)(elts[last_i].keyp)) <= 0);
}
// TODO for later, get a key comparison here as well
last_i++;
}
}
toku_mempool_destroy(&dummy_mp);
assert(last_i == 3);
}
toku_ftnode_free(&dn);
......
......@@ -189,7 +189,7 @@ doit (void) {
r = toku_testsetup_root(t, node_root);
assert(r==0);
char filler[900];
char filler[900-2*bn_data::HEADER_LENGTH];
memset(filler, 0, sizeof(filler));
// now we insert filler data so that a merge does not happen
r = toku_testsetup_insert_to_leaf (
......
......@@ -187,6 +187,13 @@ static inline void wbuf_uint (struct wbuf *w, uint32_t i) {
wbuf_int(w, (int32_t)i);
}
static inline uint8_t* wbuf_nocrc_reserve_literal_bytes(struct wbuf *w, uint32_t nbytes) {
assert(w->ndone + nbytes <= w->size);
uint8_t * dest = w->buf + w->ndone;
w->ndone += nbytes;
return dest;
}
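// Illustrative sketch (hypothetical caller): reserve-then-fill yields a raw
// destination pointer inside the wbuf, so per-value alignment padding can be
// stripped during serialization:
//
//   uint8_t *dest = wbuf_nocrc_reserve_literal_bytes(wb, n * len);
//   for (uint32_t i = 0; i < n; i++) {
//       memcpy(&dest[i * len], &src[i * aligned_len], len); // drop pad bytes
//   }
//
// dmt::serialize_values() relies on this pattern.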
static inline void wbuf_nocrc_literal_bytes(struct wbuf *w, bytevec bytes_bv, uint32_t nbytes) {
const unsigned char *bytes = (const unsigned char *) bytes_bv;
#if 0
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <string.h>
#include <db.h>
#include <toku_include/memory.h>
#include <limits.h>
namespace toku {
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::create(void) {
this->create_internal_no_alloc(false);
//TODO: maybe allocate enough space for something by default?
// We may be relying on not needing to allocate space the first time (due to limited time spent while a lock is held)
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::create_from_sorted_memory_of_fixed_size_elements(
const void *mem,
const uint32_t numvalues,
const uint32_t mem_length,
const uint32_t fixed_value_length) {
this->values_same_size = true;
this->value_length = fixed_value_length;
this->is_array = true;
this->d.a.start_idx = 0;
this->d.a.num_values = numvalues;
const uint8_t pad_bytes = get_fixed_length_alignment_overhead();
uint32_t aligned_memsize = mem_length + numvalues * pad_bytes;
toku_mempool_construct(&this->mp, aligned_memsize);
if (aligned_memsize > 0) {
void *ptr = toku_mempool_malloc(&this->mp, aligned_memsize, 1);
paranoid_invariant_notnull(ptr);
uint8_t *CAST_FROM_VOIDP(dest, ptr);
const uint8_t *CAST_FROM_VOIDP(src, mem);
if (pad_bytes == 0) {
paranoid_invariant(aligned_memsize == mem_length);
memcpy(dest, src, aligned_memsize);
} else {
const uint32_t fixed_len = this->value_length;
const uint32_t fixed_aligned_len = align(this->value_length);
paranoid_invariant(this->d.a.num_values*fixed_len == mem_length);
for (uint32_t i = 0; i < this->d.a.num_values; i++) {
memcpy(&dest[i*fixed_aligned_len], &src[i*fixed_len], fixed_len);
}
}
}
}
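// Worked example (illustrative, assuming ALIGNMENT == 4): for
// fixed_value_length == 6, align(6) == 8, so pad_bytes == 2 and
// aligned_memsize == mem_length + numvalues * 2. Padding is inserted value by
// value during the mempool copy; the serialized source buffer stays unpadded.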
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::create_no_array(void) {
this->create_internal_no_alloc(false);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::create_internal_no_alloc(bool as_tree) {
toku_mempool_zero(&this->mp);
this->values_same_size = true;
this->value_length = 0;
this->is_array = !as_tree;
if (as_tree) {
this->d.t.root.set_to_null();
} else {
this->d.a.start_idx = 0;
this->d.a.num_values = 0;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::clone(const dmt &src) {
*this = src;
toku_mempool_clone(&src.mp, &this->mp);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::clear(void) {
this->is_array = true;
this->d.a.start_idx = 0;
this->d.a.num_values = 0;
this->values_same_size = true; // Reset state
this->value_length = 0;
toku_mempool_destroy(&this->mp);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::destroy(void) {
this->clear();
toku_mempool_destroy(&this->mp);
}
template<typename dmtdata_t, typename dmtdataout_t>
uint32_t dmt<dmtdata_t, dmtdataout_t>::size(void) const {
if (this->is_array) {
return this->d.a.num_values;
} else {
return this->nweight(this->d.t.root);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
uint32_t dmt<dmtdata_t, dmtdataout_t>::nweight(const subtree &subtree) const {
if (subtree.is_null()) {
return 0;
} else {
const dmt_base_node & node = get_node<dmt_base_node>(subtree);
return node.weight;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::insert(const dmtdatain_t &value, const dmtcmp_t &v, uint32_t *const idx) {
int r;
uint32_t insert_idx;
r = this->find_zero<dmtcmp_t, h>(v, nullptr, nullptr, &insert_idx);
if (r==0) {
if (idx) *idx = insert_idx;
return DB_KEYEXIST;
}
if (r != DB_NOTFOUND) return r;
if ((r = this->insert_at(value, insert_idx))) return r;
if (idx) *idx = insert_idx;
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
int dmt<dmtdata_t, dmtdataout_t>::insert_at(const dmtdatain_t &value, const uint32_t idx) {
if (idx > this->size()) { return EINVAL; }
bool same_size = this->values_same_size && (this->size() == 0 || value.get_dmtdatain_t_size() == this->value_length);
if (same_size) {
if (this->is_array) {
if (idx == this->d.a.num_values) {
return this->insert_at_array_end<true>(value);
}
#if 0
//TODO: enable if we support delete_at with array
if (idx == 0 && this->d.a.start_idx > 0) {
paranoid_invariant(false); // Should not be possible (yet)
return this->insert_at_array_beginning(value);
}
#endif
}
}
if (this->is_array) {
this->convert_to_dtree();
}
if (!same_size) {
this->values_same_size = false;
}
paranoid_invariant(!is_array);
// Is a d-tree.
this->maybe_resize_dtree(&value);
subtree *rebalance_subtree = nullptr;
this->insert_internal(&this->d.t.root, value, idx, &rebalance_subtree);
if (rebalance_subtree != nullptr) {
this->rebalance(rebalance_subtree);
}
return 0;
}
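// Usage sketch (hypothetical caller; `writer(v)` stands in for whatever
// dmtdatain_t functor the instantiation provides):
//
//   dmt<uint32_t> d;
//   d.create();
//   for (uint32_t i = 0; i < n; i++) {
//       d.insert_at(writer(i), i);  // append of same-size value: array fast path
//   }
//   d.insert_at(writer(7), 0);      // non-append: converts to the d-tree
//
// Serial inserts of fixed-size values therefore never allocate tree nodes.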
template<typename dmtdata_t, typename dmtdataout_t>
template<bool with_resize>
int dmt<dmtdata_t, dmtdataout_t>::insert_at_array_end(const dmtdatain_t& value_in) {
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
if (this->d.a.num_values == 0) {
this->value_length = value_in.get_dmtdatain_t_size();
}
paranoid_invariant(this->value_length == value_in.get_dmtdatain_t_size());
if (with_resize) {
this->maybe_resize_array(+1);
}
dmtdata_t *dest = this->alloc_array_value_end();
value_in.write_dmtdata_t_to(dest);
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
int dmt<dmtdata_t, dmtdataout_t>::insert_at_array_beginning(const dmtdatain_t& value_in) {
invariant(false); //TODO: enable this later
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
paranoid_invariant(this->d.a.num_values > 0);
//TODO: when deleting last element, should set start_idx to 0
this->maybe_resize_array(+1); // +1 or 0? Depends on how memory management works
dmtdata_t *dest = this->alloc_array_value_beginning();
value_in.write_dmtdata_t_to(dest);
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
dmtdata_t * dmt<dmtdata_t, dmtdataout_t>::alloc_array_value_end(void) {
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
this->d.a.num_values++;
void *ptr = toku_mempool_malloc(&this->mp, align(this->value_length), 1);
paranoid_invariant_notnull(ptr);
paranoid_invariant(reinterpret_cast<size_t>(ptr) % ALIGNMENT == 0);
dmtdata_t *CAST_FROM_VOIDP(n, ptr);
paranoid_invariant(n == get_array_value(this->d.a.num_values - 1));
return n;
}
template<typename dmtdata_t, typename dmtdataout_t>
dmtdata_t * dmt<dmtdata_t, dmtdataout_t>::alloc_array_value_beginning(void) {
invariant(false); //TODO: enable this later
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
paranoid_invariant(this->d.a.start_idx > 0);
const uint32_t real_idx = --this->d.a.start_idx;
this->d.a.num_values++;
//TODO: figure out how to keep the mempool correct here... do we free during delete_at (begin)? If so, how do we re-'malloc' from the beginning? Alternatively, never free from the beginning?
return get_array_value_internal(&this->mp, real_idx);
}
template<typename dmtdata_t, typename dmtdataout_t>
dmtdata_t * dmt<dmtdata_t, dmtdataout_t>::get_array_value(const uint32_t idx) const {
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
//TODO: verify that initial create always sets is_array and values_same_size
paranoid_invariant(idx < this->d.a.num_values);
const uint32_t real_idx = idx + this->d.a.start_idx;
return get_array_value_internal(&this->mp, real_idx);
}
template<typename dmtdata_t, typename dmtdataout_t>
dmtdata_t * dmt<dmtdata_t, dmtdataout_t>::get_array_value_internal(const struct mempool *mempool, const uint32_t real_idx) const {
void* ptr = toku_mempool_get_pointer_from_base_and_offset(mempool, real_idx * align(this->value_length));
dmtdata_t *CAST_FROM_VOIDP(value, ptr);
return value;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::maybe_resize_array(const int change) {
paranoid_invariant(change == -1 || change == 1);
paranoid_invariant(change == 1); //TODO: audit change == -1... may or may not be OK
bool space_available = change < 0 || toku_mempool_get_free_space(&this->mp) >= align(this->value_length);
const uint32_t n = this->d.a.num_values + change;
const uint32_t new_n = n<=2 ? 4 : 2*n;
const uint32_t new_space = align(this->value_length) * new_n;
bool too_much_space = new_space <= toku_mempool_get_size(&this->mp) / 2;
if (!space_available || too_much_space) {
struct mempool new_kvspace;
toku_mempool_construct(&new_kvspace, new_space);
size_t copy_bytes = this->d.a.num_values * align(this->value_length);
invariant(copy_bytes + align(this->value_length) <= new_space);
// Copy over to new mempool
if (this->d.a.num_values > 0) {
void* dest = toku_mempool_malloc(&new_kvspace, copy_bytes, 1);
invariant(dest!=nullptr);
memcpy(dest, get_array_value(0), copy_bytes);
}
toku_mempool_destroy(&this->mp);
this->mp = new_kvspace;
this->d.a.start_idx = 0;
}
}
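// Growth example (illustrative): appending to an array holding 6 values
// computes n == 7 and new_n == 2 * 7 == 14 slots, so capacity roughly doubles
// per reallocation; the too_much_space check rebuilds a mempool that is more
// than 2x oversized, keeping memory within a constant factor of live data.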
template<typename dmtdata_t, typename dmtdataout_t>
uint32_t dmt<dmtdata_t, dmtdataout_t>::align(const uint32_t x) const {
return roundup_to_multiple(ALIGNMENT, x);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::convert_to_dtree(void) {
paranoid_invariant(this->is_array); //TODO: remove this when ctree implemented
paranoid_invariant(this->values_same_size);
if (this->is_array) {
convert_from_array_to_tree<true>();
} else {
//TODO: implement this one.
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::prepare_for_serialize(void) {
if (!this->is_array) {
this->convert_from_tree_to_array<true>();
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<bool with_sizes>
void dmt<dmtdata_t, dmtdataout_t>::convert_from_tree_to_array(void) {
static_assert(with_sizes, "not in prototype");
paranoid_invariant(!this->is_array);
paranoid_invariant(this->values_same_size);
const uint32_t num_values = this->size();
node_idx *tmp_array;
bool malloced = false;
tmp_array = alloc_temp_node_idxs(num_values);
if (!tmp_array) {
malloced = true;
XMALLOC_N(num_values, tmp_array);
}
this->fill_array_with_subtree_idxs(tmp_array, this->d.t.root);
struct mempool new_mp = this->mp;
const uint32_t fixed_len = this->value_length;
const uint32_t fixed_aligned_len = align(this->value_length);
size_t mem_needed = num_values * fixed_aligned_len;
toku_mempool_construct(&new_mp, mem_needed);
uint8_t* CAST_FROM_VOIDP(dest, toku_mempool_malloc(&new_mp, mem_needed, 1));
paranoid_invariant_notnull(dest);
for (uint32_t i = 0; i < num_values; i++) {
const dmt_dnode &n = get_node<dmt_dnode>(tmp_array[i]);
memcpy(&dest[i*fixed_aligned_len], &n.value, fixed_len);
}
toku_mempool_destroy(&this->mp);
this->mp = new_mp;
this->is_array = true;
this->d.a.start_idx = 0;
this->d.a.num_values = num_values;
if (malloced) toku_free(tmp_array);
}
template<typename dmtdata_t, typename dmtdataout_t>
template<bool with_sizes>
void dmt<dmtdata_t, dmtdataout_t>::convert_from_array_to_tree(void) {
paranoid_invariant(this->is_array);
paranoid_invariant(this->values_same_size);
//save array-format information to locals
const uint32_t num_values = this->d.a.num_values;
const uint32_t offset = this->d.a.start_idx;
paranoid_invariant_zero(offset); //TODO: remove this
static_assert(with_sizes, "not in prototype");
node_idx *tmp_array;
bool malloced = false;
tmp_array = alloc_temp_node_idxs(num_values);
if (!tmp_array) {
malloced = true;
XMALLOC_N(num_values, tmp_array);
}
struct mempool old_mp = this->mp;
size_t mem_needed = num_values * align(this->value_length + __builtin_offsetof(dmt_mnode<with_sizes>, value));
toku_mempool_construct(&this->mp, mem_needed);
for (uint32_t i = 0; i < num_values; i++) {
dmtdatain_t functor(this->value_length, get_array_value_internal(&old_mp, i+offset));
tmp_array[i] = node_malloc_and_set_value<with_sizes>(functor);
}
this->is_array = false;
//TODO: when/if we support ctree, set appropriate field here.
this->rebuild_subtree_from_idxs(&this->d.t.root, tmp_array, num_values);
if (malloced) toku_free(tmp_array);
toku_mempool_destroy(&old_mp);
}
template<typename dmtdata_t, typename dmtdataout_t>
int dmt<dmtdata_t, dmtdataout_t>::delete_at(const uint32_t idx) {
uint32_t n = this->size();
if (idx >= n) { return EINVAL; }
if (n == 1) {
this->clear(); //Emptying out the entire dmt.
return 0;
}
//TODO: support array delete/ctree delete
if (this->is_array) { //TODO: support ctree
this->convert_to_dtree();
}
paranoid_invariant(!is_array);
if (this->is_array) {
paranoid_invariant(false);
} else {
subtree *rebalance_subtree = nullptr;
this->delete_internal(&this->d.t.root, idx, nullptr, &rebalance_subtree);
if (rebalance_subtree != nullptr) {
this->rebalance(rebalance_subtree);
}
}
this->maybe_resize_dtree(nullptr);
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int dmt<dmtdata_t, dmtdataout_t>::iterate(iterate_extra_t *const iterate_extra) const {
return this->iterate_on_range<iterate_extra_t, f>(0, this->size(), iterate_extra);
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int dmt<dmtdata_t, dmtdataout_t>::iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
if (right > this->size()) { return EINVAL; }
if (left == right) { return 0; }
if (this->is_array) {
return this->iterate_internal_array<iterate_extra_t, f>(left, right, iterate_extra);
}
return this->iterate_internal<iterate_extra_t, f>(left, right, this->d.t.root, 0, iterate_extra);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::verify(void) const {
if (!is_array) {
verify_internal(this->d.t.root);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::verify_internal(const subtree &subtree) const {
if (subtree.is_null()) {
return;
}
const dmt_dnode &node = get_node<dmt_dnode>(subtree);
const uint32_t leftweight = this->nweight(node.left);
const uint32_t rightweight = this->nweight(node.right);
invariant(leftweight + rightweight + 1 == this->nweight(subtree));
if (this->values_same_size) {
invariant(node.value_length == this->value_length);
}
verify_internal(node.left);
verify_internal(node.right);
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void dmt<dmtdata_t, dmtdataout_t>::iterate_ptr(iterate_extra_t *const iterate_extra) {
if (this->is_array) {
this->iterate_ptr_internal_array<iterate_extra_t, f>(0, this->size(), iterate_extra);
} else {
this->iterate_ptr_internal<iterate_extra_t, f>(0, this->size(), this->d.t.root, 0, iterate_extra);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
int dmt<dmtdata_t, dmtdataout_t>::fetch(const uint32_t idx, uint32_t *const value_len, dmtdataout_t *const value) const {
if (idx >= this->size()) { return EINVAL; }
if (this->is_array) {
this->fetch_internal_array(idx, value_len, value);
} else {
this->fetch_internal(this->d.t.root, idx, value_len, value);
}
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_zero(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
uint32_t tmp_index;
uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
int r;
if (this->is_array) {
r = this->find_internal_zero_array<dmtcmp_t, h>(extra, value_len, value, child_idxp);
}
else {
r = this->find_internal_zero<dmtcmp_t, h>(this->d.t.root, extra, value_len, value, child_idxp);
}
return r;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find(const dmtcmp_t &extra, int direction, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
uint32_t tmp_index;
uint32_t *const child_idxp = (idxp != nullptr) ? idxp : &tmp_index;
paranoid_invariant(direction != 0);
if (direction < 0) {
if (this->is_array) {
return this->find_internal_minus_array<dmtcmp_t, h>(extra, value_len, value, child_idxp);
} else {
return this->find_internal_minus<dmtcmp_t, h>(this->d.t.root, extra, value_len, value, child_idxp);
}
} else {
if (this->is_array) {
return this->find_internal_plus_array<dmtcmp_t, h>(extra, value_len, value, child_idxp);
} else {
return this->find_internal_plus<dmtcmp_t, h>(this->d.t.root, extra, value_len, value, child_idxp);
}
}
}
template<typename dmtdata_t, typename dmtdataout_t>
size_t dmt<dmtdata_t, dmtdataout_t>::memory_size(void) {
return (sizeof *this) + toku_mempool_get_size(&this->mp);
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename node_type>
node_type & dmt<dmtdata_t, dmtdataout_t>::get_node(const subtree &subtree) const {
paranoid_invariant(!subtree.is_null());
return get_node<node_type>(subtree.get_index());
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename node_type>
node_type & dmt<dmtdata_t, dmtdataout_t>::get_node(const node_idx offset) const {
//TODO: implement
//Need to decide what to do with regard to cnode/dnode
void* ptr = toku_mempool_get_pointer_from_base_and_offset(&this->mp, offset);
node_type *CAST_FROM_VOIDP(node, ptr);
return *node;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::node_set_value(dmt_mnode<true> * n, const dmtdatain_t &value) {
n->value_length = value.get_dmtdatain_t_size();
value.write_dmtdata_t_to(&n->value);
}
template<typename dmtdata_t, typename dmtdataout_t>
template<bool with_length>
node_idx dmt<dmtdata_t, dmtdataout_t>::node_malloc_and_set_value(const dmtdatain_t &value) {
static_assert(with_length, "not in prototype");
size_t val_size = value.get_dmtdatain_t_size();
size_t size_to_alloc = __builtin_offsetof(dmt_mnode<with_length>, value) + val_size;
size_to_alloc = align(size_to_alloc);
void* np = toku_mempool_malloc(&this->mp, size_to_alloc, 1);
paranoid_invariant_notnull(np);
dmt_mnode<with_length> *CAST_FROM_VOIDP(n, np);
node_set_value(n, value);
n->b.clear_stolen_bits();
return toku_mempool_get_offset_from_pointer_and_base(&this->mp, np);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::node_free(const subtree &st) {
dmt_dnode &n = get_node<dmt_dnode>(st);
size_t size_to_free = __builtin_offsetof(dmt_dnode, value) + n.value_length;
size_to_free = align(size_to_free);
toku_mempool_mfree(&this->mp, &n, size_to_free);
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::maybe_resize_dtree(const dmtdatain_t * value) {
static_assert(std::is_same<dmtdatain_t, dmtdatain_t>::value, "functor wrong type");
const ssize_t curr_capacity = toku_mempool_get_size(&this->mp);
const ssize_t curr_free = toku_mempool_get_free_space(&this->mp);
const ssize_t curr_used = toku_mempool_get_used_space(&this->mp);
ssize_t add_size = 0;
if (value) {
add_size = __builtin_offsetof(dmt_dnode, value) + value->get_dmtdatain_t_size();
add_size = align(add_size);
}
const ssize_t need_size = curr_used + add_size;
paranoid_invariant(need_size <= UINT32_MAX);
const ssize_t new_size = 2*need_size;
paranoid_invariant(new_size <= UINT32_MAX);
//const uint32_t num_nodes = this->nweight(this->d.t.root);
if ((curr_capacity / 2 >= new_size) || // Way too much allocated
(curr_free < add_size)) { // No room in mempool
// Copy all memory and reconstruct dmt in new mempool.
struct mempool new_kvspace;
struct mempool old_kvspace;
toku_mempool_construct(&new_kvspace, new_size);
if (!this->d.t.root.is_null()) {
const dmt_dnode &n = get_node<dmt_dnode>(this->d.t.root);
node_idx *tmp_array;
bool malloced = false;
tmp_array = alloc_temp_node_idxs(n.b.weight);
if (!tmp_array) {
malloced = true;
XMALLOC_N(n.b.weight, tmp_array);
}
this->fill_array_with_subtree_idxs(tmp_array, this->d.t.root);
for (node_idx i = 0; i < n.b.weight; i++) {
dmt_dnode &node = get_node<dmt_dnode>(tmp_array[i]);
const size_t bytes_to_copy = __builtin_offsetof(dmt_dnode, value) + node.value_length;
const size_t bytes_to_alloc = align(bytes_to_copy);
void* newdata = toku_mempool_malloc(&new_kvspace, bytes_to_alloc, 1);
memcpy(newdata, &node, bytes_to_copy);
tmp_array[i] = toku_mempool_get_offset_from_pointer_and_base(&new_kvspace, newdata);
}
old_kvspace = this->mp;
this->mp = new_kvspace;
this->rebuild_subtree_from_idxs(&this->d.t.root, tmp_array, n.b.weight);
if (malloced) toku_free(tmp_array);
toku_mempool_destroy(&old_kvspace);
}
else {
toku_mempool_destroy(&this->mp);
this->mp = new_kvspace;
}
}
}
template<typename dmtdata_t, typename dmtdataout_t>
bool dmt<dmtdata_t, dmtdataout_t>::will_need_rebalance(const subtree &subtree, const int leftmod, const int rightmod) const {
if (subtree.is_null()) { return false; }
const dmt_dnode &n = get_node<dmt_dnode>(subtree);
// one of the 1's is for the root.
// the other is to take ceil(n/2)
const uint32_t weight_left = this->nweight(n.b.left) + leftmod;
const uint32_t weight_right = this->nweight(n.b.right) + rightmod;
return ((1+weight_left < (1+1+weight_right)/2)
||
(1+weight_right < (1+1+weight_left)/2));
}
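// Worked example (illustrative): with weight_left == 1 and weight_right == 6,
// 1 + 1 == 2 < (1 + 1 + 6) / 2 == 4, so the subtree is flagged for rebalance.
// With weight_left == 3 and weight_right == 6, neither 1 + 3 == 4 < 4 nor
// 1 + 6 == 7 < (1 + 1 + 3) / 2 == 2 holds, so that shape is left alone.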
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::insert_internal(subtree *const subtreep, const dmtdatain_t &value, const uint32_t idx, subtree **const rebalance_subtree) {
if (subtreep->is_null()) {
paranoid_invariant_zero(idx);
const node_idx newidx = this->node_malloc_and_set_value<true>(value); //TODO: make this not <true> arbitrarily
dmt_dnode &newnode = get_node<dmt_dnode>(newidx);
newnode.b.weight = 1;
newnode.b.left.set_to_null();
newnode.b.right.set_to_null();
subtreep->set_index(newidx);
} else {
dmt_dnode &n = get_node<dmt_dnode>(*subtreep);
n.b.weight++;
if (idx <= this->nweight(n.b.left)) {
if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 1, 0)) {
*rebalance_subtree = subtreep;
}
this->insert_internal(&n.b.left, value, idx, rebalance_subtree);
} else {
if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 0, 1)) {
*rebalance_subtree = subtreep;
}
const uint32_t sub_index = idx - this->nweight(n.b.left) - 1;
this->insert_internal(&n.b.right, value, sub_index, rebalance_subtree);
}
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree) {
paranoid_invariant_notnull(subtreep);
paranoid_invariant_notnull(rebalance_subtree);
paranoid_invariant(!subtreep->is_null());
dmt_dnode &n = get_node<dmt_dnode>(*subtreep);
const uint32_t leftweight = this->nweight(n.b.left);
if (idx < leftweight) {
n.b.weight--;
if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, -1, 0)) {
*rebalance_subtree = subtreep;
}
this->delete_internal(&n.b.left, idx, subtree_replace, rebalance_subtree);
} else if (idx == leftweight) {
// Found the correct index.
if (n.b.left.is_null()) {
// Delete n and let parent point to n.b.right
subtree ptr_this = *subtreep;
*subtreep = n.b.right;
subtree to_free;
if (subtree_replace != nullptr) {
// Swap self with the other node.
to_free = *subtree_replace;
dmt_dnode &ancestor = get_node<dmt_dnode>(*subtree_replace);
if (*rebalance_subtree == &ancestor.b.right) {
// Take over rebalance responsibility.
*rebalance_subtree = &n.b.right;
}
n.b.weight = ancestor.b.weight;
n.b.left = ancestor.b.left;
n.b.right = ancestor.b.right;
*subtree_replace = ptr_this;
} else {
to_free = ptr_this;
}
this->node_free(to_free);
} else if (n.b.right.is_null()) {
// Delete n and let parent point to n.b.left
subtree to_free = *subtreep;
*subtreep = n.b.left;
paranoid_invariant_null(subtree_replace); // To be recursive, we're looking for index 0. n is index > 0 here.
this->node_free(to_free);
} else {
if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 0, -1)) {
*rebalance_subtree = subtreep;
}
// don't need to copy up value, it's only used by this
// next call, and when that gets to the bottom there
// won't be any more recursion
n.b.weight--;
this->delete_internal(&n.b.right, 0, subtreep, rebalance_subtree);
}
} else {
n.b.weight--;
if (*rebalance_subtree == nullptr && this->will_need_rebalance(*subtreep, 0, -1)) {
*rebalance_subtree = subtreep;
}
this->delete_internal(&n.b.right, idx - leftweight - 1, subtree_replace, rebalance_subtree);
}
}
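// Note on the two-children case above: no value bytes are copied during
// deletion. The recursion walks to index 0 of the right subtree (the
// successor) with subtree_replace pointing at the target; the successor node
// then inherits the target's weight and children, the target's slot is
// repointed at the successor, and the original target node is freed.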
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int dmt<dmtdata_t, dmtdataout_t>::iterate_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra) const {
int r;
for (uint32_t i = left; i < right; ++i) {
r = f(this->value_length, *get_array_value(i), i, iterate_extra);
if (r != 0) {
return r;
}
}
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void dmt<dmtdata_t, dmtdataout_t>::iterate_ptr_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra) {
if (!subtree.is_null()) {
dmt_dnode &n = get_node<dmt_dnode>(subtree);
const uint32_t idx_root = idx + this->nweight(n.b.left);
if (left < idx_root) {
this->iterate_ptr_internal<iterate_extra_t, f>(left, right, n.b.left, idx, iterate_extra);
}
if (left <= idx_root && idx_root < right) {
int r = f(n.value_length, &n.value, idx_root, iterate_extra);
lazy_assert_zero(r);
}
if (idx_root + 1 < right) {
this->iterate_ptr_internal<iterate_extra_t, f>(left, right, n.b.right, idx_root + 1, iterate_extra);
}
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void dmt<dmtdata_t, dmtdataout_t>::iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra) {
for (uint32_t i = left; i < right; ++i) {
int r = f(this->value_length, get_array_value(i), i, iterate_extra);
lazy_assert_zero(r);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int dmt<dmtdata_t, dmtdataout_t>::iterate_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra) const {
if (subtree.is_null()) { return 0; }
int r;
const dmt_dnode &n = get_node<dmt_dnode>(subtree);
const uint32_t idx_root = idx + this->nweight(n.b.left);
if (left < idx_root) {
r = this->iterate_internal<iterate_extra_t, f>(left, right, n.b.left, idx, iterate_extra);
if (r != 0) { return r; }
}
if (left <= idx_root && idx_root < right) {
r = f(n.value_length, n.value, idx_root, iterate_extra);
if (r != 0) { return r; }
}
if (idx_root + 1 < right) {
return this->iterate_internal<iterate_extra_t, f>(left, right, n.b.right, idx_root + 1, iterate_extra);
}
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const {
copyout(value_len, value, this->value_length, get_array_value(i));
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const {
dmt_dnode &n = get_node<dmt_dnode>(subtree);
const uint32_t leftweight = this->nweight(n.b.left);
if (i < leftweight) {
this->fetch_internal(n.b.left, i, value_len, value);
} else if (i == leftweight) {
copyout(value_len, value, &n);
} else {
this->fetch_internal(n.b.right, i - leftweight - 1, value_len, value);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::fill_array_with_subtree_idxs(node_idx *const array, const subtree &subtree) const {
if (!subtree.is_null()) {
const dmt_dnode &tree = get_node<dmt_dnode>(subtree);
this->fill_array_with_subtree_idxs(&array[0], tree.b.left);
array[this->nweight(tree.b.left)] = subtree.get_index();
this->fill_array_with_subtree_idxs(&array[this->nweight(tree.b.left) + 1], tree.b.right);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::rebuild_subtree_from_idxs(subtree *const subtree, const node_idx *const idxs, const uint32_t numvalues) {
if (numvalues==0) {
subtree->set_to_null();
} else {
uint32_t halfway = numvalues/2;
subtree->set_index(idxs[halfway]);
dmt_dnode &newnode = get_node<dmt_dnode>(idxs[halfway]);
newnode.b.weight = numvalues;
// value is already in there.
this->rebuild_subtree_from_idxs(&newnode.b.left, &idxs[0], halfway);
this->rebuild_subtree_from_idxs(&newnode.b.right, &idxs[halfway+1], numvalues-(halfway+1));
}
}
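// Worked example (illustrative): for numvalues == 7, halfway == 3, so idxs[3]
// becomes the root with weight 7; idxs[0..2] rebuild the left subtree (rooted
// at idxs[1]) and idxs[4..6] the right (rooted at idxs[5]). Any in-order idx
// array thus rebuilds into a perfectly balanced tree, here of height 3.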
template<typename dmtdata_t, typename dmtdataout_t>
node_idx* dmt<dmtdata_t, dmtdataout_t>::alloc_temp_node_idxs(uint32_t num_idxs) {
size_t mem_needed = num_idxs * sizeof(node_idx);
size_t mem_free;
mem_free = toku_mempool_get_free_space(&this->mp);
node_idx* CAST_FROM_VOIDP(tmp, toku_mempool_get_next_free_ptr(&this->mp));
if (mem_free >= mem_needed) {
return tmp;
}
return nullptr;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::rebalance(subtree *const subtree) {
(void) subtree;
paranoid_invariant(!subtree->is_null());
node_idx idx = subtree->get_index();
//TODO: restore optimization, maybe only if values_same_size
#if 0
if (!dynamic && idx==this->d.t.root.get_index()) {
//Try to convert to an array.
//If this fails, (malloc) nothing will have changed.
//In the failure case we continue on to the standard rebalance
//algorithm.
this->convert_to_array();
if (supports_marks) {
this->convert_to_tree();
}
} else {
#endif
const dmt_dnode &n = get_node<dmt_dnode>(idx);
node_idx *tmp_array;
bool malloced = false;
tmp_array = alloc_temp_node_idxs(n.b.weight);
if (!tmp_array) {
malloced = true;
XMALLOC_N(n.b.weight, tmp_array);
}
this->fill_array_with_subtree_idxs(tmp_array, *subtree);
this->rebuild_subtree_from_idxs(subtree, tmp_array, n.b.weight);
if (malloced) toku_free(tmp_array);
#if 0
}
#endif
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_dnode *const n) {
if (outlen) {
*outlen = n->value_length;
}
if (out) {
*out = n->value;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_dnode *const n) {
if (outlen) {
*outlen = n->value_length;
}
if (out) {
*out = &n->value;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr) {
if (outlen) {
*outlen = len;
}
if (out) {
*out = *stored_value_ptr;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr) {
if (outlen) {
*outlen = len;
}
if (out) {
*out = stored_value_ptr;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_zero_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
uint32_t min = 0;
uint32_t limit = this->d.a.num_values;
uint32_t best_pos = subtree::NODE_NULL;
uint32_t best_zero = subtree::NODE_NULL;
while (min!=limit) {
uint32_t mid = (min + limit) / 2;
int hv = h(this->value_length, *get_array_value(mid), extra);
if (hv<0) {
min = mid+1;
}
else if (hv>0) {
best_pos = mid;
limit = mid;
}
else {
best_zero = mid;
limit = mid;
}
}
if (best_zero!=subtree::NODE_NULL) {
//Found a zero
copyout(value_len, value, this->value_length, get_array_value(best_zero));
*idxp = best_zero;
return 0;
}
if (best_pos!=subtree::NODE_NULL) *idxp = best_pos;
else *idxp = this->d.a.num_values;
return DB_NOTFOUND;
}
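// Worked example (illustrative): for values {10, 20, 30} and a Heaviside
// function h(v) = v - 20, the search converges on the leftmost zero at index
// 1 and returns 0. For h(v) = v - 25 there is no zero: best_pos lands on the
// first strictly positive value (index 2), *idxp == 2, and DB_NOTFOUND is
// returned, so *idxp names the insertion point a zero would occupy.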
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_zero(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
if (subtree.is_null()) {
*idxp = 0;
return DB_NOTFOUND;
}
dmt_dnode &n = get_node<dmt_dnode>(subtree);
int hv = h(n.value_length, n.value, extra);
if (hv<0) {
int r = this->find_internal_zero<dmtcmp_t, h>(n.b.right, extra, value_len, value, idxp);
*idxp += this->nweight(n.b.left)+1;
return r;
} else if (hv>0) {
return this->find_internal_zero<dmtcmp_t, h>(n.b.left, extra, value_len, value, idxp);
} else {
int r = this->find_internal_zero<dmtcmp_t, h>(n.b.left, extra, value_len, value, idxp);
if (r==DB_NOTFOUND) {
*idxp = this->nweight(n.b.left);
copyout(value_len, value, &n);
r = 0;
}
return r;
}
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_plus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
uint32_t min = 0;
uint32_t limit = this->d.a.num_values;
uint32_t best = subtree::NODE_NULL;
while (min != limit) {
const uint32_t mid = (min + limit) / 2;
const int hv = h(this->value_length, *get_array_value(mid), extra);
if (hv > 0) {
best = mid;
limit = mid;
} else {
min = mid + 1;
}
}
if (best == subtree::NODE_NULL) { return DB_NOTFOUND; }
copyout(value_len, value, this->value_length, get_array_value(best));
*idxp = best;
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_plus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
if (subtree.is_null()) {
return DB_NOTFOUND;
}
dmt_dnode & n = get_node<dmt_dnode>(subtree);
int hv = h(n.value_length, n.value, extra);
int r;
if (hv > 0) {
r = this->find_internal_plus<dmtcmp_t, h>(n.b.left, extra, value_len, value, idxp);
if (r == DB_NOTFOUND) {
*idxp = this->nweight(n.b.left);
copyout(value_len, value, &n);
r = 0;
}
} else {
r = this->find_internal_plus<dmtcmp_t, h>(n.b.right, extra, value_len, value, idxp);
if (r == 0) {
*idxp += this->nweight(n.b.left) + 1;
}
}
return r;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_minus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
uint32_t min = 0;
uint32_t limit = this->d.a.num_values;
uint32_t best = subtree::NODE_NULL;
while (min != limit) {
const uint32_t mid = (min + limit) / 2;
const int hv = h(this->value_length, *get_array_value(mid), extra);
if (hv < 0) {
best = mid;
min = mid + 1;
} else {
limit = mid;
}
}
if (best == subtree::NODE_NULL) { return DB_NOTFOUND; }
copyout(value_len, value, this->value_length, get_array_value(best));
*idxp = best;
return 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int dmt<dmtdata_t, dmtdataout_t>::find_internal_minus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const {
paranoid_invariant_notnull(idxp);
if (subtree.is_null()) {
return DB_NOTFOUND;
}
dmt_dnode & n = get_node<dmt_dnode>(subtree);
int hv = h(n.value_length, n.value, extra);
if (hv < 0) {
int r = this->find_internal_minus<dmtcmp_t, h>(n.b.right, extra, value_len, value, idxp);
if (r == 0) {
*idxp += this->nweight(n.b.left) + 1;
} else if (r == DB_NOTFOUND) {
*idxp = this->nweight(n.b.left);
copyout(value_len, value, &n);
r = 0;
}
return r;
} else {
return this->find_internal_minus<dmtcmp_t, h>(n.b.left, extra, value_len, value, idxp);
}
}
template<typename dmtdata_t, typename dmtdataout_t>
uint32_t dmt<dmtdata_t, dmtdataout_t>::get_fixed_length(void) const {
return this->values_same_size ? this->value_length : 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
uint32_t dmt<dmtdata_t, dmtdataout_t>::get_fixed_length_alignment_overhead(void) const {
return this->values_same_size ? align(this->value_length) - this->value_length : 0;
}
template<typename dmtdata_t, typename dmtdataout_t>
bool dmt<dmtdata_t, dmtdataout_t>::is_value_length_fixed(void) const {
return this->values_same_size;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const {
invariant(this->is_array);
const uint8_t pad_bytes = get_fixed_length_alignment_overhead();
const uint32_t fixed_len = this->value_length;
const uint32_t fixed_aligned_len = align(this->value_length);
paranoid_invariant(expected_unpadded_memory == this->d.a.num_values * this->value_length);
paranoid_invariant(toku_mempool_get_used_space(&this->mp) >=
expected_unpadded_memory + pad_bytes * this->d.a.num_values +
this->d.a.start_idx * fixed_aligned_len);
if (this->d.a.num_values == 0) {
// Nothing to serialize
} else if (pad_bytes == 0) {
// Basically a memcpy
wbuf_nocrc_literal_bytes(wb, get_array_value(0), expected_unpadded_memory);
} else {
uint8_t* dest = wbuf_nocrc_reserve_literal_bytes(wb, expected_unpadded_memory);
uint8_t* src = reinterpret_cast<uint8_t*>(get_array_value(0));
paranoid_invariant(this->d.a.num_values*fixed_len == expected_unpadded_memory);
for (uint32_t i = 0; i < this->d.a.num_values; i++) {
memcpy(&dest[i*fixed_len], &src[i*fixed_aligned_len], fixed_len);
}
}
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::builder::create(uint32_t _max_values, uint32_t _max_value_bytes) {
this->max_values = _max_values;
this->max_value_bytes = _max_value_bytes;
this->temp.create_no_array();
this->temp_valid = true;
this->sorted_nodes = nullptr;
// Include enough space for alignment padding
size_t initial_space = (ALIGNMENT - 1) * _max_values + _max_value_bytes;
toku_mempool_construct(&this->temp.mp, initial_space); // Adds 25%
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::builder::insert_sorted(const dmtdatain_t &value) {
paranoid_invariant(this->temp_valid);
//NOTE: Always use d.a.num_values for size because we have not yet created root.
if (this->temp.values_same_size && (this->temp.d.a.num_values == 0 || value.get_dmtdatain_t_size() == this->temp.value_length)) {
this->temp.insert_at_array_end<false>(value);
return;
}
if (this->temp.is_array) {
// Convert to dtree format (even if ctree exists, it should not be used).
XMALLOC_N(this->max_values, this->sorted_nodes);
// Include enough space for alignment padding
size_t mem_needed = (ALIGNMENT - 1 + __builtin_offsetof(dmt_mnode<true>, value)) * max_values + max_value_bytes;
struct mempool old_mp = this->temp.mp;
const uint32_t num_values = this->temp.d.a.num_values;
toku_mempool_construct(&this->temp.mp, mem_needed);
// Copy over and get node_idxs
for (uint32_t i = 0; i < num_values; i++) {
dmtdatain_t functor(this->temp.value_length, this->temp.get_array_value_internal(&old_mp, i));
this->sorted_nodes[i] = this->temp.node_malloc_and_set_value<true>(functor);
}
this->temp.is_array = false;
this->temp.values_same_size = false;
toku_mempool_destroy(&old_mp);
}
paranoid_invariant(!this->temp.is_array);
this->temp.values_same_size = false;
// Insert dynamic.
this->sorted_nodes[this->temp.d.a.num_values++] = this->temp.node_malloc_and_set_value<true>(value);
}
template<typename dmtdata_t, typename dmtdataout_t>
bool dmt<dmtdata_t, dmtdataout_t>::builder::is_value_length_fixed(void) {
paranoid_invariant(this->temp_valid);
return this->temp.values_same_size;
}
template<typename dmtdata_t, typename dmtdataout_t>
void dmt<dmtdata_t, dmtdataout_t>::builder::build_and_destroy(dmt<dmtdata_t, dmtdataout_t> *dest) {
invariant(this->temp_valid);
//NOTE: Always use d.a.num_values for size because we have not yet created root.
invariant(this->temp.d.a.num_values == this->max_values); // Optionally make it <=
// Memory invariant is taken care of incrementally
if (!this->temp.is_array) {
invariant_notnull(this->sorted_nodes);
this->temp.rebuild_subtree_from_idxs(&this->temp.d.t.root, this->sorted_nodes, this->temp.d.a.num_values);
toku_free(this->sorted_nodes);
this->sorted_nodes = nullptr;
}
paranoid_invariant_null(this->sorted_nodes);
size_t used = toku_mempool_get_used_space(&this->temp.mp);
size_t allocated = toku_mempool_get_size(&this->temp.mp);
size_t max_allowed = used + used / 4;
size_t footprint = toku_mempool_footprint(&this->temp.mp);
if (allocated > max_allowed && footprint > max_allowed) {
// Reallocate smaller mempool to save memory
invariant_zero(toku_mempool_get_frag_size(&this->temp.mp));
struct mempool new_mp;
toku_mempool_construct(&new_mp, used);
void * newbase = toku_mempool_malloc(&new_mp, used, 1);
invariant_notnull(newbase);
memcpy(newbase, toku_mempool_get_base(&this->temp.mp), used);
toku_mempool_destroy(&this->temp.mp);
this->temp.mp = new_mp;
}
*dest = this->temp;
this->temp_valid = false;
}
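// Builder usage sketch (hypothetical caller; `writer(v)` again stands in for
// the instantiation's dmtdatain_t functor):
//
//   dmt<uint32_t>::builder b;
//   b.create(n, n * sizeof(uint32_t));  // counts must match the final build
//   for (uint32_t i = 0; i < n; i++) {
//       b.insert_sorted(writer(i));     // values must arrive pre-sorted
//   }
//   dmt<uint32_t> d;
//   b.build_and_destroy(&d);            // builder is unusable afterwards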
} // namespace toku
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef UTIL_DMT_H
#define UTIL_DMT_H
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <stdint.h>
#include <memory.h>
#include <toku_portability.h>
#include <toku_race_tools.h>
#include "growable_array.h"
#include "../ft/wbuf.h"
namespace toku {
typedef uint32_t node_idx;
/**
* Dynamic Order Maintenance Tree (DMT)
*
* Maintains a collection of totally ordered values, where each value has an integer weight.
* The DMT is a mutable datatype.
*
* The Abstraction:
*
* A DMT is a vector of values, $V$, where $|V|$ is the length of the vector.
* The vector is numbered from $0$ to $|V|-1$.
* Each value has a weight. The weight of the $i$th element is denoted $w(V_i)$.
*
* We can create a new DMT, which is the empty vector.
*
* We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
* $|V'|=1+|V|$ and
*
* V'_j = V_j      if $j<i$
*        x        if $j=i$
*        V_{j-1}  if $j>i$.
*
* We can specify $i$ using a kind of function instead of as an integer.
* Let $b$ be a function mapping from values to nonzero integers, such that
* the signum of $b$ is monotonically increasing.
* We can specify $i$ as the minimum integer such that $b(V_i)>0$.
*
* We look up a value using its index, or using a Heaviside function.
* For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
* When looking up values, we can look up
* $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$. (With a special return code if no such value exists.)
* (Rationale: Ordinarily we want $i$ to be unique. But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
* $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$. (Or an indication that no such value exists.)
* $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$. (Or an indication that no such value exists.)
*
* When looking up a value using a Heaviside function, we get the value and its index.
*
* We can also split a DMT into two DMTs, splitting the weight of the values evenly.
* Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
* The resulting two DMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
* All of the values from the original DMT go into one of the new DMTs.
* If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
* the new left DMT or the new right DMT is larger.
*
* Performance:
* Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
* The memory required is O(|V|).
*
* Usage:
* The dmt is templated by two parameters:
* - dmtdata_t is what will be stored within the dmt. These could be pointers or real data types (ints, structs).
* - dmtdataout_t is what will be returned by find and related functions. By default, it is the same as dmtdata_t, but you can set it to (dmtdata_t *).
* To create a dmt which will store "TXNID"s, for example, it is a good idea to typedef the template:
* typedef dmt<TXNID> txnid_dmt_t;
* If you are storing structs, you may want to be able to get a pointer to the data actually stored in the dmt (see find_zero). To do this, use the second template parameter:
* typedef dmt<struct foo, struct foo *> foo_dmt_t;
*/
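// Illustrative sketch (editor's example, not part of the original file):
// a Heaviside function usable with find_zero on a dmt<uint32_t>. The name
// 'target' and the variable 'd' are hypothetical; the signum of h must be
// monotonically increasing over the stored order.
//
//    static int h(const uint32_t size UU(), const uint32_t &stored, const uint32_t &target) {
//        if (stored < target) return -1;   // everything below target is negative
//        if (stored > target) return +1;   // everything above target is positive
//        return 0;                         // match: find_zero reports this index
//    }
//
//    uint32_t len, v, idx;
//    int r = d.find_zero<uint32_t, h>(target, &len, &v, &idx);  // r==0 iff found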
namespace dmt_internal {
template<bool subtree_supports_marks>
class subtree_templated {
static_assert(!subtree_supports_marks, "Not yet supported");
private:
uint32_t m_index;
public:
static const uint32_t NODE_NULL = UINT32_MAX;
inline void set_to_null(void) {
m_index = NODE_NULL;
}
inline bool is_null(void) const {
return NODE_NULL == this->get_index();
}
inline node_idx get_index(void) const {
return m_index;
}
inline void set_index(node_idx index) {
paranoid_invariant(index != NODE_NULL);
m_index = index;
}
} __attribute__((__packed__,aligned(4)));
template<typename dmtdata_t, bool subtree_supports_marks>
class dmt_base_node_templated {
static_assert(!subtree_supports_marks, "Not yet supported");
public:
uint32_t weight;
subtree_templated<subtree_supports_marks> left;
subtree_templated<subtree_supports_marks> right;
// this needs to be in both implementations because we don't have
// a "static if" the caller can use
inline void clear_stolen_bits(void) {}
};
template<typename dmtdata_t, bool subtree_supports_marks, bool store_value_length>
class dmt_node_templated {
static_assert(store_value_length, "Not yet supported");
public:
dmt_base_node_templated<dmtdata_t, subtree_supports_marks> b;
uint32_t value_length;
dmtdata_t value;
};// __attribute__((__packed__,aligned(4)));
}
template<typename dmtdata_t>
class dmt_functor {
static_assert(!std::is_same<dmtdata_t, dmtdata_t>::value, "Must use partial specialization");
// Defines the interface:
//static size_t get_dmtdata_t_size(const dmtdata_t &) { return 0; }
//size_t get_dmtdatain_t_size(void) { return 0; }
//void write_dmtdata_t_to(dmtdata_t *const dest) {}
};
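// Illustrative sketch (editor's example): a specialization implementing the
// interface documented above for a fixed-size type; the member name 'v' and
// the constructor are hypothetical.
//
//    template<>
//    class dmt_functor<uint32_t> {
//      public:
//        static size_t get_dmtdata_t_size(const uint32_t &) { return sizeof(uint32_t); }
//        size_t get_dmtdatain_t_size(void) { return sizeof(uint32_t); }
//        void write_dmtdata_t_to(uint32_t *const dest) { *dest = v; }
//        explicit dmt_functor(uint32_t v_) : v(v_) {}
//        uint32_t v;
//    };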
template<typename dmtdata_t,
typename dmtdataout_t=dmtdata_t
>
class dmt {
private:
typedef dmt_internal::subtree_templated<false> subtree;
typedef dmt_internal::dmt_base_node_templated<dmtdata_t, false> dmt_base_node;
template<bool with_length>
using dmt_mnode = dmt_internal::dmt_node_templated<dmtdata_t, false, with_length>;
typedef dmt_mnode<true> dmt_dnode;
typedef dmt_functor<dmtdata_t> dmtdatain_t;
public:
static const uint8_t ALIGNMENT = 4;
class builder {
public:
void insert_sorted(const dmtdatain_t &value);
void create(uint32_t n_values, uint32_t n_value_bytes);
bool is_value_length_fixed(void);
void build_and_destroy(dmt<dmtdata_t, dmtdataout_t> *dest);
private:
uint32_t max_values;
uint32_t max_value_bytes;
node_idx *sorted_nodes;
bool temp_valid;
dmt<dmtdata_t, dmtdataout_t> temp;
};
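// Illustrative sketch (editor's example): the intended builder call sequence,
// assuming a dmt_functor specialization for the stored type exists (see the
// sketch above); 'n' and 'vals' are hypothetical. Values must be supplied in
// sorted order, and exactly n_values of them before build_and_destroy.
//
//    dmt<uint32_t>::builder bldr;
//    bldr.create(n, n * sizeof(uint32_t));          // n values, total value bytes
//    for (uint32_t i = 0; i < n; i++) {
//        bldr.insert_sorted(dmt_functor<uint32_t>(vals[i]));
//    }
//    dmt<uint32_t> d;
//    bldr.build_and_destroy(&d);                    // bldr is no longer valid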
/**
* Effect: Create an empty DMT.
* Performance: constant time.
*/
void create(void);
/**
* Effect: Create an empty DMT with no internal allocated space.
* Performance: constant time.
* Rationale: In some cases we need a valid dmt but don't want to malloc.
*/
void create_no_array(void);
/**
* Effect: Create a DMT containing values. The number of values is in numvalues.
* Requires: this has not been created yet
* Rationale: Normally to insert N values takes O(N lg N) amortized time.
* If the N values are known in advance, are sorted, and
* the structure is empty, we can batch insert them much faster.
*/
__attribute__((nonnull))
void create_from_sorted_memory_of_fixed_size_elements(
const void *mem,
const uint32_t numvalues,
const uint32_t mem_length,
const uint32_t fixed_value_length);
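// Illustrative sketch (editor's example): batch creation from a packed,
// already-sorted buffer of fixed-size values; the array 'packed' is hypothetical.
//
//    const uint32_t packed[] = {1, 2, 3, 4};
//    dmt<uint32_t> d;
//    d.create_from_sorted_memory_of_fixed_size_elements(
//        packed, 4, sizeof(packed), sizeof(uint32_t));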
/**
* Effect: Creates a copy of a dmt.
* Creates this as the clone.
* Each element is copied directly. If they are pointers, the underlying data is not duplicated.
* Performance: O(n) or the running time of fill_array_with_subtree_idxs()
*/
void clone(const dmt &src);
/**
* Effect: Set the tree to be empty.
* Note: Will not reallocate or resize any memory.
* Performance: time=O(1)
*/
void clear(void);
/**
* Effect: Destroy a DMT, freeing all its memory.
* If the values being stored are pointers, their underlying data is not freed. See free_items()
* Those values may be freed before or after calling destroy().
* Rationale: Returns no values since free() cannot fail.
* Rationale: Does not free the underlying pointers to reduce complexity.
* Performance: time=O(1)
*/
void destroy(void);
/**
* Effect: return |this|.
* Performance: time=O(1)
*/
uint32_t size(void) const;
void serialize_values(uint32_t expected_unpadded_memory, struct wbuf *wb) const;
/**
* Effect: Insert value into the DMT.
* If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
* Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
* If no such i exists, then let i be |V|
* Then this has the same effect as
* insert_at(tree, value, i);
* If idx!=NULL then i is stored in *idx
* Requires: The signum of h must be monotonically increasing.
* Returns:
* 0 success
* DB_KEYEXIST the key is present (h was equal to zero for some value)
* On nonzero return, dmt is unchanged.
* Performance: time=O(\log N) amortized.
* Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
*/
template<typename dmtcmp_t, int (*h)(const uint32_t size, const dmtdata_t &, const dmtcmp_t &)>
int insert(const dmtdatain_t &value, const dmtcmp_t &v, uint32_t *const idx);
/**
* Effect: Increases indexes of all items at slot >= idx by 1.
* Insert value into the position at idx.
* Returns:
* 0 success
* EINVAL if idx > this->size()
* On error, dmt is unchanged.
* Performance: time=O(\log N) amortized time.
* Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
*/
int insert_at(const dmtdatain_t &value, const uint32_t idx);
/**
* Effect: Delete the item in slot idx.
* Decreases indexes of all items at slot > idx by 1.
* Returns
* 0 success
* EINVAL if idx>=this->size()
* On error, dmt is unchanged.
* Rationale: To delete an item, first find its index using find or find_zero, then delete it.
* Performance: time=O(\log N) amortized.
*/
int delete_at(const uint32_t idx);
/**
* Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
* The first argument passed to f is the size (in bytes) of the value.
* The second argument passed to f is a ref-to-const of the value stored in the dmt.
* The third argument passed to f is the index of the value.
* The fourth argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* Requires: f != NULL
* Returns:
* If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate.
* If f always returns zero, then iterate returns 0.
* Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
* Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
* Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
* Rationale: We may at some point use functors, but for now this is a smaller change from the old OMT.
*/
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate(iterate_extra_t *const iterate_extra) const;
/**
* Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
* The first argument passed to f is the size (in bytes) of the value.
* The second argument passed to f is a ref-to-const of the value stored in the dmt.
* The third argument passed to f is the index of the value.
* The fourth argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* We will iterate only over [left,right)
*
* Requires: left <= right
* Requires: f != NULL
* Returns:
* EINVAL if right > this->size()
* If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by iterate_on_range.
* If f always returns zero, then iterate_on_range returns 0.
* Requires: Don't modify the dmt while running. (E.g., f may not insert or delete values from the dmt.)
* Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in the dmt.
* Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
*/
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const;
void verify(void) const;
/**
* Effect: Iterate over the values of the dmt, from left to right, calling f on each value.
* The first argument passed to f is the size (in bytes) of the value.
* The second argument passed to f is a pointer to the value stored in the dmt.
* The third argument passed to f is the index of the value.
* The fourth argument passed to f is iterate_extra.
* The indices run from 0 (inclusive) to this->size() (exclusive).
* Requires: same as for iterate()
* Returns: same as for iterate()
* Performance: same as for iterate()
* Rationale: In general, most iterators should use iterate() since they should not modify the data stored in the dmt. This function is for iterators which need to modify values (for example, free_items).
* Rationale: We assume if you are transforming the data in place, you want to do it to everything at once, so there is not yet an iterate_on_range_ptr (but there could be).
*/
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr(iterate_extra_t *const iterate_extra);
/**
* Effect: Set *value_size to the size (in bytes) of V_idx and set *value=V_idx
* Returns
* 0 success
* EINVAL if idx>=this->size()
* On nonzero return, *value is unchanged
* Performance: time=O(\log N)
*/
int fetch(const uint32_t idx, uint32_t *const value_size, dmtdataout_t *const value) const;
/**
* Effect: Find the smallest i such that h(V_i, extra)>=0
* If there is such an i and h(V_i,extra)==0 then set *idxp=i, set *value = V_i, and return 0.
* If there is such an i and h(V_i,extra)>0 then set *idxp=i and return DB_NOTFOUND.
* If there is no such i then set *idxp=this->size() and return DB_NOTFOUND.
* Note: value is of type dmtdataout_t, which may be of type (dmtdata_t) or (dmtdata_t *) but is fixed by the instantiation.
* If it is the value type, then the value is copied out (even if the value type is a pointer to something else)
* If it is the pointer type, then *value is set to a pointer to the data within the dmt.
* This is determined by the type of the dmt as initially declared.
* If the dmt is declared as dmt<foo_t>, then foo_t's will be stored and foo_t's will be returned by find and related functions.
* If the dmt is declared as dmt<foo_t, foo_t *>, then foo_t's will be stored, and pointers to the stored items will be returned by find and related functions.
* Rationale:
* Structs too small for malloc should be stored directly in the dmt.
* These structs may need to be edited as they exist inside the dmt, so we need a way to get a pointer within the dmt.
* Using separate functions for returning pointers and values increases code duplication and reduces type-checking.
* That also reduces the ability of the creator of a data structure to give advice to its future users.
* Slight overloading in this case seemed to provide a better API and better type checking.
*/
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_zero(const dmtcmp_t &extra, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
/**
* Effect:
* If direction >0 then find the smallest i such that h(V_i,extra)>0.
* If direction <0 then find the largest i such that h(V_i,extra)<0.
* (Direction may not be equal to zero.)
* If value!=NULL then store V_i in *value
* If idxp!=NULL then store i in *idxp.
* Requires: The signum of h is monotonically increasing.
* Returns
* 0 success
* DB_NOTFOUND no such value is found.
* On nonzero return, *value and *idxp are unchanged
* Performance: time=O(\log N)
* Rationale:
* Here's how to use the find function to find various things
* Cases for find:
* find first value: ( h(v)=+1, direction=+1 )
* find last value ( h(v)=-1, direction=-1 )
* find first X ( h(v)=(v< x) ? -1 : 1 direction=+1 )
* find last X ( h(v)=(v<=x) ? -1 : 1 direction=-1 )
* find X or successor to X ( same as find first X. )
*
* Rationale: To help understand Heaviside functions and the behavior of find:
* There are 7 kinds of Heaviside functions.
* The signum of h must be monotonically increasing.
* Given a function of the following form, A is the element
* returned for direction<0, B is the element returned
* for direction>0, C is the element returned for
* direction==0 (see find_zero) (with a return of 0), and D is the element
* returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
* If any of A, B, or C are not found, then asking for the
* associated direction will return DB_NOTFOUND.
* See find_zero for more information.
*
* Let the following represent the signum of the Heaviside function.
*
*    -...-
*        A
*         D
*
*    +...+
*    B
*    D
*
*    0...0
*    C
*
*    -...-0...0
*        AC
*
*    0...0+...+
*    C    B
*
*    -...-+...+
*        AB
*         D
*
*    -...-0...0+...+
*        AC    B
*/
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find(const dmtcmp_t &extra, int direction, uint32_t *const value_size, dmtdataout_t *const value, uint32_t *const idxp) const;
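// Illustrative sketch (editor's example): "find first X" from the recipes
// above, for a dmt<uint32_t>; 'x' and 'd' are hypothetical.
//
//    static int h_ge(const uint32_t size UU(), const uint32_t &v, const uint32_t &x) {
//        return (v < x) ? -1 : +1;
//    }
//
//    uint32_t len, v, idx;
//    int r = d.find<uint32_t, h_ge>(x, +1, &len, &v, &idx);  // smallest value >= x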
/**
* Effect: Return the size (in bytes) of the dmt, as it resides in main memory. If the data stored are pointers, don't include the size of what they all point to.
*/
size_t memory_size(void);
bool is_value_length_fixed(void) const;
uint32_t get_fixed_length(void) const;
void prepare_for_serialize(void);
private:
static_assert(sizeof(dmt_dnode) - sizeof(dmtdata_t) == __builtin_offsetof(dmt_dnode, value), "value is not last field in node");
static_assert(4 * sizeof(uint32_t) == __builtin_offsetof(dmt_dnode, value), "dmt_node is padded");
ENSURE_POD(subtree);
struct dmt_array {
uint32_t start_idx;
uint32_t num_values;
};
struct dmt_tree {
subtree root;
};
bool values_same_size; //TODO: is this necessary? maybe sentinel for value_length
uint32_t value_length;
struct mempool mp;
bool is_array;
union {
struct dmt_array a;
struct dmt_tree t;
} d;
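// Representation note (editor's summary, inferred from this header): a dmt is
// either a packed array of equal-length values in the mempool (is_array, d.a),
// the fast path for serial inserts of fixed-size values, or a tree of nodes
// addressed by node_idx offsets into the mempool (d.t).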
uint32_t get_fixed_length_alignment_overhead(void) const;
void verify_internal(const subtree &subtree) const;
void create_internal_no_alloc(bool as_tree);
template<typename node_type>
node_type & get_node(const subtree &subtree) const;
template<typename node_type>
node_type & get_node(const node_idx offset) const;
uint32_t nweight(const subtree &subtree) const;
template<bool with_sizes>
node_idx node_malloc_and_set_value(const dmtdatain_t &value);
void node_set_value(dmt_mnode<true> *n, const dmtdatain_t &value);
void node_free(const subtree &st);
void maybe_resize_array(const int change);
void convert_to_tree(void);
void maybe_resize_dtree(const dmtdatain_t * value);
bool will_need_rebalance(const subtree &subtree, const int leftmod, const int rightmod) const;
__attribute__((nonnull))
void insert_internal(subtree *const subtreep, const dmtdatain_t &value, const uint32_t idx, subtree **const rebalance_subtree);
template<bool with_resize>
int insert_at_array_end(const dmtdatain_t& value_in);
int insert_at_array_beginning(const dmtdatain_t& value_in);
dmtdata_t * alloc_array_value_end(void);
dmtdata_t * alloc_array_value_beginning(void);
dmtdata_t * get_array_value(const uint32_t idx) const;
dmtdata_t * get_array_value_internal(const struct mempool *mempool, const uint32_t real_idx) const;
void convert_to_dtree(void);
template<bool with_sizes>
void convert_from_array_to_tree(void);
template<bool with_sizes>
void convert_from_tree_to_array(void);
__attribute__((nonnull(2,5)))
void delete_internal(subtree *const subtreep, const uint32_t idx, subtree *const subtree_replace, subtree **const rebalance_subtree);
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra) const;
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra);
template<typename iterate_extra_t,
int (*f)(const uint32_t, dmtdata_t *, const uint32_t, iterate_extra_t *const)>
void iterate_ptr_internal_array(const uint32_t left, const uint32_t right,
iterate_extra_t *const iterate_extra);
template<typename iterate_extra_t,
int (*f)(const uint32_t, const dmtdata_t &, const uint32_t, iterate_extra_t *const)>
int iterate_internal(const uint32_t left, const uint32_t right,
const subtree &subtree, const uint32_t idx,
iterate_extra_t *const iterate_extra) const;
void fetch_internal_array(const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;
void fetch_internal(const subtree &subtree, const uint32_t i, uint32_t *const value_len, dmtdataout_t *const value) const;
__attribute__((nonnull))
void fill_array_with_subtree_idxs(node_idx *const array, const subtree &subtree) const;
__attribute__((nonnull))
void rebuild_subtree_from_idxs(subtree *const subtree, const node_idx *const idxs, const uint32_t numvalues);
__attribute__((nonnull))
void rebalance(subtree *const subtree);
__attribute__((nonnull))
static void copyout(uint32_t *const outlen, dmtdata_t *const out, const dmt_dnode *const n);
__attribute__((nonnull))
static void copyout(uint32_t *const outlen, dmtdata_t **const out, dmt_dnode *const n);
__attribute__((nonnull))
static void copyout(uint32_t *const outlen, dmtdata_t *const out, const uint32_t len, const dmtdata_t *const stored_value_ptr);
__attribute__((nonnull))
static void copyout(uint32_t *const outlen, dmtdata_t **const out, const uint32_t len, dmtdata_t *const stored_value_ptr);
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_zero_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_zero(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_plus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_plus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_minus_array(const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
template<typename dmtcmp_t,
int (*h)(const uint32_t, const dmtdata_t &, const dmtcmp_t &)>
int find_internal_minus(const subtree &subtree, const dmtcmp_t &extra, uint32_t *const value_len, dmtdataout_t *const value, uint32_t *const idxp) const;
node_idx* alloc_temp_node_idxs(uint32_t num_idxs);
uint32_t align(const uint32_t x) const;
};
} // namespace toku
// include the implementation here
#include "dmt.cc"
#endif // UTIL_DMT_H
......@@ -131,7 +131,7 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
void toku_mempool_construct(struct mempool *mp, size_t data_size) {
if (data_size) {
size_t mpsize = data_size + (data_size/4); // allow 1/4 room for expansion (would be wasted if read-only)
mp->base = toku_xmalloc(mpsize); // allocate buffer for mempool
mp->base = toku_xmalloc_aligned(64, mpsize); // allocate buffer for mempool
mp->size = mpsize;
mp->free_offset = 0; // address of first available memory for new data
mp->frag_size = 0; // all allocated space is now in use
......@@ -142,6 +142,16 @@ void toku_mempool_construct(struct mempool *mp, size_t data_size) {
}
}
void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size) {
invariant(data_size > mp->free_offset);
size_t mpsize = data_size + (data_size/4); // allow 1/4 room for expansion (would be wasted if read-only)
void* newmem = toku_xmalloc_aligned(64, mpsize); // allocate new buffer for mempool
memcpy(newmem, mp->base, mp->free_offset); // Copy old info
toku_free(mp->base);
mp->base = newmem;
mp->size = mpsize;
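// Note: free_offset and frag_size are intentionally unchanged; the contents
// were copied verbatim, so existing offsets into the pool remain valid.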
}
void toku_mempool_destroy(struct mempool *mp) {
// printf("mempool_destroy %p %p %lu %lu\n", mp, mp->base, mp->size, mp->frag_size);
......@@ -150,27 +160,40 @@ void toku_mempool_destroy(struct mempool *mp) {
toku_mempool_zero(mp);
}
void *toku_mempool_get_base(struct mempool *mp) {
void *toku_mempool_get_base(const struct mempool *mp) {
return mp->base;
}
size_t toku_mempool_get_size(struct mempool *mp) {
void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset) {
return reinterpret_cast<void*>(reinterpret_cast<char*>(mp->base) + offset);
}
size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, void* p) {
paranoid_invariant(p >= mp->base);
return reinterpret_cast<char*>(p) - reinterpret_cast<char*>(mp->base);
}
size_t toku_mempool_get_size(const struct mempool *mp) {
return mp->size;
}
size_t toku_mempool_get_frag_size(struct mempool *mp) {
size_t toku_mempool_get_frag_size(const struct mempool *mp) {
return mp->frag_size;
}
size_t toku_mempool_get_used_space(struct mempool *mp) {
size_t toku_mempool_get_used_space(const struct mempool *mp) {
return mp->free_offset - mp->frag_size;
}
size_t toku_mempool_get_free_space(struct mempool *mp) {
void* toku_mempool_get_next_free_ptr(const struct mempool *mp) {
return toku_mempool_get_pointer_from_base_and_offset(mp, mp->free_offset);
}
size_t toku_mempool_get_free_space(const struct mempool *mp) {
return mp->size - mp->free_offset;
}
size_t toku_mempool_get_allocated_space(struct mempool *mp) {
size_t toku_mempool_get_allocated_space(const struct mempool *mp) {
return mp->free_offset;
}
......@@ -211,10 +234,10 @@ size_t toku_mempool_footprint(struct mempool *mp) {
return rval;
}
void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp) {
void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp) {
new_mp->frag_size = orig_mp->frag_size;
new_mp->free_offset = orig_mp->free_offset;
new_mp->size = orig_mp->free_offset; // only make the cloned mempool store what is needed
new_mp->base = toku_xmalloc(new_mp->size);
new_mp->base = toku_xmalloc_aligned(64, new_mp->size);
memcpy(new_mp->base, orig_mp->base, new_mp->size);
}
......@@ -123,26 +123,39 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
*/
void toku_mempool_construct(struct mempool *mp, size_t data_size);
/* reallocate the mempool's buffer to hold at least data_size bytes, copying its existing contents
*/
void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size);
/* destroy the memory pool */
void toku_mempool_destroy(struct mempool *mp);
/* get the base address of the memory pool */
void *toku_mempool_get_base(struct mempool *mp);
void *toku_mempool_get_base(const struct mempool *mp);
/* get a pointer that is offset bytes past the base of the memory pool */
void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset);
/* get the offset from base of a pointer */
size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, void* p);
/* get a pointer to the first free byte (if any) */
void* toku_mempool_get_next_free_ptr(const struct mempool *mp);
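/* Illustrative sketch (editor's example): offsets into the pool remain valid
 * across toku_mempool_realloc_larger (the contents are copied verbatim),
 * while raw pointers do not, so callers can persist offsets and rehydrate;
 * 'p' and 'new_size' are hypothetical:
 *
 *    size_t off = toku_mempool_get_offset_from_pointer_and_base(mp, p);
 *    toku_mempool_realloc_larger(mp, new_size);   // may move the base
 *    void *p2 = toku_mempool_get_pointer_from_base_and_offset(mp, off);
 */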
/* get the size of the memory pool */
size_t toku_mempool_get_size(struct mempool *mp);
size_t toku_mempool_get_size(const struct mempool *mp);
/* get the amount of fragmented (wasted) space in the memory pool */
size_t toku_mempool_get_frag_size(struct mempool *mp);
size_t toku_mempool_get_frag_size(const struct mempool *mp);
/* get the amount of space that is holding useful data */
size_t toku_mempool_get_used_space(struct mempool *mp);
size_t toku_mempool_get_used_space(const struct mempool *mp);
/* get the amount of space that is available for new data */
size_t toku_mempool_get_free_space(struct mempool *mp);
size_t toku_mempool_get_free_space(const struct mempool *mp);
/* get the amount of space that has been allocated for use (wasted or not) */
size_t toku_mempool_get_allocated_space(struct mempool *mp);
size_t toku_mempool_get_allocated_space(const struct mempool *mp);
/* allocate a chunk of memory from the memory pool suitably aligned */
void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment);
......@@ -160,6 +173,8 @@ static inline int toku_mempool_inrange(struct mempool *mp, void *vp, size_t size
/* get memory footprint */
size_t toku_mempool_footprint(struct mempool *mp);
void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp);
void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp);
#endif // UTIL_MEMPOOL_H