Commit b0ccec78 authored by Yoni Fogel

Refs Tokutek/ft-index#46 Add dmt (dynamic OMT)

Use the dmt in place of the omt in the bn_data class for storing leafentries.
This is an optimization for serial inserts and for mempool usage.
parent 5a61f344
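The dmt itself (util/dmt.h) is not shown in this commit view, so what follows is only a background sketch of the idea it relies on: a value "writer" reports how many bytes its serialized form needs, then writes itself into a slot the container carves out of one contiguous buffer, which is what lets serial inserts of variable-length klpairs avoid a separate allocation per pair. All names below (klpair_writer, packed_values) are hypothetical illustrations, not the real dmt API.

// Illustrative sketch only -- not the toku::dmt API, which is not part of this view.
#include <cstdint>
#include <cstring>
#include <vector>

struct klpair_writer {                          // hypothetical stand-in for dmt_functor<klpair_struct>
    uint32_t keylen;
    uint32_t le_offset;                         // offset of the leafentry in the mempool
    const void *keyp;

    size_t size_needed(void) const { return sizeof(uint32_t) + keylen; }
    void write_to(uint8_t *dest) const {
        memcpy(dest, &le_offset, sizeof(uint32_t));     // fixed part: le_offset
        memcpy(dest + sizeof(uint32_t), keyp, keylen);  // followed by the key bytes
    }
};

// Toy packed container: values are appended back to back into one flat buffer,
// with offsets recorded so each value can still be addressed by index.
struct packed_values {
    std::vector<uint8_t> buf;
    std::vector<uint32_t> offsets;

    void append(const klpair_writer &w) {
        offsets.push_back(static_cast<uint32_t>(buf.size()));
        buf.resize(buf.size() + w.size_needed());
        w.write_to(buf.data() + offsets.back());
    }
};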
......@@ -31,6 +31,7 @@ set(FT_SOURCES
checkpoint
compress
dbufio
dmt-wrapper
fifo
ft
ft-cachetable-wrappers
......
......@@ -91,9 +91,10 @@ PATENT RIGHTS GRANT:
#pragma once
#include <util/omt.h>
#include "leafentry.h"
#include <util/mempool.h>
#include "wbuf.h"
#include <util/dmt.h>
#include "leafentry.h"
#if 0 //for implementation
static int
......@@ -110,50 +111,80 @@ UU() verify_in_mempool(OMTVALUE lev, uint32_t UU(idx), void *mpv)
#endif
struct klpair_struct {
uint32_t keylen;
uint32_t le_offset; //Offset of leafentry (in leafentry mempool)
uint8_t key_le[0]; // key, followed by le
};
typedef struct klpair_struct *KLPAIR;
static inline LEAFENTRY get_le_from_klpair(KLPAIR klpair){
uint32_t keylen = klpair->keylen;
LEAFENTRY le = (LEAFENTRY)(klpair->key_le + keylen);
return le;
static constexpr uint32_t keylen_from_klpair_len(const uint32_t klpair_len) {
return klpair_len - __builtin_offsetof(klpair_struct, key_le);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
static int wrappy_fun_find(const KLPAIR &klpair, const omtcmp_t &extra) {
//TODO: kill this function when we split, and/or use toku_fill_dbt
typedef struct klpair_struct KLPAIR_S, *KLPAIR;
static_assert(__builtin_offsetof(klpair_struct, key_le) == 1*sizeof(uint32_t), "klpair alignment issues");
static_assert(__builtin_offsetof(klpair_struct, key_le) == sizeof(klpair_struct), "klpair size issues");
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
static int wrappy_fun_find(const uint32_t klpair_len, const klpair_struct &klpair, const dmtcmp_t &extra) {
DBT kdbt;
kdbt.data = klpair->key_le;
kdbt.size = klpair->keylen;
kdbt.data = const_cast<void*>(reinterpret_cast<const void*>(klpair.key_le));
kdbt.size = keylen_from_klpair_len(klpair_len);
return h(kdbt, extra);
}
template<typename inner_iterate_extra_t>
struct wrapped_iterate_extra_t {
public:
inner_iterate_extra_t *inner;
const class bn_data * bd;
};
template<typename iterate_extra_t,
int (*h)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t idx, iterate_extra_t *const)>
static int wrappy_fun_iterate(const KLPAIR &klpair, const uint32_t idx, iterate_extra_t *const extra) {
uint32_t keylen = klpair->keylen;
void* key = klpair->key_le;
LEAFENTRY le = get_le_from_klpair(klpair);
return h(key, keylen, le, idx, extra);
static int wrappy_fun_iterate(const uint32_t klpair_len, const klpair_struct &klpair, const uint32_t idx, wrapped_iterate_extra_t<iterate_extra_t> *const extra) {
const void* key = &klpair.key_le;
LEAFENTRY le = extra->bd->get_le_from_klpair(&klpair);
return h(key, keylen_from_klpair_len(klpair_len), le, idx, extra->inner);
}
namespace toku {
template<>
class dmt_functor<klpair_struct> {
public:
size_t get_dmtdatain_t_size(void) const {
return sizeof(klpair_struct) + this->keylen;
}
void write_dmtdata_t_to(klpair_struct *const dest) const {
dest->le_offset = this->le_offset;
memcpy(dest->key_le, this->keyp, this->keylen);
}
dmt_functor(uint32_t _keylen, uint32_t _le_offset, const void* _keyp)
: keylen(_keylen), le_offset(_le_offset), keyp(_keyp) {}
dmt_functor(const uint32_t klpair_len, klpair_struct *const src)
: keylen(keylen_from_klpair_len(klpair_len)), le_offset(src->le_offset), keyp(src->key_le) {}
private:
const uint32_t keylen;
const uint32_t le_offset;
const void* keyp;
};
}
typedef toku::omt<KLPAIR> klpair_omt_t;
typedef toku::dmt<KLPAIR_S, KLPAIR> klpair_dmt_t;
// This class stores the data associated with a basement node
class bn_data {
public:
void init_zero(void);
void initialize_empty(void);
void initialize_from_data(uint32_t num_entries, unsigned char *buf, uint32_t data_size);
void initialize_from_data(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version);
// globals
uint64_t get_memory_size(void);
uint64_t get_disk_size(void);
void verify_mempool(void);
// Interact with "omt"
// Interact with "dmt"
uint32_t omt_size(void) const;
template<typename iterate_extra_t,
......@@ -165,14 +196,16 @@ public:
template<typename iterate_extra_t,
int (*f)(const void * key, const uint32_t keylen, const LEAFENTRY &, const uint32_t, iterate_extra_t *const)>
int omt_iterate_on_range(const uint32_t left, const uint32_t right, iterate_extra_t *const iterate_extra) const {
return m_buffer.iterate_on_range< iterate_extra_t, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, iterate_extra);
wrapped_iterate_extra_t<iterate_extra_t> wrapped_extra = { iterate_extra, this };
return m_buffer.iterate_on_range< wrapped_iterate_extra_t<iterate_extra_t>, wrappy_fun_iterate<iterate_extra_t, f> >(left, right, &wrapped_extra);
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find_zero(const omtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find_zero(const dmtcmp_t &extra, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find_zero< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, &klpair, idxp);
uint32_t klpair_len;
int r = m_buffer.find_zero< dmtcmp_t, wrappy_fun_find<dmtcmp_t, h> >(extra, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
......@@ -180,20 +213,21 @@ public:
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
paranoid_invariant_null(keylen);
}
}
return r;
}
template<typename omtcmp_t,
int (*h)(const DBT &, const omtcmp_t &)>
int find(const omtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
template<typename dmtcmp_t,
int (*h)(const DBT &, const dmtcmp_t &)>
int find(const dmtcmp_t &extra, int direction, LEAFENTRY *const value, void** key, uint32_t* keylen, uint32_t *const idxp) const {
KLPAIR klpair = NULL;
int r = m_buffer.find< omtcmp_t, wrappy_fun_find<omtcmp_t, h> >(extra, direction, &klpair, idxp);
uint32_t klpair_len;
int r = m_buffer.find< dmtcmp_t, wrappy_fun_find<dmtcmp_t, h> >(extra, direction, &klpair_len, &klpair, idxp);
if (r == 0) {
if (value) {
*value = get_le_from_klpair(klpair);
......@@ -201,7 +235,7 @@ public:
if (key) {
paranoid_invariant(keylen != NULL);
*key = klpair->key_le;
*keylen = klpair->keylen;
*keylen = keylen_from_klpair_len(klpair_len);
}
else {
paranoid_invariant(keylen == NULL);
......@@ -232,7 +266,8 @@ public:
uint32_t* old_keylens,
LEAFENTRY* old_les,
size_t *le_sizes,
size_t mempool_size
size_t total_key_size,
size_t total_le_size
);
void clone(bn_data* orig_bn_data);
......@@ -243,14 +278,39 @@ public:
);
void get_space_for_overwrite(uint32_t idx, const void* keyp, uint32_t keylen, uint32_t old_size, uint32_t new_size, LEAFENTRY* new_le_space);
void get_space_for_insert(uint32_t idx, const void* keyp, uint32_t keylen, size_t size, LEAFENTRY* new_le_space);
LEAFENTRY get_le_from_klpair(const klpair_struct *klpair) const;
void prepare_to_serialize(void);
void serialize_header(struct wbuf *wb) const;
void serialize_rest(struct wbuf *wb) const;
bool need_to_serialize_each_leafentry_with_key(void) const;
static const uint32_t HEADER_LENGTH = 0
+ sizeof(uint32_t) // key_data_size
+ sizeof(uint32_t) // val_data_size
+ sizeof(uint32_t) // fixed_key_length
+ sizeof(uint8_t) // all_keys_same_length
+ sizeof(uint8_t) // keys_vals_separate
+ 0;
private:
// Private functions
KLPAIR mempool_malloc_from_omt(size_t size, void **maybe_free);
void omt_compress_kvspace(size_t added_size, void **maybe_free);
LEAFENTRY mempool_malloc_and_update_omt(size_t size, void **maybe_free);
void omt_compress_kvspace(size_t added_size, void **maybe_free, bool force_compress);
void add_key(uint32_t keylen);
void add_keys(uint32_t n_keys, uint32_t combined_keylen);
void remove_key(uint32_t keylen);
klpair_omt_t m_buffer; // pointers to individual leaf entries
klpair_dmt_t m_buffer; // pointers to individual leaf entries
struct mempool m_buffer_mempool; // storage for all leaf entries
friend class bndata_bugfix_test;
uint32_t klpair_disksize(const uint32_t klpair_len, const klpair_struct *klpair) const;
size_t m_disksize_of_keys;
void initialize_from_separate_keys_and_vals(uint32_t num_entries, struct rbuf *rb, uint32_t data_size, uint32_t version,
uint32_t key_data_size, uint32_t val_data_size, bool all_keys_same_length,
uint32_t fixed_key_length);
};
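The body of bn_data::serialize_header lives in bn_data.cc, which is not part of this view, so the following is only a hedged sketch of what a five-field header matching HEADER_LENGTH above could look like on the wire. The field order and the caller name are assumptions; wbuf_nocrc_uint and wbuf_nocrc_char are the same helpers used in serialize_ftnode_partition further down.

#include "wbuf.h"

// Hypothetical sketch of a header matching HEADER_LENGTH (field order assumed).
static void serialize_basement_header_sketch(struct wbuf *wb,
                                             uint32_t key_data_size,
                                             uint32_t val_data_size,
                                             uint32_t fixed_key_length,
                                             bool all_keys_same_length,
                                             bool keys_vals_separate) {
    wbuf_nocrc_uint(wb, key_data_size);                  // sizeof(uint32_t)
    wbuf_nocrc_uint(wb, val_data_size);                  // sizeof(uint32_t)
    wbuf_nocrc_uint(wb, fixed_key_length);               // sizeof(uint32_t)
    wbuf_nocrc_char(wb, all_keys_same_length ? 1 : 0);   // sizeof(uint8_t)
    wbuf_nocrc_char(wb, keys_vals_separate ? 1 : 0);     // sizeof(uint8_t)
}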
......@@ -1178,6 +1178,8 @@ typedef enum {
FT_PRO_NUM_STOP_LOCK_CHILD,
FT_PRO_NUM_STOP_CHILD_INMEM,
FT_PRO_NUM_DIDNT_WANT_PROMOTE,
FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
FT_STATUS_NUM_ROWS
} ft_status_entry;
......
......@@ -363,6 +363,8 @@ status_init(void)
STATUS_INIT(FT_PRO_NUM_STOP_LOCK_CHILD, PROMOTION_STOPPED_CHILD_LOCKED_OR_NOT_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was locked or not at all in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_PRO_NUM_STOP_CHILD_INMEM, PROMOTION_STOPPED_CHILD_NOT_FULLY_IN_MEMORY, PARCOUNT, "promotion: stopped because the child was not fully in memory", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
ft_status.initialized = true;
}
......@@ -389,6 +391,14 @@ toku_ft_get_status(FT_STATUS s) {
} \
} while (0)
void toku_note_deserialized_basement_node(bool fixed_key_size) {
if (fixed_key_size) {
STATUS_INC(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, 1);
} else {
STATUS_INC(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, 1);
}
}
bool is_entire_node_in_memory(FTNODE node) {
for (int i = 0; i < node->n_children; i++) {
if(BP_STATE(node,i) != PT_AVAIL) {
......@@ -595,6 +605,7 @@ ftnode_memory_size (FTNODE node)
int n_children = node->n_children;
retval += sizeof(*node);
retval += (n_children)*(sizeof(node->bp[0]));
retval += (n_children > 0 ? n_children-1 : 0)*(sizeof(node->childkeys[0]));
retval += node->totalchildkeylens;
// now calculate the sizes of the partitions
......@@ -1722,6 +1733,8 @@ toku_ft_bn_apply_cmd_once (
&new_le,
&numbytes_delta
);
// at this point, we cannot trust cmd->u.id.key to be valid.
// The dmt may have realloced its mempool and freed the one containing key.
newsize = new_le ? (leafentry_memsize(new_le) + + key_storage_size) : 0;
if (le && new_le) {
......@@ -1986,6 +1999,7 @@ toku_ft_bn_apply_cmd (
int deleted = 0;
if (!le_is_clean(storeddata)) { //If already clean, nothing to do.
toku_ft_bn_apply_cmd_once(bn, cmd, idx, storeddata, oldest_referenced_xid_known, gc_info, workdone, stats_to_update);
// at this point, we cannot trust cmd->u.id.key to be valid.
uint32_t new_omt_size = bn->data_buffer.omt_size();
if (new_omt_size != omt_size) {
paranoid_invariant(new_omt_size+1 == omt_size);
......
......@@ -351,6 +351,8 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);
extern bool garbage_collection_debug;
void toku_note_deserialized_basement_node(bool fixed_key_size);
// This is a poor place to put global options like these.
void toku_ft_set_direct_io(bool direct_io_on);
void toku_ft_set_compress_buffers_before_eviction(bool compress_buffers);
......
......@@ -118,7 +118,7 @@ enum ft_layout_version_e {
FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
FT_LAYOUT_VERSION_23 = 23, // Ming: Fix upgrade path #5902
FT_LAYOUT_VERSION_24 = 24, // Riddler: change logentries that log transactions to store TXNID_PAIRs instead of TXNIDs
FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry
FT_LAYOUT_VERSION_25 = 25, // SecretSquirrel: ROLLBACK_LOG_NODES (on disk and in memory) now just use blocknum (instead of blocknum + hash) to point to other log nodes. same for xstillopen log entry, basements store key/vals separately on disk
FT_NEXT_VERSION, // the version after the current version
FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported
......
......@@ -320,7 +320,7 @@ serialize_ftnode_partition_size (FTNODE node, int i)
result += toku_bnc_nbytesinbuf(BNC(node, i));
}
else {
result += 4; // n_entries in buffer table
result += 4 + bn_data::HEADER_LENGTH; // n_entries in buffer table + basement header
result += BLB_NBYTESINDATA(node, i);
}
result += 4; // checksum
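For reference, the extra overhead added above is fixed: bn_data::HEADER_LENGTH = 3 * sizeof(uint32_t) + 2 * sizeof(uint8_t) = 14 bytes per leaf partition, on top of the existing 4-byte n_entries count and 4-byte checksum.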
......@@ -380,10 +380,16 @@ serialize_ftnode_partition(FTNODE node, int i, struct sub_block *sb) {
wbuf_nocrc_char(&wb, ch);
wbuf_nocrc_uint(&wb, bd->omt_size());
bd->prepare_to_serialize();
bd->serialize_header(&wb);
if (bd->need_to_serialize_each_leafentry_with_key()) {
//
// iterate over leafentries and place them into the buffer
//
bd->omt_iterate<struct wbuf, wbufwriteleafentry>(&wb);
} else {
bd->serialize_rest(&wb);
}
}
uint32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
wbuf_nocrc_int(&wb, end_to_end_checksum);
......@@ -592,9 +598,14 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
// Create an array that will store the size of each basement.
// This is the sum of the leaf sizes of all the leaves in that basement.
// We don't know how many basements there will be, so we use num_le as the upper bound.
toku::scoped_malloc bn_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_sizes = reinterpret_cast<size_t *>(bn_sizes_buf.get());
bn_sizes[0] = 0;
// Sum of all le sizes in a single basement
toku::scoped_calloc bn_le_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_le_sizes = reinterpret_cast<size_t *>(bn_le_sizes_buf.get());
// Sum of all key sizes in a single basement
toku::scoped_calloc bn_key_sizes_buf(sizeof(size_t) * num_alloc);
size_t *bn_key_sizes = reinterpret_cast<size_t *>(bn_key_sizes_buf.get());
// TODO 4050: All these arrays should be combined into a single array of some bn_info struct (pivot, msize, num_les).
// Each entry is the number of leafentries in this basement. (Again, num_le is an overkill upper bound.)
......@@ -611,17 +622,20 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
for (uint32_t i = 0; i < num_le; i++) {
uint32_t curr_le_size = leafentry_disksize((LEAFENTRY) leafpointers[i]);
le_sizes[i] = curr_le_size;
if ((bn_size_so_far + curr_le_size > basementnodesize) && (num_le_in_curr_bn != 0)) {
if ((bn_size_so_far + curr_le_size + sizeof(uint32_t) + key_sizes[i] > basementnodesize) && (num_le_in_curr_bn != 0)) {
// cap off the current basement node to end with the element before i
new_pivots[curr_pivot] = i-1;
curr_pivot++;
num_le_in_curr_bn = 0;
bn_size_so_far = 0;
bn_le_sizes[curr_pivot] = 0;
bn_key_sizes[curr_pivot] = 0;
}
num_le_in_curr_bn++;
num_les_this_bn[curr_pivot] = num_le_in_curr_bn;
bn_le_sizes[curr_pivot] += curr_le_size;
bn_key_sizes[curr_pivot] += sizeof(uint32_t) + key_sizes[i]; // uint32_t le_offset
bn_size_so_far += curr_le_size + sizeof(uint32_t) + key_sizes[i];
bn_sizes[curr_pivot] = bn_size_so_far;
}
// curr_pivot is now the total number of pivot keys in the leaf node
int num_pivots = curr_pivot;
......@@ -688,9 +702,6 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
uint32_t num_les_to_copy = num_les_this_bn[i];
invariant(num_les_to_copy == num_in_bn);
// construct mempool for this basement
size_t size_this_bn = bn_sizes[i];
BN_DATA bd = BLB_DATA(node, i);
bd->replace_contents_with_clone_of_sorted_array(
num_les_to_copy,
......@@ -698,7 +709,8 @@ rebalance_ftnode_leaf(FTNODE node, unsigned int basementnodesize)
&key_sizes[baseindex_this_bn],
&leafpointers[baseindex_this_bn],
&le_sizes[baseindex_this_bn],
size_this_bn
bn_key_sizes[i], // Total key sizes
bn_le_sizes[i] // total le sizes
);
BP_STATE(node,i) = PT_AVAIL;
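As a worked example of the new accounting: a basement that ends up with two leafentries of 100 bytes whose keys are 8 and 12 bytes long contributes bn_le_sizes[i] = 200 and bn_key_sizes[i] = (4 + 8) + (4 + 12) = 28 (each key also carries a 4-byte le_offset), and those two totals are what replace_contents_with_clone_of_sorted_array above receives in place of the old single mempool size.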
......@@ -1548,8 +1560,7 @@ deserialize_ftnode_partition(
data_size -= rb.ndone; // remaining bytes of leafentry data
BASEMENTNODE bn = BLB(node, childnum);
bn->data_buffer.initialize_from_data(num_entries, &rb.buf[rb.ndone], data_size);
rb.ndone += data_size;
bn->data_buffer.initialize_from_data(num_entries, &rb, data_size, node->layout_version_read_from_disk);
}
assert(rb.ndone == rb.size);
exit:
......@@ -2101,8 +2112,7 @@ deserialize_and_upgrade_leaf_node(FTNODE node,
if (has_end_to_end_checksum) {
data_size -= sizeof(uint32_t);
}
bn->data_buffer.initialize_from_data(n_in_buf, &rb->buf[rb->ndone], data_size);
rb->ndone += data_size;
bn->data_buffer.initialize_from_data(n_in_buf, rb, data_size, node->layout_version_read_from_disk);
}
// Whatever this is must be less than the MSNs of every message above
......
......@@ -98,6 +98,7 @@ struct memarena {
char *buf;
size_t buf_used, buf_size;
size_t size_of_other_bufs; // the buf_size of all the other bufs.
size_t footprint_of_other_bufs; // the footprint of all the other bufs.
char **other_bufs;
int n_other_bufs;
};
......@@ -108,6 +109,7 @@ MEMARENA memarena_create_presized (size_t initial_size) {
result->buf_used = 0;
result->other_bufs = NULL;
result->size_of_other_bufs = 0;
result->footprint_of_other_bufs = 0;
result->n_other_bufs = 0;
XMALLOC_N(result->buf_size, result->buf);
return result;
......@@ -128,6 +130,7 @@ void memarena_clear (MEMARENA ma) {
// But reuse the main buffer
ma->buf_used = 0;
ma->size_of_other_bufs = 0;
ma->footprint_of_other_bufs = 0;
}
static size_t
......@@ -151,6 +154,7 @@ void* malloc_in_memarena (MEMARENA ma, size_t size) {
ma->other_bufs[old_n]=ma->buf;
ma->n_other_bufs = old_n+1;
ma->size_of_other_bufs += ma->buf_size;
ma->footprint_of_other_bufs += toku_memory_footprint(ma->buf, ma->buf_used);
}
// Make a new one
{
......@@ -217,7 +221,9 @@ void memarena_move_buffers(MEMARENA dest, MEMARENA source) {
#endif
dest ->size_of_other_bufs += source->size_of_other_bufs + source->buf_size;
dest ->footprint_of_other_bufs += source->footprint_of_other_bufs + toku_memory_footprint(source->buf, source->buf_used);
source->size_of_other_bufs = 0;
source->footprint_of_other_bufs = 0;
assert(other_bufs);
dest->other_bufs = other_bufs;
......@@ -247,3 +253,11 @@ memarena_total_size_in_use (MEMARENA m)
{
return m->size_of_other_bufs + m->buf_used;
}
size_t
memarena_total_footprint (MEMARENA m)
{
return m->footprint_of_other_bufs + toku_memory_footprint(m->buf, m->buf_used) +
sizeof(*m) +
m->n_other_bufs * sizeof(*m->other_bufs);
}
......@@ -129,5 +129,6 @@ size_t memarena_total_memory_size (MEMARENA);
size_t memarena_total_size_in_use (MEMARENA);
size_t memarena_total_footprint (MEMARENA);
#endif
......@@ -146,7 +146,7 @@ PAIR_ATTR
rollback_memory_size(ROLLBACK_LOG_NODE log) {
size_t size = sizeof(*log);
if (log->rollentry_arena) {
size += memarena_total_memory_size(log->rollentry_arena);
size += memarena_total_footprint(log->rollentry_arena);
}
return make_rollback_pair_attr(size);
}
......
......@@ -127,7 +127,7 @@ long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
}
static void
test_serialize_leaf(int valsize, int nelts, double entropy) {
test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
// struct ft_handle source_ft;
struct ftnode *sn, *dn;
......@@ -214,32 +214,63 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
assert(size == 100);
}
struct timeval total_start;
struct timeval total_end;
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
struct timeval t[2];
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd = NULL;
for (int i = 0; i < ser_runs; i++) {
gettimeofday(&t[0], NULL);
ndd = NULL;
sn->dirty = 1;
r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, brt->ft, false);
assert(r==0);
gettimeofday(&t[1], NULL);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
total_end.tv_sec += t[1].tv_sec;
total_end.tv_usec += t[1].tv_usec;
toku_free(ndd);
}
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("serialize leaf: %0.05lf\n", dt);
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= ser_runs;
printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs);
//reset
total_start.tv_sec = total_start.tv_usec = 0;
total_end.tv_sec = total_end.tv_usec = 0;
struct ftnode_fetch_extra bfe;
for (int i = 0; i < deser_runs; i++) {
fill_bfe_for_full_read(&bfe, brt_h);
gettimeofday(&t[0], NULL);
FTNODE_DISK_DATA ndd2 = NULL;
r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
assert(r==0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("deserialize leaf: %0.05lf\n", dt);
printf("io time %lf decompress time %lf deserialize time %lf\n",
tokutime_to_seconds(bfe.io_time),
tokutime_to_seconds(bfe.decompress_time),
tokutime_to_seconds(bfe.deserialize_time)
);
total_start.tv_sec += t[0].tv_sec;
total_start.tv_usec += t[0].tv_usec;
total_end.tv_sec += t[1].tv_sec;
total_end.tv_usec += t[1].tv_usec;
toku_ftnode_free(&dn);
toku_free(ndd2);
}
dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
dt *= 1000;
dt /= deser_runs;
printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
toku_ftnode_free(&sn);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
......@@ -247,14 +278,12 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
toku_free(brt_h->h);
toku_free(brt_h);
toku_free(brt);
toku_free(ndd);
toku_free(ndd2);
r = close(fd); assert(r != -1);
}
static void
test_serialize_nonleaf(int valsize, int nelts, double entropy) {
test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
// struct ft_handle source_ft;
struct ftnode sn, *dn;
......@@ -353,7 +382,8 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
gettimeofday(&t[1], NULL);
double dt;
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("serialize nonleaf: %0.05lf\n", dt);
dt *= 1000;
printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt_h);
......@@ -363,11 +393,13 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
assert(r==0);
gettimeofday(&t[1], NULL);
dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
printf("deserialize nonleaf: %0.05lf\n", dt);
printf("io time %lf decompress time %lf deserialize time %lf\n",
tokutime_to_seconds(bfe.io_time),
tokutime_to_seconds(bfe.decompress_time),
tokutime_to_seconds(bfe.deserialize_time)
dt *= 1000;
printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
tokutime_to_seconds(bfe.io_time)*1000,
tokutime_to_seconds(bfe.decompress_time)*1000,
tokutime_to_seconds(bfe.deserialize_time)*1000,
deser_runs
);
toku_ftnode_free(&dn);
......@@ -394,19 +426,32 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
long valsize, nelts;
const int DEFAULT_RUNS = 5;
long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
double entropy = 0.3;
if (argc != 3) {
fprintf(stderr, "Usage: %s <valsize> <nelts>\n", argv[0]);
if (argc != 3 && argc != 5) {
fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
return 2;
}
valsize = strtol(argv[1], NULL, 0);
nelts = strtol(argv[2], NULL, 0);
if (argc == 5) {
ser_runs = strtol(argv[3], NULL, 0);
deser_runs = strtol(argv[4], NULL, 0);
}
if (ser_runs <= 0) {
ser_runs = DEFAULT_RUNS;
}
if (deser_runs <= 0) {
deser_runs = DEFAULT_RUNS;
}
initialize_dummymsn();
test_serialize_leaf(valsize, nelts, entropy);
test_serialize_nonleaf(valsize, nelts, entropy);
test_serialize_leaf(valsize, nelts, entropy, ser_runs, deser_runs);
test_serialize_nonleaf(valsize, nelts, entropy, ser_runs, deser_runs);
return 0;
}
......@@ -189,7 +189,7 @@ doit (void) {
r = toku_testsetup_root(t, node_root);
assert(r==0);
char filler[900];
char filler[900-2*bn_data::HEADER_LENGTH];
memset(filler, 0, sizeof(filler));
// now we insert filler data so that a merge does not happen
r = toku_testsetup_insert_to_leaf (
......
......@@ -187,6 +187,13 @@ static inline void wbuf_uint (struct wbuf *w, uint32_t i) {
wbuf_int(w, (int32_t)i);
}
static inline uint8_t* wbuf_nocrc_reserve_literal_bytes(struct wbuf *w, uint32_t nbytes) {
assert(w->ndone + nbytes <= w->size);
uint8_t * dest = w->buf + w->ndone;
w->ndone += nbytes;
return dest;
}
static inline void wbuf_nocrc_literal_bytes(struct wbuf *w, bytevec bytes_bv, uint32_t nbytes) {
const unsigned char *bytes = (const unsigned char *) bytes_bv;
#if 0
......
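A hedged usage sketch for the new wbuf_nocrc_reserve_literal_bytes helper above; the caller shown (write_packed_keys) is hypothetical, but it illustrates the intended pattern of reserving a whole region once and then filling it in place, e.g. when packing keys back to back while serializing a basement node.

#include <string.h>
#include "wbuf.h"

// Hypothetical caller: copy n keys back to back into a region reserved up front.
static void write_packed_keys(struct wbuf *wb,
                              const void *const *keys, const uint32_t *keylens,
                              uint32_t n, uint32_t total_key_bytes) {
    uint8_t *dest = wbuf_nocrc_reserve_literal_bytes(wb, total_key_bytes);
    for (uint32_t i = 0; i < n; i++) {
        memcpy(dest, keys[i], keylens[i]);
        dest += keylens[i];
    }
}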
......@@ -131,7 +131,7 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
void toku_mempool_construct(struct mempool *mp, size_t data_size) {
if (data_size) {
size_t mpsize = data_size + (data_size/4); // allow 1/4 room for expansion (would be wasted if read-only)
mp->base = toku_xmalloc(mpsize); // allocate buffer for mempool
mp->base = toku_xmalloc_aligned(64, mpsize); // allocate buffer for mempool
mp->size = mpsize;
mp->free_offset = 0; // address of first available memory for new data
mp->frag_size = 0; // all allocated space is now in use
......@@ -142,6 +142,16 @@ void toku_mempool_construct(struct mempool *mp, size_t data_size) {
}
}
void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size) {
invariant(data_size > mp->free_offset);
size_t mpsize = data_size + (data_size/4); // allow 1/4 room for expansion (would be wasted if read-only)
void* newmem = toku_xmalloc_aligned(64, mpsize); // allocate new buffer for mempool
memcpy(newmem, mp->base, mp->free_offset); // Copy old info
toku_free(mp->base);
mp->base = newmem;
mp->size = mpsize;
}
void toku_mempool_destroy(struct mempool *mp) {
// printf("mempool_destroy %p %p %lu %lu\n", mp, mp->base, mp->size, mp->frag_size);
......@@ -150,27 +160,40 @@ void toku_mempool_destroy(struct mempool *mp) {
toku_mempool_zero(mp);
}
void *toku_mempool_get_base(struct mempool *mp) {
void *toku_mempool_get_base(const struct mempool *mp) {
return mp->base;
}
size_t toku_mempool_get_size(struct mempool *mp) {
void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset) {
return reinterpret_cast<void*>(reinterpret_cast<char*>(mp->base) + offset);
}
size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, void* p) {
paranoid_invariant(p >= mp->base);
return reinterpret_cast<char*>(p) - reinterpret_cast<char*>(mp->base);
}
size_t toku_mempool_get_size(const struct mempool *mp) {
return mp->size;
}
size_t toku_mempool_get_frag_size(struct mempool *mp) {
size_t toku_mempool_get_frag_size(const struct mempool *mp) {
return mp->frag_size;
}
size_t toku_mempool_get_used_space(struct mempool *mp) {
size_t toku_mempool_get_used_space(const struct mempool *mp) {
return mp->free_offset - mp->frag_size;
}
size_t toku_mempool_get_free_space(struct mempool *mp) {
void* toku_mempool_get_next_free_ptr(const struct mempool *mp) {
return toku_mempool_get_pointer_from_base_and_offset(mp, mp->free_offset);
}
size_t toku_mempool_get_free_space(const struct mempool *mp) {
return mp->size - mp->free_offset;
}
size_t toku_mempool_get_allocated_space(struct mempool *mp) {
size_t toku_mempool_get_allocated_space(const struct mempool *mp) {
return mp->free_offset;
}
......@@ -211,10 +234,10 @@ size_t toku_mempool_footprint(struct mempool *mp) {
return rval;
}
void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp) {
void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp) {
new_mp->frag_size = orig_mp->frag_size;
new_mp->free_offset = orig_mp->free_offset;
new_mp->size = orig_mp->free_offset; // only make the cloned mempool store what is needed
new_mp->base = toku_xmalloc(new_mp->size);
new_mp->base = toku_xmalloc_aligned(64, new_mp->size);
memcpy(new_mp->base, orig_mp->base, new_mp->size);
}
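The new offset helpers plus toku_mempool_realloc_larger are what allow klpairs to store a uint32_t le_offset instead of a raw LEAFENTRY pointer: reallocating the mempool moves its buffer and invalidates pointers, but offsets remain valid against the new base. A minimal sketch using only functions shown in this diff (the function name, sizes, and alignment chosen here are arbitrary):

#include <util/mempool.h>

static void offset_survives_realloc_example(void) {
    struct mempool mp;
    toku_mempool_construct(&mp, 64);

    void *le = toku_mempool_malloc(&mp, 48, 1);                   // pretend this is a leafentry
    size_t le_offset = toku_mempool_get_offset_from_pointer_and_base(&mp, le);

    toku_mempool_realloc_larger(&mp, 1 << 20);                    // buffer may move; `le` is now stale
    le = toku_mempool_get_pointer_from_base_and_offset(&mp, le_offset);  // recover a valid pointer

    toku_mempool_destroy(&mp);
}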
......@@ -123,26 +123,39 @@ void toku_mempool_init(struct mempool *mp, void *base, size_t free_offset, size_
*/
void toku_mempool_construct(struct mempool *mp, size_t data_size);
/* reallocate the mempool with a larger buffer, preserving its existing contents
*/
void toku_mempool_realloc_larger(struct mempool *mp, size_t data_size);
/* destroy the memory pool */
void toku_mempool_destroy(struct mempool *mp);
/* get the base address of the memory pool */
void *toku_mempool_get_base(struct mempool *mp);
void *toku_mempool_get_base(const struct mempool *mp);
/* get a pointer that is offset bytes past the base of the memory pool */
void *toku_mempool_get_pointer_from_base_and_offset(const struct mempool *mp, size_t offset);
/* get the offset from base of a pointer */
size_t toku_mempool_get_offset_from_pointer_and_base(const struct mempool *mp, void* p);
/* get a pointer to the first free byte (if any) */
void* toku_mempool_get_next_free_ptr(const struct mempool *mp);
/* get the size of the memory pool */
size_t toku_mempool_get_size(struct mempool *mp);
size_t toku_mempool_get_size(const struct mempool *mp);
/* get the amount of fragmented (wasted) space in the memory pool */
size_t toku_mempool_get_frag_size(struct mempool *mp);
size_t toku_mempool_get_frag_size(const struct mempool *mp);
/* get the amount of space that is holding useful data */
size_t toku_mempool_get_used_space(struct mempool *mp);
size_t toku_mempool_get_used_space(const struct mempool *mp);
/* get the amount of space that is available for new data */
size_t toku_mempool_get_free_space(struct mempool *mp);
size_t toku_mempool_get_free_space(const struct mempool *mp);
/* get the amount of space that has been allocated for use (wasted or not) */
size_t toku_mempool_get_allocated_space(struct mempool *mp);
size_t toku_mempool_get_allocated_space(const struct mempool *mp);
/* allocate a chunk of memory from the memory pool suitably aligned */
void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment);
......@@ -160,6 +173,8 @@ static inline int toku_mempool_inrange(struct mempool *mp, void *vp, size_t size
/* get memory footprint */
size_t toku_mempool_footprint(struct mempool *mp);
void toku_mempool_clone(struct mempool* orig_mp, struct mempool* new_mp);
void toku_mempool_clone(const struct mempool* orig_mp, struct mempool* new_mp);
#endif // UTIL_MEMPOOL_H