......@@ -33,7 +33,7 @@ typedef struct flusher_advice FLUSHER_ADVICE;
typedef int (*FA_PICK_CHILD)(FT h, FTNODE parent, void* extra);
* Decide whether to call `flush_some_child` on the child if it is
* Decide whether to call `toku_ft_flush_some_child` on the child if it is
* stable and a nonleaf node.
* Flusher threads: yes if child is gorged
......@@ -76,9 +76,9 @@ typedef bool (*FA_SHOULD_DESTROY_BN)(void* extra);
* Update `ft_flusher_status` in whatever way necessary. Called once
* by `flush_some_child` right before choosing what to do next (split,
* by `toku_ft_flush_some_child` right before choosing what to do next (split,
* merge, recurse), with the number of nodes that were dirtied by this
* execution of `flush_some_child`.
* execution of `toku_ft_flush_some_child`.
typedef void (*FA_UPDATE_STATUS)(FTNODE child, int dirtied, void* extra);
......@@ -109,17 +109,6 @@ struct flusher_advice {
void* extra; // parameter passed into callbacks
// FIXME all of these need the toku prefix
// how about:
// toku_ftnode_flush_some_child()
// toku_fa_flusher_advice_init()
// toku_fa_always_recursively_flush()
// toku_fa_dont_destroy_basement_nodes()
// toku_fa_default_merge_child()
// toku_fa_default_pick_child_after_split()
struct flusher_advice *fa,
......@@ -132,11 +121,11 @@ flusher_advice_init(
void* extra
FT h,
void toku_ft_flush_some_child(
FT ft,
FTNODE parent,
struct flusher_advice *fa);
struct flusher_advice *fa
always_recursively_flush(FTNODE child, void* extra);
This diff is collapsed.
......@@ -65,11 +65,11 @@ toku_flusher_thread_set_callback(
* Puts a workitem on the flusher thread queue, scheduling the node to be
* flushed by flush_some_child.
* flushed by toku_ft_flush_some_child.
FT h,
FT ft,
FTNODE parent
......@@ -141,7 +141,7 @@ hot_update_flusher_keys(FTNODE parent,
// Picks which child flush_some_child will use for flushing and
// Picks which child toku_ft_flush_some_child will use for flushing and
// recursion.
static int
hot_pick_child(FT h,
......@@ -308,7 +308,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
// This should recurse to the bottom of the tree and then
// return.
if (root->height > 0) {
flush_some_child(brt->ft, root, &advice);
toku_ft_flush_some_child(brt->ft, root, &advice);
} else {
// Since there are no children to flush, we should abort
// the HOT call.
......@@ -318,7 +318,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
// Set the highest pivot key seen here, since the parent may
// be unlocked and NULL'd later in our caller:
// flush_some_child().
// toku_ft_flush_some_child().
// This is where we determine if the traversal is finished or
......@@ -137,7 +137,7 @@ long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp);
void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child);
void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID oldest_referenced_xid);
bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull));
bool toku_ft_nonleaf_is_gorged(FTNODE node, uint32_t nodesize);
......@@ -246,6 +246,17 @@ struct ftnode {
unsigned int totalchildkeylens;
DBT *childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Child 1's keys are > childkeys[0]. */
// What's the oldest referenced xid that this node knows about? The real oldest
// referenced xid might be younger, but this is our best estimate. We use it
// as a heuristic to transition provisional mvcc entries from provisional to
// committed (from implicitly committed to really committed).
// A better heuristic would be the oldest live txnid, but we use this since it
// still works well most of the time, and it's readily available on the inject
// code path.
TXNID oldest_known_referenced_xid;
// array of size n_children, consisting of ftnode partitions
// each one is associated with a child
// for internal nodes, the ith partition corresponds to the ith message buffer
......@@ -606,8 +617,6 @@ void toku_destroy_ftnode_internals(FTNODE node);
void toku_ftnode_free (FTNODE *node);
bool is_entire_node_in_memory(FTNODE node);
void toku_assert_entire_node_in_memory(FTNODE node);
// FIXME needs toku prefix
void bring_node_fully_into_memory(FTNODE node, FT h);
// append a child node to a parent node
void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey);
......@@ -1092,7 +1101,6 @@ toku_ft_leaf_apply_cmd (
FTNODE node,
int target_childnum,
FT_MSG cmd,
TXNID oldest_referenced_xid,
uint64_t *workdone,
STAT64INFO stats_to_update
......@@ -1107,7 +1115,6 @@ toku_ft_node_put_cmd (
FT_MSG cmd,
bool is_fresh,
size_t flow_deltas[],
TXNID oldest_referenced_xid,
STAT64INFO stats_to_update
This diff is collapsed.
......@@ -379,6 +379,7 @@ serialize_ft_min_size (uint32_t version) {
size_t size = 0;
switch(version) {
size += sizeof(MSN); // max_msn_in_ft
......@@ -146,7 +146,6 @@ int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char
......@@ -912,3 +912,64 @@ void toku_ft_get_compression_method(FT ft, enum toku_compression_method *methodp
void toku_ft_set_blackhole(FT_HANDLE ft_handle) {
ft_handle->ft->blackhole = true;
struct garbage_helper_extra {
FT ft;
size_t total_space;
size_t used_space;
static int
garbage_leafentry_helper(OMTVALUE v, uint32_t UU(idx), void *extra) {
struct garbage_helper_extra *CAST_FROM_VOIDP(info, extra);
info->total_space += leafentry_disksize(le);
info->used_space += LE_CLEAN_MEMSIZE(le_latest_keylen(le), le_latest_vallen(le));
return 0;
static int
garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *extra) {
struct garbage_helper_extra *CAST_FROM_VOIDP(info, extra);
FTNODE node;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->ft);
int fd = toku_cachefile_get_fd(info->ft->cf);
int r = toku_deserialize_ftnode_from(fd, blocknum, 0, &node, &ndd, &bfe);
if (r != 0) {
goto no_node;
if (node->height > 0) {
goto exit;
for (int i = 0; i < node->n_children; ++i) {
BASEMENTNODE bn = BLB(node, i);
r = toku_omt_iterate(bn->buffer, garbage_leafentry_helper, info);
if (r != 0) {
goto exit;
return r;
void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space) {
// Effect: Iterates the FT's blocktable and calculates the total and used space for leaf blocks.
// Note: It is ok to call this function concurrently with reads/writes to the table since
// the blocktable lock is held, which means no new allocations or file writes can occur.
struct garbage_helper_extra info = {
.ft = ft,
.total_space = 0,
.used_space = 0
toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, garbage_helper, &info, true, true);
*total_space = info.total_space;
*used_space = info.used_space;
......@@ -104,4 +104,8 @@ void toku_node_save_ct_pair(CACHEKEY UU(key), void *value_data, PAIR p);
// mark the ft as a blackhole. any message injections will be a no op.
void toku_ft_set_blackhole(FT_HANDLE ft_handle);
// Effect: Calculates the total space and used space for a FT's leaf data.
// The difference between the two is MVCC garbage.
void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space);
......@@ -30,6 +30,7 @@ enum ft_layout_version_e {
// last_xid to shutdown
FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header,
// Removed log suppression logentry
FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
FT_NEXT_VERSION, // the version after the current version
FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
......@@ -367,6 +367,7 @@ serialize_ftnode_info_size(FTNODE node)
retval += 4; // nodesize
retval += 4; // flags
retval += 4; // height;
retval += 8; // oldest_known_referenced_xid
retval += node->totalchildkeylens; // total length of pivots
retval += (node->n_children-1)*4; // encode length of each pivot
if (node->height > 0) {
......@@ -390,6 +391,8 @@ static void serialize_ftnode_info(FTNODE node,
wbuf_nocrc_uint(&wb, 0); // write a dummy value for where node->nodesize used to be
wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height);
wbuf_TXNID(&wb, node->oldest_known_referenced_xid);
// pivot information
for (int i = 0; i < node->n_children-1; i++) {
wbuf_nocrc_bytes(&wb, node->childkeys[i].data, node->childkeys[i].size);
......@@ -1264,6 +1267,9 @@ deserialize_ftnode_info(
if (node->layout_version_read_from_disk < FT_LAYOUT_VERSION_19) {
(void) rbuf_int(&rb); // optimized_for_upgrade
if (node->layout_version_read_from_disk >= FT_LAYOUT_VERSION_22) {
rbuf_TXNID(&rb, &node->oldest_known_referenced_xid);
// now create the basement nodes or childinfos, depending on whether this is a
// leaf node or internal node
......@@ -1505,6 +1511,17 @@ check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_blo
return r;
static FTNODE alloc_ftnode_for_deserialize(uint32_t fullhash, BLOCKNUM blocknum) {
// Effect: Allocate an FTNODE and fill in the values that are known and not read from the serialized data.
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
node->bp = nullptr;
node->oldest_known_referenced_xid = TXNID_NONE;
return node;
static int
deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
......@@ -1518,13 +1535,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
// Return 0 if it worked. If something goes wrong (including that we are looking at some old data format that doesn't have partitions) then return nonzero.
int r = 0;
// fill in values that are known and not stored in rb
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
node->bp = NULL; // fill this in so we can free without a leak.
FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
if (rb->size < 24) {
// TODO: What error do we return here?
......@@ -2171,15 +2182,10 @@ deserialize_ftnode_from_rbuf(
// Effect: deserializes a ftnode that is in rb (with pointer of rb just past the magic) into a FTNODE.
int r = 0;
struct sub_block sb_node_info;
// fill in values that are known and not stored in rb
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
// now start reading from rbuf
// first thing we do is read the header information
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
......@@ -280,61 +280,14 @@ dump_nodesizes(int f, FT h) {
printf("leafsizes: %" PRIu64 "\n", info.leafsizes);
typedef struct {
int f;
FT h;
size_t total_space;
size_t used_space;
} garbage_help_extra;
static int
garbage_leafentry_helper(OMTVALUE v, uint32_t UU(idx), void *extra) {
garbage_help_extra *CAST_FROM_VOIDP(info, extra);
info->total_space += leafentry_disksize(le);
info->used_space += LE_CLEAN_MEMSIZE(le_latest_keylen(le), le_latest_vallen(le));
return 0;
static int
garbage_helper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) {
garbage_help_extra *CAST_FROM_VOIDP(info, extra);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->h);
int r = toku_deserialize_ftnode_from(info->f, b, 0, &n, &ndd, &bfe);
if (r != 0) {
goto no_node;
if (n->height > 0) {
goto exit;
for (int i = 0; i < n->n_children; ++i) {
r = toku_omt_iterate(bn->buffer, garbage_leafentry_helper, info);
if (r != 0) {
goto exit;
return r;
static void
dump_garbage_stats(int f, FT h) {
garbage_help_extra info;
memset(&info, 0, sizeof info);
info.f = f;
info.h = h;
toku_blocktable_iterate(h->blocktable, TRANSLATION_CHECKPOINTED,
garbage_helper, &info, true, true);
printf("total_size: %zu\n", info.total_space);
printf("used_size: %zu\n", info.used_space);
dump_garbage_stats(int f, FT ft) {
    // Effect: Prints the total and used leaf space of the FT, as reported by
    //         toku_ft_get_garbage().
    // f is only used to check that it matches the fd of the FT's cachefile.
    invariant(f == toku_cachefile_get_fd(ft->cf));
    uint64_t total_space = 0;
    uint64_t used_space = 0;
    toku_ft_get_garbage(ft, &total_space, &used_space);
    // The counters are uint64_t, so they must be printed with PRIu64;
    // %zu is only correct for size_t, which need not be the same type.
    printf("total_size: %" PRIu64 "\n", total_space);
    printf("used_size: %" PRIu64 "\n", used_space);
static uint32_t
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......@@ -10,11 +12,12 @@
#include <util/mempool.h>
#include "txn_manager.h"
#include "rbuf.h"
#include "x1764.h"
#include "omt.h"
#if 0
Memory format of packed leaf entry
......@@ -35,10 +38,7 @@
#pragma pack(push, 1)
// enum of possible values for LEAFENTRY->type field
......@@ -94,9 +94,6 @@ struct __attribute__ ((__packed__)) leafentry {
static_assert(10 == sizeof(leafentry), "leafentry size is wrong");
static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong place");
#pragma pack(pop)
#define LE_CLEAN_MEMSIZE(_keylen, _vallen) \
(sizeof(((LEAFENTRY)NULL)->type) /* type */ \
......@@ -123,6 +120,10 @@ static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong plac
typedef struct leafentry *LEAFENTRY;
typedef struct leafentry_13 *LEAFENTRY_13;
// TODO: consistency among names is very poor.
size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk.
void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
......@@ -144,19 +145,6 @@ void* le_key_and_len (LEAFENTRY le, uint32_t *len);
uint64_t le_outermost_uncommitted_xid (LEAFENTRY le);
le_committed_mvcc(uint8_t *key, uint32_t keylen,
uint8_t *val, uint32_t vallen,
TXNID xid,
void (*bytes)(struct dbuf *dbuf, const void *bytes, int nbytes),
struct dbuf *d);
le_clean(uint8_t *key, uint32_t keylen,
uint8_t *val, uint32_t vallen,
void (*bytes)(struct dbuf *dbuf, const void *bytes, int nbytes),
struct dbuf *d);
//Callback contract:
// Function checks to see if id is accepted by context.
// Returns:
......@@ -169,9 +157,9 @@ int le_iterate_is_del(LEAFENTRY le, LE_ITERATE_CALLBACK f, bool *is_empty, TOKUT
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, uint32_t *vallenp, TOKUTXN context);
leafentry_disksize_13(LEAFENTRY_13 le);
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize,
......@@ -179,7 +167,28 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored
OMT *omtp,
struct mempool *mp);
void toku_le_apply_msg(FT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p);
bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_known_referenced_xid);
void toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns,
TXNID oldest_known_referenced_xid);
#endif /* TOKU_LEAFENTRY_H */
......@@ -48,7 +48,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
brt->ft->h->max_msn_in_ft = msn;
FT_MSG_S cmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &theval }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd, TXNID_NONE, nullptr, nullptr);
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd, nullptr, nullptr);
int r = toku_ft_lookup(brt, &thekey, lookup_checkf, &pair);
......@@ -56,7 +56,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
FT_MSG_S badcmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &badcmd, TXNID_NONE, nullptr, nullptr);
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &badcmd, nullptr, nullptr);
// message should be rejected for duplicate msn, row should still have original val
......@@ -69,7 +69,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
msn = next_dummymsn();
brt->ft->h->max_msn_in_ft = msn;
FT_MSG_S cmd2 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &val2 }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd2, TXNID_NONE, nullptr, nullptr);
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd2, nullptr, nullptr);
// message should be accepted, val should have new value
......@@ -81,7 +81,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
// now verify that message with lesser (older) msn is rejected
msn.msn = msn.msn - 10;
FT_MSG_S cmd3 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval } }};
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd3, TXNID_NONE, nullptr, nullptr);
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd3, nullptr, nullptr);
// message should be rejected, val should still have value in pair2
......@@ -123,8 +123,7 @@ insert_random_message_to_bn(FT_HANDLE t, BASEMENTNODE blb, LEAFENTRY *save, XIDS = valdbt;
size_t memsize;
int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
toku_le_apply_msg(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb, &msg, TXNID_NONE, NULL, NULL);
if (msn.msn > blb->max_msn_applied.msn) {
blb->max_msn_applied = msn;
......@@ -164,8 +163,7 @@ insert_same_message_to_bns(FT_HANDLE t, BASEMENTNODE blb1, BASEMENTNODE blb2, LE = valdbt;
size_t memsize;
int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
toku_le_apply_msg(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb1, &msg, TXNID_NONE, NULL, NULL);
if (msn.msn > blb1->max_msn_applied.msn) {
blb1->max_msn_applied = msn;
......@@ -274,7 +272,7 @@ flush_to_internal(FT_HANDLE t) {
set_BNC(child, 0, child_bnc);
BP_STATE(child, 0) = PT_AVAIL;
toku_bnc_flush_to_child(t->ft, parent_bnc, child);
toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
int parent_messages_present[num_parent_messages];
int child_messages_present[num_child_messages];
......@@ -409,7 +407,7 @@ flush_to_internal_multiple(FT_HANDLE t) {
toku_bnc_flush_to_child(t->ft, parent_bnc, child);
toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
int total_messages = 0;
for (i = 0; i < 8; ++i) {
......@@ -580,7 +578,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], TXNID_NONE, NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
for (i = 0; i < 8; ++i) {
......@@ -601,7 +599,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
if (use_flush) {
toku_bnc_flush_to_child(t->ft, parent_bnc, child);
toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
} else {
FTNODE XMALLOC(parentnode);
......@@ -803,7 +801,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (dummy_cmp(NULL, parent_messages[i]->, &childkeys[7]) <= 0 &&
!parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], TXNID_NONE, NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
for (i = 0; i < 8; ++i) {
......@@ -995,8 +993,8 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, -1, parent_messages[i], TXNID_NONE, NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, -1, parent_messages[i], TXNID_NONE, NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, -1, parent_messages[i], NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, -1, parent_messages[i], NULL, NULL);
for (i = 0; i < 8; ++i) {
......@@ -1010,7 +1008,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
toku_bnc_flush_to_child(t->ft, parent_bnc, child1);
toku_bnc_flush_to_child(t->ft, parent_bnc, child1, TXNID_NONE);
FTNODE XMALLOC(parentnode);
BLOCKNUM parentblocknum = { 17 };
......@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child
// callback functions for toku_ft_flush_some_child
static bool
dont_destroy_bn(void* UU(extra))
......@@ -160,7 +160,7 @@ doit (bool after_child_pin) {
assert(toku_bnc_nbytesinbuf(BNC(node, 0)) > 0);
// do the flush
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
// now let's pin the root again and make sure it is flushed
......@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child
// callback functions for toku_ft_flush_some_child
static bool
dont_destroy_bn(void* UU(extra))
......@@ -177,7 +177,7 @@ doit (int state) {
assert(node->n_children == 2);
// do the flush
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
// now let's pin the root again and make sure it has merged
......@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child
// callback functions for toku_ft_flush_some_child
static bool
dont_destroy_bn(void* UU(extra))
......@@ -197,7 +197,7 @@ doit (int state) {
assert(node->n_children == 2);
// do the flush
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
// now let's pin the root again and make sure it has rebalanced
......@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child
// callback functions for toku_ft_flush_some_child
static bool
dont_destroy_bn(void* UU(extra))
......@@ -173,7 +173,7 @@ doit (bool after_split) {
assert(node->n_children == 1);
// do the flush
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
// now let's pin the root again and make sure it has split
......@@ -396,14 +396,13 @@ test_le_apply(ULE ule_initial, FT_MSG msg, ULE ule_expected) {
size_t result_memsize;
int64_t ignoreme;
r = apply_msg_to_leafentry(msg,
NULL, NULL, &ignoreme);
NULL, NULL, &ignoreme);
if (le_result)
le_verify_accessors(le_result, ule_expected, result_memsize);
......@@ -702,6 +701,111 @@ test_le_apply_messages(void) {
static bool ule_worth_running_garbage_collection(ULE ule, TXNID oldest_known_referenced_xid) {
size_t initial_memsize;
int r = le_pack(ule, &initial_memsize, &le, nullptr, nullptr, nullptr); CKERR(r);
bool worth_running = toku_le_worth_running_garbage_collection(le, oldest_known_referenced_xid);
return worth_running;
static void test_le_garbage_collection_birdie(void) {
DBT key;
DBT val;
ULE_S ule_initial;
ULE_S ule_expected;
uint8_t keybuf[MAX_SIZE];
uint32_t keysize=8;
uint8_t valbuf[MAX_SIZE];
uint32_t valsize=8;
ule_initial.uxrs = ule_initial.uxrs_static;
ule_expected.uxrs = ule_expected.uxrs_static;
memset(&key, 0, sizeof(key));
memset(&val, 0, sizeof(val));
bool do_garbage_collect;
fillrandom(keybuf, keysize);
fillrandom(valbuf, valsize);
// Test garbage collection "worth-doing" heuristic
// Garbage collection should not be worth doing on a clean leafentry.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 0;
ule_initial.uxrs[0].xid = TXNID_NONE;
ule_initial.uxrs[0].type = XR_INSERT;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is worth doing when there is more than one committed entry
ule_initial.num_cuxrs = 2;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is not worth doing when there is one of each, when the
// provisional entry is newer than the oldest known referenced xid
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 1500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
ule_initial.uxrs[1].xid = 200;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is not worth doing when there is only one committed entry,
// multiple provisional entries, but the outermost entry is newer.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 3;
ule_initial.uxrs[1].xid = 201;
ule_initial.uxrs[2].xid = 206;
ule_initial.uxrs[3].xid = 215;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is worth doing when the above scenario has an outermost entry
// older than the oldest known, even if its children seem newer.
// These children must have committed because the parent is not live.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 3;
ule_initial.uxrs[1].xid = 190;
ule_initial.uxrs[2].xid = 206;
ule_initial.uxrs[3].xid = 215;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is worth doing when there is more than one committed entry,
// even if a provisional entry exists that is newer than the
// oldest known referenced xid
ule_initial.num_cuxrs = 2;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 499;
ule_initial.uxrs[2].xid = 500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is worth doing when there is one of each, and the provisional
// entry is older than the oldest known referenced xid
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 199;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
// It is definitely worth doing when the above case is true
// and there is more than one provisional entry.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 2;
ule_initial.uxrs[1].xid = 150;
ule_initial.uxrs[2].xid = 175;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
static void test_le_optimize(void) {
FT_MSG_S msg;
......@@ -900,6 +1004,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_
return 0;
......@@ -162,7 +162,7 @@ doit (void) {
// now with setup done, start the test
// test that if flush_some_child properly honors
// test that if toku_ft_flush_some_child properly honors
// what we say and flushes the child we pick
toku_pin_node_with_min_bfe(&node, node_internal, t);
......@@ -185,7 +185,7 @@ doit (void) {
curr_child_to_flush = 0;
num_flushes_called = 0;
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
......@@ -203,7 +203,7 @@ doit (void) {
curr_child_to_flush = 1;
num_flushes_called = 0;
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
......@@ -221,7 +221,7 @@ doit (void) {
curr_child_to_flush = 0;
num_flushes_called = 0;
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t);
......@@ -250,7 +250,7 @@ doit (void) {
toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = i;
num_flushes_called = 0;
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 2);
toku_pin_node_with_min_bfe(&node, node_internal, t);
......@@ -296,7 +296,7 @@ doit (void) {
toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = 0;
num_flushes_called = 0;
flush_some_child(t->ft, node, &fa);
toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 2);
r = toku_close_ft_handle_nolsn(t, 0); assert(r==0);
......@@ -88,15 +88,13 @@ void toku_ule_free(ULEHANDLE ule_p) {
// Question: Can any software outside this file modify or read a leafentry?
// If so, is it worthwhile to put it all here?
// There are two entries, one each for modification and query:
// apply_msg_to_leafentry() performs all inserts/deletes/aborts
// toku_le_apply_msg() performs all inserts/deletes/aborts
......@@ -122,6 +120,7 @@ static void msg_init_empty_ule(ULE ule, FT_MSG msg);
static void msg_modify_ule(ULE ule, FT_MSG msg);
static void ule_init_empty_ule(ULE ule, uint32_t keylen, void * keyp);
static void ule_do_implicit_promotions(ULE ule, XIDS xids);
static void ule_try_promote_provisional_outermost(ULE ule, TXNID oldest_possible_live_xid);
static void ule_promote_provisional_innermost_to_index(ULE ule, uint32_t index);
static void ule_promote_provisional_innermost_to_committed(ULE ule);
static void ule_apply_insert(ULE ule, XIDS xids, uint32_t vallen, void * valp);
......@@ -165,8 +164,6 @@ le_malloc(OMT *omtp, struct mempool *mp, size_t size, void **maybe_free)
return rval;
// Garbage collection related functions
......@@ -215,7 +212,7 @@ xid_reads_committed_xid(TXNID tl1, TXNID xc, const xid_omt_t &snapshot_txnids, c
// so we get rid of them.
static void
simple_garbage_collection(ULE ule, TXNID oldest_referenced_xid) {
ule_simple_garbage_collection(ULE ule, TXNID oldest_referenced_xid) {
uint32_t curr_index = 0;
uint32_t num_entries;
if (ule->num_cuxrs == 1 || oldest_referenced_xid == TXNID_NONE) {
......@@ -244,7 +241,7 @@ done:;
static void
garbage_collection(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &referenced_xids, const xid_omt_t &live_root_txns) {
ule_garbage_collect(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &referenced_xids, const xid_omt_t &live_root_txns) {
if (ule->num_cuxrs == 1) goto done;
// will fail if too many num_cuxrs
bool necessary_static[MAX_TRANSACTION_RECORDS];
......@@ -340,7 +337,6 @@ garbage_collection(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &refe
// This is the big enchilada. (Bring Tums.) Note that this level of abstraction
// has no knowledge of the inner structure of either leafentry or msg. It makes
......@@ -353,42 +349,60 @@ done:;
// If the leafentry is destroyed it sets *new_leafentry_p to NULL.
// Otherwise the new_leafentry_p points at the new leaf entry.
// As of October 2011, this function always returns 0.
apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry
LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p) { // change in total size of key and val, not including any overhead
toku_le_apply_msg(FT_MSG msg, // message to apply to leafentry
LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p) { // change in total size of key and val, not including any overhead
ULE_S ule;
int rval;
int64_t oldnumbytes = 0;
int64_t newnumbytes = 0;
if (old_leafentry == NULL) // if leafentry does not exist ...
msg_init_empty_ule(&ule, msg); // ... create empty unpacked leaf entry
else {
if (old_leafentry == NULL) {
msg_init_empty_ule(&ule, msg);
} else {
le_unpack(&ule, old_leafentry); // otherwise unpack leafentry
oldnumbytes = ule_get_innermost_numbytes(&ule);
msg_modify_ule(&ule, msg); // modify unpacked leafentry
simple_garbage_collection(&ule, oldest_referenced_xid);
rval = le_pack(&ule, // create packed leafentry
if (new_leafentry_p)
ule_simple_garbage_collection(&ule, oldest_referenced_xid);
int rval = le_pack(&ule, // create packed leafentry
if (new_leafentry_p) {
newnumbytes = ule_get_innermost_numbytes(&ule);
*numbytes_delta_p = newnumbytes - oldnumbytes;
return rval;
bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_known_referenced_xid) {
// Effect: Quickly determines if it's worth trying to run garbage collection on a leafentry
// Return: True if it makes sense to try garbage collection, false otherwise.
// Rationale: Garbage collection is likely to clean up under two circumstances:
// 1.) There are multiple committed entries. Some may never be read by new txns.
// 2.) There is only one committed entry, but the outermost provisional entry
// is older than the oldest known referenced xid, so it must have committed.
// Therefore we can promote it to committed and get rid of the old committed entry.
// Leafentries whose type is not LE_MVCC are never reported as worth collecting.
if (le->type != LE_MVCC) {
return false;
// Circumstance 1: more than one committed entry.
if (le->u.mvcc.num_cxrs > 1) {
return true;
} else {
// Circumstance 2: exactly one committed entry. Worth it only when at least one
// provisional entry exists and the xid returned by le_outermost_uncommitted_xid()
// is strictly less than oldest_known_referenced_xid.
paranoid_invariant(le->u.mvcc.num_cxrs == 1);
return le->u.mvcc.num_pxrs > 0 && le_outermost_uncommitted_xid(le) < oldest_known_referenced_xid;
// Garbage collect one leaf entry, using the given OMT's.
// Parameters:
......@@ -408,29 +422,38 @@ apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry
// -- referenced_xids : list of in memory active transactions.
// NOTE: it is not a good idea to garbage collect a leaf
// entry with only one committed value.
garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns) {
int r = 0;
toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns,
TXNID oldest_known_referenced_xid) {
ULE_S ule;
le_unpack(&ule, old_leaf_entry);
garbage_collection(&ule, snapshot_xids, referenced_xids, live_root_txns);
r = le_pack(&ule,
// Before running garbage collection, try to promote the outermost provisional
// entry to committed if its xid is older than the oldest possible live xid.
// The oldest known referenced xid is a lower bound on the oldest possible
// live xid, so we use that. It's usually close enough to get rid of most
// garbage in leafentries.
TXNID oldest_possible_live_xid = oldest_known_referenced_xid;
ule_try_promote_provisional_outermost(&ule, oldest_possible_live_xid);
ule_garbage_collect(&ule, snapshot_xids, referenced_xids, live_root_txns);
int r = le_pack(&ule,
assert(r == 0);
return r;
......@@ -1529,6 +1552,15 @@ ule_promote_provisional_innermost_to_committed(ULE ule) {
static void
ule_try_promote_provisional_outermost(ULE ule, TXNID oldest_possible_live_xid) {
// Effect: If there is a provisional record whose outermost xid is older than
// the oldest known referenced_xid, promote it to committed.
if (ule->num_puxrs > 0 && ule_get_xid(ule, ule->num_cuxrs) < oldest_possible_live_xid) {
// Purpose is to promote the value (and type) of the innermost transaction
// record to the uxr at the specified index (keeping the txnid of the uxr at
// specified index.)
......@@ -45,31 +45,4 @@ TXNID uxr_get_txnid(UXRHANDLE uxr);
//1 does much slower debugging
void fast_msg_to_leafentry(
FT_MSG msg, // message to apply to leafentry
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p) ;
int apply_msg_to_leafentry(FT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p);
int garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns);
#endif // TOKU_ULE_H
......@@ -740,6 +740,9 @@ static int random_put_in_db(DB *db, DB_TXN *txn, ARG arg, bool ignore_errors, vo
} else {
rand_key_i[0] = arg->thread_idx;
if (arg->cli->num_elements > 0 && arg->bounded_element_range) {
rand_key_key[0] = rand_key_key[0] % arg->cli->num_elements;
fill_zeroed_array(valbuf, arg->cli->val_size, arg->random_data, arg->cli->compressibility);
DBT key, val;
dbt_init(&key, &rand_key_b, sizeof rand_key_b);
