Commit 4233ec1e authored by John Esmet's avatar John Esmet Committed by Yoni Fogel

refs #5559 merge 5559 to main


git-svn-id: file:///svn/toku/tokudb@50812 c7de825b-a66e-492c-adef-691d508d4ae1
parent b1d3e831
...@@ -33,7 +33,7 @@ typedef struct flusher_advice FLUSHER_ADVICE; ...@@ -33,7 +33,7 @@ typedef struct flusher_advice FLUSHER_ADVICE;
typedef int (*FA_PICK_CHILD)(FT h, FTNODE parent, void* extra); typedef int (*FA_PICK_CHILD)(FT h, FTNODE parent, void* extra);
/** /**
* Decide whether to call `flush_some_child` on the child if it is * Decide whether to call `toku_ft_flush_some_child` on the child if it is
* stable and a nonleaf node. * stable and a nonleaf node.
* *
* Flusher threads: yes if child is gorged * Flusher threads: yes if child is gorged
...@@ -76,9 +76,9 @@ typedef bool (*FA_SHOULD_DESTROY_BN)(void* extra); ...@@ -76,9 +76,9 @@ typedef bool (*FA_SHOULD_DESTROY_BN)(void* extra);
/** /**
* Update `ft_flusher_status` in whatever way necessary. Called once * Update `ft_flusher_status` in whatever way necessary. Called once
* by `flush_some_child` right before choosing what to do next (split, * by `toku_ft_flush_some_child` right before choosing what to do next (split,
* merge, recurse), with the number of nodes that were dirtied by this * merge, recurse), with the number of nodes that were dirtied by this
* execution of `flush_some_child`. * execution of `toku_ft_flush_some_child`.
*/ */
typedef void (*FA_UPDATE_STATUS)(FTNODE child, int dirtied, void* extra); typedef void (*FA_UPDATE_STATUS)(FTNODE child, int dirtied, void* extra);
...@@ -109,17 +109,6 @@ struct flusher_advice { ...@@ -109,17 +109,6 @@ struct flusher_advice {
void* extra; // parameter passed into callbacks void* extra; // parameter passed into callbacks
}; };
// FIXME all of these need the toku prefix
//
// how about:
//
// toku_ftnode_flush_some_child()
// toku_fa_flusher_advice_init()
// toku_fa_always_recursively_flush()
// toku_fa_dont_destroy_basement_nodes()
// toku_fa_default_merge_child()
// toku_fa_default_pick_child_after_split()
void void
flusher_advice_init( flusher_advice_init(
struct flusher_advice *fa, struct flusher_advice *fa,
...@@ -132,11 +121,11 @@ flusher_advice_init( ...@@ -132,11 +121,11 @@ flusher_advice_init(
void* extra void* extra
); );
void void toku_ft_flush_some_child(
flush_some_child( FT ft,
FT h,
FTNODE parent, FTNODE parent,
struct flusher_advice *fa); struct flusher_advice *fa
);
bool bool
always_recursively_flush(FTNODE child, void* extra); always_recursively_flush(FTNODE child, void* extra);
......
This diff is collapsed.
...@@ -65,11 +65,11 @@ toku_flusher_thread_set_callback( ...@@ -65,11 +65,11 @@ toku_flusher_thread_set_callback(
/** /**
* Puts a workitem on the flusher thread queue, scheduling the node to be * Puts a workitem on the flusher thread queue, scheduling the node to be
* flushed by flush_some_child. * flushed by toku_ft_flush_some_child.
*/ */
void void
flush_node_on_background_thread( toku_ft_flush_node_on_background_thread(
FT h, FT ft,
FTNODE parent FTNODE parent
); );
......
...@@ -141,7 +141,7 @@ hot_update_flusher_keys(FTNODE parent, ...@@ -141,7 +141,7 @@ hot_update_flusher_keys(FTNODE parent,
} }
} }
// Picks which child flush_some_child will use for flushing and // Picks which child toku_ft_flush_some_child will use for flushing and
// recursion. // recursion.
static int static int
hot_pick_child(FT h, hot_pick_child(FT h,
...@@ -308,7 +308,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, ...@@ -308,7 +308,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
// This should recurse to the bottom of the tree and then // This should recurse to the bottom of the tree and then
// return. // return.
if (root->height > 0) { if (root->height > 0) {
flush_some_child(brt->ft, root, &advice); toku_ft_flush_some_child(brt->ft, root, &advice);
} else { } else {
// Since there are no children to flush, we should abort // Since there are no children to flush, we should abort
// the HOT call. // the HOT call.
...@@ -318,7 +318,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, ...@@ -318,7 +318,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
// Set the highest pivot key seen here, since the parent may // Set the highest pivot key seen here, since the parent may
// be unlocked and NULL'd later in our caller: // be unlocked and NULL'd later in our caller:
// flush_some_child(). // toku_ft_flush_some_child().
hot_set_highest_key(&flusher); hot_set_highest_key(&flusher);
// This is where we determine if the traversal is finished or // This is where we determine if the traversal is finished or
......
...@@ -137,7 +137,7 @@ long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc); ...@@ -137,7 +137,7 @@ long toku_bnc_memory_size(NONLEAF_CHILDINFO bnc);
long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc); long toku_bnc_memory_used(NONLEAF_CHILDINFO bnc);
void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp); void toku_bnc_insert_msg(NONLEAF_CHILDINFO bnc, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, enum ft_msg_type type, MSN msn, XIDS xids, bool is_fresh, DESCRIPTOR desc, ft_compare_func cmp);
void toku_bnc_empty(NONLEAF_CHILDINFO bnc); void toku_bnc_empty(NONLEAF_CHILDINFO bnc);
void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child); void toku_bnc_flush_to_child(FT h, NONLEAF_CHILDINFO bnc, FTNODE child, TXNID oldest_referenced_xid);
bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull)); bool toku_bnc_should_promote(FT ft, NONLEAF_CHILDINFO bnc) __attribute__((const, nonnull));
bool toku_ft_nonleaf_is_gorged(FTNODE node, uint32_t nodesize); bool toku_ft_nonleaf_is_gorged(FTNODE node, uint32_t nodesize);
...@@ -246,6 +246,17 @@ struct ftnode { ...@@ -246,6 +246,17 @@ struct ftnode {
unsigned int totalchildkeylens; unsigned int totalchildkeylens;
DBT *childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. DBT *childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Child 1's keys are > childkeys[0]. */ Child 1's keys are > childkeys[0]. */
// What's the oldest referenced xid that this node knows about? The real oldest
// referenced xid might be younger, but this is our best estimate. We use it
// as a heuristic to transition provisional mvcc entries from provisional to
// committed (from implicity committed to really committed).
//
// A better heuristic would be the oldest live txnid, but we use this since it
// still works well most of the time, and its readily available on the inject
// code path.
TXNID oldest_known_referenced_xid;
// array of size n_children, consisting of ftnode partitions // array of size n_children, consisting of ftnode partitions
// each one is associated with a child // each one is associated with a child
// for internal nodes, the ith partition corresponds to the ith message buffer // for internal nodes, the ith partition corresponds to the ith message buffer
...@@ -606,8 +617,6 @@ void toku_destroy_ftnode_internals(FTNODE node); ...@@ -606,8 +617,6 @@ void toku_destroy_ftnode_internals(FTNODE node);
void toku_ftnode_free (FTNODE *node); void toku_ftnode_free (FTNODE *node);
bool is_entire_node_in_memory(FTNODE node); bool is_entire_node_in_memory(FTNODE node);
void toku_assert_entire_node_in_memory(FTNODE node); void toku_assert_entire_node_in_memory(FTNODE node);
// FIXME needs toku prefix
void bring_node_fully_into_memory(FTNODE node, FT h);
// append a child node to a parent node // append a child node to a parent node
void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey); void toku_ft_nonleaf_append_child(FTNODE node, FTNODE child, const DBT *pivotkey);
...@@ -1092,7 +1101,6 @@ toku_ft_leaf_apply_cmd ( ...@@ -1092,7 +1101,6 @@ toku_ft_leaf_apply_cmd (
FTNODE node, FTNODE node,
int target_childnum, int target_childnum,
FT_MSG cmd, FT_MSG cmd,
TXNID oldest_referenced_xid,
uint64_t *workdone, uint64_t *workdone,
STAT64INFO stats_to_update STAT64INFO stats_to_update
); );
...@@ -1107,7 +1115,6 @@ toku_ft_node_put_cmd ( ...@@ -1107,7 +1115,6 @@ toku_ft_node_put_cmd (
FT_MSG cmd, FT_MSG cmd,
bool is_fresh, bool is_fresh,
size_t flow_deltas[], size_t flow_deltas[],
TXNID oldest_referenced_xid,
STAT64INFO stats_to_update STAT64INFO stats_to_update
); );
......
This diff is collapsed.
...@@ -379,6 +379,7 @@ serialize_ft_min_size (uint32_t version) { ...@@ -379,6 +379,7 @@ serialize_ft_min_size (uint32_t version) {
size_t size = 0; size_t size = 0;
switch(version) { switch(version) {
case FT_LAYOUT_VERSION_22:
case FT_LAYOUT_VERSION_21: case FT_LAYOUT_VERSION_21:
size += sizeof(MSN); // max_msn_in_ft size += sizeof(MSN); // max_msn_in_ft
case FT_LAYOUT_VERSION_20: case FT_LAYOUT_VERSION_20:
......
...@@ -146,7 +146,6 @@ int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char ...@@ -146,7 +146,6 @@ int toku_testsetup_insert_to_leaf (FT_HANDLE brt, BLOCKNUM blocknum, const char
&cmd, &cmd,
true, true,
zero_flow_deltas, zero_flow_deltas,
TXNID_NONE,
NULL NULL
); );
......
...@@ -912,3 +912,64 @@ void toku_ft_get_compression_method(FT ft, enum toku_compression_method *methodp ...@@ -912,3 +912,64 @@ void toku_ft_get_compression_method(FT ft, enum toku_compression_method *methodp
void toku_ft_set_blackhole(FT_HANDLE ft_handle) { void toku_ft_set_blackhole(FT_HANDLE ft_handle) {
ft_handle->ft->blackhole = true; ft_handle->ft->blackhole = true;
} }
struct garbage_helper_extra {
FT ft;
size_t total_space;
size_t used_space;
};
static int
garbage_leafentry_helper(OMTVALUE v, uint32_t UU(idx), void *extra) {
struct garbage_helper_extra *CAST_FROM_VOIDP(info, extra);
LEAFENTRY CAST_FROM_VOIDP(le, v);
info->total_space += leafentry_disksize(le);
info->used_space += LE_CLEAN_MEMSIZE(le_latest_keylen(le), le_latest_vallen(le));
return 0;
}
static int
garbage_helper(BLOCKNUM blocknum, int64_t UU(size), int64_t UU(address), void *extra) {
struct garbage_helper_extra *CAST_FROM_VOIDP(info, extra);
FTNODE node;
FTNODE_DISK_DATA ndd;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->ft);
int fd = toku_cachefile_get_fd(info->ft->cf);
int r = toku_deserialize_ftnode_from(fd, blocknum, 0, &node, &ndd, &bfe);
if (r != 0) {
goto no_node;
}
if (node->height > 0) {
goto exit;
}
for (int i = 0; i < node->n_children; ++i) {
BASEMENTNODE bn = BLB(node, i);
r = toku_omt_iterate(bn->buffer, garbage_leafentry_helper, info);
if (r != 0) {
goto exit;
}
}
exit:
toku_ftnode_free(&node);
toku_free(ndd);
no_node:
return r;
}
void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space) {
// Effect: Iterates the FT's blocktable and calculates the total and used space for leaf blocks.
// Note: It is ok to call this function concurrently with reads/writes to the table since
// the blocktable lock is held, which means no new allocations or file writes can occur.
invariant_notnull(total_space);
invariant_notnull(used_space);
struct garbage_helper_extra info = {
.ft = ft,
.total_space = 0,
.used_space = 0
};
toku_blocktable_iterate(ft->blocktable, TRANSLATION_CHECKPOINTED, garbage_helper, &info, true, true);
*total_space = info.total_space;
*used_space = info.used_space;
}
...@@ -104,4 +104,8 @@ void toku_node_save_ct_pair(CACHEKEY UU(key), void *value_data, PAIR p); ...@@ -104,4 +104,8 @@ void toku_node_save_ct_pair(CACHEKEY UU(key), void *value_data, PAIR p);
// mark the ft as a blackhole. any message injections will be a no op. // mark the ft as a blackhole. any message injections will be a no op.
void toku_ft_set_blackhole(FT_HANDLE ft_handle); void toku_ft_set_blackhole(FT_HANDLE ft_handle);
// Effect: Calculates the total space and used space for a FT's leaf data.
// The difference between the two is MVCC garbage.
void toku_ft_get_garbage(FT ft, uint64_t *total_space, uint64_t *used_space);
#endif #endif
...@@ -30,6 +30,7 @@ enum ft_layout_version_e { ...@@ -30,6 +30,7 @@ enum ft_layout_version_e {
// last_xid to shutdown // last_xid to shutdown
FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header, FT_LAYOUT_VERSION_21 = 21, // Ming: Add max_msn_in_ft to header,
// Removed log suppression logentry // Removed log suppression logentry
FT_LAYOUT_VERSION_22 = 22, // Ming: Add oldest known referenced xid to each ftnode, for better garbage collection
FT_NEXT_VERSION, // the version after the current version FT_NEXT_VERSION, // the version after the current version
FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line. FT_LAYOUT_VERSION = FT_NEXT_VERSION-1, // A hack so I don't have to change this line.
FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported FT_LAYOUT_MIN_SUPPORTED_VERSION = FT_LAYOUT_VERSION_13, // Minimum version supported
......
...@@ -367,6 +367,7 @@ serialize_ftnode_info_size(FTNODE node) ...@@ -367,6 +367,7 @@ serialize_ftnode_info_size(FTNODE node)
retval += 4; // nodesize retval += 4; // nodesize
retval += 4; // flags retval += 4; // flags
retval += 4; // height; retval += 4; // height;
retval += 8; // oldest_known_referenced_xid
retval += node->totalchildkeylens; // total length of pivots retval += node->totalchildkeylens; // total length of pivots
retval += (node->n_children-1)*4; // encode length of each pivot retval += (node->n_children-1)*4; // encode length of each pivot
if (node->height > 0) { if (node->height > 0) {
...@@ -390,6 +391,8 @@ static void serialize_ftnode_info(FTNODE node, ...@@ -390,6 +391,8 @@ static void serialize_ftnode_info(FTNODE node,
wbuf_nocrc_uint(&wb, 0); // write a dummy value for where node->nodesize used to be wbuf_nocrc_uint(&wb, 0); // write a dummy value for where node->nodesize used to be
wbuf_nocrc_uint(&wb, node->flags); wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height); wbuf_nocrc_int (&wb, node->height);
wbuf_TXNID(&wb, node->oldest_known_referenced_xid);
// pivot information // pivot information
for (int i = 0; i < node->n_children-1; i++) { for (int i = 0; i < node->n_children-1; i++) {
wbuf_nocrc_bytes(&wb, node->childkeys[i].data, node->childkeys[i].size); wbuf_nocrc_bytes(&wb, node->childkeys[i].data, node->childkeys[i].size);
...@@ -1264,6 +1267,9 @@ deserialize_ftnode_info( ...@@ -1264,6 +1267,9 @@ deserialize_ftnode_info(
if (node->layout_version_read_from_disk < FT_LAYOUT_VERSION_19) { if (node->layout_version_read_from_disk < FT_LAYOUT_VERSION_19) {
(void) rbuf_int(&rb); // optimized_for_upgrade (void) rbuf_int(&rb); // optimized_for_upgrade
} }
if (node->layout_version_read_from_disk >= FT_LAYOUT_VERSION_22) {
rbuf_TXNID(&rb, &node->oldest_known_referenced_xid);
}
// now create the basement nodes or childinfos, depending on whether this is a // now create the basement nodes or childinfos, depending on whether this is a
// leaf node or internal node // leaf node or internal node
...@@ -1505,6 +1511,17 @@ check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_blo ...@@ -1505,6 +1511,17 @@ check_and_copy_compressed_sub_block_worker(struct rbuf curr_rbuf, struct sub_blo
return r; return r;
} }
static FTNODE alloc_ftnode_for_deserialize(uint32_t fullhash, BLOCKNUM blocknum) {
// Effect: Allocate an FTNODE and fill in the values that are not read from
FTNODE XMALLOC(node);
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
node->bp = nullptr;
node->oldest_known_referenced_xid = TXNID_NONE;
return node;
}
static int static int
deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode, deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
FTNODE_DISK_DATA* ndd, FTNODE_DISK_DATA* ndd,
...@@ -1518,13 +1535,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode, ...@@ -1518,13 +1535,7 @@ deserialize_ftnode_header_from_rbuf_if_small_enough (FTNODE *ftnode,
// Return 0 if it worked. If something goes wrong (including that we are looking at some old data format that doesn't have partitions) then return nonzero. // Return 0 if it worked. If something goes wrong (including that we are looking at some old data format that doesn't have partitions) then return nonzero.
{ {
int r = 0; int r = 0;
FTNODE XMALLOC(node); FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
// fill in values that are known and not stored in rb
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
node->bp = NULL; // fill this in so we can free without a leak.
if (rb->size < 24) { if (rb->size < 24) {
// TODO: What error do we return here? // TODO: What error do we return here?
...@@ -2171,15 +2182,10 @@ deserialize_ftnode_from_rbuf( ...@@ -2171,15 +2182,10 @@ deserialize_ftnode_from_rbuf(
// Effect: deserializes a ftnode that is in rb (with pointer of rb just past the magic) into a FTNODE. // Effect: deserializes a ftnode that is in rb (with pointer of rb just past the magic) into a FTNODE.
{ {
int r = 0; int r = 0;
FTNODE XMALLOC(node);
struct sub_block sb_node_info; struct sub_block sb_node_info;
// fill in values that are known and not stored in rb FTNODE node = alloc_ftnode_for_deserialize(fullhash, blocknum);
node->fullhash = fullhash;
node->thisnodename = blocknum;
node->dirty = 0;
// now start reading from rbuf // now start reading from rbuf
// first thing we do is read the header information // first thing we do is read the header information
bytevec magic; bytevec magic;
rbuf_literal_bytes(rb, &magic, 8); rbuf_literal_bytes(rb, &magic, 8);
......
...@@ -280,61 +280,14 @@ dump_nodesizes(int f, FT h) { ...@@ -280,61 +280,14 @@ dump_nodesizes(int f, FT h) {
printf("leafsizes: %" PRIu64 "\n", info.leafsizes); printf("leafsizes: %" PRIu64 "\n", info.leafsizes);
} }
typedef struct {
int f;
FT h;
size_t total_space;
size_t used_space;
} garbage_help_extra;
static int
garbage_leafentry_helper(OMTVALUE v, uint32_t UU(idx), void *extra) {
garbage_help_extra *CAST_FROM_VOIDP(info, extra);
LEAFENTRY CAST_FROM_VOIDP(le, v);
info->total_space += leafentry_disksize(le);
info->used_space += LE_CLEAN_MEMSIZE(le_latest_keylen(le), le_latest_vallen(le));
return 0;
}
static int
garbage_helper(BLOCKNUM b, int64_t UU(size), int64_t UU(address), void *extra) {
garbage_help_extra *CAST_FROM_VOIDP(info, extra);
FTNODE n;
FTNODE_DISK_DATA ndd = NULL;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, info->h);
int r = toku_deserialize_ftnode_from(info->f, b, 0, &n, &ndd, &bfe);
if (r != 0) {
goto no_node;
}
if (n->height > 0) {
goto exit;
}
for (int i = 0; i < n->n_children; ++i) {
BASEMENTNODE bn = BLB(n, i);
r = toku_omt_iterate(bn->buffer, garbage_leafentry_helper, info);
if (r != 0) {
goto exit;
}
}
exit:
toku_ftnode_free(&n);
toku_free(ndd);
no_node:
return r;
}
static void static void
dump_garbage_stats(int f, FT h) { dump_garbage_stats(int f, FT ft) {
garbage_help_extra info; invariant(f == toku_cachefile_get_fd(ft->cf));
memset(&info, 0, sizeof info); uint64_t total_space = 0;
info.f = f; uint64_t used_space = 0;
info.h = h; toku_ft_get_garbage(ft, &total_space, &used_space);
toku_blocktable_iterate(h->blocktable, TRANSLATION_CHECKPOINTED, printf("total_size: %zu\n", total_space);
garbage_helper, &info, true, true); printf("used_size: %zu\n", used_space);
printf("total_size: %zu\n", info.total_space);
printf("used_size: %zu\n", info.used_space);
} }
static uint32_t static uint32_t
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef LEAFENTRY_H
#define LEAFENTRY_H #ifndef TOKU_LEAFENTRY_H
#define TOKU_LEAFENTRY_H
#ident "$Id$" #ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
...@@ -10,11 +12,12 @@ ...@@ -10,11 +12,12 @@
#include <util/mempool.h> #include <util/mempool.h>
#include "txn_manager.h"
#include "rbuf.h" #include "rbuf.h"
#include "x1764.h" #include "x1764.h"
#include "omt.h" #include "omt.h"
#if 0 /*
Memory format of packed leaf entry Memory format of packed leaf entry
CONSTANTS: CONSTANTS:
num_uxrs num_uxrs
...@@ -35,10 +38,7 @@ ...@@ -35,10 +38,7 @@
Run-time-constants Run-time-constants
key[] key[]
val[] val[]
#endif */
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
// //
// enum of possible values for LEAFENTRY->type field // enum of possible values for LEAFENTRY->type field
...@@ -94,9 +94,6 @@ struct __attribute__ ((__packed__)) leafentry { ...@@ -94,9 +94,6 @@ struct __attribute__ ((__packed__)) leafentry {
}; };
static_assert(10 == sizeof(leafentry), "leafentry size is wrong"); static_assert(10 == sizeof(leafentry), "leafentry size is wrong");
static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong place"); static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong place");
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
#define LE_CLEAN_MEMSIZE(_keylen, _vallen) \ #define LE_CLEAN_MEMSIZE(_keylen, _vallen) \
(sizeof(((LEAFENTRY)NULL)->type) /* type */ \ (sizeof(((LEAFENTRY)NULL)->type) /* type */ \
...@@ -123,6 +120,10 @@ static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong plac ...@@ -123,6 +120,10 @@ static_assert(5 == __builtin_offsetof(leafentry, u), "union is in the wrong plac
typedef struct leafentry *LEAFENTRY; typedef struct leafentry *LEAFENTRY;
typedef struct leafentry_13 *LEAFENTRY_13; typedef struct leafentry_13 *LEAFENTRY_13;
//
// TODO: consistency among names is very poor.
//
size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory. size_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk. size_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk.
void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le); void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
...@@ -144,19 +145,6 @@ void* le_key_and_len (LEAFENTRY le, uint32_t *len); ...@@ -144,19 +145,6 @@ void* le_key_and_len (LEAFENTRY le, uint32_t *len);
uint64_t le_outermost_uncommitted_xid (LEAFENTRY le); uint64_t le_outermost_uncommitted_xid (LEAFENTRY le);
void
le_committed_mvcc(uint8_t *key, uint32_t keylen,
uint8_t *val, uint32_t vallen,
TXNID xid,
void (*bytes)(struct dbuf *dbuf, const void *bytes, int nbytes),
struct dbuf *d);
void
le_clean(uint8_t *key, uint32_t keylen,
uint8_t *val, uint32_t vallen,
void (*bytes)(struct dbuf *dbuf, const void *bytes, int nbytes),
struct dbuf *d);
//Callback contract: //Callback contract:
// Function checks to see if id is accepted by context. // Function checks to see if id is accepted by context.
// Returns: // Returns:
...@@ -169,9 +157,9 @@ int le_iterate_is_del(LEAFENTRY le, LE_ITERATE_CALLBACK f, bool *is_empty, TOKUT ...@@ -169,9 +157,9 @@ int le_iterate_is_del(LEAFENTRY le, LE_ITERATE_CALLBACK f, bool *is_empty, TOKUT
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, uint32_t *vallenp, TOKUTXN context); int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, uint32_t *vallenp, TOKUTXN context);
size_t size_t
leafentry_disksize_13(LEAFENTRY_13 le); leafentry_disksize_13(LEAFENTRY_13 le);
int int
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data. toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize, size_t *new_leafentry_memorysize,
...@@ -179,7 +167,28 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored ...@@ -179,7 +167,28 @@ toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored
OMT *omtp, OMT *omtp,
struct mempool *mp); struct mempool *mp);
void toku_le_apply_msg(FT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
#endif TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p);
bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_known_referenced_xid);
void toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns,
TXNID oldest_known_referenced_xid);
#endif /* TOKU_LEAFENTRY_H */
...@@ -48,7 +48,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va ...@@ -48,7 +48,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
brt->ft->h->max_msn_in_ft = msn; brt->ft->h->max_msn_in_ft = msn;
FT_MSG_S cmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &theval }} }; FT_MSG_S cmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &theval }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd, TXNID_NONE, nullptr, nullptr); toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd, nullptr, nullptr);
{ {
int r = toku_ft_lookup(brt, &thekey, lookup_checkf, &pair); int r = toku_ft_lookup(brt, &thekey, lookup_checkf, &pair);
assert(r==0); assert(r==0);
...@@ -56,7 +56,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va ...@@ -56,7 +56,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
} }
FT_MSG_S badcmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval }} }; FT_MSG_S badcmd = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &badcmd, TXNID_NONE, nullptr, nullptr); toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &badcmd, nullptr, nullptr);
// message should be rejected for duplicate msn, row should still have original val // message should be rejected for duplicate msn, row should still have original val
{ {
...@@ -69,7 +69,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va ...@@ -69,7 +69,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
msn = next_dummymsn(); msn = next_dummymsn();
brt->ft->h->max_msn_in_ft = msn; brt->ft->h->max_msn_in_ft = msn;
FT_MSG_S cmd2 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &val2 }} }; FT_MSG_S cmd2 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &val2 }} };
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd2, TXNID_NONE, nullptr, nullptr); toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd2, nullptr, nullptr);
// message should be accepted, val should have new value // message should be accepted, val should have new value
{ {
...@@ -81,7 +81,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va ...@@ -81,7 +81,7 @@ append_leaf(FT_HANDLE brt, FTNODE leafnode, void *key, uint32_t keylen, void *va
// now verify that message with lesser (older) msn is rejected // now verify that message with lesser (older) msn is rejected
msn.msn = msn.msn - 10; msn.msn = msn.msn - 10;
FT_MSG_S cmd3 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval } }}; FT_MSG_S cmd3 = { FT_INSERT, msn, xids_get_root_xids(), .u={.id = { &thekey, &badval } }};
toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd3, TXNID_NONE, nullptr, nullptr); toku_ft_leaf_apply_cmd(brt->ft->compare_fun, brt->ft->update_fun, &brt->ft->cmp_descriptor, leafnode, -1, &cmd3, nullptr, nullptr);
// message should be rejected, val should still have value in pair2 // message should be rejected, val should still have value in pair2
{ {
......
...@@ -123,8 +123,7 @@ insert_random_message_to_bn(FT_HANDLE t, BASEMENTNODE blb, LEAFENTRY *save, XIDS ...@@ -123,8 +123,7 @@ insert_random_message_to_bn(FT_HANDLE t, BASEMENTNODE blb, LEAFENTRY *save, XIDS
msg.u.id.val = valdbt; msg.u.id.val = valdbt;
size_t memsize; size_t memsize;
int64_t numbytes; int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes); toku_le_apply_msg(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
assert_zero(r);
toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb, &msg, TXNID_NONE, NULL, NULL); toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb, &msg, TXNID_NONE, NULL, NULL);
if (msn.msn > blb->max_msn_applied.msn) { if (msn.msn > blb->max_msn_applied.msn) {
blb->max_msn_applied = msn; blb->max_msn_applied = msn;
...@@ -164,8 +163,7 @@ insert_same_message_to_bns(FT_HANDLE t, BASEMENTNODE blb1, BASEMENTNODE blb2, LE ...@@ -164,8 +163,7 @@ insert_same_message_to_bns(FT_HANDLE t, BASEMENTNODE blb1, BASEMENTNODE blb2, LE
msg.u.id.val = valdbt; msg.u.id.val = valdbt;
size_t memsize; size_t memsize;
int64_t numbytes; int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes); toku_le_apply_msg(&msg, NULL, TXNID_NONE, &memsize, save, NULL, NULL, NULL, &numbytes);
assert_zero(r);
toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb1, &msg, TXNID_NONE, NULL, NULL); toku_ft_bn_apply_cmd(t->ft->compare_fun, t->ft->update_fun, NULL, blb1, &msg, TXNID_NONE, NULL, NULL);
if (msn.msn > blb1->max_msn_applied.msn) { if (msn.msn > blb1->max_msn_applied.msn) {
blb1->max_msn_applied = msn; blb1->max_msn_applied = msn;
...@@ -274,7 +272,7 @@ flush_to_internal(FT_HANDLE t) { ...@@ -274,7 +272,7 @@ flush_to_internal(FT_HANDLE t) {
set_BNC(child, 0, child_bnc); set_BNC(child, 0, child_bnc);
BP_STATE(child, 0) = PT_AVAIL; BP_STATE(child, 0) = PT_AVAIL;
toku_bnc_flush_to_child(t->ft, parent_bnc, child); toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
int parent_messages_present[num_parent_messages]; int parent_messages_present[num_parent_messages];
int child_messages_present[num_child_messages]; int child_messages_present[num_child_messages];
...@@ -409,7 +407,7 @@ flush_to_internal_multiple(FT_HANDLE t) { ...@@ -409,7 +407,7 @@ flush_to_internal_multiple(FT_HANDLE t) {
} }
} }
toku_bnc_flush_to_child(t->ft, parent_bnc, child); toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
int total_messages = 0; int total_messages = 0;
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
...@@ -580,7 +578,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) { ...@@ -580,7 +578,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
if (make_leaf_up_to_date) { if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) { for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) { if (!parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], TXNID_NONE, NULL, NULL); toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
} }
} }
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
...@@ -601,7 +599,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) { ...@@ -601,7 +599,7 @@ flush_to_leaf(FT_HANDLE t, bool make_leaf_up_to_date, bool use_flush) {
} }
if (use_flush) { if (use_flush) {
toku_bnc_flush_to_child(t->ft, parent_bnc, child); toku_bnc_flush_to_child(t->ft, parent_bnc, child, TXNID_NONE);
destroy_nonleaf_childinfo(parent_bnc); destroy_nonleaf_childinfo(parent_bnc);
} else { } else {
FTNODE XMALLOC(parentnode); FTNODE XMALLOC(parentnode);
...@@ -803,7 +801,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) { ...@@ -803,7 +801,7 @@ flush_to_leaf_with_keyrange(FT_HANDLE t, bool make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) { for (i = 0; i < num_parent_messages; ++i) {
if (dummy_cmp(NULL, parent_messages[i]->u.id.key, &childkeys[7]) <= 0 && if (dummy_cmp(NULL, parent_messages[i]->u.id.key, &childkeys[7]) <= 0 &&
!parent_messages_is_fresh[i]) { !parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], TXNID_NONE, NULL, NULL); toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child, -1, parent_messages[i], NULL, NULL);
} }
} }
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
...@@ -995,8 +993,8 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) { ...@@ -995,8 +993,8 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
if (make_leaf_up_to_date) { if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) { for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) { if (!parent_messages_is_fresh[i]) {
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, -1, parent_messages[i], TXNID_NONE, NULL, NULL); toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child1, -1, parent_messages[i], NULL, NULL);
toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, -1, parent_messages[i], TXNID_NONE, NULL, NULL); toku_ft_leaf_apply_cmd(t->ft->compare_fun, t->ft->update_fun, &t->ft->descriptor, child2, -1, parent_messages[i], NULL, NULL);
} }
} }
for (i = 0; i < 8; ++i) { for (i = 0; i < 8; ++i) {
...@@ -1010,7 +1008,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) { ...@@ -1010,7 +1008,7 @@ compare_apply_and_flush(FT_HANDLE t, bool make_leaf_up_to_date) {
} }
} }
toku_bnc_flush_to_child(t->ft, parent_bnc, child1); toku_bnc_flush_to_child(t->ft, parent_bnc, child1, TXNID_NONE);
FTNODE XMALLOC(parentnode); FTNODE XMALLOC(parentnode);
BLOCKNUM parentblocknum = { 17 }; BLOCKNUM parentblocknum = { 17 };
......
...@@ -26,7 +26,7 @@ bool checkpoint_callback_called; ...@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid; toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child // callback functions for toku_ft_flush_some_child
static bool static bool
dont_destroy_bn(void* UU(extra)) dont_destroy_bn(void* UU(extra))
{ {
...@@ -160,7 +160,7 @@ doit (bool after_child_pin) { ...@@ -160,7 +160,7 @@ doit (bool after_child_pin) {
assert(toku_bnc_nbytesinbuf(BNC(node, 0)) > 0); assert(toku_bnc_nbytesinbuf(BNC(node, 0)) > 0);
// do the flush // do the flush
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(checkpoint_callback_called); assert(checkpoint_callback_called);
// now let's pin the root again and make sure it is flushed // now let's pin the root again and make sure it is flushed
......
...@@ -26,7 +26,7 @@ bool checkpoint_callback_called; ...@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid; toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child // callback functions for toku_ft_flush_some_child
static bool static bool
dont_destroy_bn(void* UU(extra)) dont_destroy_bn(void* UU(extra))
{ {
...@@ -177,7 +177,7 @@ doit (int state) { ...@@ -177,7 +177,7 @@ doit (int state) {
assert(node->n_children == 2); assert(node->n_children == 2);
// do the flush // do the flush
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(checkpoint_callback_called); assert(checkpoint_callback_called);
// now let's pin the root again and make sure it is has merged // now let's pin the root again and make sure it is has merged
......
...@@ -26,7 +26,7 @@ bool checkpoint_callback_called; ...@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid; toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child // callback functions for toku_ft_flush_some_child
static bool static bool
dont_destroy_bn(void* UU(extra)) dont_destroy_bn(void* UU(extra))
{ {
...@@ -197,7 +197,7 @@ doit (int state) { ...@@ -197,7 +197,7 @@ doit (int state) {
assert(node->n_children == 2); assert(node->n_children == 2);
// do the flush // do the flush
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(checkpoint_callback_called); assert(checkpoint_callback_called);
// now let's pin the root again and make sure it is has rebalanced // now let's pin the root again and make sure it is has rebalanced
......
...@@ -26,7 +26,7 @@ bool checkpoint_callback_called; ...@@ -26,7 +26,7 @@ bool checkpoint_callback_called;
toku_pthread_t checkpoint_tid; toku_pthread_t checkpoint_tid;
// callback functions for flush_some_child // callback functions for toku_ft_flush_some_child
static bool static bool
dont_destroy_bn(void* UU(extra)) dont_destroy_bn(void* UU(extra))
{ {
...@@ -173,7 +173,7 @@ doit (bool after_split) { ...@@ -173,7 +173,7 @@ doit (bool after_split) {
assert(node->n_children == 1); assert(node->n_children == 1);
// do the flush // do the flush
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(checkpoint_callback_called); assert(checkpoint_callback_called);
// now let's pin the root again and make sure it is has split // now let's pin the root again and make sure it is has split
......
...@@ -396,14 +396,13 @@ test_le_apply(ULE ule_initial, FT_MSG msg, ULE ule_expected) { ...@@ -396,14 +396,13 @@ test_le_apply(ULE ule_initial, FT_MSG msg, ULE ule_expected) {
size_t result_memsize; size_t result_memsize;
int64_t ignoreme; int64_t ignoreme;
r = apply_msg_to_leafentry(msg, toku_le_apply_msg(msg,
le_initial, le_initial,
TXNID_NONE, TXNID_NONE,
&result_memsize, &result_memsize,
&le_result, &le_result,
NULL, NULL,
NULL, NULL, &ignoreme); NULL, NULL, &ignoreme);
CKERR(r);
if (le_result) if (le_result)
le_verify_accessors(le_result, ule_expected, result_memsize); le_verify_accessors(le_result, ule_expected, result_memsize);
...@@ -702,6 +701,111 @@ test_le_apply_messages(void) { ...@@ -702,6 +701,111 @@ test_le_apply_messages(void) {
test_le_committed_apply(); test_le_committed_apply();
} }
static bool ule_worth_running_garbage_collection(ULE ule, TXNID oldest_known_referenced_xid) {
LEAFENTRY le;
size_t initial_memsize;
int r = le_pack(ule, &initial_memsize, &le, nullptr, nullptr, nullptr); CKERR(r);
invariant_notnull(le);
bool worth_running = toku_le_worth_running_garbage_collection(le, oldest_known_referenced_xid);
toku_free(le);
return worth_running;
}
static void test_le_garbage_collection_birdie(void) {
DBT key;
DBT val;
ULE_S ule_initial;
ULE_S ule_expected;
uint8_t keybuf[MAX_SIZE];
uint32_t keysize=8;
uint8_t valbuf[MAX_SIZE];
uint32_t valsize=8;
ule_initial.uxrs = ule_initial.uxrs_static;
ule_expected.uxrs = ule_expected.uxrs_static;
memset(&key, 0, sizeof(key));
memset(&val, 0, sizeof(val));
bool do_garbage_collect;
fillrandom(keybuf, keysize);
fillrandom(valbuf, valsize);
//
// Test garbage collection "worth-doing" heurstic
//
// Garbage collection should not be worth doing on a clean leafentry.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 0;
ule_initial.uxrs[0].xid = TXNID_NONE;
ule_initial.uxrs[0].type = XR_INSERT;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(!do_garbage_collect);
// It is worth doing when there is more than one committed entry
ule_initial.num_cuxrs = 2;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(do_garbage_collect);
// It is not worth doing when there is one of each, when the
// provisional entry is newer than the oldest known referenced xid
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 1500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(!do_garbage_collect);
ule_initial.uxrs[1].xid = 200;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(!do_garbage_collect);
// It is not worth doing when there is only one committed entry,
// multiple provisional entries, but the outermost entry is newer.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 3;
ule_initial.uxrs[1].xid = 201;
ule_initial.uxrs[2].xid = 206;
ule_initial.uxrs[3].xid = 215;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(!do_garbage_collect);
// It is worth doing when the above scenario has an outermost entry
// older than the oldest known, even if its children seem newer.
// this children must have commit because the parent is not live.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 3;
ule_initial.uxrs[1].xid = 190;
ule_initial.uxrs[2].xid = 206;
ule_initial.uxrs[3].xid = 215;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(do_garbage_collect);
// It is worth doing when there is more than one committed entry,
// even if a provisional entry exists that is newer than the
// oldest known refrenced xid
ule_initial.num_cuxrs = 2;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 499;
ule_initial.uxrs[2].xid = 500;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(do_garbage_collect);
// It is worth doing when there is one of each, and the provisional
// entry is older than the oldest known referenced xid
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 1;
ule_initial.uxrs[1].xid = 199;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(do_garbage_collect);
// It is definately worth doing when the above case is true
// and there is more than one provisional entry.
ule_initial.num_cuxrs = 1;
ule_initial.num_puxrs = 2;
ule_initial.uxrs[1].xid = 150;
ule_initial.uxrs[2].xid = 175;
do_garbage_collect = ule_worth_running_garbage_collection(&ule_initial, 200);
invariant(do_garbage_collect);
}
static void test_le_optimize(void) { static void test_le_optimize(void) {
FT_MSG_S msg; FT_MSG_S msg;
...@@ -900,6 +1004,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_ ...@@ -900,6 +1004,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_
test_le_pack(); test_le_pack();
test_le_apply_messages(); test_le_apply_messages();
test_le_optimize(); test_le_optimize();
test_le_garbage_collection_birdie();
destroy_xids(); destroy_xids();
return 0; return 0;
} }
......
...@@ -162,7 +162,7 @@ doit (void) { ...@@ -162,7 +162,7 @@ doit (void) {
assert_zero(r); assert_zero(r);
// now with setup done, start the test // now with setup done, start the test
// test that if flush_some_child properly honors // test that if toku_ft_flush_some_child properly honors
// what we say and flushes the child we pick // what we say and flushes the child we pick
FTNODE node = NULL; FTNODE node = NULL;
toku_pin_node_with_min_bfe(&node, node_internal, t); toku_pin_node_with_min_bfe(&node, node_internal, t);
...@@ -185,7 +185,7 @@ doit (void) { ...@@ -185,7 +185,7 @@ doit (void) {
); );
curr_child_to_flush = 0; curr_child_to_flush = 0;
num_flushes_called = 0; num_flushes_called = 0;
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1); assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t); toku_pin_node_with_min_bfe(&node, node_internal, t);
...@@ -203,7 +203,7 @@ doit (void) { ...@@ -203,7 +203,7 @@ doit (void) {
assert(!node->dirty); assert(!node->dirty);
curr_child_to_flush = 1; curr_child_to_flush = 1;
num_flushes_called = 0; num_flushes_called = 0;
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1); assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t); toku_pin_node_with_min_bfe(&node, node_internal, t);
...@@ -221,7 +221,7 @@ doit (void) { ...@@ -221,7 +221,7 @@ doit (void) {
assert(!node->dirty); assert(!node->dirty);
curr_child_to_flush = 0; curr_child_to_flush = 0;
num_flushes_called = 0; num_flushes_called = 0;
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 1); assert(num_flushes_called == 1);
toku_pin_node_with_min_bfe(&node, node_internal, t); toku_pin_node_with_min_bfe(&node, node_internal, t);
...@@ -250,7 +250,7 @@ doit (void) { ...@@ -250,7 +250,7 @@ doit (void) {
toku_assert_entire_node_in_memory(node); // entire root is in memory toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = i; curr_child_to_flush = i;
num_flushes_called = 0; num_flushes_called = 0;
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 2); assert(num_flushes_called == 2);
toku_pin_node_with_min_bfe(&node, node_internal, t); toku_pin_node_with_min_bfe(&node, node_internal, t);
...@@ -296,7 +296,7 @@ doit (void) { ...@@ -296,7 +296,7 @@ doit (void) {
toku_assert_entire_node_in_memory(node); // entire root is in memory toku_assert_entire_node_in_memory(node); // entire root is in memory
curr_child_to_flush = 0; curr_child_to_flush = 0;
num_flushes_called = 0; num_flushes_called = 0;
flush_some_child(t->ft, node, &fa); toku_ft_flush_some_child(t->ft, node, &fa);
assert(num_flushes_called == 2); assert(num_flushes_called == 2);
r = toku_close_ft_handle_nolsn(t, 0); assert(r==0); r = toku_close_ft_handle_nolsn(t, 0); assert(r==0);
......
...@@ -88,15 +88,13 @@ void toku_ule_free(ULEHANDLE ule_p) { ...@@ -88,15 +88,13 @@ void toku_ule_free(ULEHANDLE ule_p) {
toku_free(ule_p); toku_free(ule_p);
} }
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
// //
// Question: Can any software outside this file modify or read a leafentry? // Question: Can any software outside this file modify or read a leafentry?
// If so, is it worthwhile to put it all here? // If so, is it worthwhile to put it all here?
// //
// There are two entries, one each for modification and query: // There are two entries, one each for modification and query:
// apply_msg_to_leafentry() performs all inserts/deletes/aborts // toku_le_apply_msg() performs all inserts/deletes/aborts
// //
// //
// //
...@@ -122,6 +120,7 @@ static void msg_init_empty_ule(ULE ule, FT_MSG msg); ...@@ -122,6 +120,7 @@ static void msg_init_empty_ule(ULE ule, FT_MSG msg);
static void msg_modify_ule(ULE ule, FT_MSG msg); static void msg_modify_ule(ULE ule, FT_MSG msg);
static void ule_init_empty_ule(ULE ule, uint32_t keylen, void * keyp); static void ule_init_empty_ule(ULE ule, uint32_t keylen, void * keyp);
static void ule_do_implicit_promotions(ULE ule, XIDS xids); static void ule_do_implicit_promotions(ULE ule, XIDS xids);
static void ule_try_promote_provisional_outermost(ULE ule, TXNID oldest_possible_live_xid);
static void ule_promote_provisional_innermost_to_index(ULE ule, uint32_t index); static void ule_promote_provisional_innermost_to_index(ULE ule, uint32_t index);
static void ule_promote_provisional_innermost_to_committed(ULE ule); static void ule_promote_provisional_innermost_to_committed(ULE ule);
static void ule_apply_insert(ULE ule, XIDS xids, uint32_t vallen, void * valp); static void ule_apply_insert(ULE ule, XIDS xids, uint32_t vallen, void * valp);
...@@ -165,8 +164,6 @@ le_malloc(OMT *omtp, struct mempool *mp, size_t size, void **maybe_free) ...@@ -165,8 +164,6 @@ le_malloc(OMT *omtp, struct mempool *mp, size_t size, void **maybe_free)
return rval; return rval;
} }
///////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////
// Garbage collection related functions // Garbage collection related functions
// //
...@@ -215,7 +212,7 @@ xid_reads_committed_xid(TXNID tl1, TXNID xc, const xid_omt_t &snapshot_txnids, c ...@@ -215,7 +212,7 @@ xid_reads_committed_xid(TXNID tl1, TXNID xc, const xid_omt_t &snapshot_txnids, c
// so we get rid of them. // so we get rid of them.
// //
static void static void
simple_garbage_collection(ULE ule, TXNID oldest_referenced_xid) { ule_simple_garbage_collection(ULE ule, TXNID oldest_referenced_xid) {
uint32_t curr_index = 0; uint32_t curr_index = 0;
uint32_t num_entries; uint32_t num_entries;
if (ule->num_cuxrs == 1 || oldest_referenced_xid == TXNID_NONE) { if (ule->num_cuxrs == 1 || oldest_referenced_xid == TXNID_NONE) {
...@@ -244,7 +241,7 @@ done:; ...@@ -244,7 +241,7 @@ done:;
} }
static void static void
garbage_collection(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &referenced_xids, const xid_omt_t &live_root_txns) { ule_garbage_collect(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &referenced_xids, const xid_omt_t &live_root_txns) {
if (ule->num_cuxrs == 1) goto done; if (ule->num_cuxrs == 1) goto done;
// will fail if too many num_cuxrs // will fail if too many num_cuxrs
bool necessary_static[MAX_TRANSACTION_RECORDS]; bool necessary_static[MAX_TRANSACTION_RECORDS];
...@@ -340,7 +337,6 @@ garbage_collection(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &refe ...@@ -340,7 +337,6 @@ garbage_collection(ULE ule, const xid_omt_t &snapshot_xids, const rx_omt_t &refe
done:; done:;
} }
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
// This is the big enchilada. (Bring Tums.) Note that this level of abstraction // This is the big enchilada. (Bring Tums.) Note that this level of abstraction
// has no knowledge of the inner structure of either leafentry or msg. It makes // has no knowledge of the inner structure of either leafentry or msg. It makes
...@@ -353,42 +349,60 @@ done:; ...@@ -353,42 +349,60 @@ done:;
// If the leafentry is destroyed it sets *new_leafentry_p to NULL. // If the leafentry is destroyed it sets *new_leafentry_p to NULL.
// Otehrwise the new_leafentry_p points at the new leaf entry. // Otehrwise the new_leafentry_p points at the new leaf entry.
// As of October 2011, this function always returns 0. // As of October 2011, this function always returns 0.
int void
apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry toku_le_apply_msg(FT_MSG msg, // message to apply to leafentry
LEAFENTRY old_leafentry, // NULL if there was no stored data. LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid, TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize, size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p, LEAFENTRY *new_leafentry_p,
OMT *omtp, OMT *omtp,
struct mempool *mp, struct mempool *mp,
void **maybe_free, void **maybe_free,
int64_t * numbytes_delta_p) { // change in total size of key and val, not including any overhead int64_t * numbytes_delta_p) { // change in total size of key and val, not including any overhead
ULE_S ule; ULE_S ule;
int rval;
int64_t oldnumbytes = 0; int64_t oldnumbytes = 0;
int64_t newnumbytes = 0; int64_t newnumbytes = 0;
if (old_leafentry == NULL) // if leafentry does not exist ... if (old_leafentry == NULL) {
msg_init_empty_ule(&ule, msg); // ... create empty unpacked leaf entry msg_init_empty_ule(&ule, msg);
else { } else {
le_unpack(&ule, old_leafentry); // otherwise unpack leafentry le_unpack(&ule, old_leafentry); // otherwise unpack leafentry
oldnumbytes = ule_get_innermost_numbytes(&ule); oldnumbytes = ule_get_innermost_numbytes(&ule);
} }
msg_modify_ule(&ule, msg); // modify unpacked leafentry msg_modify_ule(&ule, msg); // modify unpacked leafentry
simple_garbage_collection(&ule, oldest_referenced_xid); ule_simple_garbage_collection(&ule, oldest_referenced_xid);
rval = le_pack(&ule, // create packed leafentry int rval = le_pack(&ule, // create packed leafentry
new_leafentry_memorysize, new_leafentry_memorysize,
new_leafentry_p, new_leafentry_p,
omtp, omtp,
mp, mp,
maybe_free); maybe_free);
if (new_leafentry_p) invariant_zero(rval);
if (new_leafentry_p) {
newnumbytes = ule_get_innermost_numbytes(&ule); newnumbytes = ule_get_innermost_numbytes(&ule);
}
*numbytes_delta_p = newnumbytes - oldnumbytes; *numbytes_delta_p = newnumbytes - oldnumbytes;
ule_cleanup(&ule); ule_cleanup(&ule);
return rval;
} }
bool toku_le_worth_running_garbage_collection(LEAFENTRY le, TXNID oldest_known_referenced_xid) {
// Effect: Quickly determines if it's worth trying to run garbage collection on a leafentry
// Return: True if it makes sense to try garbage collection, false otherwise.
// Rationale: Garbage collection is likely to clean up under two circumstances:
// 1.) There are multiple committed entries. Some may never be read by new txns.
// 2.) There is only one committed entry, but the outermost provisional entry
// is older than the oldest known referenced xid, so it must have commited.
// Therefor we can promote it to committed and get rid of the old commited entry.
if (le->type != LE_MVCC) {
return false;
}
if (le->u.mvcc.num_cxrs > 1) {
return true;
} else {
paranoid_invariant(le->u.mvcc.num_cxrs == 1);
}
return le->u.mvcc.num_pxrs > 0 && le_outermost_uncommitted_xid(le) < oldest_known_referenced_xid;
}
// Garbage collect one leaf entry, using the given OMT's. // Garbage collect one leaf entry, using the given OMT's.
// Parameters: // Parameters:
...@@ -408,29 +422,38 @@ apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry ...@@ -408,29 +422,38 @@ apply_msg_to_leafentry(FT_MSG msg, // message to apply to leafentry
// -- referenced_xids : list of in memory active transactions. // -- referenced_xids : list of in memory active transactions.
// NOTE: it is not a good idea to garbage collect a leaf // NOTE: it is not a good idea to garbage collect a leaf
// entry with only one committed value. // entry with only one committed value.
int void
garbage_collect_leafentry(LEAFENTRY old_leaf_entry, toku_le_garbage_collect(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry, LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size, size_t *new_leaf_entry_memory_size,
OMT *omtp, OMT *omtp,
struct mempool *mp, struct mempool *mp,
void **maybe_free, void **maybe_free,
const xid_omt_t &snapshot_xids, const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids, const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns) { const xid_omt_t &live_root_txns,
int r = 0; TXNID oldest_known_referenced_xid) {
ULE_S ule; ULE_S ule;
le_unpack(&ule, old_leaf_entry); le_unpack(&ule, old_leaf_entry);
garbage_collection(&ule, snapshot_xids, referenced_xids, live_root_txns);
r = le_pack(&ule, // Before running garbage collection, try to promote the outermost provisional
new_leaf_entry_memory_size, // entries to committed if its xid is older than the oldest possible live xid.
new_leaf_entry, //
omtp, // The oldest known refeferenced xid is a lower bound on the oldest possible
mp, // live xid, so we use that. It's usually close enough to get rid of most
maybe_free); // garbage in leafentries.
TXNID oldest_possible_live_xid = oldest_known_referenced_xid;
ule_try_promote_provisional_outermost(&ule, oldest_possible_live_xid);
ule_garbage_collect(&ule, snapshot_xids, referenced_xids, live_root_txns);
int r = le_pack(&ule,
new_leaf_entry_memory_size,
new_leaf_entry,
omtp,
mp,
maybe_free);
assert(r == 0); assert(r == 0);
ule_cleanup(&ule); ule_cleanup(&ule);
return r;
} }
///////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////
...@@ -1529,6 +1552,15 @@ ule_promote_provisional_innermost_to_committed(ULE ule) { ...@@ -1529,6 +1552,15 @@ ule_promote_provisional_innermost_to_committed(ULE ule) {
} }
} }
static void
ule_try_promote_provisional_outermost(ULE ule, TXNID oldest_possible_live_xid) {
// Effect: If there is a provisional record whose outermost xid is older than
// the oldest known referenced_xid, promote it to committed.
if (ule->num_puxrs > 0 && ule_get_xid(ule, ule->num_cuxrs) < oldest_possible_live_xid) {
ule_promote_provisional_innermost_to_committed(ule);
}
}
// Purpose is to promote the value (and type) of the innermost transaction // Purpose is to promote the value (and type) of the innermost transaction
// record to the uxr at the specified index (keeping the txnid of the uxr at // record to the uxr at the specified index (keeping the txnid of the uxr at
// specified index.) // specified index.)
......
...@@ -45,31 +45,4 @@ TXNID uxr_get_txnid(UXRHANDLE uxr); ...@@ -45,31 +45,4 @@ TXNID uxr_get_txnid(UXRHANDLE uxr);
//1 does much slower debugging //1 does much slower debugging
#define GARBAGE_COLLECTION_DEBUG 0 #define GARBAGE_COLLECTION_DEBUG 0
void fast_msg_to_leafentry(
FT_MSG msg, // message to apply to leafentry
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p) ;
int apply_msg_to_leafentry(FT_MSG msg,
LEAFENTRY old_leafentry, // NULL if there was no stored data.
TXNID oldest_referenced_xid,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
int64_t * numbytes_delta_p);
int garbage_collect_leafentry(LEAFENTRY old_leaf_entry,
LEAFENTRY *new_leaf_entry,
size_t *new_leaf_entry_memory_size,
OMT *omtp,
struct mempool *mp,
void **maybe_free,
const xid_omt_t &snapshot_xids,
const rx_omt_t &referenced_xids,
const xid_omt_t &live_root_txns);
#endif // TOKU_ULE_H #endif // TOKU_ULE_H
...@@ -740,6 +740,9 @@ static int random_put_in_db(DB *db, DB_TXN *txn, ARG arg, bool ignore_errors, vo ...@@ -740,6 +740,9 @@ static int random_put_in_db(DB *db, DB_TXN *txn, ARG arg, bool ignore_errors, vo
} else { } else {
rand_key_i[0] = arg->thread_idx; rand_key_i[0] = arg->thread_idx;
} }
if (arg->cli->num_elements > 0 && arg->bounded_element_range) {
rand_key_key[0] = rand_key_key[0] % arg->cli->num_elements;
}
fill_zeroed_array(valbuf, arg->cli->val_size, arg->random_data, arg->cli->compressibility); fill_zeroed_array(valbuf, arg->cli->val_size, arg->random_data, arg->cli->compressibility);
DBT key, val; DBT key, val;
dbt_init(&key, &rand_key_b, sizeof rand_key_b); dbt_init(&key, &rand_key_b, sizeof rand_key_b);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment