Commit e0de0b2d authored by John Esmet's avatar John Esmet

fixes #158 Use promotion to record the blocknum of the rightmost

non-root leaf node in each FT. When the FT detects a rightmost insertion
pattern, it attempts to do inserts and unique checks directly into the
rightmost leaf node, greatly optimizing sequential insert speed.
parent e6190351
...@@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node) ...@@ -403,3 +403,25 @@ toku_unpin_ftnode_read_only(FT ft, FTNODE node)
); );
assert(r==0); assert(r==0);
} }
void toku_ftnode_swap_pair_values(FTNODE a, FTNODE b)
// Effect: Swap the blocknum, fullhash, and PAIR for for a and b
// Requires: Both nodes are pinned
{
BLOCKNUM tmp_blocknum = a->thisnodename;
uint32_t tmp_fullhash = a->fullhash;
PAIR tmp_pair = a->ct_pair;
a->thisnodename = b->thisnodename;
a->fullhash = b->fullhash;
a->ct_pair = b->ct_pair;
b->thisnodename = tmp_blocknum;
b->fullhash = tmp_fullhash;
b->ct_pair = tmp_pair;
// A and B swapped pair pointers, but we still have to swap
// the actual pair values (ie: the FTNODEs they represent)
// in the cachetable.
toku_cachetable_swap_pair_values(a->ct_pair, b->ct_pair);
}
...@@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai ...@@ -190,4 +190,7 @@ int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, pai
void toku_unpin_ftnode(FT h, FTNODE node); void toku_unpin_ftnode(FT h, FTNODE node);
void toku_unpin_ftnode_read_only(FT ft, FTNODE node); void toku_unpin_ftnode_read_only(FT ft, FTNODE node);
// Effect: Swaps pair values of two pinned nodes
void toku_ftnode_swap_pair_values(FTNODE nodea, FTNODE nodeb);
#endif #endif
...@@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node) ...@@ -565,6 +565,7 @@ static bool may_node_be_reactive(FT ft, FTNODE node)
*/ */
static void static void
handle_split_of_child( handle_split_of_child(
FT ft,
FTNODE node, FTNODE node,
int childnum, int childnum,
FTNODE childa, FTNODE childa,
...@@ -607,8 +608,20 @@ handle_split_of_child( ...@@ -607,8 +608,20 @@ handle_split_of_child(
paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child paranoid_invariant(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
// We never set the rightmost blocknum to be the root.
// Instead, we wait for the root to split and let promotion initialize the rightmost
// blocknum to be the first non-root leaf node on the right extreme to recieve an insert.
invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
if (childa->thisnodename.b == ft->rightmost_blocknum.b) {
// The rightmost leaf (a) split into (a) and (b). We want (b) to swap pair values
// with (a), now that it is the new rightmost leaf. This keeps the rightmost blocknum
// constant, the same the way we keep the root blocknum constant.
toku_ftnode_swap_pair_values(childa, childb);
BP_BLOCKNUM(node, childnum) = childa->thisnodename;
}
BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
BP_WORKDONE(node, childnum+1) = 0; BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL; BP_STATE(node,childnum+1) = PT_AVAIL;
NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl(); NONLEAF_CHILDINFO new_bnc = toku_create_empty_nl();
...@@ -1071,7 +1084,7 @@ ft_split_child( ...@@ -1071,7 +1084,7 @@ ft_split_child(
ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes); ft_nonleaf_split(h, child, &nodea, &nodeb, &splitk, 2, dep_nodes);
} }
// printf("%s:%d child did split\n", __FILE__, __LINE__); // printf("%s:%d child did split\n", __FILE__, __LINE__);
handle_split_of_child (node, childnum, nodea, nodeb, &splitk); handle_split_of_child (h, node, childnum, nodea, nodeb, &splitk);
// for test // for test
call_flusher_thread_callback(flt_flush_during_split); call_flusher_thread_callback(flt_flush_during_split);
...@@ -1489,6 +1502,14 @@ ft_merge_child( ...@@ -1489,6 +1502,14 @@ ft_merge_child(
&node->childkeys[childnuma+1], &node->childkeys[childnuma+1],
(node->n_children-childnumb)*sizeof(node->childkeys[0])); (node->n_children-childnumb)*sizeof(node->childkeys[0]));
REALLOC_N(node->n_children-1, node->childkeys); REALLOC_N(node->n_children-1, node->childkeys);
// Handle a merge of the rightmost leaf node.
if (did_merge && childb->thisnodename.b == h->rightmost_blocknum.b) {
invariant(childb->thisnodename.b != h->h->root_blocknum.b);
toku_ftnode_swap_pair_values(childa, childb);
BP_BLOCKNUM(node, childnuma) = childa->thisnodename;
}
paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b); paranoid_invariant(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
childa->dirty = 1; // just to make sure childa->dirty = 1; // just to make sure
childb->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure
......
...@@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 }; ...@@ -123,6 +123,10 @@ enum { FT_DEFAULT_FANOUT = 16 };
enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 }; enum { FT_DEFAULT_NODE_SIZE = 4 * 1024 * 1024 };
enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 }; enum { FT_DEFAULT_BASEMENT_NODE_SIZE = 128 * 1024 };
// We optimize for a sequential insert pattern if 100 consecutive injections
// happen into the rightmost leaf node due to promotion.
enum { FT_SEQINSERT_SCORE_THRESHOLD = 100 };
// //
// Field in ftnode_fetch_extra that tells the // Field in ftnode_fetch_extra that tells the
// partial fetch callback what piece of the node // partial fetch callback what piece of the node
...@@ -572,6 +576,22 @@ struct ft { ...@@ -572,6 +576,22 @@ struct ft {
// is this ft a blackhole? if so, all messages are dropped. // is this ft a blackhole? if so, all messages are dropped.
bool blackhole; bool blackhole;
// The blocknum of the rightmost leaf node in the tree. Stays constant through splits
// and merges using pair-swapping (like the root node, see toku_ftnode_swap_pair_values())
//
// This field only transitions from RESERVED_BLOCKNUM_NULL to non-null, never back.
// We initialize it when promotion inserts into a non-root leaf node on the right extreme.
// We use the blocktable lock to protect the initialize transition, though it's not really
// necessary since all threads should be setting it to the same value. We maintain that invariant
// on first initialization, see ft_set_or_verify_rightmost_blocknum()
BLOCKNUM rightmost_blocknum;
// sequential access pattern heuristic
// - when promotion pushes a message directly into the rightmost leaf, the score goes up.
// - if the score is high enough, we optimistically attempt to insert directly into the rightmost leaf
// - if our attempt fails because the key was not in range of the rightmost leaf, we reset the score back to 0
uint32_t seqinsert_score;
}; };
// Allocate a DB struct off the stack and only set its comparison // Allocate a DB struct off the stack and only set its comparison
...@@ -1186,6 +1206,9 @@ typedef enum { ...@@ -1186,6 +1206,9 @@ typedef enum {
FT_PRO_NUM_DIDNT_WANT_PROMOTE, FT_PRO_NUM_DIDNT_WANT_PROMOTE,
FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, // how many basement nodes were deserialized with a fixed keysize
FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, // how many basement nodes were deserialized with a variable keysize
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS,
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS,
FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,
FT_STATUS_NUM_ROWS FT_STATUS_NUM_ROWS
} ft_status_entry; } ft_status_entry;
......
...@@ -367,6 +367,9 @@ status_init(void) ...@@ -367,6 +367,9 @@ status_init(void)
STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_PRO_NUM_DIDNT_WANT_PROMOTE, PROMOTION_STOPPED_AFTER_LOCKING_CHILD, PARCOUNT, "promotion: stopped anyway, after locking the child", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_FIXED_KEYSIZE, BASEMENT_DESERIALIZATION_FIXED_KEY, PARCOUNT, "basement nodes deserialized with fixed-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS); STATUS_INIT(FT_BASEMENT_DESERIALIZE_VARIABLE_KEYSIZE, BASEMENT_DESERIALIZATION_VARIABLE_KEY, PARCOUNT, "basement nodes deserialized with variable-keysize", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, nullptr, PARCOUNT, "promotion: succeeded in using the rightmost leaf shortcut", TOKU_ENGINE_STATUS);
STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (out-of-bounds)", TOKU_ENGINE_STATUS);
STATUS_INIT(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE,nullptr, PARCOUNT, "promotion: tried the rightmost leaf shorcut but failed (child reactive)", TOKU_ENGINE_STATUS);
ft_status.initialized = true; ft_status.initialized = true;
} }
...@@ -1643,12 +1646,10 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) ...@@ -1643,12 +1646,10 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
BLOCKNUM old_blocknum = oldroot->thisnodename; BLOCKNUM old_blocknum = oldroot->thisnodename;
uint32_t old_fullhash = oldroot->fullhash; uint32_t old_fullhash = oldroot->fullhash;
PAIR old_pair = oldroot->ct_pair;
int new_height = oldroot->height+1; int new_height = oldroot->height+1;
uint32_t new_fullhash; uint32_t new_fullhash;
BLOCKNUM new_blocknum; BLOCKNUM new_blocknum;
PAIR new_pair = NULL;
cachetable_put_empty_node_with_dep_nodes( cachetable_put_empty_node_with_dep_nodes(
ft, ft,
...@@ -1658,7 +1659,6 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) ...@@ -1658,7 +1659,6 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
&new_fullhash, &new_fullhash,
&newroot &newroot
); );
new_pair = newroot->ct_pair;
assert(newroot); assert(newroot);
assert(new_height > 0); assert(new_height > 0);
...@@ -1670,22 +1670,18 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp) ...@@ -1670,22 +1670,18 @@ ft_init_new_root(FT ft, FTNODE oldroot, FTNODE *newrootp)
ft->h->layout_version, ft->h->layout_version,
ft->h->flags ft->h->flags
); );
newroot->fullhash = new_fullhash;
MSN msna = oldroot->max_msn_applied_to_node_on_disk; MSN msna = oldroot->max_msn_applied_to_node_on_disk;
newroot->max_msn_applied_to_node_on_disk = msna; newroot->max_msn_applied_to_node_on_disk = msna;
BP_STATE(newroot,0) = PT_AVAIL; BP_STATE(newroot,0) = PT_AVAIL;
newroot->dirty = 1; newroot->dirty = 1;
// now do the "switcheroo" // Set the first child to have the new blocknum,
BP_BLOCKNUM(newroot,0) = new_blocknum; // and then swap newroot with oldroot. The new root
newroot->thisnodename = old_blocknum; // will inherit the hash/blocknum/pair from oldroot,
newroot->fullhash = old_fullhash; // keeping the root blocknum constant.
newroot->ct_pair = old_pair; BP_BLOCKNUM(newroot, 0) = new_blocknum;
toku_ftnode_swap_pair_values(newroot, oldroot);
oldroot->thisnodename = new_blocknum;
oldroot->fullhash = new_fullhash;
oldroot->ct_pair = new_pair;
toku_cachetable_swap_pair_values(old_pair, new_pair);
toku_ft_split_child( toku_ft_split_child(
ft, ft,
...@@ -2774,6 +2770,16 @@ static void inject_message_in_locked_node( ...@@ -2774,6 +2770,16 @@ static void inject_message_in_locked_node(
// verify that msn of latest message was captured in root node // verify that msn of latest message was captured in root node
paranoid_invariant(msg->msn.msn == node->max_msn_applied_to_node_on_disk.msn); paranoid_invariant(msg->msn.msn == node->max_msn_applied_to_node_on_disk.msn);
if (node->thisnodename.b == ft->rightmost_blocknum.b) {
if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) {
// we promoted to the rightmost leaf node and the seqinsert score has not yet saturated.
toku_sync_fetch_and_add(&ft->seqinsert_score, 1);
}
} else if (ft->seqinsert_score != 0) {
// we promoted to something other than the rightmost leaf node and the score should reset
ft->seqinsert_score = 0;
}
// if we call toku_ft_flush_some_child, then that function unpins the root // if we call toku_ft_flush_some_child, then that function unpins the root
// otherwise, we unpin ourselves // otherwise, we unpin ourselves
if (node->height > 0 && toku_ft_nonleaf_is_gorged(node, ft->h->nodesize)) { if (node->height > 0 && toku_ft_nonleaf_is_gorged(node, ft->h->nodesize)) {
...@@ -2930,6 +2936,21 @@ static inline bool should_inject_in_node(seqinsert_loc loc, int height, int dept ...@@ -2930,6 +2936,21 @@ static inline bool should_inject_in_node(seqinsert_loc loc, int height, int dept
return (height == 0 || (loc == NEITHER_EXTREME && (height <= 1 || depth >= 2))); return (height == 0 || (loc == NEITHER_EXTREME && (height <= 1 || depth >= 2)));
} }
static void ft_set_or_verify_rightmost_blocknum(FT ft, BLOCKNUM b)
// Given: 'b', the _definitive_ and constant rightmost blocknum of 'ft'
{
if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) {
toku_ft_lock(ft);
if (ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL) {
ft->rightmost_blocknum = b;
}
toku_ft_unlock(ft);
}
// The rightmost blocknum only transitions from RESERVED_BLOCKNUM_NULL to non-null.
// If it's already set, verify that the stored value is consistent with 'b'
invariant(ft->rightmost_blocknum.b == b.b);
}
static void push_something_in_subtree( static void push_something_in_subtree(
FT ft, FT ft,
FTNODE subtree_root, FTNODE subtree_root,
...@@ -2977,6 +2998,14 @@ static void push_something_in_subtree( ...@@ -2977,6 +2998,14 @@ static void push_something_in_subtree(
default: default:
STATUS_INC(FT_PRO_NUM_INJECT_DEPTH_GT3, 1); break; STATUS_INC(FT_PRO_NUM_INJECT_DEPTH_GT3, 1); break;
} }
// If the target node is a non-root leaf node on the right extreme,
// set the rightmost blocknum. We know there are no messages above us
// because promotion would not chose to inject directly into this leaf
// otherwise. We explicitly skip the root node because then we don't have
// to worry about changing the rightmost blocknum when the root splits.
if (subtree_root->height == 0 && loc == RIGHT_EXTREME && subtree_root->thisnodename.b != ft->h->root_blocknum.b) {
ft_set_or_verify_rightmost_blocknum(ft, subtree_root->thisnodename);
}
inject_message_in_locked_node(ft, subtree_root, target_childnum, msg, flow_deltas, gc_info); inject_message_in_locked_node(ft, subtree_root, target_childnum, msg, flow_deltas, gc_info);
} else { } else {
int r; int r;
...@@ -3247,7 +3276,260 @@ void toku_ft_root_put_msg( ...@@ -3247,7 +3276,260 @@ void toku_ft_root_put_msg(
} }
} }
// Effect: Insert the key-val pair into ft. static int ft_compare_keys(FT ft, const DBT *a, const DBT *b)
// Effect: Compare two keys using the given fractal tree's comparator/descriptor
{
FAKE_DB(db, &ft->cmp_descriptor);
return ft->compare_fun(&db, a, b);
}
static LEAFENTRY bn_get_le_and_key(BASEMENTNODE bn, int idx, DBT *key)
// Effect: Gets the i'th leafentry from the given basement node and
// fill its key in *key
// Requires: The i'th leafentry exists.
{
LEAFENTRY le;
uint32_t le_len;
void *le_key;
int r = bn->data_buffer.fetch_klpair(idx, &le, &le_len, &le_key);
invariant_zero(r);
toku_fill_dbt(key, le_key, le_len);
return le;
}
static LEAFENTRY ft_leaf_leftmost_le_and_key(FTNODE leaf, DBT *leftmost_key)
// Effect: If a leftmost key exists in the given leaf, toku_fill_dbt()
// the key into *leftmost_key
// Requires: Leaf is fully in memory and pinned for read or write.
// Return: leafentry if it exists, nullptr otherwise
{
for (int i = 0; i < leaf->n_children; i++) {
BASEMENTNODE bn = BLB(leaf, i);
if (bn->data_buffer.num_klpairs() > 0) {
// Get the first (leftmost) leafentry and its key
return bn_get_le_and_key(bn, 0, leftmost_key);
}
}
return nullptr;
}
static LEAFENTRY ft_leaf_rightmost_le_and_key(FTNODE leaf, DBT *rightmost_key)
// Effect: If a rightmost key exists in the given leaf, toku_fill_dbt()
// the key into *rightmost_key
// Requires: Leaf is fully in memory and pinned for read or write.
// Return: leafentry if it exists, nullptr otherwise
{
for (int i = leaf->n_children - 1; i >= 0; i--) {
BASEMENTNODE bn = BLB(leaf, i);
size_t num_les = bn->data_buffer.num_klpairs();
if (num_les > 0) {
// Get the last (rightmost) leafentry and its key
return bn_get_le_and_key(bn, num_les - 1, rightmost_key);
}
}
return nullptr;
}
static int ft_leaf_get_relative_key_pos(FT ft, FTNODE leaf, const DBT *key, bool *nondeleted_key_found, int *target_childnum)
// Effect: Determines what the relative position of the given key is with
// respect to a leaf node, and if it exists.
// Requires: Leaf is fully in memory and pinned for read or write.
// Requires: target_childnum is non-null
// Return: < 0 if key is less than the leftmost key in the leaf OR the relative position is unknown, for any reason.
// 0 if key is in the bounds [leftmost_key, rightmost_key] for this leaf or the leaf is empty
// > 0 if key is greater than the rightmost key in the leaf
// *nondeleted_key_found is set (if non-null) if the target key was found and is not deleted, unmodified otherwise
// *target_childnum is set to the child that (does or would) contain the key, if calculated, unmodified otherwise
{
DBT rightmost_key;
LEAFENTRY rightmost_le = ft_leaf_rightmost_le_and_key(leaf, &rightmost_key);
if (rightmost_le == nullptr) {
// If we can't get a rightmost key then the leaf is empty.
// In such a case, we don't have any information about what keys would be in this leaf.
// We have to assume the leaf node that would contain this key is to the left.
return -1;
}
// We have a rightmost leafentry, so it must exist in some child node
invariant(leaf->n_children > 0);
int relative_pos = 0;
int c = ft_compare_keys(ft, key, &rightmost_key);
if (c > 0) {
relative_pos = 1;
*target_childnum = leaf->n_children - 1;
} else if (c == 0) {
if (nondeleted_key_found != nullptr && !le_latest_is_del(rightmost_le)) {
*nondeleted_key_found = true;
}
relative_pos = 0;
*target_childnum = leaf->n_children - 1;
} else {
// The key is less than the rightmost. It may still be in bounds if it's >= the leftmost.
DBT leftmost_key;
LEAFENTRY leftmost_le = ft_leaf_leftmost_le_and_key(leaf, &leftmost_key);
invariant_notnull(leftmost_le); // Must exist because a rightmost exists
c = ft_compare_keys(ft, key, &leftmost_key);
if (c > 0) {
if (nondeleted_key_found != nullptr) {
// The caller wants to know if a nondeleted key can be found.
LEAFENTRY target_le;
int childnum = toku_ftnode_which_child(leaf, key, &ft->cmp_descriptor, ft->compare_fun);
BASEMENTNODE bn = BLB(leaf, childnum);
struct msg_leafval_heaviside_extra extra = { ft->compare_fun, &ft->cmp_descriptor, key };
int r = bn->data_buffer.find_zero<decltype(extra), toku_msg_leafval_heaviside>(
extra,
&target_le,
nullptr, nullptr, nullptr
);
*target_childnum = childnum;
if (r == 0 && !le_latest_is_del(leftmost_le)) {
*nondeleted_key_found = true;
}
}
relative_pos = 0;
} else if (c == 0) {
if (nondeleted_key_found != nullptr && !le_latest_is_del(leftmost_le)) {
*nondeleted_key_found = true;
}
relative_pos = 0;
*target_childnum = 0;
} else {
relative_pos = -1;
}
}
return relative_pos;
}
static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val,
XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info);
static int getf_nothing(ITEMLEN, bytevec, ITEMLEN, bytevec, void *, bool);
static int ft_maybe_insert_into_rightmost_leaf(FT ft, DBT *key, DBT *val, XIDS message_xids, enum ft_msg_type type,
txn_gc_info *gc_info, bool unique)
// Effect: Pins the rightmost leaf node and attempts to do an insert.
// There are three reasons why we may not succeed.
// - The rightmost leaf is too full and needs a split.
// - The key to insert is not within the provable bounds of this leaf node.
// - The key is within bounds, but it already exists.
// Return: 0 if this function did insert, DB_KEYEXIST if a unique key constraint exists and
// some nondeleted leafentry with the same key exists
// < 0 if this function did not insert, for a reason other than DB_KEYEXIST.
// Note: Treat this function as a possible, but not necessary, optimization for insert.
// Rationale: We want O(1) insertions down the rightmost path of the tree.
{
int r = -1;
uint32_t rightmost_fullhash;
BLOCKNUM rightmost_blocknum = ft->rightmost_blocknum;
FTNODE rightmost_leaf = nullptr;
// Don't do the optimization if our heurstic suggests that
// insertion pattern is not sequential.
if (ft->seqinsert_score < FT_SEQINSERT_SCORE_THRESHOLD) {
goto cleanup;
}
// We know the seqinsert score is high enough that we should
// attemp to directly insert into the right most leaf. Because
// the score is non-zero, the rightmost blocknum must have been
// set. See inject_message_in_locked_node(), which only increases
// the score if the target node blocknum == rightmost_blocknum
invariant(rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
// Pin the rightmost leaf with a write lock.
rightmost_fullhash = toku_cachetable_hash(ft->cf, rightmost_blocknum);
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, ft);
toku_pin_ftnode(ft, rightmost_blocknum, rightmost_fullhash, &bfe, PL_WRITE_CHEAP, &rightmost_leaf, true);
// The rightmost blocknum never chances once it is initialized to something
// other than null. Verify that the pinned node has the correct blocknum.
invariant(rightmost_leaf->thisnodename.b == rightmost_blocknum.b);
// If the rightmost leaf is reactive, bail out out and let the normal promotion pass
// take care of it. This also ensures that if any of our ancestors are reactive,
// they'll be taken care of too.
if (get_leaf_reactivity(rightmost_leaf, ft->h->nodesize) != RE_STABLE) {
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_REACTIVE, 1);
goto cleanup;
}
// The groundwork has been laid for an insertion directly into the rightmost
// leaf node. We know that it is pinned for write, fully in memory, has
// no messages above it, and is not reactive.
//
// Now, two more things must be true for this insertion to actually happen:
// 1. The key to insert is within the bounds of this leafnode, or to the right.
// 2. If there is a uniqueness constraint, it passes.
bool nondeleted_key_found;
int relative_pos;
int target_childnum;
nondeleted_key_found = false;
target_childnum = -1;
relative_pos = ft_leaf_get_relative_key_pos(ft, rightmost_leaf, key,
unique ? &nondeleted_key_found : nullptr,
&target_childnum);
if (relative_pos >= 0) {
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_SUCCESS, 1);
if (unique && nondeleted_key_found) {
r = DB_KEYEXIST;
} else {
ft_insert_directly_into_leaf(ft, rightmost_leaf, target_childnum,
key, val, message_xids, type, gc_info);
r = 0;
}
} else {
STATUS_INC(FT_PRO_RIGHTMOST_LEAF_SHORTCUT_FAIL_POS, 1);
r = -1;
}
cleanup:
// If we did the insert, the rightmost leaf was unpinned for us.
if (r != 0 && rightmost_leaf != nullptr) {
toku_unpin_ftnode(ft, rightmost_leaf);
}
return r;
}
static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type);
int toku_ft_insert_unique(FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool do_logging) {
// Effect: Insert a unique key-val pair into the fractal tree.
// Return: 0 on success, DB_KEYEXIST if the overwrite constraint failed
XIDS message_xids = txn != nullptr ? toku_txn_get_xids(txn) : xids_get_root_xids();
TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
txn_manager_state txn_state_for_gc(txn_manager);
TXNID oldest_referenced_xid_estimate = toku_ft_get_oldest_referenced_xid_estimate(ft_h);
txn_gc_info gc_info(&txn_state_for_gc,
oldest_referenced_xid_estimate,
// no messages above us, we can implicitly promote uxrs based on this xid
oldest_referenced_xid_estimate,
true);
int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, true);
if (r != 0 && r != DB_KEYEXIST) {
// Default to a regular unique check + insert algorithm if we couldn't
// do it based on the rightmost leaf alone.
int lookup_r = toku_ft_lookup(ft_h, key, getf_nothing, nullptr);
if (lookup_r == DB_NOTFOUND) {
toku_ft_send_insert(ft_h, key, val, message_xids, FT_INSERT, &gc_info);
r = 0;
} else {
r = DB_KEYEXIST;
}
}
if (r == 0) {
ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, FT_INSERT);
}
return r;
}
// Effect: Insert the key-val pair into an ft.
void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) { void toku_ft_insert (FT_HANDLE ft_handle, DBT *key, DBT *val, TOKUTXN txn) {
toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT); toku_ft_maybe_insert(ft_handle, key, val, txn, false, ZERO_LSN, true, FT_INSERT);
} }
...@@ -3373,32 +3655,38 @@ TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h) { ...@@ -3373,32 +3655,38 @@ TXNID toku_ft_get_oldest_referenced_xid_estimate(FT_HANDLE ft_h) {
return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE; return txn_manager != nullptr ? toku_txn_manager_get_oldest_referenced_xid_estimate(txn_manager) : TXNID_NONE;
} }
void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) { static void ft_txn_log_insert(FT ft, DBT *key, DBT *val, TOKUTXN txn, bool do_logging, enum ft_msg_type type) {
paranoid_invariant(type==FT_INSERT || type==FT_INSERT_NO_OVERWRITE); paranoid_invariant(type == FT_INSERT || type == FT_INSERT_NO_OVERWRITE);
XIDS message_xids = xids_get_root_xids(); //By default use committed messages
//By default use committed messages
TXNID_PAIR xid = toku_txn_get_txnid(txn); TXNID_PAIR xid = toku_txn_get_txnid(txn);
if (txn) { if (txn) {
BYTESTRING keybs = {key->size, (char *) key->data}; BYTESTRING keybs = {key->size, (char *) key->data};
toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs); toku_logger_save_rollback_cmdinsert(txn, toku_cachefile_filenum(ft->cf), &keybs);
toku_txn_maybe_note_ft(txn, ft_h->ft); toku_txn_maybe_note_ft(txn, ft);
message_xids = toku_txn_get_xids(txn);
} }
TOKULOGGER logger = toku_txn_logger(txn); TOKULOGGER logger = toku_txn_logger(txn);
if (do_logging && logger) { if (do_logging && logger) {
BYTESTRING keybs = {.len=key->size, .data=(char *) key->data}; BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
BYTESTRING valbs = {.len=val->size, .data=(char *) val->data}; BYTESTRING valbs = {.len=val->size, .data=(char *) val->data};
if (type == FT_INSERT) { if (type == FT_INSERT) {
toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); toku_log_enq_insert(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
} }
else { else {
toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft_h->ft->cf), xid, keybs, valbs); toku_log_enq_insert_no_overwrite(logger, (LSN*)0, 0, txn, toku_cachefile_filenum(ft->cf), xid, keybs, valbs);
} }
} }
}
void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool oplsn_valid, LSN oplsn, bool do_logging, enum ft_msg_type type) {
ft_txn_log_insert(ft_h->ft, key, val, txn, do_logging, type);
LSN treelsn; LSN treelsn;
if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) { if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
// do nothing // do nothing
} else { } else {
XIDS message_xids = txn ? toku_txn_get_xids(txn) : xids_get_root_xids();
TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h); TXN_MANAGER txn_manager = toku_ft_get_txn_manager(ft_h);
txn_manager_state txn_state_for_gc(txn_manager); txn_manager_state txn_state_for_gc(txn_manager);
...@@ -3408,10 +3696,26 @@ void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool ...@@ -3408,10 +3696,26 @@ void toku_ft_maybe_insert (FT_HANDLE ft_h, DBT *key, DBT *val, TOKUTXN txn, bool
// no messages above us, we can implicitly promote uxrs based on this xid // no messages above us, we can implicitly promote uxrs based on this xid
oldest_referenced_xid_estimate, oldest_referenced_xid_estimate,
txn != nullptr ? !txn->for_recovery : false); txn != nullptr ? !txn->for_recovery : false);
toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info); int r = ft_maybe_insert_into_rightmost_leaf(ft_h->ft, key, val, message_xids, FT_INSERT, &gc_info, false);
if (r != 0) {
toku_ft_send_insert(ft_h, key, val, message_xids, type, &gc_info);
}
} }
} }
static void ft_insert_directly_into_leaf(FT ft, FTNODE leaf, int target_childnum, DBT *key, DBT *val,
XIDS message_xids, enum ft_msg_type type, txn_gc_info *gc_info)
// Effect: Insert directly into a leaf node a fractal tree. Does not do any logging.
// Requires: Leaf is fully in memory and pinned for write.
// Requires: If this insertion were to happen through the root node, the promotion
// algorithm would have selected the given leaf node as the point of injection.
// That means this function relies on the current implementation of promotion.
{
FT_MSG_S ftcmd = { type, ZERO_MSN, message_xids, .u = { .id = { key, val } } };
size_t flow_deltas[] = { 0, 0 };
inject_message_in_locked_node(ft, leaf, target_childnum, &ftcmd, flow_deltas, gc_info);
}
static void static void
ft_send_update_msg(FT_HANDLE ft_h, FT_MSG_S *msg, TOKUTXN txn) { ft_send_update_msg(FT_HANDLE ft_h, FT_MSG_S *msg, TOKUTXN txn) {
msg->xids = (txn msg->xids = (txn
......
...@@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void ...@@ -213,6 +213,9 @@ int toku_ft_lookup (FT_HANDLE ft_h, DBT *k, FT_GET_CALLBACK_FUNCTION getf, void
// Effect: Insert a key and data pair into an ft // Effect: Insert a key and data pair into an ft
void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn); void toku_ft_insert (FT_HANDLE ft_h, DBT *k, DBT *v, TOKUTXN txn);
// Returns: 0 if the key was inserted, DB_KEYEXIST if the key already exists
int toku_ft_insert_unique(FT_HANDLE ft, DBT *k, DBT *v, TOKUTXN txn, bool do_logging);
// Effect: Optimize the ft // Effect: Optimize the ft
void toku_ft_optimize (FT_HANDLE ft_h); void toku_ft_optimize (FT_HANDLE ft_h);
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
#include "test.h"
#include <ft/ybt.h>
#include <ft/ft-cachetable-wrappers.h>
// Each FT maintains a sequential insert heuristic to determine if its
// worth trying to insert directly into a well-known rightmost leaf node.
//
// The heuristic is only maintained when a rightmost leaf node is known.
//
// This test verifies that sequential inserts increase the seqinsert score
// and that a single non-sequential insert resets the score.
static void test_seqinsert_heuristic(void) {
int r = 0;
char name[TOKU_PATH_MAX + 1];
toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
toku_os_recursive_delete(TOKU_TEST_FILENAME);
r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
FT_HANDLE ft_handle;
CACHETABLE ct;
toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
r = toku_open_ft_handle(name, 1, &ft_handle,
4*1024*1024, 64*1024,
TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
toku_builtin_compare_fun); CKERR(r);
FT ft = ft_handle->ft;
int k;
DBT key, val;
const int val_size = 1024 * 1024;
char *XMALLOC_N(val_size, val_buf);
memset(val_buf, 'x', val_size);
toku_fill_dbt(&val, val_buf, val_size);
// Insert many rows sequentially. This is enough data to:
// - force the root to split (the righmost leaf will then be known)
// - raise the seqinsert score high enough to enable direct rightmost injections
const int rows_to_insert = 200;
for (int i = 0; i < rows_to_insert; i++) {
k = toku_htonl(i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
}
invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
invariant(ft->seqinsert_score == FT_SEQINSERT_SCORE_THRESHOLD);
// Insert on the left extreme. The seq insert score is high enough
// that we will attempt to insert into the rightmost leaf. We won't
// be successful because key 0 won't be in the bounds of the rightmost leaf.
// This failure should reset the seqinsert score back to 0.
k = toku_htonl(0);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
// Insert in the middle. The score should not go up.
k = toku_htonl(rows_to_insert / 2);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
// Insert on the right extreme. The score should go up.
k = toku_htonl(rows_to_insert);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 1);
// Insert again on the right extreme again, the score should go up.
k = toku_htonl(rows_to_insert + 1);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 2);
// Insert close to, but not at, the right extreme. The score should reset.
// -- the magic number 4 derives from the fact that vals are 1mb and nodes are 4mb
k = toku_htonl(rows_to_insert - 4);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
invariant(ft->seqinsert_score == 0);
toku_free(val_buf);
toku_ft_handle_close(ft_handle);
toku_cachetable_close(&ct);
toku_os_recursive_delete(TOKU_TEST_FILENAME);
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_seqinsert_heuristic();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2014 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2014 Tokutek Inc. All rights reserved."
#include "test.h"
#include <ft/ybt.h>
#include <ft/ft-cachetable-wrappers.h>
// Promotion tracks the rightmost blocknum in the FT when a message
// is successfully promoted to a non-root leaf node on the right extreme.
//
// This test verifies that a split or merge of the rightmost leaf properly
// maintains the rightmost blocknum (which is constant - the pair's swap values,
// like the root blocknum).
static void test_split_merge(void) {
int r = 0;
char name[TOKU_PATH_MAX + 1];
toku_path_join(name, 2, TOKU_TEST_FILENAME, "ftdata");
toku_os_recursive_delete(TOKU_TEST_FILENAME);
r = toku_os_mkdir(TOKU_TEST_FILENAME, S_IRWXU); CKERR(r);
FT_HANDLE ft_handle;
CACHETABLE ct;
toku_cachetable_create(&ct, 0, ZERO_LSN, NULL_LOGGER);
r = toku_open_ft_handle(name, 1, &ft_handle,
4*1024*1024, 64*1024,
TOKU_DEFAULT_COMPRESSION_METHOD, ct, NULL,
toku_builtin_compare_fun); CKERR(r);
// We have a root blocknum, but no rightmost blocknum yet.
FT ft = ft_handle->ft;
invariant(ft->h->root_blocknum.b != RESERVED_BLOCKNUM_NULL);
invariant(ft->rightmost_blocknum.b == RESERVED_BLOCKNUM_NULL);
int k;
DBT key, val;
const int val_size = 1 * 1024 * 1024;
char *XMALLOC_N(val_size, val_buf);
memset(val_buf, 'x', val_size);
toku_fill_dbt(&val, val_buf, val_size);
// Insert 16 rows (should induce a few splits)
const int rows_to_insert = 16;
for (int i = 0; i < rows_to_insert; i++) {
k = toku_htonl(i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_insert(ft_handle, &key, &val, NULL);
}
// rightmost blocknum should be set, because the root split and promotion
// did a rightmost insertion directly into the rightmost leaf, lazily
// initializing the rightmost blocknum.
invariant(ft->rightmost_blocknum.b != RESERVED_BLOCKNUM_NULL);
BLOCKNUM root_blocknum = ft->h->root_blocknum;
FTNODE root_node;
struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, ft);
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, ft->h->root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// root blocknum should be consistent
invariant(root_node->thisnodename.b == ft->h->root_blocknum.b);
// root should have split at least once, and it should now be at height 1
invariant(root_node->n_children > 1);
invariant(root_node->height == 1);
// rightmost blocknum should no longer be the root, since the root split
invariant(ft->h->root_blocknum.b != ft->rightmost_blocknum.b);
// the right child should have the rightmost blocknum
invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
BLOCKNUM rightmost_blocknum_before_merge = ft->rightmost_blocknum;
const int num_children_before_merge = root_node->n_children;
// delete the last 6 rows.
// - 1mb each, so 6mb deleted
// - should be enough to delete the entire rightmost leaf + some of its neighbor
const int rows_to_delete = 6;
toku_unpin_ftnode(ft, root_node);
for (int i = 0; i < rows_to_delete; i++) {
k = toku_htonl(rows_to_insert - i);
toku_fill_dbt(&key, &k, sizeof(k));
toku_ft_delete(ft_handle, &key, NULL);
}
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// - rightmost leaf should be fusible after those deletes (which were promoted directly to the leaf)
FTNODE rightmost_leaf;
toku_pin_ftnode(ft, rightmost_blocknum_before_merge,
toku_cachetable_hash(ft->cf, rightmost_blocknum_before_merge),
&bfe, PL_WRITE_EXPENSIVE, &rightmost_leaf, true);
invariant(get_node_reactivity(ft, rightmost_leaf) == RE_FUSIBLE);
toku_unpin_ftnode(ft, rightmost_leaf);
// - merge the rightmost child now that it's fusible
toku_ft_merge_child(ft, root_node, root_node->n_children - 1);
toku_pin_ftnode(ft, root_blocknum,
toku_cachetable_hash(ft->cf, root_blocknum),
&bfe, PL_WRITE_EXPENSIVE, &root_node, true);
// the merge should have worked, and the root should still be at height 1
invariant(root_node->n_children < num_children_before_merge);
invariant(root_node->height == 1);
// the rightmost child of the root has the rightmost blocknum
invariant(BP_BLOCKNUM(root_node, root_node->n_children - 1).b == ft->rightmost_blocknum.b);
// the value for rightmost blocknum itself should not have changed
// (we keep it constant, like the root blocknum)
invariant(rightmost_blocknum_before_merge.b == ft->rightmost_blocknum.b);
toku_unpin_ftnode(ft, root_node);
toku_free(val_buf);
toku_ft_handle_close(ft_handle);
toku_cachetable_close(&ct);
toku_os_recursive_delete(TOKU_TEST_FILENAME);
}
int test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_split_merge();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
/*
COPYING CONDITIONS NOTICE:
This program is free software; you can redistribute it and/or modify
it under the terms of version 2 of the GNU General Public License as
published by the Free Software Foundation, and provided that the
following conditions are met:
* Redistributions of source code must retain this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below).
* Redistributions in binary form must reproduce this COPYING
CONDITIONS NOTICE, the COPYRIGHT NOTICE (below), the
DISCLAIMER (below), the UNIVERSITY PATENT NOTICE (below), the
PATENT MARKING NOTICE (below), and the PATENT RIGHTS
GRANT (below) in the documentation and/or other materials
provided with the distribution.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.
COPYRIGHT NOTICE:
TokuDB, Tokutek Fractal Tree Indexing Library.
Copyright (C) 2007-2013 Tokutek, Inc.
DISCLAIMER:
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
UNIVERSITY PATENT NOTICE:
The technology is licensed by the Massachusetts Institute of
Technology, Rutgers State University of New Jersey, and the Research
Foundation of State University of New York at Stony Brook under
United States of America Serial No. 11/760379 and to the patents
and/or patent applications resulting from it.
PATENT MARKING NOTICE:
This software is covered by US Patent No. 8,185,551.
This software is covered by US Patent No. 8,489,638.
PATENT RIGHTS GRANT:
"THIS IMPLEMENTATION" means the copyrightable works distributed by
Tokutek as part of the Fractal Tree project.
"PATENT CLAIMS" means the claims of patents that are owned or
licensable by Tokutek, both currently or in the future; and that in
the absence of this license would be infringed by THIS
IMPLEMENTATION or by using or running THIS IMPLEMENTATION.
"PATENT CHALLENGE" shall mean a challenge to the validity,
patentability, enforceability and/or non-infringement of any of the
PATENT CLAIMS or otherwise opposing any of the PATENT CLAIMS.
Tokutek hereby grants to you, for the term and geographical scope of
the PATENT CLAIMS, a non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to
make, have made, use, offer to sell, sell, import, transfer, and
otherwise run, modify, and propagate the contents of THIS
IMPLEMENTATION, where such license applies only to the PATENT
CLAIMS. This grant does not include claims that would be infringed
only as a consequence of further modifications of THIS
IMPLEMENTATION. If you or your agent or licensee institute or order
or agree to the institution of patent litigation against any entity
(including a cross-claim or counterclaim in a lawsuit) alleging that
THIS IMPLEMENTATION constitutes direct or contributory patent
infringement, or inducement of patent infringement, then any rights
granted to you under this License shall terminate as of the date
such litigation is filed. If you or your agent or exclusive
licensee institute or order or agree to the institution of a PATENT
CHALLENGE, then Tokutek may terminate any rights granted to you
under this License.
*/
#ident "Copyright (c) 2007-2013 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
/**
* Test that unique inserts work correctly. This exercises the rightmost leaf inject optimization.
*/
#include <portability/toku_random.h>
#include "test.h"
static char random_buf[8];
static struct random_data random_data;
static void test_simple_unique_insert(DB_ENV *env) {
int r;
DB *db;
r = db_create(&db, env, 0); CKERR(r);
r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r);
DBT key1, key2, key3;
dbt_init(&key1, "a", sizeof("a"));
dbt_init(&key2, "b", sizeof("b"));
dbt_init(&key3, "c", sizeof("c"));
r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR(r);
r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR(r);
r = db->put(db, NULL, &key3, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR(r);
r = db->put(db, NULL, &key2, &key2, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
// sanity check
r = db->put(db, NULL, &key1, &key1, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
r = db->put(db, NULL, &key1, &key3, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
r = db->close(db, 0); CKERR(r);
r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r);
}
static void test_large_sequential_insert_unique(DB_ENV *env) {
int r;
DB *db;
r = db_create(&db, env, 0); CKERR(r);
// very small nodes/basements to make a taller tree
r = db->set_pagesize(db, 8 * 1024); CKERR(r);
r = db->set_readpagesize(db, 2 * 1024); CKERR(r);
r = db->open(db, NULL, "db", NULL, DB_BTREE, DB_CREATE, 0644); CKERR(r);
const int val_size = 1024;
char *XMALLOC_N(val_size, val_buf);
memset(val_buf, 'k', val_size);
DBT val;
dbt_init(&val, val_buf, val_size);
// grow a tree to about depth 3, taking sanity checks along the way
const int start_num_rows = (64 * 1024 * 1024) / val_size;
for (int i = 0; i < start_num_rows; i++) {
DBT key;
int k = toku_htonl(i);
dbt_init(&key, &k, sizeof(k));
r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r);
if (i % 50 == 0) {
// sanity check - should not be able to insert this key twice in a row
r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
// .. but re-inserting is okay, if we provisionally deleted the row
DB_TXN *txn;
r = env->txn_begin(env, NULL, &txn, 0); CKERR(r);
r = db->del(db, NULL, &key, DB_DELETE_ANY); CKERR(r);
r = db->put(db, NULL, &key, &val, DB_NOOVERWRITE); CKERR(r);
r = txn->commit(txn, 0); CKERR(r);
}
if (i > 0 && i % 250 == 0) {
// sanity check - unique checks on random keys we already inserted should
// fail (exercises middle-of-the-tree checks)
for (int check_i = 0; check_i < 4; check_i++) {
DBT rand_key;
int rand_k = toku_htonl(myrandom_r(&random_data) % i);
dbt_init(&rand_key, &rand_k, sizeof(rand_k));
r = db->put(db, NULL, &rand_key, &val, DB_NOOVERWRITE); CKERR2(r, DB_KEYEXIST);
}
}
}
toku_free(val_buf);
r = db->close(db, 0); CKERR(r);
r = env->dbremove(env, NULL, "db", NULL, 0); CKERR(r);
}
int test_main(int argc, char * const argv[]) {
default_parse_args(argc, argv);
int r;
const int envflags = DB_INIT_MPOOL | DB_CREATE | DB_THREAD |
DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_TXN | DB_PRIVATE;
// startup
DB_ENV *env;
toku_os_recursive_delete(TOKU_TEST_FILENAME);
r = toku_os_mkdir(TOKU_TEST_FILENAME, 0755); CKERR(r);
r = db_env_create(&env, 0); CKERR(r);
r = env->open(env, TOKU_TEST_FILENAME, envflags, 0755);
r = myinitstate_r(random(), random_buf, 8, &random_data); CKERR(r);
test_simple_unique_insert(env);
test_large_sequential_insert_unique(env);
// cleanup
r = env->close(env, 0); CKERR(r);
return 0;
}
...@@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) { ...@@ -253,6 +253,30 @@ toku_db_del(DB *db, DB_TXN *txn, DBT *key, uint32_t flags, bool holds_mo_lock) {
return r; return r;
} }
static int
db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, int flags, bool do_log) {
int r = 0;
bool unique = false;
enum ft_msg_type type = FT_INSERT;
if (flags == DB_NOOVERWRITE) {
unique = true;
} else if (flags == DB_NOOVERWRITE_NO_ERROR) {
type = FT_INSERT_NO_OVERWRITE;
} else if (flags != 0) {
// All other non-zero flags are unsupported
r = EINVAL;
}
if (r == 0) {
TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : nullptr;
if (unique) {
r = toku_ft_insert_unique(db->i->ft_handle, key, val, ttxn, do_log);
} else {
toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, do_log, type);
}
invariant(r == DB_KEYEXIST || r == 0);
}
return r;
}
int int
toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) { toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_mo_lock) {
...@@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_ ...@@ -265,25 +289,16 @@ toku_db_put(DB *db, DB_TXN *txn, DBT *key, DBT *val, uint32_t flags, bool holds_
flags &= ~lock_flags; flags &= ~lock_flags;
r = db_put_check_size_constraints(db, key, val); r = db_put_check_size_constraints(db, key, val);
if (r == 0) {
//Do any checking required by the flags. //Do locking if necessary.
r = db_put_check_overwrite_constraint(db, txn, key, lock_flags, flags);
}
//Do locking if necessary. Do not grab the lock again if this DB had a unique
//check performed because the lock was already grabbed by its cursor callback.
bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)); bool do_locking = (bool)(db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE));
if (r == 0 && do_locking && !(flags & DB_NOOVERWRITE)) { if (r == 0 && do_locking) {
r = toku_db_get_point_write_lock(db, txn, key); r = toku_db_get_point_write_lock(db, txn, key);
} }
if (r == 0) { if (r == 0) {
//Insert into the ft. //Insert into the ft.
TOKUTXN ttxn = txn ? db_txn_struct_i(txn)->tokutxn : NULL;
enum ft_msg_type type = FT_INSERT;
if (flags==DB_NOOVERWRITE_NO_ERROR) {
type = FT_INSERT_NO_OVERWRITE;
}
if (!holds_mo_lock) toku_multi_operation_client_lock(); if (!holds_mo_lock) toku_multi_operation_client_lock();
toku_ft_maybe_insert(db->i->ft_handle, key, val, ttxn, false, ZERO_LSN, true, type); r = db_put(db, txn, key, val, flags, true);
if (!holds_mo_lock) toku_multi_operation_client_unlock(); if (!holds_mo_lock) toku_multi_operation_client_unlock();
} }
...@@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val ...@@ -635,9 +650,11 @@ log_put_multiple(DB_TXN *txn, DB *src_db, const DBT *src_key, const DBT *src_val
} }
} }
// Requires: If remaining_flags is non-null, this function performs any required uniqueness checks
// Otherwise, the caller is responsible.
static int static int
do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], DB *src_db, const DBT *src_key, bool indexer_shortcut) { do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], DBT_ARRAY vals[], uint32_t *remaining_flags, DB *src_db, const DBT *src_key, bool indexer_shortcut) {
TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn; int r = 0;
for (uint32_t which_db = 0; which_db < num_dbs; which_db++) { for (uint32_t which_db = 0; which_db < num_dbs; which_db++) {
DB *db = db_array[which_db]; DB *db = db_array[which_db];
...@@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[], ...@@ -666,16 +683,21 @@ do_put_multiple(DB_TXN *txn, uint32_t num_dbs, DB *db_array[], DBT_ARRAY keys[],
} }
if (do_put) { if (do_put) {
for (uint32_t i = 0; i < keys[which_db].size; i++) { for (uint32_t i = 0; i < keys[which_db].size; i++) {
// if db is being indexed by an indexer, then put into that db if the src key is to the left or equal to the int flags = 0;
// indexers cursor. we have to get the src_db from the indexer and find it in the db_array. if (remaining_flags != nullptr) {
toku_ft_maybe_insert(db->i->ft_handle, flags = remaining_flags[which_db];
&keys[which_db].dbts[i], &vals[which_db].dbts[i], invariant(!(flags & DB_NOOVERWRITE_NO_ERROR));
ttxn, false, ZERO_LSN, false, FT_INSERT); }
r = db_put(db, txn, &keys[which_db].dbts[i], &vals[which_db].dbts[i], flags, false);
if (r != 0) {
goto done;
}
} }
} }
} }
} }
return 0; done:
return r;
} }
static int static int
...@@ -754,20 +776,14 @@ env_put_multiple_internal( ...@@ -754,20 +776,14 @@ env_put_multiple_internal(
r = db_put_check_size_constraints(db, &put_key, &put_val); r = db_put_check_size_constraints(db, &put_key, &put_val);
if (r != 0) goto cleanup; if (r != 0) goto cleanup;
//Check overwrite constraints
r = db_put_check_overwrite_constraint(db, txn,
&put_key,
lock_flags[which_db], remaining_flags[which_db]);
if (r != 0) goto cleanup;
if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) { if (remaining_flags[which_db] == DB_NOOVERWRITE_NO_ERROR) {
//put_multiple does not support delaying the no error, since we would //put_multiple does not support delaying the no error, since we would
//have to log the flag in the put_multiple. //have to log the flag in the put_multiple.
r = EINVAL; goto cleanup; r = EINVAL; goto cleanup;
} }
//Do locking if necessary. Do not grab the lock again if this DB had a unique //Do locking if necessary.
//check performed because the lock was already grabbed by its cursor callback. if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE)) {
if (db->i->lt && !(lock_flags[which_db] & DB_PRELOCKED_WRITE) && !(remaining_flags[which_db] & DB_NOOVERWRITE)) {
//Needs locking //Needs locking
r = toku_db_get_point_write_lock(db, txn, &put_key); r = toku_db_get_point_write_lock(db, txn, &put_key);
if (r != 0) goto cleanup; if (r != 0) goto cleanup;
...@@ -790,8 +806,10 @@ env_put_multiple_internal( ...@@ -790,8 +806,10 @@ env_put_multiple_internal(
} }
} }
toku_multi_operation_client_lock(); toku_multi_operation_client_lock();
log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts); r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, remaining_flags, src_db, src_key, indexer_shortcut);
r = do_put_multiple(txn, num_dbs, db_array, put_keys, put_vals, src_db, src_key, indexer_shortcut); if (r == 0) {
log_put_multiple(txn, src_db, src_key, src_val, num_dbs, fts);
}
toku_multi_operation_client_unlock(); toku_multi_operation_client_unlock();
if (indexer_lock_taken) { if (indexer_lock_taken) {
toku_indexer_unlock(indexer); toku_indexer_unlock(indexer);
...@@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn, ...@@ -1075,7 +1093,7 @@ env_update_multiple(DB_ENV *env, DB *src_db, DB_TXN *txn,
// recovery so we don't end up losing data. // recovery so we don't end up losing data.
// So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry. // So unlike env->put_multiple, we ONLY log a 'put_multiple' log entry.
log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts); log_put_multiple(txn, src_db, new_src_key, new_src_data, n_put_dbs, put_fts);
r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, src_db, new_src_key, indexer_shortcut); r = do_put_multiple(txn, n_put_dbs, put_dbs, put_key_arrays, put_val_arrays, nullptr, src_db, new_src_key, indexer_shortcut);
} }
toku_multi_operation_client_unlock(); toku_multi_operation_client_unlock();
if (indexer_lock_taken) { if (indexer_lock_taken) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment