Commit 7427e0da authored by Zardosht Kasheff, committed by Yoni Fogel

[t:3703], merge to main

git-svn-id: file:///svn/toku/tokudb@32737 c7de825b-a66e-492c-adef-691d508d4ae1
parent 0e07b35a
...@@ -140,7 +140,6 @@ struct brtnode_leaf_basement_node { ...@@ -140,7 +140,6 @@ struct brtnode_leaf_basement_node {
OMT buffer; OMT buffer;
unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */ unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */
unsigned int seqinsert; /* number of sequential inserts to this leaf */ unsigned int seqinsert; /* number of sequential inserts to this leaf */
MSN max_msn_applied;
}; };
#define PT_INVALID 0 #define PT_INVALID 0
...@@ -301,7 +300,6 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) { ...@@ -301,7 +300,6 @@ static inline void set_BSB(BRTNODE node, int i, SUB_BLOCK sb) {
#define BNC_BUFFER(node,i) (BNC(node,i)->buffer) #define BNC_BUFFER(node,i) (BNC(node,i)->buffer)
#define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer) #define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer)
#define BNC_NBYTESINBUF(node,i) (BNC(node,i)->n_bytes_in_buffer)
// leaf node macros // leaf node macros
#define BLB_OPTIMIZEDFORUPGRADE(node,i) (BLB(node,i)->optimized_for_upgrade) #define BLB_OPTIMIZEDFORUPGRADE(node,i) (BLB(node,i)->optimized_for_upgrade)
...@@ -614,7 +612,6 @@ brt_leaf_apply_cmd_once ( ...@@ -614,7 +612,6 @@ brt_leaf_apply_cmd_once (
TOKULOGGER logger, TOKULOGGER logger,
uint64_t *workdonep uint64_t *workdonep
); );
void brt_leaf_put_cmd (BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRT_MSG cmd, bool *made_change, uint64_t *workdonep);
void void
toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, uint64_t *workdonep); toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, bool *made_change, uint64_t *workdonep);
......
...@@ -804,7 +804,6 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) { ...@@ -804,7 +804,6 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
bn->n_bytes_in_buffer = 0; bn->n_bytes_in_buffer = 0;
bn->seqinsert = 0; bn->seqinsert = 0;
bn->optimized_for_upgrade = 0; bn->optimized_for_upgrade = 0;
bn->max_msn_applied = ZERO_MSN;
return bn; return bn;
} }
...@@ -1017,6 +1016,7 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) { ...@@ -1017,6 +1016,7 @@ setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe) {
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
BP_INIT_UNTOUCHED_CLOCK(node,i); BP_INIT_UNTOUCHED_CLOCK(node,i);
BP_STATE(node,i) = toku_brtnode_partition_state(bfe, i); BP_STATE(node,i) = toku_brtnode_partition_state(bfe, i);
BP_WORKDONE(node,i) = 0;
if (BP_STATE(node,i) == PT_AVAIL) { if (BP_STATE(node,i) == PT_AVAIL) {
setup_available_brtnode_partition(node, i); setup_available_brtnode_partition(node, i);
BP_TOUCH_CLOCK(node,i); BP_TOUCH_CLOCK(node,i);
......
...@@ -196,13 +196,22 @@ nonleaf_node_is_gorged (BRTNODE node) { ...@@ -196,13 +196,22 @@ nonleaf_node_is_gorged (BRTNODE node) {
bool buffers_are_empty = TRUE; bool buffers_are_empty = TRUE;
toku_assert_entire_node_in_memory(node); toku_assert_entire_node_in_memory(node);
//
// the nonleaf node is gorged if the following holds true:
// - the buffers are non-empty
// - the total workdone by the buffers PLUS the size of the buffers
// is greater than node->nodesize (which as of Maxwell should be
// 4MB)
//
assert(node->height > 0); assert(node->height > 0);
for (int child = 0; child < node->n_children; ++child) {
size += BP_WORKDONE(node, child);
}
for (int child = 0; child < node->n_children; ++child) { for (int child = 0; child < node->n_children; ++child) {
if (BNC_NBYTESINBUF(node, child) > 0) { if (BNC_NBYTESINBUF(node, child) > 0) {
buffers_are_empty = FALSE; buffers_are_empty = FALSE;
break; break;
} }
size += BP_WORKDONE(node, child);
} }
return ((size > node->nodesize) return ((size > node->nodesize)
&& &&
...@@ -255,7 +264,7 @@ static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnu ...@@ -255,7 +264,7 @@ static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnu
abort(); return 0; abort(); return 0;
} }
static void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool *made_change); static void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds);
static long brtnode_memory_size (BRTNODE node); static long brtnode_memory_size (BRTNODE node);
...@@ -281,8 +290,7 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash, ...@@ -281,8 +290,7 @@ int toku_pin_brtnode (BRT brt, BLOCKNUM blocknum, u_int32_t fullhash,
unlockers); unlockers);
if (r==0) { if (r==0) {
BRTNODE node = node_v; BRTNODE node = node_v;
bool made_change; maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, &made_change);
*node_p = node; *node_p = node;
// printf("%*sPin %ld\n", 8-node->height, "", blocknum.b); // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b);
} else { } else {
...@@ -313,8 +321,7 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha ...@@ -313,8 +321,7 @@ void toku_pin_brtnode_holding_lock (BRT brt, BLOCKNUM blocknum, u_int32_t fullha
); );
assert(r==0); assert(r==0);
BRTNODE node = node_v; BRTNODE node = node_v;
bool made_change; maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds);
maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, &made_change);
*node_p = node; *node_p = node;
} }
...@@ -1614,7 +1621,7 @@ struct setval_extra_s { ...@@ -1614,7 +1621,7 @@ struct setval_extra_s {
u_int32_t idx; u_int32_t idx;
LEAFENTRY le; LEAFENTRY le;
TOKULOGGER logger; TOKULOGGER logger;
int made_change; bool made_change;
uint64_t * workdonep; // set by brt_leaf_apply_cmd_once() uint64_t * workdonep; // set by brt_leaf_apply_cmd_once()
}; };
...@@ -1723,7 +1730,7 @@ static int do_update(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRT_MSG cmd, int id ...@@ -1723,7 +1730,7 @@ static int do_update(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRT_MSG cmd, int id
} }
// should be static, but used by test program(s) // should be static, but used by test program(s)
void static void
brt_leaf_put_cmd ( brt_leaf_put_cmd (
BRT t, BRT t,
BASEMENTNODE bn, BASEMENTNODE bn,
...@@ -1738,13 +1745,6 @@ brt_leaf_put_cmd ( ...@@ -1738,13 +1745,6 @@ brt_leaf_put_cmd (
{ {
uint64_t workdone_total = 0; // may be for one row or for many (or all) rows in leaf (if broadcast message) uint64_t workdone_total = 0; // may be for one row or for many (or all) rows in leaf (if broadcast message)
// ignore messages that have already been applied to this leaf
if (cmd->msn.msn <= bn->max_msn_applied.msn) {
// TODO3514 add accountability counter here
goto exit;
}
else bn->max_msn_applied = cmd->msn;
TOKULOGGER logger = toku_cachefile_logger(t->cf); TOKULOGGER logger = toku_cachefile_logger(t->cf);
LEAFENTRY storeddata; LEAFENTRY storeddata;
...@@ -1951,10 +1951,8 @@ brt_leaf_put_cmd ( ...@@ -1951,10 +1951,8 @@ brt_leaf_put_cmd (
// node->dirty = 1; // node->dirty = 1;
exit:
if (workdonep) if (workdonep)
*workdonep = workdone_total; *workdonep = workdone_total;
VERIFY_NODE(t, node);
return; return;
} }
...@@ -4922,8 +4920,8 @@ apply_buffer_messages_to_basement_node ( ...@@ -4922,8 +4920,8 @@ apply_buffer_messages_to_basement_node (
SUBTREE_EST se, SUBTREE_EST se,
BRTNODE ancestor, BRTNODE ancestor,
int childnum, int childnum,
struct pivot_bounds const * const bounds, MSN min_applied_msn,
bool *made_change struct pivot_bounds const * const bounds
) )
// Effect: For each messages in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node. // Effect: For each messages in ANCESTOR that is between lower_bound_exclusive (exclusive) and upper_bound_inclusive (inclusive), apply the message to the node.
// In ANCESTOR, the relevant messages are all in the buffer for child number CHILDNUM. // In ANCESTOR, the relevant messages are all in the buffer for child number CHILDNUM.
...@@ -4957,14 +4955,15 @@ apply_buffer_messages_to_basement_node ( ...@@ -4957,14 +4955,15 @@ apply_buffer_messages_to_basement_node (
({ ({
DBT hk; DBT hk;
toku_fill_dbt(&hk, key, keylen); toku_fill_dbt(&hk, key, keylen);
if (msn.msn > bn->max_msn_applied.msn && (!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) { if (msn.msn > min_applied_msn.msn && (!msg_type_has_key(type) || key_is_in_leaf_range(t, &hk, lbe_ptr, ubi_ptr))) {
DBT hv; DBT hv;
BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk, BRT_MSG_S brtcmd = { (enum brt_msg_type)type, msn, xids, .u.id = {&hk,
toku_fill_dbt(&hv, val, vallen)} }; toku_fill_dbt(&hv, val, vallen)} };
uint64_t workdone_this_leaf = 0; uint64_t workdone_this_leaf = 0;
bool made_change;
brt_leaf_put_cmd(t, brt_leaf_put_cmd(t,
bn, se, bn, se,
&brtcmd, made_change, &workdone_this_leaf); &brtcmd, &made_change, &workdone_this_leaf);
BP_WORKDONE(ancestor, childnum) += workdone_this_leaf; BP_WORKDONE(ancestor, childnum) += workdone_this_leaf;
workdone_this_leaf_total += workdone_this_leaf; workdone_this_leaf_total += workdone_this_leaf;
} }
...@@ -4977,6 +4976,7 @@ apply_buffer_messages_to_basement_node ( ...@@ -4977,6 +4976,7 @@ apply_buffer_messages_to_basement_node (
return r; return r;
} }
/*
//########### //###########
static void static void
maybe_flush_pinned_node(BRT t, BRTNODE node, int childnum, BRTNODE child) { maybe_flush_pinned_node(BRT t, BRTNODE node, int childnum, BRTNODE child) {
...@@ -5065,11 +5065,13 @@ maybe_flush_pinned_node(BRT t, BRTNODE node, int childnum, BRTNODE child) { ...@@ -5065,11 +5065,13 @@ maybe_flush_pinned_node(BRT t, BRTNODE node, int childnum, BRTNODE child) {
} }
} }
*/
/*
static void static void
apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SUBTREE_EST se, ANCESTORS ancestors, BRTNODE child, apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SUBTREE_EST se, ANCESTORS ancestors, BRTNODE child,
const struct pivot_bounds const *bounds, bool *made_change) MSN min_applied_msn,
const struct pivot_bounds const *bounds)
// Effect: Go through ancestors list applying messages from first ancestor (height one), then next, until // Effect: Go through ancestors list applying messages from first ancestor (height one), then next, until
// all messages have been applied. // all messages have been applied.
// Then mark the node as up_to_date. // Then mark the node as up_to_date.
...@@ -5086,17 +5088,18 @@ apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SU ...@@ -5086,17 +5088,18 @@ apply_ancestors_messages_to_leafnode_and_maybe_flush (BRT t, BASEMENTNODE bm, SU
// With background flushing we will be able to back to a simpler loop (since the recursion will be tail recursion). // With background flushing we will be able to back to a simpler loop (since the recursion will be tail recursion).
{ {
if (ancestors) { if (ancestors) {
apply_buffer_messages_to_basement_node(t, bm, se, ancestors->node, ancestors->childnum, bounds, made_change); apply_buffer_messages_to_basement_node(t, bm, se, ancestors->node, ancestors->childnum, min_applied_msn, bounds);
apply_ancestors_messages_to_leafnode_and_maybe_flush(t, bm, se, ancestors->next, ancestors->node, bounds, made_change); apply_ancestors_messages_to_leafnode_and_maybe_flush(t, bm, se, ancestors->next, ancestors->node, min_applied_msn, bounds);
maybe_flush_pinned_node(t, ancestors->node, ancestors->childnum, child); maybe_flush_pinned_node(t, ancestors->node, ancestors->childnum, child);
} else { } else {
// have just applied messages stored in root // have just applied messages stored in root
bm->soft_copy_is_up_to_date = true; bm->soft_copy_is_up_to_date = true;
} }
} }
*/
static void static void
maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, bool *made_change) maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Effect: // Effect:
// Bring a leaf node up-to-date according to all the messages in the ancestors. // Bring a leaf node up-to-date according to all the messages in the ancestors.
// If the leaf node is already up-to-date then do nothing. // If the leaf node is already up-to-date then do nothing.
...@@ -5119,22 +5122,20 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5119,22 +5122,20 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i); SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
ANCESTORS curr_ancestors = ancestors; ANCESTORS curr_ancestors = ancestors;
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds); struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
BRTNODE child = node;
while (curr_ancestors) { while (curr_ancestors) {
height++; height++;
apply_ancestors_messages_to_leafnode_and_maybe_flush( apply_buffer_messages_to_basement_node(
t, t,
curr_bn, curr_bn,
curr_se, curr_se,
curr_ancestors, curr_ancestors->node,
child, curr_ancestors->childnum,
&curr_bounds, node->max_msn_applied_to_node_on_disk,
made_change &curr_bounds
); );
if (curr_ancestors->node->max_msn_applied_to_node_in_memory.msn > node->max_msn_applied_to_node_in_memory.msn) { if (curr_ancestors->node->max_msn_applied_to_node_in_memory.msn > node->max_msn_applied_to_node_in_memory.msn) {
node->max_msn_applied_to_node_in_memory = curr_ancestors->node->max_msn_applied_to_node_in_memory; node->max_msn_applied_to_node_in_memory = curr_ancestors->node->max_msn_applied_to_node_in_memory;
} }
child = curr_ancestors->node;
curr_ancestors= curr_ancestors->next; curr_ancestors= curr_ancestors->next;
} }
BLB_SOFTCOPYISUPTODATE(node, i) = TRUE; BLB_SOFTCOPYISUPTODATE(node, i) = TRUE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment