diff --git a/newbrt/brt-cachetable-wrappers.c b/newbrt/brt-cachetable-wrappers.c index 64f738b9721a8e34f60826d53779339f6936d484..f75f11f202044676cbe8f2e11cabf357aeff5127 100644 --- a/newbrt/brt-cachetable-wrappers.c +++ b/newbrt/brt-cachetable-wrappers.c @@ -128,9 +128,11 @@ toku_pin_brtnode( const PIVOT_BOUNDS bounds, BRTNODE_FETCH_EXTRA bfe, BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this - BRTNODE *node_p) + BRTNODE *node_p, + BOOL* msgs_applied) { void *node_v; + *msgs_applied = FALSE; int r = toku_cachetable_get_and_pin_nonblocking( brt->cf, blocknum, @@ -146,7 +148,7 @@ toku_pin_brtnode( if (r==0) { BRTNODE node = node_v; if (apply_ancestor_messages) { - maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); + maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied); } *node_p = node; // printf("%*sPin %ld\n", 8-node->height, "", blocknum.b); @@ -183,7 +185,8 @@ toku_pin_brtnode_holding_lock( ); assert(r==0); BRTNODE node = node_v; - if (apply_ancestor_messages) maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds); + BOOL msgs_applied; + if (apply_ancestor_messages) maybe_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, &msgs_applied); *node_p = node; } diff --git a/newbrt/brt-cachetable-wrappers.h b/newbrt/brt-cachetable-wrappers.h index 847359a41f0c34bcfa1fcef5e9e6e40d6a17d808..8f96c093d7f1016ba1175fc8fcdc1057b9e70c61 100644 --- a/newbrt/brt-cachetable-wrappers.h +++ b/newbrt/brt-cachetable-wrappers.h @@ -72,7 +72,8 @@ toku_pin_brtnode( const PIVOT_BOUNDS pbounds, BRTNODE_FETCH_EXTRA bfe, BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this - BRTNODE *node_p + BRTNODE *node_p, + BOOL* msgs_applied ) __attribute__((__warn_unused_result__)); /** diff --git a/newbrt/brt-internal.h b/newbrt/brt-internal.h index 
2845d5df456492cc31756a01f895f2795eed83a5..42efdd7155da7cc2b9f5e20842ce3e5e68c9ff8d 100644 --- a/newbrt/brt-internal.h +++ b/newbrt/brt-internal.h @@ -706,7 +706,7 @@ struct pivot_bounds { }; // FIXME needs toku prefix -void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds); +void maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, BOOL* msgs_applied); int toku_brt_search_which_child( diff --git a/newbrt/brt.c b/newbrt/brt.c index 50125ebcf56ad274f431267f100b1e3d8541c8f0..0ae9f846c0b385a8ff7aae2451aa305a110d7058 100644 --- a/newbrt/brt.c +++ b/newbrt/brt.c @@ -4777,7 +4777,8 @@ bnc_apply_messages_to_basement_node( BASEMENTNODE bn, // where to apply messages BRTNODE ancestor, // the ancestor node where we can find messages to apply int childnum, // which child buffer of ancestor contains messages we want - struct pivot_bounds const * const bounds // contains pivot key bounds of this basement node + struct pivot_bounds const * const bounds, // contains pivot key bounds of this basement node + BOOL* msgs_applied ) { int r; @@ -4830,6 +4831,7 @@ bnc_apply_messages_to_basement_node( // Apply the messages in MSN order. for (int i = 0; i < buffer_size; ++i) { + *msgs_applied = TRUE; const struct fifo_entry *entry = toku_fifo_get_entry(bnc->buffer, offsets[i]); do_brt_leaf_put_cmd(t, leafnode, bn, ancestor, childnum, entry); } @@ -4838,12 +4840,13 @@ bnc_apply_messages_to_basement_node( } else if (stale_lbi == stale_ube) { // No stale messages to apply, we just apply fresh messages. 
struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .leafnode = leafnode, .bn = bn, .ancestor = ancestor, .childnum = childnum }; - + if (fresh_ube - fresh_lbi > 0) *msgs_applied = TRUE; r = toku_omt_iterate_on_range(bnc->fresh_message_tree, fresh_lbi, fresh_ube, iterate_do_brt_leaf_put_cmd, &iter_extra); assert_zero(r); } else if (fresh_lbi == fresh_ube) { // No fresh messages to apply, we just apply stale messages. + if (stale_ube - stale_lbi > 0) *msgs_applied = TRUE; struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .leafnode = leafnode, .bn = bn, .ancestor = ancestor, .childnum = childnum }; r = toku_omt_iterate_on_range(bnc->stale_message_tree, stale_lbi, stale_ube, iterate_do_brt_leaf_put_cmd, &iter_extra); @@ -4867,6 +4870,7 @@ bnc_apply_messages_to_basement_node( // Iterate over both lists, applying the smaller (in (key, msn) // order) message at each step while (stale_i < stale_ube && fresh_i < fresh_ube) { + *msgs_applied = TRUE; const long stale_offset = (long) stale_v; const long fresh_offset = (long) fresh_v; int c = toku_fifo_entry_key_msn_cmp(&extra, &stale_offset, &fresh_offset); @@ -4944,7 +4948,7 @@ bnc_apply_messages_to_basement_node( } void -maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds) +maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds, BOOL* msgs_applied) // Effect: // Bring a leaf node up-to-date according to all the messages in the ancestors. // If the leaf node is already up-to-date then do nothing. @@ -4975,7 +4979,8 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors curr_bn, curr_ancestors->node, curr_ancestors->childnum, - &curr_bounds + &curr_bounds, + msgs_applied ); // We don't want to check this ancestor node again if the // next time we query it, the msn hasn't changed. 
@@ -5186,6 +5191,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso struct unlock_brtnode_extra { BRT brt; BRTNODE node; + BOOL msgs_applied; }; // When this is called, the cachetable lock is held static void @@ -5194,7 +5200,13 @@ unlock_brtnode_fun (void *v) { BRT brt = x->brt; BRTNODE node = x->node; // CT lock is held - int r = toku_cachetable_unpin_ct_prelocked_no_flush(brt->cf, node->thisnodename, node->fullhash, (enum cachetable_dirty) node->dirty, make_brtnode_pair_attr(node)); + int r = toku_cachetable_unpin_ct_prelocked_no_flush( + brt->cf, + node->thisnodename, + node->fullhash, + (enum cachetable_dirty) node->dirty, + x->msgs_applied ? make_brtnode_pair_attr(node) : make_invalid_pair_attr() + ); assert(r==0); } @@ -5221,24 +5233,24 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ brtcursor->right_is_pos_infty, brtcursor->disable_prefetching ); + BOOL msgs_applied = FALSE; { int rr = toku_pin_brtnode(brt, childblocknum, fullhash, unlockers, &next_ancestors, bounds, &bfe, TRUE, - &childnode); + &childnode, + &msgs_applied); if (rr==TOKUDB_TRY_AGAIN) return rr; assert(rr==0); } - struct unlock_brtnode_extra unlock_extra = {brt,childnode}; + struct unlock_brtnode_extra unlock_extra = {brt,childnode,msgs_applied}; struct unlockers next_unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, unlockers}; int r = brt_search_node(brt, childnode, search, bfe.child_to_read, getf, getf_v, doprefetch, brtcursor, &next_unlockers, &next_ancestors, bounds, can_bulk_fetch); if (r!=TOKUDB_TRY_AGAIN) { - // Even if r is reactive, we want to handle the maybe reactive child. 
- #if TOKU_DO_PREFETCH // maybe prefetch the next child if (r == 0 && node->height == 1) { @@ -5247,7 +5259,12 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ #endif assert(next_unlockers.locked); - toku_unpin_brtnode_read_only(brt, childnode); // unpin the childnode before handling the reactive child (because that may make the childnode disappear.) + if (msgs_applied) { + toku_unpin_brtnode(brt, childnode); + } + else { + toku_unpin_brtnode_read_only(brt, childnode); + } } else { // try again. @@ -5258,7 +5275,12 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ // some piece of a node that it needed was not in memory. In this case, // the node was not unpinned, so we unpin it here if (next_unlockers.locked) { - toku_unpin_brtnode_read_only(brt, childnode); + if (msgs_applied) { + toku_unpin_brtnode(brt, childnode); + } + else { + toku_unpin_brtnode_read_only(brt, childnode); + } } } @@ -5557,7 +5579,7 @@ try_again: //uint tree_height = node->height + 1; // How high is the tree? This is the height of the root node plus one (leaf is at height 0). 
- struct unlock_brtnode_extra unlock_extra = {brt,node}; + struct unlock_brtnode_extra unlock_extra = {brt,node,FALSE}; struct unlockers unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL}; { @@ -6061,11 +6083,13 @@ toku_brt_keyrange_internal (BRT brt, BRTNODE node, BLOCKNUM childblocknum = BP_BLOCKNUM(node, child_number); u_int32_t fullhash = compute_child_fullhash(brt->cf, node, child_number); BRTNODE childnode; - r = toku_pin_brtnode(brt, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, FALSE, &childnode); + BOOL msgs_applied = FALSE; + r = toku_pin_brtnode(brt, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, FALSE, &childnode, &msgs_applied); + assert(!msgs_applied); if (r != TOKUDB_TRY_AGAIN) { assert(r == 0); - struct unlock_brtnode_extra unlock_extra = {brt,childnode}; + struct unlock_brtnode_extra unlock_extra = {brt,childnode,FALSE}; struct unlockers next_unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, unlockers}; const struct pivot_bounds next_bounds = next_pivot_keys(node, child_number, bounds); @@ -6119,7 +6143,7 @@ toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less_p, u_int64_t *equal_p, u_i toku_brtheader_release_treelock(brt->h); } - struct unlock_brtnode_extra unlock_extra = {brt,node}; + struct unlock_brtnode_extra unlock_extra = {brt,node,FALSE}; struct unlockers unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL}; { diff --git a/newbrt/tests/orthopush-flush.c b/newbrt/tests/orthopush-flush.c index 13f887c785f3b9ad9ab96722641e683d6fd6e3ec..61378d63c411cdd37b1fea5c9f2dd7ab6ca1bc50 100644 --- a/newbrt/tests/orthopush-flush.c +++ b/newbrt/tests/orthopush-flush.c @@ -620,7 +620,8 @@ flush_to_leaf(BRT t, bool make_leaf_up_to_date, bool use_flush) { parentnode->max_msn_applied_to_node_on_disk = max_parent_msn; struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL }; const struct pivot_bounds infinite_bounds = { 
.lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL }; - maybe_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds); + BOOL msgs_applied = FALSE; /* out-param: a BOOL (not BOOL*), since its address is passed; callee sets it to TRUE when it applies messages */ + maybe_apply_ancestors_messages_to_node(t, child, &ancestors, &infinite_bounds, &msgs_applied); FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, { @@ -840,7 +841,8 @@ flush_to_leaf_with_keyrange(BRT t, bool make_leaf_up_to_date) { parentnode->max_msn_applied_to_node_on_disk = max_parent_msn; struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL }; const struct pivot_bounds bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = kv_pair_malloc(childkeys[7].data, childkeys[7].size, NULL, 0) }; - maybe_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds); + BOOL msgs_applied = FALSE; /* out-param: callee only ever sets it to TRUE, so start from FALSE */ + maybe_apply_ancestors_messages_to_node(t, child, &ancestors, &bounds, &msgs_applied); FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, { @@ -1024,7 +1026,8 @@ compare_apply_and_flush(BRT t, bool make_leaf_up_to_date) { parentnode->max_msn_applied_to_node_on_disk = max_parent_msn; struct ancestors ancestors = { .node = parentnode, .childnum = 0, .next = NULL }; const struct pivot_bounds infinite_bounds = { .lower_bound_exclusive = NULL, .upper_bound_inclusive = NULL }; - maybe_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds); + BOOL msgs_applied = FALSE; /* out-param: callee only ever sets it to TRUE, so start from FALSE */ + maybe_apply_ancestors_messages_to_node(t, child2, &ancestors, &infinite_bounds, &msgs_applied); FIFO_ITERATE(parent_bnc->buffer, key, keylen, val, vallen, type, msn, xids, is_fresh, {