Commit 489e8c7b authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul Committed by Yoni Fogel

close[t:3994] {{{svn merge -r36763:36780 ../tokudb.3994c}}}. Closes #3994.

git-svn-id: file:///svn/toku/tokudb@36781 c7de825b-a66e-492c-adef-691d508d4ae1
parent 87b69d9f
...@@ -50,36 +50,6 @@ enum { BUFFER_HEADER_SIZE = (4 // height// ...@@ -50,36 +50,6 @@ enum { BUFFER_HEADER_SIZE = (4 // height//
+ TREE_FANOUT * 8 // children + TREE_FANOUT * 8 // children
) }; ) };
// Row statistics for a subtree: three 64-bit counters plus an exactness flag.
// The struct is declared packed, so no padding is inserted between the fields.
struct __attribute__((__packed__)) subtree_estimates {
// estimate number of rows in the tree by counting the number of rows
// in the leaves. The stuff in the internal nodes is likely to be off O(1).
u_int64_t nkeys; // number of distinct keys (obsolete with removal of dupsort, but not worth removing)
u_int64_t ndata; // number of key-data pairs (previously leafentry_estimate)
u_int64_t dsize; // total size of leafentries (key length + latest value length, summed per leafentry)
BOOL exact; // are the estimates exact?
};
// All-zero estimates flagged as exact; the positional initializer follows the
// field order nkeys, ndata, dsize, exact.
static struct subtree_estimates const zero_estimates = {0,0,0,TRUE};
// Assemble a subtree_estimates value from its four components.
// The unused attribute is there so that a translation unit which never
// calls this helper does not get an unused-function warning.
static inline struct subtree_estimates __attribute__((__unused__))
make_subtree_estimates (u_int64_t nkeys, u_int64_t ndata, u_int64_t dsize, BOOL exact) {
    struct subtree_estimates result = {
        .nkeys = nkeys,
        .ndata = ndata,
        .dsize = dsize,
        .exact = exact,
    };
    return result;
}
// Subtract b's counters from a's, one field at a time.  Each counter is
// clamped at zero rather than being allowed to wrap around.
// The exact flag of a is not touched.
static inline void
subtract_estimates (struct subtree_estimates *a, struct subtree_estimates *b) {
    a->nkeys = (a->nkeys < b->nkeys) ? 0 : a->nkeys - b->nkeys;
    a->ndata = (a->ndata < b->ndata) ? 0 : a->ndata - b->ndata;
    a->dsize = (a->dsize < b->dsize) ? 0 : a->dsize - b->dsize;
}
// Accumulate b's counters into a, one field at a time.
// The exact flag of a is not touched.
static inline void
add_estimates (struct subtree_estimates *a, struct subtree_estimates *b) {
    a->nkeys = a->nkeys + b->nkeys;
    a->ndata = a->ndata + b->ndata;
    a->dsize = a->dsize + b->dsize;
}
// //
// Field in brtnode_fetch_extra that tells the // Field in brtnode_fetch_extra that tells the
// partial fetch callback what piece of the node // partial fetch callback what piece of the node
...@@ -198,8 +168,6 @@ struct __attribute__((__packed__)) brtnode_partition { ...@@ -198,8 +168,6 @@ struct __attribute__((__packed__)) brtnode_partition {
// for leaf nodes, they are meaningless // for leaf nodes, they are meaningless
BLOCKNUM blocknum; // blocknum of child BLOCKNUM blocknum; // blocknum of child
//estimates for a child, for leaf nodes, are estimates of basement nodes
struct subtree_estimates subtree_estimates;
// //
// at any time, the partitions may be in one of the following three states (stored in pt_state): // at any time, the partitions may be in one of the following three states (stored in pt_state):
// PT_INVALID - means that the partition was just initialized // PT_INVALID - means that the partition was just initialized
...@@ -268,7 +236,6 @@ struct brtnode { ...@@ -268,7 +236,6 @@ struct brtnode {
#define BP_STATE(node,i) ((node)->bp[i].state) #define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_START(node,i) ((node)->bp[i].start) #define BP_START(node,i) ((node)->bp[i].start)
#define BP_SIZE(node,i) ((node)->bp[i].size) #define BP_SIZE(node,i) ((node)->bp[i].size)
#define BP_SUBTREE_EST(node,i) ((node)->bp[i].subtree_estimates)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone) #define BP_WORKDONE(node, i)((node)->bp[i].workdone)
// //
...@@ -780,7 +747,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -780,7 +747,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node
void void
brt_leaf_apply_cmd_once ( brt_leaf_apply_cmd_once (
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
const BRT_MSG cmd, const BRT_MSG cmd,
u_int32_t idx, u_int32_t idx,
LEAFENTRY le, LEAFENTRY le,
...@@ -795,7 +761,6 @@ brt_leaf_put_cmd ( ...@@ -795,7 +761,6 @@ brt_leaf_put_cmd (
brt_update_func update_fun, brt_update_func update_fun,
DESCRIPTOR desc, DESCRIPTOR desc,
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
BRT_MSG cmd, BRT_MSG cmd,
bool* made_change, bool* made_change,
uint64_t *workdone, uint64_t *workdone,
......
...@@ -381,7 +381,6 @@ serialize_brtnode_info_size(BRTNODE node) ...@@ -381,7 +381,6 @@ serialize_brtnode_info_size(BRTNODE node)
retval += 4; // flags retval += 4; // flags
retval += 4; // height; retval += 4; // height;
retval += 4; // optimized_for_upgrade retval += 4; // optimized_for_upgrade
retval += (3*8+1)*node->n_children; // subtree estimates for each child
retval += node->totalchildkeylens; // total length of pivots retval += node->totalchildkeylens; // total length of pivots
retval += (node->n_children-1)*4; // encode length of each pivot retval += (node->n_children-1)*4; // encode length of each pivot
if (node->height > 0) { if (node->height > 0) {
...@@ -407,13 +406,6 @@ static void serialize_brtnode_info(BRTNODE node, ...@@ -407,13 +406,6 @@ static void serialize_brtnode_info(BRTNODE node,
wbuf_nocrc_uint(&wb, node->flags); wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height); wbuf_nocrc_int (&wb, node->height);
wbuf_nocrc_int (&wb, node->optimized_for_upgrade); wbuf_nocrc_int (&wb, node->optimized_for_upgrade);
// subtree estimates of each child
for (int i = 0; i < node->n_children; i++) {
wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).nkeys);
wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).ndata);
wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).dsize);
wbuf_nocrc_char (&wb, (char)BP_SUBTREE_EST(node,i).exact);
}
// pivot information // pivot information
for (int i = 0; i < node->n_children-1; i++) { for (int i = 0; i < node->n_children-1; i++) {
wbuf_nocrc_bytes(&wb, kv_pair_key(node->childkeys[i]), toku_brt_pivot_key_len(node->childkeys[i])); wbuf_nocrc_bytes(&wb, kv_pair_key(node->childkeys[i]), toku_brt_pivot_key_len(node->childkeys[i]));
...@@ -599,9 +591,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize) ...@@ -599,9 +591,6 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
} }
node->max_msn_applied_to_node_on_disk = max_msn; node->max_msn_applied_to_node_on_disk = max_msn;
// now the subtree estimates
toku_brt_leaf_reset_calc_leaf_stats(node);
toku_free(array); toku_free(array);
toku_free(new_pivots); toku_free(new_pivots);
} }
...@@ -1132,14 +1121,6 @@ deserialize_brtnode_info( ...@@ -1132,14 +1121,6 @@ deserialize_brtnode_info(
// n_children is now in the header, and the allocation of the node->bp is in deserialize_brtnode_from_rbuf. // n_children is now in the header, and the allocation of the node->bp is in deserialize_brtnode_from_rbuf.
assert(node->bp!=NULL); // assert(node->bp!=NULL); //
for (int i=0; i < node->n_children; i++) {
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
curr_se->nkeys = rbuf_ulonglong(&rb);
curr_se->ndata = rbuf_ulonglong(&rb);
curr_se->dsize = rbuf_ulonglong(&rb);
curr_se->exact = (BOOL) (rbuf_char(&rb) != 0);
}
// now the pivots // now the pivots
node->totalchildkeylens = 0; node->totalchildkeylens = 0;
if (node->n_children > 1) { if (node->n_children > 1) {
...@@ -1630,9 +1611,6 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode ...@@ -1630,9 +1611,6 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
read_and_decompress_sub_block(&rb, &curr_sb); read_and_decompress_sub_block(&rb, &curr_sb);
// at this point, sb->uncompressed_ptr stores the serialized node partition // at this point, sb->uncompressed_ptr stores the serialized node partition
deserialize_brtnode_partition(&curr_sb, node, childnum, &bfe->h->descriptor, bfe->h->compare_fun); deserialize_brtnode_partition(&curr_sb, node, childnum, &bfe->h->descriptor, bfe->h->compare_fun);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
toku_free(curr_sb.uncompressed_ptr); toku_free(curr_sb.uncompressed_ptr);
toku_free(raw_block); toku_free(raw_block);
} }
...@@ -1657,9 +1635,6 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum, ...@@ -1657,9 +1635,6 @@ toku_deserialize_bp_from_compressed(BRTNODE node, int childnum,
curr_sb->compressed_size curr_sb->compressed_size
); );
deserialize_brtnode_partition(curr_sb, node, childnum, desc, cmp); deserialize_brtnode_partition(curr_sb, node, childnum, desc, cmp);
if (node->height == 0) {
toku_brt_bn_reset_stats(node, childnum);
}
toku_free(curr_sb->uncompressed_ptr); toku_free(curr_sb->uncompressed_ptr);
toku_free(curr_sb->compressed_ptr); toku_free(curr_sb->compressed_ptr);
toku_free(curr_sb); toku_free(curr_sb);
......
...@@ -417,9 +417,6 @@ static void checkpoint_nodes(struct brt_header* h, u_int32_t num_dependent_nodes ...@@ -417,9 +417,6 @@ static void checkpoint_nodes(struct brt_header* h, u_int32_t num_dependent_nodes
); );
} }
// Forward declaration; the definition appears further down in this file.
static inline void
toku_verify_estimates (BRT t, BRTNODE node, ANCESTORS, struct pivot_bounds const * const bounds);
static void toku_unpin_brtnode_off_client_thread (struct brt_header* h, BRTNODE node) static void toku_unpin_brtnode_off_client_thread (struct brt_header* h, BRTNODE node)
// Effect: Unpin a brt node. // Effect: Unpin a brt node.
{ {
...@@ -442,49 +439,6 @@ void toku_unpin_brtnode (BRT brt, BRTNODE node) ...@@ -442,49 +439,6 @@ void toku_unpin_brtnode (BRT brt, BRTNODE node)
toku_unpin_brtnode_off_client_thread(brt->h, node); toku_unpin_brtnode_off_client_thread(brt->h, node);
} }
// State handed to fill_leafnode_estimates through its void* argument.
struct fill_leafnode_estimates_state {
SUBTREE_EST e; // estimates being accumulated into
};
// OMT iteration callback: fold a single leafentry into the running estimates.
//   val  the leafentry being visited
//   idx  position in the OMT (unused)
//   vs   points at a struct fill_leafnode_estimates_state
// Adds the entry's key length plus latest value length to dsize and counts
// the entry once in both ndata and nkeys.
static int
fill_leafnode_estimates (OMTVALUE val, u_int32_t UU(idx), void *vs)
{
    struct fill_leafnode_estimates_state *state = vs;
    LEAFENTRY entry = val;
    SUBTREE_EST totals = state->e;

    totals->dsize += le_keylen(entry) + le_latest_vallen(entry);
    totals->nkeys++;
    totals->ndata++;
    return 0; // must return 0 to work with an omt_iterator
}
// Compute estimates for one OMT of leafentries by starting from
// zero_estimates and visiting every entry with fill_leafnode_estimates.
static struct subtree_estimates
calc_leaf_stats (OMT buffer) {
    struct subtree_estimates totals = zero_estimates;
    struct fill_leafnode_estimates_state state = { .e = &totals };
    toku_omt_iterate(buffer, fill_leafnode_estimates, &state);
    return totals;
}
// Recompute the stored estimates of partition childnum of node from the
// contents of its basement-node buffer.
void
toku_brt_bn_reset_stats(BRTNODE node, int childnum)
{
    // The basement node may have been evicted; the stats can only be
    // recomputed when it is fully in memory, so leave them alone otherwise.
    // TODO: (Zardosht) for row cache, figure out a better way to do this
    if (BP_STATE(node,childnum) != PT_AVAIL) {
        return;
    }
    BP_SUBTREE_EST(node,childnum) = calc_leaf_stats(BLB_BUFFER(node, childnum));
}
// Recompute the estimates of every partition of a leaf node.
// node must be a leaf (height 0).
void
toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
    invariant(node->height==0);
    for (int childnum = 0; childnum < node->n_children; childnum++) {
        toku_brt_bn_reset_stats(node, childnum);
    }
}
// TODO: (Zardosht) look into this and possibly fix and use // TODO: (Zardosht) look into this and possibly fix and use
static void __attribute__((__unused__)) static void __attribute__((__unused__))
brt_leaf_check_leaf_stats (BRTNODE node) brt_leaf_check_leaf_stats (BRTNODE node)
...@@ -506,40 +460,6 @@ toku_bnc_n_entries(NONLEAF_CHILDINFO bnc) ...@@ -506,40 +460,6 @@ toku_bnc_n_entries(NONLEAF_CHILDINFO bnc)
return toku_fifo_n_entries(bnc->buffer); return toku_fifo_n_entries(bnc->buffer);
} }
// Effect: Sum the per-partition estimates held in CHILD and store the total
//  in NODE's slot for that child.  (This should be done incrementally in most cases.)
// Parameters:
//   node              The node to modify.
//   childnum_of_node  Which child changed.  (PERFORMANCE: later we could compute this incrementally.)
//   child             The child that changed.
//   dirty_it          If true, mark the node dirty.  (We don't want to do this when updating
//                     an in-memory leaf; only force dirty when messages are being pushed down.)
// The stored total is exact only when every partition estimate of child is exact
// and, for a nonleaf child, every partition is PT_AVAIL with no entries in its buffer.
static void
fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL dirty_it)
{
    struct subtree_estimates sum = zero_estimates;
    sum.exact = TRUE;

    for (int part = 0; part < child->n_children; part++) {
        SUBTREE_EST part_se = &BP_SUBTREE_EST(child, part);

        sum.nkeys += part_se->nkeys;
        sum.ndata += part_se->ndata;
        sum.dsize += part_se->dsize;

        if (!part_se->exact) {
            sum.exact = FALSE;
        }
        // For a nonleaf child, a partition that is not in memory or that still
        // has buffered entries makes the total inexact.  The buffer is only
        // inspected when the partition is available.
        if (child->height > 0 &&
            (BP_STATE(child, part) != PT_AVAIL ||
             toku_bnc_n_entries(BNC(child, part)) != 0))
        {
            sum.exact = FALSE;
        }
    }

    // We only call this function if we have reason to believe that the child changed.
    BP_SUBTREE_EST(node, childnum_of_node) = sum;
    if (dirty_it) {
        node->dirty = 1;
    }
}
static struct kv_pair const *prepivotkey (BRTNODE node, int childnum, struct kv_pair const * const lower_bound_exclusive) { static struct kv_pair const *prepivotkey (BRTNODE node, int childnum, struct kv_pair const * const lower_bound_exclusive) {
if (childnum==0) if (childnum==0)
return lower_bound_exclusive; return lower_bound_exclusive;
...@@ -561,36 +481,6 @@ static struct pivot_bounds next_pivot_keys (BRTNODE node, int childnum, struct p ...@@ -561,36 +481,6 @@ static struct pivot_bounds next_pivot_keys (BRTNODE node, int childnum, struct p
return pb; return pb;
} }
// Check, for every in-memory partition of node, that the stored ndata estimate
// matches what is found one level down:
//   - nonleaf node: the sum of ndata over the partitions of the (pinned) child node;
//   - leaf node:    the number of entries in the partition's OMT buffer.
// A mismatch trips an assert.  Partitions that are not PT_AVAIL are skipped.
static inline void
toku_verify_estimates (BRT t, BRTNODE node, ANCESTORS ancestors, struct pivot_bounds const * const bounds) {
    for (int childnum = 0; childnum < node->n_children; childnum++) {
        // can only check the state of available partitions
        if (BP_STATE(node, childnum) != PT_AVAIL) {
            continue;
        }

        // we'll just do this estimate
        u_int64_t child_estimate = 0;
        if (node->height > 0) {
            struct ancestors next_ancestors = {node, childnum, ancestors};
            const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
            BLOCKNUM childblocknum = BP_BLOCKNUM(node, childnum);
            u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum);
            BRTNODE childnode;
            struct brtnode_fetch_extra bfe;
            fill_bfe_for_full_read(&bfe, t->h);
            toku_pin_brtnode_holding_lock(t, childblocknum, fullhash, &next_ancestors, &next_bounds, &bfe, TRUE, &childnode);

            int grandchild = 0;
            while (grandchild < childnode->n_children) {
                child_estimate += BP_SUBTREE_EST(childnode, grandchild).ndata;
                grandchild++;
            }
            toku_unpin_brtnode(t, childnode);
        } else {
            child_estimate = toku_omt_size(BLB_BUFFER(node, childnum));
        }
        assert(BP_SUBTREE_EST(node,childnum).ndata==child_estimate);
    }
}
static LEAFENTRY static LEAFENTRY
fetch_from_buf (OMT omt, u_int32_t idx) { fetch_from_buf (OMT omt, u_int32_t idx) {
OMTVALUE v = 0; OMTVALUE v = 0;
...@@ -1293,7 +1183,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num ...@@ -1293,7 +1183,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
BP_STATE(n,i) = PT_INVALID; BP_STATE(n,i) = PT_INVALID;
BP_START(n,i) = 0; BP_START(n,i) = 0;
BP_SIZE (n,i) = 0; BP_SIZE (n,i) = 0;
BP_SUBTREE_EST(n,i) = zero_estimates;
BP_WORKDONE(n,i) = 0; BP_WORKDONE(n,i) = 0;
BP_INIT_TOUCHED_CLOCK(n, i); BP_INIT_TOUCHED_CLOCK(n, i);
set_BNULL(n,i); set_BNULL(n,i);
...@@ -1327,8 +1216,6 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r ...@@ -1327,8 +1216,6 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
newroot->totalchildkeylens=splitk.size; newroot->totalchildkeylens=splitk.size;
BP_BLOCKNUM(newroot,0)=nodea->thisnodename; BP_BLOCKNUM(newroot,0)=nodea->thisnodename;
BP_BLOCKNUM(newroot,1)=nodeb->thisnodename; BP_BLOCKNUM(newroot,1)=nodeb->thisnodename;
fixup_child_estimates(newroot, 0, nodea, TRUE);
fixup_child_estimates(newroot, 1, nodeb, TRUE);
{ {
MSN msna = nodea->max_msn_applied_to_node_on_disk; MSN msna = nodea->max_msn_applied_to_node_on_disk;
MSN msnb = nodeb->max_msn_applied_to_node_on_disk; MSN msnb = nodeb->max_msn_applied_to_node_on_disk;
...@@ -1459,7 +1346,6 @@ init_childinfo(BRTNODE node, int childnum, BRTNODE child) { ...@@ -1459,7 +1346,6 @@ init_childinfo(BRTNODE node, int childnum, BRTNODE child) {
BP_STATE(node,childnum) = PT_AVAIL; BP_STATE(node,childnum) = PT_AVAIL;
BP_START(node,childnum) = 0; BP_START(node,childnum) = 0;
BP_SIZE (node,childnum) = 0; BP_SIZE (node,childnum) = 0;
BP_SUBTREE_EST(node,childnum) = zero_estimates;
BP_WORKDONE(node, childnum) = 0; BP_WORKDONE(node, childnum) = 0;
set_BNC(node, childnum, toku_create_empty_nl()); set_BNC(node, childnum, toku_create_empty_nl());
} }
...@@ -1568,7 +1454,6 @@ move_leafentries( ...@@ -1568,7 +1454,6 @@ move_leafentries(
OMT src_omt, OMT src_omt,
u_int32_t lbi, //lower bound inclusive u_int32_t lbi, //lower bound inclusive
u_int32_t ube, //upper bound exclusive u_int32_t ube, //upper bound exclusive
SUBTREE_EST se_diff,
u_int32_t* num_bytes_moved u_int32_t* num_bytes_moved
) )
//Effect: move leafentries in the range [lbi, ube) from src_omt to newly created dest_omt //Effect: move leafentries in the range [lbi, ube) from src_omt to newly created dest_omt
...@@ -1581,10 +1466,6 @@ move_leafentries( ...@@ -1581,10 +1466,6 @@ move_leafentries(
LEAFENTRY curr_le = NULL; LEAFENTRY curr_le = NULL;
curr_le = fetch_from_buf(src_omt, i); curr_le = fetch_from_buf(src_omt, i);
se_diff->nkeys++;
se_diff->ndata++;
se_diff->dsize += le_keylen(curr_le) + le_latest_vallen(curr_le);
*num_bytes_moved += leafentry_disksize(curr_le); *num_bytes_moved += leafentry_disksize(curr_le);
new_le[i-lbi] = curr_le; new_le[i-lbi] = curr_le;
} }
...@@ -1711,7 +1592,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -1711,7 +1592,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node
// handle the move of a subset of data in split_node from node to B // handle the move of a subset of data in split_node from node to B
if (!split_on_boundary) { if (!split_on_boundary) {
BP_STATE(B,curr_dest_bn_index) = PT_AVAIL; BP_STATE(B,curr_dest_bn_index) = PT_AVAIL;
struct subtree_estimates se_diff = zero_estimates;
u_int32_t diff_size = 0; u_int32_t diff_size = 0;
destroy_basement_node (BLB(B, curr_dest_bn_index)); // Destroy B's empty OMT, so I can rebuild it from an array destroy_basement_node (BLB(B, curr_dest_bn_index)); // Destroy B's empty OMT, so I can rebuild it from an array
set_BNULL(B, curr_dest_bn_index); set_BNULL(B, curr_dest_bn_index);
...@@ -1721,13 +1601,10 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -1721,13 +1601,10 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node
BLB_BUFFER(node, curr_src_bn_index), BLB_BUFFER(node, curr_src_bn_index),
split_at_in_node+1, split_at_in_node+1,
toku_omt_size(BLB_BUFFER(node, curr_src_bn_index)), toku_omt_size(BLB_BUFFER(node, curr_src_bn_index)),
&se_diff,
&diff_size &diff_size
); );
BLB_NBYTESINBUF(node, curr_src_bn_index) -= diff_size; BLB_NBYTESINBUF(node, curr_src_bn_index) -= diff_size;
BLB_NBYTESINBUF(B, curr_dest_bn_index) += diff_size; BLB_NBYTESINBUF(B, curr_dest_bn_index) += diff_size;
subtract_estimates(&BP_SUBTREE_EST(node,curr_src_bn_index), &se_diff);
add_estimates(&BP_SUBTREE_EST(B,curr_dest_bn_index), &se_diff);
curr_dest_bn_index++; curr_dest_bn_index++;
} }
curr_src_bn_index++; curr_src_bn_index++;
...@@ -1763,10 +1640,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -1763,10 +1640,6 @@ brtleaf_split (struct brt_header* h, BRTNODE node, BRTNODE *nodea, BRTNODE *node
REALLOC_N(num_children_in_node, node->bp); REALLOC_N(num_children_in_node, node->bp);
REALLOC_N(num_children_in_node-1, node->childkeys); REALLOC_N(num_children_in_node-1, node->childkeys);
// this may be unnecessary. Not sure
// but it is safe to do. Splits are infrequent
toku_brt_leaf_reset_calc_leaf_stats(node);
toku_brt_leaf_reset_calc_leaf_stats(B);
} }
if (splitk) { if (splitk) {
memset(splitk, 0, sizeof *splitk); memset(splitk, 0, sizeof *splitk);
...@@ -1919,13 +1792,10 @@ handle_split_of_child (BRTNODE node, int childnum, ...@@ -1919,13 +1792,10 @@ handle_split_of_child (BRTNODE node, int childnum,
assert(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child assert(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
BP_BLOCKNUM(node, childnum+1) = childb->thisnodename; BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
BP_SUBTREE_EST(node,childnum+1) = zero_estimates;
BP_WORKDONE(node, childnum+1) = 0; BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL; BP_STATE(node,childnum+1) = PT_AVAIL;
BP_START(node,childnum+1) = 0; BP_START(node,childnum+1) = 0;
BP_SIZE(node,childnum+1) = 0; BP_SIZE(node,childnum+1) = 0;
fixup_child_estimates(node, childnum, childa, TRUE);
fixup_child_estimates(node, childnum+1, childb, TRUE);
set_BNC(node, childnum+1, toku_create_empty_nl()); set_BNC(node, childnum+1, toku_create_empty_nl());
...@@ -2042,17 +1912,9 @@ brt_split_child (struct brt_header* h, BRTNODE node, int childnum, BRTNODE child ...@@ -2042,17 +1912,9 @@ brt_split_child (struct brt_header* h, BRTNODE node, int childnum, BRTNODE child
} }
} }
// Adjust a->nkeys by direction; a negative direction decreases the count.
// The estimates are required to be exact, which is asserted after the update.
static void
bump_nkeys (SUBTREE_EST a, int direction) {
    a->nkeys += direction;
    assert(a->exact);
}
static void static void
brt_leaf_delete_leafentry ( brt_leaf_delete_leafentry (
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
u_int32_t idx, u_int32_t idx,
LEAFENTRY le LEAFENTRY le
) )
...@@ -2061,7 +1923,6 @@ brt_leaf_delete_leafentry ( ...@@ -2061,7 +1923,6 @@ brt_leaf_delete_leafentry (
// le is the leafentry to be deleted // le is the leafentry to be deleted
{ {
// Figure out if one of the other keys is the same key // Figure out if one of the other keys is the same key
bump_nkeys(se, -1);
{ {
int r = toku_omt_delete_at(bn->buffer, idx); int r = toku_omt_delete_at(bn->buffer, idx);
...@@ -2070,19 +1931,11 @@ brt_leaf_delete_leafentry ( ...@@ -2070,19 +1931,11 @@ brt_leaf_delete_leafentry (
bn->n_bytes_in_buffer -= leafentry_disksize(le); bn->n_bytes_in_buffer -= leafentry_disksize(le);
{
u_int32_t oldlen = le_latest_vallen(le) + le_keylen(le);
assert(se->dsize >= oldlen);
se->dsize -= oldlen;
}
assert(se->dsize < (1U<<31)); // make sure we didn't underflow
se->ndata --;
} }
void void
brt_leaf_apply_cmd_once ( brt_leaf_apply_cmd_once (
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
const BRT_MSG cmd, const BRT_MSG cmd,
u_int32_t idx, u_int32_t idx,
LEAFENTRY le, LEAFENTRY le,
...@@ -2114,14 +1967,6 @@ brt_leaf_apply_cmd_once ( ...@@ -2114,14 +1967,6 @@ brt_leaf_apply_cmd_once (
if (le && new_le) { if (le && new_le) {
// If we are replacing a leafentry, then the counts on the estimates remain unchanged, but the size might change // If we are replacing a leafentry, then the counts on the estimates remain unchanged, but the size might change
{
u_int32_t oldlen = le_keylen(le) + le_latest_vallen(le);
assert(se->dsize >= oldlen);
assert(se->dsize < (1U<<31)); // make sure we didn't underflow
se->dsize -= oldlen;
se->dsize += le_keylen(new_le) + le_latest_vallen(new_le); // add it in two pieces to avoid ugly overflow
assert(se->dsize < (1U<<31)); // make sure we didn't underflow
}
bn->n_bytes_in_buffer -= leafentry_disksize(le); bn->n_bytes_in_buffer -= leafentry_disksize(le);
...@@ -2137,7 +1982,7 @@ brt_leaf_apply_cmd_once ( ...@@ -2137,7 +1982,7 @@ brt_leaf_apply_cmd_once (
} else { } else {
if (le) { if (le) {
brt_leaf_delete_leafentry (bn, se, idx, le); brt_leaf_delete_leafentry (bn, idx, le);
toku_free(le); toku_free(le);
workdone_this_le = oldsize; workdone_this_le = oldsize;
} }
...@@ -2147,10 +1992,6 @@ brt_leaf_apply_cmd_once ( ...@@ -2147,10 +1992,6 @@ brt_leaf_apply_cmd_once (
bn->n_bytes_in_buffer += newdisksize; bn->n_bytes_in_buffer += newdisksize;
se->dsize += le_latest_vallen(new_le) + le_keylen(new_le);
assert(se->dsize < (1U<<31)); // make sure we didn't underflow
se->ndata++;
bump_nkeys(se, 1);
workdone_this_le = newlen; workdone_this_le = newlen;
} }
} }
...@@ -2171,7 +2012,6 @@ struct setval_extra_s { ...@@ -2171,7 +2012,6 @@ struct setval_extra_s {
int setval_r; // any error code that setval_fun wants to return goes here. int setval_r; // any error code that setval_fun wants to return goes here.
// need arguments for brt_leaf_apply_cmd_once // need arguments for brt_leaf_apply_cmd_once
BASEMENTNODE bn; BASEMENTNODE bn;
SUBTREE_EST se;
MSN msn; // captured from original message, not currently used MSN msn; // captured from original message, not currently used
XIDS xids; XIDS xids;
const DBT *key; const DBT *key;
...@@ -2209,7 +2049,7 @@ static void setval_fun (const DBT *new_val, void *svextra_v) { ...@@ -2209,7 +2049,7 @@ static void setval_fun (const DBT *new_val, void *svextra_v) {
toku_init_dbt(&val); toku_init_dbt(&val);
msg.u.id.val = &val; msg.u.id.val = &val;
} }
brt_leaf_apply_cmd_once(svextra->bn, svextra->se, &msg, brt_leaf_apply_cmd_once(svextra->bn, &msg,
svextra->idx, svextra->le, svextra->idx, svextra->le,
svextra->snapshot_txnids, svextra->live_list_reverse, svextra->snapshot_txnids, svextra->live_list_reverse,
svextra->workdone); svextra->workdone);
...@@ -2222,7 +2062,7 @@ static void setval_fun (const DBT *new_val, void *svextra_v) { ...@@ -2222,7 +2062,7 @@ static void setval_fun (const DBT *new_val, void *svextra_v) {
// so capturing the msn in the setval_extra_s is not strictly required. The alternative // so capturing the msn in the setval_extra_s is not strictly required. The alternative
// would be to put a dummy msn in the messages created by setval_fun(), but preserving // would be to put a dummy msn in the messages created by setval_fun(), but preserving
// the original msn seems cleaner and it preserves accountability at a lower layer. // the original msn seems cleaner and it preserves accountability at a lower layer.
static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BASEMENTNODE bn, SUBTREE_EST se, BRT_MSG cmd, int idx, static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BASEMENTNODE bn, BRT_MSG cmd, int idx,
LEAFENTRY le, OMT snapshot_txnids, OMT live_list_reverse, bool* made_change, LEAFENTRY le, OMT snapshot_txnids, OMT live_list_reverse, bool* made_change,
uint64_t * workdone) { uint64_t * workdone) {
LEAFENTRY le_for_update; LEAFENTRY le_for_update;
...@@ -2265,7 +2105,7 @@ static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BASEMENTNODE b ...@@ -2265,7 +2105,7 @@ static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BASEMENTNODE b
le_for_update = NULL; le_for_update = NULL;
} }
struct setval_extra_s setval_extra = {setval_tag, FALSE, 0, bn, se, cmd->msn, cmd->xids, struct setval_extra_s setval_extra = {setval_tag, FALSE, 0, bn, cmd->msn, cmd->xids,
keyp, idx, le_for_update, snapshot_txnids, live_list_reverse, 0, workdone}; keyp, idx, le_for_update, snapshot_txnids, live_list_reverse, 0, workdone};
// call handlerton's brt->update_fun(), which passes setval_extra to setval_fun() // call handlerton's brt->update_fun(), which passes setval_extra to setval_fun()
FAKE_DB(db, desc); FAKE_DB(db, desc);
...@@ -2292,7 +2132,6 @@ brt_leaf_put_cmd ( ...@@ -2292,7 +2132,6 @@ brt_leaf_put_cmd (
brt_update_func update_fun, brt_update_func update_fun,
DESCRIPTOR desc, DESCRIPTOR desc,
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
BRT_MSG cmd, BRT_MSG cmd,
bool* made_change, bool* made_change,
uint64_t *workdone, uint64_t *workdone,
...@@ -2339,7 +2178,7 @@ brt_leaf_put_cmd ( ...@@ -2339,7 +2178,7 @@ brt_leaf_put_cmd (
assert(r==0); assert(r==0);
storeddata=storeddatav; storeddata=storeddatav;
} }
brt_leaf_apply_cmd_once(bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone); brt_leaf_apply_cmd_once(bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
// if the insertion point is within a window of the right edge of // if the insertion point is within a window of the right edge of
// the leaf then it is sequential // the leaf then it is sequential
...@@ -2371,7 +2210,7 @@ brt_leaf_put_cmd ( ...@@ -2371,7 +2210,7 @@ brt_leaf_put_cmd (
while (1) { while (1) {
u_int32_t num_leafentries_before = toku_omt_size(bn->buffer); u_int32_t num_leafentries_before = toku_omt_size(bn->buffer);
brt_leaf_apply_cmd_once(bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone); brt_leaf_apply_cmd_once(bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
*made_change = 1; *made_change = 1;
{ {
...@@ -2421,7 +2260,7 @@ brt_leaf_put_cmd ( ...@@ -2421,7 +2260,7 @@ brt_leaf_put_cmd (
storeddata=storeddatav; storeddata=storeddatav;
int deleted = 0; int deleted = 0;
if (!le_is_clean(storeddata)) { //If already clean, nothing to do. if (!le_is_clean(storeddata)) { //If already clean, nothing to do.
brt_leaf_apply_cmd_once(bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone); brt_leaf_apply_cmd_once(bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
u_int32_t new_omt_size = toku_omt_size(bn->buffer); u_int32_t new_omt_size = toku_omt_size(bn->buffer);
if (new_omt_size != omt_size) { if (new_omt_size != omt_size) {
assert(new_omt_size+1 == omt_size); assert(new_omt_size+1 == omt_size);
...@@ -2448,7 +2287,7 @@ brt_leaf_put_cmd ( ...@@ -2448,7 +2287,7 @@ brt_leaf_put_cmd (
storeddata=storeddatav; storeddata=storeddatav;
int deleted = 0; int deleted = 0;
if (le_has_xids(storeddata, cmd->xids)) { if (le_has_xids(storeddata, cmd->xids)) {
brt_leaf_apply_cmd_once(bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone); brt_leaf_apply_cmd_once(bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
u_int32_t new_omt_size = toku_omt_size(bn->buffer); u_int32_t new_omt_size = toku_omt_size(bn->buffer);
if (new_omt_size != omt_size) { if (new_omt_size != omt_size) {
assert(new_omt_size+1 == omt_size); assert(new_omt_size+1 == omt_size);
...@@ -2470,10 +2309,10 @@ brt_leaf_put_cmd ( ...@@ -2470,10 +2309,10 @@ brt_leaf_put_cmd (
r = toku_omt_find_zero(bn->buffer, toku_cmd_leafval_heaviside, &be, r = toku_omt_find_zero(bn->buffer, toku_cmd_leafval_heaviside, &be,
&storeddatav, &idx); &storeddatav, &idx);
if (r==DB_NOTFOUND) { if (r==DB_NOTFOUND) {
r = do_update(update_fun, desc, bn, se, cmd, idx, NULL, snapshot_txnids, live_list_reverse, made_change, workdone); r = do_update(update_fun, desc, bn, cmd, idx, NULL, snapshot_txnids, live_list_reverse, made_change, workdone);
} else if (r==0) { } else if (r==0) {
storeddata=storeddatav; storeddata=storeddatav;
r = do_update(update_fun, desc, bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone); r = do_update(update_fun, desc, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone);
} // otherwise, a worse error, just return it } // otherwise, a worse error, just return it
break; break;
} }
...@@ -2485,7 +2324,7 @@ brt_leaf_put_cmd ( ...@@ -2485,7 +2324,7 @@ brt_leaf_put_cmd (
r = toku_omt_fetch(bn->buffer, idx, &storeddatav); r = toku_omt_fetch(bn->buffer, idx, &storeddatav);
assert(r==0); assert(r==0);
storeddata=storeddatav; storeddata=storeddatav;
r = do_update(update_fun, desc, bn, se, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone); r = do_update(update_fun, desc, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone);
// TODO(leif): This early return means get_leaf_reactivity() // TODO(leif): This early return means get_leaf_reactivity()
// and VERIFY_NODE() never get called. Is this a problem? // and VERIFY_NODE() never get called. Is this a problem?
assert(r==0); assert(r==0);
...@@ -2796,8 +2635,7 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk) ...@@ -2796,8 +2635,7 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
static void static void
maybe_merge_pinned_leaf_nodes (BRTNODE parent, int childnum_of_parent, maybe_merge_pinned_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair *parent_splitk,
BRTNODE a, BRTNODE b, struct kv_pair *parent_splitk,
BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk) BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk)
// Effect: Either merge a and b into one node (merge them into a) and set *did_merge = TRUE. // Effect: Either merge a and b into one node (merge them into a) and set *did_merge = TRUE.
// (We do this if the resulting node is not fissible) // (We do this if the resulting node is not fissible)
...@@ -2828,12 +2666,10 @@ maybe_merge_pinned_leaf_nodes (BRTNODE parent, int childnum_of_parent, ...@@ -2828,12 +2666,10 @@ maybe_merge_pinned_leaf_nodes (BRTNODE parent, int childnum_of_parent,
toku_free(parent_splitk); // if we are merging, the splitk gets freed. toku_free(parent_splitk); // if we are merging, the splitk gets freed.
merge_leaf_nodes(a, b); merge_leaf_nodes(a, b);
} }
fixup_child_estimates(parent, childnum_of_parent, a, TRUE);
fixup_child_estimates(parent, childnum_of_parent+1, b, TRUE);
} }
static void static void
maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair *parent_splitk, maybe_merge_pinned_nonleaf_nodes (struct kv_pair *parent_splitk,
BRTNODE a, BRTNODE b, BRTNODE a, BRTNODE b,
BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk) BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk)
{ {
...@@ -2862,7 +2698,6 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct ...@@ -2862,7 +2698,6 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct
a->dirty = 1; a->dirty = 1;
b->dirty = 1; b->dirty = 1;
fixup_child_estimates(parent, childnum_of_parent, a, TRUE);
*did_merge = TRUE; *did_merge = TRUE;
*did_rebalance = FALSE; *did_rebalance = FALSE;
*splitk = NULL; *splitk = NULL;
...@@ -2870,7 +2705,7 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct ...@@ -2870,7 +2705,7 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct
} }
static void static void
maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair *parent_splitk, maybe_merge_pinned_nodes (BRTNODE parent, struct kv_pair *parent_splitk,
BRTNODE a, BRTNODE b, BRTNODE a, BRTNODE b,
BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk) BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk)
// Effect: either merge a and b into one node (merge them into a) and set *did_merge = TRUE. // Effect: either merge a and b into one node (merge them into a) and set *did_merge = TRUE.
...@@ -2884,7 +2719,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2884,7 +2719,6 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
// Parameters: // Parameters:
// t The BRT. // t The BRT.
// parent The parent of the two nodes to be merged. // parent The parent of the two nodes to be merged.
// childnum_of_parent Which child of the parent is a? (b is the next child.)
// parent_splitk The pivot key between a and b. This is either free()'d or returned in *splitk. // parent_splitk The pivot key between a and b. This is either free()'d or returned in *splitk.
// a The first node to merge. // a The first node to merge.
// b The second node to merge. // b The second node to merge.
...@@ -2907,9 +2741,9 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2907,9 +2741,9 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
} }
} }
if (a->height == 0) { if (a->height == 0) {
maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, did_rebalance, splitk); maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, did_rebalance, splitk);
} else { } else {
maybe_merge_pinned_nonleaf_nodes(parent, childnum_of_parent, parent_splitk, a, b, did_merge, did_rebalance, splitk); maybe_merge_pinned_nonleaf_nodes(parent_splitk, a, b, did_merge, did_rebalance, splitk);
} }
if (*did_merge || *did_rebalance) { if (*did_merge || *did_rebalance) {
// accurate for leaf nodes because all msgs above have been applied, // accurate for leaf nodes because all msgs above have been applied,
...@@ -2995,7 +2829,7 @@ brt_merge_child (struct brt_header* h, BRTNODE node, int childnum_to_merge, BOOL ...@@ -2995,7 +2829,7 @@ brt_merge_child (struct brt_header* h, BRTNODE node, int childnum_to_merge, BOOL
struct kv_pair *splitk_kvpair = 0; struct kv_pair *splitk_kvpair = 0;
struct kv_pair *old_split_key = node->childkeys[childnuma]; struct kv_pair *old_split_key = node->childkeys[childnuma];
unsigned int deleted_size = toku_brt_pivot_key_len(old_split_key); unsigned int deleted_size = toku_brt_pivot_key_len(old_split_key);
maybe_merge_pinned_nodes(node, childnuma, node->childkeys[childnuma], childa, childb, &did_merge, &did_rebalance, &splitk_kvpair); maybe_merge_pinned_nodes(node, node->childkeys[childnuma], childa, childb, &did_merge, &did_rebalance, &splitk_kvpair);
if (childa->height>0) { int i; for (i=0; i+1<childa->n_children; i++) assert(childa->childkeys[i]); } if (childa->height>0) { int i; for (i=0; i+1<childa->n_children; i++) assert(childa->childkeys[i]); }
//toku_verify_estimates(t,childa); //toku_verify_estimates(t,childa);
// the tree did react if a merge (did_merge) or rebalance (new split key) occurred // the tree did react if a merge (did_merge) or rebalance (new split key) occurred
...@@ -3016,7 +2850,6 @@ brt_merge_child (struct brt_header* h, BRTNODE node, int childnum_to_merge, BOOL ...@@ -3016,7 +2850,6 @@ brt_merge_child (struct brt_header* h, BRTNODE node, int childnum_to_merge, BOOL
&node->childkeys[childnuma+1], &node->childkeys[childnuma+1],
(node->n_children-childnumb)*sizeof(node->childkeys[0])); (node->n_children-childnumb)*sizeof(node->childkeys[0]));
REALLOC_N(node->n_children-1, node->childkeys); REALLOC_N(node->n_children-1, node->childkeys);
fixup_child_estimates(node, childnuma, childa, TRUE);
assert(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b); assert(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
childa->dirty = 1; // just to make sure childa->dirty = 1; // just to make sure
childb->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure
...@@ -3264,10 +3097,6 @@ flush_some_child (struct brt_header* h, BRTNODE parent) ...@@ -3264,10 +3097,6 @@ flush_some_child (struct brt_header* h, BRTNODE parent)
maybe_destroy_child_blbs(parent, child); maybe_destroy_child_blbs(parent, child);
// first thing we do is fixup estimates, they will be a bit out of date, because
// they will not contain result of this flush
fixup_child_estimates(parent, childnum, child, TRUE);
//Note that at this point, we don't have the entire child in. //Note that at this point, we don't have the entire child in.
// Let's do a quick check to see if the child may be reactive // Let's do a quick check to see if the child may be reactive
// If the child cannot be reactive, then we can safely unlock // If the child cannot be reactive, then we can safely unlock
...@@ -3399,7 +3228,6 @@ flush_this_child (struct brt_header* h, BRTNODE node, BRTNODE child, int childnu ...@@ -3399,7 +3228,6 @@ flush_this_child (struct brt_header* h, BRTNODE node, BRTNODE child, int childnu
r = toku_bnc_flush_to_child(h->compare_fun, h->update_fun, &h->descriptor, h->cf, bnc, child); assert_zero(r); r = toku_bnc_flush_to_child(h->compare_fun, h->update_fun, &h->descriptor, h->cf, bnc, child); assert_zero(r);
destroy_nonleaf_childinfo(bnc); destroy_nonleaf_childinfo(bnc);
fixup_child_estimates(node, childnum, child, TRUE);
} }
static void static void
...@@ -3476,7 +3304,6 @@ void toku_apply_cmd_to_leaf( ...@@ -3476,7 +3304,6 @@ void toku_apply_cmd_to_leaf(
update_fun, update_fun,
desc, desc,
BLB(node, childnum), BLB(node, childnum),
&BP_SUBTREE_EST(node, childnum),
cmd, cmd,
made_change, made_change,
workdone, workdone,
...@@ -3499,7 +3326,6 @@ void toku_apply_cmd_to_leaf( ...@@ -3499,7 +3326,6 @@ void toku_apply_cmd_to_leaf(
update_fun, update_fun,
desc, desc,
BLB(node, childnum), BLB(node, childnum),
&BP_SUBTREE_EST(node,childnum),
cmd, cmd,
&bn_made_change, &bn_made_change,
workdone, workdone,
...@@ -5722,7 +5548,7 @@ fifo_offset_msn_cmp(void *extrap, const void *va, const void *vb) ...@@ -5722,7 +5548,7 @@ fifo_offset_msn_cmp(void *extrap, const void *va, const void *vb)
} }
static void static void
do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, int childnum, OMT snapshot_txnids, OMT live_list_reverse, MSN *max_msn_applied, const struct fifo_entry *entry) do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, BRTNODE ancestor, int childnum, OMT snapshot_txnids, OMT live_list_reverse, MSN *max_msn_applied, const struct fifo_entry *entry)
{ {
ITEMLEN keylen = entry->keylen; ITEMLEN keylen = entry->keylen;
ITEMLEN vallen = entry->vallen; ITEMLEN vallen = entry->vallen;
...@@ -5744,7 +5570,7 @@ do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, in ...@@ -5744,7 +5570,7 @@ do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, in
if (brtcmd.msn.msn > max_msn_applied->msn) { if (brtcmd.msn.msn > max_msn_applied->msn) {
*max_msn_applied = brtcmd.msn; *max_msn_applied = brtcmd.msn;
} }
brt_leaf_put_cmd(t->compare_fun, t->update_fun, &t->h->descriptor, bn, se, &brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum), snapshot_txnids, live_list_reverse); brt_leaf_put_cmd(t->compare_fun, t->update_fun, &t->h->descriptor, bn, &brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum), snapshot_txnids, live_list_reverse);
} else { } else {
add_to_brt_status(&brt_status.msn_discards,1); add_to_brt_status(&brt_status.msn_discards,1);
} }
...@@ -5753,7 +5579,6 @@ do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, in ...@@ -5753,7 +5579,6 @@ do_brt_leaf_put_cmd(BRT t, BASEMENTNODE bn, SUBTREE_EST se, BRTNODE ancestor, in
struct iterate_do_brt_leaf_put_cmd_extra { struct iterate_do_brt_leaf_put_cmd_extra {
BRT t; BRT t;
BASEMENTNODE bn; BASEMENTNODE bn;
SUBTREE_EST se;
BRTNODE ancestor; BRTNODE ancestor;
int childnum; int childnum;
OMT snapshot_txnids; OMT snapshot_txnids;
...@@ -5768,7 +5593,7 @@ iterate_do_brt_leaf_put_cmd(OMTVALUE v, u_int32_t UU(idx), void *extrap) ...@@ -5768,7 +5593,7 @@ iterate_do_brt_leaf_put_cmd(OMTVALUE v, u_int32_t UU(idx), void *extrap)
const long offset = (long) v; const long offset = (long) v;
NONLEAF_CHILDINFO bnc = BNC(e->ancestor, e->childnum); NONLEAF_CHILDINFO bnc = BNC(e->ancestor, e->childnum);
const struct fifo_entry *entry = toku_fifo_get_entry(bnc->buffer, offset); const struct fifo_entry *entry = toku_fifo_get_entry(bnc->buffer, offset);
do_brt_leaf_put_cmd(e->t, e->bn, e->se, e->ancestor, e->childnum, e->snapshot_txnids, e->live_list_reverse, e->max_msn_applied, entry); do_brt_leaf_put_cmd(e->t, e->bn, e->ancestor, e->childnum, e->snapshot_txnids, e->live_list_reverse, e->max_msn_applied, entry);
return 0; return 0;
} }
...@@ -5861,7 +5686,6 @@ static int ...@@ -5861,7 +5686,6 @@ static int
bnc_apply_messages_to_basement_node( bnc_apply_messages_to_basement_node(
BRT t, BRT t,
BASEMENTNODE bn, BASEMENTNODE bn,
SUBTREE_EST se,
BRTNODE ancestor, BRTNODE ancestor,
int childnum, int childnum,
struct pivot_bounds const * const bounds struct pivot_bounds const * const bounds
...@@ -5902,16 +5726,16 @@ bnc_apply_messages_to_basement_node( ...@@ -5902,16 +5726,16 @@ bnc_apply_messages_to_basement_node(
r = mergesort_r(offsets, buffer_size, sizeof offsets[0], bnc->buffer, fifo_offset_msn_cmp); assert_zero(r); r = mergesort_r(offsets, buffer_size, sizeof offsets[0], bnc->buffer, fifo_offset_msn_cmp); assert_zero(r);
for (int i = 0; i < buffer_size; ++i) { for (int i = 0; i < buffer_size; ++i) {
const struct fifo_entry *entry = toku_fifo_get_entry(bnc->buffer, offsets[i]); const struct fifo_entry *entry = toku_fifo_get_entry(bnc->buffer, offsets[i]);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, entry); do_brt_leaf_put_cmd(t, bn, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, entry);
} }
toku_free(offsets); toku_free(offsets);
} else if (stale_lbi == stale_ube) { } else if (stale_lbi == stale_ube) {
struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .se = se, .ancestor = ancestor, .childnum = childnum, .snapshot_txnids = snapshot_txnids, .live_list_reverse = live_list_reverse, .max_msn_applied = &max_msn_applied }; struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .ancestor = ancestor, .childnum = childnum, .snapshot_txnids = snapshot_txnids, .live_list_reverse = live_list_reverse, .max_msn_applied = &max_msn_applied };
struct iterate_do_brt_leaf_put_cmd_and_move_to_stale_extra iter_amts_extra = { .brt = t, .iter_extra = &iter_extra, .bnc = bnc }; struct iterate_do_brt_leaf_put_cmd_and_move_to_stale_extra iter_amts_extra = { .brt = t, .iter_extra = &iter_extra, .bnc = bnc };
r = toku_omt_iterate_on_range(bnc->fresh_message_tree, fresh_lbi, fresh_ube, iterate_do_brt_leaf_put_cmd_and_move_to_stale, &iter_amts_extra); assert_zero(r); r = toku_omt_iterate_on_range(bnc->fresh_message_tree, fresh_lbi, fresh_ube, iterate_do_brt_leaf_put_cmd_and_move_to_stale, &iter_amts_extra); assert_zero(r);
} else if (fresh_lbi == fresh_ube) { } else if (fresh_lbi == fresh_ube) {
struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .se = se, .ancestor = ancestor, .childnum = childnum, .snapshot_txnids = snapshot_txnids, .live_list_reverse = live_list_reverse, .max_msn_applied = &max_msn_applied }; struct iterate_do_brt_leaf_put_cmd_extra iter_extra = { .t = t, .bn = bn, .ancestor = ancestor, .childnum = childnum, .snapshot_txnids = snapshot_txnids, .live_list_reverse = live_list_reverse, .max_msn_applied = &max_msn_applied };
r = toku_omt_iterate_on_range(bnc->stale_message_tree, stale_lbi, stale_ube, iterate_do_brt_leaf_put_cmd, &iter_extra); assert_zero(r); r = toku_omt_iterate_on_range(bnc->stale_message_tree, stale_lbi, stale_ube, iterate_do_brt_leaf_put_cmd, &iter_extra); assert_zero(r);
} else { } else {
long *XMALLOC_N(fresh_ube - fresh_lbi, fresh_offsets_to_move); long *XMALLOC_N(fresh_ube - fresh_lbi, fresh_offsets_to_move);
...@@ -5926,7 +5750,7 @@ bnc_apply_messages_to_basement_node( ...@@ -5926,7 +5750,7 @@ bnc_apply_messages_to_basement_node(
int c = toku_fifo_entry_key_msn_cmp(&extra, &stale_offset, &fresh_offset); int c = toku_fifo_entry_key_msn_cmp(&extra, &stale_offset, &fresh_offset);
if (c < 0) { if (c < 0) {
const struct fifo_entry *stale_entry = toku_fifo_get_entry(bnc->buffer, stale_offset); const struct fifo_entry *stale_entry = toku_fifo_get_entry(bnc->buffer, stale_offset);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, stale_entry); do_brt_leaf_put_cmd(t, bn, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, stale_entry);
stale_i++; stale_i++;
if (stale_i != stale_ube) { if (stale_i != stale_ube) {
r = toku_omt_fetch(bnc->stale_message_tree, stale_i, &stale_v); assert_zero(r); r = toku_omt_fetch(bnc->stale_message_tree, stale_i, &stale_v); assert_zero(r);
...@@ -5934,7 +5758,7 @@ bnc_apply_messages_to_basement_node( ...@@ -5934,7 +5758,7 @@ bnc_apply_messages_to_basement_node(
} else if (c > 0) { } else if (c > 0) {
fresh_offsets_to_move[fresh_i - fresh_lbi] = fresh_offset; fresh_offsets_to_move[fresh_i - fresh_lbi] = fresh_offset;
const struct fifo_entry *fresh_entry = toku_fifo_get_entry(bnc->buffer, fresh_offset); const struct fifo_entry *fresh_entry = toku_fifo_get_entry(bnc->buffer, fresh_offset);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, fresh_entry); do_brt_leaf_put_cmd(t, bn, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, fresh_entry);
fresh_i++; fresh_i++;
if (fresh_i != fresh_ube) { if (fresh_i != fresh_ube) {
r = toku_omt_fetch(bnc->fresh_message_tree, fresh_i, &fresh_v); assert_zero(r); r = toku_omt_fetch(bnc->fresh_message_tree, fresh_i, &fresh_v); assert_zero(r);
...@@ -5947,7 +5771,7 @@ bnc_apply_messages_to_basement_node( ...@@ -5947,7 +5771,7 @@ bnc_apply_messages_to_basement_node(
while (stale_i < stale_ube) { while (stale_i < stale_ube) {
const long stale_offset = (long) stale_v; const long stale_offset = (long) stale_v;
const struct fifo_entry *stale_entry = toku_fifo_get_entry(bnc->buffer, stale_offset); const struct fifo_entry *stale_entry = toku_fifo_get_entry(bnc->buffer, stale_offset);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, stale_entry); do_brt_leaf_put_cmd(t, bn, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, stale_entry);
stale_i++; stale_i++;
if (stale_i != stale_ube) { if (stale_i != stale_ube) {
r = toku_omt_fetch(bnc->stale_message_tree, stale_i, &stale_v); assert_zero(r); r = toku_omt_fetch(bnc->stale_message_tree, stale_i, &stale_v); assert_zero(r);
...@@ -5957,7 +5781,7 @@ bnc_apply_messages_to_basement_node( ...@@ -5957,7 +5781,7 @@ bnc_apply_messages_to_basement_node(
const long fresh_offset = (long) fresh_v; const long fresh_offset = (long) fresh_v;
fresh_offsets_to_move[fresh_i - fresh_lbi] = fresh_offset; fresh_offsets_to_move[fresh_i - fresh_lbi] = fresh_offset;
const struct fifo_entry *fresh_entry = toku_fifo_get_entry(bnc->buffer, fresh_offset); const struct fifo_entry *fresh_entry = toku_fifo_get_entry(bnc->buffer, fresh_offset);
do_brt_leaf_put_cmd(t, bn, se, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, fresh_entry); do_brt_leaf_put_cmd(t, bn, ancestor, childnum, snapshot_txnids, live_list_reverse, &max_msn_applied, fresh_entry);
fresh_i++; fresh_i++;
if (fresh_i != fresh_ube) { if (fresh_i != fresh_ube) {
r = toku_omt_fetch(bnc->fresh_message_tree, fresh_i, &fresh_v); assert_zero(r); r = toku_omt_fetch(bnc->fresh_message_tree, fresh_i, &fresh_v); assert_zero(r);
...@@ -5997,7 +5821,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -5997,7 +5821,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
int height = 0; int height = 0;
if (BP_STATE(node, i) != PT_AVAIL) { continue; } if (BP_STATE(node, i) != PT_AVAIL) { continue; }
BASEMENTNODE curr_bn = BLB(node, i); BASEMENTNODE curr_bn = BLB(node, i);
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds); struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) { for (ANCESTORS curr_ancestors = ancestors; curr_ancestors; curr_ancestors = curr_ancestors->next) {
height++; height++;
...@@ -6006,7 +5829,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -6006,7 +5829,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
bnc_apply_messages_to_basement_node( bnc_apply_messages_to_basement_node(
t, t,
curr_bn, curr_bn,
curr_se,
curr_ancestors->node, curr_ancestors->node,
curr_ancestors->childnum, curr_ancestors->childnum,
&curr_bounds &curr_bounds
...@@ -6026,7 +5848,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -6026,7 +5848,6 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
BRTNODE prev_node = node; BRTNODE prev_node = node;
while (curr_ancestors) { while (curr_ancestors) {
BRTNODE next_node = curr_ancestors->node; BRTNODE next_node = curr_ancestors->node;
fixup_child_estimates(next_node, curr_ancestors->childnum, prev_node, FALSE);
prev_node = next_node; prev_node = next_node;
curr_ancestors = curr_ancestors->next; curr_ancestors = curr_ancestors->next;
} }
...@@ -6971,87 +6792,173 @@ toku_brt_cursor_delete(BRT_CURSOR cursor, int flags, TOKUTXN txn) { ...@@ -6971,87 +6792,173 @@ toku_brt_cursor_delete(BRT_CURSOR cursor, int flags, TOKUTXN txn) {
/* ********************* keyrange ************************ */ /* ********************* keyrange ************************ */
static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, struct keyrange_compare_s {
u_int32_t fullhash, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, BRT brt;
ANCESTORS ancestors, struct pivot_bounds const * const bounds) { DBT *key;
BRTNODE node; };
{
//assert(fullhash == toku_cachetable_hash(brt->cf, nodename)); static int keyrange_compare (OMTVALUE lev, void *extra) {
struct brtnode_fetch_extra bfe; LEAFENTRY le = lev;
fill_bfe_for_min_read(&bfe, brt->h); u_int32_t keylen;
toku_pin_brtnode_holding_lock(brt, nodename, fullhash, void* key = le_key_and_len(le, &keylen);
ancestors, bounds, &bfe, FALSE, DBT omt_dbt;
&node); toku_fill_dbt(&omt_dbt, key, keylen);
assert(node->fullhash==fullhash); struct keyrange_compare_s *s = extra;
} return s->brt->compare_fun(s->brt->db, &omt_dbt, s->key);
int n_keys = node->n_children-1; }
int compares[n_keys];
int i; static void keyrange_in_leaf_partition (BRT brt, BRTNODE node, DBT *key, int child_number, int estimated_row_size,
FAKE_DB(db, &brt->h->descriptor); u_int64_t *less, u_int64_t *equal, u_int64_t *greater)
for (i=0; i<n_keys; i++) { // If the partition is in main memory then estimate the number
struct kv_pair *pivot = node->childkeys[i]; {
DBT dbt; assert(node->height==0); // we are in a leaf
compares[i] = brt->compare_fun(&db, toku_fill_dbt(&dbt, kv_pair_key(pivot), kv_pair_keylen(pivot)), key); if (BP_STATE(node, child_number) == PT_AVAIL) {
} // If the partition is in main memory then get an exact count.
for (i=0; i<node->n_children; i++) { struct keyrange_compare_s s = {brt,key};
int prevcomp = (i==0) ? -1 : compares[i-1]; BASEMENTNODE bn = BLB(node, child_number);
int nextcomp = (i+1 >= n_keys) ? 1 : compares[i]; OMTVALUE datav;
u_int64_t subest = BP_SUBTREE_EST(node,i).ndata; u_int32_t idx = 0;
if (nextcomp < 0) { int r = toku_omt_find_zero(bn->buffer, keyrange_compare, &s, &datav, &idx);
// We're definitely looking too far to the left if (r==0) {
*less += subest; *less = idx;
} else if (prevcomp > 0) { *equal = 1;
// We're definitely looking too far to the right *greater = toku_omt_size(bn->buffer)-idx-1;
*greater += subest;
} else if (prevcomp == 0 && nextcomp == 0) {
// We're looking at a subtree that contains all zeros
*equal += subest;
} else { } else {
// nextcomp>=0 and prevcomp<=0, so something in the subtree could match // If not found, then the idx says where it's between.
// but they are not both zero, so it's not the whole subtree, so we need to recurse *less = idx;
struct ancestors next_ancestors = {node, i, ancestors}; *equal = 0;
const struct pivot_bounds next_bounds = next_pivot_keys(node, i, bounds); *greater = toku_omt_size(bn->buffer)-idx;
if (node->height > 0) { }
toku_brt_keyrange_internal(brt, BP_BLOCKNUM(node, i), compute_child_fullhash(brt->cf, node, i), key, less, equal, greater, } else {
&next_ancestors, &next_bounds); u_int32_t size = BP_SIZE(node, child_number);
} *less = (size/2)/estimated_row_size;
else { *equal = 0;
if (BP_STATE(node,i) == PT_AVAIL) { *greater = *less;
struct cmd_leafval_heaviside_extra be = {brt->compare_fun, &brt->h->descriptor, key}; }
u_int32_t idx; }
int r = toku_omt_find_zero(BLB_BUFFER(node, i), toku_cmd_leafval_heaviside, &be, 0, &idx);
*less += idx; static u_int64_t estimate_rows_in_leaf_partition (BRTNODE node, int child_number, int estimated_row_size) {
*greater += toku_omt_size(BLB_BUFFER(node, i))-idx; assert(node->height==0);
if (r==0) { if (BP_STATE(node, child_number) == PT_AVAIL) {
(*greater)--; return toku_omt_size(BLB(node, child_number)->buffer);
(*equal)++; } else {
} return BP_SIZE(node, child_number)/estimated_row_size;
} }
else { }
// In this case, we need to search the basement node, but it is not available
// We do not want to incur a disk seek just to get an estimate, so, we static u_int64_t ipow (u_int64_t v, int exp)
// just take a guess. Arbitrarily say half the elements are less, and half are greater // Compute v^exp
u_int64_t bn_subest = BP_SUBTREE_EST(node,i).ndata; {
*less += bn_subest/2; u_int64_t result = 1;
*greater += bn_subest - (bn_subest/2); for (int j=0; j<exp; j++) {
} result*=v;
}
return result;
}
static int toku_brt_keyrange_internal (BRT brt, BRTNODE node,
DBT *key,
int estimated_row_size,
struct brtnode_fetch_extra *bfe, // set up to read a minimal read.
u_int64_t *less, u_int64_t *equal, u_int64_t *greater,
struct unlockers *unlockers, ANCESTORS ancestors, struct pivot_bounds const * const bounds)
// Implementation note: Assign values to less, equal, and greater, and then on the way out (returning up the stack) we add more values in.
{
int r = 0;
int child_number = toku_brtnode_which_child (node, key, &brt->h->descriptor, brt->compare_fun);
if (node->height==0) {
// we are at the leaf.
keyrange_in_leaf_partition(brt, node, key, child_number, estimated_row_size, less, equal, greater);
for (int j=0; j<child_number; j++) {
*less += estimate_rows_in_leaf_partition(node, j, estimated_row_size);
}
for (int j=child_number+1; j<node->n_children; j++) {
*greater += estimate_rows_in_leaf_partition(node, j, estimated_row_size);
}
} else {
// do the child.
struct ancestors next_ancestors = {node, child_number, ancestors};
BLOCKNUM childblocknum = BP_BLOCKNUM(node, child_number);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, child_number);
BRTNODE childnode;
r = toku_pin_brtnode(brt, childblocknum, fullhash, unlockers, &next_ancestors, bounds, bfe, &childnode);
if (r!=TOKUDB_TRY_AGAIN) {
assert(r==0);
struct unlock_brtnode_extra unlock_extra = {brt,childnode};
struct unlockers next_unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, unlockers};
const struct pivot_bounds next_bounds = next_pivot_keys(node, child_number, bounds);
r = toku_brt_keyrange_internal(brt, childnode, key, estimated_row_size, bfe,
less, equal, greater,
&next_unlockers, &next_ancestors, &next_bounds);
if (r!=TOKUDB_TRY_AGAIN) {
assert(r==0);
// Then update the left and right
// Estimate the number of leaf nodes below assuming the fanout is BRT_FANOUT*3/4.
// Estimate the size of each leaf node as if it is 3/4 full.
u_int64_t subtree_rows_estimate = (ipow(BRT_FANOUT*3/4, node->height)*(brt->nodesize*3/4))/estimated_row_size;
*less += subtree_rows_estimate*child_number;
*greater += subtree_rows_estimate*(node->n_children-child_number-1);
assert(unlockers->locked);
toku_unpin_brtnode(brt, childnode);
} }
} }
} }
toku_unpin_brtnode(brt, node); return r;
} }
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) { int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less_p, u_int64_t *equal_p, u_int64_t *greater_p)
assert(brt->h); // Effect: Return an estimate of the number of keys to the left, the number equal, and the number to the right of the key.
u_int32_t fullhash; // The values are an estimate.
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash); // If you perform a keyrange on two keys that are in the same in-memory leaf entry, you can subtract the keys_right numbers (or the keys_left numbers)
// to get an exact number of keys in the range.
{
try_again:
{
u_int64_t less = 0, equal = 0, greater = 0;
assert(brt->h);
u_int32_t fullhash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
*less = *equal = *greater = 0; struct brtnode_fetch_extra bfe;
toku_brt_keyrange_internal (brt, *rootp, fullhash, key, less, equal, greater, fill_bfe_for_min_read(&bfe,
(ANCESTORS)NULL, &infinite_bounds); brt->h);
BRTNODE node;
{
int r = toku_pin_brtnode(brt, *rootp, fullhash,(UNLOCKERS)NULL,(ANCESTORS)NULL, &infinite_bounds, &bfe, &node);
assert(r==0 || r== TOKUDB_TRY_AGAIN);
if (r == TOKUDB_TRY_AGAIN) {
goto try_again;
}
}
struct unlock_brtnode_extra unlock_extra = {brt,node};
struct unlockers unlockers = {TRUE, unlock_brtnode_fun, (void*)&unlock_extra, (UNLOCKERS)NULL};
{
int r = toku_brt_keyrange_internal (brt, node, key,
100, /* for now we are using 100 as the estimate of the row size. Later we'd like a better estimate. */
&bfe, &less, &equal, &greater,
&unlockers, (ANCESTORS)NULL, &infinite_bounds);
assert(r==0 || r== TOKUDB_TRY_AGAIN);
if (r==TOKUDB_TRY_AGAIN) {
assert(!unlockers.locked);
goto try_again;
}
}
assert(unlockers.locked);
toku_unpin_brtnode(brt, node);
*less_p = less;
*equal_p = equal;
*greater_p = greater;
}
return 0; return 0;
} }
int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) { int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
{ {
int64_t file_size; int64_t file_size;
...@@ -7063,6 +6970,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) { ...@@ -7063,6 +6970,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
} }
assert(brt->h); assert(brt->h);
#if 0
u_int32_t fullhash; u_int32_t fullhash;
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash); CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
CACHEKEY root = *rootp; CACHEKEY root = *rootp;
...@@ -7079,13 +6987,27 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) { ...@@ -7079,13 +6987,27 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
s->ndata += se->ndata; s->ndata += se->ndata;
s->dsize += se->dsize; s->dsize += se->dsize;
} }
{
int r = toku_cachetable_unpin(brt->cf, root, fullhash, CACHETABLE_CLEAN, make_brtnode_pair_attr(node));
if (r!=0) return r;
}
#else
// A hack for now.
{
DBT key = {.size=0, .data=""};
u_int64_t less=0, equal=0, greater=0;
int r = toku_brt_keyrange(brt, &key, &less, &equal, &greater);
assert(r==0);
s->nkeys = less + equal + greater;
s->ndata = less + equal + greater;
s->dsize = s->nkeys * 100; // estimate for now.
}
#endif
// 4018 // 4018
s->create_time_sec = brt->h->time_of_creation; s->create_time_sec = brt->h->time_of_creation;
s->modify_time_sec = brt->h->time_of_last_modification; s->modify_time_sec = brt->h->time_of_last_modification;
s->verify_time_sec = brt->h->time_of_last_verification; s->verify_time_sec = brt->h->time_of_last_verification;
int r = toku_cachetable_unpin(brt->cf, root, fullhash, CACHETABLE_CLEAN, make_brtnode_pair_attr(node));
if (r!=0) return r;
return 0; return 0;
} }
...@@ -7130,12 +7052,6 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -7130,12 +7052,6 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
fprintf(file, "\n"); fprintf(file, "\n");
} }
for (i=0; i< node->n_children; i++) { for (i=0; i< node->n_children; i++) {
{
SUBTREE_EST e = &BP_SUBTREE_EST(node,i);
fprintf(file, " est={n=%" PRIu64 " k=%" PRIu64 " s=%" PRIu64 " e=%d}",
e->ndata, e->nkeys, e->dsize, (int)e->exact);
}
fprintf(file, "\n");
if (node->height > 0) { if (node->height > 0) {
NONLEAF_CHILDINFO bnc = BNC(node, i); NONLEAF_CHILDINFO bnc = BNC(node, i);
fprintf(file, "%*schild %d buffered (%d entries):", depth+1, "", i, toku_bnc_n_entries(bnc)); fprintf(file, "%*schild %d buffered (%d entries):", depth+1, "", i, toku_bnc_n_entries(bnc));
......
...@@ -296,9 +296,6 @@ BOOL toku_brt_is_empty_fast (BRT brt) __attribute__ ((warn_unused_result)); ...@@ -296,9 +296,6 @@ BOOL toku_brt_is_empty_fast (BRT brt) __attribute__ ((warn_unused_result));
BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_result)); BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_result));
void toku_brt_bn_reset_stats(BRTNODE node, int childnum);
void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node);
int toku_brt_strerror_r(int error, char *buf, size_t buflen); int toku_brt_strerror_r(int error, char *buf, size_t buflen);
// Effect: Like the XSI-compliant strerror_r, extended to db_strerror(). // Effect: Like the XSI-compliant strerror_r, extended to db_strerror().
// If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message. // If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message.
......
...@@ -145,13 +145,6 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -145,13 +145,6 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
printf(" n_children=%d\n", n->n_children); printf(" n_children=%d\n", n->n_children);
printf(" total_childkeylens=%u\n", n->totalchildkeylens); printf(" total_childkeylens=%u\n", n->totalchildkeylens);
printf(" subleafentry_estimates={");
for (int i=0; i<n->n_children; i++) {
if (i>0) printf(" ");
struct subtree_estimates *est = &BP_SUBTREE_EST(n,i);
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F");
}
printf("}\n");
printf(" pivots:\n"); printf(" pivots:\n");
for (int i=0; i<n->n_children-1; i++) { for (int i=0; i<n->n_children-1; i++) {
struct kv_pair *piv = n->childkeys[i]; struct kv_pair *piv = n->childkeys[i];
......
...@@ -2004,7 +2004,6 @@ int merge_files (struct merge_fileset *fs, ...@@ -2004,7 +2004,6 @@ int merge_files (struct merge_fileset *fs,
struct subtree_info { struct subtree_info {
int64_t block; int64_t block;
struct subtree_estimates subtree_estimates;
}; };
struct subtrees_info { struct subtrees_info {
...@@ -2024,12 +2023,11 @@ static void subtrees_info_destroy(struct subtrees_info *p) { ...@@ -2024,12 +2023,11 @@ static void subtrees_info_destroy(struct subtrees_info *p) {
p->subtrees = NULL; p->subtrees = NULL;
} }
static void allocate_node (struct subtrees_info *sts, int64_t b, const struct subtree_estimates est) { static void allocate_node (struct subtrees_info *sts, int64_t b) {
if (sts->n_subtrees >= sts->n_subtrees_limit) { if (sts->n_subtrees >= sts->n_subtrees_limit) {
sts->n_subtrees_limit *= 2; sts->n_subtrees_limit *= 2;
XREALLOC_N(sts->n_subtrees_limit, sts->subtrees); XREALLOC_N(sts->n_subtrees_limit, sts->subtrees);
} }
sts->subtrees[sts->n_subtrees].subtree_estimates = est;
sts->subtrees[sts->n_subtrees].block = b; sts->subtrees[sts->n_subtrees].block = b;
sts->n_subtrees++; sts->n_subtrees++;
} }
...@@ -2363,8 +2361,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, ...@@ -2363,8 +2361,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl,
progress_allocation -= progress_this_node; progress_allocation -= progress_this_node;
old_n_rows_remaining = n_rows_remaining; old_n_rows_remaining = n_rows_remaining;
struct subtree_estimates est = make_subtree_estimates(lbuf->nkeys, lbuf->ndata, lbuf->dsize, TRUE); allocate_node(&sts, lblock);
allocate_node(&sts, lblock, est);
n_pivots++; n_pivots++;
...@@ -2406,8 +2403,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl, ...@@ -2406,8 +2403,7 @@ static int toku_loader_write_brt_from_q (BRTLOADER bl,
cleanup_maxkey(&maxkey); cleanup_maxkey(&maxkey);
if (lbuf) { if (lbuf) {
struct subtree_estimates est = make_subtree_estimates(lbuf->nkeys, lbuf->ndata, lbuf->dsize, TRUE); allocate_node(&sts, lblock);
allocate_node(&sts, lblock, est);
{ {
int p = progress_allocation/2; int p = progress_allocation/2;
finish_leafnode(&out, lbuf, p, bl, target_basementnodesize); finish_leafnode(&out, lbuf, p, bl, target_basementnodesize);
...@@ -2776,7 +2772,7 @@ static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int ...@@ -2776,7 +2772,7 @@ static void add_pair_to_leafnode (struct leaf_buf *lbuf, unsigned char *key, int
DBT theval = { .data = val, .size = vallen }; DBT theval = { .data = val, .size = vallen };
BRT_MSG_S cmd = { BRT_INSERT, ZERO_MSN, lbuf->xids, .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, ZERO_MSN, lbuf->xids, .u.id = { &thekey, &theval } };
uint64_t workdone=0; uint64_t workdone=0;
brt_leaf_apply_cmd_once(BLB(leafnode,0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, &workdone); brt_leaf_apply_cmd_once(BLB(leafnode,0), &cmd, idx, NULL, NULL, NULL, &workdone);
} }
static int write_literal(struct dbout *out, void*data, size_t len) { static int write_literal(struct dbout *out, void*data, size_t len) {
...@@ -2961,13 +2957,10 @@ static int setup_nonleaf_block (int n_children, ...@@ -2961,13 +2957,10 @@ static int setup_nonleaf_block (int n_children,
toku_free(pivots[n_children-1].data); toku_free(pivots[n_children-1].data);
pivots[n_children-1] = zero_dbt; pivots[n_children-1] = zero_dbt;
struct subtree_estimates new_subtree_estimates = zero_estimates;
struct subtree_info *XMALLOC_N(n_children, subtrees_array); struct subtree_info *XMALLOC_N(n_children, subtrees_array);
for (int i = 0; i < n_children; i++) { for (int i = 0; i < n_children; i++) {
int64_t from_blocknum = first_child_offset_in_subtrees + i; int64_t from_blocknum = first_child_offset_in_subtrees + i;
subtrees_array[i] = subtrees->subtrees[from_blocknum]; subtrees_array[i] = subtrees->subtrees[from_blocknum];
add_estimates(&new_subtree_estimates, &subtrees->subtrees[from_blocknum].subtree_estimates);
} }
int r = allocate_block(out, blocknum); int r = allocate_block(out, blocknum);
...@@ -2975,7 +2968,7 @@ static int setup_nonleaf_block (int n_children, ...@@ -2975,7 +2968,7 @@ static int setup_nonleaf_block (int n_children,
toku_free(subtrees_array); toku_free(subtrees_array);
result = r; result = r;
} else { } else {
allocate_node(next_subtrees, *blocknum, new_subtree_estimates); allocate_node(next_subtrees, *blocknum);
*pivots_p = pivots; *pivots_p = pivots;
*subtrees_info_p = subtrees_array; *subtrees_info_p = subtrees_array;
...@@ -3021,7 +3014,6 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu ...@@ -3021,7 +3014,6 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu
assert(node->bp); assert(node->bp);
for (int i=0; i<n_children; i++) { for (int i=0; i<n_children; i++) {
BP_BLOCKNUM(node,i) = make_blocknum(subtree_info[i].block); BP_BLOCKNUM(node,i) = make_blocknum(subtree_info[i].block);
BP_SUBTREE_EST(node,i) = subtree_info[i].subtree_estimates;
BP_STATE(node,i) = PT_AVAIL; BP_STATE(node,i) = PT_AVAIL;
} }
......
...@@ -24,7 +24,6 @@ typedef struct brtnode *BRTNODE; ...@@ -24,7 +24,6 @@ typedef struct brtnode *BRTNODE;
typedef struct brtnode_leaf_basement_node *BASEMENTNODE; typedef struct brtnode_leaf_basement_node *BASEMENTNODE;
typedef struct brtnode_nonleaf_childinfo *NONLEAF_CHILDINFO; typedef struct brtnode_nonleaf_childinfo *NONLEAF_CHILDINFO;
typedef struct sub_block *SUB_BLOCK; typedef struct sub_block *SUB_BLOCK;
typedef struct subtree_estimates *SUBTREE_EST;
struct brt_header; struct brt_header;
struct wbuf; struct wbuf;
struct dbuf; struct dbuf;
......
...@@ -247,14 +247,6 @@ test_serialize_nonleaf(void) { ...@@ -247,14 +247,6 @@ test_serialize_nonleaf(void) {
sn.totalchildkeylens = 6; sn.totalchildkeylens = 6;
BP_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35; BP_BLOCKNUM(&sn, 1).b = 35;
BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL; BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL; BP_STATE(&sn,1) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl()); set_BNC(&sn, 0, toku_create_empty_nl());
...@@ -352,14 +344,6 @@ test_serialize_leaf(void) { ...@@ -352,14 +344,6 @@ test_serialize_leaf(void) {
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0);
sn.totalchildkeylens = 2; sn.totalchildkeylens = 2;
BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL; BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL; BP_STATE(&sn,1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn()); set_BLB(&sn, 0, toku_create_empty_bn());
......
...@@ -74,10 +74,6 @@ test_serialize_leaf(int valsize, int nelts, double entropy) { ...@@ -74,10 +74,6 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.totalchildkeylens = 0; sn.totalchildkeylens = 0;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
} }
...@@ -197,10 +193,6 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) { ...@@ -197,10 +193,6 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
sn.totalchildkeylens = 0; sn.totalchildkeylens = 0;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_BLOCKNUM(&sn, i).b = 30 + (i*5); BP_BLOCKNUM(&sn, i).b = 30 + (i*5);
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
set_BNC(&sn, i, toku_create_empty_nl()); set_BNC(&sn, i, toku_create_empty_nl());
} }
......
...@@ -201,14 +201,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { ...@@ -201,14 +201,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0);
sn.totalchildkeylens = 2; sn.totalchildkeylens = 2;
BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL; BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL; BP_STATE(&sn,1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn()); set_BLB(&sn, 0, toku_create_empty_bn());
...@@ -251,16 +243,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { ...@@ -251,16 +243,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
setup_dn(bft, fd, brt_h, &dn); setup_dn(bft, fd, brt_h, &dn);
//
// test that subtree estimates get set
// rebalancing should make it 1 basement
//
assert(BP_SUBTREE_EST(&sn,0).nkeys == 3);
assert(BP_SUBTREE_EST(dn,0).nkeys == 3);
assert(BP_SUBTREE_EST(&sn,0).ndata == 3);
assert(BP_SUBTREE_EST(dn,0).ndata == 3);
assert(dn->thisnodename.b==20); assert(dn->thisnodename.b==20);
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
...@@ -349,10 +331,6 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { ...@@ -349,10 +331,6 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) {
sn.totalchildkeylens = (sn.n_children-1)*sizeof(int); sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
} }
for (int i = 0; i < nrows; ++i) { for (int i = 0; i < nrows; ++i) {
...@@ -471,10 +449,6 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { ...@@ -471,10 +449,6 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) {
sn.totalchildkeylens = (sn.n_children-1)*sizeof(int); sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
} }
BLB_NBYTESINBUF(&sn, 0) = 0; BLB_NBYTESINBUF(&sn, 0) = 0;
...@@ -595,10 +569,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { ...@@ -595,10 +569,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
sn.totalchildkeylens = (sn.n_children-1)*8; sn.totalchildkeylens = (sn.n_children-1)*8;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
} }
BLB_NBYTESINBUF(&sn, 0) = 0; BLB_NBYTESINBUF(&sn, 0) = 0;
...@@ -717,10 +687,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { ...@@ -717,10 +687,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) {
sn.totalchildkeylens = (sn.n_children-1)*2; sn.totalchildkeylens = (sn.n_children-1)*2;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
BLB_SEQINSERT(&sn, i) = 0; BLB_SEQINSERT(&sn, i) = 0;
} }
...@@ -841,10 +807,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type ...@@ -841,10 +807,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
sn.totalchildkeylens = (sn.n_children-1)*2; sn.totalchildkeylens = (sn.n_children-1)*2;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
BP_STATE(&sn,i) = PT_AVAIL; BP_STATE(&sn,i) = PT_AVAIL;
BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
set_BLB(&sn, i, toku_create_empty_bn()); set_BLB(&sn, i, toku_create_empty_bn());
} }
BLB_NBYTESINBUF(&sn, 0) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 0)); BLB_NBYTESINBUF(&sn, 0) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 0));
...@@ -956,14 +918,6 @@ test_serialize_leaf(enum brtnode_verify_type bft) { ...@@ -956,14 +918,6 @@ test_serialize_leaf(enum brtnode_verify_type bft) {
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0);
sn.totalchildkeylens = 2; sn.totalchildkeylens = 2;
BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL; BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL; BP_STATE(&sn,1) = PT_AVAIL;
set_BLB(&sn, 0, toku_create_empty_bn()); set_BLB(&sn, 0, toku_create_empty_bn());
...@@ -1085,14 +1039,6 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) { ...@@ -1085,14 +1039,6 @@ test_serialize_nonleaf(enum brtnode_verify_type bft) {
sn.totalchildkeylens = 6; sn.totalchildkeylens = 6;
BP_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 0).b = 30;
BP_BLOCKNUM(&sn, 1).b = 35; BP_BLOCKNUM(&sn, 1).b = 35;
BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL; BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL; BP_STATE(&sn,1) = PT_AVAIL;
set_BNC(&sn, 0, toku_create_empty_nl()); set_BNC(&sn, 0, toku_create_empty_nl());
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
// verify that key_range64 can deal with >2G number of keys.
// create a height 1 tree with 1 key in each subtree.
// artificially set the key estimates in the subtrees to huge (2**31).
#include "includes.h"
#include "test.h"
// Create a new node of the given height in brt and return it.
// A leaf (height 0) is created with a single child slot, which is marked PT_AVAIL.
// A nonleaf is created with no children.
static BRTNODE
make_node(BRT brt, int height) {
    BRTNODE result = NULL;
    if (height == 0) {
        toku_create_new_brtnode(brt, &result, height, 1);
        BP_STATE(result,0) = PT_AVAIL;
    } else {
        toku_create_new_brtnode(brt, &result, height, 0);
    }
    return result;
}
// Append one key/value pair to child 0 of a leaf node by applying a BRT_INSERT
// message positioned just past the entries already in that child's buffer.
// The node is marked dirty afterwards.
static void
append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen) {
    assert(leafnode->height == 0);

    DBT key_dbt;
    toku_fill_dbt(&key_dbt, key, keylen);
    DBT val_dbt;
    toku_fill_dbt(&val_dbt, val, vallen);

    // the current size of the buffer is the index at which the new entry goes
    uint32_t insert_at = toku_omt_size(BLB_BUFFER(leafnode, 0));

    // build the insert message (with a fresh dummy MSN) and apply it to the leaf
    MSN insert_msn = next_dummymsn();
    BRT_MSG_S insert_msg = { BRT_INSERT, insert_msn, xids_get_root_xids(), .u.id = { &key_dbt, &val_dbt } };
    brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &insert_msg, insert_at, NULL, NULL, NULL, NULL);

    // remember to mark the node dirty
    leafnode->dirty = 1;
}
// Insert n consecutive pairs into leafnode, starting at sequence number seq.
// Each key is htonl(seq+i) and each value is seq+i.
// On return *minkey holds htonl(seq) and *maxkey holds htonl(seq+n-1).
static void
populate_leaf(BRTNODE leafnode, int seq, int n, int *minkey, int *maxkey) {
    int i = 0;
    while (i < n) {
        const int value = seq + i;
        int key = htonl(value);
        int val = value;
        append_leaf(leafnode, &key, sizeof key, &val, sizeof val);
        i++;
    }
    *minkey = htonl(seq);
    *maxkey = htonl(seq + n - 1);
}
// Recursively build a tree of the given height and return its root node.
//   height 0:  a leaf filled with nperleaf consecutive keys starting at *seq;
//              *seq is advanced by nperleaf.
//   height >0: a nonleaf with fanout children, each built recursively.  After a
//              child is appended, its subtree estimate is overwritten with
//              nkeys = ndata = subtree_size, dsize = 0, exact = FALSE, and the
//              child is unpinned.
// *minkey and *maxkey receive the smallest and largest keys (as stored, i.e. in the
// htonl form produced by populate_leaf) found under the returned node.
// fanout must be at least 1 when height > 0.
static BRTNODE
make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey, int *maxkey, uint64_t subtree_size) {
    BRTNODE node;
    if (height == 0) {
        node = make_node(brt, 0);
        populate_leaf(node, *seq, nperleaf, minkey, maxkey);
        *seq += nperleaf;
    } else {
        // The arrays below are variable-length and element 0 of each is read
        // unconditionally, so a non-positive fanout would be undefined behavior.
        assert(fanout > 0);
        node = make_node(brt, height);
        int minkeys[fanout], maxkeys[fanout];
        for (int childnum = 0; childnum < fanout; childnum++) {
            BRTNODE child = make_tree(brt, height-1, fanout, nperleaf, seq, &minkeys[childnum], &maxkeys[childnum], subtree_size);
            if (childnum == 0)
                toku_brt_nonleaf_append_child(node, child, NULL, 0);
            else {
                int k = minkeys[childnum]; // use the min key of the right subtree, which creates a broken tree
                struct kv_pair *pivotkey = kv_pair_malloc(&k, sizeof k, NULL, 0);
                toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
            }
            // artificially set the estimates for this child (marked as not exact)
            BP_SUBTREE_EST(node,childnum) = make_subtree_estimates(subtree_size, subtree_size, 0, FALSE);
            toku_unpin_brtnode(brt, child);
        }
        // reduce the per-child extremes to the extremes of this subtree;
        // the keys are compared bytewise, as they are stored
        *minkey = minkeys[0];
        *maxkey = maxkeys[0];
        for (int i = 1; i < fanout; i++) {
            if (memcmp(minkey, &minkeys[i], sizeof minkeys[i]) > 0)
                *minkey = minkeys[i];
            if (memcmp(maxkey, &maxkeys[i], sizeof maxkeys[i]) < 0)
                *maxkey = maxkeys[i];
        }
    }
    return node;
}
// Build a tree with make_tree(), install it as the root of a freshly created brt,
// and check that toku_brt_keyrange() for the key htonl(0) reports
// less == 0, equal == 1 and greater == subtree_size.
// The brt and its cachetable are closed before returning.
static void
test_make_tree(int height, int fanout, int nperleaf, uint64_t subtree_size) {
    // remove any file left behind by an earlier run (a missing file is fine)
    char fname[]= __FILE__ ".brt";
    int r = unlink(fname);
    assert(r == 0 || (r == -1 && errno == ENOENT));

    // set up the cachetable
    CACHETABLE ct = NULL;
    r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
    assert(r == 0);

    // open (create) the brt on top of it
    TOKUTXN null_txn = NULL;
    DB *null_db = NULL;
    BRT brt = NULL;
    r = toku_open_brt(fname, 1, &brt, 1024, 256, ct, null_txn, toku_builtin_compare_fun, null_db);
    assert(r == 0);

    // build the hand-made tree
    int seq = 0;
    int minkey, maxkey;
    BRTNODE newroot = make_tree(brt, height, fanout, nperleaf, &seq, &minkey, &maxkey, subtree_size);

    // redirect the brt's root pointer to the hand-made tree, then unpin its root
    u_int32_t fullhash = 0;
    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
    *rootp = newroot->thisnodename;
    toku_unpin_brtnode(brt, newroot);

    // ask for the key range estimate around the first key
    uint64_t less, equal, greater;
    int k = htonl(0);
    DBT key;
    toku_fill_dbt(&key, &k, sizeof k);
    r = toku_brt_keyrange(brt, &key, &less, &equal, &greater);
    assert_zero(r);
    assert(less == 0 && equal == 1 && greater == subtree_size);

    // close the brt, then shut down the cachetable
    r = toku_close_brt(brt, 0);
    assert(r == 0);
    r = toku_cachetable_close(&ct);
    assert(r == 0);
}
// Called when the command line cannot be understood.
// Prints nothing; it only yields the nonzero status that the caller returns.
static int
usage(void) {
    enum { BAD_ARGS_STATUS = 1 };
    return BAD_ARGS_STATUS;
}
// Test driver.
// Options:
//   -v             increase verbosity
//   -q             set verbosity to 0
//   --height N     height of the tree to build (default 1)
//   --fanout N     number of children per nonleaf node (default 2)
//   --nperleaf N   number of keys per leaf (default 1)
// Any other argument, or a numeric option whose value cannot describe a tree
// (height < 0, fanout < 1, nperleaf < 0), makes the function return usage().
// Otherwise the tree test is run with subtree estimates of 0, 2^30, 2^31 and 2^32,
// and 0 is returned.
int
test_main (int argc , const char *argv[]) {
    int height = 1;
    int fanout = 2;
    int nperleaf = 1;
    for (int i = 1; i < argc; i++) {
        const char *arg = argv[i];
        if (strcmp(arg, "-v") == 0) {
            verbose++;
            continue;
        }
        if (strcmp(arg, "-q") == 0) {
            verbose = 0;
            continue;
        }
        if (strcmp(arg, "--height") == 0 && i+1 < argc) {
            height = atoi(argv[++i]);
            continue;
        }
        if (strcmp(arg, "--fanout") == 0 && i+1 < argc) {
            fanout = atoi(argv[++i]);
            continue;
        }
        if (strcmp(arg, "--nperleaf") == 0 && i+1 < argc) {
            nperleaf = atoi(argv[++i]);
            continue;
        }
        return usage();
    }
    // A negative height would make make_tree() recurse without ever reaching a leaf,
    // and a fanout below 1 would give its per-child arrays a non-positive size.
    // Non-numeric values come back from atoi() as 0 and are caught here as well
    // (for --fanout).
    if (height < 0 || fanout < 1 || nperleaf < 0) {
        return usage();
    }
    test_make_tree(height, fanout, nperleaf, 0);
    test_make_tree(height, fanout, nperleaf, 1ULL << 30);
    test_make_tree(height, fanout, nperleaf, 1ULL << 31);
    test_make_tree(height, fanout, nperleaf, 1ULL << 32);
    return 0;
}
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
#ident "$Id$" #ident "$Id$"
#ident "Copyright (c) 2008 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2008 Tokutek Inc. All rights reserved."
// Test keyrange
#include "includes.h" #include "includes.h"
#include "test.h" #include "test.h"
...@@ -10,67 +12,160 @@ ...@@ -10,67 +12,160 @@
static TOKUTXN const null_txn = 0; static TOKUTXN const null_txn = 0;
static DB * const null_db = 0; static DB * const null_db = 0;
static void test_flat (void) { char fname[]= __FILE__ ".brt";
char fname[]= __FILE__ ".brt"; CACHETABLE ct;
u_int64_t limit=30000; BRT t;
unlink(fname);
CACHETABLE ct; static void close_brt_and_ct (void) {
int r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0); int r;
BRT t; r = toku_close_brt(t, 0); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
}
static void open_brt_and_ct (bool unlink_old) {
int r;
if (unlink_old) unlink(fname);
r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = toku_open_brt(fname, 1, &t, 1<<12, 1<<9, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0); r = toku_open_brt(fname, 1, &t, 1<<12, 1<<9, ct, null_txn, toku_builtin_compare_fun, null_db); assert(r==0);
u_int64_t i; }
for (i=0; i<limit; i++) {
static void close_and_reopen (void) {
close_brt_and_ct();
open_brt_and_ct(false);
}
static void reload (u_int64_t limit) {
for (u_int64_t i=0; i<limit; i++) {
char key[100],val[100]; char key[100],val[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1); snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
snprintf(val, 100, "%08llu", (unsigned long long)2*i+1); snprintf(val, 100, "%08llu", (unsigned long long)2*i+1);
DBT k,v; brt_lookup_and_check_nodup(t, key, val);
r = toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v,val, 1+strlen(val)), null_txn);
assert(r == 0);
} }
// flatten it. }
for (i=0; i<limit; i++) {
char key[100]; enum memory_state {
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1); LEAVE_IN_MEMORY, // leave the state in main memory
DBT k; CLOSE_AND_RELOAD, // close the brts and reload them into main memory (that will cause >1 partition in many leaves.)
struct check_pair pair = {1+strlen(key), key, 1+strlen(key), key, 0}; CLOSE_AND_REOPEN_LEAVE_ON_DISK // close the brts, reopen them, but leave the state on disk.
r = toku_brt_lookup(t, toku_fill_dbt(&k, key, 1+strlen(key)), lookup_checkf, &pair); };
assert(r==0);
assert(pair.call_count==1); static void maybe_reopen (enum memory_state ms, u_int64_t limit) {
switch (ms) {
case CLOSE_AND_RELOAD:
close_and_reopen();
reload(limit);
return;
case CLOSE_AND_REOPEN_LEAVE_ON_DISK:
close_and_reopen();
return;
case LEAVE_IN_MEMORY:
return;
} }
for (i=0; i<limit; i++) { assert(0);
char key[100]; }
static void test_keyrange (enum memory_state ms) {
u_int64_t limit=30000;
open_brt_and_ct(true);
for (u_int64_t i=0; i<limit; i++) {
char key[100],val[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1); snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
DBT k; snprintf(val, 100, "%08llu", (unsigned long long)2*i+1);
u_int64_t less,equal,greater; DBT k,v;
r = toku_brt_keyrange(t, toku_fill_dbt(&k, key, 1+strlen(key)), &less, &equal, &greater); int r = toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v,val, 1+strlen(val)), null_txn);
assert(r == 0); assert(r == 0);
//printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater);
assert(less==(u_int64_t)i);
assert(equal==1);
assert(less+equal+greater == limit);
} }
for (i=0; i<1+limit; i++) { maybe_reopen(ms, limit);
{
u_int64_t prev_less = 0, prev_greater = 1LL << 60;
u_int64_t count_less_adjacent = 0, count_greater_adjacent = 0; // count the number of times that the next value is 1 more (less) than the previous.
for (u_int64_t i=0; i<limit; i++) {
char key[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i+1);
DBT k;
u_int64_t less,equal,greater;
int r = toku_brt_keyrange(t, toku_fill_dbt(&k, key, 1+strlen(key)), &less, &equal, &greater);
assert(r == 0);
//printf("Pkey %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i+1, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater);
// It's an estimate, and the values don't even change monotonically.
// And all the leaves are in main memory so it's always present.
if (ms!=CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
assert(equal==1);
// The first few items are exact for less.
if (i<70) {
assert(less==i);
}
// The last few items are exact for greater.
if (limit-i<70) {
assert(greater==limit-i-1);
}
} else {
// If we closed it, it's not in main memory, and so the less and greater estimates are wrong, and we set equal to 0.
assert(equal==0);
if (i<10) {
assert(less==0);
}
if (limit-i<10) {
assert(greater==0);
}
}
// Count the number of times that prev_less is 1 less than less.
if (prev_less+1 == less) {
count_less_adjacent++;
}
if (prev_greater-1 == greater) {
count_greater_adjacent++;
}
// the best we can do: It's an estimate. At least in the current implementation for this test (which has small rows)
// the estimate grows monotonically as the leaf grows.
prev_less = less;
prev_greater = greater;
}
if (ms!=CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
// If we were doing the in-memory case then most keys are adjacent.
assert(count_less_adjacent >= 0.9 * limit); // we expect at least 90% to be right.
assert(count_greater_adjacent >= 0.9 * limit); // we expect at least 90% to be right.
}
}
maybe_reopen(ms, limit);
for (u_int64_t i=0; i<1+limit; i++) {
char key[100]; char key[100];
snprintf(key, 100, "%08llu", (unsigned long long)2*i); snprintf(key, 100, "%08llu", (unsigned long long)2*i);
DBT k; DBT k;
u_int64_t less,equal,greater; u_int64_t less,equal,greater;
r = toku_brt_keyrange(t, toku_fill_dbt(&k, key, 1+strlen(key)), &less, &equal, &greater); int r = toku_brt_keyrange(t, toku_fill_dbt(&k, key, 1+strlen(key)), &less, &equal, &greater);
assert(r == 0); assert(r == 0);
//printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater); //printf("Akey %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater);
assert(less==(u_int64_t)i);
assert(equal==0); assert(equal==0);
assert(less+equal+greater == limit); // The first few items are exact (looking up a key that's not there)
if (ms!=CLOSE_AND_REOPEN_LEAVE_ON_DISK) {
if (i<70) {
assert(less==i);
}
// The last few items are exact (looking up a key that's not there)
if (limit-i<70) {
assert(greater==limit-i);
}
} else {
if (i<10) {
assert(less==0);
}
if (limit-i<10) {
assert(greater==0);
}
}
} }
r = toku_close_brt(t, 0); assert(r==0); close_brt_and_ct();
r = toku_cachetable_close(&ct); assert(r==0);
} }
int int
test_main (int argc , const char *argv[]) { test_main (int argc , const char *argv[]) {
default_parse_args(argc, argv); default_parse_args(argc, argv);
test_flat(); test_keyrange(LEAVE_IN_MEMORY);
test_keyrange(CLOSE_AND_RELOAD);
test_keyrange(CLOSE_AND_REOPEN_LEAVE_ON_DISK);
if (verbose) printf("test ok\n"); if (verbose) printf("test ok\n");
return 0; return 0;
} }
......
...@@ -38,7 +38,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -38,7 +38,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode,0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL);
leafnode->max_msn_applied_to_node_on_disk = msn; leafnode->max_msn_applied_to_node_on_disk = msn;
......
...@@ -89,7 +89,7 @@ insert_random_message(NONLEAF_CHILDINFO bnc, BRT_MSG_S **save, bool *is_fresh_ou ...@@ -89,7 +89,7 @@ insert_random_message(NONLEAF_CHILDINFO bnc, BRT_MSG_S **save, bool *is_fresh_ou
} }
static void static void
insert_random_message_to_leaf(BRT t, BRTNODE leaf, int childnum, BASEMENTNODE blb, LEAFENTRY *save, XIDS xids, int pfx) insert_random_message_to_leaf(BRT t, BASEMENTNODE blb, LEAFENTRY *save, XIDS xids, int pfx)
{ {
int keylen = (random() % 16) + 16; int keylen = (random() % 16) + 16;
int vallen = (random() % 1024) + 16; int vallen = (random() % 1024) + 16;
...@@ -116,14 +116,14 @@ insert_random_message_to_leaf(BRT t, BRTNODE leaf, int childnum, BASEMENTNODE bl ...@@ -116,14 +116,14 @@ insert_random_message_to_leaf(BRT t, BRTNODE leaf, int childnum, BASEMENTNODE bl
int r = apply_msg_to_leafentry(result, NULL, &memsize, &disksize, save, NULL, NULL); int r = apply_msg_to_leafentry(result, NULL, &memsize, &disksize, save, NULL, NULL);
assert_zero(r); assert_zero(r);
bool made_change; bool made_change;
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb, &BP_SUBTREE_EST(leaf, childnum), result, &made_change, NULL, NULL, NULL); brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb, result, &made_change, NULL, NULL, NULL);
if (msn.msn > blb->max_msn_applied.msn) { if (msn.msn > blb->max_msn_applied.msn) {
blb->max_msn_applied = msn; blb->max_msn_applied = msn;
} }
} }
static void static void
insert_same_message_to_leaves(BRT t, BRTNODE leaf1, BRTNODE leaf2, int childnum, BASEMENTNODE blb1, BASEMENTNODE blb2, LEAFENTRY *save, XIDS xids, int pfx) insert_same_message_to_leaves(BRT t, BASEMENTNODE blb1, BASEMENTNODE blb2, LEAFENTRY *save, XIDS xids, int pfx)
{ {
int keylen = (random() % 16) + 16; int keylen = (random() % 16) + 16;
int vallen = (random() % 1024) + 16; int vallen = (random() % 1024) + 16;
...@@ -150,11 +150,11 @@ insert_same_message_to_leaves(BRT t, BRTNODE leaf1, BRTNODE leaf2, int childnum, ...@@ -150,11 +150,11 @@ insert_same_message_to_leaves(BRT t, BRTNODE leaf1, BRTNODE leaf2, int childnum,
int r = apply_msg_to_leafentry(result, NULL, &memsize, &disksize, save, NULL, NULL); int r = apply_msg_to_leafentry(result, NULL, &memsize, &disksize, save, NULL, NULL);
assert_zero(r); assert_zero(r);
bool made_change; bool made_change;
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb1, &BP_SUBTREE_EST(leaf1, childnum), result, &made_change, NULL, NULL, NULL); brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb1, result, &made_change, NULL, NULL, NULL);
if (msn.msn > blb1->max_msn_applied.msn) { if (msn.msn > blb1->max_msn_applied.msn) {
blb1->max_msn_applied = msn; blb1->max_msn_applied = msn;
} }
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb2, &BP_SUBTREE_EST(leaf2, childnum), result, &made_change, NULL, NULL, NULL); brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, blb2, result, &made_change, NULL, NULL, NULL);
if (msn.msn > blb2->max_msn_applied.msn) { if (msn.msn > blb2->max_msn_applied.msn) {
blb2->max_msn_applied = msn; blb2->max_msn_applied = msn;
} }
...@@ -509,7 +509,7 @@ flush_to_leaf(BRT t, bool make_leaf_up_to_date, bool use_flush) { ...@@ -509,7 +509,7 @@ flush_to_leaf(BRT t, bool make_leaf_up_to_date, bool use_flush) {
int total_size = 0; int total_size = 0;
for (i = 0; total_size < 4*M; ++i) { for (i = 0; total_size < 4*M; ++i) {
total_size -= child_blbs[i%8]->n_bytes_in_buffer; total_size -= child_blbs[i%8]->n_bytes_in_buffer;
insert_random_message_to_leaf(t, child, i%8, child_blbs[i%8], &child_messages[i], xids_123, i%8); insert_random_message_to_leaf(t, child_blbs[i%8], &child_messages[i], xids_123, i%8);
total_size += child_blbs[i%8]->n_bytes_in_buffer; total_size += child_blbs[i%8]->n_bytes_in_buffer;
if (i % 8 < 7) { if (i % 8 < 7) {
u_int32_t keylen; u_int32_t keylen;
...@@ -730,7 +730,7 @@ flush_to_leaf_with_keyrange(BRT t, bool make_leaf_up_to_date) { ...@@ -730,7 +730,7 @@ flush_to_leaf_with_keyrange(BRT t, bool make_leaf_up_to_date) {
int total_size = 0; int total_size = 0;
for (i = 0; total_size < 4*M; ++i) { for (i = 0; total_size < 4*M; ++i) {
total_size -= child_blbs[i%8]->n_bytes_in_buffer; total_size -= child_blbs[i%8]->n_bytes_in_buffer;
insert_random_message_to_leaf(t, child, i%8, child_blbs[i%8], &child_messages[i], xids_123, i%8); insert_random_message_to_leaf(t, child_blbs[i%8], &child_messages[i], xids_123, i%8);
total_size += child_blbs[i%8]->n_bytes_in_buffer; total_size += child_blbs[i%8]->n_bytes_in_buffer;
u_int32_t keylen; u_int32_t keylen;
char *key = le_key_and_len(child_messages[i], &keylen); char *key = le_key_and_len(child_messages[i], &keylen);
...@@ -904,7 +904,7 @@ compare_apply_and_flush(BRT t, bool make_leaf_up_to_date) { ...@@ -904,7 +904,7 @@ compare_apply_and_flush(BRT t, bool make_leaf_up_to_date) {
int total_size = 0; int total_size = 0;
for (i = 0; total_size < 4*M; ++i) { for (i = 0; total_size < 4*M; ++i) {
total_size -= child1_blbs[i%8]->n_bytes_in_buffer; total_size -= child1_blbs[i%8]->n_bytes_in_buffer;
insert_same_message_to_leaves(t, child1, child2, i%8, child1_blbs[i%8], child2_blbs[i%8], &child_messages[i], xids_123, i%8); insert_same_message_to_leaves(t, child1_blbs[i%8], child2_blbs[i%8], &child_messages[i], xids_123, i%8);
total_size += child1_blbs[i%8]->n_bytes_in_buffer; total_size += child1_blbs[i%8]->n_bytes_in_buffer;
if (i % 8 < 7) { if (i % 8 < 7) {
u_int32_t keylen; u_int32_t keylen;
......
...@@ -71,10 +71,6 @@ test_split_on_boundary(void) ...@@ -71,10 +71,6 @@ test_split_on_boundary(void)
MALLOC_N(sn.n_children - 1, sn.childkeys); MALLOC_N(sn.n_children - 1, sn.childkeys);
sn.totalchildkeylens = 0; sn.totalchildkeylens = 0;
for (int bn = 0; bn < sn.n_children; ++bn) { for (int bn = 0; bn < sn.n_children; ++bn) {
BP_SUBTREE_EST(&sn,bn).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,bn) = PT_AVAIL; BP_STATE(&sn,bn) = PT_AVAIL;
set_BLB(&sn, bn, toku_create_empty_bn()); set_BLB(&sn, bn, toku_create_empty_bn());
BLB_NBYTESINBUF(&sn,bn) = 0; BLB_NBYTESINBUF(&sn,bn) = 0;
...@@ -155,10 +151,6 @@ test_split_with_everything_on_the_left(void) ...@@ -155,10 +151,6 @@ test_split_with_everything_on_the_left(void)
LEAFENTRY big_element; LEAFENTRY big_element;
char *big_val = NULL; char *big_val = NULL;
for (int bn = 0; bn < sn.n_children; ++bn) { for (int bn = 0; bn < sn.n_children; ++bn) {
BP_SUBTREE_EST(&sn,bn).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,bn) = PT_AVAIL; BP_STATE(&sn,bn) = PT_AVAIL;
set_BLB(&sn, bn, toku_create_empty_bn()); set_BLB(&sn, bn, toku_create_empty_bn());
BLB_NBYTESINBUF(&sn,bn) = 0; BLB_NBYTESINBUF(&sn,bn) = 0;
...@@ -249,10 +241,6 @@ test_split_on_boundary_of_last_node(void) ...@@ -249,10 +241,6 @@ test_split_on_boundary_of_last_node(void)
LEAFENTRY big_element; LEAFENTRY big_element;
char *big_val = NULL; char *big_val = NULL;
for (int bn = 0; bn < sn.n_children; ++bn) { for (int bn = 0; bn < sn.n_children; ++bn) {
BP_SUBTREE_EST(&sn,bn).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,bn) = PT_AVAIL; BP_STATE(&sn,bn) = PT_AVAIL;
set_BLB(&sn, bn, toku_create_empty_bn()); set_BLB(&sn, bn, toku_create_empty_bn());
BLB_NBYTESINBUF(&sn,bn) = 0; BLB_NBYTESINBUF(&sn,bn) = 0;
...@@ -342,10 +330,6 @@ test_split_at_begin(void) ...@@ -342,10 +330,6 @@ test_split_at_begin(void)
sn.totalchildkeylens = 0; sn.totalchildkeylens = 0;
long totalbytes = 0; long totalbytes = 0;
for (int bn = 0; bn < sn.n_children; ++bn) { for (int bn = 0; bn < sn.n_children; ++bn) {
BP_SUBTREE_EST(&sn,bn).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,bn) = PT_AVAIL; BP_STATE(&sn,bn) = PT_AVAIL;
set_BLB(&sn, bn, toku_create_empty_bn()); set_BLB(&sn, bn, toku_create_empty_bn());
BLB_NBYTESINBUF(&sn,bn) = 0; BLB_NBYTESINBUF(&sn,bn) = 0;
...@@ -440,10 +424,6 @@ test_split_at_end(void) ...@@ -440,10 +424,6 @@ test_split_at_end(void)
sn.totalchildkeylens = 0; sn.totalchildkeylens = 0;
long totalbytes = 0; long totalbytes = 0;
for (int bn = 0; bn < sn.n_children; ++bn) { for (int bn = 0; bn < sn.n_children; ++bn) {
BP_SUBTREE_EST(&sn,bn).ndata = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).nkeys = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).dsize = random() + (((long long)random())<<32);
BP_SUBTREE_EST(&sn,bn).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,bn) = PT_AVAIL; BP_STATE(&sn,bn) = PT_AVAIL;
set_BLB(&sn, bn, toku_create_empty_bn()); set_BLB(&sn, bn, toku_create_empty_bn());
BLB_NBYTESINBUF(&sn,bn) = 0; BLB_NBYTESINBUF(&sn,bn) = 0;
......
...@@ -41,7 +41,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -41,7 +41,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// Create bad tree (don't do following): // Create bad tree (don't do following):
// leafnode->max_msn_applied_to_node = msn; // leafnode->max_msn_applied_to_node = msn;
......
...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode,0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -30,7 +30,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -29,7 +29,7 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(BLB(leafnode, 0), &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL, NULL, NULL); brt_leaf_apply_cmd_once(BLB(leafnode, 0), &cmd, idx, NULL, NULL, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -20,6 +20,7 @@ static void test (void) { ...@@ -20,6 +20,7 @@ static void test (void) {
r=toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r=toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r=db_env_create(&env, 0); CKERR(r); r=db_env_create(&env, 0); CKERR(r);
env->set_errfile(env, stderr); env->set_errfile(env, stderr);
r = env->set_redzone(env, 0); CKERR(r);
r=env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r=env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
r=db_create(&db, env, 0); CKERR(r); r=db_create(&db, env, 0); CKERR(r);
r=db->set_pagesize(db, 4096); r=db->set_pagesize(db, 4096);
...@@ -57,9 +58,13 @@ static void test (void) { ...@@ -57,9 +58,13 @@ static void test (void) {
r = db->key_range64(db, txn, dbt_init(&k, key, 1+strlen(key)), &less, &equal, &greater, &is_exact); r = db->key_range64(db, txn, dbt_init(&k, key, 1+strlen(key)), &less, &equal, &greater, &is_exact);
assert(r == 0); assert(r == 0);
//printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater); //printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater);
assert(less==(u_int64_t)i); if (i<30) {
assert(less==(u_int64_t)i);
}
if (i+30 > limit) {
assert(greater==limit-i-1);
}
assert(equal==1); assert(equal==1);
assert(less+equal+greater == limit);
} }
for (i=0; i<1+limit; i++) { for (i=0; i<1+limit; i++) {
char key[100]; char key[100];
...@@ -70,9 +75,13 @@ static void test (void) { ...@@ -70,9 +75,13 @@ static void test (void) {
r = db->key_range64(db, txn, dbt_init(&k, key, 1+strlen(key)), &less, &equal, &greater, &is_exact); r = db->key_range64(db, txn, dbt_init(&k, key, 1+strlen(key)), &less, &equal, &greater, &is_exact);
assert(r == 0); assert(r == 0);
//printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater); //printf("key %llu/%llu %llu %llu %llu\n", (unsigned long long)2*i, (unsigned long long)2*limit, (unsigned long long)less, (unsigned long long)equal, (unsigned long long)greater);
assert(less==(u_int64_t)i);
assert(equal==0); assert(equal==0);
assert(less+equal+greater == limit); if (i<30) {
assert(less==(u_int64_t)i);
}
if (i+30 > limit) {
assert(greater==limit-i);
}
} }
r=txn->commit(txn, 0); assert(r==0); r=txn->commit(txn, 0); assert(r==0);
r = db->close(db, 0); assert(r==0); r = db->close(db, 0); assert(r==0);
......
...@@ -21,6 +21,7 @@ test_stat64 (unsigned int N) { ...@@ -21,6 +21,7 @@ test_stat64 (unsigned int N) {
DB *db; DB *db;
DB_TXN *txn; DB_TXN *txn;
r = db_env_create(&env, 0); CKERR(r); r = db_env_create(&env, 0); CKERR(r);
r = env->set_redzone(env, 0); CKERR(r);
r = env->set_cachesize(env, 0, 20*1000000, 1); r = env->set_cachesize(env, 0, 20*1000000, 1);
r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment