Commit 527eeb1f authored by Zardosht Kasheff, committed by Yoni Fogel

[t:3641], merge refactoring of BRTNODE to main from tokudb.3627

git-svn-id: file:///svn/toku/tokudb@32481 c7de825b-a66e-492c-adef-691d508d4ae1
parent 4a13ed4c
...@@ -82,9 +82,6 @@ add_estimates (struct subtree_estimates *a, struct subtree_estimates *b) { ...@@ -82,9 +82,6 @@ add_estimates (struct subtree_estimates *a, struct subtree_estimates *b) {
struct brtnode_nonleaf_childinfo { struct brtnode_nonleaf_childinfo {
BLOCKNUM blocknum;
BOOL have_fullhash; // do we have the full hash?
u_int32_t fullhash; // the fullhash of the child
FIFO buffer; FIFO buffer;
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */ unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
}; };
...@@ -97,9 +94,65 @@ struct brtnode_leaf_basement_node { ...@@ -97,9 +94,65 @@ struct brtnode_leaf_basement_node {
unsigned int seqinsert; /* number of sequential inserts to this leaf */ unsigned int seqinsert; /* number of sequential inserts to this leaf */
}; };
/* Internal nodes. */ #define PT_INVALID 0
#define PT_ON_DISK 1
#define PT_COMPRESSED 2
#define PT_AVAIL 3
// a brtnode partition represents one child of a brtnode:
// for internal nodes, the ith partition corresponds to the ith message buffer
// for leaf nodes, the ith partition corresponds to the ith basement node
struct brtnode_partition {
BLOCKNUM blocknum; // block number of the child (only serialized for internal nodes)
BOOL have_fullhash; // do we have the full hash?
u_int32_t fullhash; // the fullhash of the child
struct subtree_estimates subtree_estimates; //estimates for a child, for leaf nodes, are estimates of basement nodes
//
// at any time, the partitions may be in one of the following four states (stored in state):
// PT_INVALID - means that the partition was just initialized
// PT_ON_DISK - means that the partition is not in memory and needs to be read from disk. To use, must read off disk and decompress
// PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
// PT_AVAIL - means the partition is decompressed and in memory
//
u_int8_t state;
//
// stores the offset to the end of the partition on disk from the brtnode, needed to read a partition off of disk
// (so partition i starts at the offset stored for partition i-1, or at 0 when i == 0)
// the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
//
u_int32_t offset;
//
// pointer to the partition. Depending on the state, they may be different things
// if state == PT_INVALID, then the node was just initialized and ptr == NULL
// if state == PT_ON_DISK, then ptr == NULL
// if state == PT_COMPRESSED, then ptr points to a struct sub_block*
// if state == PT_AVAIL, then ptr points to:
// a struct brtnode_nonleaf_childinfo for internal nodes,
// a struct brtnode_leaf_basement_node for leaf nodes
//
void* ptr;
};
// brtnode partition macros
// accessors for the fields stored directly in the ith partition (node->bp[i]);
// each one expands to an lvalue, so it can be read or assigned
#define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum)
#define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash)
#define BP_FULLHASH(node,i) ((node)->bp[i].fullhash)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_OFFSET(node,i) ((node)->bp[i].offset)
#define BP_SUBTREE_EST(node,i) ((node)->bp[i].subtree_estimates)
// internal node macros
// these cast bp[i].ptr to a struct brtnode_nonleaf_childinfo* and dereference it
// NOTE(review): ptr is documented as pointing to a childinfo only when the state is PT_AVAIL; the macros do not check this - confirm callers do
#define BNC_BUFFER(node,i) (((struct brtnode_nonleaf_childinfo*)((node)->bp[i].ptr))->buffer)
#define BNC_NBYTESINBUF(node,i) (((struct brtnode_nonleaf_childinfo*)((node)->bp[i].ptr))->n_bytes_in_buffer)
// leaf node macros
// these cast bp[i].ptr to a struct brtnode_leaf_basement_node* and dereference it
// NOTE(review): same PT_AVAIL assumption as the internal node macros - confirm callers check the state first
#define BLB_OPTIMIZEDFORUPGRADE(node,i) (((struct brtnode_leaf_basement_node*)((node)->bp[i].ptr))->optimized_for_upgrade)
#define BLB_SOFTCOPYISUPTODATE(node,i) (((struct brtnode_leaf_basement_node*)((node)->bp[i].ptr))->soft_copy_is_up_to_date)
#define BLB_BUFFER(node,i) (((struct brtnode_leaf_basement_node*)((node)->bp[i].ptr))->buffer)
#define BLB_NBYTESINBUF(node,i) (((struct brtnode_leaf_basement_node*)((node)->bp[i].ptr))->n_bytes_in_buffer)
#define BLB_SEQINSERT(node,i) (((struct brtnode_leaf_basement_node*)((node)->bp[i].ptr))->seqinsert)
struct brtnode { struct brtnode {
MSN max_msn_applied_to_node; // max msn that has been applied to this node (for root node, this is max msn for the tree) MSN max_msn_applied_to_node_in_memory; // max msn that has been applied to this node (for root node, this is max msn for the tree)
MSN max_msn_applied_to_node_on_disk; // same as above, but for data on disk, only meaningful if node is clean
unsigned int nodesize; unsigned int nodesize;
unsigned int flags; unsigned int flags;
BLOCKNUM thisnodename; // Which block number is this node? BLOCKNUM thisnodename; // Which block number is this node?
...@@ -115,26 +168,11 @@ struct brtnode { ...@@ -115,26 +168,11 @@ struct brtnode {
unsigned int totalchildkeylens; unsigned int totalchildkeylens;
struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Child 1's keys are > childkeys[0]. */ Child 1's keys are > childkeys[0]. */
// array of brtnode partitions
struct subtree_estimates *subtree_estimates; //array of estimates for each child, for leaf nodes, are estimates // each one is associated with a child
// of basement nodes // for internal nodes, the ith partition corresponds to the ith message buffer
union node { // for leaf nodes, the ith partition corresponds to the ith basement node
struct nonleaf { struct brtnode_partition *bp;
unsigned int n_bytes_in_buffers;
struct brtnode_nonleaf_childinfo *childinfos; /* One extra so we can grow */
#define BNC_BLOCKNUM(node,i) ((node)->u.n.childinfos[i].blocknum)
#define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer)
#define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer)
#define BNC_HAVE_FULLHASH(node,i) ((node)->u.n.childinfos[i].have_fullhash)
#define BNC_FULLHASH(node,i) ((node)->u.n.childinfos[i].fullhash)
} n;
struct leaf {
struct brtnode_leaf_basement_node *bn; // individual basement nodes of a leaf
} l;
} u;
}; };
/* pivot flags (must fit in 8 bits) */ /* pivot flags (must fit in 8 bits) */
...@@ -248,9 +286,10 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h, int6 ...@@ -248,9 +286,10 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h, int6
int toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_header **brth); int toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_header **brth);
int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset); int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset);
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc); void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);
void toku_setup_empty_leafnode( BRTNODE n, u_int32_t num_bn); void toku_setup_empty_bn(BASEMENTNODE bn);
void toku_destroy_brtnode_internals(BRTNODE node); void toku_destroy_brtnode_internals(BRTNODE node);
void toku_brtnode_free (BRTNODE *node); void toku_brtnode_free (BRTNODE *node);
void toku_assert_entire_node_in_memory(BRTNODE node);
// append a child node to a parent node // append a child node to a parent node
void toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize); void toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize);
......
...@@ -224,15 +224,16 @@ static u_int32_t ...@@ -224,15 +224,16 @@ static u_int32_t
serialize_brtnode_partition_size (BRTNODE node, int i) serialize_brtnode_partition_size (BRTNODE node, int i)
{ {
u_int32_t result = 0; u_int32_t result = 0;
assert(node->bp[i].state == PT_AVAIL);
result++; // Byte that states what the partition is result++; // Byte that states what the partition is
if (node->height > 0) { if (node->height > 0) {
result += 4; // size of bytes in buffer table result += 4; // size of bytes in buffer table
result += node->u.n.childinfos[i].n_bytes_in_buffer; result += BNC_NBYTESINBUF(node, i);
} }
else { else {
result += 4; // n_entries in buffer table result += 4; // n_entries in buffer table
result += 4; // optimized_for_upgrade, see if we can get rid of this result += 4; // optimized_for_upgrade, see if we can get rid of this
result += node->u.l.bn[i].n_bytes_in_buffer; result += BLB_NBYTESINBUF(node, i);
} }
result += 4; // checksum result += 4; // checksum
return result; return result;
...@@ -276,14 +277,14 @@ serialize_brtnode_partition(BRTNODE node, int i, struct sub_block *sb) { ...@@ -276,14 +277,14 @@ serialize_brtnode_partition(BRTNODE node, int i, struct sub_block *sb) {
else { else {
unsigned char ch = BRTNODE_PARTITION_OMT_LEAVES; unsigned char ch = BRTNODE_PARTITION_OMT_LEAVES;
wbuf_nocrc_char(&wb, ch); wbuf_nocrc_char(&wb, ch);
wbuf_nocrc_int(&wb, node->u.l.bn[i].optimized_for_upgrade); wbuf_nocrc_int(&wb, BLB_OPTIMIZEDFORUPGRADE(node, i));
wbuf_nocrc_uint(&wb, toku_omt_size(node->u.l.bn[i].buffer)); wbuf_nocrc_uint(&wb, toku_omt_size(BLB_BUFFER(node, i)));
// //
// iterate over leafentries and place them into the buffer // iterate over leafentries and place them into the buffer
// //
toku_omt_iterate(node->u.l.bn[i].buffer, wbufwriteleafentry, &wb); toku_omt_iterate(BLB_BUFFER(node, i), wbufwriteleafentry, &wb);
} }
u_int32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb)); u_int32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
wbuf_nocrc_int(&wb, end_to_end_checksum); wbuf_nocrc_int(&wb, end_to_end_checksum);
...@@ -352,7 +353,7 @@ static u_int32_t ...@@ -352,7 +353,7 @@ static u_int32_t
serialize_brtnode_info_size(BRTNODE node) serialize_brtnode_info_size(BRTNODE node)
{ {
u_int32_t retval = 0; u_int32_t retval = 0;
retval += 8; // max_msn_applied_to_node retval += 8; // max_msn_applied_to_node_on_disk
retval += 4; // nodesize retval += 4; // nodesize
retval += 4; // flags retval += 4; // flags
retval += 4; // height; retval += 4; // height;
...@@ -383,17 +384,17 @@ serialize_brtnode_info( ...@@ -383,17 +384,17 @@ serialize_brtnode_info(
struct wbuf wb; struct wbuf wb;
wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size); wbuf_init(&wb, sb->uncompressed_ptr, sb->uncompressed_size);
wbuf_MSN(&wb, node->max_msn_applied_to_node); wbuf_MSN(&wb, node->max_msn_applied_to_node_in_memory);
wbuf_nocrc_uint(&wb, node->nodesize); wbuf_nocrc_uint(&wb, node->nodesize);
wbuf_nocrc_uint(&wb, node->flags); wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height); wbuf_nocrc_int (&wb, node->height);
wbuf_nocrc_int (&wb, node->n_children); wbuf_nocrc_int (&wb, node->n_children);
// subtree estimates of each child // subtree estimates of each child
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
wbuf_nocrc_ulonglong(&wb, node->subtree_estimates[i].nkeys); wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).nkeys);
wbuf_nocrc_ulonglong(&wb, node->subtree_estimates[i].ndata); wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).ndata);
wbuf_nocrc_ulonglong(&wb, node->subtree_estimates[i].dsize); wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).dsize);
wbuf_nocrc_char (&wb, (char)node->subtree_estimates[i].exact); wbuf_nocrc_char (&wb, (char)BP_SUBTREE_EST(node,i).exact);
} }
// pivot information // pivot information
for (int i = 0; i < node->n_children-1; i++) { for (int i = 0; i < node->n_children-1; i++) {
...@@ -402,7 +403,7 @@ serialize_brtnode_info( ...@@ -402,7 +403,7 @@ serialize_brtnode_info(
// child blocks, only for internal nodes // child blocks, only for internal nodes
if (node->height > 0) { if (node->height > 0) {
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
wbuf_nocrc_BLOCKNUM(&wb, BNC_BLOCKNUM(node,i)); wbuf_nocrc_BLOCKNUM(&wb, BP_BLOCKNUM(node,i));
} }
} }
...@@ -411,6 +412,10 @@ serialize_brtnode_info( ...@@ -411,6 +412,10 @@ serialize_brtnode_info(
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
// TODO: (Zardosht) figure out if we want to put some padding to align partitions // TODO: (Zardosht) figure out if we want to put some padding to align partitions
curr_offset += sb_parts[i].compressed_size + 4; // data and checksum curr_offset += sb_parts[i].compressed_size + 4; // data and checksum
//
// update the offset in the node
//
BP_OFFSET(node,i) = curr_offset;
wbuf_nocrc_int(&wb, curr_offset); wbuf_nocrc_int(&wb, curr_offset);
} }
...@@ -420,10 +425,16 @@ serialize_brtnode_info( ...@@ -420,10 +425,16 @@ serialize_brtnode_info(
invariant(sb->uncompressed_size==wb.ndone); invariant(sb->uncompressed_size==wb.ndone);
} }
// This is the size of the uncompressed data, not including the compression headers // This is the size of the uncompressed data, not including the compression headers
unsigned int unsigned int
toku_serialize_brtnode_size (BRTNODE node) { toku_serialize_brtnode_size (BRTNODE node) {
unsigned int result = 0; unsigned int result = 0;
//
// As of now, this seems to be called if and only if the entire node is supposed
// to be in memory, so we will assert it.
//
toku_assert_entire_node_in_memory(node);
result += serialize_node_header_size(node); result += serialize_node_header_size(node);
result += serialize_brtnode_info_size(node); result += serialize_brtnode_info_size(node);
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
...@@ -462,16 +473,16 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) { ...@@ -462,16 +473,16 @@ sum_item (OMTVALUE lev, u_int32_t UU(idx), void *vsi) {
return 0; return 0;
} }
// // There must still be at least one child // There must still be at least one child
static void static void
rebalance_brtnode_leaf(BRTNODE node) rebalance_brtnode_leaf(BRTNODE node)
{ {
assert(node->height ==0); assert(node->height == 0);
// first create an array of OMTVALUE's that store all the data // first create an array of OMTVALUE's that store all the data
u_int32_t num_le = 0; u_int32_t num_le = 0;
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
lazy_assert(node->u.l.bn[i].buffer); lazy_assert(BLB_BUFFER(node, i));
num_le += toku_omt_size(node->u.l.bn[i].buffer); num_le += toku_omt_size(BLB_BUFFER(node, i));
} }
OMTVALUE *XMALLOC_N(num_le, array); OMTVALUE *XMALLOC_N(num_le, array);
// creating array that will store id's of new pivots. // creating array that will store id's of new pivots.
...@@ -480,7 +491,7 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -480,7 +491,7 @@ rebalance_brtnode_leaf(BRTNODE node)
// now fill in the values into array // now fill in the values into array
u_int32_t curr_le = 0; u_int32_t curr_le = 0;
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
OMT curr_omt = node->u.l.bn[i].buffer; OMT curr_omt = BLB_BUFFER(node, i);
struct array_info ai; struct array_info ai;
ai.offset = curr_le; ai.offset = curr_le;
ai.array = array; ai.array = array;
...@@ -511,8 +522,8 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -511,8 +522,8 @@ rebalance_brtnode_leaf(BRTNODE node)
// Need to figure out how to properly deal with the values seqinsert // Need to figure out how to properly deal with the values seqinsert
// and optimized_for_upgrade. I am not happy with how this is being // and optimized_for_upgrade. I am not happy with how this is being
// handled with basement nodes // handled with basement nodes
u_int32_t tmp_optimized_for_upgrade = node->u.l.bn[node->n_children-1].optimized_for_upgrade; u_int32_t tmp_optimized_for_upgrade = BLB_OPTIMIZEDFORUPGRADE(node, node->n_children-1);
u_int32_t tmp_seqinsert = node->u.l.bn[node->n_children-1].seqinsert; u_int32_t tmp_seqinsert = BLB_SEQINSERT(node, node->n_children-1);
// Now destroy the old stuff; // Now destroy the old stuff;
toku_destroy_brtnode_internals(node); toku_destroy_brtnode_internals(node);
...@@ -521,14 +532,16 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -521,14 +532,16 @@ rebalance_brtnode_leaf(BRTNODE node)
int num_children = curr_pivot + 1; int num_children = curr_pivot + 1;
assert(num_children > 0); assert(num_children > 0);
node->totalchildkeylens = 0; node->totalchildkeylens = 0;
XMALLOC_N(num_children-1, node->childkeys); XMALLOC_N(num_children-1, node->childkeys);
assert(node->childkeys);
XMALLOC_N(num_children, node->subtree_estimates);
assert(node->subtree_estimates);
node->n_children = num_children; node->n_children = num_children;
XMALLOC_N(num_children, node->u.l.bn); XMALLOC_N(num_children, node->bp);
assert(node->u.l.bn); for (int i = 0; i < num_children; i++) {
toku_setup_empty_leafnode(node, num_children); node->bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
BASEMENTNODE bn = (BASEMENTNODE)node->bp[i].ptr;
memset(bn, 0, sizeof(struct brtnode_leaf_basement_node));
toku_setup_empty_bn(bn);
}
// now we start to fill in the data // now we start to fill in the data
...@@ -547,8 +560,8 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -547,8 +560,8 @@ rebalance_brtnode_leaf(BRTNODE node)
// now the basement nodes // now the basement nodes
for (int i = 0; i < num_children; i++) { for (int i = 0; i < num_children; i++) {
// put back optimized_for_upgrade and seqinsert // put back optimized_for_upgrade and seqinsert
node->u.l.bn[i].seqinsert = tmp_seqinsert; BLB_SEQINSERT(node, i) = tmp_seqinsert;
node->u.l.bn[i].optimized_for_upgrade = tmp_optimized_for_upgrade; BLB_OPTIMIZEDFORUPGRADE(node, i) = tmp_optimized_for_upgrade;
// create start (inclusive) and end (exclusive) boundaries for data of basement node // create start (inclusive) and end (exclusive) boundaries for data of basement node
u_int32_t curr_start = (i==0) ? 0 : new_pivots[i-1]+1; u_int32_t curr_start = (i==0) ? 0 : new_pivots[i-1]+1;
...@@ -558,24 +571,23 @@ rebalance_brtnode_leaf(BRTNODE node) ...@@ -558,24 +571,23 @@ rebalance_brtnode_leaf(BRTNODE node)
OMTVALUE *XMALLOC_N(num_in_bn, bn_array); OMTVALUE *XMALLOC_N(num_in_bn, bn_array);
assert(bn_array); assert(bn_array);
memcpy(bn_array, &array[curr_start], num_in_bn*(sizeof(array[0]))); memcpy(bn_array, &array[curr_start], num_in_bn*(sizeof(array[0])));
toku_omt_destroy(&node->u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(node, i));
int r = toku_omt_create_steal_sorted_array( int r = toku_omt_create_steal_sorted_array(
&node->u.l.bn[i].buffer, &BLB_BUFFER(node, i),
&bn_array, &bn_array,
num_in_bn, num_in_bn,
num_in_bn num_in_bn
); );
lazy_assert_zero(r); lazy_assert_zero(r);
struct sum_info sum_info = {0,0,0}; struct sum_info sum_info = {0,0,0};
toku_omt_iterate(node->u.l.bn[i].buffer, sum_item, &sum_info); toku_omt_iterate(BLB_BUFFER(node, i), sum_item, &sum_info);
node->u.l.bn[i].n_bytes_in_buffer = sum_info.dsum; BLB_NBYTESINBUF(node, i) = sum_info.dsum;
BP_STATE(node,i) = PT_AVAIL;
} }
// now the subtree estimates // now the subtree estimates
toku_brt_leaf_reset_calc_leaf_stats(node); toku_brt_leaf_reset_calc_leaf_stats(node);
// TODO: (Zardosht) add some verification
toku_free(array); toku_free(array);
toku_free(new_pivots); toku_free(new_pivots);
} }
...@@ -688,6 +700,7 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h ...@@ -688,6 +700,7 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
size_t n_to_write; size_t n_to_write;
char *compressed_buf = NULL; char *compressed_buf = NULL;
{ {
toku_assert_entire_node_in_memory(node);
int r = toku_serialize_brtnode_to_memory (node, &n_to_write, &compressed_buf); int r = toku_serialize_brtnode_to_memory (node, &n_to_write, &compressed_buf);
if (r!=0) return r; if (r!=0) return r;
} }
...@@ -712,6 +725,7 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h ...@@ -712,6 +725,7 @@ toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct brt_h
//printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size); //printf("%s:%d wrote %d bytes for %lld size=%lld\n", __FILE__, __LINE__, w.ndone, off, size);
toku_free(compressed_buf); toku_free(compressed_buf);
node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_in_memory;
return 0; return 0;
} }
...@@ -776,24 +790,17 @@ dump_bad_block(unsigned char *vp, u_int64_t size) { ...@@ -776,24 +790,17 @@ dump_bad_block(unsigned char *vp, u_int64_t size) {
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////
void toku_setup_empty_leafnode( BRTNODE n, u_int32_t num_bn) { void toku_setup_empty_bn(BASEMENTNODE bn) {
u_int32_t i; bn->soft_copy_is_up_to_date = TRUE;
for (i = 0; i < num_bn; i++) { int r;
BASEMENTNODE bn = &n->u.l.bn[i]; r = toku_omt_create(&bn->buffer);
bn->soft_copy_is_up_to_date = TRUE; assert_zero(r);
int r; bn->n_bytes_in_buffer = 0;
r = toku_omt_create(&bn->buffer); bn->seqinsert = 0;
assert_zero(r); bn->optimized_for_upgrade = 0;
bn->n_bytes_in_buffer = 0;
bn->seqinsert = 0;
bn->optimized_for_upgrade = 0;
n->subtree_estimates[i] = zero_estimates;
}
} }
//
static int static int
read_block_from_fd_into_rbuf( read_block_from_fd_into_rbuf(
int fd, int fd,
...@@ -860,7 +867,7 @@ verify_brtnode_sub_block (struct sub_block *sb) ...@@ -860,7 +867,7 @@ verify_brtnode_sub_block (struct sub_block *sb)
// This function deserializes the data stored by serialize_brtnode_info // This function deserializes the data stored by serialize_brtnode_info
static void static void
deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_offsets) deserialize_brtnode_info(struct sub_block *sb, BRTNODE node)
{ {
// sb_node_info->uncompressed_ptr stores the serialized node information // sb_node_info->uncompressed_ptr stores the serialized node information
// this function puts that information into node // this function puts that information into node
...@@ -873,7 +880,8 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off ...@@ -873,7 +880,8 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&rb, sb->uncompressed_ptr, data_size); rbuf_init(&rb, sb->uncompressed_ptr, data_size);
node->max_msn_applied_to_node = rbuf_msn(&rb); node->max_msn_applied_to_node_on_disk = rbuf_msn(&rb);
node->max_msn_applied_to_node_in_memory = node->max_msn_applied_to_node_on_disk;
node->nodesize = rbuf_int(&rb); node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb); node->flags = rbuf_int(&rb);
node->height = rbuf_int(&rb); node->height = rbuf_int(&rb);
...@@ -882,20 +890,26 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off ...@@ -882,20 +890,26 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off
// now create the basement nodes or childinfos, depending on whether this is a // now create the basement nodes or childinfos, depending on whether this is a
// leaf node or internal node // leaf node or internal node
// now the subtree_estimates // now the subtree_estimates
XMALLOC_N(node->n_children, node->subtree_estimates); XMALLOC_N(node->n_children, node->bp);
assert(node->subtree_estimates); //
if (node->height>0) { // setup memory needed for the node
node->u.n.n_bytes_in_buffers = 0; //
XMALLOC_N(node->n_children, node->u.n.childinfos); for (int i = 0; i < node->n_children; i++) {
} if (node->height == 0) {
else { node->bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
XMALLOC_N(node->n_children, node->u.l.bn); BASEMENTNODE bn = (BASEMENTNODE)node->bp[i].ptr;
assert(node->u.l.bn); toku_setup_empty_bn(bn);
toku_setup_empty_leafnode(node, node->n_children); }
else {
node->bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_nonleaf_childinfo));
int r = toku_fifo_create(&BNC_BUFFER(node,i));
assert(r == 0);
}
BP_STATE(node,i) = PT_AVAIL;
} }
for (int i=0; i < node->n_children; i++) { for (int i=0; i < node->n_children; i++) {
SUBTREE_EST curr_se = &node->subtree_estimates[i]; SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
curr_se->nkeys = rbuf_ulonglong(&rb); curr_se->nkeys = rbuf_ulonglong(&rb);
curr_se->ndata = rbuf_ulonglong(&rb); curr_se->ndata = rbuf_ulonglong(&rb);
curr_se->dsize = rbuf_ulonglong(&rb); curr_se->dsize = rbuf_ulonglong(&rb);
...@@ -924,20 +938,16 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off ...@@ -924,20 +938,16 @@ deserialize_brtnode_info(struct sub_block *sb, BRTNODE node, u_int32_t** out_off
// of childinfo // of childinfo
if (node->height > 0) { if (node->height > 0) {
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
BNC_BLOCKNUM(node,i) = rbuf_blocknum(&rb); BP_BLOCKNUM(node,i) = rbuf_blocknum(&rb);
BNC_HAVE_FULLHASH(node, i) = FALSE; BP_HAVE_FULLHASH(node, i) = FALSE;
BNC_NBYTESINBUF(node,i) = 0; BP_FULLHASH(node,i) = 0;
} }
} }
// read the offsets // read the offsets
u_int32_t* offsets = NULL;
offsets = toku_xmalloc(sizeof(u_int32_t *)*node->n_children);
assert(offsets);
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
offsets[i] = rbuf_int(&rb); BP_OFFSET(node,i) = rbuf_int(&rb);
} }
*out_offsets = offsets;
// make sure that all the data was read // make sure that all the data was read
if (data_size != rb.ndone) { if (data_size != rb.ndone) {
...@@ -964,16 +974,14 @@ deserialize_brtnode_partition( ...@@ -964,16 +974,14 @@ deserialize_brtnode_partition(
if (node->height > 0) { if (node->height > 0) {
unsigned char ch = rbuf_char(&rb); unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_FIFO_MSG); assert(ch == BRTNODE_PARTITION_FIFO_MSG);
int r = toku_fifo_create(&BNC_BUFFER(node,index));
assert(r == 0);
deserialize_child_buffer(node, index, &rb); deserialize_child_buffer(node, index, &rb);
} }
else { else {
unsigned char ch = rbuf_char(&rb); unsigned char ch = rbuf_char(&rb);
assert(ch == BRTNODE_PARTITION_OMT_LEAVES); assert(ch == BRTNODE_PARTITION_OMT_LEAVES);
node->u.l.bn[index].optimized_for_upgrade = rbuf_int(&rb); BLB_OPTIMIZEDFORUPGRADE(node, index) = rbuf_int(&rb);
node->u.l.bn[index].soft_copy_is_up_to_date = FALSE; BLB_SOFTCOPYISUPTODATE(node, index) = FALSE;
node->u.l.bn[index].seqinsert = 0; BLB_SEQINSERT(node, index) = 0;
u_int32_t num_entries = rbuf_int(&rb); u_int32_t num_entries = rbuf_int(&rb);
OMTVALUE *XMALLOC_N(num_entries, array); OMTVALUE *XMALLOC_N(num_entries, array);
start_of_data = rb.ndone; start_of_data = rb.ndone;
...@@ -987,10 +995,10 @@ deserialize_brtnode_partition( ...@@ -987,10 +995,10 @@ deserialize_brtnode_partition(
memcpy(array[i], le, disksize); memcpy(array[i], le, disksize);
} }
u_int32_t end_of_data = rb.ndone; u_int32_t end_of_data = rb.ndone;
node->u.l.bn[index].n_bytes_in_buffer += end_of_data-start_of_data + num_entries*OMT_ITEM_OVERHEAD; BLB_NBYTESINBUF(node, index) += end_of_data-start_of_data + num_entries*OMT_ITEM_OVERHEAD;
// destroy old buffer that was created by toku_setup_empty_leafnode, so we can create a new one // destroy old buffer that was created by toku_setup_empty_bn, so we can create a new one
toku_omt_destroy(&node->u.l.bn[index].buffer); toku_omt_destroy(&BLB_BUFFER(node, index));
int r = toku_omt_create_steal_sorted_array(&node->u.l.bn[index].buffer, &array, num_entries, num_entries); int r = toku_omt_create_steal_sorted_array(&BLB_BUFFER(node, index), &array, num_entries, num_entries);
assert(r == 0); assert(r == 0);
} }
assert(rb.ndone == rb.size); assert(rb.ndone == rb.size);
...@@ -1009,7 +1017,6 @@ deserialize_brtnode_from_rbuf( ...@@ -1009,7 +1017,6 @@ deserialize_brtnode_from_rbuf(
) )
{ {
int r = 0; int r = 0;
u_int32_t* offsets = NULL;
BRTNODE node = NULL; BRTNODE node = NULL;
u_int32_t stored_checksum, checksum; u_int32_t stored_checksum, checksum;
struct sub_block sb_node_info; struct sub_block sb_node_info;
...@@ -1045,15 +1052,15 @@ deserialize_brtnode_from_rbuf( ...@@ -1045,15 +1052,15 @@ deserialize_brtnode_from_rbuf(
sub_block_init(&sb_node_info); sub_block_init(&sb_node_info);
read_compressed_sub_block(rb, &sb_node_info); read_compressed_sub_block(rb, &sb_node_info);
// at this point, sb->uncompressed_ptr stores the serialized node info // at this point, sb->uncompressed_ptr stores the serialized node info
deserialize_brtnode_info(&sb_node_info, node, &offsets); deserialize_brtnode_info(&sb_node_info, node);
toku_free(sb_node_info.uncompressed_ptr); toku_free(sb_node_info.uncompressed_ptr);
// now that the node info has been deserialized, we can proceed to deserialize // now that the node info has been deserialized, we can proceed to deserialize
// the individual sub blocks // the individual sub blocks
// TODO: (Zardosht) Cilkify this // TODO: (Zardosht) Cilkify this
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
u_int32_t curr_offset = (i==0) ? 0 : offsets[i-1]; u_int32_t curr_offset = (i==0) ? 0 : BP_OFFSET(node,i-1);
u_int32_t curr_size = (i==0) ? offsets[i] : (offsets[i] - offsets[i-1]); u_int32_t curr_size = (i==0) ? BP_OFFSET(node,i) : (BP_OFFSET(node,i) - BP_OFFSET(node,i-1));
// the compressed, serialized partitions start at where rb is currently pointing, // the compressed, serialized partitions start at where rb is currently pointing,
// which would be rb->buf + rb->ndone // which would be rb->buf + rb->ndone
...@@ -1068,19 +1075,12 @@ deserialize_brtnode_from_rbuf( ...@@ -1068,19 +1075,12 @@ deserialize_brtnode_from_rbuf(
deserialize_brtnode_partition(&curr_sb, node, i); deserialize_brtnode_partition(&curr_sb, node, i);
toku_free(curr_sb.uncompressed_ptr); toku_free(curr_sb.uncompressed_ptr);
} }
if (node->height > 0) {
node->u.n.n_bytes_in_buffers = 0;
for (int i = 0; i < node->n_children; i++) {
node->u.n.n_bytes_in_buffers += node->u.n.childinfos[i].n_bytes_in_buffer;
}
}
*brtnode = node; *brtnode = node;
r = 0; r = 0;
cleanup: cleanup:
if (r != 0) { if (r != 0) {
if (node) toku_free(node); if (node) toku_free(node);
} }
if(offsets) { toku_free(offsets); }
return r; return r;
} }
...@@ -1172,21 +1172,16 @@ toku_verify_or_set_counts (BRTNODE node) { ...@@ -1172,21 +1172,16 @@ toku_verify_or_set_counts (BRTNODE node) {
node = node; node = node;
if (node->height==0) { if (node->height==0) {
for (int i=0; i<node->n_children; i++) { for (int i=0; i<node->n_children; i++) {
lazy_assert(node->u.l.bn[i].buffer); lazy_assert(BLB_BUFFER(node, i));
struct sum_info sum_info = {0,0,0}; struct sum_info sum_info = {0,0,0};
toku_omt_iterate(node->u.l.bn[i].buffer, sum_item, &sum_info); toku_omt_iterate(BLB_BUFFER(node, i), sum_item, &sum_info);
lazy_assert(sum_info.count==toku_omt_size(node->u.l.bn[i].buffer)); lazy_assert(sum_info.count==toku_omt_size(BLB_BUFFER(node, i)));
lazy_assert(sum_info.dsum==node->u.l.bn[i].n_bytes_in_buffer); lazy_assert(sum_info.dsum==BLB_NBYTESINBUF(node, i));
} }
} }
else { else {
unsigned int sum = 0; // nothing to do because we no longer store n_bytes_in_buffers for
for (int i=0; i<node->n_children; i++) { // the whole node
sum += BNC_NBYTESINBUF(node,i);
}
// We don't really care if the later buffers have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized.
// But for now the code always initializes the later tables so they are 0.
lazy_assert(sum==node->u.n.n_bytes_in_buffers);
} }
} }
......
...@@ -35,6 +35,7 @@ int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum) { ...@@ -35,6 +35,7 @@ int toku_testsetup_leaf(BRT brt, BLOCKNUM *blocknum) {
int r = toku_read_brt_header_and_store_in_cachefile(brt->cf, MAX_LSN, &brt->h, &ignore_if_was_already_open); int r = toku_read_brt_header_and_store_in_cachefile(brt->cf, MAX_LSN, &brt->h, &ignore_if_was_already_open);
if (r!=0) return r; if (r!=0) return r;
toku_create_new_brtnode(brt, &node, 0, 1); toku_create_new_brtnode(brt, &node, 0, 1);
BP_STATE(node,0) = PT_AVAIL;
*blocknum = node->thisnodename; *blocknum = node->thisnodename;
toku_unpin_brtnode(brt, node); toku_unpin_brtnode(brt, node);
...@@ -51,7 +52,8 @@ int toku_testsetup_nonleaf (BRT brt, int height, BLOCKNUM *blocknum, int n_child ...@@ -51,7 +52,8 @@ int toku_testsetup_nonleaf (BRT brt, int height, BLOCKNUM *blocknum, int n_child
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
int i; int i;
for (i=0; i<n_children; i++) { for (i=0; i<n_children; i++) {
node->u.n.childinfos[i].blocknum = children[i]; BP_BLOCKNUM(node, i) = children[i];
BP_STATE(node,i) = PT_AVAIL;
} }
for (i=0; i+1<n_children; i++) { for (i=0; i+1<n_children; i++) {
node->childkeys[i] = kv_pair_malloc(keys[i], keylens[i], 0, 0); node->childkeys[i] = kv_pair_malloc(keys[i], keylens[i], 0, 0);
...@@ -113,22 +115,22 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke ...@@ -113,22 +115,22 @@ int toku_testsetup_insert_to_leaf (BRT brt, BLOCKNUM blocknum, char *key, int ke
struct cmd_leafval_heaviside_extra be = {brt, &keydbt}; struct cmd_leafval_heaviside_extra be = {brt, &keydbt};
r = toku_omt_find_zero(node->u.l.bn[0].buffer, toku_cmd_leafval_heaviside, &be, &storeddatav, &idx, NULL); r = toku_omt_find_zero(BLB_BUFFER(node, 0), toku_cmd_leafval_heaviside, &be, &storeddatav, &idx, NULL);
if (r==0) { if (r==0) {
LEAFENTRY storeddata=storeddatav; LEAFENTRY storeddata=storeddatav;
// It's already there. So now we have to remove it and put the new one back in. // It's already there. So now we have to remove it and put the new one back in.
node->u.l.bn[0].n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(storeddata); BLB_NBYTESINBUF(node, 0) -= OMT_ITEM_OVERHEAD + leafentry_disksize(storeddata);
toku_free(storeddata); toku_free(storeddata);
// Now put the new kv in. // Now put the new kv in.
toku_omt_set_at(node->u.l.bn[0].buffer, leafentry, idx); toku_omt_set_at(BLB_BUFFER(node, 0), leafentry, idx);
} else { } else {
r = toku_omt_insert(node->u.l.bn[0].buffer, leafentry, toku_cmd_leafval_heaviside, &be, 0); r = toku_omt_insert(BLB_BUFFER(node, 0), leafentry, toku_cmd_leafval_heaviside, &be, 0);
assert(r==0); assert(r==0);
} }
node->u.l.bn[0].n_bytes_in_buffer += OMT_ITEM_OVERHEAD + disksize; BLB_NBYTESINBUF(node, 0) += OMT_ITEM_OVERHEAD + disksize;
node->dirty=1; node->dirty=1;
...@@ -160,7 +162,6 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t ...@@ -160,7 +162,6 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, BLOCKNUM blocknum, enum brt_msg_t
r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0); r = toku_fifo_enq(BNC_BUFFER(node, childnum), key, keylen, val, vallen, cmdtype, msn, xids_0);
assert(r==0); assert(r==0);
int sizediff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids_0); int sizediff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids_0);
node->u.n.n_bytes_in_buffers += sizediff;
BNC_NBYTESINBUF(node, childnum) += sizediff; BNC_NBYTESINBUF(node, childnum) += sizediff;
node->dirty = 1; node->dirty = 1;
......
...@@ -119,7 +119,8 @@ toku_verify_brtnode (BRT brt, ...@@ -119,7 +119,8 @@ toku_verify_brtnode (BRT brt,
} }
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v); //printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
node = node_v; node = node_v;
thismsn = node->max_msn_applied_to_node; toku_assert_entire_node_in_memory(node);
thismsn = node->max_msn_applied_to_node_in_memory;
if (rootmsn.msn == ZERO_MSN.msn) { if (rootmsn.msn == ZERO_MSN.msn) {
assert(parentmsn.msn == ZERO_MSN.msn); assert(parentmsn.msn == ZERO_MSN.msn);
rootmsn = thismsn; rootmsn = thismsn;
...@@ -163,11 +164,11 @@ toku_verify_brtnode (BRT brt, ...@@ -163,11 +164,11 @@ toku_verify_brtnode (BRT brt,
curr_geq_pivot); curr_geq_pivot);
VERIFY_ASSERTION(r==0, i, "A message in the buffer is out of place"); VERIFY_ASSERTION(r==0, i, "A message in the buffer is out of place");
VERIFY_ASSERTION((msn.msn > lastmsn.msn), i, "msn per msg must be monotonically increasing toward newer messages in buffer"); VERIFY_ASSERTION((msn.msn > lastmsn.msn), i, "msn per msg must be monotonically increasing toward newer messages in buffer");
VERIFY_ASSERTION((msn.msn <= thismsn.msn), i, "all messages must have msn within limit of this node's max_msn_applied_to_node"); VERIFY_ASSERTION((msn.msn <= thismsn.msn), i, "all messages must have msn within limit of this node's max_msn_applied_to_node_in_memory");
}); });
} }
else { else {
BASEMENTNODE bn = &node->u.l.bn[i]; BASEMENTNODE bn = (BASEMENTNODE)node->bp[i].ptr;
for (u_int32_t j = 0; j < toku_omt_size(bn->buffer); j++) { for (u_int32_t j = 0; j < toku_omt_size(bn->buffer); j++) {
VERIFY_ASSERTION((rootmsn.msn >= thismsn.msn), 0, "leaf may have latest msn, but cannot be greater than root msn"); VERIFY_ASSERTION((rootmsn.msn >= thismsn.msn), 0, "leaf may have latest msn, but cannot be greater than root msn");
LEAFENTRY le = get_ith_leafentry(bn, j); LEAFENTRY le = get_ith_leafentry(bn, j);
...@@ -192,7 +193,7 @@ toku_verify_brtnode (BRT brt, ...@@ -192,7 +193,7 @@ toku_verify_brtnode (BRT brt,
if (recurse && node->height > 0) { if (recurse && node->height > 0) {
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
int r = toku_verify_brtnode(brt, rootmsn, thismsn, int r = toku_verify_brtnode(brt, rootmsn, thismsn,
BNC_BLOCKNUM(node, i), node->height-1, BP_BLOCKNUM(node, i), node->height-1,
(i==0) ? lesser_pivot : node->childkeys[i-1], (i==0) ? lesser_pivot : node->childkeys[i-1],
(i==node->n_children-1) ? greatereq_pivot : node->childkeys[i], (i==node->n_children-1) ? greatereq_pivot : node->childkeys[i],
progress_callback, progress_extra, progress_callback, progress_extra,
......
...@@ -108,6 +108,7 @@ Lookup: ...@@ -108,6 +108,7 @@ Lookup:
#include "xids.h" #include "xids.h"
#include "roll.h" #include "roll.h"
#include "toku_atomic.h" #include "toku_atomic.h"
#include "sub_block.h"
static const uint32_t this_version = BRT_LAYOUT_VERSION; static const uint32_t this_version = BRT_LAYOUT_VERSION;
...@@ -136,12 +137,20 @@ static u_int64_t global_root_put_counter = 0; ...@@ -136,12 +137,20 @@ static u_int64_t global_root_put_counter = 0;
// Classification of a node's size relative to its target nodesize.
// The enumerator order (and therefore the numeric values 0, 1, 2) is preserved.
enum reactivity {
    RE_STABLE,   // first enumerator (value 0)
    RE_FUSIBLE,  // assigned to a leaf when size*4 < nodesize and the last basement node is not seeing sequential inserts
    RE_FISSIBLE  // assigned to a leaf when size > nodesize and the leaf holds more than one entry
};
// Effect: Assert that every partition of NODE is in state PT_AVAIL
//  (decompressed and in memory).  Partitions are checked in ascending order.
void
toku_assert_entire_node_in_memory(BRTNODE node) {
    int i = 0;
    while (i < node->n_children) {
        assert(BP_STATE(node,i) == PT_AVAIL);
        i++;
    }
}
static u_int32_t static u_int32_t
get_leaf_num_entries(BRTNODE node) { get_leaf_num_entries(BRTNODE node) {
u_int32_t result = 0; u_int32_t result = 0;
int i; int i;
toku_assert_entire_node_in_memory(node);
for ( i = 0; i < node->n_children; i++) { for ( i = 0; i < node->n_children; i++) {
result += toku_omt_size(node->u.l.bn[i].buffer); result += toku_omt_size(BLB_BUFFER(node, i));
} }
return result; return result;
} }
...@@ -155,7 +164,7 @@ get_leaf_reactivity (BRTNODE node) { ...@@ -155,7 +164,7 @@ get_leaf_reactivity (BRTNODE node) {
if (size > node->nodesize && get_leaf_num_entries(node) > 1) { if (size > node->nodesize && get_leaf_num_entries(node) > 1) {
re = RE_FISSIBLE; re = RE_FISSIBLE;
} }
else if ((size*4) < node->nodesize && !node->u.l.bn[node->n_children-1].seqinsert) { else if ((size*4) < node->nodesize && !BLB_SEQINSERT(node, node->n_children-1)) {
re = RE_FUSIBLE; re = RE_FUSIBLE;
} }
} }
...@@ -173,6 +182,7 @@ get_nonleaf_reactivity (BRTNODE node) { ...@@ -173,6 +182,7 @@ get_nonleaf_reactivity (BRTNODE node) {
static enum reactivity static enum reactivity
get_node_reactivity (BRTNODE node) { get_node_reactivity (BRTNODE node) {
toku_assert_entire_node_in_memory(node);
if (node->height==0) if (node->height==0)
return get_leaf_reactivity(node); return get_leaf_reactivity(node);
else else
...@@ -181,9 +191,18 @@ get_node_reactivity (BRTNODE node) { ...@@ -181,9 +191,18 @@ get_node_reactivity (BRTNODE node) {
// Effect: Return TRUE iff the serialized size of NODE exceeds node->nodesize
//  AND at least one child buffer reports a nonzero byte count.
// Requires: NODE is a nonleaf (height > 0) whose partitions are all PT_AVAIL;
//  both conditions are asserted.
// Note: the node no longer keeps a node-wide n_bytes_in_buffers total, so the
//  "any buffer non-empty" test is computed by scanning the per-child counts.
static BOOL
nonleaf_node_is_gorged (BRTNODE node) {
    toku_assert_entire_node_in_memory(node);
    assert(node->height > 0);
    BOOL buffers_are_empty = TRUE;
    for (int child = 0; child < node->n_children; ++child) {
        if (BNC_NBYTESINBUF(node, child) > 0) {
            // one non-empty buffer is enough; no need to look further
            buffers_are_empty = FALSE;
            break;
        }
    }
    return (BOOL)((toku_serialize_brtnode_size(node) > node->nodesize)
                  &&
                  (!buffers_are_empty));
}
static void brtnode_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd); static void brtnode_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd);
...@@ -215,17 +234,17 @@ int toku_brt_debug_mode = 0; ...@@ -215,17 +234,17 @@ int toku_brt_debug_mode = 0;
static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnum) { static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnum) {
assert(node->height>0 && childnum<node->n_children); assert(node->height>0 && childnum<node->n_children);
switch (BNC_HAVE_FULLHASH(node, childnum)) { switch (BP_HAVE_FULLHASH(node, childnum)) {
case TRUE: case TRUE:
{ {
assert(BNC_FULLHASH(node, childnum)==toku_cachetable_hash(cf, BNC_BLOCKNUM(node, childnum))); assert(BP_FULLHASH(node, childnum)==toku_cachetable_hash(cf, BP_BLOCKNUM(node, childnum)));
return BNC_FULLHASH(node, childnum); return BP_FULLHASH(node, childnum);
} }
case FALSE: case FALSE:
{ {
u_int32_t child_fullhash = toku_cachetable_hash(cf, BNC_BLOCKNUM(node, childnum)); u_int32_t child_fullhash = toku_cachetable_hash(cf, BP_BLOCKNUM(node, childnum));
BNC_HAVE_FULLHASH(node, childnum) = TRUE; BP_HAVE_FULLHASH(node, childnum) = TRUE;
BNC_FULLHASH(node, childnum) = child_fullhash; BP_FULLHASH(node, childnum) = child_fullhash;
return child_fullhash; return child_fullhash;
} }
} }
...@@ -279,8 +298,6 @@ void toku_unpin_brtnode (BRT brt, BRTNODE node) ...@@ -279,8 +298,6 @@ void toku_unpin_brtnode (BRT brt, BRTNODE node)
// State threaded through toku_omt_iterate() to fill_leafnode_estimates().
struct fill_leafnode_estimates_state {
    SUBTREE_EST e; // estimates being accumulated (nkeys, ndata, dsize are updated per entry)
};
static int static int
...@@ -291,14 +308,13 @@ fill_leafnode_estimates (OMTVALUE val, u_int32_t UU(idx), void *vs) ...@@ -291,14 +308,13 @@ fill_leafnode_estimates (OMTVALUE val, u_int32_t UU(idx), void *vs)
s->e->dsize += le_keylen(le) + le_latest_vallen(le); s->e->dsize += le_keylen(le) + le_latest_vallen(le);
s->e->ndata++; s->e->ndata++;
s->e->nkeys++; s->e->nkeys++;
s->prevval = le;
return 0; // must return 0 to work with an omt_iterator return 0; // must return 0 to work with an omt_iterator
} }
// Effect: Compute subtree estimates for one OMT by starting from
//  zero_estimates and letting fill_leafnode_estimates() visit every entry.
// Returns: the accumulated estimates, by value.
static struct subtree_estimates
calc_leaf_stats (OMT buffer) {
    struct subtree_estimates e = zero_estimates;
    // the iterator callback writes into e through this state struct
    struct fill_leafnode_estimates_state f = {&e};
    toku_omt_iterate(buffer, fill_leafnode_estimates, &f);
    return e;
}
...@@ -308,15 +324,16 @@ toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) { ...@@ -308,15 +324,16 @@ toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
invariant(node->height==0); invariant(node->height==0);
int i = 0; int i = 0;
for (i = 0; i < node->n_children; i++) { for (i = 0; i < node->n_children; i++) {
// basement node may be evicted, so only update stats if the basement node // basement node may be evicted, so only update stats if the basement node
// is fully in memory // is fully in memory
// TODO: (Zardosht) for row cache, figure out a better way to do this // TODO: (Zardosht) for row cache, figure out a better way to do this
if (node->u.l.bn[i].buffer) { if (BP_STATE(node,i) == PT_AVAIL) {
node->subtree_estimates[i] = calc_leaf_stats(node->u.l.bn[i].buffer); node->bp[i].subtree_estimates = calc_leaf_stats(BLB_BUFFER(node, i));
} }
} }
} }
// TODO: (Zardosht) look into this and possibly fix and use
static void __attribute__((__unused__)) static void __attribute__((__unused__))
brt_leaf_check_leaf_stats (BRTNODE node) brt_leaf_check_leaf_stats (BRTNODE node)
{ {
...@@ -345,19 +362,21 @@ fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL d ...@@ -345,19 +362,21 @@ fixup_child_estimates (BRTNODE node, int childnum_of_node, BRTNODE child, BOOL d
estimates.exact = TRUE; estimates.exact = TRUE;
int i; int i;
for (i=0; i<child->n_children; i++) { for (i=0; i<child->n_children; i++) {
SUBTREE_EST child_se = &child->subtree_estimates[i]; SUBTREE_EST child_se = &BP_SUBTREE_EST(child,i);
estimates.nkeys += child_se->nkeys; estimates.nkeys += child_se->nkeys;
estimates.ndata += child_se->ndata; estimates.ndata += child_se->ndata;
estimates.dsize += child_se->dsize; estimates.dsize += child_se->dsize;
if (!child_se->exact) estimates.exact = FALSE; if (!child_se->exact) estimates.exact = FALSE;
if (child->height>0) { if (child->height>0) {
// only execute this if the child's partition is available, as checked above
if (toku_fifo_n_entries(BNC_BUFFER(child,i))!=0) estimates.exact=FALSE; if (toku_fifo_n_entries(BNC_BUFFER(child,i))!=0) estimates.exact=FALSE;
} }
} }
// We only call this function if we have reason to believe that the child changed. // We only call this function if we have reason to believe that the child changed.
node->subtree_estimates[childnum_of_node] = estimates; BP_SUBTREE_EST(node,childnum_of_node) = estimates;
if (dirty_it) if (dirty_it) {
node->dirty=1; node->dirty=1;
}
} }
...@@ -367,23 +386,25 @@ toku_verify_estimates (BRT t, BRTNODE node) { ...@@ -367,23 +386,25 @@ toku_verify_estimates (BRT t, BRTNODE node) {
for (childnum=0; childnum<node->n_children; childnum++) { for (childnum=0; childnum<node->n_children; childnum++) {
// we'll just do this estimate // we'll just do this estimate
u_int64_t child_estimate = 0; u_int64_t child_estimate = 0;
if (node->height > 0) { // can only check the state of available partitions
BLOCKNUM childblocknum = BNC_BLOCKNUM(node, childnum); if (BP_STATE(node, childnum) == PT_AVAIL) {
u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum); if (node->height > 0) {
void *childnode_v; BLOCKNUM childblocknum = BP_BLOCKNUM(node, childnum);
int r = toku_cachetable_get_and_pin(t->cf, childblocknum, fullhash, &childnode_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h); u_int32_t fullhash = compute_child_fullhash(t->cf, node, childnum);
assert_zero(r); void *childnode_v;
BRTNODE childnode = childnode_v; int r = toku_cachetable_get_and_pin(t->cf, childblocknum, fullhash, &childnode_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h);
int i; assert_zero(r);
for (i=0; i<childnode->n_children; i++) { BRTNODE childnode = childnode_v;
child_estimate += childnode->subtree_estimates[i].ndata; for (int i=0; i<childnode->n_children; i++) {
} child_estimate += childnode->bp[i].subtree_estimates.ndata;
toku_unpin_brtnode(t, childnode); }
} toku_unpin_brtnode(t, childnode);
else { }
child_estimate = toku_omt_size(node->u.l.bn[childnum].buffer); else {
} child_estimate = toku_omt_size(BLB_BUFFER(node, childnum));
assert(node->subtree_estimates[childnum].ndata==child_estimate); }
assert(node->bp[childnum].subtree_estimates.ndata==child_estimate);
}
} }
} }
...@@ -402,23 +423,36 @@ brtnode_memory_size (BRTNODE node) ...@@ -402,23 +423,36 @@ brtnode_memory_size (BRTNODE node)
long retval = 0; long retval = 0;
int n_children = node->n_children; int n_children = node->n_children;
retval += sizeof(*node); retval += sizeof(*node);
retval += (n_children)*(sizeof(node->childkeys[0])); retval += (n_children)*(sizeof(node->bp[0]));
retval += (n_children)*(sizeof(node->subtree_estimates[0]));
retval += node->totalchildkeylens; retval += node->totalchildkeylens;
int i;
if (node->height>0) { // now calculate the sizes of the partitions
for (i=0; i<n_children; i++) { for (int i = 0; i < n_children; i++) {
retval += toku_fifo_memory_size(node->u.n.childinfos[i].buffer); if (BP_STATE(node,i) == PT_INVALID || BP_STATE(node,i) == PT_ON_DISK) {
} continue;
retval += (1+n_children)*(sizeof(node->u.n.childinfos[0])); }
} else if (BP_STATE(node,i) == PT_COMPRESSED) {
else { struct sub_block* sb = (struct sub_block*)node->bp[i].ptr;
for (i=0; i<n_children; i++) { retval += sizeof(*sb);
OMT curr_omt = node->u.l.bn[i].buffer; retval += sb->compressed_size;
retval += (toku_omt_memory_size(curr_omt)); }
retval += node->u.l.bn[i].n_bytes_in_buffer; else if (BP_STATE(node,i) == PT_AVAIL) {
} if (node->height > 0) {
retval += n_children * (sizeof(node->u.l.bn[0])); NONLEAF_CHILDINFO childinfo = (NONLEAF_CHILDINFO)node->bp[i].ptr;
retval += sizeof(*childinfo);
retval += toku_fifo_memory_size(BNC_BUFFER(node, i));
}
else {
BASEMENTNODE bn = node->bp[i].ptr;
retval += sizeof(*bn);
retval += BLB_NBYTESINBUF(node,i);
OMT curr_omt = BLB_BUFFER(node, i);
retval += (toku_omt_memory_size(curr_omt));
}
}
else {
assert(FALSE);
}
} }
return retval; return retval;
} }
...@@ -442,6 +476,7 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename ...@@ -442,6 +476,7 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename
//printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]); //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, brtnode, brtnode->mdicts[0]);
if (write_me) { if (write_me) {
if (!h->panic) { // if the brt panicked, stop writing, otherwise try to write it. if (!h->panic) { // if the brt panicked, stop writing, otherwise try to write it.
toku_assert_entire_node_in_memory(brtnode);
int n_workitems, n_threads; int n_workitems, n_threads;
toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads); toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads);
int r = toku_serialize_brtnode_to(fd, brtnode->thisnodename, brtnode, h, n_workitems, n_threads, for_checkpoint); int r = toku_serialize_brtnode_to(fd, brtnode->thisnodename, brtnode, h, n_workitems, n_threads, for_checkpoint);
...@@ -468,6 +503,7 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename ...@@ -468,6 +503,7 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash, int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash,
void **brtnode_pv, long *sizep, int *dirtyp, void *extraargs) { void **brtnode_pv, long *sizep, int *dirtyp, void *extraargs) {
assert(extraargs); assert(extraargs);
assert(*brtnode_pv == NULL);
struct brt_header *h = extraargs; struct brt_header *h = extraargs;
BRTNODE *result=(BRTNODE*)brtnode_pv; BRTNODE *result=(BRTNODE*)brtnode_pv;
int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h); int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h);
...@@ -475,14 +511,6 @@ int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM noden ...@@ -475,14 +511,6 @@ int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM noden
*sizep = brtnode_memory_size(*result); *sizep = brtnode_memory_size(*result);
*dirtyp = (*result)->dirty; *dirtyp = (*result)->dirty;
} }
//(*result)->parent_brtnode = 0; /* Don't know it right now. */
//printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
if ((*result)->height==0) {
int i = 0;
for (i = 0; i < (*result)->n_children; i++) {
(*result)->u.l.bn[i].soft_copy_is_up_to_date = FALSE;
}
}
return r; return r;
} }
...@@ -545,10 +573,6 @@ destroy_basement_node (BASEMENTNODE bn) ...@@ -545,10 +573,6 @@ destroy_basement_node (BASEMENTNODE bn)
bn->buffer = NULL; bn->buffer = NULL;
} }
} }
static void
erase_basement_node(BASEMENTNODE bn) {
bn->buffer = NULL;
}
// destroys the internals of the brtnode, but it does not free the values // destroys the internals of the brtnode, but it does not free the values
// that are stored // that are stored
...@@ -556,31 +580,37 @@ erase_basement_node(BASEMENTNODE bn) { ...@@ -556,31 +580,37 @@ erase_basement_node(BASEMENTNODE bn) {
// MUST NOT do anything besides free the structures that have been allocated // MUST NOT do anything besides free the structures that have been allocated
// Effect: Release the memory owned by NODE's internals: the pivot keys and the
//  childkeys array, then each partition according to its state, then the
//  partition array itself.  NODE itself is not freed here.
//  Per partition:
//    PT_AVAIL      - nonleaf: free the child's FIFO (if any);
//                    leaf:    destroy the basement node's contents
//    PT_COMPRESSED - free the sub_block's compressed data
//    otherwise     - nothing may be attached (ptr is asserted to be NULL)
//  In every case the object that bp[i].ptr points at is then freed.
void toku_destroy_brtnode_internals(BRTNODE node)
{
    // there are n_children-1 pivot keys
    for (int i=0; i<node->n_children-1; i++) {
        toku_free(node->childkeys[i]);
    }
    toku_free(node->childkeys);
    node->childkeys = NULL;

    for (int i=0; i < node->n_children; i++) {
        if (BP_STATE(node,i) == PT_AVAIL) {
            if (node->height > 0) {
                if (BNC_BUFFER(node,i)) {
                    toku_fifo_free(&BNC_BUFFER(node,i));
                }
            }
            else {
                BASEMENTNODE bn = (BASEMENTNODE)node->bp[i].ptr;
                destroy_basement_node(bn);
            }
        }
        else if (BP_STATE(node,i) == PT_COMPRESSED) {
            struct sub_block* sb = (struct sub_block*)node->bp[i].ptr;
            toku_free(sb->compressed_ptr);
        }
        else {
            // PT_INVALID / PT_ON_DISK: nothing is expected to be attached
            assert(node->bp[i].ptr == NULL);
        }
        // free the childinfo / basement node / sub_block object itself
        // (in the last branch above this receives the NULL that was just asserted)
        toku_free(node->bp[i].ptr);
    }
    toku_free(node->bp);
    node->bp = NULL;
}
...@@ -592,8 +622,10 @@ void toku_brtnode_free (BRTNODE *nodep) { ...@@ -592,8 +622,10 @@ void toku_brtnode_free (BRTNODE *nodep) {
BRTNODE node=*nodep; BRTNODE node=*nodep;
if (node->height == 0) { if (node->height == 0) {
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
OMT curr_omt = node->u.l.bn[i].buffer; if (BP_STATE(node,i) == PT_AVAIL) {
toku_omt_free_items(curr_omt); OMT curr_omt = BLB_BUFFER(node, i);
toku_omt_free_items(curr_omt);
}
} }
} }
toku_destroy_brtnode_internals(node); toku_destroy_brtnode_internals(node);
...@@ -659,7 +691,8 @@ static void ...@@ -659,7 +691,8 @@ static void
initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, int num_children) initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, int num_children)
// Effect: Fill in N as an empty brtnode. // Effect: Fill in N as an empty brtnode.
{ {
n->max_msn_applied_to_node = MIN_MSN; // correct value for root node, harmless for others n->max_msn_applied_to_node_on_disk = MIN_MSN; // correct value for root node, harmless for others
n->max_msn_applied_to_node_in_memory = MIN_MSN; // correct value for root node, harmless for others
n->nodesize = t->h->nodesize; n->nodesize = t->h->nodesize;
n->flags = t->flags; n->flags = t->flags;
n->thisnodename = nodename; n->thisnodename = nodename;
...@@ -672,34 +705,36 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, int n ...@@ -672,34 +705,36 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, int n
assert(height>=0); assert(height>=0);
n->totalchildkeylens = 0; n->totalchildkeylens = 0;
n->childkeys=0; n->childkeys=0;
n->subtree_estimates = 0; n->bp = 0;
n->n_children = num_children;
if (num_children > 0) { if (num_children > 0) {
MALLOC_N(num_children-1, n->childkeys); MALLOC_N(num_children-1, n->childkeys);
assert(n->childkeys); assert(n->childkeys);
MALLOC_N(num_children, n->subtree_estimates); MALLOC_N(num_children, n->bp);
assert(n->subtree_estimates); assert(n->bp);
for (int i = 0; i < num_children; i++) {
n->subtree_estimates[i] = zero_estimates;
}
}
n->n_children = num_children;
if (height>0) {
n->u.n.n_bytes_in_buffers = 0;
MALLOC_N(num_children, n->u.n.childinfos);
memset(n->u.n.childinfos, 0, num_children*sizeof(n->u.n.childinfos));
for (int i = 0; i < num_children; i++) { for (int i = 0; i < num_children; i++) {
int r = toku_fifo_create(&BNC_BUFFER(n,i)); BP_FULLHASH(n,i)=0;
assert(r==0); BP_HAVE_FULLHASH(n,i)=FALSE;
BNC_NBYTESINBUF(n,i)=0; BP_BLOCKNUM(n,i).b=0;
BNC_FULLHASH(n,i)=0; BP_STATE(n,i) = PT_INVALID;
BNC_HAVE_FULLHASH(n,i)=FALSE; BP_OFFSET(n,i) = 0;
BNC_BLOCKNUM(n,i).b=0; BP_SUBTREE_EST(n,i) = zero_estimates;
n->bp[i].ptr = NULL;
if (height > 0) {
n->bp[i].ptr = toku_malloc(sizeof(struct brtnode_nonleaf_childinfo));
memset(n->bp[i].ptr, 0, sizeof(struct brtnode_nonleaf_childinfo));
int r = toku_fifo_create(&BNC_BUFFER(n,i));
assert(r==0);
BNC_NBYTESINBUF(n,i) = 0;
}
else {
n->bp[i].ptr = toku_malloc(sizeof(struct brtnode_leaf_basement_node));
BASEMENTNODE bn = (BASEMENTNODE)n->bp[i].ptr;
memset(bn, 0, sizeof(struct brtnode_leaf_basement_node));
toku_setup_empty_bn(bn);
}
} }
}
else {
MALLOC_N(num_children, n->u.l.bn);
assert(n->u.l.bn);
toku_setup_empty_leafnode(n,num_children);
} }
} }
...@@ -722,18 +757,21 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r ...@@ -722,18 +757,21 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
//printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey); //printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey);
newroot->childkeys[0] = splitk.data; newroot->childkeys[0] = splitk.data;
newroot->totalchildkeylens=splitk.size; newroot->totalchildkeylens=splitk.size;
BNC_BLOCKNUM(newroot,0)=nodea->thisnodename; BP_BLOCKNUM(newroot,0)=nodea->thisnodename;
BNC_BLOCKNUM(newroot,1)=nodeb->thisnodename; BP_BLOCKNUM(newroot,1)=nodeb->thisnodename;
BNC_HAVE_FULLHASH(newroot, 0) = FALSE; BP_HAVE_FULLHASH(newroot, 0) = FALSE;
BNC_HAVE_FULLHASH(newroot, 1) = FALSE; BP_HAVE_FULLHASH(newroot, 1) = FALSE;
fixup_child_estimates(newroot, 0, nodea, TRUE); fixup_child_estimates(newroot, 0, nodea, TRUE);
fixup_child_estimates(newroot, 1, nodeb, TRUE); fixup_child_estimates(newroot, 1, nodeb, TRUE);
{ {
MSN msna = nodea->max_msn_applied_to_node; MSN msna = nodea->max_msn_applied_to_node_in_memory;
MSN msnb = nodeb->max_msn_applied_to_node; MSN msnb = nodeb->max_msn_applied_to_node_in_memory;
invariant(msna.msn == msnb.msn); invariant(msna.msn == msnb.msn);
newroot->max_msn_applied_to_node = msna; newroot->max_msn_applied_to_node_in_memory = msna;
} }
BP_STATE(newroot,0) = PT_AVAIL;
BP_STATE(newroot,1) = PT_AVAIL;
newroot->dirty = 1;
toku_unpin_brtnode(brt, nodea); toku_unpin_brtnode(brt, nodea);
toku_unpin_brtnode(brt, nodeb); toku_unpin_brtnode(brt, nodeb);
//printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff); //printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff);
...@@ -768,10 +806,14 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children ...@@ -768,10 +806,14 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, int n_children
// Effect: Initialize partition CHILDNUM of nonleaf NODE so that it refers to
//  CHILD: record the child's block number, mark the cached fullhash as absent,
//  mark the partition PT_AVAIL, zero its offset and estimates, and attach a
//  freshly allocated childinfo with an empty FIFO and a byte count of 0.
// Note: every field of the partition is written here because the caller may
//  hand us a slot that was just obtained by growing node->bp, whose contents
//  are not initialized.
static void
init_childinfo(BRTNODE node, int childnum, BRTNODE child) {
    BP_BLOCKNUM(node,childnum) = child->thisnodename;
    BP_HAVE_FULLHASH(node,childnum) = FALSE;
    // not meaningful while HAVE_FULLHASH is FALSE, but keep it deterministic
    // (initialize_empty_brtnode zeroes it the same way)
    BP_FULLHASH(node,childnum) = 0;
    BP_STATE(node,childnum) = PT_AVAIL;
    BP_OFFSET(node,childnum) = 0;
    BP_SUBTREE_EST(node,childnum) = zero_estimates;
    node->bp[childnum].ptr = toku_malloc(sizeof(struct brtnode_nonleaf_childinfo));
    assert(node->bp[childnum].ptr);
    // the childinfo is not zeroed by the allocator; both of its fields are set below
    BNC_NBYTESINBUF(node,childnum) = 0;
    int r = toku_fifo_create(&BNC_BUFFER(node,childnum));
    resource_assert_zero(r);
}
...@@ -808,8 +850,7 @@ void ...@@ -808,8 +850,7 @@ void
toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize) { toku_brt_nonleaf_append_child(BRTNODE node, BRTNODE child, struct kv_pair *pivotkey, size_t pivotkeysize) {
int childnum = node->n_children; int childnum = node->n_children;
node->n_children++; node->n_children++;
XREALLOC_N(node->n_children, node->u.n.childinfos); XREALLOC_N(node->n_children, node->bp);
XREALLOC_N(node->n_children, node->subtree_estimates);
init_childinfo(node, childnum, child); init_childinfo(node, childnum, child);
XREALLOC_N(node->n_children-1, node->childkeys); XREALLOC_N(node->n_children-1, node->childkeys);
if (pivotkey) { if (pivotkey) {
...@@ -824,10 +865,11 @@ brtleaf_disk_size(BRTNODE node) ...@@ -824,10 +865,11 @@ brtleaf_disk_size(BRTNODE node)
// Effect: get the disk size of a leafentry // Effect: get the disk size of a leafentry
{ {
assert(node->height == 0); assert(node->height == 0);
toku_assert_entire_node_in_memory(node);
u_int64_t retval = 0; u_int64_t retval = 0;
int i; int i;
for (i = 0; i < node->n_children; i++) { for (i = 0; i < node->n_children; i++) {
OMT curr_buffer = node->u.l.bn[i].buffer; OMT curr_buffer = BLB_BUFFER(node, i);
u_int32_t n_leafentries = toku_omt_size(curr_buffer); u_int32_t n_leafentries = toku_omt_size(curr_buffer);
u_int32_t j; u_int32_t j;
for (j=0; j < n_leafentries; j++) { for (j=0; j < n_leafentries; j++) {
...@@ -855,7 +897,7 @@ brtleaf_get_split_loc( ...@@ -855,7 +897,7 @@ brtleaf_get_split_loc(
u_int32_t size_so_far = 0; u_int32_t size_so_far = 0;
int i; int i;
for (i = 0; i < node->n_children; i++) { for (i = 0; i < node->n_children; i++) {
OMT curr_buffer = node->u.l.bn[i].buffer; OMT curr_buffer = BLB_BUFFER(node, i);
u_int32_t n_leafentries = toku_omt_size(curr_buffer); u_int32_t n_leafentries = toku_omt_size(curr_buffer);
u_int32_t j; u_int32_t j;
for (j=0; j < n_leafentries; j++) { for (j=0; j < n_leafentries; j++) {
...@@ -927,7 +969,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -927,7 +969,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
assert(node->height==0); assert(node->height==0);
assert(node->nodesize>0); assert(node->nodesize>0);
MSN max_msn_applied_to_node = node->max_msn_applied_to_node; toku_assert_entire_node_in_memory(node);
MSN max_msn_applied_to_node = node->max_msn_applied_to_node_in_memory;
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
//printf("%s:%d B is at %lld nodesize=%d\n", __FILE__, __LINE__, B->thisnodename, B->nodesize); //printf("%s:%d B is at %lld nodesize=%d\n", __FILE__, __LINE__, B->thisnodename, B->nodesize);
...@@ -973,42 +1016,48 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -973,42 +1016,48 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
} }
else { else {
B = *nodeb; B = *nodeb;
REALLOC_N(num_children_in_b, B->u.l.bn);
REALLOC_N(num_children_in_b, B->subtree_estimates);
for (int i = 0; i < num_children_in_b; i++) {
B->subtree_estimates[i] = zero_estimates;
}
REALLOC_N(num_children_in_b-1, B->childkeys); REALLOC_N(num_children_in_b-1, B->childkeys);
toku_setup_empty_leafnode(B,num_children_in_b); REALLOC_N(num_children_in_b, B->bp);
for (int i = 0; i < num_children_in_b; i++) {
BP_STATE(B,i) = PT_AVAIL;
BP_OFFSET(B,i) = 0;
BP_BLOCKNUM(B,i).b = 0;
BP_FULLHASH(B,i) = 0;
BP_HAVE_FULLHASH(B,i) = FALSE;
BP_SUBTREE_EST(B,i)= zero_estimates;
B->bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
BASEMENTNODE bn = (BASEMENTNODE)B->bp[i].ptr;
toku_setup_empty_bn(bn);
}
} }
// //
// first move all the data // first move all the data
// //
// handle the move of a subset of data in split_node from node to B // handle the move of a subset of data in split_node from node to B
BP_STATE(B,0) = PT_AVAIL;
struct subtree_estimates se_diff = zero_estimates; struct subtree_estimates se_diff = zero_estimates;
u_int32_t diff_size = 0; u_int32_t diff_size = 0;
destroy_basement_node (&B->u.l.bn[0]); // Destroy B's empty OMT, so I can rebuild it from an array destroy_basement_node ((BASEMENTNODE)B->bp[0].ptr); // Destroy B's empty OMT, so I can rebuild it from an array
move_leafentries( move_leafentries(
&B->u.l.bn[0].buffer, &BLB_BUFFER(B, 0),
node->u.l.bn[split_node].buffer, BLB_BUFFER(node, split_node),
split_at_in_node+1, split_at_in_node+1,
toku_omt_size(node->u.l.bn[split_node].buffer), toku_omt_size(BLB_BUFFER(node, split_node)),
&se_diff, &se_diff,
&diff_size &diff_size
); );
node->u.l.bn[split_node].n_bytes_in_buffer -= diff_size; BLB_NBYTESINBUF(node, split_node) -= diff_size;
B->u.l.bn[0].n_bytes_in_buffer += diff_size; BLB_NBYTESINBUF(B, 0) += diff_size;
subtract_estimates(&node->subtree_estimates[split_node], &se_diff); subtract_estimates(&BP_SUBTREE_EST(node,split_node), &se_diff);
add_estimates(&B->subtree_estimates[0], &se_diff); add_estimates(&BP_SUBTREE_EST(B,0), &se_diff);
// move the rest of the basement nodes // move the rest of the basement nodes
int curr_dest_bn_index = 1; int curr_dest_bn_index = 1;
int i; for (int i = num_children_in_node; i < node->n_children; i++, curr_dest_bn_index++) {
for (i = num_children_in_node; i < node->n_children; i++, curr_dest_bn_index++) { destroy_basement_node((BASEMENTNODE)B->bp[curr_dest_bn_index].ptr);
destroy_basement_node (&B->u.l.bn[curr_dest_bn_index]); toku_free(B->bp[curr_dest_bn_index].ptr);
B->u.l.bn[curr_dest_bn_index] = node->u.l.bn[i]; B->bp[curr_dest_bn_index] = node->bp[i];
B->subtree_estimates[curr_dest_bn_index] = node->subtree_estimates[i];
} }
node->n_children = num_children_in_node; node->n_children = num_children_in_node;
B->n_children = num_children_in_b; B->n_children = num_children_in_b;
...@@ -1018,15 +1067,13 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1018,15 +1067,13 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
// //
// make pivots in B // make pivots in B
i = 0; for (int i=0; i < num_children_in_b-1; i++) {
for (i=0; i < num_children_in_b-1; i++) {
B->childkeys[i] = node->childkeys[i+split_node]; B->childkeys[i] = node->childkeys[i+split_node];
B->totalchildkeylens += toku_brt_pivot_key_len(node->childkeys[i+split_node]); B->totalchildkeylens += toku_brt_pivot_key_len(node->childkeys[i+split_node]);
node->totalchildkeylens -= toku_brt_pivot_key_len(node->childkeys[i+split_node]); node->totalchildkeylens -= toku_brt_pivot_key_len(node->childkeys[i+split_node]);
node->childkeys[i+split_node] = NULL; node->childkeys[i+split_node] = NULL;
} }
REALLOC_N(num_children_in_node, node->u.l.bn); REALLOC_N(num_children_in_node, node->bp);
REALLOC_N(num_children_in_node, node->subtree_estimates);
REALLOC_N(num_children_in_node-1, node->childkeys); REALLOC_N(num_children_in_node-1, node->childkeys);
toku_brt_leaf_reset_calc_leaf_stats(node); toku_brt_leaf_reset_calc_leaf_stats(node);
...@@ -1036,7 +1083,7 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1036,7 +1083,7 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
if (splitk) { if (splitk) {
memset(splitk, 0, sizeof *splitk); memset(splitk, 0, sizeof *splitk);
OMTVALUE lev = 0; OMTVALUE lev = 0;
int r=toku_omt_fetch(node->u.l.bn[split_node].buffer, toku_omt_size(node->u.l.bn[split_node].buffer)-1, &lev, NULL); int r=toku_omt_fetch(BLB_BUFFER(node, split_node), toku_omt_size(BLB_BUFFER(node, split_node))-1, &lev, NULL);
assert_zero(r); // that fetch should have worked. assert_zero(r); // that fetch should have worked.
LEAFENTRY le=lev; LEAFENTRY le=lev;
splitk->size = le_keylen(le); splitk->size = le_keylen(le);
...@@ -1044,8 +1091,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1044,8 +1091,8 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
splitk->flags=0; splitk->flags=0;
} }
node->max_msn_applied_to_node = max_msn_applied_to_node; node->max_msn_applied_to_node_in_memory = max_msn_applied_to_node;
B ->max_msn_applied_to_node = max_msn_applied_to_node; B ->max_msn_applied_to_node_in_memory = max_msn_applied_to_node;
node->dirty = 1; node->dirty = 1;
B->dirty = 1; B->dirty = 1;
...@@ -1070,22 +1117,15 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1070,22 +1117,15 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
// but it does not guarantee that the resulting nodes are smaller than nodesize. // but it does not guarantee that the resulting nodes are smaller than nodesize.
{ {
VERIFY_NODE(t,node); VERIFY_NODE(t,node);
toku_assert_entire_node_in_memory(node);
int old_n_children = node->n_children; int old_n_children = node->n_children;
int n_children_in_a = old_n_children/2; int n_children_in_a = old_n_children/2;
int n_children_in_b = old_n_children-n_children_in_a; int n_children_in_b = old_n_children-n_children_in_a;
MSN max_msn_applied_to_node = node->max_msn_applied_to_node; MSN max_msn_applied_to_node = node->max_msn_applied_to_node_in_memory;
BRTNODE B; BRTNODE B;
assert(node->height>0); assert(node->height>0);
assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ assert(node->n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
toku_create_new_brtnode(t, &B, node->height, n_children_in_b); toku_create_new_brtnode(t, &B, node->height, n_children_in_b);
if (0) {
printf("%s:%d %p (%" PRId64 ") splits, old estimates:", __FILE__, __LINE__, node, node->thisnodename.b);
//int i;
//for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i));
printf("\n");
}
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
{ {
/* The first n_children_in_a go into node a. /* The first n_children_in_a go into node a.
* That means that the first n_children_in_a-1 keys go into node a. * That means that the first n_children_in_a-1 keys go into node a.
...@@ -1095,34 +1135,20 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1095,34 +1135,20 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
for (i=n_children_in_a; i<old_n_children; i++) { for (i=n_children_in_a; i<old_n_children; i++) {
int targchild = i-n_children_in_a; int targchild = i-n_children_in_a;
FIFO from_htab = BNC_BUFFER(node,i); // TODO: Figure out better way to handle this
FIFO to_htab = BNC_BUFFER(B, targchild); // the problem is that toku_create_new_brtnode for B creates
BLOCKNUM thischildblocknum = BNC_BLOCKNUM(node, i); // all the data structures, whereas we really don't want it to fill
// in anything for the bp's.
BNC_BLOCKNUM(B, targchild) = thischildblocknum; // Now we have to go free what it just created so we can
BNC_HAVE_FULLHASH(B,targchild) = BNC_HAVE_FULLHASH(node,i); // slide the bp over
BNC_FULLHASH(B,targchild) = BNC_FULLHASH(node, i); if (BNC_BUFFER(B,targchild)) {
toku_fifo_free(&BNC_BUFFER(B,targchild));
while (1) { }
bytevec key, data; toku_free(B->bp[targchild].ptr);
unsigned int keylen, datalen; // now move the bp over
u_int32_t type; B->bp[targchild] = node->bp[i];
MSN msn; memset(&node->bp[i], 0, sizeof(node->bp[0]));
XIDS xids;
int fr = toku_fifo_peek(from_htab, &key, &keylen, &data, &datalen, &type, &msn, &xids);
if (fr!=0) break;
int n_bytes_moved = keylen+datalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
int r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, msn, xids);
assert(r==0);
toku_fifo_deq(from_htab);
// key and data will no longer be valid
B->u.n.n_bytes_in_buffers += n_bytes_moved;
BNC_NBYTESINBUF(B, targchild) += n_bytes_moved;
node->u.n.n_bytes_in_buffers -= n_bytes_moved;
BNC_NBYTESINBUF(node, i) -= n_bytes_moved;
}
// Delete a child, removing the preceding pivot key. The child number must be > 0 // Delete a child, removing the preceding pivot key. The child number must be > 0
{ {
assert(i>0); assert(i>0);
...@@ -1133,35 +1159,26 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -1133,35 +1159,26 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
node->childkeys[i-1] = 0; node->childkeys[i-1] = 0;
} }
} }
BNC_BLOCKNUM(node, i) = make_blocknum(0);
BNC_HAVE_FULLHASH(node, i) = FALSE;
B->subtree_estimates[targchild] = node->subtree_estimates[i];
node->subtree_estimates[i] = zero_estimates;
assert(BNC_NBYTESINBUF(node, i) == 0);
} }
node->n_children=n_children_in_a; node->n_children=n_children_in_a;
for (i=n_children_in_a; i<old_n_children; i++) {
toku_fifo_free(&BNC_BUFFER(node,i));
}
splitk->data = (void*)(node->childkeys[n_children_in_a-1]); splitk->data = (void*)(node->childkeys[n_children_in_a-1]);
splitk->size = toku_brt_pivot_key_len(node->childkeys[n_children_in_a-1]); splitk->size = toku_brt_pivot_key_len(node->childkeys[n_children_in_a-1]);
node->totalchildkeylens -= toku_brt_pivot_key_len(node->childkeys[n_children_in_a-1]); node->totalchildkeylens -= toku_brt_pivot_key_len(node->childkeys[n_children_in_a-1]);
REALLOC_N(n_children_in_a+1, node->u.n.childinfos); REALLOC_N(n_children_in_a, node->bp);
REALLOC_N(n_children_in_a+1, node->subtree_estimates); REALLOC_N(n_children_in_a-1, node->childkeys);
REALLOC_N(n_children_in_a, node->childkeys);
} }
node->max_msn_applied_to_node = max_msn_applied_to_node; node->max_msn_applied_to_node_in_memory = max_msn_applied_to_node;
B ->max_msn_applied_to_node = max_msn_applied_to_node; B ->max_msn_applied_to_node_in_memory = max_msn_applied_to_node;
node->dirty = 1; node->dirty = 1;
B ->dirty = 1; B ->dirty = 1;
toku_assert_entire_node_in_memory(node);
toku_assert_entire_node_in_memory(B);
VERIFY_NODE(t,node); VERIFY_NODE(t,node);
VERIFY_NODE(t,B); VERIFY_NODE(t,B);
*nodea = node; *nodea = node;
...@@ -1184,8 +1201,10 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -1184,8 +1201,10 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum,
{ {
assert(node->height>0); assert(node->height>0);
assert(0 <= childnum && childnum < node->n_children); assert(0 <= childnum && childnum < node->n_children);
FIFO old_h = BNC_BUFFER(node,childnum); toku_assert_entire_node_in_memory(node);
int old_count = BNC_NBYTESINBUF(node, childnum); toku_assert_entire_node_in_memory(childa);
toku_assert_entire_node_in_memory(childb);
int old_count = BNC_NBYTESINBUF(node, childnum);
assert(old_count==0); assert(old_count==0);
int cnum; int cnum;
int r; int r;
...@@ -1201,27 +1220,32 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -1201,27 +1220,32 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum,
node->dirty = 1; node->dirty = 1;
XREALLOC_N(node->n_children+2, node->u.n.childinfos); XREALLOC_N(node->n_children+1, node->bp);
XREALLOC_N(node->n_children+2, node->subtree_estimates); XREALLOC_N(node->n_children, node->childkeys);
XREALLOC_N(node->n_children+1, node->childkeys);
// Slide the children over. // Slide the children over.
node->subtree_estimates[node->n_children+1] = zero_estimates; // suppose n_children is 10 and childnum is 5, meaning node->childnum[5] just got split
// this moves node->bp[6] through node->bp[9] over to
// node->bp[7] through node->bp[10]
for (cnum=node->n_children; cnum>childnum+1; cnum--) { for (cnum=node->n_children; cnum>childnum+1; cnum--) {
node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1]; node->bp[cnum] = node->bp[cnum-1];
node->subtree_estimates[cnum] = node->subtree_estimates[cnum-1];
} }
memset(&node->bp[childnum+1],0,sizeof(node->bp[0]));
node->n_children++; node->n_children++;
assert(BNC_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child assert(BP_BLOCKNUM(node, childnum).b==childa->thisnodename.b); // use the same child
BNC_BLOCKNUM(node, childnum+1) = childb->thisnodename;
BNC_HAVE_FULLHASH(node, childnum+1) = TRUE; BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
BNC_FULLHASH(node, childnum+1) = childb->fullhash; BP_HAVE_FULLHASH(node, childnum+1) = TRUE;
node->subtree_estimates[childnum+1] = zero_estimates; BP_FULLHASH(node, childnum+1) = childb->fullhash;
BP_SUBTREE_EST(node,childnum+1) = zero_estimates;
BP_STATE(node,childnum+1) = PT_AVAIL;
BP_OFFSET(node,childnum+1) = 0;
fixup_child_estimates(node, childnum, childa, TRUE); fixup_child_estimates(node, childnum, childa, TRUE);
fixup_child_estimates(node, childnum+1, childb, TRUE); fixup_child_estimates(node, childnum+1, childb, TRUE);
node->bp[childnum+1].ptr = toku_malloc(sizeof(struct brtnode_nonleaf_childinfo));
assert(node->bp[childnum+1].ptr);
r=toku_fifo_create(&BNC_BUFFER(node,childnum+1)); assert_zero(r); r=toku_fifo_create(&BNC_BUFFER(node,childnum+1)); assert_zero(r);
r=toku_fifo_create(&BNC_BUFFER(node,childnum)); assert_zero(r);
BNC_NBYTESINBUF(node, childnum) = 0;
BNC_NBYTESINBUF(node, childnum+1) = 0; BNC_NBYTESINBUF(node, childnum+1) = 0;
// Slide the keys over // Slide the keys over
...@@ -1245,10 +1269,10 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -1245,10 +1269,10 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum,
} }
) )
node->u.n.n_bytes_in_buffers -= old_count; /* By default, they are all removed. We might add them back in. */
/* Keep pushing to the children, but not if the children would require a pushdown */ /* Keep pushing to the children, but not if the children would require a pushdown */
toku_assert_entire_node_in_memory(node);
toku_fifo_free(&old_h); toku_assert_entire_node_in_memory(childa);
toku_assert_entire_node_in_memory(childb);
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
VERIFY_NODE(t, childa); VERIFY_NODE(t, childa);
...@@ -1274,7 +1298,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react) ...@@ -1274,7 +1298,7 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react)
void *childnode_v; void *childnode_v;
// For now, don't use toku_pin_brtnode since we aren't yet prepared to deal with the TRY_AGAIN, and we don't have to apply all the messages above to do this split operation. // For now, don't use toku_pin_brtnode since we aren't yet prepared to deal with the TRY_AGAIN, and we don't have to apply all the messages above to do this split operation.
int r = toku_cachetable_get_and_pin(t->cf, int r = toku_cachetable_get_and_pin(t->cf,
BNC_BLOCKNUM(node, childnum), BP_BLOCKNUM(node, childnum),
compute_child_fullhash(t->cf, node, childnum), compute_child_fullhash(t->cf, node, childnum),
&childnode_v, &childnode_v,
NULL, NULL,
...@@ -1752,10 +1776,10 @@ brt_leaf_put_cmd ( ...@@ -1752,10 +1776,10 @@ brt_leaf_put_cmd (
// should be static, but used by test programs // should be static, but used by test programs
// NOTE(review): this is a side-by-side diff view -- each row shows the old text followed by the new text;
// a row with a single statement exists on only one side of the change.
//
// toku_brt_append_to_child_buffer: enqueue one message (type, msn, xids, key, val) on the FIFO
// buffer of child `childnum` of `node`, add the message's accounted size to that child's
// BNC_NBYTESINBUF counter, and mark the node dirty.  A failed enqueue trips assert_zero.
void void
toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) { toku_brt_append_to_child_buffer(BRTNODE node, int childnum, int type, MSN msn, XIDS xids, const DBT *key, const DBT *val) {
// new side only: the child's partition must be in state PT_AVAIL before its buffer is touched
assert(BP_STATE(node,childnum) == PT_AVAIL);
// accounted size = key and value payload + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + serialized size of xids
int diff = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids); int diff = key->size + val->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids);
int r = toku_fifo_enq(BNC_BUFFER(node,childnum), key->data, key->size, val->data, val->size, type, msn, xids); int r = toku_fifo_enq(BNC_BUFFER(node,childnum), key->data, key->size, val->data, val->size, type, msn, xids);
assert_zero(r); assert_zero(r);
// old side only: the node-wide u.n.n_bytes_in_buffers total was also bumped; the new side keeps only the per-child count
node->u.n.n_bytes_in_buffers += diff;
BNC_NBYTESINBUF(node, childnum) += diff; BNC_NBYTESINBUF(node, childnum) += diff;
node->dirty = 1; node->dirty = 1;
} }
...@@ -1922,8 +1946,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -1922,8 +1946,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
// //
{ {
MSN cmd_msn = cmd->msn; MSN cmd_msn = cmd->msn;
invariant(cmd_msn.msn > node->max_msn_applied_to_node.msn); invariant(cmd_msn.msn > node->max_msn_applied_to_node_in_memory.msn);
node->max_msn_applied_to_node = cmd_msn; node->max_msn_applied_to_node_in_memory = cmd_msn;
//TODO: Accessing type directly //TODO: Accessing type directly
switch (cmd->type) { switch (cmd->type) {
...@@ -1951,6 +1975,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -1951,6 +1975,8 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
static void static void
merge_leaf_nodes (BRTNODE a, BRTNODE b) { merge_leaf_nodes (BRTNODE a, BRTNODE b) {
toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b);
assert(a->height == 0); assert(a->height == 0);
assert(b->height == 0); assert(b->height == 0);
assert(a->n_children > 0); assert(a->n_children > 0);
...@@ -1959,27 +1985,27 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) { ...@@ -1959,27 +1985,27 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) {
// this BOOL states if the last basement node in a has any items or not // this BOOL states if the last basement node in a has any items or not
// If it does, then it stays in the merge. If it does not, the last basement node // If it does, then it stays in the merge. If it does not, the last basement node
// of a gets eliminated because we do not have a pivot to store for it (because it has no elements) // of a gets eliminated because we do not have a pivot to store for it (because it has no elements)
BOOL a_has_tail = toku_omt_size(a->u.l.bn[a->n_children-1].buffer); BOOL a_has_tail = toku_omt_size(BLB_BUFFER(a, a->n_children-1));
// move each basement node from b to a // move each basement node from b to a
// move the pivots, adding one of what used to be max(a) // move the pivots, adding one of what used to be max(a)
// move the estimates // move the estimates
int num_children = a->n_children + b->n_children; int num_children = a->n_children + b->n_children;
if (!a_has_tail) { if (!a_has_tail) {
destroy_basement_node(&a->u.l.bn[a->n_children-1]); destroy_basement_node((BASEMENTNODE)a->bp[a->n_children-1].ptr);
toku_free(a->bp[a->n_children-1].ptr);
num_children--; num_children--;
} }
//realloc pivots and basement nodes in a //realloc pivots and basement nodes in a
REALLOC_N(num_children, a->u.l.bn); REALLOC_N(num_children, a->bp);
REALLOC_N(num_children, a->subtree_estimates);
REALLOC_N(num_children-1, a->childkeys); REALLOC_N(num_children-1, a->childkeys);
// fill in pivot for what used to be max of node 'a', if it is needed // fill in pivot for what used to be max of node 'a', if it is needed
if (a_has_tail) { if (a_has_tail) {
LEAFENTRY le = fetch_from_buf( LEAFENTRY le = fetch_from_buf(
a->u.l.bn[a->n_children-1].buffer, BLB_BUFFER(a, a->n_children-1),
toku_omt_size(a->u.l.bn[a->n_children-1].buffer)-1 toku_omt_size(BLB_BUFFER(a, a->n_children-1))-1
); );
a->childkeys[a->n_children-1] = kv_pair_malloc(le_key(le), le_keylen(le), 0, 0); a->childkeys[a->n_children-1] = kv_pair_malloc(le_key(le), le_keylen(le), 0, 0);
a->totalchildkeylens += le_keylen(le); a->totalchildkeylens += le_keylen(le);
...@@ -1987,10 +2013,8 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) { ...@@ -1987,10 +2013,8 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) {
u_int32_t offset = a_has_tail ? a->n_children : a->n_children - 1; u_int32_t offset = a_has_tail ? a->n_children : a->n_children - 1;
for (int i = 0; i < b->n_children; i++) { for (int i = 0; i < b->n_children; i++) {
a->u.l.bn[i+offset] = b->u.l.bn[i]; a->bp[i+offset] = b->bp[i];
erase_basement_node(&b->u.l.bn[i]); memset(&b->bp[i],0,sizeof(b->bp[0]));
a->subtree_estimates[i+offset] = b->subtree_estimates[i];
b->subtree_estimates[i] = zero_estimates;
if (i < (b->n_children-1)) { if (i < (b->n_children-1)) {
a->childkeys[i+offset] = b->childkeys[i]; a->childkeys[i+offset] = b->childkeys[i];
b->childkeys[i] = NULL; b->childkeys[i] = NULL;
...@@ -2067,35 +2091,32 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct ...@@ -2067,35 +2091,32 @@ maybe_merge_pinned_nonleaf_nodes (BRTNODE parent, int childnum_of_parent, struct
BRTNODE a, BRTNODE b, BRTNODE a, BRTNODE b,
BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk) BOOL *did_merge, BOOL *did_rebalance, struct kv_pair **splitk)
{ {
toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b);
assert(parent_splitk); assert(parent_splitk);
int old_n_children = a->n_children; int old_n_children = a->n_children;
int new_n_children = old_n_children + b->n_children; int new_n_children = old_n_children + b->n_children;
XREALLOC_N(new_n_children, a->u.n.childinfos); XREALLOC_N(new_n_children, a->bp);
memcpy(a->u.n.childinfos + old_n_children, memcpy(a->bp + old_n_children,
b->u.n.childinfos, b->bp,
b->n_children*sizeof(b->u.n.childinfos[0])); b->n_children*sizeof(b->bp[0]));
XREALLOC_N(new_n_children, a->subtree_estimates); memset(b->bp,0,b->n_children*sizeof(b->bp[0]));
memcpy(a->subtree_estimates+ old_n_children,
b->subtree_estimates,
b->n_children*sizeof(b->subtree_estimates[0]));
XREALLOC_N(new_n_children-1, a->childkeys); XREALLOC_N(new_n_children-1, a->childkeys);
a->childkeys[old_n_children-1] = parent_splitk; a->childkeys[old_n_children-1] = parent_splitk;
memcpy(a->childkeys + old_n_children, memcpy(a->childkeys + old_n_children,
b->childkeys, b->childkeys,
(b->n_children-1)*sizeof(b->childkeys[0])); (b->n_children-1)*sizeof(b->childkeys[0]));
a->totalchildkeylens += b->totalchildkeylens + toku_brt_pivot_key_len(parent_splitk); a->totalchildkeylens += b->totalchildkeylens + toku_brt_pivot_key_len(parent_splitk);
a->u.n.n_bytes_in_buffers += b->u.n.n_bytes_in_buffers;
a->n_children = new_n_children; a->n_children = new_n_children;
b->totalchildkeylens = 0; b->totalchildkeylens = 0;
b->n_children = 0; b->n_children = 0;
b->u.n.n_bytes_in_buffers = 0;
a->dirty = 1; a->dirty = 1;
b->dirty = 1; b->dirty = 1;
fixup_child_estimates(parent, childnum_of_parent, a, TRUE); fixup_child_estimates(parent, childnum_of_parent, a, TRUE);
// abort(); // don't forget to reuse blocknums
*did_merge = TRUE; *did_merge = TRUE;
*did_rebalance = FALSE; *did_rebalance = FALSE;
*splitk = NULL; *splitk = NULL;
...@@ -2127,13 +2148,16 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2127,13 +2148,16 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
{ {
MSN msn_max; MSN msn_max;
assert(a->height == b->height); assert(a->height == b->height);
toku_assert_entire_node_in_memory(parent);
toku_assert_entire_node_in_memory(a);
toku_assert_entire_node_in_memory(b);
parent->dirty = 1; // just to make sure parent->dirty = 1; // just to make sure
{ {
MSN msna = a->max_msn_applied_to_node; MSN msna = a->max_msn_applied_to_node_in_memory;
MSN msnb = b->max_msn_applied_to_node; MSN msnb = b->max_msn_applied_to_node_in_memory;
msn_max = (msna.msn > msnb.msn) ? msna : msnb; msn_max = (msna.msn > msnb.msn) ? msna : msnb;
if (a->height > 0) { if (a->height > 0) {
invariant(msn_max.msn <= parent->max_msn_applied_to_node.msn); // parent msn must be >= children's msn invariant(msn_max.msn <= parent->max_msn_applied_to_node_in_memory.msn); // parent msn must be >= children's msn
} }
} }
if (a->height == 0) { if (a->height == 0) {
...@@ -2144,8 +2168,8 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair ...@@ -2144,8 +2168,8 @@ maybe_merge_pinned_nodes (BRTNODE parent, int childnum_of_parent, struct kv_pair
if (*did_merge || *did_rebalance) { if (*did_merge || *did_rebalance) {
// accurate for leaf nodes because all msgs above have been applied, // accurate for leaf nodes because all msgs above have been applied,
// accurate for non-leaf nodes because buffer immediately above each node has been flushed // accurate for non-leaf nodes because buffer immediately above each node has been flushed
a->max_msn_applied_to_node = msn_max; a->max_msn_applied_to_node_in_memory = msn_max;
b->max_msn_applied_to_node = msn_max; b->max_msn_applied_to_node_in_memory = msn_max;
} }
} }
...@@ -2154,6 +2178,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2154,6 +2178,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
ANCESTORS ancestors, struct pivot_bounds const * const bounds) ANCESTORS ancestors, struct pivot_bounds const * const bounds)
{ {
if (node->n_children < 2) return; // if no siblings, we are merged as best we can. if (node->n_children < 2) return; // if no siblings, we are merged as best we can.
toku_assert_entire_node_in_memory(node);
int childnuma,childnumb; int childnuma,childnumb;
if (childnum_to_merge > 0) { if (childnum_to_merge > 0) {
...@@ -2187,7 +2212,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2187,7 +2212,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
{ {
void *childnode_v; void *childnode_v;
u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnuma); u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnuma);
int r = toku_cachetable_get_and_pin(t->cf, BNC_BLOCKNUM(node, childnuma), childfullhash, &childnode_v, NULL, int r = toku_cachetable_get_and_pin(t->cf, BP_BLOCKNUM(node, childnuma), childfullhash, &childnode_v, NULL,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h); toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h);
assert(r==0); assert(r==0);
childa = childnode_v; childa = childnode_v;
...@@ -2195,7 +2220,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2195,7 +2220,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
{ {
void *childnode_v; void *childnode_v;
u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnumb); u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnumb);
int r = toku_cachetable_get_and_pin(t->cf, BNC_BLOCKNUM(node, childnumb), childfullhash, &childnode_v, NULL, int r = toku_cachetable_get_and_pin(t->cf, BP_BLOCKNUM(node, childnumb), childfullhash, &childnode_v, NULL,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h); toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, t->h);
assert(r==0); assert(r==0);
childb = childnode_v; childb = childnode_v;
...@@ -2219,21 +2244,18 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react, ...@@ -2219,21 +2244,18 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_react,
if (did_merge) { if (did_merge) {
toku_fifo_free(&BNC_BUFFER(node, childnumb)); toku_fifo_free(&BNC_BUFFER(node, childnumb));
toku_free(node->bp[childnumb].ptr);
node->n_children--; node->n_children--;
memmove(&node->u.n.childinfos[childnumb], memmove(&node->bp[childnumb],
&node->u.n.childinfos[childnumb+1], &node->bp[childnumb+1],
(node->n_children-childnumb)*sizeof(node->u.n.childinfos[0])); (node->n_children-childnumb)*sizeof(node->bp[0]));
REALLOC_N(node->n_children, node->u.n.childinfos); REALLOC_N(node->n_children, node->bp);
memmove(&node->childkeys[childnuma], memmove(&node->childkeys[childnuma],
&node->childkeys[childnuma+1], &node->childkeys[childnuma+1],
(node->n_children-childnumb)*sizeof(node->childkeys[0])); (node->n_children-childnumb)*sizeof(node->childkeys[0]));
REALLOC_N(node->n_children-1, node->childkeys); REALLOC_N(node->n_children-1, node->childkeys);
memmove(&node->subtree_estimates[childnumb],
&node->subtree_estimates[childnumb+1],
(node->n_children-childnumb)*sizeof(node->subtree_estimates[0]));
REALLOC_N(node->n_children, node->subtree_estimates);
fixup_child_estimates(node, childnuma, childa, TRUE); fixup_child_estimates(node, childnuma, childa, TRUE);
assert(node->u.n.childinfos[childnuma].blocknum.b == childa->thisnodename.b); assert(BP_BLOCKNUM(node, childnuma).b == childa->thisnodename.b);
childa->dirty = 1; // just to make sure childa->dirty = 1; // just to make sure
childb->dirty = 1; // just to make sure childb->dirty = 1; // just to make sure
} else { } else {
...@@ -2277,6 +2299,7 @@ brt_handle_maybe_reactive_child(BRT t, BRTNODE node, int childnum, enum reactivi ...@@ -2277,6 +2299,7 @@ brt_handle_maybe_reactive_child(BRT t, BRTNODE node, int childnum, enum reactivi
static void static void
brt_handle_maybe_reactive_root (BRT brt, CACHEKEY *rootp, BRTNODE *nodep) { brt_handle_maybe_reactive_root (BRT brt, CACHEKEY *rootp, BRTNODE *nodep) {
BRTNODE node = *nodep; BRTNODE node = *nodep;
toku_assert_entire_node_in_memory(node);
enum reactivity re = get_node_reactivity(node); enum reactivity re = get_node_reactivity(node);
switch (re) { switch (re) {
case RE_STABLE: case RE_STABLE:
...@@ -2332,6 +2355,7 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive ...@@ -2332,6 +2355,7 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive
// FLUSH_RECURSIVELY=FALSE don't flush any grandchildren // FLUSH_RECURSIVELY=FALSE don't flush any grandchildren
{ {
assert(node->height>0); assert(node->height>0);
toku_assert_entire_node_in_memory(node);
int childnum; int childnum;
find_heaviest_child(node, &childnum); find_heaviest_child(node, &childnum);
assert(toku_fifo_n_entries(BNC_BUFFER(node, childnum))>0); assert(toku_fifo_n_entries(BNC_BUFFER(node, childnum))>0);
...@@ -2345,8 +2369,9 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive ...@@ -2345,8 +2369,9 @@ flush_some_child (BRT t, BRTNODE node, BOOL is_first_flush, BOOL flush_recursive
// NOTE(review): this is a side-by-side diff view -- each row shows the old text followed by the new text;
// a row with a single statement exists on only one side of the change.
//
// assert_leaf_up_to_date: debugging check that `node` is a leaf (height 0) and that the
// soft-copy-is-up-to-date flag is set on every one of its n_children basement nodes.
static void assert_leaf_up_to_date(BRTNODE node) { static void assert_leaf_up_to_date(BRTNODE node) {
assert(node->height == 0); assert(node->height == 0);
// new side only: additionally calls toku_assert_entire_node_in_memory before the per-basement-node flags are read
toku_assert_entire_node_in_memory(node);
for (int i=0; i < node->n_children; i++) { for (int i=0; i < node->n_children; i++) {
// old side reads the flag straight from node->u.l.bn[i]; new side goes through the BLB_SOFTCOPYISUPTODATE accessor
assert(node->u.l.bn[i].soft_copy_is_up_to_date); assert(BLB_SOFTCOPYISUPTODATE(node, i));
} }
} }
...@@ -2359,15 +2384,17 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, ...@@ -2359,15 +2384,17 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
// we are allowed to flush only one child. // we are allowed to flush only one child.
// For this version, flush_this_child cannot release the lock during I/O, but it does need the ancestor information so that it can apply messages when a page comes in. // For this version, flush_this_child cannot release the lock during I/O, but it does need the ancestor information so that it can apply messages when a page comes in.
{ {
toku_assert_entire_node_in_memory(node);
struct ancestors next_ancestors = {node, childnum, ancestors}; struct ancestors next_ancestors = {node, childnum, ancestors};
const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds); const struct pivot_bounds next_bounds = next_pivot_keys(node, childnum, bounds);
assert(node->height>0); assert(node->height>0);
BLOCKNUM targetchild = BNC_BLOCKNUM(node, childnum); BLOCKNUM targetchild = BP_BLOCKNUM(node, childnum);
toku_verify_blocknum_allocated(t->h->blocktable, targetchild); toku_verify_blocknum_allocated(t->h->blocktable, targetchild);
u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum);
BRTNODE child; BRTNODE child;
toku_pin_brtnode_holding_lock(t, targetchild, childfullhash, &next_ancestors, &next_bounds, &child); // get that child node in, and apply the ancestor messages if it's a leaf. toku_pin_brtnode_holding_lock(t, targetchild, childfullhash, &next_ancestors, &next_bounds, &child); // get that child node in, and apply the ancestor messages if it's a leaf.
toku_assert_entire_node_in_memory(node);
assert(child->thisnodename.b!=0); assert(child->thisnodename.b!=0);
VERIFY_NODE(t, child); VERIFY_NODE(t, child);
...@@ -2384,11 +2411,10 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, ...@@ -2384,11 +2411,10 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
XIDS xids; XIDS xids;
while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) { while(0==toku_fifo_peek(fifo, &key, &keylen, &val, &vallen, &type, &msn, &xids)) {
int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids)); int n_bytes_removed = (keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD + xids_get_serialize_size(xids));
int r = toku_fifo_deq(fifo); int r = toku_fifo_deq(fifo);
assert(r==0); assert(r==0);
node->u.n.n_bytes_in_buffers -= n_bytes_removed;
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed; BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
} }
...@@ -2425,7 +2451,6 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re, ...@@ -2425,7 +2451,6 @@ flush_this_child (BRT t, BRTNODE node, int childnum, enum reactivity *child_re,
assert(r==0); assert(r==0);
} }
node->u.n.n_bytes_in_buffers -= n_bytes_removed;
BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed; BNC_NBYTESINBUF(node, childnum) -= n_bytes_removed;
node->dirty = 1; node->dirty = 1;
...@@ -2460,6 +2485,7 @@ brtnode_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2460,6 +2485,7 @@ brtnode_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd)
// If NODE is a nonleaf, then push the cmd into the FIFO(s) of the relevant child(ren). // If NODE is a nonleaf, then push the cmd into the FIFO(s) of the relevant child(ren).
// The node may become overfull. That's not our problem. // The node may become overfull. That's not our problem.
{ {
toku_assert_entire_node_in_memory(node);
if (node->height==0) { if (node->height==0) {
// we need to make sure that after doing all the put_cmd operations // we need to make sure that after doing all the put_cmd operations
// that the tree above is completely flushed out, // that the tree above is completely flushed out,
...@@ -2484,26 +2510,27 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd) ...@@ -2484,26 +2510,27 @@ brtnode_nonleaf_put_cmd_at_root (BRT t, BRTNODE node, BRT_MSG cmd)
// Requires: node is not a leaf. // Requires: node is not a leaf.
{ {
assert(node->height>0); assert(node->height>0);
toku_assert_entire_node_in_memory(node);
brt_nonleaf_put_cmd(t, node, cmd); brt_nonleaf_put_cmd(t, node, cmd);
} }
void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, int *made_change) { void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, int *made_change) {
VERIFY_NODE(t, node); VERIFY_NODE(t, node);
// ignore messages that have already been applied to this leaf // ignore messages that have already been applied to this leaf
if (cmd->msn.msn <= node->max_msn_applied_to_node.msn) { if (cmd->msn.msn <= node->max_msn_applied_to_node_in_memory.msn) {
// TODO3514 add accountability counter here // TODO3514 add accountability counter here
return; return;
} }
else { else {
node->max_msn_applied_to_node = cmd->msn; node->max_msn_applied_to_node_in_memory = cmd->msn;
} }
if (brt_msg_applies_once(cmd)) { if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
brt_leaf_put_cmd( brt_leaf_put_cmd(
t, t,
&node->u.l.bn[childnum], (BASEMENTNODE)node->bp[childnum].ptr,
&node->subtree_estimates[childnum], &BP_SUBTREE_EST(node, childnum),
cmd, cmd,
made_change made_change
); );
...@@ -2513,8 +2540,8 @@ void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, int *made_change) ...@@ -2513,8 +2540,8 @@ void toku_apply_cmd_to_leaf(BRT t, BRTNODE node, BRT_MSG cmd, int *made_change)
for (int childnum=0; childnum<node->n_children; childnum++) { for (int childnum=0; childnum<node->n_children; childnum++) {
brt_leaf_put_cmd( brt_leaf_put_cmd(
t, t,
&node->u.l.bn[childnum], (BASEMENTNODE)node->bp[childnum].ptr,
&node->subtree_estimates[childnum], &BP_SUBTREE_EST(node,childnum),
cmd, cmd,
&bn_made_change &bn_made_change
); );
...@@ -2547,6 +2574,7 @@ static void push_something_at_root (BRT brt, BRTNODE *nodep, BRT_MSG cmd) ...@@ -2547,6 +2574,7 @@ static void push_something_at_root (BRT brt, BRTNODE *nodep, BRT_MSG cmd)
// Note: During the initial descent, we may have gorged many nonleaf nodes. We wish to flush only one nonleaf node at each level. // Note: During the initial descent, we may have gorged many nonleaf nodes. We wish to flush only one nonleaf node at each level.
{ {
BRTNODE node = *nodep; BRTNODE node = *nodep;
toku_assert_entire_node_in_memory(node);
if (node->height==0) { if (node->height==0) {
// Must special case height 0, since brtnode_put_cmd() doesn't modify leaves. // Must special case height 0, since brtnode_put_cmd() doesn't modify leaves.
// Part of the problem is: if the node is in memory, then it was updated as part of the in-memory operation. // Part of the problem is: if the node is in memory, then it was updated as part of the in-memory operation.
...@@ -2609,12 +2637,12 @@ static void apply_cmd_to_in_memory_non_root_leaves ( ...@@ -2609,12 +2637,12 @@ static void apply_cmd_to_in_memory_non_root_leaves (
if (brt_msg_applies_once(cmd)) { if (brt_msg_applies_once(cmd)) {
unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t); unsigned int childnum = toku_brtnode_which_child(node, cmd->u.id.key, t);
u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum); u_int32_t child_fullhash = compute_child_fullhash(t->cf, node, childnum);
apply_cmd_to_in_memory_non_root_leaves(t, BNC_BLOCKNUM(node, childnum), child_fullhash, cmd, FALSE, node, childnum); apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), child_fullhash, cmd, FALSE, node, childnum);
} }
else if (brt_msg_applies_all(cmd)) { else if (brt_msg_applies_all(cmd)) {
for (int childnum=0; childnum<node->n_children; childnum++) { for (int childnum=0; childnum<node->n_children; childnum++) {
assert(BNC_HAVE_FULLHASH(node, childnum)); assert(BP_HAVE_FULLHASH(node, childnum));
apply_cmd_to_in_memory_non_root_leaves(t, BNC_BLOCKNUM(node, childnum), BNC_FULLHASH(node, childnum), cmd, FALSE, node, childnum); apply_cmd_to_in_memory_non_root_leaves(t, BP_BLOCKNUM(node, childnum), BP_FULLHASH(node, childnum), cmd, FALSE, node, childnum);
} }
} }
else if (brt_msg_does_nothing(cmd)) { else if (brt_msg_does_nothing(cmd)) {
...@@ -2666,7 +2694,8 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -2666,7 +2694,8 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
// get the root node // get the root node
toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &node); toku_pin_brtnode_holding_lock(brt, *rootp, fullhash, NULL, &infinite_bounds, &node);
cmd->msn.msn = node->max_msn_applied_to_node.msn + 1; toku_assert_entire_node_in_memory(node);
cmd->msn.msn = node->max_msn_applied_to_node_in_memory.msn + 1;
// Note, the lower level function that filters messages based on msn, // Note, the lower level function that filters messages based on msn,
// (brt_leaf_put_cmd() or brt_nonleaf_put_cmd()) will capture the msn and // (brt_leaf_put_cmd() or brt_nonleaf_put_cmd()) will capture the msn and
// store it in the relevant node, including the root node. This is how the // store it in the relevant node, including the root node. This is how the
...@@ -2678,7 +2707,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd) ...@@ -2678,7 +2707,7 @@ toku_brt_root_put_cmd (BRT brt, BRT_MSG_S * cmd)
push_something_at_root(brt, &node, cmd); push_something_at_root(brt, &node, cmd);
// verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock) // verify that msn of latest message was captured in root node (push_something_at_root() did not release ydb lock)
invariant(cmd->msn.msn == node->max_msn_applied_to_node.msn); invariant(cmd->msn.msn == node->max_msn_applied_to_node_in_memory.msn);
apply_cmd_to_in_memory_non_root_leaves(brt, *rootp, fullhash, cmd, TRUE, NULL, -1); apply_cmd_to_in_memory_non_root_leaves(brt, *rootp, fullhash, cmd, TRUE, NULL, -1);
if (node->height > 0 && nonleaf_node_is_gorged(node)) { if (node->height > 0 && nonleaf_node_is_gorged(node)) {
...@@ -3124,12 +3153,7 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) { ...@@ -3124,12 +3153,7 @@ static int setup_initial_brt_root_node (BRT t, BLOCKNUM blocknum) {
assert(node); assert(node);
//printf("%s:%d\n", __FILE__, __LINE__); //printf("%s:%d\n", __FILE__, __LINE__);
initialize_empty_brtnode(t, node, blocknum, 0, 1); initialize_empty_brtnode(t, node, blocknum, 0, 1);
// node->brt = t; BP_STATE(node,0) = PT_AVAIL;
if (0) {
printf("%s:%d for tree %p node %p \n", __FILE__, __LINE__, t, node);
printf("%s:%d put root at %" PRId64 "\n", __FILE__, __LINE__, blocknum.b);
}
//printf("%s:%d putting %p (%lld)\n", __FILE__, __LINE__, node, node->thisnodename);
u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum); u_int32_t fullhash = toku_cachetable_hash(t->cf, blocknum);
node->fullhash = fullhash; node->fullhash = fullhash;
r=toku_cachetable_put(t->cf, blocknum, fullhash, r=toku_cachetable_put(t->cf, blocknum, fullhash,
...@@ -4643,6 +4667,7 @@ apply_buffer_messages_to_node ( ...@@ -4643,6 +4667,7 @@ apply_buffer_messages_to_node (
ubi_ptr = &ubi; ubi_ptr = &ubi;
} }
int made_change; int made_change;
assert(BP_STATE(ancestor,childnum) == PT_AVAIL);
FIFO_ITERATE(BNC_BUFFER(ancestor, childnum), key, keylen, val, vallen, type, msn, xids, FIFO_ITERATE(BNC_BUFFER(ancestor, childnum), key, keylen, val, vallen, type, msn, xids,
({ ({
DBT hk; DBT hk;
...@@ -4670,13 +4695,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -4670,13 +4695,13 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
// need to apply messages to each basement node // need to apply messages to each basement node
// TODO: (Zardosht) cilkify this, watch out for setting of max_msn_applied_to_node // TODO: (Zardosht) cilkify this, watch out for setting of max_msn_applied_to_node
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
if (node->u.l.bn[i].soft_copy_is_up_to_date) { if (BP_STATE(node,i) != PT_AVAIL || BLB_SOFTCOPYISUPTODATE(node, i)) {
continue; continue;
} }
update_stats = TRUE; update_stats = TRUE;
int height = 0; int height = 0;
BASEMENTNODE curr_bn = &node->u.l.bn[i]; BASEMENTNODE curr_bn = (BASEMENTNODE)node->bp[i].ptr;
SUBTREE_EST curr_se = &node->subtree_estimates[i]; SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
ANCESTORS curr_ancestors = ancestors; ANCESTORS curr_ancestors = ancestors;
struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds); struct pivot_bounds curr_bounds = next_pivot_keys(node, i, bounds);
while (curr_ancestors) { while (curr_ancestors) {
...@@ -4688,15 +4713,15 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors ...@@ -4688,15 +4713,15 @@ maybe_apply_ancestors_messages_to_node (BRT t, BRTNODE node, ANCESTORS ancestors
curr_ancestors->node, curr_ancestors->node,
curr_ancestors->childnum, curr_ancestors->childnum,
height, height,
node->max_msn_applied_to_node, node->max_msn_applied_to_node_on_disk,
&curr_bounds &curr_bounds
); );
if (curr_ancestors->node->max_msn_applied_to_node.msn > node->max_msn_applied_to_node.msn) { if (curr_ancestors->node->max_msn_applied_to_node_in_memory.msn > node->max_msn_applied_to_node_in_memory.msn) {
node->max_msn_applied_to_node = curr_ancestors->node->max_msn_applied_to_node; node->max_msn_applied_to_node_in_memory = curr_ancestors->node->max_msn_applied_to_node_in_memory;
} }
curr_ancestors= curr_ancestors->next; curr_ancestors= curr_ancestors->next;
} }
node->u.l.bn[i].soft_copy_is_up_to_date = TRUE; BLB_SOFTCOPYISUPTODATE(node, i) = TRUE;
} }
// Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are // Must update the leaf estimates. Might as well use the estimates from the soft copy (even if they make it out to disk), since they are
...@@ -4815,7 +4840,7 @@ static int ...@@ -4815,7 +4840,7 @@ static int
brt_search_node (BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, BOOL *doprefetch, BRT_CURSOR brtcursor, UNLOCKERS unlockers, ANCESTORS, struct pivot_bounds const * const bounds); brt_search_node (BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, BOOL *doprefetch, BRT_CURSOR brtcursor, UNLOCKERS unlockers, ANCESTORS, struct pivot_bounds const * const bounds);
// the number of nodes to prefetch // the number of nodes to prefetch
#define TOKU_DO_PREFETCH 2 #define TOKU_DO_PREFETCH 0
#if TOKU_DO_PREFETCH #if TOKU_DO_PREFETCH
static void static void
...@@ -4829,7 +4854,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso ...@@ -4829,7 +4854,7 @@ brt_node_maybe_prefetch(BRT brt, BRTNODE node, int childnum, BRT_CURSOR brtcurso
int nextchildnum = childnum+i+1; int nextchildnum = childnum+i+1;
if (nextchildnum >= node->n_children) if (nextchildnum >= node->n_children)
break; break;
BLOCKNUM nextchildblocknum = BNC_BLOCKNUM(node, nextchildnum); BLOCKNUM nextchildblocknum = BP_BLOCKNUM(node, nextchildnum);
u_int32_t nextfullhash = compute_child_fullhash(brt->cf, node, nextchildnum); u_int32_t nextfullhash = compute_child_fullhash(brt->cf, node, nextchildnum);
toku_cachefile_prefetch(brt->cf, nextchildblocknum, nextfullhash, toku_cachefile_prefetch(brt->cf, nextchildblocknum, nextfullhash,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, brt->h); toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, brt->h);
...@@ -4863,7 +4888,7 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ ...@@ -4863,7 +4888,7 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
{ {
struct ancestors next_ancestors = {node, childnum, ancestors}; struct ancestors next_ancestors = {node, childnum, ancestors};
BLOCKNUM childblocknum = BNC_BLOCKNUM(node,childnum); BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum); u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum);
BRTNODE childnode; BRTNODE childnode;
{ {
...@@ -4899,82 +4924,134 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_ ...@@ -4899,82 +4924,134 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
} }
// Search helpers defined in this span:
//   brt_search_which_child  - builds the child visiting order for search->direction, then returns the
//                             first child whose pivot passes search_pivot_is_bounded() and
//                             search->compare(); when no pivot matches, it returns the last child in
//                             visiting order.
//   maybe_search_save_bound - computes pivot index p (child_searched for BRT_SEARCH_LEFT, otherwise
//                             child_searched - 1) and, when 0 <= p < n_children-1, hands that pivot key
//                             to search_save_bound().
//   brt_search_node         - starts at the child chosen by brt_search_which_child and keeps stepping in
//                             the search direction while the sub-search returns DB_NOTFOUND; any other
//                             result (0 or an error/early-exit code) is returned immediately.
static int static int
brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v, BOOL *doprefetch, BRT_CURSOR brtcursor, UNLOCKERS unlockers, brt_search_which_child(
ANCESTORS ancestors, struct pivot_bounds const * const bounds) BRT brt,
BRTNODE node,
brt_search_t *search
)
{ {
int count=0; int c;
count++; DBT pivotkey;
int r; toku_init_dbt(&pivotkey);
{ /* binary search is overkill for a small array */
int c; int child[node->n_children];
/* binary search is overkill for a small array */ /* scan left to right or right to left depending on the search direction */
int child[node->n_children]; for (c = 0; c < node->n_children; c++) {
child[c] = (search->direction == BRT_SEARCH_LEFT) ? c : node->n_children - 1 - c;
/* scan left to right or right to left depending on the search direction */ }
for (c = 0; c < node->n_children; c++) for (c = 0; c < node->n_children-1; c++) {
child[c] = (search->direction == BRT_SEARCH_LEFT) ? c : node->n_children - 1 - c; int p = (search->direction == BRT_SEARCH_LEFT) ? child[c] : child[c] - 1;
struct kv_pair *pivot = node->childkeys[p];
DBT prevpivotkey; toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
for (c = 0; c < node->n_children-1; c++) { if (search_pivot_is_bounded(search, brt, &pivotkey) && search->compare(search, &pivotkey)) {
int p = (search->direction == BRT_SEARCH_LEFT) ? child[c] : child[c] - 1; return child[c];
struct kv_pair *pivot = node->childkeys[p]; }
DBT pivotkey; }
toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)); /* check the first (left) or last (right) node if nothing has been found */
// if (search->have_pivot_bound) printf("%*scomparing tree pivot %s to saved pivot %s %s(%ld)\n", 9-node->height, "", (char*)pivotkey.data, (char*)search->pivot_bound.data, search_pivot_is_bounded(search, brt, &pivotkey) ? "continue" : "skip", BNC_BLOCKNUM(node, child[c]).b); return child[c];
if (search_pivot_is_bounded(search, brt, &pivotkey) }
&& search->compare(search, &pivotkey)) {
const struct pivot_bounds next_bounds = next_pivot_keys(node, child[c], bounds);
if (node->height > 0) {
r = brt_search_child(brt, node, child[c], search, getf, getf_v, doprefetch, brtcursor, unlockers, ancestors, &next_bounds);
}
else {
r = brt_search_basement_node(
&node->u.l.bn[child[c]],
search,
getf,
getf_v,
doprefetch,
brtcursor
);
}
assert(r != EAGAIN);
if (r == 0) return r; //Success
if (r != DB_NOTFOUND) {
return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
} else {
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
// for point queries, it's overkill. If we got a DB_NOTFOUND on a point query then we should just stop looking.
// When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object.
// printf("%*ssave_bound %s\n", 9-node->height, "", (char*)pivotkey.data);
search_save_bound(search, &pivotkey);
}
} static void
prevpivotkey = pivotkey; maybe_search_save_bound(
} BRTNODE node,
int child_searched,
brt_search_t *search
)
{
DBT pivotkey;
toku_init_dbt(&pivotkey);
/* check the first (left) or last (right) node if nothing has been found */ int p = (search->direction == BRT_SEARCH_LEFT) ? child_searched : child_searched - 1;
const struct pivot_bounds next_bounds = next_pivot_keys(node, node->n_children-1, bounds); if (p >=0 && p < node->n_children-1) {
if (node->height > 0) { struct kv_pair *pivot = node->childkeys[p];
r = brt_search_child(brt, node, child[c], search, getf, getf_v, doprefetch, brtcursor, unlockers, ancestors, &next_bounds); toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot));
} search_save_bound(search, &pivotkey);
else { }
r = brt_search_basement_node( }
&node->u.l.bn[child[c]],
search, static int
getf, brt_search_node(
getf_v, BRT brt,
doprefetch, BRTNODE node,
brtcursor brt_search_t *search,
); BRT_GET_CALLBACK_FUNCTION getf,
} void *getf_v,
return r; BOOL *doprefetch,
BRT_CURSOR brtcursor,
UNLOCKERS unlockers,
ANCESTORS ancestors,
struct pivot_bounds const * const bounds
)
{ int r;
int child_to_search = brt_search_which_child(brt, node, search);
// The chosen index is used to subscript the partition array right below, so BOTH bounds must hold.
// Joining the two comparisons with || would be true for every integer and check nothing.
assert(child_to_search >= 0 && child_to_search < node->n_children);
//
// At this point, we must have the necessary partition available to continue the search
//
assert(BP_STATE(node,child_to_search) == PT_AVAIL);
while (child_to_search >= 0 && child_to_search < node->n_children) {
const struct pivot_bounds next_bounds = next_pivot_keys(node, child_to_search, bounds);
if (node->height > 0) {
r = brt_search_child(
brt,
node,
child_to_search,
search,
getf,
getf_v,
doprefetch,
brtcursor,
unlockers,
ancestors,
&next_bounds
);
}
else {
r = brt_search_basement_node(
(BASEMENTNODE)node->bp[child_to_search].ptr,
search,
getf,
getf_v,
doprefetch,
brtcursor
);
}
if (r == 0) return r; //Success
if (r != DB_NOTFOUND) {
return r; //Error (or message to quit early, such as TOKUDB_FOUND_BUT_REJECTED or TOKUDB_TRY_AGAIN)
}
// we have a new pivotkey
else {
// If we got a DB_NOTFOUND then we have to search the next record. Possibly everything present is not visible.
// This way of doing DB_NOTFOUND is a kludge, and ought to be simplified. Something like this is needed for DB_NEXT, but
// for point queries, it's overkill. If we got a DB_NOTFOUND on a point query then we should just stop looking.
// When releasing locks on I/O we must not search the same subtree again, or we won't be guaranteed to make forward progress.
// If we got a DB_NOTFOUND, then the pivot is too small if searching from left to right (too large if searching from right to left).
// So save the pivot key in the search object.
// printf("%*ssave_bound %s\n", 9-node->height, "", (char*)pivotkey.data);
maybe_search_save_bound(
node,
child_to_search,
search
);
}
// not really necessary, just put this here so that reading the
// code becomes simpler. The point is at this point in the code,
// we know that we got DB_NOTFOUND and we have to continue
assert(r == DB_NOTFOUND);
// TODO: (Zardosht), if the necessary partition is not available, we need to return and get the partition
if (search->direction == BRT_SEARCH_LEFT) {
child_to_search++;
}
else {
child_to_search--;
}
} }
return r;
} }
static int static int
...@@ -5498,7 +5575,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, ...@@ -5498,7 +5575,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename,
for (i=0; i<node->n_children; i++) { for (i=0; i<node->n_children; i++) {
int prevcomp = (i==0) ? -1 : compares[i-1]; int prevcomp = (i==0) ? -1 : compares[i-1];
int nextcomp = (i+1 >= n_keys) ? 1 : compares[i]; int nextcomp = (i+1 >= n_keys) ? 1 : compares[i];
u_int64_t subest = node->subtree_estimates[i].ndata; u_int64_t subest = BP_SUBTREE_EST(node,i).ndata;
if (nextcomp < 0) { if (nextcomp < 0) {
// We're definitely looking too far to the left // We're definitely looking too far to the left
*less += subest; *less += subest;
...@@ -5512,14 +5589,14 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, ...@@ -5512,14 +5589,14 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename,
// nextcomp>=0 and prevcomp<=0, so something in the subtree could match // nextcomp>=0 and prevcomp<=0, so something in the subtree could match
// but they are not both zero, so it's not the whole subtree, so we need to recurse // but they are not both zero, so it's not the whole subtree, so we need to recurse
if (node->height > 0) { if (node->height > 0) {
toku_brt_keyrange_internal(brt, BNC_BLOCKNUM(node, i), compute_child_fullhash(brt->cf, node, i), key, less, equal, greater); toku_brt_keyrange_internal(brt, BP_BLOCKNUM(node, i), compute_child_fullhash(brt->cf, node, i), key, less, equal, greater);
} }
else { else {
struct cmd_leafval_heaviside_extra be = {brt, key}; struct cmd_leafval_heaviside_extra be = {brt, key};
u_int32_t idx; u_int32_t idx;
int r = toku_omt_find_zero(node->u.l.bn[i].buffer, toku_cmd_leafval_heaviside, &be, 0, &idx, NULL); int r = toku_omt_find_zero(BLB_BUFFER(node, i), toku_cmd_leafval_heaviside, &be, 0, &idx, NULL);
*less += idx; *less += idx;
*greater += toku_omt_size(node->u.l.bn[i].buffer)-idx; *greater += toku_omt_size(BLB_BUFFER(node, i))-idx;
if (r==0) { if (r==0) {
(*greater)--; (*greater)--;
(*equal)++; (*equal)++;
...@@ -5564,7 +5641,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) { ...@@ -5564,7 +5641,7 @@ int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), struct brtstat64_s *s) {
s->nkeys = s->ndata = s->dsize = 0; s->nkeys = s->ndata = s->dsize = 0;
int i; int i;
for (i=0; i<node->n_children; i++) { for (i=0; i<node->n_children; i++) {
SUBTREE_EST se = &node->subtree_estimates[i]; SUBTREE_EST se = &BP_SUBTREE_EST(node,i);
s->nkeys += se->nkeys; s->nkeys += se->nkeys;
s->ndata += se->ndata; s->ndata += se->ndata;
s->dsize += se->dsize; s->dsize += se->dsize;
...@@ -5593,9 +5670,6 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -5593,9 +5670,6 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
fprintf(file, "%*sNode %"PRId64" nodesize=%u height=%d n_children=%d keyrange=%s %s\n", fprintf(file, "%*sNode %"PRId64" nodesize=%u height=%d n_children=%d keyrange=%s %s\n",
depth, "", blocknum.b, node->nodesize, node->height, node->n_children, (char*)(lorange ? kv_pair_key(lorange) : 0), (char*)(hirange ? kv_pair_key(hirange) : 0)); depth, "", blocknum.b, node->nodesize, node->height, node->n_children, (char*)(lorange ? kv_pair_key(lorange) : 0), (char*)(hirange ? kv_pair_key(hirange) : 0));
if (node->height > 0) {
fprintf(file, " n_bytes_in_buffers=%u\n", node->u.n.n_bytes_in_buffers);
}
{ {
int i; int i;
for (i=0; i+1< node->n_children; i++) { for (i=0; i+1< node->n_children; i++) {
...@@ -5605,7 +5679,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -5605,7 +5679,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
} }
for (i=0; i< node->n_children; i++) { for (i=0; i< node->n_children; i++) {
{ {
SUBTREE_EST e = &node->subtree_estimates[i]; SUBTREE_EST e = &BP_SUBTREE_EST(node,i);
fprintf(file, " est={n=%" PRIu64 " k=%" PRIu64 " s=%" PRIu64 " e=%d}", fprintf(file, " est={n=%" PRIu64 " k=%" PRIu64 " s=%" PRIu64 " e=%d}",
e->ndata, e->nkeys, e->dsize, (int)e->exact); e->ndata, e->nkeys, e->dsize, (int)e->exact);
} }
...@@ -5621,11 +5695,11 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -5621,11 +5695,11 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
}); });
} }
else { else {
int size = toku_omt_size(node->u.l.bn[i].buffer); int size = toku_omt_size(BLB_BUFFER(node, i));
if (0) if (0)
for (int j=0; j<size; j++) { for (int j=0; j<size; j++) {
OMTVALUE v = 0; OMTVALUE v = 0;
r = toku_omt_fetch(node->u.l.bn[i].buffer, j, &v, 0); r = toku_omt_fetch(BLB_BUFFER(node, i), j, &v, 0);
assert_zero(r); assert_zero(r);
fprintf(file, " [%d]=", j); fprintf(file, " [%d]=", j);
print_leafentry(file, v); print_leafentry(file, v);
...@@ -5642,7 +5716,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_ ...@@ -5642,7 +5716,7 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, struct kv_
char *key = node->childkeys[i-1]->key; char *key = node->childkeys[i-1]->key;
fprintf(file, "%*spivot %d len=%u %u\n", depth+1, "", i-1, node->childkeys[i-1]->keylen, (unsigned)toku_dtoh32(*(int*)key)); fprintf(file, "%*spivot %d len=%u %u\n", depth+1, "", i-1, node->childkeys[i-1]->keylen, (unsigned)toku_dtoh32(*(int*)key));
} }
toku_dump_brtnode(file, brt, BNC_BLOCKNUM(node, i), depth+4, toku_dump_brtnode(file, brt, BP_BLOCKNUM(node, i), depth+4,
(i==0) ? lorange : node->childkeys[i-1], (i==0) ? lorange : node->childkeys[i-1],
(i==node->n_children-1) ? hirange : node->childkeys[i]); (i==node->n_children-1) ? hirange : node->childkeys[i]);
} }
...@@ -5880,12 +5954,14 @@ toku_brt_get_fragmentation(BRT brt, TOKU_DB_FRAGMENTATION report) { ...@@ -5880,12 +5954,14 @@ toku_brt_get_fragmentation(BRT brt, TOKU_DB_FRAGMENTATION report) {
static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) { static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) {
if (node->height > 0) { if (node->height > 0) {
if (node->u.n.n_bytes_in_buffers!=0) return 0; // it's not empty if there are bytes in buffers
for (int childnum=0; childnum<node->n_children; childnum++) { for (int childnum=0; childnum<node->n_children; childnum++) {
if (BNC_NBYTESINBUF(node, childnum) != 0) {
return 0; // it's not empty if there are bytes in buffers
}
BRTNODE childnode; BRTNODE childnode;
{ {
void *node_v; void *node_v;
BLOCKNUM childblocknum = BNC_BLOCKNUM(node,childnum); BLOCKNUM childblocknum = BP_BLOCKNUM(node,childnum);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum); u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum);
int rr = toku_cachetable_get_and_pin(brt->cf, childblocknum, fullhash, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, brt->h); int rr = toku_cachetable_get_and_pin(brt->cf, childblocknum, fullhash, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, toku_brtnode_pe_callback, brt->h);
assert(rr ==0); assert(rr ==0);
...@@ -5899,7 +5975,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) { ...@@ -5899,7 +5975,7 @@ static BOOL is_empty_fast_iter (BRT brt, BRTNODE node) {
} else { } else {
// leaf: If the omt is empty, we are happy. // leaf: If the omt is empty, we are happy.
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
if (toku_omt_size(node->u.l.bn[i].buffer)) { if (toku_omt_size(BLB_BUFFER(node, i))) {
return FALSE; return FALSE;
} }
} }
......
...@@ -136,19 +136,16 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -136,19 +136,16 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
printf(" layout_version_original=%d\n", n->layout_version_original); printf(" layout_version_original=%d\n", n->layout_version_original);
printf(" layout_version_read_from_disk=%d\n", n->layout_version_read_from_disk); printf(" layout_version_read_from_disk=%d\n", n->layout_version_read_from_disk);
printf(" build_id=%d\n", n->build_id); printf(" build_id=%d\n", n->build_id);
printf(" max_msn_applied_to_node=%"PRId64" (0x%"PRIx64")\n", n->max_msn_applied_to_node.msn, n->max_msn_applied_to_node.msn); printf(" max_msn_applied_to_node_on_disk=%"PRId64" (0x%"PRIx64")\n", n->max_msn_applied_to_node_on_disk.msn, n->max_msn_applied_to_node_on_disk.msn);
printf(" n_children=%d\n", n->n_children); printf(" n_children=%d\n", n->n_children);
printf(" total_childkeylens=%u\n", n->totalchildkeylens); printf(" total_childkeylens=%u\n", n->totalchildkeylens);
if (n->height > 0) {
printf(" n_bytes_in_buffers=%u\n", n->u.n.n_bytes_in_buffers);
}
int i; int i;
printf(" subleafentry_estimates={"); printf(" subleafentry_estimates={");
for (i=0; i<n->n_children; i++) { for (i=0; i<n->n_children; i++) {
if (i>0) printf(" "); if (i>0) printf(" ");
struct subtree_estimates *est = &n->subtree_estimates[i]; struct subtree_estimates *est = &BP_SUBTREE_EST(n,i);
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F"); printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F");
} }
printf("}\n"); printf("}\n");
...@@ -163,7 +160,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -163,7 +160,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
printf(" children:\n"); printf(" children:\n");
for (i=0; i<n->n_children; i++) { for (i=0; i<n->n_children; i++) {
if (n->height > 0) { if (n->height > 0) {
printf(" child %d: %" PRId64 "\n", i, BNC_BLOCKNUM(n, i).b); printf(" child %d: %" PRId64 "\n", i, BP_BLOCKNUM(n, i).b);
unsigned int n_bytes = BNC_NBYTESINBUF(n, i); unsigned int n_bytes = BNC_NBYTESINBUF(n, i);
int n_entries = toku_fifo_n_entries(BNC_BUFFER(n, i)); int n_entries = toku_fifo_n_entries(BNC_BUFFER(n, i));
if (n_bytes > 0 || n_entries > 0) { if (n_bytes > 0 || n_entries > 0) {
...@@ -204,10 +201,10 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -204,10 +201,10 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
); );
} }
} else { } else {
printf(" optimized_for_upgrade=%u\n", n->u.l.bn[i].optimized_for_upgrade); printf(" optimized_for_upgrade=%u\n", BLB_OPTIMIZEDFORUPGRADE(n, i));
printf(" n_bytes_in_buffer=%u\n", n->u.l.bn[i].n_bytes_in_buffer); printf(" n_bytes_in_buffer=%u\n", BLB_NBYTESINBUF(n, i));
printf(" items_in_buffer =%u\n", toku_omt_size(n->u.l.bn[i].buffer)); printf(" items_in_buffer =%u\n", toku_omt_size(BLB_BUFFER(n, i)));
if (dump_data) toku_omt_iterate(n->u.l.bn[i].buffer, print_le, 0); if (dump_data) toku_omt_iterate(BLB_BUFFER(n, i), print_le, 0);
} }
} }
toku_brtnode_free(&n); toku_brtnode_free(&n);
......
...@@ -3104,7 +3104,8 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu ...@@ -3104,7 +3104,8 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu
node->layout_version = BRT_LAYOUT_VERSION; node->layout_version = BRT_LAYOUT_VERSION;
node->layout_version_original = BRT_LAYOUT_VERSION; node->layout_version_original = BRT_LAYOUT_VERSION;
node->build_id = BUILD_ID; node->build_id = BUILD_ID;
node->max_msn_applied_to_node = MIN_MSN; node->max_msn_applied_to_node_on_disk = MIN_MSN;
node->max_msn_applied_to_node_in_memory = MIN_MSN;
node->height=height; node->height=height;
node->n_children = n_children; node->n_children = n_children;
node->flags = 0; node->flags = 0;
...@@ -3122,21 +3123,19 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu ...@@ -3122,21 +3123,19 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu
node->childkeys[i] = childkey; node->childkeys[i] = childkey;
totalchildkeylens += kv_pair_keylen(childkey); totalchildkeylens += kv_pair_keylen(childkey);
} }
node->u.n.n_bytes_in_buffers = 0;
node->totalchildkeylens = totalchildkeylens; node->totalchildkeylens = totalchildkeylens;
XMALLOC_N(n_children, node->u.n.childinfos); XMALLOC_N(n_children, node->bp);
XMALLOC_N(n_children, node->subtree_estimates);
for (int i=0; i<n_children; i++) { for (int i=0; i<n_children; i++) {
struct brtnode_nonleaf_childinfo *ci = &node->u.n.childinfos[i]; node->bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_nonleaf_childinfo));
ci->blocknum = make_blocknum(subtree_info[i].block); BP_BLOCKNUM(node,i)= make_blocknum(subtree_info[i].block);
node->subtree_estimates[i] = subtree_info[i].subtree_estimates; BP_SUBTREE_EST(node,i) = subtree_info[i].subtree_estimates;
ci->have_fullhash = FALSE; BP_HAVE_FULLHASH(node,i) = FALSE;
ci->fullhash = 0; BP_FULLHASH(node,i) = 0;
ci->buffer = NULL; BP_STATE(node,i) = PT_AVAIL;
int r = toku_fifo_create(&ci->buffer); int r = toku_fifo_create(&BNC_BUFFER(node,i));
if (r != 0) if (r != 0)
result = r; result = r;
ci->n_bytes_in_buffer = 0; BNC_NBYTESINBUF(node,i)= 0;
} }
if (result == 0) { if (result == 0) {
...@@ -3167,15 +3166,15 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu ...@@ -3167,15 +3166,15 @@ static void write_nonleaf_node (BRTLOADER bl, struct dbout *out, int64_t blocknu
toku_free(node->childkeys[i]); toku_free(node->childkeys[i]);
} }
for (int i=0; i<n_children; i++) { for (int i=0; i<n_children; i++) {
if (node->u.n.childinfos[i].buffer) { if (BNC_BUFFER(node, i)) {
toku_fifo_free(&node->u.n.childinfos[i].buffer); toku_fifo_free(&BNC_BUFFER(node, i));
node->u.n.childinfos[i].buffer = NULL; BNC_BUFFER(node, i) = NULL;
} }
toku_free(node->bp[i].ptr);
} }
toku_free(pivots); toku_free(pivots);
toku_free(node->u.n.childinfos); toku_free(node->bp);
toku_free(node->childkeys); toku_free(node->childkeys);
toku_free(node->subtree_estimates);
toku_free(node); toku_free(node);
toku_free(subtree_info); toku_free(subtree_info);
......
...@@ -21,6 +21,7 @@ extern "C" { ...@@ -21,6 +21,7 @@ extern "C" {
typedef struct brt *BRT; typedef struct brt *BRT;
typedef struct brtnode *BRTNODE; typedef struct brtnode *BRTNODE;
typedef struct brtnode_leaf_basement_node *BASEMENTNODE; typedef struct brtnode_leaf_basement_node *BASEMENTNODE;
typedef struct brtnode_nonleaf_childinfo *NONLEAF_CHILDINFO;
typedef struct subtree_estimates *SUBTREE_EST; typedef struct subtree_estimates *SUBTREE_EST;
struct brt_header; struct brt_header;
struct wbuf; struct wbuf;
......
...@@ -5,7 +5,8 @@ ...@@ -5,7 +5,8 @@
#include "includes.h" #include "includes.h"
#define TESTMSNVAL 0x1234567890123456 // arbitrary number #define TESTMSNDSKVAL 0x1234567890123456 // arbitrary number
#define TESTMSNMEMVAL 0x6543210987654321 // arbitrary number
#define MIN(x, y) (((x) < (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y))
...@@ -87,7 +88,8 @@ test_serialize_leaf_with_large_pivots(void) { ...@@ -87,7 +88,8 @@ test_serialize_leaf_with_large_pivots(void) {
// assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible // assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -95,6 +97,7 @@ test_serialize_leaf_with_large_pivots(void) { ...@@ -95,6 +97,7 @@ test_serialize_leaf_with_large_pivots(void) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = nrows; sn.n_children = nrows;
LEAFENTRY les[nrows]; LEAFENTRY les[nrows];
{ {
char key[keylens], val[vallens]; char key[keylens], val[vallens];
...@@ -105,23 +108,24 @@ test_serialize_leaf_with_large_pivots(void) { ...@@ -105,23 +108,24 @@ test_serialize_leaf_with_large_pivots(void) {
les[i] = le_fastmalloc((char *) &key, sizeof(key), (char *) &val, sizeof(val)); les[i] = le_fastmalloc((char *) &key, sizeof(key), (char *) &val, sizeof(val));
} }
} }
MALLOC_N(sn.n_children, sn.u.l.bn); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children, sn.subtree_estimates);
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.totalchildkeylens = (sn.n_children-1)*sizeof(int); sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
sn.subtree_estimates[i].ndata = random() + (((long long) random())<<32); BP_STATE(&sn,i) = PT_AVAIL;
sn.subtree_estimates[i].nkeys = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
sn.subtree_estimates[i].dsize = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
sn.subtree_estimates[i].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
r = toku_omt_create(&sn.u.l.bn[i].buffer); assert(r==0); BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
sn.u.l.bn[i].optimized_for_upgrade = BRT_LAYOUT_VERSION; sn.bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
sn.u.l.bn[i].soft_copy_is_up_to_date = TRUE; r = toku_omt_create(&BLB_BUFFER(&sn, i)); assert(r==0);
sn.u.l.bn[i].seqinsert = 0; BLB_OPTIMIZEDFORUPGRADE(&sn, i) = BRT_LAYOUT_VERSION;
BLB_SOFTCOPYISUPTODATE(&sn, i) = TRUE;
BLB_SEQINSERT(&sn, i) = 0;
} }
for (int i = 0; i < nrows; ++i) { for (int i = 0; i < nrows; ++i) {
r = toku_omt_insert(sn.u.l.bn[i].buffer, les[i], omt_cmp, les[i], NULL); assert(r==0); r = toku_omt_insert(BLB_BUFFER(&sn, i), les[i], omt_cmp, les[i], NULL); assert(r==0);
sn.u.l.bn[i].n_bytes_in_buffer = OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]); BLB_NBYTESINBUF(&sn, i) = OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]);
if (i < nrows-1) { if (i < nrows-1) {
u_int32_t keylen; u_int32_t keylen;
char *key = le_key_and_len(les[i], &keylen); char *key = le_key_and_len(les[i], &keylen);
...@@ -170,11 +174,11 @@ test_serialize_leaf_with_large_pivots(void) { ...@@ -170,11 +174,11 @@ test_serialize_leaf_with_large_pivots(void) {
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(toku_omt_size(dn->u.l.bn[i].buffer) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(dn->u.l.bn[i].buffer, check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
assert(dn->u.l.bn[i].optimized_for_upgrade == BRT_LAYOUT_VERSION); assert(BLB_OPTIMIZEDFORUPGRADE(dn, i) == BRT_LAYOUT_VERSION);
// don't check soft_copy_is_up_to_date or seqinsert // don't check soft_copy_is_up_to_date or seqinsert
assert(dn->u.l.bn[i].n_bytes_in_buffer == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+keylens+vallens) + toku_omt_size(dn->u.l.bn[i].buffer)); assert(BLB_NBYTESINBUF(dn, i) == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+keylens+vallens) + toku_omt_size(BLB_BUFFER(dn, i)));
last_i = extra.i; last_i = extra.i;
} }
assert(extra.i == nrows); assert(extra.i == nrows);
...@@ -185,14 +189,16 @@ test_serialize_leaf_with_large_pivots(void) { ...@@ -185,14 +189,16 @@ test_serialize_leaf_with_large_pivots(void) {
kv_pair_free(sn.childkeys[i]); kv_pair_free(sn.childkeys[i]);
} }
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
toku_omt_destroy(&sn.u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(&sn, i));
} }
for (int i = 0; i < nrows; ++i) { for (int i = 0; i < nrows; ++i) {
toku_free(les[i]); toku_free(les[i]);
} }
toku_free(sn.u.l.bn);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates); for (int i = 0; i < sn.n_children; i++) {
toku_free(sn.bp[i].ptr);
}
toku_free(sn.bp);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
...@@ -210,7 +216,8 @@ test_serialize_leaf_with_many_rows(void) { ...@@ -210,7 +216,8 @@ test_serialize_leaf_with_many_rows(void) {
// assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible // assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -218,6 +225,7 @@ test_serialize_leaf_with_many_rows(void) { ...@@ -218,6 +225,7 @@ test_serialize_leaf_with_many_rows(void) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 1; sn.n_children = 1;
LEAFENTRY les[nrows]; LEAFENTRY les[nrows];
{ {
int key = 0, val = 0; int key = 0, val = 0;
...@@ -225,24 +233,25 @@ test_serialize_leaf_with_many_rows(void) { ...@@ -225,24 +233,25 @@ test_serialize_leaf_with_many_rows(void) {
les[i] = le_fastmalloc((char *) &key, sizeof(key), (char *) &val, sizeof(val)); les[i] = le_fastmalloc((char *) &key, sizeof(key), (char *) &val, sizeof(val));
} }
} }
MALLOC_N(sn.n_children, sn.u.l.bn); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children, sn.subtree_estimates);
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.totalchildkeylens = (sn.n_children-1)*sizeof(int); sn.totalchildkeylens = (sn.n_children-1)*sizeof(int);
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
sn.subtree_estimates[i].ndata = random() + (((long long) random())<<32); BP_STATE(&sn,i) = PT_AVAIL;
sn.subtree_estimates[i].nkeys = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
sn.subtree_estimates[i].dsize = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
sn.subtree_estimates[i].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
r = toku_omt_create(&sn.u.l.bn[i].buffer); assert(r==0); BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
sn.u.l.bn[i].optimized_for_upgrade = BRT_LAYOUT_VERSION; sn.bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
sn.u.l.bn[i].soft_copy_is_up_to_date = TRUE; r = toku_omt_create(&BLB_BUFFER(&sn, i)); assert(r==0);
sn.u.l.bn[i].seqinsert = 0; BLB_OPTIMIZEDFORUPGRADE(&sn, i) = BRT_LAYOUT_VERSION;
} BLB_SOFTCOPYISUPTODATE(&sn, i) = TRUE;
sn.u.l.bn[0].n_bytes_in_buffer = 0; BLB_SEQINSERT(&sn, i) = 0;
}
BLB_NBYTESINBUF(&sn, 0) = 0;
for (int i = 0; i < nrows; ++i) { for (int i = 0; i < nrows; ++i) {
r = toku_omt_insert(sn.u.l.bn[0].buffer, les[i], omt_int_cmp, les[i], NULL); assert(r==0); r = toku_omt_insert(BLB_BUFFER(&sn, 0), les[i], omt_int_cmp, les[i], NULL); assert(r==0);
sn.u.l.bn[0].n_bytes_in_buffer += OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]); BLB_NBYTESINBUF(&sn, 0) += OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]);
} }
struct brt *XMALLOC(brt); struct brt *XMALLOC(brt);
...@@ -286,12 +295,12 @@ test_serialize_leaf_with_many_rows(void) { ...@@ -286,12 +295,12 @@ test_serialize_leaf_with_many_rows(void) {
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp }; struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(toku_omt_size(dn->u.l.bn[i].buffer) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(dn->u.l.bn[i].buffer, check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
assert(dn->u.l.bn[i].optimized_for_upgrade == BRT_LAYOUT_VERSION); assert(BLB_OPTIMIZEDFORUPGRADE(dn, i) == BRT_LAYOUT_VERSION);
// don't check soft_copy_is_up_to_date or seqinsert // don't check soft_copy_is_up_to_date or seqinsert
assert(dn->u.l.bn[i].n_bytes_in_buffer == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+keylens+vallens) + toku_omt_size(dn->u.l.bn[i].buffer)); assert(BLB_NBYTESINBUF(dn, i) == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+keylens+vallens) + toku_omt_size(BLB_BUFFER(dn, i)));
assert(dn->u.l.bn[i].n_bytes_in_buffer < 128*1024); // BN_MAX_SIZE, apt to change assert(BLB_NBYTESINBUF(dn, i) < 128*1024); // BN_MAX_SIZE, apt to change
last_i = extra.i; last_i = extra.i;
} }
assert(extra.i == nrows); assert(extra.i == nrows);
...@@ -302,14 +311,16 @@ test_serialize_leaf_with_many_rows(void) { ...@@ -302,14 +311,16 @@ test_serialize_leaf_with_many_rows(void) {
kv_pair_free(sn.childkeys[i]); kv_pair_free(sn.childkeys[i]);
} }
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
toku_omt_destroy(&sn.u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(&sn, i));
} }
for (int i = 0; i < nrows; ++i) { for (int i = 0; i < nrows; ++i) {
toku_free(les[i]); toku_free(les[i]);
} }
toku_free(sn.u.l.bn); for (int i = 0; i < sn.n_children; i++) {
toku_free(sn.bp[i].ptr);
}
toku_free(sn.bp);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
...@@ -327,7 +338,8 @@ test_serialize_leaf_with_large_rows(void) { ...@@ -327,7 +338,8 @@ test_serialize_leaf_with_large_rows(void) {
// assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible // assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible
int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); int fd = open(__FILE__ ".brt", O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = 4*(1<<20); sn.nodesize = 4*(1<<20);
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -335,6 +347,7 @@ test_serialize_leaf_with_large_rows(void) { ...@@ -335,6 +347,7 @@ test_serialize_leaf_with_large_rows(void) {
sn.layout_version_original = BRT_LAYOUT_VERSION; sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0; sn.height = 0;
sn.n_children = 1; sn.n_children = 1;
LEAFENTRY les[7]; LEAFENTRY les[7];
{ {
char key[8], val[val_size]; char key[8], val[val_size];
...@@ -347,24 +360,25 @@ test_serialize_leaf_with_large_rows(void) { ...@@ -347,24 +360,25 @@ test_serialize_leaf_with_large_rows(void) {
les[i] = le_fastmalloc(key, 8, val, val_size); les[i] = le_fastmalloc(key, 8, val, val_size);
} }
} }
MALLOC_N(sn.n_children, sn.u.l.bn); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children, sn.subtree_estimates);
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.totalchildkeylens = (sn.n_children-1)*8; sn.totalchildkeylens = (sn.n_children-1)*8;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
sn.subtree_estimates[i].ndata = random() + (((long long) random())<<32); BP_STATE(&sn,i) = PT_AVAIL;
sn.subtree_estimates[i].nkeys = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long) random())<<32);
sn.subtree_estimates[i].dsize = random() + (((long long) random())<<32); BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long) random())<<32);
sn.subtree_estimates[i].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long) random())<<32);
r = toku_omt_create(&sn.u.l.bn[i].buffer); assert(r==0); BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
sn.u.l.bn[i].optimized_for_upgrade = BRT_LAYOUT_VERSION; sn.bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
sn.u.l.bn[i].soft_copy_is_up_to_date = TRUE; r = toku_omt_create(&BLB_BUFFER(&sn, i)); assert(r==0);
sn.u.l.bn[i].seqinsert = 0; BLB_OPTIMIZEDFORUPGRADE(&sn, i) = BRT_LAYOUT_VERSION;
} BLB_SOFTCOPYISUPTODATE(&sn, i) = TRUE;
sn.u.l.bn[0].n_bytes_in_buffer = 0; BLB_SEQINSERT(&sn, i) = 0;
}
BLB_NBYTESINBUF(&sn, 0) = 0;
for (int i = 0; i < 7; ++i) { for (int i = 0; i < 7; ++i) {
r = toku_omt_insert(sn.u.l.bn[0].buffer, les[i], omt_cmp, les[i], NULL); assert(r==0); r = toku_omt_insert(BLB_BUFFER(&sn, 0), les[i], omt_cmp, les[i], NULL); assert(r==0);
sn.u.l.bn[0].n_bytes_in_buffer += OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]); BLB_NBYTESINBUF(&sn, 0) += OMT_ITEM_OVERHEAD + leafentry_disksize(les[i]);
} }
struct brt *XMALLOC(brt); struct brt *XMALLOC(brt);
...@@ -409,11 +423,11 @@ test_serialize_leaf_with_large_rows(void) { ...@@ -409,11 +423,11 @@ test_serialize_leaf_with_large_rows(void) {
struct check_leafentries_struct extra = { .nelts = 7, .elts = les, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 7, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(toku_omt_size(dn->u.l.bn[i].buffer) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(dn->u.l.bn[i].buffer, check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
assert(dn->u.l.bn[i].optimized_for_upgrade == BRT_LAYOUT_VERSION); assert(BLB_OPTIMIZEDFORUPGRADE(dn, i) == BRT_LAYOUT_VERSION);
// don't check soft_copy_is_up_to_date or seqinsert // don't check soft_copy_is_up_to_date or seqinsert
assert(dn->u.l.bn[i].n_bytes_in_buffer == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+8+val_size) + toku_omt_size(dn->u.l.bn[i].buffer)); assert(BLB_NBYTESINBUF(dn, i) == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+8+val_size) + toku_omt_size(BLB_BUFFER(dn, i)));
last_i = extra.i; last_i = extra.i;
} }
assert(extra.i == 7); assert(extra.i == 7);
...@@ -424,14 +438,16 @@ test_serialize_leaf_with_large_rows(void) { ...@@ -424,14 +438,16 @@ test_serialize_leaf_with_large_rows(void) {
kv_pair_free(sn.childkeys[i]); kv_pair_free(sn.childkeys[i]);
} }
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
toku_omt_destroy(&sn.u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(&sn, i));
} }
for (int i = 0; i < 7; ++i) { for (int i = 0; i < 7; ++i) {
toku_free(les[i]); toku_free(les[i]);
} }
toku_free(sn.u.l.bn); for (int i = 0; i < sn.n_children; i++) {
toku_free(sn.bp[i].ptr);
}
toku_free(sn.bp);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
...@@ -450,7 +466,8 @@ test_serialize_leaf_with_empty_basement_nodes(void) { ...@@ -450,7 +466,8 @@ test_serialize_leaf_with_empty_basement_nodes(void) {
int r; int r;
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -462,8 +479,7 @@ test_serialize_leaf_with_empty_basement_nodes(void) { ...@@ -462,8 +479,7 @@ test_serialize_leaf_with_empty_basement_nodes(void) {
elts[0] = le_malloc("a", "aval"); elts[0] = le_malloc("a", "aval");
elts[1] = le_malloc("b", "bval"); elts[1] = le_malloc("b", "bval");
elts[2] = le_malloc("x", "xval"); elts[2] = le_malloc("x", "xval");
MALLOC_N(sn.n_children, sn.u.l.bn); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(sn.n_children, sn.subtree_estimates);
MALLOC_N(sn.n_children-1, sn.childkeys); MALLOC_N(sn.n_children-1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("A", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("A", 2, 0, 0);
sn.childkeys[1] = kv_pair_malloc("a", 2, 0, 0); sn.childkeys[1] = kv_pair_malloc("a", 2, 0, 0);
...@@ -473,25 +489,27 @@ test_serialize_leaf_with_empty_basement_nodes(void) { ...@@ -473,25 +489,27 @@ test_serialize_leaf_with_empty_basement_nodes(void) {
sn.childkeys[5] = kv_pair_malloc("x", 2, 0, 0); sn.childkeys[5] = kv_pair_malloc("x", 2, 0, 0);
sn.totalchildkeylens = (sn.n_children-1)*2; sn.totalchildkeylens = (sn.n_children-1)*2;
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
sn.subtree_estimates[i].ndata = random() + (((long long)random())<<32); BP_STATE(&sn,i) = PT_AVAIL;
sn.subtree_estimates[i].nkeys = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,i).ndata = random() + (((long long)random())<<32);
sn.subtree_estimates[i].dsize = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,i).nkeys = random() + (((long long)random())<<32);
sn.subtree_estimates[i].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,i).dsize = random() + (((long long)random())<<32);
r = toku_omt_create(&sn.u.l.bn[i].buffer); assert(r==0); BP_SUBTREE_EST(&sn,i).exact = (BOOL)(random()%2 != 0);
sn.u.l.bn[i].optimized_for_upgrade = BRT_LAYOUT_VERSION; sn.bp[i].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
sn.u.l.bn[i].soft_copy_is_up_to_date = TRUE; r = toku_omt_create(&BLB_BUFFER(&sn, i)); assert(r==0);
sn.u.l.bn[i].seqinsert = 0; BLB_OPTIMIZEDFORUPGRADE(&sn, i) = BRT_LAYOUT_VERSION;
} BLB_SOFTCOPYISUPTODATE(&sn, i) = TRUE;
r = toku_omt_insert(sn.u.l.bn[1].buffer, elts[0], omt_cmp, elts[0], NULL); assert(r==0); BLB_SEQINSERT(&sn, i) = 0;
r = toku_omt_insert(sn.u.l.bn[3].buffer, elts[1], omt_cmp, elts[1], NULL); assert(r==0); }
r = toku_omt_insert(sn.u.l.bn[5].buffer, elts[2], omt_cmp, elts[2], NULL); assert(r==0); r = toku_omt_insert(BLB_BUFFER(&sn, 1), elts[0], omt_cmp, elts[0], NULL); assert(r==0);
sn.u.l.bn[0].n_bytes_in_buffer = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[0].buffer); r = toku_omt_insert(BLB_BUFFER(&sn, 3), elts[1], omt_cmp, elts[1], NULL); assert(r==0);
sn.u.l.bn[1].n_bytes_in_buffer = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[1].buffer); r = toku_omt_insert(BLB_BUFFER(&sn, 5), elts[2], omt_cmp, elts[2], NULL); assert(r==0);
sn.u.l.bn[2].n_bytes_in_buffer = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[2].buffer); BLB_NBYTESINBUF(&sn, 0) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 0));
sn.u.l.bn[3].n_bytes_in_buffer = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[3].buffer); BLB_NBYTESINBUF(&sn, 1) = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 1));
sn.u.l.bn[4].n_bytes_in_buffer = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[4].buffer); BLB_NBYTESINBUF(&sn, 2) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 2));
sn.u.l.bn[5].n_bytes_in_buffer = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[5].buffer); BLB_NBYTESINBUF(&sn, 3) = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 3));
sn.u.l.bn[6].n_bytes_in_buffer = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[6].buffer); BLB_NBYTESINBUF(&sn, 4) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 4));
BLB_NBYTESINBUF(&sn, 5) = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 5));
BLB_NBYTESINBUF(&sn, 6) = 0*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 6));
struct brt *XMALLOC(brt); struct brt *XMALLOC(brt);
struct brt_header *XCALLOC(brt_h); struct brt_header *XCALLOC(brt_h);
...@@ -536,11 +554,11 @@ test_serialize_leaf_with_empty_basement_nodes(void) { ...@@ -536,11 +554,11 @@ test_serialize_leaf_with_empty_basement_nodes(void) {
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(toku_omt_size(dn->u.l.bn[i].buffer) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(dn->u.l.bn[i].buffer, check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
assert(dn->u.l.bn[i].optimized_for_upgrade == BRT_LAYOUT_VERSION); assert(BLB_OPTIMIZEDFORUPGRADE(dn, i) == BRT_LAYOUT_VERSION);
// don't check soft_copy_is_up_to_date or seqinsert // don't check soft_copy_is_up_to_date or seqinsert
assert(dn->u.l.bn[i].n_bytes_in_buffer == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(dn->u.l.bn[i].buffer)); assert(BLB_NBYTESINBUF(dn, i) == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(dn, i)));
last_i = extra.i; last_i = extra.i;
} }
assert(extra.i == 3); assert(extra.i == 3);
...@@ -551,14 +569,16 @@ test_serialize_leaf_with_empty_basement_nodes(void) { ...@@ -551,14 +569,16 @@ test_serialize_leaf_with_empty_basement_nodes(void) {
kv_pair_free(sn.childkeys[i]); kv_pair_free(sn.childkeys[i]);
} }
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
toku_omt_destroy(&sn.u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(&sn, i));
} }
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
toku_free(elts[i]); toku_free(elts[i]);
} }
toku_free(sn.u.l.bn); for (int i = 0; i < sn.n_children; i++) {
toku_free(sn.bp[i].ptr);
}
toku_free(sn.bp);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
...@@ -578,7 +598,8 @@ test_serialize_leaf(void) { ...@@ -578,7 +598,8 @@ test_serialize_leaf(void) {
int r; int r;
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -590,30 +611,33 @@ test_serialize_leaf(void) { ...@@ -590,30 +611,33 @@ test_serialize_leaf(void) {
elts[0] = le_malloc("a", "aval"); elts[0] = le_malloc("a", "aval");
elts[1] = le_malloc("b", "bval"); elts[1] = le_malloc("b", "bval");
elts[2] = le_malloc("x", "xval"); elts[2] = le_malloc("x", "xval");
MALLOC_N(2, sn.u.l.bn); MALLOC_N(sn.n_children, sn.bp);
MALLOC_N(2, sn.subtree_estimates);
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0); sn.childkeys[0] = kv_pair_malloc("b", 2, 0, 0);
sn.totalchildkeylens = 2; sn.totalchildkeylens = 2;
sn.subtree_estimates[0].ndata = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
sn.subtree_estimates[1].ndata = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
sn.subtree_estimates[0].nkeys = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
sn.subtree_estimates[1].nkeys = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
sn.subtree_estimates[0].dsize = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
sn.subtree_estimates[1].dsize = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
sn.subtree_estimates[0].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
sn.subtree_estimates[1].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
r = toku_omt_create(&sn.u.l.bn[0].buffer); assert(r==0); BP_STATE(&sn,0) = PT_AVAIL;
r = toku_omt_create(&sn.u.l.bn[1].buffer); assert(r==0); BP_STATE(&sn,1) = PT_AVAIL;
r = toku_omt_insert(sn.u.l.bn[0].buffer, elts[0], omt_cmp, elts[0], NULL); assert(r==0); sn.bp[0].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
r = toku_omt_insert(sn.u.l.bn[0].buffer, elts[1], omt_cmp, elts[1], NULL); assert(r==0); sn.bp[1].ptr = toku_xmalloc(sizeof(struct brtnode_leaf_basement_node));
r = toku_omt_insert(sn.u.l.bn[1].buffer, elts[2], omt_cmp, elts[2], NULL); assert(r==0); r = toku_omt_create(&BLB_BUFFER(&sn, 0)); assert(r==0);
sn.u.l.bn[0].n_bytes_in_buffer = 2*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[0].buffer); r = toku_omt_create(&BLB_BUFFER(&sn, 1)); assert(r==0);
sn.u.l.bn[1].n_bytes_in_buffer = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(sn.u.l.bn[1].buffer); r = toku_omt_insert(BLB_BUFFER(&sn, 0), elts[0], omt_cmp, elts[0], NULL); assert(r==0);
r = toku_omt_insert(BLB_BUFFER(&sn, 0), elts[1], omt_cmp, elts[1], NULL); assert(r==0);
r = toku_omt_insert(BLB_BUFFER(&sn, 1), elts[2], omt_cmp, elts[2], NULL); assert(r==0);
BLB_NBYTESINBUF(&sn, 0) = 2*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 0));
BLB_NBYTESINBUF(&sn, 1) = 1*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(&sn, 1));
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
sn.u.l.bn[i].optimized_for_upgrade = BRT_LAYOUT_VERSION; BLB_OPTIMIZEDFORUPGRADE(&sn, i) = BRT_LAYOUT_VERSION;
sn.u.l.bn[i].soft_copy_is_up_to_date = TRUE; BLB_SOFTCOPYISUPTODATE(&sn, i) = TRUE;
sn.u.l.bn[i].seqinsert = 0; BLB_SEQINSERT(&sn, i) = 0;
} }
struct brt *XMALLOC(brt); struct brt *XMALLOC(brt);
...@@ -659,14 +683,14 @@ test_serialize_leaf(void) { ...@@ -659,14 +683,14 @@ test_serialize_leaf(void) {
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
toku_omt_iterate(dn->u.l.bn[i].buffer, check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
u_int32_t keylen; u_int32_t keylen;
if (i < npartitions-1) { if (i < npartitions-1) {
assert(strcmp(kv_pair_key(dn->childkeys[i]), le_key_and_len(elts[extra.i-1], &keylen))==0); assert(strcmp(kv_pair_key(dn->childkeys[i]), le_key_and_len(elts[extra.i-1], &keylen))==0);
} }
assert(dn->u.l.bn[i].optimized_for_upgrade == BRT_LAYOUT_VERSION); assert(BLB_OPTIMIZEDFORUPGRADE(dn, i) == BRT_LAYOUT_VERSION);
// don't check soft_copy_is_up_to_date or seqinsert // don't check soft_copy_is_up_to_date or seqinsert
assert(dn->u.l.bn[i].n_bytes_in_buffer == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(dn->u.l.bn[i].buffer)); assert(BLB_NBYTESINBUF(dn, i) == (extra.i-last_i)*(KEY_VALUE_OVERHEAD+2+5) + toku_omt_size(BLB_BUFFER(dn, i)));
last_i = extra.i; last_i = extra.i;
} }
assert(extra.i == 3); assert(extra.i == 3);
...@@ -677,14 +701,16 @@ test_serialize_leaf(void) { ...@@ -677,14 +701,16 @@ test_serialize_leaf(void) {
kv_pair_free(sn.childkeys[i]); kv_pair_free(sn.childkeys[i]);
} }
for (int i = 0; i < sn.n_children; ++i) { for (int i = 0; i < sn.n_children; ++i) {
toku_omt_destroy(&sn.u.l.bn[i].buffer); toku_omt_destroy(&BLB_BUFFER(&sn, i));
} }
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
toku_free(elts[i]); toku_free(elts[i]);
} }
toku_free(sn.u.l.bn); for (int i = 0; i < sn.n_children; i++) {
toku_free(sn.bp[i].ptr);
}
toku_free(sn.bp);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
...@@ -705,9 +731,11 @@ test_serialize_nonleaf(void) { ...@@ -705,9 +731,11 @@ test_serialize_nonleaf(void) {
int r; int r;
// source_brt.fd=fd; // source_brt.fd=fd;
sn.max_msn_applied_to_node.msn = 0; sn.max_msn_applied_to_node_on_disk.msn = 0;
sn.max_msn_applied_to_node_in_memory.msn = 0;
char *hello_string; char *hello_string;
sn.max_msn_applied_to_node = (MSN) {TESTMSNVAL}; sn.max_msn_applied_to_node_on_disk.msn = TESTMSNDSKVAL;
sn.max_msn_applied_to_node_in_memory.msn = TESTMSNMEMVAL;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.flags = 0x11223344; sn.flags = 0x11223344;
sn.thisnodename.b = 20; sn.thisnodename.b = 20;
...@@ -716,21 +744,24 @@ test_serialize_nonleaf(void) { ...@@ -716,21 +744,24 @@ test_serialize_nonleaf(void) {
sn.height = 1; sn.height = 1;
sn.n_children = 2; sn.n_children = 2;
hello_string = toku_strdup("hello"); hello_string = toku_strdup("hello");
MALLOC_N(2, sn.u.n.childinfos); MALLOC_N(2, sn.bp);
MALLOC_N(2, sn.subtree_estimates);
MALLOC_N(1, sn.childkeys); MALLOC_N(1, sn.childkeys);
sn.childkeys[0] = kv_pair_malloc(hello_string, 6, 0, 0); sn.childkeys[0] = kv_pair_malloc(hello_string, 6, 0, 0);
sn.totalchildkeylens = 6; sn.totalchildkeylens = 6;
BNC_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 0).b = 30;
BNC_BLOCKNUM(&sn, 1).b = 35; BP_BLOCKNUM(&sn, 1).b = 35;
sn.subtree_estimates[0].ndata = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).ndata = random() + (((long long)random())<<32);
sn.subtree_estimates[1].ndata = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).ndata = random() + (((long long)random())<<32);
sn.subtree_estimates[0].nkeys = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).nkeys = random() + (((long long)random())<<32);
sn.subtree_estimates[1].nkeys = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).nkeys = random() + (((long long)random())<<32);
sn.subtree_estimates[0].dsize = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,0).dsize = random() + (((long long)random())<<32);
sn.subtree_estimates[1].dsize = random() + (((long long)random())<<32); BP_SUBTREE_EST(&sn,1).dsize = random() + (((long long)random())<<32);
sn.subtree_estimates[0].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,0).exact = (BOOL)(random()%2 != 0);
sn.subtree_estimates[1].exact = (BOOL)(random()%2 != 0); BP_SUBTREE_EST(&sn,1).exact = (BOOL)(random()%2 != 0);
BP_STATE(&sn,0) = PT_AVAIL;
BP_STATE(&sn,1) = PT_AVAIL;
sn.bp[0].ptr = toku_xmalloc(sizeof(struct brtnode_nonleaf_childinfo));
sn.bp[1].ptr = toku_xmalloc(sizeof(struct brtnode_nonleaf_childinfo));
r = toku_fifo_create(&BNC_BUFFER(&sn,0)); assert(r==0); r = toku_fifo_create(&BNC_BUFFER(&sn,0)); assert(r==0);
r = toku_fifo_create(&BNC_BUFFER(&sn,1)); assert(r==0); r = toku_fifo_create(&BNC_BUFFER(&sn,1)); assert(r==0);
//Create XIDS //Create XIDS
...@@ -747,7 +778,6 @@ test_serialize_nonleaf(void) { ...@@ -747,7 +778,6 @@ test_serialize_nonleaf(void) {
r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234); assert(r==0); r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, next_dummymsn(), xids_234); assert(r==0);
BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123); BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123);
BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234); BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_234);
sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5) + xids_get_serialize_size(xids_0) + xids_get_serialize_size(xids_123) + xids_get_serialize_size(xids_234);
//Cleanup: //Cleanup:
xids_destroy(&xids_0); xids_destroy(&xids_0);
xids_destroy(&xids_123); xids_destroy(&xids_123);
...@@ -780,11 +810,16 @@ test_serialize_nonleaf(void) { ...@@ -780,11 +810,16 @@ test_serialize_nonleaf(void) {
r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE); r = toku_serialize_brtnode_to(fd, make_blocknum(20), &sn, brt->h, 1, 1, FALSE);
assert(r==0); assert(r==0);
assert(sn.max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL);
assert(sn.max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL);
r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, brt_h); r = toku_deserialize_brtnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, brt_h);
assert(r==0); assert(r==0);
assert(dn->thisnodename.b==20); assert(dn->thisnodename.b==20);
assert(dn->max_msn_applied_to_node.msn == TESTMSNVAL); assert(dn->max_msn_applied_to_node_on_disk.msn == TESTMSNMEMVAL);
assert(dn->max_msn_applied_to_node_in_memory.msn == TESTMSNMEMVAL);
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION); assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
...@@ -794,17 +829,18 @@ test_serialize_nonleaf(void) { ...@@ -794,17 +829,18 @@ test_serialize_nonleaf(void) {
assert(strcmp(kv_pair_key(dn->childkeys[0]), "hello")==0); assert(strcmp(kv_pair_key(dn->childkeys[0]), "hello")==0);
assert(toku_brt_pivot_key_len(dn->childkeys[0])==6); assert(toku_brt_pivot_key_len(dn->childkeys[0])==6);
assert(dn->totalchildkeylens==6); assert(dn->totalchildkeylens==6);
assert(BNC_BLOCKNUM(dn,0).b==30); assert(BP_BLOCKNUM(dn,0).b==30);
assert(BNC_BLOCKNUM(dn,1).b==35); assert(BP_BLOCKNUM(dn,1).b==35);
toku_brtnode_free(&dn); toku_brtnode_free(&dn);
kv_pair_free(sn.childkeys[0]); kv_pair_free(sn.childkeys[0]);
toku_free(hello_string); toku_free(hello_string);
toku_fifo_free(&BNC_BUFFER(&sn,0)); toku_fifo_free(&BNC_BUFFER(&sn,0));
toku_fifo_free(&BNC_BUFFER(&sn,1)); toku_fifo_free(&BNC_BUFFER(&sn,1));
toku_free(sn.u.n.childinfos); toku_free(sn.bp[0].ptr);
toku_free(sn.bp[1].ptr);
toku_free(sn.bp);
toku_free(sn.childkeys); toku_free(sn.childkeys);
toku_free(sn.subtree_estimates);
toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); toku_block_free(brt_h->blocktable, BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
toku_blocktable_destroy(&brt_h->blocktable); toku_blocktable_destroy(&brt_h->blocktable);
......
...@@ -13,6 +13,7 @@ make_node(BRT brt, int height) { ...@@ -13,6 +13,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -24,12 +25,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -24,12 +25,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
...@@ -65,7 +66,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey, ...@@ -65,7 +66,7 @@ make_tree(BRT brt, int height, int fanout, int nperleaf, int *seq, int *minkey,
struct kv_pair *pivotkey = kv_pair_malloc(&k, sizeof k, NULL, 0); struct kv_pair *pivotkey = kv_pair_malloc(&k, sizeof k, NULL, 0);
toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k); toku_brt_nonleaf_append_child(node, child, pivotkey, sizeof k);
} }
node->subtree_estimates[childnum] = make_subtree_estimates(subtree_size, subtree_size, 0, FALSE); BP_SUBTREE_EST(node,childnum) = make_subtree_estimates(subtree_size, subtree_size, 0, FALSE);
toku_unpin_brtnode(brt, child); toku_unpin_brtnode(brt, child);
} }
*minkey = minkeys[0]; *minkey = minkeys[0];
......
...@@ -19,6 +19,7 @@ make_node(BRT brt, int height) { ...@@ -19,6 +19,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -30,15 +31,16 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -30,15 +31,16 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
// apply an insert to the leaf node // apply an insert to the leaf node
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
leafnode->max_msn_applied_to_node = msn; leafnode->max_msn_applied_to_node_on_disk = msn;
leafnode->max_msn_applied_to_node_in_memory = msn;
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
...@@ -63,7 +65,7 @@ insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) { ...@@ -63,7 +65,7 @@ insert_into_child_buffer(BRTNODE node, int childnum, int minkey, int maxkey) {
DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key); DBT thekey; toku_fill_dbt(&thekey, &key, sizeof key);
DBT theval; toku_fill_dbt(&theval, &val, sizeof val); DBT theval; toku_fill_dbt(&theval, &val, sizeof val);
toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval); toku_brt_append_to_child_buffer(node, childnum, BRT_INSERT, msn, xids_get_root_xids(), &thekey, &theval);
node->max_msn_applied_to_node = msn; node->max_msn_applied_to_node_in_memory = msn;
} }
} }
...@@ -138,7 +140,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) { ...@@ -138,7 +140,7 @@ test_make_tree(int height, int fanout, int nperleaf, int do_verify) {
// set the new root to point to the new tree // set the new root to point to the new tree
*rootp = newroot->thisnodename; *rootp = newroot->thisnodename;
newroot->max_msn_applied_to_node = last_dummymsn(); // capture msn of last message injected into tree newroot->max_msn_applied_to_node_in_memory = last_dummymsn(); // capture msn of last message injected into tree
// unpin the new root // unpin the new root
toku_unpin_brtnode(brt, newroot); toku_unpin_brtnode(brt, newroot);
......
...@@ -20,6 +20,7 @@ make_node(BRT brt, int height) { ...@@ -20,6 +20,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
......
...@@ -22,6 +22,7 @@ make_node(BRT brt, int height) { ...@@ -22,6 +22,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -33,13 +34,13 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -33,13 +34,13 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
// apply an insert to the leaf node // apply an insert to the leaf node
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// Create bad tree (don't do following): // Create bad tree (don't do following):
// leafnode->max_msn_applied_to_node = msn; // leafnode->max_msn_applied_to_node = msn;
......
...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) { ...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) { ...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) { ...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) { ...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) { ...@@ -12,6 +12,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -23,12 +24,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) { ...@@ -11,6 +11,7 @@ make_node(BRT brt, int height) {
BRTNODE node = NULL; BRTNODE node = NULL;
int n_children = (height == 0) ? 1 : 0; int n_children = (height == 0) ? 1 : 0;
toku_create_new_brtnode(brt, &node, height, n_children); toku_create_new_brtnode(brt, &node, height, n_children);
if (n_children) BP_STATE(node,0) = PT_AVAIL;
return node; return node;
} }
...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen ...@@ -22,12 +23,12 @@ append_leaf(BRTNODE leafnode, void *key, size_t keylen, void *val, size_t vallen
DBT theval; toku_fill_dbt(&theval, val, vallen); DBT theval; toku_fill_dbt(&theval, val, vallen);
// get an index that we can use to create a new leaf entry // get an index that we can use to create a new leaf entry
uint32_t idx = toku_omt_size(leafnode->u.l.bn[0].buffer); uint32_t idx = toku_omt_size(BLB_BUFFER(leafnode, 0));
// apply an insert to the leaf node // apply an insert to the leaf node
MSN msn = next_dummymsn(); MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } }; BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
brt_leaf_apply_cmd_once(&leafnode->u.l.bn[0], &leafnode->subtree_estimates[0], &cmd, idx, NULL, NULL); brt_leaf_apply_cmd_once((BASEMENTNODE)leafnode->bp[0].ptr, &BP_SUBTREE_EST(leafnode,0), &cmd, idx, NULL, NULL);
// dont forget to dirty the node // dont forget to dirty the node
leafnode->dirty = 1; leafnode->dirty = 1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment