Commit 4b779211 authored by Bradley C. Kuszmaul, committed by Yoni Fogel

close[t:4056] Fix #4056. (Leafnode partition now allows for aligned and...

close[t:4056] Fix #4056.  (Leafnode partition now allows for aligned and partial I/O, or even reordering the partitions to pack them more tightly).

git-svn-id: file:///svn/toku/tokudb@35821 c7de825b-a66e-492c-adef-691d508d4ae1
parent c9e5b984
...@@ -252,12 +252,13 @@ struct brtnode_leaf_basement_node { ...@@ -252,12 +252,13 @@ struct brtnode_leaf_basement_node {
bool stale_ancestor_messages_applied; bool stale_ancestor_messages_applied;
}; };
#define PT_INVALID 0 enum __attribute__((__packed__)) pt_state { // declare this to be packed so that when used below it will only take 1 byte.
#define PT_ON_DISK 1 PT_INVALID = 0,
#define PT_COMPRESSED 2 PT_ON_DISK = 1,
#define PT_AVAIL 3 PT_COMPRESSED = 2,
PT_AVAIL = 3};
enum brtnode_child_tag { enum __attribute__((__packed__)) brtnode_child_tag {
BCT_INVALID = 0, BCT_INVALID = 0,
BCT_NULL, BCT_NULL,
BCT_SUBBLOCK, BCT_SUBBLOCK,
...@@ -266,7 +267,7 @@ enum brtnode_child_tag { ...@@ -266,7 +267,7 @@ enum brtnode_child_tag {
}; };
typedef struct __attribute__((__packed__)) brtnode_child_pointer { typedef struct __attribute__((__packed__)) brtnode_child_pointer {
u_int8_t tag; enum brtnode_child_tag tag;
union { union {
struct sub_block *subblock; struct sub_block *subblock;
struct brtnode_nonleaf_childinfo *nonleaf; struct brtnode_nonleaf_childinfo *nonleaf;
...@@ -289,12 +290,15 @@ struct __attribute__((__packed__)) brtnode_partition { ...@@ -289,12 +290,15 @@ struct __attribute__((__packed__)) brtnode_partition {
// PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress // PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
// PT_AVAIL - means the partition is decompressed and in memory // PT_AVAIL - means the partition is decompressed and in memory
// //
u_int8_t state; enum pt_state state; // make this an enum to make debugging easier.
// //
// stores the offset to the end of the partition on disk from the brtnode, needed to read a partition off of disk // stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk
// the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
// // The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
u_int32_t offset; // The SIZE is the size of the compressed partition.
// Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be.
// However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
u_int32_t start,size;
// //
// pointer to the partition. Depending on the state, they may be different things // pointer to the partition. Depending on the state, they may be different things
// if state == PT_INVALID, then the node was just initialized and ptr == NULL // if state == PT_INVALID, then the node was just initialized and ptr == NULL
...@@ -331,8 +335,6 @@ struct brtnode { ...@@ -331,8 +335,6 @@ struct brtnode {
unsigned int totalchildkeylens; unsigned int totalchildkeylens;
struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Child 1's keys are > childkeys[0]. */ Child 1's keys are > childkeys[0]. */
u_int32_t bp_offset; // offset on disk to where the partitions start
// array of size n_children, consisting of brtnode partitions // array of size n_children, consisting of brtnode partitions
// each one is associated with a child // each one is associated with a child
// for internal nodes, the ith partition corresponds to the ith message buffer // for internal nodes, the ith partition corresponds to the ith message buffer
...@@ -346,7 +348,8 @@ struct brtnode { ...@@ -346,7 +348,8 @@ struct brtnode {
#define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash) #define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash)
#define BP_FULLHASH(node,i) ((node)->bp[i].fullhash) #define BP_FULLHASH(node,i) ((node)->bp[i].fullhash)
#define BP_STATE(node,i) ((node)->bp[i].state) #define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_OFFSET(node,i) ((node)->bp[i].offset) #define BP_START(node,i) ((node)->bp[i].start)
#define BP_SIZE(node,i) ((node)->bp[i].size)
#define BP_SUBTREE_EST(node,i) ((node)->bp[i].subtree_estimates) #define BP_SUBTREE_EST(node,i) ((node)->bp[i].subtree_estimates)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone) #define BP_WORKDONE(node, i)((node)->bp[i].workdone)
......
...@@ -202,6 +202,8 @@ serialize_node_header_size(BRTNODE node) { ...@@ -202,6 +202,8 @@ serialize_node_header_size(BRTNODE node) {
retval += sizeof(node->layout_version); retval += sizeof(node->layout_version);
retval += sizeof(node->layout_version_original); retval += sizeof(node->layout_version_original);
retval += 4; // BUILD_ID retval += 4; // BUILD_ID
retval += 4; // n_children
retval += node->n_children*8; // encode start offset and length of each partition
retval += 4; // checksum retval += 4; // checksum
return retval; return retval;
} }
...@@ -216,6 +218,12 @@ serialize_node_header(BRTNODE node, struct wbuf *wbuf) { ...@@ -216,6 +218,12 @@ serialize_node_header(BRTNODE node, struct wbuf *wbuf) {
wbuf_nocrc_int(wbuf, node->layout_version); wbuf_nocrc_int(wbuf, node->layout_version);
wbuf_nocrc_int(wbuf, node->layout_version_original); wbuf_nocrc_int(wbuf, node->layout_version_original);
wbuf_nocrc_uint(wbuf, BUILD_ID); wbuf_nocrc_uint(wbuf, BUILD_ID);
wbuf_nocrc_int (wbuf, node->n_children);
for (int i=0; i<node->n_children; i++) {
assert(BP_SIZE(node,i)>0);
wbuf_nocrc_int(wbuf, BP_START(node, i)); // save the beginning of the partition
wbuf_nocrc_int(wbuf, BP_SIZE (node, i)); // and the size
}
// checksum the header // checksum the header
u_int32_t end_to_end_checksum = x1764_memory(wbuf->buf, wbuf_get_woffset(wbuf)); u_int32_t end_to_end_checksum = x1764_memory(wbuf->buf, wbuf_get_woffset(wbuf));
wbuf_nocrc_int(wbuf, end_to_end_checksum); wbuf_nocrc_int(wbuf, end_to_end_checksum);
...@@ -375,25 +383,19 @@ serialize_brtnode_info_size(BRTNODE node) ...@@ -375,25 +383,19 @@ serialize_brtnode_info_size(BRTNODE node)
retval += 4; // nodesize retval += 4; // nodesize
retval += 4; // flags retval += 4; // flags
retval += 4; // height; retval += 4; // height;
retval += 4; // n_children
retval += (3*8+1)*node->n_children; // subtree estimates for each child retval += (3*8+1)*node->n_children; // subtree estimates for each child
retval += node->totalchildkeylens; // total length of pivots retval += node->totalchildkeylens; // total length of pivots
retval += (node->n_children-1)*4; // encode length of each pivot retval += (node->n_children-1)*4; // encode length of each pivot
if (node->height > 0) { if (node->height > 0) {
retval += node->n_children*8; // child blocknum's retval += node->n_children*8; // child blocknum's
} }
retval += node->n_children*4; // encode offset of each partition
retval += 4; // checksum retval += 4; // checksum
return retval; return retval;
} }
static void static void serialize_brtnode_info(BRTNODE node,
serialize_brtnode_info(
BRTNODE node,
SUB_BLOCK sb_parts,
SUB_BLOCK sb // output SUB_BLOCK sb // output
) ) {
{
assert(sb->uncompressed_size == 0); assert(sb->uncompressed_size == 0);
assert(sb->uncompressed_ptr == NULL); assert(sb->uncompressed_ptr == NULL);
sb->uncompressed_size = serialize_brtnode_info_size(node); sb->uncompressed_size = serialize_brtnode_info_size(node);
...@@ -406,7 +408,6 @@ serialize_brtnode_info( ...@@ -406,7 +408,6 @@ serialize_brtnode_info(
wbuf_nocrc_uint(&wb, node->nodesize); wbuf_nocrc_uint(&wb, node->nodesize);
wbuf_nocrc_uint(&wb, node->flags); wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height); wbuf_nocrc_int (&wb, node->height);
wbuf_nocrc_int (&wb, node->n_children);
// subtree estimates of each child // subtree estimates of each child
for (int i = 0; i < node->n_children; i++) { for (int i = 0; i < node->n_children; i++) {
wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).nkeys); wbuf_nocrc_ulonglong(&wb, BP_SUBTREE_EST(node,i).nkeys);
...@@ -425,18 +426,6 @@ serialize_brtnode_info( ...@@ -425,18 +426,6 @@ serialize_brtnode_info(
} }
} }
// offsets to other partitions
u_int32_t curr_offset = 0;
for (int i = 0; i < node->n_children; i++) {
// TODO: (Zardosht) figure out if we want to put some padding to align partitions
curr_offset += sb_parts[i].compressed_size + 4; // data and checksum
//
// update the offset in the node
//
BP_OFFSET(node,i) = curr_offset;
wbuf_nocrc_int(&wb, curr_offset);
}
u_int32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb)); u_int32_t end_to_end_checksum = x1764_memory(sb->uncompressed_ptr, wbuf_get_woffset(&wb));
wbuf_nocrc_int(&wb, end_to_end_checksum); wbuf_nocrc_int(&wb, end_to_end_checksum);
invariant(wb.ndone == wb.size); invariant(wb.ndone == wb.size);
...@@ -763,7 +752,7 @@ toku_serialize_brtnode_to_memory (BRTNODE node, ...@@ -763,7 +752,7 @@ toku_serialize_brtnode_to_memory (BRTNODE node,
// Now lets create a sub-block that has the common node information, // Now lets create a sub-block that has the common node information,
// This does NOT include the header // This does NOT include the header
// //
serialize_brtnode_info(node, sb, &sb_node_info); serialize_brtnode_info(node, &sb_node_info);
compress_brtnode_sub_block(&sb_node_info); compress_brtnode_sub_block(&sb_node_info);
// now we have compressed each of our pieces into individual sub_blocks, // now we have compressed each of our pieces into individual sub_blocks,
...@@ -772,19 +761,17 @@ toku_serialize_brtnode_to_memory (BRTNODE node, ...@@ -772,19 +761,17 @@ toku_serialize_brtnode_to_memory (BRTNODE node,
// The total size of the node is: // The total size of the node is:
// size of header + disk size of the n+1 sub_block's created above // size of header + disk size of the n+1 sub_block's created above
u_int32_t total_node_size = 0; u_int32_t total_node_size = (serialize_node_header_size(node) // uncompressed header
total_node_size += serialize_node_header_size(node); //header + sb_node_info.compressed_size // compressed nodeinfo (without its checksum)
+ 4); // nodeinfo's checksum
total_node_size += sb_node_info.compressed_size + 4; // total plus checksum // store the BP_SIZEs
for (int i = 0; i < npartitions; i++) { for (int i = 0; i < node->n_children; i++) {
u_int32_t len = sb[i].compressed_size + 4; // data and checksum
BP_SIZE (node,i) = len;
BP_START(node,i) = total_node_size;
total_node_size += sb[i].compressed_size + 4; total_node_size += sb[i].compressed_size + 4;
} }
//
// set the node bp_offset
//
node->bp_offset = serialize_node_header_size(node) + sb_node_info.compressed_size + 4;
char *data = toku_xmalloc(total_node_size); char *data = toku_xmalloc(total_node_size);
char *curr_ptr = data; char *curr_ptr = data;
// now create the final serialized node // now create the final serialized node
...@@ -1118,12 +1105,14 @@ deserialize_brtnode_info( ...@@ -1118,12 +1105,14 @@ deserialize_brtnode_info(
node->nodesize = rbuf_int(&rb); node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb); node->flags = rbuf_int(&rb);
node->height = rbuf_int(&rb); node->height = rbuf_int(&rb);
node->n_children = rbuf_int(&rb);
// now create the basement nodes or childinfos, depending on whether this is a // now create the basement nodes or childinfos, depending on whether this is a
// leaf node or internal node // leaf node or internal node
// now the subtree_estimates // now the subtree_estimates
XMALLOC_N(node->n_children, node->bp);
// n_children is now in the header, and the allocation of node->bp is in deserialize_brtnode_from_rbuf.
assert(node->bp!=NULL); //
for (int i=0; i < node->n_children; i++) { for (int i=0; i < node->n_children; i++) {
SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i); SUBTREE_EST curr_se = &BP_SUBTREE_EST(node,i);
curr_se->nkeys = rbuf_ulonglong(&rb); curr_se->nkeys = rbuf_ulonglong(&rb);
...@@ -1159,11 +1148,6 @@ deserialize_brtnode_info( ...@@ -1159,11 +1148,6 @@ deserialize_brtnode_info(
} }
} }
// read the offsets
for (int i = 0; i < node->n_children; i++) {
BP_OFFSET(node,i) = rbuf_int(&rb);
}
// make sure that all the data was read // make sure that all the data was read
if (data_size != rb.ndone) { if (data_size != rb.ndone) {
dump_bad_block(rb.buf, rb.size); dump_bad_block(rb.buf, rb.size);
...@@ -1337,6 +1321,13 @@ deserialize_brtnode_from_rbuf( ...@@ -1337,6 +1321,13 @@ deserialize_brtnode_from_rbuf(
node->layout_version = node->layout_version_read_from_disk; node->layout_version = node->layout_version_read_from_disk;
node->layout_version_original = rbuf_int(rb); node->layout_version_original = rbuf_int(rb);
node->build_id = rbuf_int(rb); node->build_id = rbuf_int(rb);
node->n_children = rbuf_int(rb);
XMALLOC_N(node->n_children, node->bp);
// read the partition locations
for (int i=0; i<node->n_children; i++) {
BP_START(node,i) = rbuf_int(rb);
BP_SIZE (node,i) = rbuf_int(rb);
}
// verify checksum of header stored // verify checksum of header stored
checksum = x1764_memory(rb->buf, rb->ndone); checksum = x1764_memory(rb->buf, rb->ndone);
stored_checksum = rbuf_int(rb); stored_checksum = rbuf_int(rb);
...@@ -1352,13 +1343,6 @@ deserialize_brtnode_from_rbuf( ...@@ -1352,13 +1343,6 @@ deserialize_brtnode_from_rbuf(
deserialize_brtnode_info(&sb_node_info, node); deserialize_brtnode_info(&sb_node_info, node);
toku_free(sb_node_info.uncompressed_ptr); toku_free(sb_node_info.uncompressed_ptr);
//
// now that we have read and decompressed up until
// the start of the bp's, we can set the node->bp_offset
// so future partial fetches know where to get bp's
//
node->bp_offset = rb->ndone;
// now that the node info has been deserialized, we can proceed to deserialize // now that the node info has been deserialized, we can proceed to deserialize
// the individual sub blocks // the individual sub blocks
assert(bfe->type == brtnode_fetch_none || bfe->type == brtnode_fetch_subset || bfe->type == brtnode_fetch_all || bfe->type == brtnode_fetch_prefetch); assert(bfe->type == brtnode_fetch_none || bfe->type == brtnode_fetch_subset || bfe->type == brtnode_fetch_all || bfe->type == brtnode_fetch_prefetch);
...@@ -1368,14 +1352,16 @@ deserialize_brtnode_from_rbuf( ...@@ -1368,14 +1352,16 @@ deserialize_brtnode_from_rbuf(
// for partitions staying compressed, create sub_block // for partitions staying compressed, create sub_block
setup_brtnode_partitions(node,bfe); setup_brtnode_partitions(node,bfe);
for (int i = 0; i < node->n_children; i++) { // Previously, this code was a for loop with spawns inside and a sync at the end.
u_int32_t curr_offset = (i==0) ? 0 : BP_OFFSET(node,i-1); // But now the loop is parallelizable since we don't have a dependency on the work done so far.
u_int32_t curr_size = (i==0) ? BP_OFFSET(node,i) : (BP_OFFSET(node,i) - BP_OFFSET(node,i-1)); cilk_for (int i = 0; i < node->n_children; i++) {
u_int32_t curr_offset = BP_START(node,i);
u_int32_t curr_size = BP_SIZE(node,i);
// the compressed, serialized partitions start at where rb is currently pointing, // the compressed, serialized partitions start at where rb is currently pointing,
// which would be rb->buf + rb->ndone // which would be rb->buf + rb->ndone
// we need to initialize curr_rbuf to point to this place // we need to initialize curr_rbuf to point to this place
struct rbuf curr_rbuf = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf curr_rbuf = {.buf = NULL, .size = 0, .ndone = 0};
rbuf_init(&curr_rbuf, rb->buf + rb->ndone + curr_offset, curr_size); rbuf_init(&curr_rbuf, rb->buf + curr_offset, curr_size);
// //
// now we are at the point where we have: // now we are at the point where we have:
...@@ -1393,18 +1379,28 @@ deserialize_brtnode_from_rbuf( ...@@ -1393,18 +1379,28 @@ deserialize_brtnode_from_rbuf(
struct sub_block curr_sb; struct sub_block curr_sb;
sub_block_init(&curr_sb); sub_block_init(&curr_sb);
// case where we read and decompress the partition // curr_rbuf is passed by value to decompress_and_deserialize_worker, so there's no ugly race condition.
// This would be more obvious if curr_rbuf were an array.
// deserialize_brtnode_info figures out what the state // deserialize_brtnode_info figures out what the state
// should be and sets up the memory so that we are ready to use it // should be and sets up the memory so that we are ready to use it
if (BP_STATE(node,i) == PT_AVAIL) {
cilk_spawn decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i, bfe->cmp_extra, bfe->cmp); switch (BP_STATE(node,i)) {
} case PT_AVAIL:
// case where we read and decompress the partition
decompress_and_deserialize_worker(curr_rbuf, curr_sb, node, i, bfe->cmp_extra, bfe->cmp);
continue;
case PT_COMPRESSED:
// case where we leave the partition in the compressed state // case where we leave the partition in the compressed state
else if (BP_STATE(node,i) == PT_COMPRESSED) { check_and_copy_compressed_sub_block_worker(curr_rbuf, curr_sb, node, i);
cilk_spawn check_and_copy_compressed_sub_block_worker(curr_rbuf, curr_sb, node, i); continue;
case PT_INVALID: // this is really bad
case PT_ON_DISK: // it's supposed to be in memory.
assert(0);
continue;
} }
assert(0);
} }
cilk_sync;
*brtnode = node; *brtnode = node;
r = 0; r = 0;
cleanup: cleanup:
...@@ -1437,9 +1433,8 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode ...@@ -1437,9 +1433,8 @@ toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode
&total_node_disk_size &total_node_disk_size
); );
u_int32_t curr_offset = (childnum==0) ? 0 : BP_OFFSET(node,childnum-1); u_int32_t curr_offset = BP_START(node, childnum);
curr_offset += node->bp_offset; u_int32_t curr_size = BP_SIZE (node, childnum);
u_int32_t curr_size = (childnum==0) ? BP_OFFSET(node,childnum) : (BP_OFFSET(node,childnum) - BP_OFFSET(node,childnum-1));
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0}; struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
u_int8_t *XMALLOC_N(curr_size, raw_block); u_int8_t *XMALLOC_N(curr_size, raw_block);
......
...@@ -785,10 +785,7 @@ void toku_brtnode_pe_est_callback( ...@@ -785,10 +785,7 @@ void toku_brtnode_pe_est_callback(
// first get an estimate for how much space will be taken // first get an estimate for how much space will be taken
// after compression, it is simply the size of compressed // after compression, it is simply the size of compressed
// data on disk plus the size of the struct that holds it // data on disk plus the size of the struct that holds it
u_int32_t compressed_data_size = u_int32_t compressed_data_size = BP_SIZE(node, i);
((i==0) ?
BP_OFFSET(node,i) :
(BP_OFFSET(node,i) - BP_OFFSET(node,i-1)));
compressed_data_size += sizeof(struct sub_block); compressed_data_size += sizeof(struct sub_block);
// now get the space taken now // now get the space taken now
...@@ -1207,7 +1204,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num ...@@ -1207,7 +1204,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
n->childkeys = 0; n->childkeys = 0;
n->bp = 0; n->bp = 0;
n->n_children = num_children; n->n_children = num_children;
n->bp_offset = 0;
if (num_children > 0) { if (num_children > 0) {
XMALLOC_N(num_children-1, n->childkeys); XMALLOC_N(num_children-1, n->childkeys);
...@@ -1215,7 +1211,8 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num ...@@ -1215,7 +1211,8 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
for (int i = 0; i < num_children; i++) { for (int i = 0; i < num_children; i++) {
BP_BLOCKNUM(n,i).b=0; BP_BLOCKNUM(n,i).b=0;
BP_STATE(n,i) = PT_INVALID; BP_STATE(n,i) = PT_INVALID;
BP_OFFSET(n,i) = 0; BP_START(n,i) = 0;
BP_SIZE (n,i) = 0;
BP_SUBTREE_EST(n,i) = zero_estimates; BP_SUBTREE_EST(n,i) = zero_estimates;
BP_WORKDONE(n,i) = 0; BP_WORKDONE(n,i) = 0;
BP_INIT_TOUCHED_CLOCK(n, i); BP_INIT_TOUCHED_CLOCK(n, i);
...@@ -1379,7 +1376,8 @@ static void ...@@ -1379,7 +1376,8 @@ static void
init_childinfo(BRTNODE node, int childnum, BRTNODE child) { init_childinfo(BRTNODE node, int childnum, BRTNODE child) {
BP_BLOCKNUM(node,childnum) = child->thisnodename; BP_BLOCKNUM(node,childnum) = child->thisnodename;
BP_STATE(node,childnum) = PT_AVAIL; BP_STATE(node,childnum) = PT_AVAIL;
BP_OFFSET(node,childnum) = 0; BP_START(node,childnum) = 0;
BP_SIZE (node,childnum) = 0;
BP_SUBTREE_EST(node,childnum) = zero_estimates; BP_SUBTREE_EST(node,childnum) = zero_estimates;
BP_WORKDONE(node, childnum) = 0; BP_WORKDONE(node, childnum) = 0;
set_BNC(node, childnum, toku_create_empty_nl()); set_BNC(node, childnum, toku_create_empty_nl());
...@@ -1605,10 +1603,10 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, ...@@ -1605,10 +1603,10 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
REALLOC_N(num_children_in_b, B->bp); REALLOC_N(num_children_in_b, B->bp);
B->n_children = num_children_in_b; B->n_children = num_children_in_b;
for (int i = 0; i < num_children_in_b; i++) { for (int i = 0; i < num_children_in_b; i++) {
BP_STATE(B,i) = PT_AVAIL;
BP_OFFSET(B,i) = 0;
BP_BLOCKNUM(B,i).b = 0; BP_BLOCKNUM(B,i).b = 0;
BP_SUBTREE_EST(B,i)= zero_estimates; BP_STATE(B,i) = PT_AVAIL;
BP_START(B,i) = 0;
BP_SIZE(B,i) = 0;
BP_WORKDONE(B,i) = 0; BP_WORKDONE(B,i) = 0;
set_BLB(B, i, toku_create_empty_bn()); set_BLB(B, i, toku_create_empty_bn());
} }
...@@ -1834,7 +1832,8 @@ handle_split_of_child (BRT UU(t), BRTNODE node, int childnum, ...@@ -1834,7 +1832,8 @@ handle_split_of_child (BRT UU(t), BRTNODE node, int childnum,
BP_SUBTREE_EST(node,childnum+1) = zero_estimates; BP_SUBTREE_EST(node,childnum+1) = zero_estimates;
BP_WORKDONE(node, childnum+1) = 0; BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL; BP_STATE(node,childnum+1) = PT_AVAIL;
BP_OFFSET(node,childnum+1) = 0; BP_START(node,childnum+1) = 0;
BP_SIZE(node,childnum+1) = 0;
fixup_child_estimates(node, childnum, childa, TRUE); fixup_child_estimates(node, childnum, childa, TRUE);
fixup_child_estimates(node, childnum+1, childb, TRUE); fixup_child_estimates(node, childnum+1, childb, TRUE);
......
...@@ -270,7 +270,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { ...@@ -270,7 +270,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION); assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0); assert(dn->height == 0);
assert(dn->n_children>=1); assert(dn->n_children>=1);
assert(dn->bp_offset > 0);
assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn); assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
...@@ -279,9 +278,10 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) { ...@@ -279,9 +278,10 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(BLB_MAX_MSN_APPLIED(dn, i).msn == POSTSERIALIZE_MSN_ON_DISK.msn); assert(BLB_MAX_MSN_APPLIED(dn, i).msn == POSTSERIALIZE_MSN_ON_DISK.msn);
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
u_int32_t keylen; u_int32_t keylen;
...@@ -401,16 +401,16 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) { ...@@ -401,16 +401,16 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION); assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(keylens*(npartitions-1))); assert(dn->totalchildkeylens==(keylens*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
...@@ -520,16 +520,16 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) { ...@@ -520,16 +520,16 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION); assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(sizeof(int)*(npartitions-1))); assert(dn->totalchildkeylens==(sizeof(int)*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp }; struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
...@@ -645,7 +645,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { ...@@ -645,7 +645,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION); assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION); assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(npartitions == 7); assert(npartitions == 7);
...@@ -653,9 +652,10 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) { ...@@ -653,9 +652,10 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
struct check_leafentries_struct extra = { .nelts = 7, .elts = les, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 7, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
...@@ -777,16 +777,16 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) { ...@@ -777,16 +777,16 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION); assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0); assert(dn->height == 0);
assert(dn->n_children>0); assert(dn->n_children>0);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1))); assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
...@@ -894,16 +894,16 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type ...@@ -894,16 +894,16 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION); assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0); assert(dn->height == 0);
assert(dn->n_children == 1); assert(dn->n_children == 1);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1))); assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 0, .elts = NULL, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 0, .elts = NULL, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
assert(toku_omt_size(BLB_BUFFER(dn, i)) == 0); assert(toku_omt_size(BLB_BUFFER(dn, i)) == 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
...@@ -1018,16 +1018,16 @@ test_serialize_leaf(enum brtnode_verify_type bft) { ...@@ -1018,16 +1018,16 @@ test_serialize_leaf(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION); assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0); assert(dn->height == 0);
assert(dn->n_children>=1); assert(dn->n_children>=1);
assert(dn->bp_offset > 0);
{ {
const u_int32_t npartitions = dn->n_children; const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1))); assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp }; struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0; u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) { for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0); assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) { if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset); assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
} }
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra); toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
u_int32_t keylen; u_int32_t keylen;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment