Commit 9acace70 authored by Bradley C. Kuszmaul, committed by Yoni Fogel

close[t:4056] Fix #4056. (Leafnode partition now allows for aligned and...

close[t:4056] Fix #4056.  (Leafnode partition now allows for aligned and partial I/O, or even reordering the partitions to pack them more tightly).

git-svn-id: file:///svn/toku/tokudb@35821 c7de825b-a66e-492c-adef-691d508d4ae1
parent 3774a503
......@@ -252,12 +252,13 @@ struct brtnode_leaf_basement_node {
bool stale_ancestor_messages_applied;
};
#define PT_INVALID 0
#define PT_ON_DISK 1
#define PT_COMPRESSED 2
#define PT_AVAIL 3
// State of a brtnode partition, stored in the partition's `state` field.
enum __attribute__((__packed__)) pt_state { // declare this to be packed so that when used below it will only take 1 byte.
PT_INVALID = 0, // the node was just initialized and the partition pointer is NULL
PT_ON_DISK = 1, // NOTE(review): presumably the partition has not been read into memory; its description is outside this excerpt -- confirm
PT_COMPRESSED = 2, // the partition is compressed in memory; it must be decompressed before use
PT_AVAIL = 3}; // the partition is decompressed and in memory
enum brtnode_child_tag {
enum __attribute__((__packed__)) brtnode_child_tag {
BCT_INVALID = 0,
BCT_NULL,
BCT_SUBBLOCK,
......@@ -266,7 +267,7 @@ enum brtnode_child_tag {
};
typedef struct __attribute__((__packed__)) brtnode_child_pointer {
u_int8_t tag;
enum brtnode_child_tag tag;
union {
struct sub_block *subblock;
struct brtnode_nonleaf_childinfo *nonleaf;
......@@ -289,12 +290,15 @@ struct __attribute__((__packed__)) brtnode_partition {
// PT_COMPRESSED - means that the partition is compressed in memory. To use, must decompress
// PT_AVAIL - means the partition is decompressed and in memory
//
u_int8_t state;
enum pt_state state; // make this an enum to make debugging easier.
//
// stores the offset to the end of the partition on disk from the brtnode, needed to read a partition off of disk
// stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk
// the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
//
u_int32_t offset;
// The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
// The SIZE is the size of the compressed partition.
// Rationale: We cannot store the size from the beginning of the node since we don't know how big the header will be.
// However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
u_int32_t start,size;
//
// pointer to the partition. Depending on the state, they may be different things
// if state == PT_INVALID, then the node was just initialized and ptr == NULL
......@@ -331,8 +335,6 @@ struct brtnode {
unsigned int totalchildkeylens;
struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Child 1's keys are > childkeys[0]. */
u_int32_t bp_offset; // offset on disk to where the partitions start
// array of size n_children, consisting of brtnode partitions
// each one is associated with a child
// for internal nodes, the ith partition corresponds to the ith message buffer
......@@ -346,7 +348,8 @@ struct brtnode {
#define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash)
#define BP_FULLHASH(node,i) ((node)->bp[i].fullhash)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_OFFSET(node,i) ((node)->bp[i].offset)
#define BP_START(node,i) ((node)->bp[i].start)
#define BP_SIZE(node,i) ((node)->bp[i].size)
#define BP_SUBTREE_EST(node,i) ((node)->bp[i].subtree_estimates)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone)
......
This diff is collapsed.
......@@ -785,10 +785,7 @@ void toku_brtnode_pe_est_callback(
// first get an estimate for how much space will be taken
// after compression, it is simply the size of compressed
// data on disk plus the size of the struct that holds it
u_int32_t compressed_data_size =
((i==0) ?
BP_OFFSET(node,i) :
(BP_OFFSET(node,i) - BP_OFFSET(node,i-1)));
u_int32_t compressed_data_size = BP_SIZE(node, i);
compressed_data_size += sizeof(struct sub_block);
// now get the space taken now
......@@ -1207,7 +1204,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
n->childkeys = 0;
n->bp = 0;
n->n_children = num_children;
n->bp_offset = 0;
if (num_children > 0) {
XMALLOC_N(num_children-1, n->childkeys);
......@@ -1215,7 +1211,8 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
for (int i = 0; i < num_children; i++) {
BP_BLOCKNUM(n,i).b=0;
BP_STATE(n,i) = PT_INVALID;
BP_OFFSET(n,i) = 0;
BP_START(n,i) = 0;
BP_SIZE (n,i) = 0;
BP_SUBTREE_EST(n,i) = zero_estimates;
BP_WORKDONE(n,i) = 0;
BP_INIT_TOUCHED_CLOCK(n, i);
......@@ -1379,7 +1376,8 @@ static void
init_childinfo(BRTNODE node, int childnum, BRTNODE child) {
BP_BLOCKNUM(node,childnum) = child->thisnodename;
BP_STATE(node,childnum) = PT_AVAIL;
BP_OFFSET(node,childnum) = 0;
BP_START(node,childnum) = 0;
BP_SIZE (node,childnum) = 0;
BP_SUBTREE_EST(node,childnum) = zero_estimates;
BP_WORKDONE(node, childnum) = 0;
set_BNC(node, childnum, toku_create_empty_nl());
......@@ -1605,10 +1603,10 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
REALLOC_N(num_children_in_b, B->bp);
B->n_children = num_children_in_b;
for (int i = 0; i < num_children_in_b; i++) {
BP_STATE(B,i) = PT_AVAIL;
BP_OFFSET(B,i) = 0;
BP_BLOCKNUM(B,i).b = 0;
BP_SUBTREE_EST(B,i)= zero_estimates;
BP_STATE(B,i) = PT_AVAIL;
BP_START(B,i) = 0;
BP_SIZE(B,i) = 0;
BP_WORKDONE(B,i) = 0;
set_BLB(B, i, toku_create_empty_bn());
}
......@@ -1834,7 +1832,8 @@ handle_split_of_child (BRT UU(t), BRTNODE node, int childnum,
BP_SUBTREE_EST(node,childnum+1) = zero_estimates;
BP_WORKDONE(node, childnum+1) = 0;
BP_STATE(node,childnum+1) = PT_AVAIL;
BP_OFFSET(node,childnum+1) = 0;
BP_START(node,childnum+1) = 0;
BP_SIZE(node,childnum+1) = 0;
fixup_child_estimates(node, childnum, childa, TRUE);
fixup_child_estimates(node, childnum+1, childb, TRUE);
......
......@@ -270,7 +270,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->n_children>=1);
assert(dn->bp_offset > 0);
assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn);
{
const u_int32_t npartitions = dn->n_children;
......@@ -279,9 +278,10 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft) {
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(BLB_MAX_MSN_APPLIED(dn, i).msn == POSTSERIALIZE_MSN_ON_DISK.msn);
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
u_int32_t keylen;
......@@ -401,16 +401,16 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(keylens*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
......@@ -520,16 +520,16 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(sizeof(int)*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = nrows, .elts = les, .i = 0, .cmp = omt_int_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
......@@ -645,7 +645,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
assert(dn->layout_version ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(npartitions == 7);
......@@ -653,9 +652,10 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft) {
struct check_leafentries_struct extra = { .nelts = 7, .elts = les, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
......@@ -777,16 +777,16 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->n_children>0);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
assert(toku_omt_size(BLB_BUFFER(dn, i)) > 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
......@@ -894,16 +894,16 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->n_children == 1);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 0, .elts = NULL, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
assert(toku_omt_size(BLB_BUFFER(dn, i)) == 0);
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
......@@ -1018,16 +1018,16 @@ test_serialize_leaf(enum brtnode_verify_type bft) {
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->n_children>=1);
assert(dn->bp_offset > 0);
{
const u_int32_t npartitions = dn->n_children;
assert(dn->totalchildkeylens==(2*(npartitions-1)));
struct check_leafentries_struct extra = { .nelts = 3, .elts = elts, .i = 0, .cmp = omt_cmp };
u_int32_t last_i = 0;
for (u_int32_t i = 0; i < npartitions; ++i) {
assert(dn->bp[i].offset > 0);
assert(dn->bp[i].start > 0);
assert(dn->bp[i].size > 0);
if (i > 0) {
assert(dn->bp[i].offset > dn->bp[i-1].offset);
assert(dn->bp[i].start >= dn->bp[i-1].start + dn->bp[i-1].size);
}
toku_omt_iterate(BLB_BUFFER(dn, i), check_leafentries, &extra);
u_int32_t keylen;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment