Commit 8d0368fd authored by Leif Walsh, committed by Yoni Fogel

[t:3983] merging auto-upgrade to mainline

git-svn-id: file:///svn/toku/tokudb@41591 c7de825b-a66e-492c-adef-691d508d4ae1
parent 1974ace7
......@@ -755,11 +755,12 @@ translation_default(struct translation *t) { // destination into which to creat
}
static void
static enum deserialize_error_code
translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, //Location of translation_buffer
u_int64_t size_on_disk,
unsigned char * translation_buffer) { // buffer with serialized translation
enum deserialize_error_code e;
assert(location_on_disk!=0);
t->type = TRANSLATION_CHECKPOINTED;
{
......@@ -768,7 +769,11 @@ translation_deserialize_from_buffer(struct translation *t, // destination int
u_int64_t offset = size_on_disk - 4;
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
assert(x1764 == stored_x1764);
if (x1764 != stored_x1764) {
fprintf(stderr, "Translation table checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
e = DS_XSUM_FAIL;
goto exit;
}
}
struct rbuf rt;
rt.buf = translation_buffer;
......@@ -789,6 +794,9 @@ PRNTF("ReadIn", i, t->block_translation[i].size, t->block_translation[i].u.disko
assert(calculate_size_on_disk(t) == (int64_t)size_on_disk);
assert(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t)size_on_disk);
assert(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk);
e = DS_OK;
exit:
return e;
}
// We just initialized a translation, inform block allocator to reserve space for each blocknum in use.
......@@ -817,17 +825,22 @@ blocktable_note_translation (BLOCK_ALLOCATOR allocator, struct translation *t) {
// The one read from disk is the last known checkpointed one, so we are keeping it in
// place and then setting current (which is never stored on disk) for current use.
// The translation_buffer has translation only, we create the rest of the block_table.
void
enum deserialize_error_code
toku_blocktable_create_from_buffer(BLOCK_TABLE *btp,
DISKOFF location_on_disk, //Location of translation_buffer
DISKOFF size_on_disk,
unsigned char *translation_buffer) {
BLOCK_TABLE bt = blocktable_create_internal();
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer);
enum deserialize_error_code e = translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer);
if (e != DS_OK) {
goto exit;
}
blocktable_note_translation(bt->block_allocator, &bt->checkpointed);
// we just filled in checkpointed, now copy it to current.
copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT);
*btp = bt;
exit:
return e;
}
......
......@@ -5,6 +5,8 @@
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <brttypes.h>
#if defined(__cplusplus) || defined(__cilkplusplus)
extern "C" {
#endif
......@@ -21,7 +23,7 @@ struct block_translation_pair {
};
void toku_blocktable_create_new(BLOCK_TABLE *btp);
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
enum deserialize_error_code toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
void toku_blocktable_destroy(BLOCK_TABLE *btp);
void toku_brtheader_lock(struct brt_header *h);
......
......@@ -251,7 +251,6 @@ struct brtnode {
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
int dirty;
u_int32_t fullhash;
uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade
int n_children; //for internal nodes, if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced.
// for leaf nodes, represents number of basement nodes
unsigned int totalchildkeylens;
......@@ -377,9 +376,6 @@ struct brt_header {
uint64_t time_of_creation; // time this tree was created
uint64_t time_of_last_modification; // last time this header was serialized to disk (read from disk, overwritten when written to disk)
uint64_t time_of_last_verification; // last time that this tree was verified
BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been)
int64_t num_blocks_to_upgrade_13; // Number of v13 blocks still not newest version.
int64_t num_blocks_to_upgrade_14; // Number of v14 blocks still not newest version.
unsigned int nodesize;
unsigned int basementnodesize;
// this field is protected by tree_lock, see comment for tree_lock
......@@ -411,6 +407,10 @@ struct brt_header {
uint32_t count_of_optimize_in_progress_read_from_disk; // the number of hot optimize operations in progress on this tree at the time of the last crash (this field is in-memory only)
MSN msn_at_start_of_last_completed_optimize; // all messages before this msn have been applied to leaf nodes
enum toku_compression_method compression_method;
// Current Minimum MSN to be used when upgrading pre-MSN BRT's.
// This is decremented from our current MIN_MSN so as not to clash
// with any existing 'normal' MSN's.
MSN highest_unused_msn_for_upgrade;
};
struct brt {
......@@ -479,7 +479,7 @@ toku_brt_header_init(struct brt_header *h,
int toku_serialize_brt_header_size (struct brt_header *h);
int toku_serialize_brt_header_to (int fd, struct brt_header *h);
int toku_serialize_brt_header_to_wbuf (struct wbuf *, struct brt_header *h, int64_t address_translation, int64_t size_translation);
int toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_header **brth);
enum deserialize_error_code toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_header **brth);
int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset);
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);
BASEMENTNODE toku_create_empty_bn(void);
......@@ -818,7 +818,6 @@ int toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **er
int toku_brtheader_begin_checkpoint (LSN checkpoint_lsn, void *header_v) __attribute__((__warn_unused_result__));
int toku_brtheader_checkpoint (CACHEFILE cachefile, int fd, void *header_v) __attribute__((__warn_unused_result__));
int toku_brtheader_end_checkpoint (CACHEFILE cachefile, int fd, void *header_v) __attribute__((__warn_unused_result__));
int toku_maybe_upgrade_brt(BRT t) __attribute__((__warn_unused_result__));
int toku_db_badformat(void) __attribute__((__warn_unused_result__));
int toku_brt_remove_on_commit(TOKUTXN child, DBT* iname_dbt_p) __attribute__((__warn_unused_result__));
......@@ -826,10 +825,6 @@ int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p) __attribute__((__warn_u
typedef enum {
BRT_UPGRADE_FOOTPRINT = 0,
BRT_UPGRADE_HEADER_13, // how many headers were upgraded from version 13
BRT_UPGRADE_NONLEAF_13,
BRT_UPGRADE_LEAF_13,
BRT_UPGRADE_OPTIMIZED_FOR_UPGRADE, // how many optimize_for_upgrade messages were sent
BRT_UPGRADE_STATUS_NUM_ROWS
} brt_upgrade_status_entry;
......@@ -937,7 +932,6 @@ brt_leaf_put_cmd (
BRTNODE leafnode,
BASEMENTNODE bn,
BRT_MSG cmd,
bool* made_change,
uint64_t *workdone,
OMT snapshot_txnids,
OMT live_list_reverse
......@@ -949,7 +943,6 @@ void toku_apply_cmd_to_leaf(
DESCRIPTOR desc,
BRTNODE node,
BRT_MSG cmd,
bool *made_change,
uint64_t *workdone,
OMT snapshot_txnids,
OMT live_list_reverse
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
......@@ -34,10 +34,6 @@ status_init(void)
// Note, this function initializes the keyname, type, and legend fields.
// Value fields are initialized to zero by compiler.
UPGRADE_STATUS_INIT(BRT_UPGRADE_FOOTPRINT, UINT64, "footprint");
UPGRADE_STATUS_INIT(BRT_UPGRADE_HEADER_13, UINT64, "V13 headers");
UPGRADE_STATUS_INIT(BRT_UPGRADE_NONLEAF_13, UINT64, "V13 nonleaf nodes");
UPGRADE_STATUS_INIT(BRT_UPGRADE_LEAF_13, UINT64, "V13 leaf nodes");
UPGRADE_STATUS_INIT(BRT_UPGRADE_OPTIMIZED_FOR_UPGRADE, UINT64, "optimized for upgrade");
brt_upgrade_status.initialized = true;
}
#undef UPGRADE_STATUS_INIT
......@@ -409,7 +405,6 @@ serialize_brtnode_info_size(BRTNODE node)
retval += 4; // nodesize
retval += 4; // flags
retval += 4; // height;
retval += 4; // optimized_for_upgrade
retval += node->totalchildkeylens; // total length of pivots
retval += (node->n_children-1)*4; // encode length of each pivot
if (node->height > 0) {
......@@ -434,7 +429,6 @@ static void serialize_brtnode_info(BRTNODE node,
wbuf_nocrc_uint(&wb, node->nodesize);
wbuf_nocrc_uint(&wb, node->flags);
wbuf_nocrc_int (&wb, node->height);
wbuf_nocrc_int (&wb, node->optimized_for_upgrade);
// pivot information
for (int i = 0; i < node->n_children-1; i++) {
wbuf_nocrc_bytes(&wb, kv_pair_key(node->childkeys[i]), toku_brt_pivot_key_len(node->childkeys[i]));
......@@ -554,7 +548,7 @@ rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize)
bn_sizes[0] = 0;
// TODO 4050: All these arrays should be combined into a single array of some bn_info struct (pivot, msize, num_les).
// Each entry is the number of leafentries in this basement. (Again, num_le is overkill upper bound.)
// Each entry is the number of leafentries in this basement. (Again, num_le is overkill upper bound.)
uint32_t *XMALLOC_N(num_alloc, num_les_this_bn);
num_les_this_bn[0] = 0;
......@@ -740,7 +734,7 @@ serialize_and_compress(BRTNODE node, int npartitions, struct sub_block sb[]) {
//
int
toku_serialize_brtnode_to_memory (BRTNODE node,
BRTNODE_DISK_DATA* ndd,
BRTNODE_DISK_DATA* ndd,
unsigned int basementnodesize,
BOOL do_rebalancing,
/*out*/ size_t *n_bytes_to_write,
......@@ -1195,7 +1189,9 @@ deserialize_brtnode_info(
node->nodesize = rbuf_int(&rb);
node->flags = rbuf_int(&rb);
node->height = rbuf_int(&rb);
node->optimized_for_upgrade = rbuf_int(&rb);
if (node->layout_version_read_from_disk < BRT_LAYOUT_VERSION_19) {
(void) rbuf_int(&rb); // optimized_for_upgrade
}
// now create the basement nodes or childinfos, depending on whether this is a
// leaf node or internal node
......@@ -1249,13 +1245,11 @@ setup_available_brtnode_partition(BRTNODE node, int i) {
}
}
static void setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* bfe, bool data_in_memory)
// Effect: Used when reading a brtnode into main memory, this sets up the partitions.
// We set bfe->child_to_read as well as the BP_STATE and the data pointers (e.g., with set_BSB or set_BNULL or other set_ operations).
// Arguments: Node: the node to set up.
// bfe: Describes the key range needed.
// data_in_memory: true if we have all the data (in which case we set the BP_STATE to be either PT_AVAIL or PT_COMPRESSED depending on the bfe.
// false if we don't have the partitions in main memory (in which case we set the state to PT_ON_DISK.
// Assign the child_to_read member of the bfe from the given brt node
// that has been brought into memory.
static void
update_bfe_using_brtnode(BRTNODE node, struct brtnode_fetch_extra *bfe)
{
if (bfe->type == brtnode_fetch_subset && bfe->search != NULL) {
// we do not take into account prefetching yet
......@@ -1271,6 +1265,17 @@ static void setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* b
bfe->search
);
}
}
// Using the search parameters in the bfe, this function will
// initialize all of the given brt node's partitions.
static void
setup_partitions_using_bfe(BRTNODE node,
struct brtnode_fetch_extra *bfe,
bool data_in_memory)
{
// Leftmost and Rightmost Child bounds.
int lc, rc;
if (bfe->type == brtnode_fetch_subset || bfe->type == brtnode_fetch_prefetch) {
lc = toku_bfe_leftmost_child_wanted(bfe, node);
......@@ -1279,6 +1284,7 @@ static void setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* b
lc = -1;
rc = -1;
}
//
// setup memory needed for the node
//
......@@ -1311,6 +1317,23 @@ static void setup_brtnode_partitions(BRTNODE node, struct brtnode_fetch_extra* b
}
}
// Effect: Used when a brtnode is being read into main memory; prepares the
//  node's partitions in two steps: first bfe->child_to_read is set, then the
//  BP_STATE and data pointers of the partitions are set up (e.g., with
//  set_BSB or set_BNULL or other set_ operations).
// Arguments:
//  node:           the node whose partitions are to be set up.
//  bfe:            describes the key range needed.
//  data_in_memory: true if we have all the data (the BP_STATE then becomes
//                  either PT_AVAIL or PT_COMPRESSED, depending on the bfe);
//                  false if the partitions are not in main memory (the
//                  state is then set to PT_ON_DISK).
static void
setup_brtnode_partitions(BRTNODE node,
                         struct brtnode_fetch_extra *bfe,
                         bool data_in_memory)
{
    // Step 1: record in the bfe which child is to be read.
    update_bfe_using_brtnode(node, bfe);

    // Step 2: initialize every partition according to that bfe.
    setup_partitions_using_bfe(node, bfe, data_in_memory);
}
/* deserialize the partition from the sub-block's uncompressed buffer
* and destroy the uncompressed buffer
*/
......@@ -1430,7 +1453,16 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod
goto cleanup;
}
node->layout_version = node->layout_version_read_from_disk;
// Upgrade from 18 to 19.
if (node->layout_version_read_from_disk == BRT_LAYOUT_VERSION_18) {
node->layout_version = BRT_LAYOUT_VERSION;
} else {
// If the version is greater than the first version with
// basement nodes, but not version 18, then just use the old
// behavior.
node->layout_version = node->layout_version_read_from_disk;
}
node->layout_version_original = rbuf_int(rb);
node->build_id = rbuf_int(rb);
node->n_children = rbuf_int(rb);
......@@ -1531,6 +1563,473 @@ static int deserialize_brtnode_header_from_rbuf_if_small_enough (BRTNODE *brtnod
return r;
}
// This function takes a deserialized version 13 or 14 buffer and
// constructs the associated internal, non-leaf brtnode object.  It
// also creates MSN's for older messages created in older versions
// that did not generate MSN's for messages.  These new MSN's are
// generated from the root downwards, counting backwards from MIN_MSN
// and persisted in the brt header.
//
// Arguments:
//  node: the node being filled in; layout_version_read_from_disk must
//        already be set, since it selects which legacy fields to skip.
//  rb:   the decompressed legacy node, positioned just past the fields
//        common to leaf and nonleaf nodes.
//  bfe:  fetch extra of the caller; supplies the header (comparison
//        function, descriptor and the MSN counter used for upgrades).
// Returns: 0 on success, or the result of toku_db_badformat() when the
//  end-to-end checksum stored in the buffer does not match.
static int
deserialize_and_upgrade_internal_node(BRTNODE node,
                                      struct rbuf *rb,
                                      struct brtnode_fetch_extra* bfe)
{
    int r = 0;
    int version = node->layout_version_read_from_disk;

    if(version == BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
        (void) rbuf_int(rb);                          // 6. fingerprint
    }

    node->n_children = rbuf_int(rb);                  // 7. n_children

    // Sub-tree estimates...
    for (int i = 0; i < node->n_children; ++i) {
        if (version == BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
            (void) rbuf_int(rb);                      // 8. fingerprint
        }
        (void) rbuf_ulonglong(rb);                    // 9. nkeys (ulonglong)
        (void) rbuf_ulonglong(rb);                    // 10. ndata (ulonglong)
        (void) rbuf_ulonglong(rb);                    // 11. dsize (ulonglong)
        (void) rbuf_char(rb);                         // 12. exact (char)
    }

    node->childkeys = NULL;
    node->totalchildkeylens = 0;
    // I. Allocate keys based on number of children.
    XMALLOC_N(node->n_children - 1, node->childkeys);
    // II. Copy keys from buffer to allocated keys in brtnode.
    for (int i = 0; i < node->n_children - 1; ++i) {
        // 13. child key pointers and offsets
        bytevec childkeyptr;
        unsigned int cklen;
        rbuf_bytes(rb, &childkeyptr, &cklen);
        node->childkeys[i] = kv_pair_malloc((void*)childkeyptr,
                                            cklen,
                                            0,
                                            0);
        node->totalchildkeylens += toku_brt_pivot_key_len(node->childkeys[i]);
    }

    // Create space for the child node buffers (a.k.a. partitions).
    XMALLOC_N(node->n_children, node->bp);

    // Set the child blocknums.
    for (int i = 0; i < node->n_children; ++i) {
        // 14. blocknums
        BP_BLOCKNUM(node, i) = rbuf_blocknum(rb);
        BP_WORKDONE(node, i) = 0;
    }

    // Read in the child buffer maps.  The maps are only consumed to
    // advance the rbuf; their contents are not used afterwards.
    struct sub_block_map child_buffer_map[node->n_children];
    for (int i = 0; i < node->n_children; ++i) {
        // The following fields are read in the
        // sub_block_map_deserialize() call:
        // 15. index 16. offset 17. size
        sub_block_map_deserialize(&child_buffer_map[i], rb);
    }

    // We need to setup this node's partitions, but we can't call the
    // existing call (setup_brtnode_partitions) because there are
    // existing optimizations that would prevent us from bringing all
    // of this node's partitions into memory.  Instead, we use the
    // existing bfe and node to set the bfe's child_to_read member.
    // Then we create a temporary bfe that needs all the nodes to make
    // sure we properly initialize our partitions before filling them
    // in from our soon-to-be-upgraded node.
    update_bfe_using_brtnode(node, bfe);
    // Zero-initialize every other field of the temporary bfe so that
    // nothing downstream can read indeterminate memory.
    struct brtnode_fetch_extra temp_bfe = { .type = brtnode_fetch_all };
    setup_partitions_using_bfe(node, &temp_bfe, true);

    // Cache the highest MSN generated for the message buffers.  This
    // will be set in the brtnode.
    //
    // The way we choose MSNs for upgraded messages is delicate.  The
    // field `highest_unused_msn_for_upgrade' in the header is always an
    // MSN that no message has yet.  So when we have N messages that need
    // MSNs, we decrement it by N, and then use it and the N-1 MSNs less
    // than it, but we do not use the value we decremented it to.
    //
    // In the code below, we initialize `lowest' with the value of
    // `highest_unused_msn_for_upgrade' after it is decremented, so we
    // need to be sure to increment it once before we enqueue our first
    // message.
    MSN highest_msn;
    highest_msn.msn = 0;

    // Deserialize de-compressed buffers.
    for (int i = 0; i < node->n_children; ++i) {
        NONLEAF_CHILDINFO bnc = BNC(node, i);
        int n_bytes_in_buffer = 0;
        int n_in_this_buffer = rbuf_int(rb);

        // The offset arrays are only allocated (and only used) when the
        // header has a comparison function; keep them NULL otherwise.
        void **fresh_offsets = NULL;
        void **broadcast_offsets = NULL;
        int nfresh = 0;
        int nbroadcast_offsets = 0;

        if (bfe->h->compare_fun) {
            XMALLOC_N(n_in_this_buffer, fresh_offsets);
            // We skip 'stale' offsets for upgraded nodes.
            XMALLOC_N(n_in_this_buffer, broadcast_offsets);
        }

        // Atomically decrement the header's MSN count by the number
        // of messages in the buffer.
        MSN lowest;
        u_int64_t amount = n_in_this_buffer;
        lowest.msn = __sync_sub_and_fetch(&bfe->h->highest_unused_msn_for_upgrade.msn, amount);
        if (highest_msn.msn == 0) {
            highest_msn.msn = lowest.msn + n_in_this_buffer;
        }

        // Create the FIFO entries from the deserialized buffer.
        for (int j = 0; j < n_in_this_buffer; ++j) {
            bytevec key; ITEMLEN keylen;
            bytevec val; ITEMLEN vallen;
            unsigned char ctype = rbuf_char(rb);
            enum brt_msg_type type = (enum brt_msg_type) ctype;
            XIDS xids;
            xids_create_from_buffer(rb, &xids);
            rbuf_bytes(rb, &key, &keylen);
            rbuf_bytes(rb, &val, &vallen);

            // <CER> can we factor this out?
            // `dest' stays NULL when there is no comparison function, and
            // is never left indeterminate even if the assert is compiled out.
            long *dest = NULL;
            if (bfe->h->compare_fun) {
                if (brt_msg_type_applies_once(type)) {
                    dest = (long *) &fresh_offsets[nfresh];
                    nfresh++;
                } else if (brt_msg_type_applies_all(type) || brt_msg_type_does_nothing(type)) {
                    dest = (long *) &broadcast_offsets[nbroadcast_offsets];
                    nbroadcast_offsets++;
                } else {
                    assert(false);
                }
            }

            // Increment our MSN, the last message should have the
            // newest/highest MSN.  See above for a full explanation.
            lowest.msn++;
            r = toku_fifo_enq(bnc->buffer,
                              key,
                              keylen,
                              val,
                              vallen,
                              type,
                              lowest,
                              xids,
                              true,
                              dest);
            lazy_assert_zero(r);
            n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + xids_get_serialize_size(xids);
            xids_destroy(&xids);
        }

        if (bfe->h->compare_fun) {
            struct toku_fifo_entry_key_msn_cmp_extra extra = { .desc = &bfe->h->cmp_descriptor,
                                                               .cmp = bfe->h->compare_fun,
                                                               .fifo = bnc->buffer };
            // Only the fresh offsets are sorted here.
            r = mergesort_r(fresh_offsets,
                            nfresh,
                            sizeof fresh_offsets[0],
                            &extra,
                            toku_fifo_entry_key_msn_cmp);
            assert_zero(r);
            toku_omt_destroy(&bnc->fresh_message_tree);
            r = toku_omt_create_steal_sorted_array(&bnc->fresh_message_tree,
                                                   &fresh_offsets,
                                                   nfresh,
                                                   n_in_this_buffer);
            assert_zero(r);
            toku_omt_destroy(&bnc->broadcast_list);
            r = toku_omt_create_steal_sorted_array(&bnc->broadcast_list,
                                                   &broadcast_offsets,
                                                   nbroadcast_offsets,
                                                   n_in_this_buffer);
            assert_zero(r);
        }
        bnc->n_bytes_in_buffer = n_bytes_in_buffer;
    }

    // Assign the highest msn from our upgrade message FIFO queues.
    node->max_msn_applied_to_node_on_disk = highest_msn;
    // Since we assigned MSNs to this node's messages, we need to dirty it.
    node->dirty = 1;

    // Must compute the checksum now (rather than at the end, while we
    // still have the pointer to the buffer).
    if (version >= BRT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
        u_int32_t expected_xsum = toku_dtoh32(*(u_int32_t*)(rb->buf+rb->size-4));
        u_int32_t actual_xsum   = x1764_memory(rb->buf, rb->size-4);
        if (expected_xsum != actual_xsum) {
            fprintf(stderr, "%s:%d: Bad checksum: expected = %"PRIx32", actual= %"PRIx32"\n",
                    __FUNCTION__,
                    __LINE__,
                    expected_xsum,
                    actual_xsum);
            fflush(stderr);
            return toku_db_badformat();
        }
    }

    return r;
}
// This function takes a deserialized version 13 or 14 buffer and
// constructs the associated leaf brtnode object.
//
// Arguments:
//  node: the node being filled in; layout_version_read_from_disk must
//        already be set, since it selects which legacy fields to skip
//        and which leafentry format is parsed.
//  rb:   the decompressed legacy node, positioned just past the fields
//        common to leaf and nonleaf nodes.
//  bfe:  fetch extra of the caller; supplies the header.
// Returns: 0 on success.  If the stored checksum does not match, or if
//  the buffer was not consumed exactly, a diagnostic is printed to
//  stderr and the result of toku_db_badformat() is returned (the same
//  reporting the internal-node upgrade path uses for a bad checksum).
static int
deserialize_and_upgrade_leaf_node(BRTNODE node,
                                  struct rbuf *rb,
                                  struct brtnode_fetch_extra* bfe)
{
    int r = 0;
    int version = node->layout_version_read_from_disk;

    // This is a leaf node, so the offsets in the buffer will be
    // different from the internal node offsets above.
    (void) rbuf_ulonglong(rb);                        // 6. nkeys
    (void) rbuf_ulonglong(rb);                        // 7. ndata
    (void) rbuf_ulonglong(rb);                        // 8. dsize

    if (version == BRT_LAYOUT_VERSION_14) {
        (void) rbuf_int(rb);                          // 9. optimized_for_upgrade
    }

    // 10. npartitions - This is really the number of leaf entries in
    // our single basement node.  There should only be 1 (ONE)
    // partition, so there shouldn't be any pivot key stored.  This
    // means the loop will not iterate.  We could remove the loop and
    // assert that the value is indeed 1.
    int npartitions = rbuf_int(rb);
    assert(npartitions == 1);

    // Set number of children to 1, since we will only have one
    // basement node.
    node->n_children = 1;
    XMALLOC_N(node->n_children, node->bp);
    // This is a malloc(0), but we need to do it in order to get a pointer
    // we can free() later.
    XMALLOC_N(node->n_children - 1, node->childkeys);
    node->totalchildkeylens = 0;

    // Create one basement node to contain all the leaf entries by
    // setting up the single partition and updating the bfe.
    update_bfe_using_brtnode(node, bfe);
    struct brtnode_fetch_extra temp_bfe;
    fill_bfe_for_full_read(&temp_bfe, bfe->h);
    setup_partitions_using_bfe(node, &temp_bfe, true);

    // 11. Deserialize the partition maps, though they are not used in the
    // newer versions of brt nodes.
    struct sub_block_map part_map[npartitions];
    for (int i = 0; i < npartitions; ++i) {
        sub_block_map_deserialize(&part_map[i], rb);
    }

    // Copy all of the leaf entries into the single basement node.

    // 12. The number of leaf entries in buffer.
    int n_in_buf = rbuf_int(rb);
    BLB_NBYTESINBUF(node,0) = 0;
    BLB_SEQINSERT(node,0) = 0;
    BASEMENTNODE bn = BLB(node, 0);

    // The current end of the buffer, read from disk and decompressed,
    // is the start of the leaf entries.
    u_int32_t start_of_data = rb->ndone;

    // 13. Read the leaf entries from the buffer, advancing the buffer
    // as we go.
    if (version <= BRT_LAYOUT_VERSION_13) {
        // Version 13 leafentries have a different format, so each one is
        // converted individually instead of being block-copied.
        // Create our mempool.
        toku_mempool_construct(&bn->buffer_mempool, 0);
        OMT omt = BLB_BUFFER(node, 0);
        struct mempool *mp = &BLB_BUFFER_MEMPOOL(node, 0);
        // Loop through the old leafentries, upgrading each in turn.
        for (int i = 0; i < n_in_buf; ++i) {
            LEAFENTRY_13 le = (LEAFENTRY_13)(&rb->buf[rb->ndone]);
            u_int32_t disksize = leafentry_disksize_13(le);
            rb->ndone += disksize;
            invariant(rb->ndone<=rb->size);
            LEAFENTRY new_le;
            size_t new_le_size;
            r = toku_le_upgrade_13_14(le,
                                      &new_le_size,
                                      &new_le,
                                      omt,
                                      mp);
            assert_zero(r);
            // Copy the pointer value straight into the OMT
            r = toku_omt_insert_at(omt, (OMTVALUE) new_le, i);
            assert_zero(r);
            bn->n_bytes_in_buffer += new_le_size;
        }
    } else {
        u_int32_t end_of_data;
        u_int32_t data_size;

        // Leaf Entry creation for version 14 and above:
        // Allocate space for our leaf entry pointers.
        OMTVALUE *XMALLOC_N(n_in_buf, array);

        // Iterate over leaf entries copying their addresses into our
        // temporary array.
        for (int i = 0; i < n_in_buf; ++i) {
            LEAFENTRY le = (LEAFENTRY)(&rb->buf[rb->ndone]);
            u_int32_t disksize = leafentry_disksize(le);
            rb->ndone += disksize;
            invariant(rb->ndone <= rb->size);
            array[i] = (OMTVALUE) le;
        }

        end_of_data = rb->ndone;
        data_size = end_of_data - start_of_data;

        // Now we must create the OMT and its associated mempool.

        // Allocate mempool in basement node and memcpy from start of
        // input/deserialized buffer.
        toku_mempool_copy_construct(&bn->buffer_mempool,
                                    &rb->buf[start_of_data],
                                    data_size);

        // Adjust the array of OMT values to point to the correct
        // position in the mempool.  The mempool should have all the
        // data at this point.
        for (int i = 0; i < n_in_buf; ++i) {
            int offset = (unsigned char *) array[i] - &rb->buf[start_of_data];
            unsigned char *mp_base = toku_mempool_get_base(&bn->buffer_mempool);
            array[i] = &mp_base[offset];
        }

        BLB_NBYTESINBUF(node, 0) = data_size;

        toku_omt_destroy(&BLB_BUFFER(node, 0));
        // Construct the omt.
        r = toku_omt_create_steal_sorted_array(&BLB_BUFFER(node, 0),
                                               &array,
                                               n_in_buf,
                                               n_in_buf);
        invariant_zero(r);
    }

    // Whatever this is must be less than the MSNs of every message above
    // it, so it's ok to take it here.
    bn->max_msn_applied = bfe->h->highest_unused_msn_for_upgrade;
    bn->stale_ancestor_messages_applied = false;
    node->max_msn_applied_to_node_on_disk = bn->max_msn_applied;

    // 14. Checksum (end to end) is only on version 14
    if (version >= BRT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
        u_int32_t expected_xsum = rbuf_int(rb);
        u_int32_t actual_xsum = x1764_memory(rb->buf, rb->size - 4);
        if (expected_xsum != actual_xsum) {
            fprintf(stderr, "%s:%d: Bad checksum: expected = %"PRIx32", actual= %"PRIx32"\n",
                    __FUNCTION__,
                    __LINE__,
                    expected_xsum,
                    actual_xsum);
            fflush(stderr);
            return toku_db_badformat();
        }
    }

    // We should have read the whole block by this point.
    if (rb->ndone != rb->size) {
        fprintf(stderr, "%s:%d: Bad node length: consumed %llu of %llu bytes\n",
                __FUNCTION__,
                __LINE__,
                (unsigned long long) rb->ndone,
                (unsigned long long) rb->size);
        fflush(stderr);
        return toku_db_badformat();
    }

    return r;
}
static int
read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
struct brt_header *h,
struct rbuf *rb,
/* out */ int *layout_version_p);
// This function upgrades an old brtnode (layout version 14 or earlier;
// the version 13 specific fields are skipped below) to the current
// version.  NOTE: This code assumes the first field of the rbuf has
// already been read from the buffer (namely the layout_version of the
// brtnode.)
//
// Arguments:
//  node:     the node to fill in from the legacy on-disk image.
//  ndd:      out: receives a freshly allocated array with one entry per
//            child, every entry zeroed.
//  blocknum: the block to read.
//  bfe:      fetch extra of the caller; supplies the header.
//  fd:       file descriptor the block is re-read from.
// Returns: 0 on success; otherwise the error from reading/decompressing
//  the block or from the leaf/internal upgrade routine.
static int
deserialize_and_upgrade_brtnode(BRTNODE node,
                                BRTNODE_DISK_DATA* ndd,
                                BLOCKNUM blocknum,
                                struct brtnode_fetch_extra* bfe,
                                int fd)
{
    int r = 0;
    int version;

    // I. First we need to de-compress the entire node, only then can
    // we read the different sub-sections.
    struct rbuf rb = { .buf = NULL, .size = 0, .ndone = 0 };
    r = read_and_decompress_block_from_fd_into_rbuf(fd,
                                                    blocknum,
                                                    bfe->h,
                                                    &rb,
                                                    &version);
    if (r != 0) {
        // Nothing usable was read, so do not parse (or free) the rbuf.
        // NOTE(review): assumes the callee releases any partially filled
        // buffer on failure -- confirm against its implementation.
        goto exit;
    }

    // Re-read the magic field from the previous call, since we are
    // restarting with a fresh rbuf.
    {
        bytevec magic;
        rbuf_literal_bytes(&rb, &magic, 8);
    }

    // II. Start reading brtnode fields out of the decompressed buffer.

    // Copy over old version info.
    node->layout_version_read_from_disk = rbuf_int(&rb);
    version = node->layout_version_read_from_disk;
    assert(version <= BRT_LAYOUT_VERSION_14);
    // Upgrade the current version number to the current version.
    node->layout_version = BRT_LAYOUT_VERSION;

    node->layout_version_original = rbuf_int(&rb);
    node->build_id = rbuf_int(&rb);

    // The remaining offsets into the rbuf do not map to the current
    // version, so we need to fill in the blanks and ignore older
    // fields.
    node->nodesize = rbuf_int(&rb);                   // 1. nodesize
    node->flags = rbuf_int(&rb);                      // 2. flags
    node->height = rbuf_int(&rb);                     // 3. height

    // If the version is less than 14, there are two extra ints here.
    // we would need to ignore them if they are there.
    if (version == BRT_LAYOUT_VERSION_13) {
        (void) rbuf_int(&rb);                         // 4. rand4
        (void) rbuf_int(&rb);                         // 5. local
    }

    // The next offsets are dependent on whether this is a leaf node
    // or not.

    // III. Read in Leaf and Internal Node specific data.

    // Check height to determine whether this is a leaf node or not.
    if (node->height > 0) {
        r = deserialize_and_upgrade_internal_node(node, &rb, bfe);
    } else {
        r = deserialize_and_upgrade_leaf_node(node, &rb, bfe);
    }

    *ndd = toku_xmalloc(node->n_children*sizeof(**ndd));
    // Initialize the partition locations to zero, because version 14
    // and below have no notion of partitions on disk.
    for (int i=0; i<node->n_children; i++) {
        BP_START(*ndd,i) = 0;
        BP_SIZE (*ndd,i) = 0;
    }

    toku_free(rb.buf);
exit:
    return r;
}
static int
deserialize_brtnode_from_rbuf(
BRTNODE *brtnode,
......@@ -1538,14 +2037,14 @@ deserialize_brtnode_from_rbuf(
BLOCKNUM blocknum,
u_int32_t fullhash,
struct brtnode_fetch_extra* bfe,
struct rbuf *rb
struct rbuf *rb,
int fd
)
// Effect: deserializes a brtnode that is in rb (with pointer of rb just past the magic) into a BRTNODE.
{
int r = 0;
BRTNODE node = toku_xmalloc(sizeof(*node));
struct sub_block sb_node_info;
// fill in values that are known and not stored in rb
node->fullhash = fullhash;
node->thisnodename = blocknum;
......@@ -1563,9 +2062,37 @@ deserialize_brtnode_from_rbuf(
}
node->layout_version_read_from_disk = rbuf_int(rb);
int version = node->layout_version_read_from_disk;
assert(version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
// Check if we are reading in an older node version.
if (version <= BRT_LAYOUT_VERSION_14) {
// Perform the upgrade.
r = deserialize_and_upgrade_brtnode(node, ndd, blocknum, bfe, fd);
if (r != 0) {
goto cleanup;
}
if (version <= BRT_LAYOUT_VERSION_13) {
// deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
node->flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
}
// If everything is ok, just re-assign the brtnode and retrn.
*brtnode = node;
r = 0;
goto cleanup;
} else if (version == BRT_LAYOUT_VERSION_18) {
// Upgrade version 18 to version 19. This upgrade is trivial,
// it removes the optimized for upgrade field, which has
// already been removed in the deserialization code (see
// deserialize_brtnode_info()).
version = BRT_LAYOUT_VERSION;
}
// TODO 4053
invariant(node->layout_version_read_from_disk == BRT_LAYOUT_VERSION);
node->layout_version = node->layout_version_read_from_disk;
invariant(version == BRT_LAYOUT_VERSION);
node->layout_version = version;
node->layout_version_original = rbuf_int(rb);
node->build_id = rbuf_int(rb);
node->n_children = rbuf_int(rb);
......@@ -1751,7 +2278,7 @@ int toku_deserialize_brtnode_from (int fd,
r = read_block_from_fd_into_rbuf(fd, blocknum, bfe->h, &rb);
if (r != 0) { goto cleanup; } // if we were successful, then we are done.
r = deserialize_brtnode_from_rbuf(brtnode, ndd, blocknum, fullhash, bfe, &rb);
r = deserialize_brtnode_from_rbuf(brtnode, ndd, blocknum, fullhash, bfe, &rb, fd);
if (r!=0) {
dump_bad_block(rb.buf,rb.size);
}
......@@ -1765,59 +2292,8 @@ cleanup:
return r;
}
// Effect: Possibly do some work to complete the version upgrade of a brt.
//  If someday a message must be injected to upgrade the brt, this is the
//  place to do it.  Whenever an upgrade is done, all nodes will be marked
//  as dirty, so it makes sense here to always inject an OPTIMIZE message.
//  (Note, if someday the version number is stored in the translation
//  instead of in each node, then the upgrade would not necessarily dirty
//  each node.)
// Returns: 0 on success.  On error the error code is returned and, if the
//  header was not already in a panic state, it is recorded as the header's
//  panic value together with a descriptive panic string.
int
toku_maybe_upgrade_brt(BRT t) {
    int r = 0;

    if (!t->h->upgrade_brt_performed) {   // upgrade may be necessary
        switch (t->h->layout_version_read_from_disk) {
        case BRT_LAYOUT_VERSION_13:
            // Trees read from a version 13 header get the optimize pass.
            r = toku_brt_optimize_for_upgrade(t);
            if (r == 0) {
                __sync_fetch_and_add(&UPGRADE_STATUS_VALUE(BRT_UPGRADE_OPTIMIZED_FOR_UPGRADE), 1);
                t->h->upgrade_brt_performed = TRUE;   // no further upgrade necessary
            }
            break;
        case BRT_LAYOUT_VERSION:
            // Already at the current version: just note that we are done.
            t->h->upgrade_brt_performed = TRUE;
            break;
        default:
            invariant(FALSE);
        }
    }

    if (r != 0 && t->h->panic == 0) {
        char *errstr = strerror(r);
        int buflen = 200 + strlen(errstr);
        char msg[buflen];
        t->h->panic = r;
        snprintf(msg, buflen-1, "While upgrading brt version, error %d (%s)", r, errstr);
        t->h->panic_string = toku_strdup(msg);
    }

    return r;
}
// ################
void
toku_verify_or_set_counts (BRTNODE node) {
node = node;
toku_verify_or_set_counts(BRTNODE node) {
if (node->height==0) {
for (int i=0; i<node->n_children; i++) {
lazy_assert(BLB_BUFFER(node, i));
......@@ -1841,11 +2317,14 @@ serialize_brt_header_min_size (u_int32_t version) {
switch(version) {
case BRT_LAYOUT_VERSION_19:
size += 1; // compression method
size += sizeof(uint64_t); // highest_unused_msn_for_upgrade
case BRT_LAYOUT_VERSION_18:
size += sizeof(uint64_t); // time_of_last_optimize_begin
size += sizeof(uint64_t); // time_of_last_optimize_end
size += sizeof(uint32_t); // count_of_optimize_in_progress
size += sizeof(MSN); // msn_at_start_of_last_completed_optimize
size -= 8; // removed num_blocks_to_upgrade_14
size -= 8; // removed num_blocks_to_upgrade_13
case BRT_LAYOUT_VERSION_17:
size += 16;
invariant(sizeof(STAT64INFO_S) == 16);
......@@ -1896,11 +2375,10 @@ int toku_serialize_brt_header_size (struct brt_header *h) {
int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h, DISKOFF translation_location_on_disk, DISKOFF translation_size_on_disk) {
unsigned int size = toku_serialize_brt_header_size (h); // !!! seems silly to recompute the size when the caller knew it. Do we really need the size?
wbuf_literal_bytes(wbuf, "tokudata", 8);
wbuf_network_int (wbuf, h->layout_version); //MUST be in network order regardless of disk order
wbuf_network_int (wbuf, BUILD_ID); //MUST be in network order regardless of disk order
wbuf_network_int (wbuf, size); //MUST be in network order regardless of disk order
wbuf_network_int (wbuf, wbuf->size); //MUST be in network order regardless of disk order
wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order
wbuf_ulonglong(wbuf, h->checkpoint_count);
wbuf_LSN (wbuf, h->checkpoint_lsn);
......@@ -1915,8 +2393,6 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h,
wbuf_int(wbuf, h->build_id_original);
wbuf_ulonglong(wbuf, h->time_of_creation);
wbuf_ulonglong(wbuf, h->time_of_last_modification);
wbuf_ulonglong(wbuf, h->num_blocks_to_upgrade_13);
wbuf_ulonglong(wbuf, h->num_blocks_to_upgrade_14);
wbuf_TXNID(wbuf, h->root_xid_that_created);
wbuf_int(wbuf, h->basementnodesize);
wbuf_ulonglong(wbuf, h->time_of_last_verification);
......@@ -1927,6 +2403,7 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h,
wbuf_int(wbuf, h->count_of_optimize_in_progress);
wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
wbuf_char(wbuf, (unsigned char) h->compression_method);
wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade);
u_int32_t checksum = x1764_finish(&wbuf->checksum);
wbuf_int(wbuf, checksum);
lazy_assert(wbuf->ndone == wbuf->size);
......@@ -2045,35 +2522,38 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF
static void
deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) {
if (layout_version == BRT_LAYOUT_VERSION_13) {
// in previous versions of TokuDB the Descriptor had a 4 byte version, which we must skip over
u_int32_t dummy_version __attribute__((__unused__)) = rbuf_int(rb);
if (layout_version <= BRT_LAYOUT_VERSION_13) {
// in older versions of TokuDB the Descriptor had a 4 byte
// version, which we skip over
(void) rbuf_int(rb);
}
u_int32_t size;
bytevec data;
bytevec data;
rbuf_bytes(rb, &data, &size);
bytevec data_copy = data;;
if (size>0) {
data_copy = toku_memdup(data, size); //Cannot keep the reference from rbuf. Must copy.
lazy_assert(data_copy);
}
else {
bytevec data_copy = data;
if (size > 0) {
data_copy = toku_memdup(data, size); //Cannot keep the reference from rbuf. Must copy.
lazy_assert(data_copy);
} else {
lazy_assert(size==0);
data_copy = NULL;
}
toku_fill_dbt(&desc->dbt, data_copy, size);
}
static void
static enum deserialize_error_code
deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_version) {
enum deserialize_error_code e;
DISKOFF offset;
DISKOFF size;
unsigned char *dbuf = NULL;
toku_get_descriptor_offset_size(bt, &offset, &size);
memset(desc, 0, sizeof(*desc));
if (size > 0) {
lazy_assert(size>=4); //4 for checksum
{
unsigned char *XMALLOC_N(size, dbuf);
XMALLOC_N(size, dbuf);
{
lock_for_pwrite();
ssize_t r = toku_os_pread(fd, dbuf, size, offset);
......@@ -2085,7 +2565,12 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
u_int32_t x1764 = x1764_memory(dbuf, size-4);
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4));
lazy_assert(x1764 == stored_x1764);
if (x1764 != stored_x1764) {
fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
e = DS_XSUM_FAIL;
toku_free(dbuf);
goto exit;
}
}
{
struct rbuf rb = {.buf = dbuf, .size = size, .ndone = 0};
......@@ -2096,36 +2581,149 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
toku_free(dbuf);
}
}
e = DS_OK;
exit:
return e;
}
static void
upgrade_subtree_estimates_to_stat64info(int UU(fd), struct brt_header *h)
{
int r;
// 15 was the last version with subtree estimates
invariant(h->layout_version_read_from_disk <= BRT_LAYOUT_VERSION_15);
BLOCKNUM b = h->root_blocknum;
struct rbuf rb_s;
struct rbuf *rb = &rb_s;
rbuf_init(rb, NULL, 0);
DISKOFF offset, size;
toku_translate_blocknum_to_offset_size(h->blocktable, b, &offset, &size);
{
u_int8_t *XMALLOC_N(size, raw_block);
{
ssize_t rlen = pread(fd, raw_block, size, offset);
lazy_assert((DISKOFF)rlen == size);
}
{
// root node must be a leaf or nonleaf node
u_int8_t *magic = raw_block + uncompressed_magic_offset;
invariant(memcmp(magic, "tokuleaf", 8) == 0 || memcmp(magic, "tokunode", 8) == 0);
// root node cannot have a different version from the header, if
// the header needs to read its subtree estimates
u_int8_t *version = raw_block + uncompressed_version_offset;
int layout_version = toku_dtoh32(*(uint32_t*)version);
invariant(layout_version == h->layout_version_read_from_disk);
}
{
int n_sub_blocks = toku_dtoh32(*(u_int32_t*)&raw_block[node_header_overhead]);
invariant(0 <= n_sub_blocks && n_sub_blocks <= max_sub_blocks);
{
u_int32_t header_length = node_header_overhead + sub_block_header_size(n_sub_blocks);
invariant(header_length <= size);
u_int32_t xsum = x1764_memory(raw_block, header_length);
u_int32_t stored_xsum = toku_dtoh32(*(u_int32_t *)(raw_block + header_length));
invariant(xsum == stored_xsum);
}
struct sub_block sub_block[n_sub_blocks];
u_int32_t *sub_block_header = (u_int32_t *) &raw_block[node_header_overhead + 4];
size_t uncompressed_size = 0;
for (int i = 0; i < n_sub_blocks; ++i) {
sub_block_init(&sub_block[i]);
int64_t csize = toku_dtoh32(sub_block_header[0]);
int64_t usize = toku_dtoh32(sub_block_header[1]);
invariant(0 <= csize && csize < (1<<30));
invariant(0 <= usize && usize < (1<<30));
sub_block[i].compressed_size = csize;
sub_block[i].uncompressed_size = usize;
sub_block[i].xsum = toku_dtoh32(sub_block_header[2]);
uncompressed_size += sub_block[i].uncompressed_size;
sub_block_header += 3;
}
unsigned char *buf = toku_xmalloc(node_header_overhead + uncompressed_size);
resource_assert(buf);
rbuf_init(rb, buf, node_header_overhead + uncompressed_size);
memcpy(rb->buf, raw_block, node_header_overhead);
unsigned char *compressed_data = raw_block + node_header_overhead + sub_block_header_size(n_sub_blocks) + sizeof(u_int32_t);
unsigned char *uncompressed_data = rb->buf + node_header_overhead;
r = decompress_all_sub_blocks(n_sub_blocks, sub_block, compressed_data, uncompressed_data, num_cores, brt_pool);
if (r != 0) {
fprintf(stderr, "%s:%d block %"PRId64" failed %d at %p size %zu\n", __FUNCTION__, __LINE__, b.b, r, raw_block, size);
dump_bad_block(raw_block, size);
}
lazy_assert_zero(r);
rb->ndone = 0;
}
toku_free(raw_block);
}
resource_assert(rb->buf);
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
int node_version = rbuf_int(rb);
invariant(node_version == h->layout_version_read_from_disk);
(void) rbuf_int(rb); // layout_version_original
(void) rbuf_int(rb); // build_id
(void) rbuf_int(rb); // nodesize
(void) rbuf_int(rb); // flags
int height = rbuf_int(rb);
if (node_version <= BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
(void) rbuf_int(rb); // rand4fingerprint
(void) rbuf_int(rb); // localfingerprint
(void) rbuf_int(rb); // another fingerprint (according to deserialize_brtnode_nonleaf_from_rbuf in 5.0.8)
}
h->on_disk_stats = ZEROSTATS;
if (height > 0) {
invariant(memcmp(magic, "tokunode", 8) == 0);
int n_children = rbuf_int(rb);
for (int i = 0; i < n_children; ++i) {
if (node_version <= BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT) {
(void) rbuf_int(rb); // child fingerprint
}
u_int64_t nkeys = rbuf_ulonglong(rb);
u_int64_t ndata = rbuf_ulonglong(rb);
invariant(nkeys == ndata);
h->on_disk_stats.numrows += nkeys;
h->on_disk_stats.numbytes += rbuf_ulonglong(rb);
(void) rbuf_char(rb); // exact
}
} else {
invariant(memcmp(magic, "tokuleaf", 8) == 0);
u_int64_t nkeys = rbuf_ulonglong(rb);
u_int64_t ndata = rbuf_ulonglong(rb);
invariant(nkeys == ndata);
h->on_disk_stats.numrows += nkeys;
h->on_disk_stats.numbytes += rbuf_ulonglong(rb);
}
// done, discard the rest
toku_free(rb->buf);
}
// We only deserialize brt header once and then share everything with all the brts.
static int
deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
static enum deserialize_error_code
deserialize_brtheader_versioned(int fd, struct rbuf *rb, struct brt_header **brth, uint32_t version)
{
enum deserialize_error_code e = DS_OK;
struct brt_header *h = NULL;
invariant(version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(version <= BRT_LAYOUT_VERSION);
// We already know:
// we have an rbuf representing the header.
// The checksum has been validated
//Steal rbuf (used to simplify merge, reduce diff size, and keep old code)
struct rbuf rc = *rb;
memset(rb, 0, sizeof(*rb));
//Verification of initial elements.
{
//Check magic number
bytevec magic;
rbuf_literal_bytes(&rc, &magic, 8);
lazy_assert(memcmp(magic,"tokudata",8)==0);
}
//Check magic number
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
lazy_assert(memcmp(magic,"tokudata",8)==0);
struct brt_header *CALLOC(h);
if (h==0) return errno;
int ret=-1;
if (0) { died1: toku_free(h); return ret; }
CALLOC(h);
if (!h) {
e = DS_ERRNO;
goto exit;
}
h->type = BRTHEADER_CURRENT;
h->checkpoint_header = NULL;
h->dirty=0;
h->dirty = 0;
h->panic = 0;
h->panic_string = 0;
toku_list_init(&h->live_brts);
......@@ -2133,181 +2731,175 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
toku_list_init(&h->checkpoint_before_commit_link);
//version MUST be in network order on disk regardless of disk order
h->layout_version = rbuf_network_int(&rc);
//TODO: #1924
invariant(h->layout_version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(h->layout_version <= BRT_LAYOUT_VERSION);
h->layout_version_read_from_disk = h->layout_version;
h->layout_version_read_from_disk = rbuf_network_int(rb);
invariant(h->layout_version_read_from_disk >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(h->layout_version_read_from_disk <= BRT_LAYOUT_VERSION);
h->layout_version = BRT_LAYOUT_VERSION;
//build_id MUST be in network order on disk regardless of disk order
h->build_id = rbuf_network_int(&rc);
h->build_id = rbuf_network_int(rb);
//Size MUST be in network order regardless of disk order.
u_int32_t size = rbuf_network_int(&rc);
lazy_assert(size==rc.size);
u_int32_t size = rbuf_network_int(rb);
lazy_assert(size == rb->size);
bytevec tmp_byte_order_check;
rbuf_literal_bytes(&rc, &tmp_byte_order_check, 8); //Must not translate byte order
lazy_assert((sizeof tmp_byte_order_check) >= 8);
rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check;
lazy_assert(byte_order_stored == toku_byte_order_host);
h->checkpoint_count = rbuf_ulonglong(&rc);
h->checkpoint_lsn = rbuf_lsn(&rc);
h->nodesize = rbuf_int(&rc);
DISKOFF translation_address_on_disk = rbuf_diskoff(&rc);
DISKOFF translation_size_on_disk = rbuf_diskoff(&rc);
lazy_assert(translation_address_on_disk>0);
lazy_assert(translation_size_on_disk>0);
h->checkpoint_count = rbuf_ulonglong(rb);
h->checkpoint_lsn = rbuf_lsn(rb);
h->nodesize = rbuf_int(rb);
DISKOFF translation_address_on_disk = rbuf_diskoff(rb);
DISKOFF translation_size_on_disk = rbuf_diskoff(rb);
lazy_assert(translation_address_on_disk > 0);
lazy_assert(translation_size_on_disk > 0);
// initialize the tree lock
toku_brtheader_init_treelock(h);
// printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk);
//Load translation table
{
lock_for_pwrite();
unsigned char *XMALLOC_N(translation_size_on_disk, tbuf);
{
// This cast is messed up in 32-bits if the block translation table is ever more than 4GB. But in that case, the translation table itself won't fit in main memory.
ssize_t r = toku_os_pread(fd, tbuf, translation_size_on_disk, translation_address_on_disk);
lazy_assert(r==translation_size_on_disk);
// This cast is messed up in 32-bits if the block translation
// table is ever more than 4GB. But in that case, the
// translation table itself won't fit in main memory.
ssize_t readsz = toku_os_pread(fd, tbuf, translation_size_on_disk,
translation_address_on_disk);
lazy_assert(readsz == translation_size_on_disk);
}
unlock_for_pwrite();
// Create table and read in data.
toku_blocktable_create_from_buffer(&h->blocktable,
translation_address_on_disk,
translation_size_on_disk,
tbuf);
e = toku_blocktable_create_from_buffer(&h->blocktable,
translation_address_on_disk,
translation_size_on_disk,
tbuf);
toku_free(tbuf);
if (e != DS_OK) {
goto exit;
}
}
h->root_blocknum = rbuf_blocknum(&rc);
h->flags = rbuf_int(&rc);
h->layout_version_original = rbuf_int(&rc);
h->build_id_original = rbuf_int(&rc);
h->time_of_creation = rbuf_ulonglong(&rc);
h->time_of_last_modification = rbuf_ulonglong(&rc);
h->root_blocknum = rbuf_blocknum(rb);
h->flags = rbuf_int(rb);
if (h->layout_version_read_from_disk <= BRT_LAYOUT_VERSION_13) {
// deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
h->flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
}
h->layout_version_original = rbuf_int(rb);
h->build_id_original = rbuf_int(rb);
h->time_of_creation = rbuf_ulonglong(rb);
h->time_of_last_modification = rbuf_ulonglong(rb);
h->time_of_last_verification = 0;
h->num_blocks_to_upgrade_13 = rbuf_ulonglong(&rc);
h->num_blocks_to_upgrade_14 = rbuf_ulonglong(&rc);
if (h->layout_version >= BRT_LAYOUT_VERSION_14) {
// at this layer, this new field is the only difference between versions 13 and 14
rbuf_TXNID(&rc, &h->root_xid_that_created);
}
if (h->layout_version >= BRT_LAYOUT_VERSION_15) {
h->basementnodesize = rbuf_int(&rc);
h->time_of_last_verification = rbuf_ulonglong(&rc);
}
if (h->layout_version >= BRT_LAYOUT_VERSION_18) {
h->on_disk_stats.numrows = rbuf_ulonglong(&rc);
h->on_disk_stats.numbytes = rbuf_ulonglong(&rc);
h->in_memory_stats = h->on_disk_stats;
h->time_of_last_optimize_begin = rbuf_ulonglong(&rc);
h->time_of_last_optimize_end = rbuf_ulonglong(&rc);
h->count_of_optimize_in_progress = rbuf_int(&rc);
h->count_of_optimize_in_progress_read_from_disk = h->count_of_optimize_in_progress;
h->msn_at_start_of_last_completed_optimize = rbuf_msn(&rc);
}
if (h->layout_version >= BRT_LAYOUT_VERSION_19) {
unsigned char method = rbuf_char(&rc);
if (h->layout_version_read_from_disk <= BRT_LAYOUT_VERSION_18) {
// 17 was the last version with these fields, we no longer store
// them, so read and discard them
(void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_13
if (h->layout_version_read_from_disk >= BRT_LAYOUT_VERSION_15) {
(void) rbuf_ulonglong(rb); // num_blocks_to_upgrade_14
}
}
if (h->layout_version_read_from_disk >= BRT_LAYOUT_VERSION_14) {
rbuf_TXNID(rb, &h->root_xid_that_created);
} else {
// fake creation during the last checkpoint
h->root_xid_that_created = h->checkpoint_lsn.lsn;
}
if (h->layout_version_read_from_disk >= BRT_LAYOUT_VERSION_15) {
h->basementnodesize = rbuf_int(rb);
h->time_of_last_verification = rbuf_ulonglong(rb);
} else {
h->basementnodesize = BRT_DEFAULT_BASEMENT_NODE_SIZE;
h->time_of_last_verification = 0;
}
if (h->layout_version_read_from_disk >= BRT_LAYOUT_VERSION_18) {
h->on_disk_stats.numrows = rbuf_ulonglong(rb);
h->on_disk_stats.numbytes = rbuf_ulonglong(rb);
h->in_memory_stats = h->on_disk_stats;
h->time_of_last_optimize_begin = rbuf_ulonglong(rb);
h->time_of_last_optimize_end = rbuf_ulonglong(rb);
h->count_of_optimize_in_progress = rbuf_int(rb);
h->count_of_optimize_in_progress_read_from_disk = h->count_of_optimize_in_progress;
h->msn_at_start_of_last_completed_optimize = rbuf_msn(rb);
} else {
upgrade_subtree_estimates_to_stat64info(fd, h);
h->time_of_last_optimize_begin = 0;
h->time_of_last_optimize_end = 0;
h->count_of_optimize_in_progress = 0;
h->count_of_optimize_in_progress_read_from_disk = 0;
h->msn_at_start_of_last_completed_optimize = ZERO_MSN;
}
if (h->layout_version_read_from_disk >= BRT_LAYOUT_VERSION_19) {
unsigned char method = rbuf_char(rb);
h->compression_method = (enum toku_compression_method) method;
h->highest_unused_msn_for_upgrade = rbuf_msn(rb);
} else {
// we hard coded zlib until 5.2, then quicklz in 5.2
if (h->layout_version < BRT_LAYOUT_VERSION_18) {
if (h->layout_version_read_from_disk < BRT_LAYOUT_VERSION_18) {
h->compression_method = TOKU_ZLIB_METHOD;
} else {
h->compression_method = TOKU_QUICKLZ_METHOD;
}
h->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
}
(void) rbuf_int(rb); //Read in checksum and ignore (already verified).
if (rb->ndone != rb->size) {
fprintf(stderr, "Header size did not match contents.\n");
errno = EINVAL;
e = DS_ERRNO;
goto exit;
}
invariant(h);
invariant((uint32_t) h->layout_version_read_from_disk == version);
e = deserialize_descriptor_from(fd, h->blocktable, &h->descriptor, version);
if (e != DS_OK) {
goto exit;
}
// copy descriptor to cmp_descriptor for #4541
h->cmp_descriptor.dbt.size = h->descriptor.dbt.size;
h->cmp_descriptor.dbt.data = toku_xmemdup(h->descriptor.dbt.data, h->descriptor.dbt.size);
// Version 13 descriptors had an extra 4 bytes that we don't read
// anymore. Since the header is going to think it's the current
// version if it gets written out, we need to write the descriptor in
// the new format (without those bytes) before that happens.
int r = toku_update_descriptor(h, &h->cmp_descriptor, fd);
if (r != 0) {
errno = r;
e = DS_ERRNO;
goto exit;
}
(void)rbuf_int(&rc); //Read in checksum and ignore (already verified).
if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;}
toku_free(rc.buf);
rc.buf = NULL;
*brth = h;
return 0;
}
// Write descriptor `d` of header `h` to file `fd`.
//
// The block table is asked to reallocate the on-disk descriptor area at
// the serialized descriptor size plus 4 bytes for the checksum, and the
// descriptor contents are then serialized at the offset it hands back.
//
// Returns the result of toku_serialize_descriptor_contents_to_fd().
// NOTE(review): the "_unlocked" suffix suggests the caller is expected to
// hold the relevant lock already; confirm against the callers.
static int
write_descriptor_to_disk_unlocked(struct brt_header * h, DESCRIPTOR d, int fd) {
    DISKOFF offset;
    toku_realloc_descriptor_on_disk_unlocked(h->blocktable,
                                             toku_serialize_descriptor_size(d)+4, // 4 for checksum
                                             &offset,
                                             h);
    return toku_serialize_descriptor_contents_to_fd(fd, d, offset);
}
//TODO: When version 15 exists, add case for version 14 that looks like today's version 13 case,
static int
deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **brth, u_int32_t version) {
int rval;
int upgrade = 0;
struct brt_header *h = NULL;
rval = deserialize_brtheader (fd, rb, &h); //deserialize from rbuf and fd into header
if (rval == 0) {
invariant(h);
invariant((uint32_t) h->layout_version == version);
deserialize_descriptor_from(fd, h->blocktable, &(h->descriptor), version);
h->cmp_descriptor.dbt.size = h->descriptor.dbt.size;
h->cmp_descriptor.dbt.data = toku_xmemdup(h->descriptor.dbt.data, h->descriptor.dbt.size);
switch (version) {
case BRT_LAYOUT_VERSION_13:
invariant(h->layout_version == BRT_LAYOUT_VERSION_13);
{
//Upgrade root_xid_that_created
//Fake creation during the last checkpoint.
h->root_xid_that_created = h->checkpoint_lsn.lsn;
}
{
//Deprecate 'TOKU_DB_VALCMP_BUILTIN'. Just remove the flag
h->flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
}
h->layout_version++;
__sync_fetch_and_add(&UPGRADE_STATUS_VALUE(BRT_UPGRADE_HEADER_13), 1); // how many header nodes upgraded from v13
upgrade++;
//Fall through on purpose
case BRT_LAYOUT_VERSION_14:
h->basementnodesize = 128*1024; // basement nodes added in v15
//fall through on purpose
case BRT_LAYOUT_VERSION_19:
case BRT_LAYOUT_VERSION_18:
case BRT_LAYOUT_VERSION_17: // version 17 never released to customers
case BRT_LAYOUT_VERSION_16: // version 16 never released to customers
case BRT_LAYOUT_VERSION_15: // this will not properly support version 15, we'll fix that on upgrade.
invariant(h->layout_version == BRT_LAYOUT_VERSION);
h->upgrade_brt_performed = FALSE;
if (upgrade) {
toku_brtheader_lock(h);
h->num_blocks_to_upgrade_13 = toku_block_get_blocks_in_use_unlocked(h->blocktable); //Total number of blocks
if (version == BRT_LAYOUT_VERSION_13) {
// write upgraded descriptor to disk if descriptor upgraded from version 13
rval = write_descriptor_to_disk_unlocked(h, &(h->descriptor), fd);
}
h->dirty = 1;
toku_brtheader_unlock(h);
}
*brth = h;
break; // this is the only break
default:
invariant(FALSE);
}
exit:
if (e != DS_OK && h != NULL) {
toku_free(h);
h = NULL;
}
return rval;
*brth = h;
return e;
}
// Simply reading the raw bytes of the header into an rbuf is insensitive to disk format version.
// If that ever changes, then modify this.
//TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the file AND the header is useless
// Simply reading the raw bytes of the header into an rbuf is insensitive
// to disk format version. If that ever changes, then modify this.
//
// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
// file AND the header is useless
static int
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, struct rbuf *rb,
u_int64_t *checkpoint_count, LSN *checkpoint_lsn, u_int32_t * version_p) {
deserialize_brtheader_from_fd_into_rbuf(int fd,
toku_off_t offset_of_header,
struct rbuf *rb,
u_int64_t *checkpoint_count,
LSN *checkpoint_lsn,
u_int32_t * version_p,
enum deserialize_error_code *e)
{
int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
......@@ -2316,96 +2908,120 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, str
unsigned char prefix[prefix_size];
rb->buf = NULL;
int64_t n = toku_os_pread(fd, prefix, prefix_size, offset_of_header);
if (n==0) r = TOKUDB_DICTIONARY_NO_HEADER;
else if (n<0) {r = errno; lazy_assert(r!=0);}
else if (n!=prefix_size) r = EINVAL;
else {
rb->size = prefix_size;
rb->ndone = 0;
rb->buf = prefix;
{
//Check magic number
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
if (memcmp(magic,"tokudata",8)!=0) {
if ((*(u_int64_t*)magic) == 0) r = TOKUDB_DICTIONARY_NO_HEADER;
else r = EINVAL; //Not a tokudb file! Do not use.
}
}
u_int32_t version = 0;
if (r==0) {
//Version MUST be in network order regardless of disk order.
version = rbuf_network_int(rb);
*version_p = version;
if (version < BRT_LAYOUT_MIN_SUPPORTED_VERSION) r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
if (version > BRT_LAYOUT_VERSION) r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
//build_id MUST be in network order regardless of disk order.
u_int32_t build_id __attribute__((__unused__)) = rbuf_network_int(rb);
}
u_int32_t size;
if (r==0) {
const int64_t max_header_size = BLOCK_ALLOCATOR_HEADER_RESERVE;
int64_t min_header_size = serialize_brt_header_min_size(version);
//Size MUST be in network order regardless of disk order.
size = rbuf_network_int(rb);
//If too big, it is corrupt. We would probably notice during checksum
//but may have to do a multi-gigabyte malloc+read to find out.
//If its too small reading rbuf would crash, so verify.
if (size > max_header_size || size < min_header_size) r = TOKUDB_DICTIONARY_NO_HEADER;
}
if (r!=0) {
rb->buf = NULL; //Prevent freeing of 'prefix'
}
if (r==0) {
lazy_assert(rb->ndone==prefix_size);
rb->size = size;
rb->buf = toku_xmalloc(rb->size);
}
if (r==0) {
n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header);
if (n==-1) {
r = errno;
lazy_assert(r!=0);
}
else if (n!=(int64_t)rb->size) r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
}
//It's version 10 or later. Magic looks OK.
//We have an rbuf that represents the header.
//Size is within acceptable bounds.
if (r==0) {
//Verify checksum (BRT_LAYOUT_VERSION_13 or later, when checksum function changed)
u_int32_t calculated_x1764 = x1764_memory(rb->buf, rb->size-4);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
if (calculated_x1764!=stored_x1764) r = TOKUDB_DICTIONARY_NO_HEADER; //Header useless
if (n != prefix_size) {
if (n==0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else if (n<0) {
r = errno;
lazy_assert(r!=0);
} else {
r = EINVAL;
}
if (r==0) {
//Verify byte order
bytevec tmp_byte_order_check;
rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check;
if (byte_order_stored != toku_byte_order_host) r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
goto exit;
}
rbuf_init(rb, prefix, prefix_size);
//Check magic number
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
if (memcmp(magic,"tokudata",8)!=0) {
if ((*(u_int64_t*)magic) == 0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = EINVAL; //Not a tokudb file! Do not use.
}
if (r==0) {
//Load checkpoint count
*checkpoint_count = rbuf_ulonglong(rb);
*checkpoint_lsn = rbuf_lsn(rb);
//Restart at beginning during regular deserialization
rb->ndone = 0;
goto exit;
}
//Version MUST be in network order regardless of disk order.
u_int32_t version = rbuf_network_int(rb);
*version_p = version;
if (version < BRT_LAYOUT_MIN_SUPPORTED_VERSION) {
r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
goto exit;
} else if (version > BRT_LAYOUT_VERSION) {
r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
goto exit;
}
//build_id MUST be in network order regardless of disk order.
u_int32_t build_id __attribute__((__unused__)) = rbuf_network_int(rb);
const int64_t max_header_size = BLOCK_ALLOCATOR_HEADER_RESERVE;
int64_t min_header_size = serialize_brt_header_min_size(version);
//Size MUST be in network order regardless of disk order.
u_int32_t size = rbuf_network_int(rb);
//If too big, it is corrupt. We would probably notice during checksum
//but may have to do a multi-gigabyte malloc+read to find out.
//If it's too small, reading rbuf would crash, so verify.
if (size > max_header_size || size < min_header_size) {
r = TOKUDB_DICTIONARY_NO_HEADER;
goto exit;
}
lazy_assert(rb->ndone==prefix_size);
rb->size = size;
rb->buf = toku_xmalloc(rb->size);
n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header);
if (n != rb->size) {
if (n < 0) {
r = errno;
lazy_assert(r!=0);
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
}
goto exit;
}
//It's version 14 or later. Magic looks OK.
//We have an rbuf that represents the header.
//Size is within acceptable bounds.
//Verify checksum (BRT_LAYOUT_VERSION_13 or later, when checksum function changed)
u_int32_t calculated_x1764 = x1764_memory(rb->buf, rb->size-4);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
if (calculated_x1764 != stored_x1764) {
r = TOKUDB_DICTIONARY_NO_HEADER; //Header useless
fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764);
*e = DS_XSUM_FAIL;
goto exit;
}
if (r!=0 && rb->buf) {
toku_free(rb->buf);
//Verify byte order
bytevec tmp_byte_order_check;
lazy_assert((sizeof toku_byte_order_host) == 8);
rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
int64_t byte_order_stored = *(int64_t*)tmp_byte_order_check;
if (byte_order_stored != toku_byte_order_host) {
r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
goto exit;
}
//Load checkpoint count
*checkpoint_count = rbuf_ulonglong(rb);
*checkpoint_lsn = rbuf_lsn(rb);
//Restart at beginning during regular deserialization
rb->ndone = 0;
exit:
if (r != 0 && rb->buf != NULL) {
if (rb->buf != prefix) { // don't free prefix, it's stack alloc'd
toku_free(rb->buf);
}
rb->buf = NULL;
}
return r;
}
// Read brtheader from file into struct. Read both headers and use one.
// We want the latest acceptable header whose checkpoint_lsn is no later
// than max_acceptable_lsn.
int
toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_header **brth) {
enum deserialize_error_code
toku_deserialize_brtheader_from(int fd,
LSN max_acceptable_lsn,
struct brt_header **brth)
{
struct rbuf rb_0;
struct rbuf rb_1;
u_int64_t checkpoint_count_0;
......@@ -2417,73 +3033,93 @@ toku_deserialize_brtheader_from (int fd, LSN max_acceptable_lsn, struct brt_head
BOOL h1_acceptable = FALSE;
struct rbuf *rb = NULL;
int r0, r1, r;
enum deserialize_error_code e0, e1, e;
{
toku_off_t header_0_off = 0;
r0 = deserialize_brtheader_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0);
if ( (r0==0) && (checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) )
h0_acceptable = TRUE;
}
{
toku_off_t header_1_off = BLOCK_ALLOCATOR_HEADER_RESERVE;
r1 = deserialize_brtheader_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1);
if ( (r1==0) && (checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) )
h1_acceptable = TRUE;
toku_off_t header_0_off = 0;
e0 = DS_OK;
r0 = deserialize_brtheader_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0, &e0);
if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
h0_acceptable = TRUE;
}
// if either header is too new, the dictionary is unreadable
if (r0!=TOKUDB_DICTIONARY_TOO_NEW && r1!=TOKUDB_DICTIONARY_TOO_NEW) {
if (h0_acceptable && h1_acceptable) {
if (checkpoint_count_0 > checkpoint_count_1) {
invariant(checkpoint_count_0 == checkpoint_count_1 + 1);
invariant(version_0 >= version_1);
rb = &rb_0;
version = version_0;
r = 0;
}
else {
invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
invariant(version_1 >= version_0);
rb = &rb_1;
version = version_1;
r = 0;
}
}
else if (h0_acceptable) {
rb = &rb_0;
version = version_0;
r = 0;
}
else if (h1_acceptable) {
rb = &rb_1;
version = version_1;
r = 0;
}
toku_off_t header_1_off = BLOCK_ALLOCATOR_HEADER_RESERVE;
e1 = DS_OK;
r1 = deserialize_brtheader_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1, &e1);
if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
h1_acceptable = TRUE;
}
if (rb==NULL) {
// We were unable to read either header or at least one is too new.
// Certain errors are higher priority than others. Order of these if/else if is important.
if (r0==TOKUDB_DICTIONARY_TOO_NEW || r1==TOKUDB_DICTIONARY_TOO_NEW)
// if either header is too new, the dictionary is unreadable
if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW ||
!(h0_acceptable || h1_acceptable)) {
// We were unable to read either header or at least one is too
// new. Certain errors are higher priority than others. Order of
// these if/else if is important.
if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) {
r = TOKUDB_DICTIONARY_TOO_NEW;
else if (r0==TOKUDB_DICTIONARY_TOO_OLD || r1==TOKUDB_DICTIONARY_TOO_OLD) {
} else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) {
r = TOKUDB_DICTIONARY_TOO_OLD;
}
else if (r0==TOKUDB_DICTIONARY_NO_HEADER || r1==TOKUDB_DICTIONARY_NO_HEADER) {
} else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else {
r = r0 ? r0 : r1; //Arbitrarily report the error from the
//first header, unless it's readable
}
else r = r0 ? r0 : r1; //Arbitrarily report the error from the first header, unless it's readable
// it should not be possible for both headers to be later than the max_acceptable_lsn
invariant(!( (r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn) ));
// it should not be possible for both headers to be later than the max_acceptable_lsn
invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
(r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
invariant(r!=0);
if (e0 == DS_XSUM_FAIL && e1 == DS_XSUM_FAIL) {
fprintf(stderr, "Both header checksums failed.\n");
e = DS_XSUM_FAIL;
} else {
errno = r;
e = DS_ERRNO;
}
goto exit;
}
if (r==0) r = deserialize_brtheader_versioned(fd, rb, brth, version);
if (rb_0.buf) toku_free(rb_0.buf);
if (rb_1.buf) toku_free(rb_1.buf);
return r;
if (h0_acceptable && h1_acceptable) {
if (checkpoint_count_0 > checkpoint_count_1) {
invariant(checkpoint_count_0 == checkpoint_count_1 + 1);
invariant(version_0 >= version_1);
rb = &rb_0;
version = version_0;
}
else {
invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
invariant(version_1 >= version_0);
rb = &rb_1;
version = version_1;
}
} else if (h0_acceptable) {
if (e1 == DS_XSUM_FAIL) {
// print something reassuring
fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n");
}
rb = &rb_0;
version = version_0;
} else if (h1_acceptable) {
if (e0 == DS_XSUM_FAIL) {
// print something reassuring
fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n");
}
rb = &rb_1;
version = version_1;
}
invariant(rb);
e = deserialize_brtheader_versioned(fd, rb, brth, version);
exit:
if (rb_0.buf) {
toku_free(rb_0.buf);
}
if (rb_1.buf) {
toku_free(rb_1.buf);
}
return e;
}
unsigned int
......
......@@ -10,7 +10,7 @@
// dummymsn needed to simulate msn because messages are injected at a lower level than toku_brt_root_put_cmd()
#define MIN_DUMMYMSN ((MSN) {(uint64_t)100000000000})
#define MIN_DUMMYMSN ((MSN) {(uint64_t)1 << 62})
static MSN dummymsn;
static int testsetup_initialized = 0;
......
......@@ -696,7 +696,6 @@ void toku_brtnode_clone_callback(
cloned_node->height = node->height;
cloned_node->dirty = node->dirty;
cloned_node->fullhash = node->fullhash;
cloned_node->optimized_for_upgrade = node->optimized_for_upgrade;
cloned_node->n_children = node->n_children;
cloned_node->totalchildkeylens = node->totalchildkeylens;
......@@ -721,6 +720,7 @@ void toku_brtnode_clone_callback(
// clear dirty bit
node->dirty = 0;
cloned_node->dirty = 0;
node->layout_version_read_from_disk = BRT_LAYOUT_VERSION;
// set new pair attr if necessary
if (node->height == 0) {
*new_attr = make_brtnode_pair_attr(node);
......@@ -763,6 +763,7 @@ void toku_brtnode_flush_callback (
toku_cachefile_get_workqueue_load(cachefile, &n_workitems, &n_threads);
int r = toku_serialize_brtnode_to(fd, brtnode->thisnodename, brtnode, ndd, !is_clone, h, n_workitems, n_threads, for_checkpoint);
assert_zero(r);
brtnode->layout_version_read_from_disk = BRT_LAYOUT_VERSION;
}
brt_status_update_flush_reason(brtnode, for_checkpoint);
}
......@@ -818,7 +819,8 @@ void toku_brtnode_pe_est_callback(
assert(brtnode_pv != NULL);
long bytes_to_free = 0;
BRTNODE node = (BRTNODE)brtnode_pv;
if (node->dirty || node->height == 0) {
if (node->dirty || node->height == 0 ||
node->layout_version_read_from_disk < BRT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
*bytes_freed_estimate = 0;
*cost = PE_CHEAP;
goto exit;
......@@ -876,10 +878,15 @@ compress_internal_node_partition(BRTNODE node, int i)
// callback for partially evicting a node
int toku_brtnode_pe_callback (void *brtnode_pv, PAIR_ATTR UU(old_attr), PAIR_ATTR* new_attr, void* UU(extraargs)) {
BRTNODE node = (BRTNODE)brtnode_pv;
//
// Don't partially evict dirty nodes
if (node->dirty) {
goto exit;
}
// Don't partially evict nodes whose partitions can't be read back
// from disk individually
if (node->layout_version_read_from_disk < BRT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
goto exit;
}
//
// partial eviction for nonleaf nodes
//
......@@ -1404,7 +1411,6 @@ toku_initialize_empty_brtnode (BRTNODE n, BLOCKNUM nodename, int height, int num
n->layout_version_original = layout_version;
n->layout_version_read_from_disk = layout_version;
n->height = height;
n->optimized_for_upgrade = 0;
n->totalchildkeylens = 0;
n->childkeys = 0;
n->bp = 0;
......@@ -1623,7 +1629,6 @@ struct setval_extra_s {
LEAFENTRY le;
OMT snapshot_txnids;
OMT live_list_reverse;
bool made_change;
uint64_t * workdone; // set by brt_leaf_apply_cmd_once()
};
......@@ -1659,7 +1664,6 @@ static void setval_fun (const DBT *new_val, void *svextra_v) {
svextra->workdone);
svextra->setval_r = 0;
}
svextra->made_change = TRUE;
}
// We are already past the msn filter (in brt_leaf_put_cmd(), which calls do_update()),
......@@ -1667,7 +1671,7 @@ static void setval_fun (const DBT *new_val, void *svextra_v) {
// would be to put a dummy msn in the messages created by setval_fun(), but preserving
// the original msn seems cleaner and it preserves accountability at a lower layer.
static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BRTNODE leafnode, BASEMENTNODE bn, BRT_MSG cmd, int idx,
LEAFENTRY le, OMT snapshot_txnids, OMT live_list_reverse, bool* made_change,
LEAFENTRY le, OMT snapshot_txnids, OMT live_list_reverse,
uint64_t * workdone) {
LEAFENTRY le_for_update;
DBT key;
......@@ -1710,7 +1714,7 @@ static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BRTNODE leafno
}
struct setval_extra_s setval_extra = {setval_tag, FALSE, 0, leafnode, bn, cmd->msn, cmd->xids,
keyp, idx, le_for_update, snapshot_txnids, live_list_reverse, 0, workdone};
keyp, idx, le_for_update, snapshot_txnids, live_list_reverse, workdone};
// call handlerton's brt->update_fun(), which passes setval_extra to setval_fun()
FAKE_DB(db, desc);
int r = update_fun(
......@@ -1721,10 +1725,6 @@ static int do_update(brt_update_func update_fun, DESCRIPTOR desc, BRTNODE leafno
setval_fun, &setval_extra
);
*made_change = setval_extra.made_change;
// TODO(leif): ensure that really bad return codes actually cause a
// crash higher up the stack somewhere
if (r == 0) { r = setval_extra.setval_r; }
return r;
}
......@@ -1738,7 +1738,6 @@ brt_leaf_put_cmd (
BRTNODE leafnode, // bn is within leafnode
BASEMENTNODE bn,
BRT_MSG cmd,
bool* made_change,
uint64_t *workdone,
OMT snapshot_txnids,
OMT live_list_reverse
......@@ -1754,7 +1753,6 @@ brt_leaf_put_cmd (
u_int32_t omt_size;
int r;
struct cmd_leafval_heaviside_extra be = {compare_fun, desc, cmd->u.id.key};
*made_change = 0;
unsigned int doing_seqinsert = bn->seqinsert;
bn->seqinsert = 0;
......@@ -1763,7 +1761,6 @@ brt_leaf_put_cmd (
case BRT_INSERT_NO_OVERWRITE:
case BRT_INSERT: {
u_int32_t idx;
*made_change = 1;
if (doing_seqinsert) {
idx = toku_omt_size(bn->buffer);
r = toku_omt_fetch(bn->buffer, idx-1, &storeddatav);
......@@ -1816,7 +1813,6 @@ brt_leaf_put_cmd (
u_int32_t num_leafentries_before = toku_omt_size(bn->buffer);
brt_leaf_apply_cmd_once(leafnode, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
*made_change = 1;
{
// Now we must find the next leafentry.
......@@ -1851,9 +1847,6 @@ brt_leaf_put_cmd (
break;
}
case BRT_OPTIMIZE_FOR_UPGRADE:
*made_change = 1;
// TODO 4053: Record version of software that sent the optimize_for_upgrade message, but that goes in the
// node's optimize_for_upgrade field, not in the basement.
// fall through so that optimize_for_upgrade performs rest of the optimize logic
case BRT_COMMIT_BROADCAST_ALL:
case BRT_OPTIMIZE:
......@@ -1872,7 +1865,6 @@ brt_leaf_put_cmd (
//Item was deleted.
deleted = 1;
}
*made_change = 1;
}
if (deleted)
omt_size--;
......@@ -1899,7 +1891,6 @@ brt_leaf_put_cmd (
//Item was deleted.
deleted = 1;
}
*made_change = 1;
}
if (deleted)
omt_size--;
......@@ -1914,10 +1905,10 @@ brt_leaf_put_cmd (
r = toku_omt_find_zero(bn->buffer, toku_cmd_leafval_heaviside, &be,
&storeddatav, &idx);
if (r==DB_NOTFOUND) {
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, NULL, snapshot_txnids, live_list_reverse, made_change, workdone);
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, NULL, snapshot_txnids, live_list_reverse, workdone);
} else if (r==0) {
storeddata=storeddatav;
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone);
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
} // otherwise, a worse error, just return it
break;
}
......@@ -1929,7 +1920,7 @@ brt_leaf_put_cmd (
r = toku_omt_fetch(bn->buffer, idx, &storeddatav);
assert(r==0);
storeddata=storeddatav;
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, made_change, workdone);
r = do_update(update_fun, desc, leafnode, bn, cmd, idx, storeddata, snapshot_txnids, live_list_reverse, workdone);
// TODO(leif): This early return means get_leaf_reactivity()
// and VERIFY_NODE() never get called. Is this a problem?
assert(r==0);
......@@ -2481,7 +2472,6 @@ brtnode_put_cmd (
// and instead defer to these functions
//
if (node->height==0) {
bool made_change = false;
uint64_t workdone = 0;
toku_apply_cmd_to_leaf(
compare_fun,
......@@ -2489,7 +2479,6 @@ brtnode_put_cmd (
desc,
node,
cmd,
&made_change,
&workdone,
snapshot_txnids,
live_list_reverse
......@@ -2512,7 +2501,6 @@ void toku_apply_cmd_to_leaf(
DESCRIPTOR desc,
BRTNODE node,
BRT_MSG cmd,
bool *made_change,
uint64_t *workdone,
OMT snapshot_txnids,
OMT live_list_reverse
......@@ -2556,7 +2544,6 @@ void toku_apply_cmd_to_leaf(
node,
BLB(node, childnum),
cmd,
made_change,
workdone,
snapshot_txnids,
live_list_reverse);
......@@ -2565,7 +2552,6 @@ void toku_apply_cmd_to_leaf(
}
}
else if (brt_msg_applies_all(cmd)) {
bool bn_made_change = false;
for (int childnum=0; childnum<node->n_children; childnum++) {
if (cmd->msn.msn > BLB(node, childnum)->max_msn_applied.msn) {
BLB(node, childnum)->max_msn_applied = cmd->msn;
......@@ -2575,11 +2561,9 @@ void toku_apply_cmd_to_leaf(
node,
BLB(node, childnum),
cmd,
&bn_made_change,
workdone,
snapshot_txnids,
live_list_reverse);
if (bn_made_change) *made_change = 1;
} else {
STATUS_VALUE(BRT_MSN_DISCARDS)++;
}
......@@ -2775,30 +2759,13 @@ toku_brt_hot_index_recovery(TOKUTXN txn, FILENUMS filenums, int do_fsync, int do
return r;
}
static int brt_optimize (BRT brt, BOOL upgrade);
// Effect: Optimize the brt.
int
toku_brt_optimize (BRT brt) {
int r = brt_optimize(brt, FALSE);
return r;
}
int
toku_brt_optimize_for_upgrade (BRT brt) {
int r = brt_optimize(brt, TRUE);
return r;
}
static int
brt_optimize (BRT brt, BOOL upgrade) {
int r = 0;
TXNID oldest = TXNID_NONE_LIVING;
if (!upgrade) {
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
oldest = toku_logger_get_oldest_living_xid(logger, NULL);
}
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
TXNID oldest = toku_logger_get_oldest_living_xid(logger, NULL);
XIDS root_xids = xids_get_root_xids();
XIDS message_xids;
......@@ -2814,16 +2781,8 @@ brt_optimize (BRT brt, BOOL upgrade) {
DBT val;
toku_init_dbt(&key);
toku_init_dbt(&val);
if (upgrade) {
// maybe there's a better place than the val dbt to put the version, but it seems harmless and is convenient
toku_fill_dbt(&val, &this_version, sizeof(this_version));
BRT_MSG_S brtcmd = { BRT_OPTIMIZE_FOR_UPGRADE, ZERO_MSN, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
else {
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, ZERO_MSN, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, ZERO_MSN, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
xids_destroy(&message_xids);
return r;
}
......@@ -3305,14 +3264,13 @@ brt_init_header_partial (BRT t, TOKUTXN txn) {
t->h->cf = t->cf;
t->h->nodesize=t->nodesize;
t->h->basementnodesize=t->basementnodesize;
t->h->num_blocks_to_upgrade_13 = 0;
t->h->num_blocks_to_upgrade_14 = 0;
t->h->root_xid_that_created = txn ? txn->ancestor_txnid64 : TXNID_NONE;
t->h->compare_fun = t->compare_fun;
t->h->update_fun = t->update_fun;
t->h->in_memory_stats = ZEROSTATS;
t->h->on_disk_stats = ZEROSTATS;
t->h->checkpoint_staging_stats = ZEROSTATS;
t->h->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
BLOCKNUM root = t->h->root_blocknum;
if ((r=setup_initial_brt_root_node(t, root))!=0) { return r; }
......@@ -3419,7 +3377,17 @@ int toku_read_brt_header_and_store_in_cachefile (BRT brt, CACHEFILE cf, LSN max_
int r;
{
int fd = toku_cachefile_get_and_pin_fd (cf);
r = toku_deserialize_brtheader_from(fd, max_acceptable_lsn, &h);
enum deserialize_error_code e = toku_deserialize_brtheader_from(fd, max_acceptable_lsn, &h);
if (e == DS_XSUM_FAIL) {
fprintf(stderr, "Checksum failure while reading header in file %s.\n", toku_cachefile_fname_in_env(cf));
assert(false); // make absolutely sure we crash before doing anything else
} else if (e == DS_ERRNO) {
r = errno;
} else if (e == DS_OK) {
r = 0;
} else {
assert(false);
}
toku_cachefile_unpin_fd(cf);
}
if (r!=0) return r;
......@@ -3700,9 +3668,6 @@ brt_open(BRT t, const char *fname_in_env, int is_create, int only_create, CACHET
assert(t->h->dict_id.dictid != DICTIONARY_ID_NONE.dictid);
assert(t->h->dict_id.dictid < dict_id_serial);
r = toku_maybe_upgrade_brt(t); // possibly do some work to complete the version upgrade of brt
if (r!=0) goto died_after_read_and_pin;
// brtheader_note_brt_open must be after all functions that can fail.
r = brtheader_note_brt_open(t);
if (r!=0) goto died_after_read_and_pin;
......@@ -4797,8 +4762,7 @@ do_brt_leaf_put_cmd(BRT t, BRTNODE leafnode, BASEMENTNODE bn, BRTNODE ancestor,
toku_fill_dbt(&hk, key, keylen);
DBT hv;
BRT_MSG_S brtcmd = { type, msn, xids, .u.id = { &hk, toku_fill_dbt(&hv, val, vallen) } };
bool made_change;
brt_leaf_put_cmd(t->compare_fun, t->update_fun, &t->h->cmp_descriptor, leafnode, bn, &brtcmd, &made_change, &BP_WORKDONE(ancestor, childnum), NULL, NULL); // pass NULL omts (snapshot_txnids and live_list_reverse) to prevent GC from running on message application for a query
brt_leaf_put_cmd(t->compare_fun, t->update_fun, &t->h->cmp_descriptor, leafnode, bn, &brtcmd, &BP_WORKDONE(ancestor, childnum), NULL, NULL); // pass NULL omts (snapshot_txnids and live_list_reverse) to prevent GC from running on message application for a query
} else {
STATUS_VALUE(BRT_MSN_DISCARDS)++;
}
......@@ -6857,6 +6821,7 @@ toku_brt_header_init(struct brt_header *h,
h->flags = 0;
h->root_xid_that_created = root_xid_that_created;
h->compression_method = compression_method;
h->highest_unused_msn_for_upgrade.msn = MIN_MSN.msn - 1;
}
#include <valgrind/helgrind.h>
......
......@@ -115,8 +115,6 @@ int toku_brt_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn) __attribute__ ((warn
int toku_brt_optimize (BRT brt) __attribute__ ((warn_unused_result));
int toku_brt_optimize_for_upgrade (BRT brt) __attribute__ ((warn_unused_result));
// Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
// Returns 0 if successful
int toku_brt_maybe_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn, BOOL oplsn_valid, LSN oplsn, BOOL do_logging, enum brt_msg_type type) __attribute__ ((warn_unused_result));
......
......@@ -23,7 +23,7 @@ enum brt_layout_version_e {
// ALERT ALERT ALERT: version 16 never released to customers, internal and beta use only
BRT_LAYOUT_VERSION_17 = 17, // Dr. No: Add STAT64INFO_S to brt_header
BRT_LAYOUT_VERSION_18 = 18, // Dr. No: Add HOT info to brt_header
BRT_LAYOUT_VERSION_19 = 19, // Doofenshmirtz: Add compression method, msn_for_upgrade, TODO
BRT_LAYOUT_VERSION_19 = 19, // Doofenshmirtz: Add compression method, highest_unused_msn_for_upgrade
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_13, // Minimum version supported
......
......@@ -139,7 +139,6 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
printf(" thisnodename=%" PRId64 "\n", n->thisnodename.b);
//printf(" log_lsn =%lld\n", n->log_lsn.lsn); // The log_lsn is a memory-only value.
printf(" height =%d\n", n->height);
printf(" optimized_for_upgrade = %u\n", n->optimized_for_upgrade);
printf(" layout_version=%d\n", n->layout_version);
printf(" layout_version_original=%d\n", n->layout_version_original);
printf(" layout_version_read_from_disk=%d\n", n->layout_version_read_from_disk);
......
......@@ -102,7 +102,7 @@ typedef struct __toku_lsn { u_int64_t lsn; } LSN;
* Make the MSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_msn { u_int64_t msn; } MSN;
#define ZERO_MSN ((MSN){0}) // dummy used for message construction, to be filled in when msg is applied to tree
#define MIN_MSN ((MSN){(u_int64_t)1000*1000*1000}) // first 1B values reserved for messages created before Dr. No (for upgrade)
#define MIN_MSN ((MSN){(u_int64_t)1 << 62}) // first 2^62 values reserved for messages created before Dr. No (for upgrade)
#define MAX_MSN ((MSN){UINT64_MAX})
/* At the brt layer, a FILENUM uniquely identifies an open file.
......@@ -273,6 +273,12 @@ enum reactivity {
RE_FISSIBLE
};
// Result code returned by the deserialization routines.
// Callers distinguish a checksum mismatch from every other kind of failure.
enum deserialize_error_code {
    DS_OK        = 0, // deserialization succeeded
    DS_XSUM_FAIL = 1, // a stored checksum did not match the computed checksum
    DS_ERRNO     = 2  // some other failure; the error number is stored in errno
};
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif
......
......@@ -12,6 +12,7 @@
#include <toku_portability.h>
#include "rbuf.h"
#include "x1764.h"
#include "mempool.h"
#if defined(__cplusplus) || defined(__cilkplusplus)
extern "C" {
......@@ -168,7 +169,9 @@ leafentry_disksize_13(LEAFENTRY_13 le);
int
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p);
LEAFENTRY *new_leafentry_p,
OMT omt,
struct mempool *mp);
#if defined(__cplusplus) || defined(__cilkplusplus)
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2011 Tokutek Inc. All rights reserved."
......@@ -74,6 +74,7 @@ int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
unsigned long eltsize, nodesize, repeat;
initialize_dummymsn();
if (argc != 4) {
fprintf(stderr, "Usage: %s <eltsize> <nodesize> <repeat>\n", argv[0]);
return 2;
......
......@@ -301,7 +301,6 @@ test_prefetching(void) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 1;
sn.optimized_for_upgrade = 1234;
sn.n_children = 3;
sn.dirty = 1;
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
......@@ -243,7 +243,6 @@ test_serialize_nonleaf(void) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 1;
sn.optimized_for_upgrade = 1234;
sn.n_children = 2;
sn.dirty = 1;
hello_string = toku_strdup("hello");
......@@ -344,7 +343,6 @@ test_serialize_leaf(void) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 2;
sn.dirty = 1;
LEAFENTRY elts[3];
......@@ -424,6 +422,7 @@ test_serialize_leaf(void) {
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
initialize_dummymsn();
test_serialize_nonleaf();
test_serialize_leaf();
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
......@@ -67,7 +67,6 @@ test_serialize_leaf(int valsize, int nelts, double entropy) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 8;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -194,7 +193,6 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 1;
sn.optimized_for_upgrade = 1234;
sn.n_children = 8;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -321,6 +319,7 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_
valsize = strtol(argv[1], NULL, 0);
nelts = strtol(argv[2], NULL, 0);
initialize_dummymsn();
test_serialize_leaf(valsize, nelts, entropy);
test_serialize_nonleaf(valsize, nelts, entropy);
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
......@@ -220,7 +220,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft, BOOL do_clone) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 2;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -290,7 +289,6 @@ test_serialize_leaf_check_msn(enum brtnode_verify_type bft, BOOL do_clone) {
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->optimized_for_upgrade == 1234);
assert(dn->n_children>=1);
assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn);
{
......@@ -364,7 +362,6 @@ test_serialize_leaf_with_large_pivots(enum brtnode_verify_type bft, BOOL do_clon
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = nrows;
sn.dirty = 1;
......@@ -508,7 +505,6 @@ test_serialize_leaf_with_many_rows(enum brtnode_verify_type bft, BOOL do_clone)
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 1;
sn.dirty = 1;
......@@ -649,7 +645,6 @@ test_serialize_leaf_with_large_rows(enum brtnode_verify_type bft, BOOL do_clone)
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 1;
sn.dirty = 1;
......@@ -798,7 +793,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft, BOOL
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 7;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -879,7 +873,6 @@ test_serialize_leaf_with_empty_basement_nodes(enum brtnode_verify_type bft, BOOL
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->optimized_for_upgrade == 1234);
assert(dn->n_children>0);
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
......@@ -949,7 +942,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 4;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -1006,7 +998,6 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum brtnode_verify_type
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->optimized_for_upgrade == 1234);
assert(dn->n_children == 1);
{
const u_int32_t npartitions = dn->n_children;
......@@ -1069,7 +1060,6 @@ test_serialize_leaf(enum brtnode_verify_type bft, BOOL do_clone) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 0;
sn.optimized_for_upgrade = 1234;
sn.n_children = 2;
sn.dirty = 1;
MALLOC_N(sn.n_children, sn.bp);
......@@ -1135,7 +1125,6 @@ test_serialize_leaf(enum brtnode_verify_type bft, BOOL do_clone) {
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 0);
assert(dn->optimized_for_upgrade == 1234);
assert(dn->n_children>=1);
{
// Man, this is way too ugly. This entire test suite needs to be refactored.
......@@ -1211,7 +1200,6 @@ test_serialize_nonleaf(enum brtnode_verify_type bft, BOOL do_clone) {
sn.layout_version = BRT_LAYOUT_VERSION;
sn.layout_version_original = BRT_LAYOUT_VERSION;
sn.height = 1;
sn.optimized_for_upgrade = 1234;
sn.n_children = 2;
sn.dirty = 1;
hello_string = toku_strdup("hello");
......@@ -1282,7 +1270,6 @@ test_serialize_nonleaf(enum brtnode_verify_type bft, BOOL do_clone) {
assert(dn->layout_version_original ==BRT_LAYOUT_VERSION);
assert(dn->layout_version_read_from_disk ==BRT_LAYOUT_VERSION);
assert(dn->height == 1);
assert(dn->optimized_for_upgrade == 1234);
assert(dn->n_children==2);
assert(strcmp(kv_pair_key(dn->childkeys[0]), "hello")==0);
assert(toku_brt_pivot_key_len(dn->childkeys[0])==6);
......@@ -1395,6 +1382,8 @@ test_serialize_nonleaf(enum brtnode_verify_type bft, BOOL do_clone) {
int
test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
initialize_dummymsn();
test_serialize_leaf(read_none, FALSE);
test_serialize_leaf(read_all, FALSE);
test_serialize_leaf(read_compressed, FALSE);
......
......@@ -30,8 +30,6 @@ static void test_header (void) {
h->layout_version_original = 13;
h->layout_version_read_from_disk = 14;
h->build_id_original = 1234;
h->num_blocks_to_upgrade_13 = 1013;
h->num_blocks_to_upgrade_14 = 1014;
h->in_memory_stats = (STAT64INFO_S) {10, 11};
h->on_disk_stats = (STAT64INFO_S) {20, 21};
h->checkpoint_staging_stats = (STAT64INFO_S) {30, 31};
......@@ -51,8 +49,6 @@ static void test_header (void) {
assert(h->layout_version_original == 13);
assert(h->layout_version_read_from_disk == BRT_LAYOUT_VERSION);
assert(h->build_id_original == 1234);
assert(h->num_blocks_to_upgrade_13 == 1013);
assert(h->num_blocks_to_upgrade_14 == 1014);
assert(h->in_memory_stats.numrows == expected_stats.numrows);
assert(h->on_disk_stats.numbytes == expected_stats.numbytes);
r = toku_close_brt_nolsn(t, 0); assert(r==0);
......
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#include "includes.h"
......@@ -89,6 +90,7 @@ test_fifo_enq (int n) {
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
initialize_dummymsn();
test_fifo_create();
test_fifo_enq(4);
test_fifo_enq(512);
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -171,6 +171,7 @@ test_main (int argc , const char *argv[]) {
int fanout = 2;
int nperleaf = 8;
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -46,9 +46,8 @@ append_leaf(BRT brt, BRTNODE leafnode, void *key, size_t keylen, void *val, size
MSN msn = next_dummymsn();
BRT_MSG_S cmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &theval } };
bool made_change;
u_int64_t workdone=0;
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd, &made_change, &workdone, NULL, NULL);
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd, &workdone, NULL, NULL);
{
int r = toku_brt_lookup(brt, &thekey, lookup_checkf, &pair);
assert(r==0);
......@@ -56,7 +55,7 @@ append_leaf(BRT brt, BRTNODE leafnode, void *key, size_t keylen, void *val, size
}
BRT_MSG_S badcmd = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &badval } };
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &badcmd, &made_change, &workdone, NULL, NULL);
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &badcmd, &workdone, NULL, NULL);
// message should be rejected for duplicate msn, row should still have original val
......@@ -69,7 +68,7 @@ append_leaf(BRT brt, BRTNODE leafnode, void *key, size_t keylen, void *val, size
// now verify that message with proper msn gets through
msn = next_dummymsn();
BRT_MSG_S cmd2 = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &val2 } };
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd2, &made_change, &workdone, NULL, NULL);
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd2, &workdone, NULL, NULL);
// message should be accepted, val should have new value
{
......@@ -81,7 +80,7 @@ append_leaf(BRT brt, BRTNODE leafnode, void *key, size_t keylen, void *val, size
// now verify that message with lesser (older) msn is rejected
msn.msn = msn.msn - 10;
BRT_MSG_S cmd3 = { BRT_INSERT, msn, xids_get_root_xids(), .u.id = { &thekey, &badval } };
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd3, &made_change, &workdone, NULL, NULL);
toku_apply_cmd_to_leaf(brt->compare_fun, brt->update_fun, &brt->h->cmp_descriptor, leafnode, &cmd3, &workdone, NULL, NULL);
// message should be rejected, val should still have value in pair2
{
......@@ -165,6 +164,7 @@ usage(void) {
int
test_main (int argc , const char *argv[]) {
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
......@@ -127,8 +127,7 @@ insert_random_message_to_leaf(BRT t, BRTNODE leafnode, BASEMENTNODE blb, LEAFENT
int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, &memsize, save, NULL, NULL, NULL, NULL, NULL, &numbytes);
assert_zero(r);
bool made_change;
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, leafnode, blb, &msg, &made_change, NULL, NULL, NULL);
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, leafnode, blb, &msg, NULL, NULL, NULL);
if (msn.msn > blb->max_msn_applied.msn) {
blb->max_msn_applied = msn;
}
......@@ -169,12 +168,11 @@ insert_same_message_to_leaves(BRT t, BRTNODE child1, BASEMENTNODE blb1, BRTNODE
int64_t numbytes;
int r = apply_msg_to_leafentry(&msg, NULL, &memsize, save, NULL, NULL, NULL, NULL, NULL, &numbytes);
assert_zero(r);
bool made_change;
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, child1, blb1, &msg, &made_change, NULL, NULL, NULL);
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, child1, blb1, &msg, NULL, NULL, NULL);
if (msn.msn > blb1->max_msn_applied.msn) {
blb1->max_msn_applied = msn;
}
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, child2, blb2, &msg, &made_change, NULL, NULL, NULL);
brt_leaf_put_cmd(t->compare_fun, t->update_fun, NULL, child2, blb2, &msg, NULL, NULL, NULL);
if (msn.msn > blb2->max_msn_applied.msn) {
blb2->max_msn_applied = msn;
}
......@@ -586,8 +584,7 @@ flush_to_leaf(BRT t, bool make_leaf_up_to_date, bool use_flush) {
if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) {
bool made_change;
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child, parent_messages[i], &made_change, NULL, NULL, NULL);
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child, parent_messages[i], NULL, NULL, NULL);
}
}
for (i = 0; i < 8; ++i) {
......@@ -811,8 +808,7 @@ flush_to_leaf_with_keyrange(BRT t, bool make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (dummy_cmp(NULL, parent_messages[i]->u.id.key, &childkeys[7]) <= 0 &&
!parent_messages_is_fresh[i]) {
bool made_change;
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child, parent_messages[i], &made_change, NULL, NULL, NULL);
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child, parent_messages[i], NULL, NULL, NULL);
}
}
for (i = 0; i < 8; ++i) {
......@@ -999,9 +995,8 @@ compare_apply_and_flush(BRT t, bool make_leaf_up_to_date) {
if (make_leaf_up_to_date) {
for (i = 0; i < num_parent_messages; ++i) {
if (!parent_messages_is_fresh[i]) {
bool made_change;
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child1, parent_messages[i], &made_change, NULL, NULL, NULL);
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child2, parent_messages[i], &made_change, NULL, NULL, NULL);
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child1, parent_messages[i], NULL, NULL, NULL);
toku_apply_cmd_to_leaf(t->compare_fun, t->update_fun, &t->h->descriptor, child2, parent_messages[i], NULL, NULL, NULL);
}
}
for (i = 0; i < 8; ++i) {
......@@ -1124,6 +1119,7 @@ int
test_main (int argc, const char *argv[]) {
parse_args(argc, argv);
initialize_dummymsn();
int r;
CACHETABLE ct;
r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
......
......@@ -27,7 +27,7 @@ const ITEMLEN len_ignore = 0xFFFFFFFF;
// dummymsn needed to simulate msn because test messages are injected at a lower level than toku_brt_root_put_cmd()
#define MIN_DUMMYMSN ((MSN) {(uint64_t)1<<48})
#define MIN_DUMMYMSN ((MSN) {(uint64_t)1<<62})
static MSN dummymsn;
static int dummymsn_initialized = 0;
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
#ident "Id:"
......@@ -74,6 +74,7 @@ static void test_3748 (void) {
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
initialize_dummymsn();
test_3748();
return 0;
......
......@@ -85,7 +85,6 @@ setup_brtnode_header(struct brtnode *node)
node->layout_version = BRT_LAYOUT_VERSION;
node->layout_version_original = BRT_LAYOUT_VERSION;
node->height = 0;
node->optimized_for_upgrade = 1324;
node->dirty = 1;
node->totalchildkeylens = 0;
}
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#include <unistd.h>
#include <stdlib.h>
#include <sys/time.h>
#include "test.h"
#include "brt-flusher.h"
#include "includes.h"
static TOKUTXN const null_txn = NULL;
static DB * const null_db = NULL;
// Cursor callback that ignores the key/value it is handed and only counts
// how often it was invoked.
//   extra: must point at an int; that int is incremented once per call.
// Always returns 0.
static int
noop_getf(ITEMLEN UU(keylen), bytevec UU(key), ITEMLEN UU(vallen), bytevec UU(val), void *extra, bool UU(lock_only))
{
    int *const counter = extra;
    *counter += 1;
    return 0;
}
// Tree callback: opens a cursor on t, calls toku_brt_cursor_first with the
// counting callback and checks that the callback ran exactly once, then
// closes the cursor.  Every step is checked with CKERR; returns 0.
static int
get_one_value(BRT t, CACHETABLE UU(ct), void *UU(extra))
{
    BRT_CURSOR cursor;
    int r = toku_brt_cursor(t, &cursor, null_txn, false, false);
    CKERR(r);

    // noop_getf bumps this counter each time it is invoked.
    int called = 0;
    r = toku_brt_cursor_first(cursor, noop_getf, &called);
    CKERR(r);
    assert(called == 1);

    r = toku_brt_cursor_close(cursor);
    CKERR(r);
    return 0;
}
// Progress callback handed to toku_brt_hot_optimize.
//   extra:    points at a float threshold.
//   fraction: progress value reported by the caller.
// Returns 1 once fraction is strictly greater than the threshold, 0 otherwise.
static int
progress(void *extra, float fraction)
{
    const float *threshold = extra;
    // A C comparison already evaluates to the int 1 or 0.
    return fraction > *threshold;
}
// Tree callback: runs toku_brt_hot_optimize on t with `progress` as the
// progress callback.
//   extra: points at a float threshold, forwarded unchanged to `progress`.
// When the threshold is below 1.0 the call is required to return 1;
// otherwise it is required to return 0.  Returns 0.
static int
do_hot_optimize(BRT t, CACHETABLE UU(ct), void *extra)
{
    int r = toku_brt_hot_optimize(t, progress, extra);

    const float *threshold = extra;
    const bool expect_early_stop = *threshold < 1.0;
    if (!expect_early_stop) {
        CKERR(r);
    } else {
        CKERR2(r, 1);
    }
    return 0;
}
// Tree callback placeholder: despite the name, it currently performs no
// insertion and only asserts that the tree handle t is set.  Returns 0.
static int insert_something(BRT t, CACHETABLE UU(ct), void *UU(extra))
{
    assert(t);
    return 0;
}
// Tree callback placeholder: despite the name, it currently performs no
// scan and only asserts that the tree handle t is set.  Returns 0.
static int scan_tree(BRT t, CACHETABLE UU(ct), void *UU(extra))
{
    assert(t);
    return 0;
}
// Signature of the callbacks that with_open_tree runs against an open tree:
// they receive the tree, its cachetable and a caller-supplied pointer, and
// their int result is what with_open_tree returns.
typedef int (*tree_cb)(BRT t, CACHETABLE ct, void *extra);
// Creates a cachetable, opens the tree stored in fname, runs cb on it and
// then closes the tree and the cachetable again.
//   fname:    path of the tree file to open.
//   cb:       callback to run while the tree is open.
//   cb_extra: passed through to cb untouched.
// Each setup and teardown step is checked with CKERR.  Returns whatever cb
// returned.
static int
with_open_tree(const char *fname, tree_cb cb, void *cb_extra)
{
    CACHETABLE ct;
    int r = toku_brt_create_cachetable(&ct, 16 << 20, ZERO_LSN, NULL_LOGGER);
    CKERR(r);

    BRT t;
    // NOTE(review): the two size arguments look like node size and basement
    // node size -- confirm against the toku_open_brt prototype.
    r = toku_open_brt(fname, 0, &t,
                      4 << 20, 128 << 10,
                      ct, null_txn, toku_builtin_compare_fun, null_db);
    CKERR(r);

    // Hold on to the callback's result; r is reused for the teardown checks.
    const int cb_result = cb(t, ct, cb_extra);

    r = toku_close_brt_nolsn(t, 0);
    CKERR(r);
    r = toku_cachetable_close(&ct);
    CKERR(r);

    return cb_result;
}
#define TMPBRTFMT "%s-tmpdata.brt"
static const char *origbrt_5_0 = "upgrade_test_data.brt.5.0";
static const char *origbrt_4_2 = "upgrade_test_data.brt.4.2";
// Runs the upgrade scenario against a scratch copy of one data file.
//   prog:    program name; the scratch file is named by expanding TMPBRTFMT
//            with it.
//   origbrt: path of the data file to copy.
// Steps: copy origbrt to the scratch file with `cp`, then open the scratch
// tree five times in a row (get_one_value, insert_something, do_hot_optimize
// with threshold 0.5, do_hot_optimize with threshold 1.0, scan_tree) and
// finally unlink the scratch file.  Every step is checked with CKERR.
// Returns the result of the final unlink.
static int
run_test(const char *prog, const char *origbrt) {
    // TMPBRTFMT contains a single "%s" (2 characters) that prog replaces.
    const size_t tempbrt_len = strlen(TMPBRTFMT) - 2 + strlen(prog);
    char tempbrt[tempbrt_len + 1];
    snprintf(tempbrt, sizeof tempbrt, TMPBRTFMT, prog);

    int r;
    {
        // "cp " plus the blank between the two paths make up the 4 extra bytes.
        const size_t cmd_len = strlen(origbrt) + strlen(tempbrt) + 4;
        char cmd[cmd_len + 1];
        snprintf(cmd, sizeof cmd, "cp %s %s", origbrt, tempbrt);
        r = system(cmd);
        CKERR(r);
    }

    r = with_open_tree(tempbrt, get_one_value, NULL);
    CKERR(r);
    r = with_open_tree(tempbrt, insert_something, NULL);
    CKERR(r);

    // First pass stops early (threshold 0.5), second pass runs to completion.
    float fraction = 0.5;
    r = with_open_tree(tempbrt, do_hot_optimize, &fraction);
    CKERR(r);
    fraction = 1.0;
    r = with_open_tree(tempbrt, do_hot_optimize, &fraction);
    CKERR(r);

    r = with_open_tree(tempbrt, scan_tree, NULL);
    CKERR(r);

    r = unlink(tempbrt);
    CKERR(r);
    return r;
}
// Test entry point: runs run_test on origbrt_5_0 and then on origbrt_4_2,
// passing argv[0] as the program name.  Each run is checked with CKERR.
// Returns the result of the last run.
int
test_main(int argc __attribute__((__unused__)), const char *argv[])
{
    const char *const inputs[] = { origbrt_5_0, origbrt_4_2 };
    int r = 0;
    for (size_t i = 0; i < sizeof inputs / sizeof inputs[0]; ++i) {
        r = run_test(argv[0], inputs[i]);
        CKERR(r);
    }
    return r;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -174,6 +174,7 @@ usage(void) {
int
test_main (int argc , const char *argv[]) {
initialize_dummymsn();
int height = 1;
int fanout = 2;
int nperleaf = 8;
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -145,6 +145,7 @@ test_main (int argc , const char *argv[]) {
int fanout = 2;
int nperleaf = 8;
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -100,6 +100,7 @@ usage(void) {
int
test_main (int argc , const char *argv[]) {
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -145,6 +145,7 @@ test_main (int argc , const char *argv[]) {
int fanout = 3;
int nperleaf = 8;
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -160,6 +160,7 @@ test_main (int argc , const char *argv[]) {
int fanout = 2;
int nperleaf = 8;
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -100,6 +100,7 @@ usage(void) {
int
test_main (int argc , const char *argv[]) {
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
/* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */
#ident "$Id$"
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
......@@ -145,6 +145,7 @@ test_main (int argc , const char *argv[]) {
int fanout = 3;
int nperleaf = 8;
int do_verify = 1;
initialize_dummymsn();
for (int i = 1; i < argc; i++) {
const char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
......
......@@ -2326,15 +2326,20 @@ leafentry_disksize_13(LEAFENTRY_13 le) {
int
toku_le_upgrade_13_14(LEAFENTRY_13 old_leafentry,
size_t *new_leafentry_memorysize,
LEAFENTRY *new_leafentry_p) {
LEAFENTRY *new_leafentry_p,
OMT omt,
struct mempool *mp) {
ULE_S ule;
int rval;
invariant(old_leafentry);
le_unpack_13(&ule, old_leafentry);
// We used to pass NULL for omt and mempool, so that we would use
// malloc instead of a mempool. However after supporting upgrade,
// we need to use mempools and the OMT.
rval = le_pack(&ule, // create packed leafentry
new_leafentry_memorysize,
new_leafentry_p,
NULL, NULL, NULL); // NULL for omt means that we use malloc instead of mempool
omt, mp, NULL);
ule_cleanup(&ule);
return rval;
}
......
......@@ -54,6 +54,7 @@ enum {ROWS_PER_TRANSACTION=10000};
uint NUM_DBS=1;
uint NUM_ROWS=100000;
int CHECK_RESULTS=0;
int optimize=0;
int littlenode = 0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
......@@ -136,6 +137,11 @@ static void preload_dbs(DB **dbs)
nested_insert(dbs, 0, NULL, row, generated_value);
}
if (optimize) {
if (verbose) { printf("\noptimizing");fflush(stdout);}
do_hot_optimize_on_dbs(env, dbs, 1);
}
if ( CHECK_RESULTS) {
if ( verbose ) {printf("\nchecking");fflush(stdout);}
check_results_nested(&dbs[0], NUM_ROWS);
......@@ -288,6 +294,8 @@ static void do_args(int argc, char * const argv[]) {
CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-n")==0) {
littlenode = 1;
} else if (strcmp(argv[0], "-o")==0) {
optimize = 1;
} else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1;
......
......@@ -20,6 +20,7 @@ enum {ROWS_PER_TRANSACTION=10000};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
int optimize=0;
int littlenode = 0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
......@@ -71,6 +72,11 @@ static void preload_dbs(DB **dbs)
if ( key.flags ) { toku_free(key.data); key.data = NULL; }
if ( val.flags ) { toku_free(val.data); key.data = NULL; }
if (optimize) {
if (verbose) { printf("\noptimizing");fflush(stdout);}
do_hot_optimize_on_dbs(env, dbs, NUM_DBS);
}
if ( CHECK_RESULTS) {
if ( verbose ) {printf("\nchecking");fflush(stdout);}
check_results(env, dbs, NUM_DBS, NUM_ROWS);
......@@ -193,6 +199,8 @@ static void do_args(int argc, char * const argv[]) {
CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-n")==0) {
littlenode = 1;
} else if (strcmp(argv[0], "-o")==0) {
optimize = 1;
} else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1;
......
......@@ -85,6 +85,21 @@ pkey_for_val(int key, int i) {
}
// Progress callback that ignores both arguments and always returns 0.
static int __attribute__((unused)) dummy_progress(void *UU(extra), float UU(progress))
{
    return 0;
}
// Calls hot_optimize on each of the first num_dbs handles in dbs, in order,
// with dummy_progress as the progress callback and NULL as its extra
// argument.  Each call's result is checked with CKERR.
//   env: unused.
static void __attribute__((unused))
do_hot_optimize_on_dbs(DB_ENV *UU(env), DB **dbs, int num_dbs)
{
    int i = 0;
    while (i < num_dbs) {
        DB *const db = dbs[i];
        int r = db->hot_optimize(db, dummy_progress, NULL);
        CKERR(r);
        ++i;
    }
}
// don't check first n rows (expect to have been deleted)
static void UU()
check_results_after_row_n(DB_ENV *env, DB **dbs, const int num_dbs, const int num_rows, const int first_row_to_check) {
......
......@@ -31,6 +31,16 @@ char *db_v4_dir = OLDDATADIR "env_preload.4.2.0.cleanshutdown";
char *db_v4_dir_node4k = OLDDATADIR "env_preload.4.2.0.node4k.cleanshutdown";
char *db_v4_dir_flat = OLDDATADIR "env_preload.4.2.0.flat.cleanshutdown";
// HACK: Newer versions of the database/brt to use with this old
// upgrade test code.
char *db_v6_dir = OLDDATADIR "env_preload.5.0.8.cleanshutdown";
char *db_v6_dir_node4k = OLDDATADIR "env_preload.5.0.8.node4k.cleanshutdown";
char *db_v6_dir_flat = OLDDATADIR "env_preload.5.0.8.flat.cleanshutdown";
char *db_v7_dir = OLDDATADIR "env_preload.5.2.7.cleanshutdown";
char *db_v7_dir_node4k = OLDDATADIR "env_preload.5.2.7.node4k.cleanshutdown";
char *db_v7_dir_flat = OLDDATADIR "env_preload.5.2.7.flat.cleanshutdown";
// should put this in test.h:
static __attribute__((__unused__)) int
......@@ -98,6 +108,24 @@ static void setup(void) {
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else if (SRC_VERSION == 6) {
if (flat) {
src_db_dir = db_v6_dir_flat;
} else if (littlenode) {
src_db_dir = db_v6_dir_node4k;
} else {
src_db_dir = db_v6_dir;
}
}
else if (SRC_VERSION == 7) {
if (flat) {
src_db_dir = db_v7_dir_flat;
} else if (littlenode) {
src_db_dir = db_v7_dir_node4k;
} else {
src_db_dir = db_v7_dir;
}
}
else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment