Commit 548d03d7 authored by Barry Perlman, committed by Yoni Fogel

[t:2892] Merge upgrade logic to main.  Merge command was svn merge --accept=postpone -r25293:HEAD ../tokudb.main+2892 .

git-svn-id: file:///svn/toku/tokudb@25303 c7de825b-a66e-492c-adef-691d508d4ae1
parent 35800b4c
......@@ -232,6 +232,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -234,6 +234,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -72,6 +72,7 @@ enum {
TOKUDB_NO_DATA = -100008,
TOKUDB_ACCEPT = -100009,
TOKUDB_MVCC_DICTIONARY_TOO_NEW = -100010,
TOKUDB_UPGRADE_FAILURE = -100011,
};
static void print_defines (void) {
......@@ -218,6 +219,7 @@ static void print_defines (void) {
dodefine(TOKUDB_NO_DATA);
dodefine(TOKUDB_ACCEPT);
dodefine(TOKUDB_MVCC_DICTIONARY_TOO_NEW);
dodefine(TOKUDB_UPGRADE_FAILURE);
/* LOADER flags */
printf("/* LOADER flags */\n");
......
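// Note on the repeated hunks above: the various copies of db.h are generated headers,
// so the new TOKUDB_UPGRADE_FAILURE code appears once in the source enum and once per
// generated header.  print_defines() emits one #define per enum member via dodefine();
// the real dodefine() is not shown in this diff, but a stringizing printf of roughly
// this form
//
//     #define dodefine(name) printf("#define %s %d\n", #name, name)
//
// would produce exactly the "#define TOKUDB_UPGRADE_FAILURE -100011" line seen in
// each db.h hunk above.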
......@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -235,6 +235,7 @@ typedef enum {
#define TOKUDB_NO_DATA -100008
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
/* LOADER flags */
#define LOADER_USE_PUTS 1
/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/
......
......@@ -748,8 +748,7 @@ static void
translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize
DISKOFF location_on_disk, //Location of translation_buffer
u_int64_t size_on_disk,
unsigned char * translation_buffer,
BOOL invert_checksum) { // buffer with serialized translation
unsigned char * translation_buffer) { // buffer with serialized translation
assert(location_on_disk!=0);
t->type = TRANSLATION_CHECKPOINTED;
{
......@@ -758,9 +757,6 @@ translation_deserialize_from_buffer(struct translation *t, // destination int
u_int64_t offset = size_on_disk - 4;
//printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
if (invert_checksum) {
x1764 = ~x1764;
}
assert(x1764 == stored_x1764);
}
struct rbuf rt;
......@@ -808,10 +804,9 @@ void
toku_blocktable_create_from_buffer(BLOCK_TABLE *btp,
DISKOFF location_on_disk, //Location of translation_buffer
DISKOFF size_on_disk,
unsigned char *translation_buffer,
BOOL invert_checksum) {
unsigned char *translation_buffer) {
BLOCK_TABLE bt = blocktable_create_internal();
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer, invert_checksum);
translation_deserialize_from_buffer(&bt->checkpointed, location_on_disk, size_on_disk, translation_buffer);
blocktable_note_translation(bt->block_allocator, &bt->checkpointed);
// we just filled in checkpointed, now copy it to current.
copy_translation(&bt->current, &bt->checkpointed, TRANSLATION_CURRENT);
......
......@@ -21,7 +21,7 @@ struct block_translation_pair {
};
void toku_blocktable_create_new(BLOCK_TABLE *btp);
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer, BOOL invert_checksum);
void toku_blocktable_create_from_buffer(BLOCK_TABLE *btp, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
void toku_blocktable_destroy(BLOCK_TABLE *btp);
void toku_brtheader_lock(struct brt_header *h);
......
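// Illustrative sketch (not part of this patch): with layout versions 11 and older no
// longer supported, the translation-table checksum is always the plain x1764, so the
// invert_checksum flag is gone.  Assuming x1764_memory() and toku_dtoh32() behave as
// they do elsewhere in this diff, verification reduces to:
static BOOL translation_checksum_ok(unsigned char *translation_buffer, u_int64_t size_on_disk) {
    u_int64_t offset   = size_on_disk - 4;   // stored checksum is the last 4 bytes
    u_int32_t computed = x1764_memory(translation_buffer, offset);
    u_int32_t stored   = toku_dtoh32(*(u_int32_t*)(translation_buffer + offset));
    return computed == stored;               // no "~computed" special case any more
}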
......@@ -6,6 +6,7 @@
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include "brt_layout_version.h"
#include "toku_assert.h"
#include "block_allocator.h"
#include "cachetable.h"
......@@ -44,7 +45,7 @@ enum { BUFFER_HEADER_SIZE = (4 // height//
struct subtree_estimates {
// estimate number of rows in the tree by counting the number of rows
// in the leaves. The stuff in the internal nodes is likely to be off O(1).
u_int64_t nkeys; // number of distinct keys.
u_int64_t nkeys; // number of distinct keys (obsolete with removal of dupsort, but not worth removing)
u_int64_t ndata; // number of key-data pairs (previously leafentry_estimate)
u_int64_t dsize; // total size of leafentries
BOOL exact; // are the estimates exact?
......@@ -82,7 +83,6 @@ struct brtnode_nonleaf_childinfo {
unsigned int n_bytes_in_buffer; /* How many bytes are in each buffer (including overheads for the disk-representation) */
};
typedef struct brtnode *BRTNODE;
/* Internal nodes. */
struct brtnode {
unsigned int nodesize;
......@@ -121,6 +121,7 @@ struct brtnode {
} n;
struct leaf {
struct subtree_estimates leaf_stats; // actually it is exact.
uint32_t optimized_for_upgrade; // version number to which this leaf has been optimized, zero if never optimized for upgrade
OMT buffer;
LEAFLOCK_POOL leaflock_pool;
LEAFLOCK leaflock;
......@@ -166,7 +167,7 @@ struct brt_header {
int layout_version_original; // different (<) from layout_version if upgraded from a previous version (useful for debugging)
int layout_version_read_from_disk; // transient, not serialized to disk
BOOL upgrade_brt_performed; // initially FALSE, set TRUE when brt has been fully updated (even though nodes may not have been)
uint64_t num_blocks_to_upgrade; // Number of blocks still not newest version. When we release layout 13 we may need to turn this to an array.
int64_t num_blocks_to_upgrade; // Number of v12 blocks still not newest version. When we release layout 14 we may need to turn this to an array or add more variables.
unsigned int nodesize;
BLOCKNUM root; // roots of the dictionary
struct remembered_hash root_hash; // hash of the root offset.
......@@ -269,7 +270,7 @@ struct brtenv {
};
extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, long size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint);
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs);
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, int*dirty, void*extraargs);
extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn);
extern int toku_read_brt_header_and_store_in_cachefile (CACHEFILE cf, struct brt_header **header, BOOL* was_open);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash);
......@@ -352,21 +353,6 @@ void toku_verify_all_in_mempool(BRTNODE node);
int toku_verify_brtnode (BRT brt, BLOCKNUM blocknum, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) ;
enum brt_layout_version_e {
BRT_LAYOUT_VERSION_5 = 5,
BRT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatabase flags #333
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
};
void toku_brtheader_free (struct brt_header *h);
int toku_brtheader_close (CACHEFILE cachefile, int fd, void *header_v, char **error_string, BOOL oplsn_valid, LSN oplsn);
int toku_brtheader_begin_checkpoint (CACHEFILE cachefile, int fd, LSN checkpoint_lsn, void *header_v);
......@@ -380,9 +366,10 @@ int toku_brt_remove_now(CACHETABLE ct, DBT* iname_dbt_p);
typedef struct brt_upgrade_status {
u_int64_t header;
u_int64_t nonleaf;
u_int64_t leaf;
u_int64_t header_12; // how many headers upgrade from version 12
u_int64_t nonleaf_12;
u_int64_t leaf_12;
u_int64_t optimized_for_upgrade_12; // how many optimize_for_upgrade messages sent
} BRT_UPGRADE_STATUS_S, *BRT_UPGRADE_STATUS;
void toku_brt_get_upgrade_status(BRT_UPGRADE_STATUS);
......
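// Illustrative sketch (not part of this patch; assumes <stdio.h> and <inttypes.h>):
// the *_12 counters above are bumped with toku_sync_fetch_and_increment_uint64()
// during deserialization, so a status reporter can snapshot upgrade progress:
static void print_upgrade_progress(void) {
    BRT_UPGRADE_STATUS_S s;
    toku_brt_get_upgrade_status(&s);
    printf("headers upgraded from v12:          %" PRIu64 "\n", s.header_12);
    printf("nonleaf nodes upgraded from v12:    %" PRIu64 "\n", s.nonleaf_12);
    printf("leaf nodes upgraded from v12:       %" PRIu64 "\n", s.leaf_12);
    printf("optimize-for-upgrade messages sent: %" PRIu64 "\n", s.optimized_for_upgrade_12);
}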
......@@ -225,6 +225,7 @@ toku_serialize_brtnode_size_slow (BRTNODE node) {
invariant(hsize==node->u.l.n_bytes_in_buffer);
hsize += 4; // add n entries in buffer table
hsize += 3*8; // add the three leaf stats, but no exact bit
hsize += 4; // optimized_for_upgrade
size += 4 + 1*stored_sub_block_map_size; // one partition
return size+hsize;
}
......@@ -247,6 +248,7 @@ toku_serialize_brtnode_size (BRTNODE node) {
} else {
result += 4; // n_entries in buffer table
result += 3*8; // the three leaf stats
result += 4; // optimized_for_upgrade
result += node->u.l.n_bytes_in_buffer;
result += 4 + 1*stored_sub_block_map_size; // one partition
}
......@@ -372,6 +374,8 @@ serialize_leaf(BRTNODE node, int n_sub_blocks, struct sub_block sub_block[], str
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.ndata);
wbuf_nocrc_ulonglong(wbuf, node->u.l.leaf_stats.dsize);
wbuf_nocrc_int(wbuf, node->u.l.optimized_for_upgrade);
// RFP partition the leaf elements. for now, 1 partition
const int npartitions = 1;
wbuf_nocrc_int(wbuf, npartitions);
......@@ -732,6 +736,13 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
result->u.l.leaf_stats.dsize = rbuf_ulonglong(rb);
result->u.l.leaf_stats.exact = TRUE;
if (result->layout_version >= BRT_LAYOUT_VERSION_13) {
result->u.l.optimized_for_upgrade = rbuf_int(rb);
}
else {
result->u.l.optimized_for_upgrade = 0;
}
// deserialize the number of partitions
int npartitions = rbuf_int(rb);
invariant(npartitions == 1);
......@@ -759,14 +770,31 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
u_int32_t actual_sum = 0;
u_int32_t start_of_data = rb->ndone;
OMTVALUE *MALLOC_N(n_in_buf, array);
for (int i=0; i<n_in_buf; i++) {
LEAFENTRY le = (LEAFENTRY)(&rb->buf[rb->ndone]);
u_int32_t disksize = leafentry_disksize(le);
rb->ndone += disksize;
invariant(rb->ndone<=rb->size);
array[i]=(OMTVALUE)le;
actual_sum += x1764_memory(le, disksize);
if (result->layout_version == BRT_LAYOUT_VERSION) {
for (int i=0; i<n_in_buf; i++) {
LEAFENTRY le = (LEAFENTRY)(&rb->buf[rb->ndone]);
u_int32_t disksize = leafentry_disksize(le);
rb->ndone += disksize;
invariant(rb->ndone<=rb->size);
array[i]=(OMTVALUE)le;
actual_sum += x1764_memory(le, disksize);
}
}
else if (result->layout_version == BRT_LAYOUT_VERSION_12) {
for (int i=0; i<n_in_buf; i++) {
// these two lines and optimized_for_upgrade logic above are only difference in handling
// versions 12 and 13 at this layer (more logic at higher layer)
LEAFENTRY_12 le = (LEAFENTRY_12)(&rb->buf[rb->ndone]);
u_int32_t disksize = leafentry_disksize_12(le);
rb->ndone += disksize;
invariant(rb->ndone<=rb->size);
array[i]=(OMTVALUE)le;
actual_sum += x1764_memory(le, disksize);
}
}
else {
invariant(FALSE);
}
toku_trace("fill array");
u_int32_t end_of_data = rb->ndone;
......@@ -822,7 +850,8 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b
bytevec magic;
rbuf_literal_bytes(rb, &magic, 8);
result->layout_version = rbuf_int(rb);
invariant(result->layout_version == BRT_LAYOUT_VERSION);
invariant(result->layout_version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(result->layout_version <= BRT_LAYOUT_VERSION);
result->layout_version_original = rbuf_int(rb);
result->layout_version_read_from_disk = result->layout_version;
result->nodesize = rbuf_int(rb);
......@@ -927,8 +956,10 @@ decompress_from_raw_block_into_rbuf(u_int8_t *raw_block, size_t raw_block_size,
static int
decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
// This function exists solely to accommodate future changes in compression.
int r;
switch (version) {
case BRT_LAYOUT_VERSION_12:
case BRT_LAYOUT_VERSION:
r = decompress_from_raw_block_into_rbuf(raw_block, raw_block_size, rb, blocknum);
break;
......@@ -941,27 +972,87 @@ decompress_from_raw_block_into_rbuf_versioned(u_int32_t version, u_int8_t *raw_b
static int
deserialize_brtnode_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h, struct rbuf *rb) {
int r = 0;
BRTNODE brtnode_12 = NULL;
int upgrade = 0;
switch (version) {
case BRT_LAYOUT_VERSION:
if (!upgrade)
r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &brtnode_12, h, rb);
if (r==0) {
lazy_assert(brtnode_12);
*brtnode = brtnode_12;
}
if (upgrade && r == 0) {
toku_brtheader_lock(h);
lazy_assert(h->num_blocks_to_upgrade>0);
h->num_blocks_to_upgrade--;
toku_brtheader_unlock(h);
(*brtnode)->dirty = 1;
}
break; // this is the only break
default:
lazy_assert(FALSE);
BRTNODE node = NULL;
r = deserialize_brtnode_from_rbuf(blocknum, fullhash, &node, h, rb); // we just filled the node with contents from rbuf
if (r==0) {
invariant(node);
int upgrade = 0;
switch (version) {
case BRT_LAYOUT_VERSION_12:
invariant(node->layout_version == BRT_LAYOUT_VERSION_12);
//Any upgrade necessary.
if (node->height == 0) {
//leaf
uint32_t i;
OMT omt = node->u.l.buffer;
uint32_t num_les = toku_omt_size(omt);
LEAFENTRY *XCALLOC_N(num_les, new_les);
OMTVALUE v;
u_int32_t incremental_fingerprint = 0;
u_int32_t incremental_size = 0;
for (i = 0; i < num_les; i++) {
r = toku_omt_fetch(omt, i, &v, NULL);
invariant(r==0);
size_t new_memsize, new_disksize;
// Translate packed version 12 leafentry to packed version 13 leafentry
r = toku_le_upgrade_12_13(v, &new_memsize, &new_disksize, &new_les[i]);
invariant(r==0);
invariant(new_memsize == new_disksize);
incremental_size += OMT_ITEM_OVERHEAD + new_memsize;
incremental_fingerprint += toku_le_crc(new_les[i]);
}
//Regenerate fingerprint.
node->local_fingerprint = node->rand4fingerprint * incremental_fingerprint;
//Set buffer size.
node->u.l.n_bytes_in_buffer = incremental_size;
//Replace mempool (destroy old, create new).
uint8_t *p;
{
void *mpbase = toku_mempool_get_base(&node->u.l.buffer_mempool);
toku_mempool_fini(&node->u.l.buffer_mempool);
toku_free(mpbase);
mpbase = toku_xmalloc(incremental_size);
toku_mempool_init(&node->u.l.buffer_mempool, mpbase, incremental_size);
node->u.l.buffer_mempool.free_offset = incremental_size;
p = mpbase;
}
//p points to beginning of new mempool
for (i = 0; i < num_les; i++) {
size_t len = leafentry_memsize(new_les[i]);
memcpy(p, new_les[i], len);
r = toku_omt_set_at(omt, p, i);
invariant(r==0);
p += len;
toku_free(new_les[i]); //Free malloced version of new leafentry (copy exists in mempool)
}
toku_free(new_les); // Free array of pointers to new leafentries
//Regenerate nkeys, ndata, dsize
toku_brt_leaf_reset_calc_leaf_stats(node);
toku_sync_fetch_and_increment_uint64(&upgrade_status.leaf_12); // how many leaf nodes upgraded from v12
}
else {
toku_sync_fetch_and_increment_uint64(&upgrade_status.nonleaf_12); // how many nonleaf nodes upgraded from v12
}
node->flags &= ~TOKU_DB_VALCMP_BUILTIN_12; // delete obsolete flag
node->layout_version = BRT_LAYOUT_VERSION;
upgrade++;
//Fall through on purpose
case BRT_LAYOUT_VERSION:
invariant(node->layout_version == BRT_LAYOUT_VERSION);
if (upgrade) {
toku_brtheader_lock(h);
invariant(h->num_blocks_to_upgrade>0);
h->num_blocks_to_upgrade--;
toku_brtheader_unlock(h);
node->dirty = 1;
}
*brtnode = node;
break; // this is the only break
default:
invariant(FALSE);
}
}
return r;
}
......@@ -1051,21 +1142,32 @@ cleanup:
int
toku_maybe_upgrade_brt(BRT t) { // possibly do some work to complete the version upgrade of brt
// If someday we need to inject a message to upgrade the brt, this is where
// it should be done. Whenever an upgrade is done, all nodes will be marked
// as dirty, so it makes sense here to always inject an OPTIMIZE message.
// (Note, if someday the version number is stored in the translation instead
// of in each node, then the upgrade would not necessarily dirty each node.)
int r = 0;
int version = t->h->layout_version_read_from_disk;
if (!t->h->upgrade_brt_performed) {
int upgrade = 0;
if (!t->h->upgrade_brt_performed) { // upgrade may be necessary
switch (version) {
case BRT_LAYOUT_VERSION_11:
r = 0;
//Fall through on purpose.
case BRT_LAYOUT_VERSION:
if (r == 0) {
t->h->upgrade_brt_performed = TRUE;
}
break;
default:
lazy_assert(FALSE);
case BRT_LAYOUT_VERSION_12:
r = 0;
upgrade++;
//Fall through on purpose.
case BRT_LAYOUT_VERSION:
if (r == 0 && upgrade) {
r = toku_brt_optimize_for_upgrade(t);
toku_sync_fetch_and_increment_uint64(&upgrade_status.optimized_for_upgrade_12);
}
if (r == 0) {
t->h->upgrade_brt_performed = TRUE; // no further upgrade necessary
}
break;
default:
invariant(FALSE);
}
}
if (r) {
......@@ -1147,9 +1249,6 @@ serialize_brt_header_min_size (u_int32_t version) {
case BRT_LAYOUT_VERSION_13:
size += 8; //TXNID that created
case BRT_LAYOUT_VERSION_12:
size += 8; // Number of blocks in old version.
// fall through to add up bytes in previous version
case BRT_LAYOUT_VERSION_11:
size += (+8 // "tokudata"
+4 // version
+4 // original_version
......@@ -1161,9 +1260,9 @@ serialize_brt_header_min_size (u_int32_t version) {
+8 // translation_size_on_disk
+8 // translation_address_on_disk
+4 // checksum
);
size+=(+8 // diskoff
+4 // flags
+8 // Number of blocks in old version.
+8 // diskoff
+4 // flags
);
break;
default:
......@@ -1398,7 +1497,9 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
//version MUST be in network order on disk regardless of disk order
h->layout_version = rbuf_network_int(&rc);
//TODO: #1924
lazy_assert(h->layout_version==BRT_LAYOUT_VERSION);
invariant(h->layout_version >= BRT_LAYOUT_MIN_SUPPORTED_VERSION);
invariant(h->layout_version <= BRT_LAYOUT_VERSION);
h->layout_version_read_from_disk = h->layout_version;
//Size MUST be in network order regardless of disk order.
u_int32_t size = rbuf_network_int(&rc);
......@@ -1432,8 +1533,7 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
toku_blocktable_create_from_buffer(&h->blocktable,
translation_address_on_disk,
translation_size_on_disk,
tbuf,
FALSE /*not version 11 or older */ );
tbuf);
toku_free(tbuf);
}
......@@ -1443,7 +1543,10 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
deserialize_descriptor_from(fd, h, &h->descriptor);
h->layout_version_original = rbuf_int(&rc);
h->num_blocks_to_upgrade = rbuf_ulonglong(&rc);
rbuf_TXNID(&rc, &h->root_xid_that_created);
if (h->layout_version >= BRT_LAYOUT_VERSION_13) {
// at this layer, this new field is the only difference between versions 12 and 13
rbuf_TXNID(&rc, &h->root_xid_that_created);
}
(void)rbuf_int(&rc); //Read in checksum and ignore (already verified).
if (rc.ndone!=rc.size) {ret = EINVAL; goto died1;}
toku_free(rc.buf);
......@@ -1454,36 +1557,46 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
//TODO: When version 13 exists, add case for version 12 that looks like version 10 case,
// but calls deserialize_brtheader_12() and upgrade_12_13()
//TODO: When version 14 exists, add case for version 13 that looks like version 12 case,
static int
deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **brth, u_int32_t version) {
int rval;
struct brt_header *brth_12 = NULL;
int upgrade = 0;
switch(version) {
case BRT_LAYOUT_VERSION:
if (!upgrade)
rval = deserialize_brtheader (fd, rb, &brth_12);
if (rval == 0) {
lazy_assert(brth_12);
*brth = brth_12;
}
if (upgrade && rval == 0) {
toku_brtheader_lock(*brth);
(*brth)->num_blocks_to_upgrade = toku_block_get_blocks_in_use_unlocked((*brth)->blocktable);
(*brth)->dirty = 1;
toku_brtheader_unlock(*brth);
}
break; // this is the only break
default:
lazy_assert(FALSE);
}
struct brt_header *h = NULL;
rval = deserialize_brtheader (fd, rb, &h); //deserialize from rbuf and fd into header
if (rval == 0) {
lazy_assert((*brth)->layout_version == BRT_LAYOUT_VERSION);
(*brth)->layout_version_read_from_disk = version;
(*brth)->upgrade_brt_performed = FALSE;
invariant(h);
switch (version) {
case BRT_LAYOUT_VERSION_12:
invariant(h->layout_version == BRT_LAYOUT_VERSION_12);
{
//Upgrade root_xid_that_created
//Fake creation during the last checkpoint.
h->root_xid_that_created = h->checkpoint_lsn.lsn;
}
{
//Deprecate 'TOKU_DB_VALCMP_BUILTIN'. Just remove the flag
h->flags &= ~TOKU_DB_VALCMP_BUILTIN_12;
}
h->layout_version++;
toku_sync_fetch_and_increment_uint64(&upgrade_status.header_12); // how many header nodes upgraded from v12
upgrade++;
//Fall through on purpose
case BRT_LAYOUT_VERSION:
invariant(h->layout_version == BRT_LAYOUT_VERSION);
h->upgrade_brt_performed = FALSE;
if (upgrade) {
toku_brtheader_lock(h);
h->num_blocks_to_upgrade = toku_block_get_blocks_in_use_unlocked(h->blocktable); //Total number of blocks
h->dirty = 1;
toku_brtheader_unlock(h);
}
*brth = h;
break; // this is the only break
default:
invariant(FALSE);
}
}
return rval;
}
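// Sketch (hypothetical helper, not in this patch) of the condition that triggers the
// one-time optimize-for-upgrade pass, pieced together from the hunks in this commit:
static BOOL needs_optimize_for_upgrade(BRT t) {
    return !t->h->upgrade_brt_performed
        && t->h->layout_version_read_from_disk == BRT_LAYOUT_VERSION_12;
}
// Rough path for a layout-12 dictionary: deserialize_brtheader_versioned() (above)
// bumps the header to layout 13, fakes root_xid_that_created from the checkpoint LSN,
// clears TOKU_DB_VALCMP_BUILTIN_12, and counts the blocks left to upgrade;
// toku_maybe_upgrade_brt() (earlier in this diff) then broadcasts
// BRT_OPTIMIZE_FOR_UPGRADE, and each old node is upgraded and marked dirty as it is
// deserialized, decrementing num_blocks_to_upgrade.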
......@@ -1494,14 +1607,14 @@ deserialize_brtheader_versioned (int fd, struct rbuf *rb, struct brt_header **br
// If that ever changes, then modify this.
//TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the file AND the header is useless
static int
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *rb, u_int64_t *checkpoint_count, u_int32_t * version_p) {
deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, struct rbuf *rb, u_int64_t *checkpoint_count, u_int32_t * version_p) {
int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4; // size
unsigned char prefix[prefix_size];
rb->buf = NULL;
int64_t n = pread(fd, prefix, prefix_size, offset);
int64_t n = pread(fd, prefix, prefix_size, offset_of_header);
if (n==0) r = TOKUDB_DICTIONARY_NO_HEADER;
else if (n<0) {r = errno; lazy_assert(r!=0);}
else if (n!=prefix_size) r = EINVAL;
......@@ -1546,7 +1659,7 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *
rb->buf = toku_xmalloc(rb->size);
}
if (r==0) {
n = pread(fd, rb->buf, rb->size, offset);
n = pread(fd, rb->buf, rb->size, offset_of_header);
if (n==-1) {
r = errno;
lazy_assert(r!=0);
......@@ -1557,12 +1670,9 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset, struct rbuf *
//We have an rbuf that represents the header.
//Size is within acceptable bounds.
if (r==0) {
//Verify checksum
//Verify checksum (BRT_LAYOUT_VERSION_12 or later, when checksum function changed)
u_int32_t calculated_x1764 = x1764_memory(rb->buf, rb->size-4);
u_int32_t stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
if (version<=BRT_LAYOUT_VERSION_11) {
calculated_x1764 = ~calculated_x1764;
}
if (calculated_x1764!=stored_x1764) r = TOKUDB_DICTIONARY_NO_HEADER; //Header useless
}
if (r==0) {
......@@ -1837,23 +1947,12 @@ static int
deserialize_rollback_log_from_rbuf_versioned (u_int32_t version, BLOCKNUM blocknum, u_int32_t fullhash,
ROLLBACK_LOG_NODE *log,
struct brt_header *h, struct rbuf *rb) {
//Upgrade is not necessary really here. Rollback log nodes do not survive version changes.
int r = 0;
ROLLBACK_LOG_NODE rollback_log_node = NULL;
int upgrade = 0;
switch (version) {
case BRT_LAYOUT_VERSION:
if (!upgrade)
r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &rollback_log_node, h, rb);
if (r==0) {
lazy_assert(rollback_log_node);
*log = rollback_log_node;
}
if (upgrade && r == 0) (*log)->dirty = 1;
break; // this is the only break
default:
lazy_assert(FALSE);
invariant(version==BRT_LAYOUT_VERSION); //Rollback log nodes do not survive version changes.
r = deserialize_rollback_log_from_rbuf(blocknum, fullhash, &rollback_log_node, h, rb);
if (r==0) {
*log = rollback_log_node;
}
return r;
}
......
......@@ -174,6 +174,10 @@ message are not gorged. (But they may be hungry or too fat or too thin.)
#include "roll.h"
#include "toku_atomic.h"
static const uint32_t this_version = BRT_LAYOUT_VERSION;
void
toku_brt_header_suppress_rollbacks(struct brt_header *h, TOKUTXN txn) {
TXNID txnid = toku_txn_get_txnid(txn);
......@@ -296,6 +300,12 @@ calc_leaf_stats (BRTNODE node) {
return e;
}
void
toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node) {
invariant(node->height==0);
node->u.l.leaf_stats = calc_leaf_stats(node);
}
static void __attribute__((__unused__))
brt_leaf_check_leaf_stats (BRTNODE node)
{
......@@ -483,13 +493,16 @@ void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename
}
//fd is protected (must be holding fdlock)
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, long *sizep, void*extraargs) {
int toku_brtnode_fetch_callback (CACHEFILE UU(cachefile), int fd, BLOCKNUM nodename, u_int32_t fullhash,
void **brtnode_pv, long *sizep, int *dirtyp, void *extraargs) {
lazy_assert(extraargs);
struct brt_header *h = extraargs;
BRTNODE *result=(BRTNODE*)brtnode_pv;
int r = toku_deserialize_brtnode_from(fd, nodename, fullhash, result, h);
if (r == 0)
if (r == 0) {
*sizep = brtnode_memory_size(*result);
*dirtyp = (*result)->dirty;
}
//(*result)->parent_brtnode = 0; /* Don't know it right now. */
//printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
return r;
......@@ -656,6 +669,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height, size_
n->u.n.childkeys=0;
} else {
n->u.l.leaf_stats = zero_estimates;
n->u.l.optimized_for_upgrade = 0;
int r;
r = toku_omt_create(&n->u.l.buffer);
lazy_assert_zero(r);
......@@ -1646,6 +1660,9 @@ brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
lazy_assert(toku_omt_size(node->u.l.buffer) == omt_size);
break;
case BRT_OPTIMIZE_FOR_UPGRADE:
node->dirty = 1;
node->u.l.optimized_for_upgrade = *((uint32_t*)(cmd->u.id.val->data)); // record version of software that sent the optimize_for_upgrade message
case BRT_OPTIMIZE:
// Apply to all leafentries
idx = 0;
......@@ -1893,6 +1910,7 @@ brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_MSG cmd,
case BRT_COMMIT_BROADCAST_TXN:
case BRT_ABORT_BROADCAST_TXN:
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
return brt_nonleaf_cmd_all (t, node, cmd, re_array, did_io); // send message to all children
case BRT_NONE:
break;
......@@ -2601,14 +2619,33 @@ toku_brt_load_recovery(TOKUTXN txn, char const * old_iname, char const * new_ina
return r;
}
static int brt_optimize (BRT brt, BOOL upgrade);
// Effect: Optimize the brt.
int
toku_brt_optimize (BRT brt) {
int r = brt_optimize(brt, FALSE);
return r;
}
int
toku_brt_optimize_for_upgrade (BRT brt) {
int r = brt_optimize(brt, TRUE);
return r;
}
static int
brt_optimize (BRT brt, BOOL upgrade) {
int r = 0;
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
TXNID oldest = toku_logger_get_oldest_living_xid(logger);
XIDS root_xids = xids_get_root_xids();
TXNID oldest = TXNID_NONE_LIVING;
if (!upgrade) {
TOKULOGGER logger = toku_cachefile_logger(brt->cf);
oldest = toku_logger_get_oldest_living_xid(logger);
}
XIDS root_xids = xids_get_root_xids();
XIDS message_xids;
if (oldest == TXNID_NONE_LIVING) {
message_xids = root_xids;
......@@ -2622,8 +2659,16 @@ toku_brt_optimize (BRT brt) {
DBT val;
toku_init_dbt(&key);
toku_init_dbt(&val);
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
if (upgrade) {
// maybe there's a better place than the val dbt to put the version, but it seems harmless and is convenient
toku_fill_dbt(&val, &this_version, sizeof(this_version));
BRT_MSG_S brtcmd = { BRT_OPTIMIZE_FOR_UPGRADE, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
else {
BRT_MSG_S brtcmd = { BRT_OPTIMIZE, message_xids, .u.id={&key,&val}};
r = toku_brt_root_put_cmd(brt, &brtcmd);
}
xids_destroy(&message_xids);
return r;
}
......
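// Illustrative sketch (not part of this patch): the layout version travels to the
// leaves inside the val DBT of the BRT_OPTIMIZE_FOR_UPGRADE broadcast.  Packing and
// unpacking are a 4-byte copy; toku_fill_dbt() and the leaf-side cast both appear in
// hunks of this commit.
static uint32_t optimize_for_upgrade_version_roundtrip(void) {
    uint32_t sent = BRT_LAYOUT_VERSION;          // what brt_optimize(brt, TRUE) sends
    DBT val;
    toku_fill_dbt(&val, &sent, sizeof(sent));    // sender side (see brt_optimize above)
    uint32_t received = *((uint32_t*)val.data);  // receiver side (see brt_leaf_put_cmd)
    return received;                             // == BRT_LAYOUT_VERSION
}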
......@@ -68,6 +68,8 @@ int toku_brt_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn) __attribute__ ((warn
int toku_brt_optimize (BRT brt) __attribute__ ((warn_unused_result));
int toku_brt_optimize_for_upgrade (BRT brt) __attribute__ ((warn_unused_result));
// Effect: Insert a key and data pair into a brt if the oplsn is newer than the brt lsn. This function is called during recovery.
// Returns 0 if successful
int toku_brt_maybe_insert (BRT brt, DBT *k, DBT *v, TOKUTXN txn, BOOL oplsn_valid, LSN oplsn, int do_logging, enum brt_msg_type type) __attribute__ ((warn_unused_result));
......@@ -176,7 +178,9 @@ enum brt_header_flags {
//TOKU_DB_DUP = (1<<0), //Obsolete #2862
//TOKU_DB_DUPSORT = (1<<1), //Obsolete #2862
TOKU_DB_KEYCMP_BUILTIN = (1<<2),
//TOKU_DB_VALCMP_BUILTIN = (1<<3),
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
TOKU_DB_VALCMP_BUILTIN_12 = (1<<3),
#endif
};
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater) __attribute__ ((warn_unused_result));
......@@ -238,6 +242,8 @@ BOOL toku_brt_is_recovery_logging_suppressed (BRT) __attribute__ ((warn_unused_r
#define TOKU_MULTIPLE_MAIN_THREADS 0
#endif
void toku_brt_leaf_reset_calc_leaf_stats(BRTNODE node);
int toku_brt_strerror_r(int error, char *buf, size_t buflen);
// Effect: Like the XSI-compliant strerror_r, extended to db_strerror().
// If error>=0 then the result is to do strerror_r(error, buf, buflen), that is fill buf with a descriptive error message.
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef BRT_LAYOUT_VERSION_H
#define BRT_LAYOUT_VERSION_H
#ident "$Id$"
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
//Must be defined before other recursive headers could include logger.h
enum brt_layout_version_e {
BRT_LAYOUT_VERSION_5 = 5,
BRT_LAYOUT_VERSION_6 = 6, // Diff from 5 to 6: Add leafentry_estimate
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatabase flags #333
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates, translation table, dictionary descriptors, checksum in header, subdb support removed from brt layer
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Added MVCC, deprecated TOKU_DB_VALCMP_BUILTIN(_12)
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_12 // Minimum version supported
};
#endif
......@@ -120,6 +120,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
case BRT_COMMIT_BROADCAST_TXN: printf("COMMIT_BROADCAST_TXN"); goto ok;
case BRT_ABORT_BROADCAST_TXN: printf("ABORT_BROADCAST_TXN"); goto ok;
case BRT_OPTIMIZE: printf("OPTIMIZE"); goto ok;
case BRT_OPTIMIZE_FOR_UPGRADE: printf("OPTIMIZE_FOR_UPGRADE"); goto ok;
}
printf("HUH?");
ok:
......@@ -139,6 +140,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
} else {
struct subtree_estimates *est = &n->u.l.leaf_stats;
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 " %s }\n", est->nkeys, est->ndata, est->dsize, est->exact ? "T" : "F");
printf(" optimized_for_upgrade=%u\n", n->u.l.optimized_for_upgrade);
printf(" n_bytes_in_buffer=%u\n", n->u.l.n_bytes_in_buffer);
printf(" items_in_buffer =%u\n", toku_omt_size(n->u.l.buffer));
if (dump_data) toku_omt_iterate(n->u.l.buffer, print_le, 0);
......
......@@ -2227,6 +2227,9 @@ static struct leaf_buf *start_leaf (struct dbout *out, const DESCRIPTOR UU(desc)
lbuf->nkeys_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->ndata_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
lbuf->dsize_p = lbuf->dbuf.off; lbuf->dbuf.off+=8;
putbuf_int32(&lbuf->dbuf, 0); // optimized_for_upgrade
lbuf->partitions_p = lbuf->dbuf.off; lbuf->dbuf.off+=4; lbuf->dbuf.off += stored_sub_block_map_size; // RFP partition map
lbuf->n_in_buf_p = lbuf->dbuf.off; lbuf->dbuf.off+=4;
......
......@@ -19,6 +19,7 @@ extern "C" {
#endif
typedef struct brt *BRT;
typedef struct brtnode *BRTNODE;
struct brt_header;
struct wbuf;
struct dbuf;
......@@ -96,7 +97,8 @@ enum brt_msg_type {
BRT_COMMIT_BROADCAST_TXN = 9, // Broadcast to all leafentries, (commit specific transaction).
BRT_ABORT_BROADCAST_TXN = 10, // Broadcast to all leafentries (abort specific transaction).
BRT_INSERT_NO_OVERWRITE = 11,
BRT_OPTIMIZE = 12,
BRT_OPTIMIZE = 12, // Broadcast
BRT_OPTIMIZE_FOR_UPGRADE = 13, // same as BRT_OPTIMIZE, but record version number in leafnode
};
typedef struct xids_t *XIDS;
......
......@@ -1092,6 +1092,8 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
void *toku_value = 0;
long size = 0;
int dirty = 0;
WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key));
......@@ -1100,7 +1102,9 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
int r;
if (toku_cachefile_is_dev_null_unlocked(cf)) r = -1;
else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, extraargs);
else r = fetch_callback(cf, cf->fd, key, fullhash, &toku_value, &size, &dirty, extraargs);
if (dirty)
p->dirty = CACHETABLE_DIRTY;
cachetable_lock(ct);
rwlock_read_unlock(&cf->fdlock);
......
......@@ -122,7 +122,7 @@ typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void
// Returns: 0 if success, otherwise an error number. The address and size of the object
// associated with the key are returned.
// Can access fd (fd is protected by a readlock during call)
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs);
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs);
void toku_cachefile_set_userdata(CACHEFILE cf, void *userdata,
int (*log_fassociate_during_checkpoint)(CACHEFILE, void*),
......
......@@ -111,7 +111,7 @@ struct __attribute__ ((__packed__)) leafentry {
typedef struct leafentry *LEAFENTRY;
typedef struct leafentry_12 *LEAFENTRY_12;
u_int32_t toku_le_crc(LEAFENTRY v);
......@@ -173,7 +173,6 @@ le_clean(uint8_t *key, uint32_t keylen,
struct dbuf *d);
//Callback contract:
// Function checks to see if id is accepted by context.
// Returns:
......@@ -187,6 +186,15 @@ int le_iterate_is_empty(LEAFENTRY le, LE_ITERATE_CALLBACK f, BOOL *is_empty, TOK
int le_iterate_val(LEAFENTRY le, LE_ITERATE_CALLBACK f, void** valpp, u_int32_t *vallenp, TOKUTXN context);
size_t
leafentry_disksize_12(LEAFENTRY_12 le);
int
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry, // NULL if there was no stored data.
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p);
#if defined(__cplusplus) || defined(__cilkplusplus)
};
#endif
......
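// Illustrative sketch (not part of this patch): upgrading one packed v12 leafentry.
// toku_le_upgrade_12_13() allocates the new-format leafentry; the caller owns it (the
// brt deserializer in this commit copies it into the node's mempool and then frees it).
static LEAFENTRY upgrade_one_leafentry(LEAFENTRY_12 old_le) {
    size_t new_memsize, new_disksize;
    LEAFENTRY new_le = NULL;
    int r = toku_le_upgrade_12_13(old_le, &new_memsize, &new_disksize, &new_le);
    invariant(r == 0);
    invariant(new_memsize == new_disksize);  // same invariant the deserializer checks
    return new_le;                           // caller must eventually toku_free() it
}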
......@@ -38,7 +38,7 @@ static inline void toku_free_FILENUMS(FILENUMS val) { toku_free(val.filenums); }
void toku_set_lsn_increment (uint64_t incr) __attribute__((__visibility__("default")));
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir);
int toku_maybe_upgrade_log (const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, BOOL * upgrade_in_progress);
uint64_t toku_log_upgrade_get_footprint(void);
......
......@@ -8,15 +8,18 @@
#include "checkpoint.h"
static uint64_t footprint = 0; // for debug and accountability
static uint64_t footprint_previous_upgrade = 0; // for debug and accountability
uint64_t
toku_log_upgrade_get_footprint(void) {
return footprint + (100000 * footprint_previous_upgrade);
return footprint;
}
#define FOOTPRINT(x) footprint=footprint_start+(x*footprint_increment)
#define FOOTPRINTSETUP(increment) uint64_t footprint_start=footprint; uint64_t footprint_increment=increment;
// Footprint concept here is that each function increments a different decimal digit.
// The cumulative total shows the path taken for the upgrade.
// Each function must have a single return for this to work.
#define FOOTPRINT(x) function_footprint=(x*footprint_increment)
#define FOOTPRINTSETUP(increment) uint64_t function_footprint = 0; uint64_t footprint_increment=increment;
#define FOOTPRINTCAPTURE footprint+=function_footprint;
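// Worked example (hypothetical run, not part of this patch): each function owns its
// own decimal digit(s).  verify_clean_shutdown_of_log_version() below uses
// FOOTPRINTSETUP(1000), the *_current variant uses 100, and the *_old variant uses 10.
// If the dispatcher last executes FOOTPRINT(1) and calls the *_old variant, which last
// executes FOOTPRINT(2), their FOOTPRINTCAPTUREs add 1*1000 + 2*10 = 1020 to the
// global footprint.  toku_log_upgrade_get_footprint() == 1020 then reads as: the
// dispatcher took path 1, the old-version check took path 2, and the *_current check
// (hundreds digit 0) never ran.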
// The lock file is used to detect a failed upgrade. It is created at the start
// of the upgrade procedure and deleted at the end of the upgrade procedure. If
......@@ -37,17 +40,17 @@ static const int upgrade_lock_prefix_size = 8 // magic ("tokuupgr")
static int
verify_clean_shutdown_of_log_version_current(const char *log_dir, LSN * last_lsn) {
int rval = DB_RUNRECOVERY;
TOKULOGCURSOR logcursor = NULL;
int rval = TOKUDB_UPGRADE_FAILURE;
TOKULOGCURSOR cursor = NULL;
int r;
FOOTPRINTSETUP(100);
FOOTPRINT(1);
r = toku_logcursor_create(&logcursor, log_dir);
r = toku_logcursor_create(&cursor, log_dir);
assert(r == 0);
struct log_entry *le = NULL;
r = toku_logcursor_last(logcursor, &le);
r = toku_logcursor_last(cursor, &le);
if (r == 0) {
FOOTPRINT(2);
if (le->cmd==LT_shutdown) {
......@@ -57,276 +60,108 @@ verify_clean_shutdown_of_log_version_current(const char *log_dir, LSN * last_lsn
rval = 0;
}
}
r = toku_logcursor_destroy(&logcursor);
r = toku_logcursor_destroy(&cursor);
assert(r == 0);
FOOTPRINTCAPTURE;
return rval;
}
static int
verify_clean_shutdown_of_log_version_1(const char *log_dir, LSN * last_lsn) {
FOOTPRINTSETUP(100);
verify_clean_shutdown_of_log_version_old(const char *log_dir, LSN * last_lsn) {
int rval = TOKUDB_UPGRADE_FAILURE;
int r;
FOOTPRINTSETUP(10);
FOOTPRINT(1);
//TODO: Remove this hack:
//Base this function on
// - (above)verify_clean_shutdown_of_log_version_current
// - (3.1)tokudb_needs_recovery
// - do breadth/depth first search to find out which functions have to be copied over from 3.1
// - Put copied functions in .. backwards_log_1.[ch]
LSN lsn = {.lsn = 1LLU << 40};
if (last_lsn)
*last_lsn = lsn;
log_dir = log_dir;
return 0;
int n_logfiles;
char **logfiles;
r = toku_logger_find_logfiles(log_dir, &logfiles, &n_logfiles);
if (r!=0) return r;
char *basename;
TOKULOGCURSOR cursor;
struct log_entry *entry;
//Only look at newest log
basename = strrchr(logfiles[n_logfiles-1], '/') + 1;
int version;
long long index = -1;
r = sscanf(basename, "log%lld.tokulog%d", &index, &version);
assert(r==2); // found index and version
assert(version>=TOKU_LOG_MIN_SUPPORTED_VERSION);
assert(version< TOKU_LOG_VERSION); //Must be old
// find last LSN
r = toku_logcursor_create_for_file(&cursor, log_dir, basename);
if (r==0) {
r = toku_logcursor_last(cursor, &entry);
if (r == 0) {
FOOTPRINT(2);
if (entry->cmd==LT_shutdown) {
LSN lsn = entry->u.shutdown.lsn;
if (last_lsn)
*last_lsn = lsn;
rval = 0;
}
}
r = toku_logcursor_destroy(&cursor);
assert(r == 0);
}
for(int i=0;i<n_logfiles;i++) {
toku_free(logfiles[i]);
}
toku_free(logfiles);
FOOTPRINTCAPTURE;
return rval;
}
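// Illustrative sketch (not part of this patch; assumes <stdio.h>): log file leaf names
// encode a sequence number and the log layout version.  The sscanf() above parses the
// same "log%lld.tokulog%d" shape that delete_logfile() (later in this diff) builds
// with "log%012lld.tokulog%d":
static void logname_example(void) {
    char fname[64];
    snprintf(fname, sizeof fname, "log%012lld.tokulog%d", 123LL, 12);
    // fname is now "log000000000123.tokulog12"
    long long index; int version;
    int n = sscanf(fname, "log%lld.tokulog%d", &index, &version);
    (void) n;  // n == 2, index == 123, version == 12 (an old-format log file)
}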
static int
verify_clean_shutdown_of_log_version(const char *log_dir, uint32_t version, LSN *last_lsn) {
// return 0 if clean shutdown, DB_RUNRECOVERY if not clean shutdown
// return 0 if clean shutdown, TOKUDB_UPGRADE_FAILURE if not clean shutdown
// examine logfile at logfilenum and possibly logfilenum-1
int r = 0;
FOOTPRINTSETUP(100);
FOOTPRINTSETUP(1000);
if (version == TOKU_LOG_VERSION_1) {
if (version < TOKU_LOG_VERSION) {
FOOTPRINT(1);
r = verify_clean_shutdown_of_log_version_1(log_dir, last_lsn);
r = verify_clean_shutdown_of_log_version_old(log_dir, last_lsn);
}
else {
FOOTPRINT(2);
assert(version == TOKU_LOG_VERSION);
r = verify_clean_shutdown_of_log_version_current(log_dir, last_lsn);
}
FOOTPRINTCAPTURE;
return r;
}
//Cross the Rubicon (POINT OF NO RETURN)
static int
convert_logs_and_fsync(const char *log_dir, const char *env_dir, uint32_t from_version, uint32_t to_version) {
int r;
FOOTPRINTSETUP(100);
r = verify_clean_shutdown_of_log_version(log_dir, to_version, NULL);
assert(r==0);
r = toku_delete_all_logs_of_version(log_dir, from_version);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
if (to_version==TOKU_LOG_VERSION_1) {
//Undo an upgrade from version 1.
//Delete rollback cachefile if it exists.
FOOTPRINT(1);
int rollback_len = strlen(log_dir) + sizeof(ROLLBACK_CACHEFILE_NAME) +1; //1 for '/'
char rollback_fname[rollback_len];
{
int l = snprintf(rollback_fname, sizeof(rollback_fname),
"%s/%s", env_dir, ROLLBACK_CACHEFILE_NAME);
assert(l+1 == (signed)(sizeof(rollback_fname)));
}
r = unlink(rollback_fname);
assert(r==0 || errno==ENOENT);
if (r==0) {
r = toku_fsync_dir_by_name_without_accounting(env_dir);
assert(r==0);
}
}
return r;
}
//After this function completes:
// If any log files exist they are all of the same version.
// There is no lock file.
// There is no commit file.
static int
cleanup_previous_upgrade_attempt(const char *env_dir, const char *log_dir,
const char *upgrade_lock_fname,
const char *upgrade_commit_fname) {
int r = 0;
int lock_fd;
int commit_fd;
unsigned char prefix[upgrade_lock_prefix_size];
FOOTPRINTSETUP(1000);
commit_fd = open(upgrade_commit_fname, O_RDONLY|O_BINARY, S_IRWXU);
if (commit_fd<0) {
assert(errno==ENOENT);
}
lock_fd = open(upgrade_lock_fname, O_RDONLY|O_BINARY, S_IRWXU);
if (lock_fd<0) {
assert(errno == ENOENT);
//Nothing to clean up (lock file does not exist).
}
else { //Lock file exists. Will commit or abort the upgrade.
FOOTPRINT(1);
int64_t n = pread(lock_fd, prefix, upgrade_lock_prefix_size, 0);
assert(n>=0 && n <= upgrade_lock_prefix_size);
struct rbuf rb;
rb.size = upgrade_lock_prefix_size;
rb.buf = prefix;
rb.ndone = 0;
if (n == upgrade_lock_prefix_size) {
FOOTPRINT(2);
//Check magic number
bytevec magic;
rbuf_literal_bytes(&rb, &magic, 8);
assert(memcmp(magic,"tokuupgr",8)==0);
uint32_t to_version = rbuf_network_int(&rb);
uint32_t from_version = rbuf_network_int(&rb);
uint32_t suffix_length = rbuf_int(&rb);
uint32_t stored_x1764 = rbuf_int(&rb);
uint32_t calculated_x1764 = x1764_memory(rb.buf, rb.size-4);
assert(calculated_x1764 == stored_x1764);
//Now that checksum matches, verify data.
assert(to_version == TOKU_LOG_VERSION); //Only upgrading directly to newest log version.
assert(from_version < TOKU_LOG_VERSION); //Otherwise it isn't an upgrade.
assert(from_version >= TOKU_LOG_MIN_SUPPORTED_VERSION); //TODO: make this an error case once we have 3 log versions
assert(suffix_length == 0); //TODO: Future versions may change this.
if (commit_fd>=0) { //Commit the upgrade
footprint_previous_upgrade = 1;
FOOTPRINT(3);
r = convert_logs_and_fsync(log_dir, env_dir, from_version, to_version);
assert(r==0);
}
else { //Abort the upgrade
footprint_previous_upgrade = 2;
FOOTPRINT(4);
r = convert_logs_and_fsync(log_dir, env_dir, to_version, from_version);
assert(r==0);
}
}
else { // We never finished writing lock file: commit file cannot exist yet.
// We are aborting the upgrade, but because the previous attempt never got past
// writing the lock file, nothing needs to be undone.
assert(commit_fd<0);
}
{ //delete lock file
r = close(lock_fd);
assert(r==0);
r = unlink(upgrade_lock_fname);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
}
}
if (commit_fd>=0) { //delete commit file
r = close(commit_fd);
assert(r==0);
r = unlink(upgrade_commit_fname);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
}
return r;
}
static int
write_commit_file_and_fsync(const char *log_dir, const char * upgrade_commit_fname) {
int fd;
fd = open(upgrade_commit_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU);
assert(fd>=0);
int r;
r = toku_file_fsync_without_accounting(fd);
assert(r==0);
r = close(fd);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
return r;
}
static int
write_lock_file_and_fsync(const char *log_dir, const char * upgrade_lock_fname, uint32_t from_version) {
int fd;
fd = open(upgrade_lock_fname, O_RDWR|O_BINARY|O_CREAT|O_EXCL, S_IRWXU);
assert(fd>=0);
char buf[upgrade_lock_prefix_size];
struct wbuf wb;
const int suffix_size = 0;
wbuf_init(&wb, buf, upgrade_lock_prefix_size);
{ //Serialize to wbuf
wbuf_literal_bytes(&wb, "tokuupgr", 8); //magic
wbuf_network_int(&wb, TOKU_LOG_VERSION); //to version
wbuf_network_int(&wb, from_version); //from version
wbuf_int(&wb, suffix_size); //Suffix Length
u_int32_t checksum = x1764_finish(&wb.checksum);
wbuf_int(&wb, checksum); //checksum
assert(wb.ndone == wb.size);
}
toku_os_full_pwrite(fd, wb.buf, wb.size, 0);
{
//Serialize suffix to wbuf and then disk (if exist)
//There is no suffix as of TOKU_LOG_VERSION_2
}
int r;
r = toku_file_fsync_without_accounting(fd);
assert(r==0);
r = close(fd);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
return r;
}
// from_version is version of lognumber_newest, which contains last_lsn
static int
upgrade_log(const char *env_dir, const char *log_dir,
const char * upgrade_lock_fname, const char * upgrade_commit_fname,
LSN last_lsn,
uint32_t from_version) { // the real deal
upgrade_log(const char *env_dir, const char *log_dir, LSN last_lsn) { // the real deal
int r;
FOOTPRINTSETUP(1000);
FOOTPRINTSETUP(10000);
r = write_lock_file_and_fsync(log_dir, upgrade_lock_fname, from_version);
assert(r==0);
LSN initial_lsn = last_lsn;
initial_lsn.lsn++;
CACHETABLE ct;
TOKULOGGER logger;
FOOTPRINT(1);
{ //Create temporary environment
r = toku_create_cachetable(&ct, 1<<25, initial_lsn, NULL);
assert(r == 0);
toku_cachetable_set_env_dir(ct, env_dir);
r = toku_logger_create(&logger);
assert(r == 0);
toku_logger_write_log_files(logger, FALSE); //Prevent initial creation of log file
toku_logger_set_cachetable(logger, ct);
r = toku_logger_open(log_dir, logger);
assert(r==0);
r = toku_logger_restart(logger, initial_lsn); //Turn log writing on and create first log file with initial lsn
assert(r==0);
FOOTPRINT(1);
}
if (from_version == TOKU_LOG_VERSION_1) {
{ //Create rollback cachefile
r = toku_logger_open_rollback(logger, ct, TRUE);
assert(r==0);
}
{ //Checkpoint
r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL);
assert(r == 0);
}
{ //Close rollback cachefile
r = toku_logger_close_rollback(logger, FALSE);
assert(r==0);
}
FOOTPRINT(2);
}
{ //Checkpoint
r = toku_checkpoint(ct, logger, NULL, NULL, NULL, NULL); //fsyncs log dir
assert(r == 0);
FOOTPRINT(3);
}
{ //Close cachetable and logger
r = toku_logger_shutdown(logger);
......@@ -335,82 +170,53 @@ upgrade_log(const char *env_dir, const char *log_dir,
assert(r==0);
r = toku_logger_close(&logger);
assert(r==0);
FOOTPRINT(4);
}
{ //Write commit file
r = write_commit_file_and_fsync(log_dir, upgrade_commit_fname);
assert(r==0);
}
{ // Cross the Rubicon here:
// Delete all old logs: POINT OF NO RETURN
r = convert_logs_and_fsync(log_dir, env_dir, from_version, TOKU_LOG_VERSION);
assert(r==0);
FOOTPRINT(5);
}
{ //Delete upgrade lock file and ensure directory is fsynced
r = unlink(upgrade_lock_fname);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
assert(r==0);
}
{ //Delete upgrade commit file and ensure directory is fsynced
r = unlink(upgrade_commit_fname);
assert(r==0);
r = toku_fsync_dir_by_name_without_accounting(log_dir);
{
r = verify_clean_shutdown_of_log_version(log_dir, TOKU_LOG_VERSION, NULL);
assert(r==0);
}
FOOTPRINT(6);
FOOTPRINTCAPTURE;
return 0;
}
int
toku_maybe_upgrade_log(const char *env_dir, const char *log_dir) {
toku_maybe_upgrade_log(const char *env_dir, const char *log_dir, LSN * lsn_of_clean_shutdown, BOOL * upgrade_in_progress) {
int r;
int lockfd = -1;
FOOTPRINTSETUP(10000);
FOOTPRINTSETUP(100000);
*upgrade_in_progress = FALSE; // set TRUE only if all criteria are met and we're actually doing an upgrade
FOOTPRINT(1);
r = toku_recover_lock(log_dir, &lockfd);
if (r == 0) {
FOOTPRINT(2);
assert(log_dir);
assert(env_dir);
char upgrade_lock_fname[strlen(log_dir) + sizeof(upgrade_lock_file_suffix)];
{ //Generate full fname
int l = snprintf(upgrade_lock_fname, sizeof(upgrade_lock_fname),
"%s%s", log_dir, upgrade_lock_file_suffix);
assert(l+1 == (ssize_t)(sizeof(upgrade_lock_fname)));
}
char upgrade_commit_fname[strlen(log_dir) + sizeof(upgrade_commit_file_suffix)];
{ //Generate full fname
int l = snprintf(upgrade_commit_fname, sizeof(upgrade_commit_fname),
"%s%s", log_dir, upgrade_commit_file_suffix);
assert(l+1 == (ssize_t)(sizeof(upgrade_commit_fname)));
}
r = cleanup_previous_upgrade_attempt(env_dir, log_dir,
upgrade_lock_fname, upgrade_commit_fname);
uint32_t version_of_logs_on_disk;
BOOL found_any_logs;
r = toku_get_version_of_logs_on_disk(log_dir, &found_any_logs, &version_of_logs_on_disk);
if (r==0) {
uint32_t version_of_logs_on_disk;
BOOL found_any_logs;
r = toku_get_version_of_logs_on_disk(log_dir, &found_any_logs, &version_of_logs_on_disk);
if (r==0) {
if (!found_any_logs)
r = 0; //No logs means no logs to upgrade.
else if (version_of_logs_on_disk > TOKU_LOG_VERSION)
r = TOKUDB_DICTIONARY_TOO_NEW;
else if (version_of_logs_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION)
r = TOKUDB_DICTIONARY_TOO_OLD;
else if (version_of_logs_on_disk == TOKU_LOG_VERSION)
r = 0; //Logs are up to date
else {
FOOTPRINT(1);
LSN last_lsn;
r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn);
if (r==0) {
FOOTPRINT(2);
r = upgrade_log(env_dir, log_dir,
upgrade_lock_fname, upgrade_commit_fname,
last_lsn, version_of_logs_on_disk);
}
FOOTPRINT(3);
if (!found_any_logs)
r = 0; //No logs means no logs to upgrade.
else if (version_of_logs_on_disk > TOKU_LOG_VERSION)
r = TOKUDB_DICTIONARY_TOO_NEW;
else if (version_of_logs_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION)
r = TOKUDB_DICTIONARY_TOO_OLD;
else if (version_of_logs_on_disk == TOKU_LOG_VERSION)
r = 0; //Logs are up to date
else {
FOOTPRINT(4);
LSN last_lsn;
r = verify_clean_shutdown_of_log_version(log_dir, version_of_logs_on_disk, &last_lsn);
if (r==0) {
FOOTPRINT(5);
*lsn_of_clean_shutdown = last_lsn;
*upgrade_in_progress = TRUE;
r = upgrade_log(env_dir, log_dir, last_lsn);
}
}
}
......@@ -421,6 +227,7 @@ toku_maybe_upgrade_log(const char *env_dir, const char *log_dir) {
if (r==0) r = rc;
}
}
FOOTPRINTCAPTURE;
return r;
}
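// Illustrative sketch (hypothetical caller, not part of this patch): the two new out
// parameters let startup/recovery learn whether an upgrade was started and, if so,
// the LSN of the clean shutdown found in the old-format logs.
static int example_startup_upgrade_check(const char *env_dir, const char *log_dir) {
    LSN  lsn_of_clean_shutdown;
    BOOL upgrade_in_progress;
    int r = toku_maybe_upgrade_log(env_dir, log_dir,
                                   &lsn_of_clean_shutdown, &upgrade_in_progress);
    if (r == TOKUDB_UPGRADE_FAILURE) {
        // old-format logs exist but do not end in a clean shutdown; cannot upgrade
        return r;
    }
    if (r == 0 && upgrade_in_progress) {
        // logs were rewritten in the new format starting just past lsn_of_clean_shutdown
    }
    return r;
}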
......@@ -89,7 +89,7 @@ static int lc_open_logfile(TOKULOGCURSOR lc, int index) {
r = toku_read_logmagic(lc->cur_fp, &version);
if (r!=0)
return DB_BADFORMAT;
if (version != TOKU_LOG_VERSION)
if (version < TOKU_LOG_MIN_SUPPORTED_VERSION || version > TOKU_LOG_VERSION)
return DB_BADFORMAT;
}
// mark as open
......@@ -379,6 +379,7 @@ int toku_logcursor_first(TOKULOGCURSOR lc, struct log_entry **le) {
return r;
}
//get last entry in the logfile specified by logcursor
int toku_logcursor_last(TOKULOGCURSOR lc, struct log_entry **le) {
int r=0;
if ( lc->entry_valid ) {
......@@ -462,6 +463,7 @@ static int lc_fix_bad_logfile(TOKULOGCURSOR lc) {
r = fseek(lc->cur_fp, 0, SEEK_SET); if ( r!=0 ) return r;
r = toku_read_logmagic(lc->cur_fp, &version); if ( r!=0 ) return r;
if (version != TOKU_LOG_VERSION) return -1;
toku_off_t last_good_pos;
last_good_pos = ftello(lc->cur_fp);
......
......@@ -79,17 +79,20 @@ int toku_logfilemgr_init(TOKULOGFILEMGR lfm, const char *log_dir) {
return ENOMEM;
}
// find the index
// basename is the filename of the i-th logfile
basename = strrchr(logfiles[i], '/') + 1;
int version;
r = sscanf(basename, "log%lld.tokulog%d", &index, &version);
assert(r==2); // found index and version
assert(version==TOKU_LOG_VERSION);
assert(version>=TOKU_LOG_MIN_SUPPORTED_VERSION);
assert(version<=TOKU_LOG_VERSION);
lf_info->index = index;
// find last LSN
lf_info->version = version;
// find last LSN in logfile
r = toku_logcursor_create_for_file(&cursor, log_dir, basename);
if (r!=0)
return r;
r = toku_logcursor_last(cursor, &entry);
r = toku_logcursor_last(cursor, &entry); // set "entry" to last log entry in logfile
if ( r == 0 ) {
lf_info->maxlsn = toku_log_entry_get_lsn(entry);
tmp_lsn = lf_info->maxlsn;
......
......@@ -15,6 +15,7 @@ extern "C" {
struct toku_logfile_info {
int64_t index;
LSN maxlsn;
uint32_t version;
};
typedef struct toku_logfile_info *TOKULOGFILEINFO;
......
......@@ -10,7 +10,7 @@ static const int log_format_version=TOKU_LOG_VERSION;
static int open_logfile (TOKULOGGER logger);
static int toku_logger_write_buffer (TOKULOGGER logger, LSN *fsynced_lsn);
static int delete_logfile(TOKULOGGER logger, long long index);
static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version);
static void grab_output(TOKULOGGER logger, LSN *fsynced_lsn);
static void release_output(TOKULOGGER logger, LSN fsynced_lsn);
......@@ -573,10 +573,40 @@ int toku_logger_find_next_unused_log_file(const char *directory, long long *resu
return r;
}
// TODO: Put this in portability layer when ready
// in: file pathname that may have a dirname prefix
// return: file leaf name
static char * fileleafname(char *pathname) {
const char delimiter = '/';
char *leafname = strrchr(pathname, delimiter);
if (leafname)
leafname++;
else
leafname = pathname;
return leafname;
}
static int logfilenamecompare (const void *ap, const void *bp) {
char *a=*(char**)ap;
char *a_leafname = fileleafname(a);
char *b=*(char**)bp;
return strcmp(a,b);
char * b_leafname = fileleafname(b);
int rval;
BOOL valid;
uint64_t num_a = 0; // placate compiler
uint64_t num_b = 0;
uint32_t ver_a = 0;
uint32_t ver_b = 0;
valid = is_a_logfile_any_version(a_leafname, &num_a, &ver_a);
invariant(valid);
valid = is_a_logfile_any_version(b_leafname, &num_b, &ver_b);
invariant(valid);
if (ver_a < ver_b) rval = -1;
else if (ver_a > ver_b) rval = +1;
else if (num_a < num_b) rval = -1;
else if (num_a > num_b) rval = +1;
else rval = 0;
return rval;
}
// Return the log files in sorted order
......@@ -596,8 +626,9 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
}
int dirnamelen = strlen(directory);
while ((de=readdir(d))) {
long long thisl;
if ( !(is_a_logfile(de->d_name, &thisl)) ) continue; //#2424: Skip over files that don't match the exact logfile template
uint64_t thisl;
uint32_t version_ignore;
if ( !(is_a_logfile_any_version(de->d_name, &thisl, &version_ignore)) ) continue; //#2424: Skip over files that don't match the exact logfile template
if (n_results+1>=result_limit) {
result_limit*=2;
result = toku_realloc(result, result_limit*sizeof(*result));
......@@ -610,8 +641,12 @@ int toku_logger_find_logfiles (const char *directory, char ***resultp, int *n_lo
snprintf(fname, fnamelen, "%s/%s", directory, de->d_name);
result[n_results++] = fname;
}
// Return them in increasing order.
qsort(result, n_results, sizeof(result[0]), logfilenamecompare);
// Return them in increasing order.
// Note: the width passed to qsort is the element size (a char* pointer), not the
// length of the file names; the comparison function parses the names itself, so the
// differing lengths of "xxx.tokulog2" and "xxx.tokulog13" do not matter here.
int width = sizeof(result[0]);
qsort(result, n_results, width, logfilenamecompare);
*resultp = result;
*n_logfiles = n_results;
result[n_results]=0; // make a trailing null
......@@ -644,6 +679,7 @@ static int open_logfile (TOKULOGGER logger)
return ENOMEM;
lf_info->index = index;
lf_info->maxlsn = logger->written_lsn;
lf_info->version = TOKU_LOG_VERSION;
toku_logfilemgr_add_logfile_info(logger->logfilemgr, lf_info);
}
logger->fsynced_lsn = logger->written_lsn;
......@@ -651,12 +687,12 @@ static int open_logfile (TOKULOGGER logger)
return 0;
}
static int delete_logfile(TOKULOGGER logger, long long index)
static int delete_logfile(TOKULOGGER logger, long long index, uint32_t version)
// Entry and Exit: This thread has permission to modify the output.
{
int fnamelen = strlen(logger->directory)+50;
char fname[fnamelen];
snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, index, TOKU_LOG_VERSION);
snprintf(fname, fnamelen, "%s/log%012lld.tokulog%d", logger->directory, index, version);
int r = remove(fname);
return r;
}
......@@ -675,7 +711,9 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
if ( logger->write_log_files && logger->trim_log_files) {
while ( n_logfiles > 1 ) { // don't delete current logfile
uint32_t log_version;
lf_info = toku_logfilemgr_get_oldest_logfile_info(lfm);
log_version = lf_info->version;
if ( lf_info->maxlsn.lsn > trim_lsn.lsn ) {
// file contains an open LSN, can't delete this or any newer log files
break;
......@@ -684,7 +722,7 @@ int toku_logger_maybe_trim_log(TOKULOGGER logger, LSN trim_lsn)
long index = lf_info->index;
toku_logfilemgr_delete_oldest_logfile_info(lfm);
n_logfiles--;
r = delete_logfile(logger, index);
r = delete_logfile(logger, index, log_version);
if (r!=0) {
break;
}
......@@ -1329,7 +1367,7 @@ toku_logger_get_status(TOKULOGGER logger, LOGGER_STATUS s) {
int
toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint32_t *version_found) {
BOOL found = FALSE;
uint32_t single_version = 0;
uint32_t highest_version = 0;
int r = 0;
struct dirent *de;
......@@ -1338,16 +1376,17 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
r = errno;
}
else {
// Examine every file in the directory and assert that all log files are of the same version (single_version).
// Examine every file in the directory and find highest version
while ((de=readdir(d))) {
uint32_t this_log_version;
uint64_t this_log_number;
BOOL is_log = is_a_logfile_any_version(de->d_name, &this_log_number, &this_log_version);
if (is_log) {
if (found)
assert(single_version == this_log_version);
if (found) {
highest_version = highest_version > this_log_version ? highest_version : this_log_version;
}
found = TRUE;
single_version = this_log_version;
highest_version = this_log_version;
}
}
}
......@@ -1358,7 +1397,7 @@ toku_get_version_of_logs_on_disk(const char *log_dir, BOOL *found_any_logs, uint
if (r==0) {
*found_any_logs = found;
if (found)
*version_found = single_version;
*version_found = highest_version;
}
return r;
}
......
......@@ -9,12 +9,19 @@
extern "C" {
#endif
#include "brt_layout_version.h"
enum {
TOKU_LOG_VERSION_1 = 1,
TOKU_LOG_VERSION_2 = 2,
TOKU_LOG_NEXT_VERSION, // the version after the current version
TOKU_LOG_VERSION = TOKU_LOG_NEXT_VERSION-1, // A hack so I don't have to change this line.
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2
//After 2 we linked the log version to the BRT_LAYOUT VERSION.
//So it went from 2 to 13 (3-12 do not exist)
TOKU_LOG_VERSION = BRT_LAYOUT_VERSION, //Linked
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION > BRT_LAYOUT_VERSION_12 //linked once we remove support for 12
TOKU_LOG_MIN_SUPPORTED_VERSION = BRT_LAYOUT_MIN_SUPPORTED_VERSION,
#else
TOKU_LOG_MIN_SUPPORTED_VERSION = TOKU_LOG_VERSION_2,
#endif
};
#define ROLLBACK_CACHEFILE_NAME "tokudb.rollback"
......
......@@ -474,7 +474,7 @@ static void toku_rollback_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM
}
static int toku_rollback_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM logname, u_int32_t fullhash,
void **rollback_pv, long *sizep, void *extraargs) {
void **rollback_pv, long *sizep, int * UU(dirtyp), void *extraargs) {
int r;
struct brt_header *h = extraargs;
assert(h->cf == cachefile);
......
......@@ -46,7 +46,7 @@ flush (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), void *value, void *UU
}
static int
fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), void *UU(extraargs))
fetch (CACHEFILE UU(thiscf), int UU(fd), CACHEKEY UU(key), u_int32_t UU(fullhash), void **UU(value), long *UU(sizep), int *UU(dirtyp), void *UU(extraargs))
{
assert(0); // should not be called
return 0;
......
......@@ -19,12 +19,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
if (keep_me) n_keep_me++;
}
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) {
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
assert(0); // should not be called
n_fetch++;
*value = 0;
*sizep = item_size;
*dirtyp = 0;
return 0;
}
......
......@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
return 0;
......
......@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
return 0;
......
......@@ -22,6 +22,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
return 0;
......
......@@ -17,10 +17,11 @@ flush (CACHEFILE cf __attribute__((__unused__)),
}
static int
fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, void *extra) {
fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t hash, void **vptr, long *sizep, int *dirtyp, void *extra) {
cf = cf; hash = hash; extra = extra;
*sizep = (long) key.b;
*vptr = toku_malloc(*sizep);
*dirtyp = 0;
return 0;
}
......@@ -31,6 +32,7 @@ fetch_error (CACHEFILE cf __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void*extraargs __attribute__((__unused__))
) {
return -1;
......
......@@ -22,12 +22,13 @@ static void flush(CACHEFILE cf, int UU(fd), CACHEKEY key, void *value, void *ext
if (keep_me) n_keep_me++;
}
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs) {
static int fetch(CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, int *dirtyp, void *extraargs) {
cf = cf; key = key; fullhash = fullhash; value = value; sizep = sizep; extraargs = extraargs;
n_fetch++;
sleep(10);
*value = 0;
*sizep = item_size;
*dirtyp = 0;
return 0;
}
......
......@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return -42;
}
......
......@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = toku_malloc(1);
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -27,6 +27,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -35,6 +36,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -39,6 +39,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value,
long *sizep,
int *dirtyp,
void *extraargs __attribute__((__unused__))
) {
......@@ -47,6 +48,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -32,7 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return -42;
}
......
......@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -25,6 +25,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -32,6 +33,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -28,6 +28,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
......@@ -36,6 +37,7 @@ fetch (CACHEFILE f __attribute__((__unused__)),
*value = 0;
*sizep = 1;
*dirtyp = 0;
return 0;
}
......
......@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
*dirtyp = 0;
return 0;
}
......
......@@ -75,6 +75,7 @@ static int r_fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void**value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void*extraargs __attribute__((__unused__))) {
// fprintf(stderr, "Whoops, this should never be called");
return -42;
......
......@@ -33,6 +33,7 @@ static int f_fetch (CACHEFILE f,
u_int32_t fullhash __attribute__((__unused__)),
void**value,
long *sizep,
int *dirtyp,
void*extraargs __attribute__((__unused__))) {
void *buf = toku_malloc(BLOCKSIZE);
int r = pread(toku_cachefile_get_and_pin_fd(f), buf, BLOCKSIZE, key.b);
......@@ -40,6 +41,7 @@ static int f_fetch (CACHEFILE f,
assert(r==BLOCKSIZE);
*value = buf;
*sizep = BLOCKSIZE;
*dirtyp = 0;
return 0;
}
......
......@@ -144,12 +144,13 @@ static struct item *make_item (u_int64_t key) {
}
static CACHEKEY did_fetch={-1};
static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
static int fetch (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash __attribute__((__unused__)), void**value, long *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) {
if (verbose) printf("Fetch %" PRId64 "\n", key.b);
assert (expect_f==f);
assert((long)extraargs==23);
*value = make_item(key.b);
*sizep = test_object_size;
*dirtyp = 0;
did_fetch=key;
return 0;
}
......@@ -308,9 +309,11 @@ static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEK
}
static int fetch_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
void**value, long *sizep __attribute__((__unused__)),
int * dirtyp, void*extraargs) {
assert((long)extraargs==42);
*value=0;
*dirtyp = 0;
return 0;
}
......@@ -369,17 +372,19 @@ static void null_flush (CACHEFILE cf __attribute__((__unused__)),
BOOL for_checkpoint __attribute__((__unused__))) {
}
static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
static int add123_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(fullhash==toku_cachetable_hash(cf,key));
assert((long)extraargs==123);
*value = (void*)((unsigned long)key.b+123L);
*dirtyp = 0;
return 0;
}
static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
static int add222_fetch (CACHEFILE cf, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(fullhash==toku_cachetable_hash(cf,key));
assert((long)extraargs==222);
*value = (void*)((unsigned long)key.b+222L);
*dirtyp = 0;
return 0;
}
......@@ -443,8 +448,9 @@ static void test_dirty_flush(CACHEFILE f,
if (verbose) printf("test_dirty_flush %p %" PRId64 " %p %ld %u %u\n", f, key.b, value, size, (unsigned)do_write, (unsigned)keep);
}
static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, void *arg) {
static int test_dirty_fetch(CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void **value_ptr, long *size_ptr, int * dirtyp, void *arg) {
*value_ptr = arg;
*dirtyp = 0;
assert(fullhash==toku_cachetable_hash(f,key));
if (verbose) printf("test_dirty_fetch %p %" PRId64 " %p %ld %p\n", f, key.b, *value_ptr, *size_ptr, arg);
return 0;
......
......@@ -112,10 +112,11 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)),
//print_ints();
}
static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
static int fetch_forchain (CACHEFILE f, int UU(fd), CACHEKEY key, u_int32_t fullhash, void**value, long *sizep __attribute__((__unused__)), int * dirtyp, void*extraargs) {
assert(toku_cachetable_hash(f, key)==fullhash);
assert((long)extraargs==(long)key.b);
*value = (void*)(long)key.b;
*dirtyp = 0;
return 0;
}
......
......@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp __attribute__((__unused__)),
void *extraargs __attribute__((__unused__))
) {
*dirtyp = 0;
return 0;
}
......
......@@ -22,8 +22,10 @@ fetch (CACHEFILE f __attribute__((__unused__)),
u_int32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
long *sizep __attribute__((__unused__)),
int *dirtyp,
void *extraargs __attribute__((__unused__))
) {
*dirtyp = 0;
return 0;
}
......
......@@ -313,7 +313,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
XIDS xids = brt_msg_get_xids(msg);
invariant(xids_get_num_xids(xids) < MAX_TRANSACTION_RECORDS);
enum brt_msg_type type = brt_msg_get_type(msg);
if (type != BRT_OPTIMIZE) {
if (type != BRT_OPTIMIZE && type != BRT_OPTIMIZE_FOR_UPGRADE) {
ule_do_implicit_promotions(ule, xids);
}
switch (type) {
......@@ -342,6 +342,7 @@ msg_modify_ule(ULE ule, BRT_MSG msg) {
ule_apply_commit(ule, xids);
break;
case BRT_OPTIMIZE:
case BRT_OPTIMIZE_FOR_UPGRADE:
ule_optimize(ule, xids);
break;
default:
......@@ -358,7 +359,7 @@ test_msg_modify_ule(ULE ule, BRT_MSG msg){
static void ule_optimize(ULE ule, XIDS xids) {
if (ule->num_puxrs) {
TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid;
TXNID uncommitted = ule->uxrs[ule->num_cuxrs].xid; // outermost uncommitted
TXNID oldest_living_xid = TXNID_NONE;
uint32_t num_xids = xids_get_num_xids(xids);
if (num_xids > 0) {
......@@ -2018,3 +2019,202 @@ bool transaction_open(TXNID xid) {
#endif
#if BRT_LAYOUT_MIN_SUPPORTED_VERSION <= BRT_LAYOUT_VERSION_12
#if TOKU_WINDOWS
#pragma pack(push, 1)
#endif
struct __attribute__ ((__packed__)) leafentry_12 {
u_int8_t num_xrs;
u_int32_t keylen;
u_int32_t innermost_inserted_vallen;
union {
struct __attribute__ ((__packed__)) leafentry_committed_12 {
u_int8_t key_val[0]; //Actual key, then actual val
} comm;
struct __attribute__ ((__packed__)) leafentry_provisional_12 {
u_int8_t innermost_type;
TXNID xid_outermost_uncommitted;
u_int8_t key_val_xrs[]; //Actual key,
//then actual innermost inserted val,
//then transaction records.
} prov;
} u;
};
#if TOKU_WINDOWS
#pragma pack(pop)
#endif
//Requires:
// Leafentry that ule represents should not be destroyed (is not just all deletes)
static size_t
le_memsize_from_ule_12 (ULE ule) {
uint32_t num_uxrs = ule->num_cuxrs + ule->num_puxrs;
assert(num_uxrs);
size_t rval;
if (num_uxrs == 1) {
assert(uxr_is_insert(&ule->uxrs[0]));
rval = 1 //num_uxrs
+4 //keylen
+4 //vallen
+ule->keylen //actual key
+ule->uxrs[0].vallen; //actual val
}
else {
rval = 1 //num_uxrs
+4 //keylen
+ule->keylen //actual key
+1*num_uxrs //types
+8*(num_uxrs-1); //txnids
u_int8_t i;
for (i = 0; i < num_uxrs; i++) {
UXR uxr = &ule->uxrs[i];
if (uxr_is_insert(uxr)) {
rval += 4; //vallen
rval += uxr->vallen; //actual val
}
}
}
return rval;
}
//This function is mostly copied from 4.1.1
// Note, number of transaction records in version 12 has been replaced by separate counters in version 13 (MVCC),
// one counter for committed transaction records and one counter for provisional transaction records. When
// upgrading a version 12 le to version 13, the number of committed transaction records is always set to one (1)
// and the number of provisional transaction records is set to the original number of transaction records
// minus one. The bottom transaction record is assumed to be a committed value. (If there is no committed
// value then the bottom transaction record of version 12 is a committed delete.)
// This is the only change from the 4.1.1 code. The rest of the leafentry is read as is.
static void
le_unpack_12(ULE ule, LEAFENTRY_12 le) {
//Read num_uxrs
uint8_t num_xrs = le->num_xrs;
assert(num_xrs > 0);
ule->uxrs = ule->uxrs_static; //Static version is always enough.
ule->num_cuxrs = 1;
ule->num_puxrs = num_xrs - 1;
//Read the keylen
ule->keylen = toku_dtoh32(le->keylen);
//Read the vallen of innermost insert
u_int32_t vallen_of_innermost_insert = toku_dtoh32(le->innermost_inserted_vallen);
u_int8_t *p;
if (num_xrs == 1) {
//Unpack a 'committed leafentry' (No uncommitted transactions exist)
ule->keyp = le->u.comm.key_val;
ule->uxrs[0].type = XR_INSERT; //Must be or the leafentry would not exist
ule->uxrs[0].vallen = vallen_of_innermost_insert;
ule->uxrs[0].valp = &le->u.comm.key_val[ule->keylen];
ule->uxrs[0].xid = 0; //Required.
//Set p to immediately after leafentry
p = &le->u.comm.key_val[ule->keylen + vallen_of_innermost_insert];
}
else {
//Unpack a 'provisional leafentry' (Uncommitted transactions exist)
//Read in type.
u_int8_t innermost_type = le->u.prov.innermost_type;
assert(!uxr_type_is_placeholder(innermost_type));
//Read in xid
TXNID xid_outermost_uncommitted = toku_dtoh64(le->u.prov.xid_outermost_uncommitted);
//Read pointer to key
ule->keyp = le->u.prov.key_val_xrs;
//Read pointer to innermost inserted val (immediately after key)
u_int8_t *valp_of_innermost_insert = &le->u.prov.key_val_xrs[ule->keylen];
//Point p to immediately after 'header'
p = &le->u.prov.key_val_xrs[ule->keylen + vallen_of_innermost_insert];
BOOL found_innermost_insert = FALSE;
int i; //Index in ULE.uxrs[]
//Loop inner to outer
for (i = num_xrs - 1; i >= 0; i--) {
UXR uxr = &ule->uxrs[i];
//Innermost's type is in header.
if (i < num_xrs - 1) {
//Not innermost, so load the type.
uxr->type = *p;
p += 1;
}
else {
//Innermost, load the type previously read from header
uxr->type = innermost_type;
}
//Committed txn id is implicit (0). (i==0)
//Outermost uncommitted txnid is stored in header. (i==1)
if (i > 1) {
//Not committed nor outermost uncommitted, so load the xid.
uxr->xid = toku_dtoh64(*(TXNID*)p);
p += 8;
}
else if (i == 1) {
//Outermost uncommitted, load the xid previously read from header
uxr->xid = xid_outermost_uncommitted;
}
else {
// i == 0, committed entry
uxr->xid = 0;
}
if (uxr_is_insert(uxr)) {
if (found_innermost_insert) {
//Not the innermost insert. Load vallen/valp
uxr->vallen = toku_dtoh32(*(u_int32_t*)p);
p += 4;
uxr->valp = p;
p += uxr->vallen;
}
else {
//Innermost insert, load the vallen/valp previously read from header
uxr->vallen = vallen_of_innermost_insert;
uxr->valp = valp_of_innermost_insert;
found_innermost_insert = TRUE;
}
}
}
assert(found_innermost_insert);
}
#if ULE_DEBUG
size_t memsize = le_memsize_from_ule_12(ule);
assert(p == ((u_int8_t*)le) + memsize);
#endif
}
size_t
leafentry_disksize_12(LEAFENTRY_12 le) {
ULE_S ule;
le_unpack_12(&ule, le);
size_t memsize = le_memsize_from_ule_12(&ule);
ule_cleanup(&ule);
return memsize;
}
int
toku_le_upgrade_12_13(LEAFENTRY_12 old_leafentry,
size_t *new_leafentry_memorysize,
size_t *new_leafentry_disksize,
LEAFENTRY *new_leafentry_p) {
ULE_S ule;
int rval;
invariant(old_leafentry);
le_unpack_12(&ule, old_leafentry);
rval = le_pack(&ule, // create packed leafentry
new_leafentry_memorysize,
new_leafentry_disksize,
new_leafentry_p,
NULL, NULL, NULL); //NULL for omt means that we use malloc instead of mempool
ule_cleanup(&ule);
return rval;
}
#endif
The essential idea of auto-upgrade from BRT_LAYOUT_VERSION 12 to 13 is to
take advantage of the similarities between the two versions, and not to
try to create an infrastructure for all future upgrades.
As future layouts are created, upgrade paths, if any, will be crafted for
each particular change.
On startup, the version number of the recovery log is checked. If an
upgrade is needed, then the log is tested for a clean shutdown. If
there is no clean shutdown, then an error is returned. If the log does
end in a clean shutdown, then a new log file is created with the current
version number, starting with an LSN that is one greater than the clean
shutdown.
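As a rough C sketch (the wrapper name and exact signatures here are illustrative;
the helpers toku_get_version_of_logs_on_disk, verify_clean_shutdown_of_log_version,
and upgrade_log appear elsewhere in this change), the startup decision amounts to:

static int maybe_upgrade_log_sketch(const char *env_dir, const char *log_dir) {
    BOOL found_logs;
    uint32_t version_on_disk;
    LSN last_lsn;
    int r = toku_get_version_of_logs_on_disk(log_dir, &found_logs, &version_on_disk);
    if (r != 0) return r;
    if (!found_logs || version_on_disk == TOKU_LOG_VERSION)
        return 0;                                  // nothing to upgrade
    if (version_on_disk < TOKU_LOG_MIN_SUPPORTED_VERSION)
        return TOKUDB_UPGRADE_FAILURE;             // too old to upgrade
    // upgrade only from a clean shutdown; otherwise return an error and change nothing
    r = verify_clean_shutdown_of_log_version(log_dir, version_on_disk, &last_lsn);
    if (r != 0) return TOKUDB_UPGRADE_FAILURE;
    // create a new current-version log whose first LSN is one past the clean shutdown
    return upgrade_log(env_dir, log_dir, last_lsn);
}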
Once the new log is in place, the persistent environment dictionary is
upgraded, and then normal operation begins.
The startup of a new version of the storage engine might not be crash
safe.
Dictionaries, including the persistent environment and the fileops
directory, are upgraded as they are read into memory from disk.
The brt header is upgraded by
- removing an unused flag
- setting the transaction id to the xid of the clean shutdown
- marking the header as dirty
Each non-leaf node is upgraded by:
- removing an unused flag
- upgrading the version numbers in the node
- marking the node as dirty.
This works because all of the version 12 messages are unchanged
in version 13. The version 12 messages will be applied to the
leafentries using version 13 code.
Each leaf node is upgraded by
- removing an unused flag
- using modified version 12 code to unpack the version 12 packed
leaf entries into version 13 unpacked leaf entries
- repacking the leafentries into a new mempool
- destroying the original mempool (that holds the version 12
node read from disk)
The node is marked as dirty.
Once the brt is open, a BRT_OPTIMIZE broadcast message is inserted to
optimize the dictionary.
A schematic overview of how a brt node is deserialized:
toku_deserialize_brtnode_from() { // accepts fd, fills in BRTNODE, brt_header
deserialize_brtnode_from_rbuf_versioned() {
deserialize_brtnode_from_rbuf() // accepts rbuf fills in BRTNODE
if nonleaf deserialize_brtnode_nonleaf_from_rbuf(){ // rbuf -> BRTNODE (no version sensitivity)
if leaf deserialize_brtnode_leaf_from_rbuf() { // calculates node size from leafentry sizes
// leafentry sizes vary with version
if version 12 {
if leaf {
unpack each leafentry into a version 13 ule
pack each version 13 ule into version 13 le
allocate new mempool for version 13 les
destroy old mempool
}
remove unused flag
increment version number
mark dirty
}
}
}
Open issues:
- The brt layer makes some callbacks to the handlerton layer. If
any of the functions change from one version to another, then
the result may not be correct. A version number could be
included in all the function signatures so the callback function
could be aware of what version the caller is expecting.
The callbacks are:
- comparator
- hot index generator
- hot column mutator
Note, brt-internal.h defines struct subtree_estimates which contains field nkeys.
This field is obsolete with the removal of dupsort databases (since it will always
be the same as ndata), but removing it is not worth the trouble.
==========
The changes from version 12 to 13 include (this may not be a complete list):
- Persistent environment dictionary
- version number
- timestamp of environment creation (database installation)
- history of previous versions
- timestamps for upgrades
- Recovery log
- version number
- new log entries (hotindex, maybe others)
- brt header
- version number
- added field (root_xid_that_created), set to last checkpoint lsn
- deleted flag (built-in comparison function for values)
- brt internal node
- version number
- additional message(s) possible, no upgrade needed beyond changing version number
- brt leafnode
- version number
- new leafentry format
- version 12 leafentry unpack code is preserved
- rollback log
- version number is the only change; no upgrade is needed because
rollback logs are not preserved through a clean shutdown
Because version 12 and version 13 leafentries are significantly
different, the way leafentries are handled is as follows:
- deserialize_brtnode_leaf_from_rbuf()
- sets up array of pointers to leafentries (to be unpacked later),
these pointers are put into an OMT
- calculates checksum (x1764)
- adjusts ndone byte counter to verify that entire rbuf is read
- deserialize_brtnode_from_rbuf_versioned() calls
deserialize_brtnode_leaf_from_rbuf()
- loop through all leafentries, one at a time:
- unpack version 12 le and repack as version 13 le, each in its own malloc'ed memory
- calculate new fingerprint
- create new block
- allocate new mempool
- copy individual les into new mempool
- destroy individual les
- destroy original mempool
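A sketch of that per-leafentry loop, written against the helper added in this
change (toku_le_upgrade_12_13); the wrapper name is hypothetical and the
mempool/OMT bookkeeping is simplified:

// Hypothetical wrapper, error handling simplified.
static int upgrade_les_12_to_13(LEAFENTRY_12 *old_les, u_int32_t n_les,
                                LEAFENTRY *new_les /* out: array of n_les */) {
    for (u_int32_t i = 0; i < n_les; i++) {
        size_t memsize, disksize;
        // unpack the version 12 leafentry into a ULE and repack it in version 13
        // format; the result is returned in freshly malloc'ed memory
        int r = toku_le_upgrade_12_13(old_les[i], &memsize, &disksize, &new_les[i]);
        if (r != 0) return r;
        // the caller copies new_les[i] into the node's new mempool, frees the
        // malloc'ed copy, and finally destroys the old (version 12) mempool
    }
    return 0;
}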
Open issues:
- We need to verify clean shutdown before upgrade.
If shutdown was not clean then we would run recovery, and the
code does not support recovering from an old format version.
- One way to do this is to increase the log version number (either
increment or synchronize with BRT_LAYOUT_VERSION).
- Can we just look at the log? needs_recovery(env);
If this mechanism is specific
to the version 12 to 13 upgrade, then that is adequate.
Once the recovery log format changes, then we need a
different mechanism, similar to the 3.x->4.x upgrade
logic in log_upgrade.c.
- How to decide that an upgrade is necessary?
Needed for logic that says:
- If upgrade is necessary, then verify clean shutdown:
If upgrade is necessary (recorded version is old)
and clean shutdown was not done, then exit with
error code.
- tokudb_needs_recovery() does not serve as verification of a clean
shutdown. It indicates whether recovery is necessary, but it does
not verify a simple clean shutdown consisting of just the shutdown
log entry. Instead, it looks for checkpoint begin/checkpoint end.
(Also, a trailing comment entry is permitted.)
Proposed solution:
- Decision on whether to perform upgrade is done by examining log version.
- If we need an upgrade:
- If not clean shutdown, then exit with error message, change nothing
on disk.
- If clean shutdown, then create new log by simply creating new log file
(empty, or perhaps with initial comment that says "start of new log").
- Normal log-trimming code will delete old logs. (None of the
locking logic in log_upgrade.c is needed.)
- Log-opening logic needs to be modified to do this. See log file
manager initialization function (and maybe functions it calls),
maybe the log cursor:
- logfilemgr.c: toku_logfilemgr_init()
- Log-trimming logic loops over pairs of file names and LSNs,
deleting old files based on LSN.
- Question: would it help if the "clean shutdown" log entry
were required to be in a new log file of its own? That would
prevent the creation of an empty log file after the clean shutdown.
It might help, but it is probably not worth doing.
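The log-trimming and log-opening items above depend on mixed-version log file
names sorting correctly, by version and then by index. The following standalone
program (illustration only, not the committed code) demonstrates that ordering;
the committed logfilenamecompare() does the same thing via is_a_logfile_any_version():

#include <stdio.h>
#include <stdlib.h>

static int cmp_logname(const void *ap, const void *bp) {
    const char *a = *(const char * const *)ap;
    const char *b = *(const char * const *)bp;
    long long num_a = 0, num_b = 0;
    int ver_a = 0, ver_b = 0;
    // names look like log%012lld.tokulog%d; older logs carry a smaller version suffix
    if (sscanf(a, "log%lld.tokulog%d", &num_a, &ver_a) != 2) return 0;
    if (sscanf(b, "log%lld.tokulog%d", &num_b, &ver_b) != 2) return 0;
    if (ver_a != ver_b) return (ver_a < ver_b) ? -1 : +1;  // older version first
    if (num_a != num_b) return (num_a < num_b) ? -1 : +1;  // then by log index
    return 0;
}

int main(void) {
    const char *names[] = { "log000000000003.tokulog13",
                            "log000000000002.tokulog2",
                            "log000000000001.tokulog2" };
    qsort(names, 3, sizeof(names[0]), cmp_logname);
    for (int i = 0; i < 3; i++)
        printf("%s\n", names[i]);  // prints the two version-2 logs first, then the version-13 log
    return 0;
}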
Issue of optimize message (to be sent into each dictionary on upgrade)
- BRT_COMMIT_BROADCAST_ALL (should be faster executing, always commits everything, was needed for an earlier upgrade attempt)
- BRT_OPTIMIZE (better tested, has been used, tests to see if transactions are still live)
After upgrade (after clean shutdown, no running transactions, trees
fully flattened), there is no difference in what these two messages do.
Note, BRT_OPTIMIZE requires a clean shutdown if used on upgrade. If used before recovery (which an upgrade
without clean shutdown would do), then it would be wrong because it would appear that all transactions were
completed.
TODO:
- update brt header fields
- original layout version
- version read from disk
- add accountability counters
- capture LSN of clean shutdown, use instead of checkpoint lsn
......@@ -182,9 +182,14 @@ xids_get_serialize_size(XIDS xids){
return rval;
}
// Include TXNID zero in checksum to maintain compatibility
// with previously released version.
void
toku_calc_more_murmur_xids (struct x1764 *mm, XIDS xids) {
x1764_add(mm, &xids->num_xids, 1);
TXNID zero = 0;
x1764_add(mm, &zero, 8);
u_int8_t index;
u_int8_t num_xids = xids_get_num_xids(xids);
for (index = 0; index < num_xids; index++) {
......
......@@ -37,7 +37,6 @@ TRANSPARENT_UPGRADE_SRCS = $(wildcard upgrade-*.c)
NONSTANDARD_SRCS= \
$(RECOVER_SRCS) \
$(LOADER_SRCS) \
$(TRANSPARENT_UPGRADE_SRCS) \
#end
#Tests that don't compile in windows. Should
......@@ -179,6 +178,7 @@ BDB_DONTRUN_TESTS = \
update-multiple-nochange \
update-multiple-key0 \
update-multiple-data-diagonal \
upgrade_simple \
upgrade-test-1 \
upgrade-test-2 \
upgrade-test-3 \
......
......@@ -21,6 +21,7 @@ enum {ROWS_PER_TRANSACTION=10000};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
int littlenode = 0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
int ALLOW_DUPS=0;
......@@ -112,7 +113,7 @@ static void run_test(void)
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r);
r = env->checkpointing_set_period(env, 0); CKERR(r);
DBT desc;
dbt_init(&desc, "foo", sizeof("foo"));
......@@ -124,6 +125,10 @@ static void run_test(void)
for(int i=0;i<NUM_DBS;i++) {
idx[i] = i;
r = db_create(&dbs[i], env, 0); CKERR(r);
if (littlenode) {
r=dbs[i]->set_pagesize(dbs[i], 4096);
CKERR(0);
}
r = dbs[i]->set_descriptor(dbs[i], 1, &desc); CKERR(r);
dbs[i]->app_private = &idx[i];
snprintf(name, sizeof(name), "db_%04x", i);
......@@ -176,7 +181,7 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-h")==0) {
resultcode=0;
do_usage:
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd);
fprintf(stderr, "Usage: -h -c -n -d <num_dbs> -r <num_rows> %s\n", cmd);
exit(resultcode);
} else if (strcmp(argv[0], "-d")==0) {
argc--; argv++;
......@@ -191,6 +196,8 @@ static void do_args(int argc, char * const argv[]) {
NUM_ROWS = atoi(argv[0]);
} else if (strcmp(argv[0], "-c")==0) {
CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-n")==0) {
littlenode = 1;
} else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1;
......
......@@ -40,7 +40,7 @@ test_main (int argc, char *const argv[]) {
dbt_init(&data, there, strlen(there)+1),
0);
r=txn->commit(txn, 0); CKERR(r);
r=env->txn_checkpoint(env, 0, 0, 0);
r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
}
{
......
......@@ -48,7 +48,7 @@ test_main (int argc, char *const argv[]) {
dbt_init(&data, there, strlen(there)+1),
0);
r=txn->commit(txn, 0); CKERR(r);
r=env->txn_checkpoint(env, 0, 0, 0);
r=env->txn_checkpoint(env, 0, 0, 0); CKERR(r);
}
{
......
......@@ -20,15 +20,17 @@ DB_ENV *env;
enum {MAX_NAME=128};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
int SRC_VERSION = 4;
int littlenode = 0;
int flat = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
char *db_v4_dir_flat = "env_preload.4.1.1.flat.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_1(DB **dbs) {
int r;
......@@ -64,39 +66,52 @@ static void upgrade_test_1(DB **dbs) {
}
}
static void run_test(void)
{
static void setup(void) {
int r;
char *src_db_dir;
if ( SRC_VERSION == 3 )
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 )
src_db_dir = db_v4_dir;
int len = 256;
char syscmd[len];
char * src_db_dir;
if ( SRC_VERSION == 4 ) {
if (flat)
src_db_dir = db_v4_dir_flat;
else if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0);
}
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r);
r = env->checkpointing_set_period(env, 1); CKERR(r);
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
assert(dbs != NULL);
......@@ -117,7 +132,12 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) {
do_args(argc, argv);
run_test();
if (SRC_VERSION == 4) {
littlenode = 1; // 4k nodes, small cache
}
setup();
run_test(); // read, upgrade, write back to disk
run_test(); // read and verify
return 0;
}
......@@ -135,7 +155,7 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-h")==0) {
resultcode=0;
do_usage:
fprintf(stderr, "Usage: -h -c -d <num_dbs> -r <num_rows> %s\n", cmd);
fprintf(stderr, "Usage: -h -d <num_dbs> -r <num_rows> %s\n", cmd);
exit(resultcode);
} else if (strcmp(argv[0], "-d")==0) {
argc--; argv++;
......@@ -148,11 +168,11 @@ static void do_args(int argc, char * const argv[]) {
} else if (strcmp(argv[0], "-r")==0) {
argc--; argv++;
NUM_ROWS = atoi(argv[0]);
} else if (strcmp(argv[0], "-c")==0) {
CHECK_RESULTS = 1;
} else if (strcmp(argv[0], "-V")==0) {
argc--; argv++;
SRC_VERSION = atoi(argv[0]);
} else if (strcmp(argv[0], "-f")==0) {
flat = 1;
} else {
fprintf(stderr, "Unknown arg: %s\n", argv[0]);
resultcode=1;
......
......@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
int SRC_VERSION = 4;
int littlenode = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_2(DB **dbs) {
int r = 0;
......@@ -85,39 +86,52 @@ static void upgrade_test_2(DB **dbs) {
}
}
static void run_test(void)
{
int r;
char *src_db_dir;
if ( SRC_VERSION == 3 )
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 )
src_db_dir = db_v4_dir;
static void setup(void) {
int r;
int len = 256;
char syscmd[len];
char * src_db_dir;
if ( SRC_VERSION == 4 ) {
if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0);
}
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables();
}
static void run_test(int checkpoint_period)
{
int r;
r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr);
r = env->checkpointing_set_period(env, 60); CKERR(r);
r = env->checkpointing_set_period(env, checkpoint_period); CKERR(r);
DB **dbs = (DB**)toku_malloc(sizeof(DB*) * NUM_DBS);
assert(dbs != NULL);
......@@ -136,9 +150,15 @@ static void run_test(void)
// ------------ infrastructure ----------
static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) {
do_args(argc, argv);
run_test();
if (SRC_VERSION == 4) {
littlenode = 1; // 4k nodes, small cache
}
setup();
run_test(1);
return 0;
}
......
......@@ -19,14 +19,15 @@ enum {MAX_NAME=128};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
int SRC_VERSION = 4;
int littlenode = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
int SRC_VERSION = 4;
static void upgrade_test_3(DB **dbs) {
int r;
......@@ -87,35 +88,47 @@ static void upgrade_test_3(DB **dbs) {
}
}
static void run_test(void)
{
static void setup(void) {
int r;
char *src_db_dir;
if ( SRC_VERSION == 3 )
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 )
src_db_dir = db_v4_dir;
int len = 256;
char syscmd[len];
char * src_db_dir;
if ( SRC_VERSION == 4 ) {
if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0);
}
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr);
......@@ -140,7 +153,16 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) {
do_args(argc, argv);
littlenode = 0;
setup();
run_test();
if (SRC_VERSION == 4) {
if (verbose)
printf("Now repeat test with small nodes and small cache.\n");
littlenode = 1; // 4k nodes, small cache
setup();
run_test();
}
return 0;
}
......
......@@ -19,15 +19,18 @@ enum {MAX_NAME=128};
int NUM_DBS=5;
int NUM_ROWS=100000;
int CHECK_RESULTS=0;
enum { old_default_cachesize=1024 }; // MB
int CACHESIZE=old_default_cachesize;
enum {ROWS_PER_TRANSACTION=10000};
int SRC_VERSION = 4;
int littlenode = 0;
char *db_v3_dir = "../../utils/preload-3.1-db";
char *db_v4_dir = "dir.preload-3.1-db.c.tdb";
char *env_dir = ENVDIR; // the default env_dir.
char *db_v5_dir = "dir.preload-db.c.tdb";
char *db_v4_dir = "env_preload.4.1.1.cleanshutdown";
char *db_v4_dir_node4k = "env_preload.4.1.1.node4k.cleanshutdown";
enum {ROWS_PER_TRANSACTION=10000};
int SRC_VERSION = 4;
static void upgrade_test_4(DB **dbs) {
int r;
......@@ -122,35 +125,47 @@ static void upgrade_test_4(DB **dbs) {
}
}
static void run_test(void)
{
static void setup(void) {
int r;
char *src_db_dir;
if ( SRC_VERSION == 3 )
src_db_dir = db_v3_dir;
else if ( SRC_VERSION == 4 )
src_db_dir = db_v4_dir;
int len = 256;
char syscmd[len];
char * src_db_dir;
if ( SRC_VERSION == 4 ) {
if (littlenode)
src_db_dir = db_v4_dir_node4k;
else
src_db_dir = db_v4_dir;
}
else if ( SRC_VERSION == 5 ) {
src_db_dir = db_v5_dir;
}
else {
fprintf(stderr, "unsupported TokuDB version %d to upgrade\n", SRC_VERSION);
assert(0);
}
{
int len = 256;
char syscmd[len];
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd); CKERR(r);
}
r = snprintf(syscmd, len, "rm -rf %s", env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
r = snprintf(syscmd, len, "cp -r %s %s", src_db_dir, env_dir);
assert(r<len);
r = system(syscmd);
CKERR(r);
generate_permute_tables();
}
static void run_test(void)
{
int r;
r = db_env_create(&env, 0); CKERR(r);
if (littlenode) {
r = env->set_cachesize(env, 0, 512*1024, 1); CKERR(r);
}
int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE;
r = env->open(env, env_dir, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
env->set_errfile(env, stderr);
......@@ -175,7 +190,17 @@ static void do_args(int argc, char * const argv[]);
int test_main(int argc, char * const *argv) {
do_args(argc, argv);
do_args(argc, argv);
littlenode = 0;
setup();
run_test();
if (SRC_VERSION == 4) {
if (verbose)
printf("Now repeat test with small nodes and small cache.\n");
littlenode = 1; // 4k nodes, small cache
setup();
run_test();
}
return 0;
}
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2009 Tokutek Inc. All rights reserved."
#ident "$Id: env_startup.c 20778 2010-05-28 20:38:42Z yfogel $"
/* Purpose of this test is to verify simplest part of upgrade logic.
* Start by creating two very simple 4.x environments,
* one in each of two states:
* - after a clean shutdown
* - without a clean shutdown
*
* The two different environments will be used to exercise upgrade logic
* for 5.x.
*
*/
#include "test.h"
#include <db.h>
static DB_ENV *env;
#define FLAGS_NOLOG DB_INIT_LOCK|DB_INIT_MPOOL|DB_CREATE|DB_PRIVATE
#define FLAGS_LOG FLAGS_NOLOG|DB_INIT_TXN|DB_INIT_LOG
static int mode = S_IRWXU+S_IRWXG+S_IRWXO;
static void test_shutdown(void);
static void
setup (u_int32_t flags, BOOL clean) {
int r;
if (env)
test_shutdown();
r = system("rm -rf " ENVDIR);
CKERR(r);
r=toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
CKERR(r);
if (clean) {
r = system("cp env_simple.4.1.1.cleanshutdown/* " ENVDIR);
}
else {
r = system("cp env_simple.4.1.1.dirtyshutdown/* " ENVDIR);
}
CKERR(r);
r=db_env_create(&env, 0);
CKERR(r);
env->set_errfile(env, stderr);
r=env->open(env, ENVDIR, flags, mode);
if (clean)
CKERR(r);
else
CKERR2(r, TOKUDB_UPGRADE_FAILURE);
}
static void
test_shutdown(void) {
int r;
r=env->close(env, 0); CKERR(r);
env = NULL;
}
static void
test_env_startup(void) {
u_int32_t flags;
flags = FLAGS_LOG;
setup(flags, TRUE);
print_engine_status(env);
test_shutdown();
setup(flags, FALSE);
if (verbose) {
printf("\n\nEngine status after aborted env->open() will have some garbage values:\n");
}
print_engine_status(env);
test_shutdown();
}
int
test_main (int argc, char * const argv[]) {
parse_args(argc, argv);
test_env_startup();
return 0;
}
......@@ -423,37 +423,116 @@ db_use_builtin_key_cmp(DB *db) {
return r;
}
static const char * curr_env_ver_key = "current_version";
// Keys used in persistent environment dictionary:
// Following keys added in version 12
static const char * orig_env_ver_key = "original_version";
// requires: persistent environment dictionary is already open
static const char * curr_env_ver_key = "current_version";
// Following keys added in version 13
static const char * creation_time_key = "creation_time";
static const char * last_lsn_of_v12_key = "last_lsn_of_v12";
static const char * upgrade_13_time_key = "upgrade_13_time"; // Add more keys for future upgrades
// Values read from (or written into) persistent environment,
// kept here for read-only access from engine status.
static uint32_t persistent_original_env_version;
static uint32_t persistent_stored_env_version_at_startup; // read from curr_env_ver_key, prev version as of this startup
static time_t persistent_creation_time;
static uint64_t persistent_last_lsn_of_v12;
static time_t persistent_upgrade_13_time;
// Requires: persistent environment dictionary is already open.
// Input arg is lsn of clean shutdown of previous version,
// or ZERO_LSN if no upgrade or if crash between log upgrade and here.
static int
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn) {
maybe_upgrade_persistent_environment_dictionary(DB_ENV * env, DB_TXN * txn, LSN last_lsn_of_clean_shutdown_read_from_log) {
int r;
uint32_t stored_env_version;
DBT key, val;
DB *persistent_environment = env->i->persistent_environment;
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
toku_init_dbt(&val);
r = toku_db_get(env->i->persistent_environment, txn, &key, &val, 0);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
uint32_t stored_env_version = toku_dtoh32(*(uint32_t*)val.data);
persistent_stored_env_version_at_startup = stored_env_version;
if (stored_env_version > BRT_LAYOUT_VERSION)
r = TOKUDB_DICTIONARY_TOO_NEW;
else if (stored_env_version < BRT_LAYOUT_MIN_SUPPORTED_VERSION)
r = TOKUDB_DICTIONARY_TOO_OLD;
else if (stored_env_version < BRT_LAYOUT_VERSION) {
const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION);
const uint32_t curr_env_ver_d = toku_htod32(BRT_LAYOUT_VERSION);
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
toku_fill_dbt(&val, &curr_env_ver_d, sizeof(curr_env_ver_d));
r = toku_db_put(persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
assert(r==0);
uint64_t last_lsn_of_v12_d = toku_htod64(last_lsn_of_clean_shutdown_read_from_log.lsn);
toku_fill_dbt(&key, last_lsn_of_v12_key, strlen(last_lsn_of_v12_key));
toku_fill_dbt(&val, &last_lsn_of_v12_d, sizeof(last_lsn_of_v12_d));
r = toku_db_put(persistent_environment, txn, &key, &val, DB_YESOVERWRITE);
assert(r==0);
time_t upgrade_13_time_d = toku_htod64(time(NULL));
toku_fill_dbt(&key, upgrade_13_time_key, strlen(upgrade_13_time_key));
toku_fill_dbt(&val, &upgrade_13_time_d, sizeof(upgrade_13_time_d));
r = toku_db_put(persistent_environment, txn, &key, &val, DB_NOOVERWRITE);
assert(r==0);
}
// TODO: add key/val for timestamp of VERSION_12_CREATION (could be upgrade)
return r;
}
// Capture persistent env contents to be read by engine status
static void
capture_persistent_env (DB_ENV * env, DB_TXN * txn) {
int r;
DBT key, val;
DB *persistent_environment = env->i->persistent_environment;
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
toku_init_dbt(&val);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
uint32_t curr_env_version = toku_dtoh32(*(uint32_t*)val.data);
assert(curr_env_version == BRT_LAYOUT_VERSION);
toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
toku_init_dbt(&val);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
persistent_original_env_version = toku_dtoh32(*(uint32_t*)val.data);
assert(persistent_original_env_version <= curr_env_version);
// make no assertions about timestamps, clock may have been reset
if (persistent_original_env_version >= BRT_LAYOUT_VERSION_13) {
toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
toku_init_dbt(&val);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
persistent_creation_time = toku_dtoh64((*(time_t*)val.data));
}
if (persistent_original_env_version != curr_env_version) {
// an upgrade was performed at some time, capture info about the upgrade
toku_fill_dbt(&key, last_lsn_of_v12_key, strlen(last_lsn_of_v12_key));
toku_init_dbt(&val);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
persistent_last_lsn_of_v12 = toku_dtoh64(*(uint64_t*)val.data); // value was stored as a 64-bit LSN
toku_fill_dbt(&key, upgrade_13_time_key, strlen(upgrade_13_time_key));
toku_init_dbt(&val);
r = toku_db_get(persistent_environment, txn, &key, &val, 0);
assert(r == 0);
persistent_upgrade_13_time = toku_dtoh64((*(time_t*)val.data));
}
}
// return 0 if log exists or ENOENT if log does not exist
static int
ydb_recover_log_exists(DB_ENV *env) {
......@@ -492,7 +571,7 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
assert(r);
}
// Test for rollback cachefile
// Test for existence of rollback cachefile if it is expected to exist
if (r == 0 && need_rollback_cachefile) {
path = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
assert(path);
......@@ -558,11 +637,11 @@ validate_env(DB_ENV * env, BOOL * valid_newenv, BOOL need_rollback_cachefile) {
}
static int
ydb_maybe_upgrade_env (DB_ENV *env) {
ydb_maybe_upgrade_env (DB_ENV *env, LSN * last_lsn_of_clean_shutdown_read_from_log, BOOL * upgrade_in_progress) {
int r = 0;
if (env->i->open_flags & DB_INIT_TXN && env->i->open_flags & DB_INIT_LOG) {
toku_ydb_unlock();
r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir);
r = toku_maybe_upgrade_log(env->i->dir, env->i->real_log_dir, last_lsn_of_clean_shutdown_read_from_log, upgrade_in_progress);
toku_ydb_lock();
}
return r;
......@@ -598,6 +677,8 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
goto cleanup;
}
assert(sizeof(time_t) == sizeof(uint64_t));
HANDLE_EXTRA_FLAGS(env, flags,
DB_CREATE|DB_PRIVATE|DB_INIT_LOG|DB_INIT_TXN|DB_RECOVER|DB_INIT_MPOOL|DB_INIT_LOCK|DB_THREAD);
......@@ -678,9 +759,22 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
need_rollback_cachefile = TRUE;
}
r = ydb_maybe_upgrade_env(env);
LSN last_lsn_of_clean_shutdown_read_from_log = ZERO_LSN;
BOOL upgrade_in_progress = FALSE;
r = ydb_maybe_upgrade_env(env, &last_lsn_of_clean_shutdown_read_from_log, &upgrade_in_progress);
if (r!=0) goto cleanup;
if (upgrade_in_progress) {
// Delete old rollback file. There was a clean shutdown, so it has nothing useful,
// and there is no value in upgrading it. It is simpler to just create a new one.
char* rollback_filename = toku_construct_full_name(2, env->i->dir, ROLLBACK_CACHEFILE_NAME);
assert(rollback_filename);
r = unlink(rollback_filename);
toku_free(rollback_filename);
assert(r==0 || errno==ENOENT);
need_rollback_cachefile = FALSE; // we're not expecting it to exist now
}
r = validate_env(env, &newenv, need_rollback_cachefile); // make sure that environment is either new or complete
if (r != 0) goto cleanup;
......@@ -743,10 +837,12 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
int using_txns = env->i->open_flags & DB_INIT_TXN;
if (env->i->logger) {
// if this is a newborn env or if this is an upgrade, then create a brand new rollback file
BOOL create_new_rollback_file = newenv | upgrade_in_progress;
assert (using_txns);
toku_logger_set_cachetable(env->i->logger, env->i->cachetable);
toku_logger_set_remove_finalize_callback(env->i->logger, finalize_file_removal, env->i->ltm);
r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, newenv);
r = toku_logger_open_rollback(env->i->logger, env->i->cachetable, create_new_rollback_file);
assert(r==0);
}
......@@ -766,20 +862,30 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
if (newenv) {
// create new persistent_environment
DBT key, val;
const uint32_t environment_version = toku_htod32(BRT_LAYOUT_VERSION);
persistent_original_env_version = BRT_LAYOUT_VERSION;
const uint32_t environment_version = toku_htod32(persistent_original_env_version);
toku_fill_dbt(&key, orig_env_ver_key, strlen(orig_env_ver_key));
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
assert(r==0);
toku_fill_dbt(&key, curr_env_ver_key, strlen(curr_env_ver_key));
toku_fill_dbt(&val, &environment_version, sizeof(environment_version));
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
assert(r==0);
time_t creation_time_d = toku_htod64(time(NULL));
toku_fill_dbt(&key, creation_time_key, strlen(creation_time_key));
toku_fill_dbt(&val, &creation_time_d, sizeof(creation_time_d));
r = toku_db_put(env->i->persistent_environment, txn, &key, &val, 0);
assert(r==0);
}
else {
r = maybe_upgrade_persistent_environment_dictionary(env, txn);
r = maybe_upgrade_persistent_environment_dictionary(env, txn, last_lsn_of_clean_shutdown_read_from_log);
assert(r==0);
}
capture_persistent_env(env, txn);
}
{
r = toku_db_create(&env->i->directory, env, 0);
......@@ -805,6 +911,8 @@ cleanup:
unlock_single_process(env);
}
}
if (r == 0)
errno = 0; // tabula rasa
return r;
}
......@@ -1509,8 +1617,8 @@ format_time(const time_t *timer, char *buf) {
}
}
// Do not take ydb lock around or in this function.
// If the engine is blocked because some thread is holding the ydb lock, this function
// Do not take ydb lock or any other lock around or in this function.
// If the engine is blocked because some thread is holding a lock, this function
// can help diagnose the problem.
// This function only collects information, and it does not matter if something gets garbled
// because of a race condition.
......@@ -1671,9 +1779,9 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat) {
toku_brt_get_upgrade_status(&brt_upgrade_stat);
engstat->upgrade_env_status = toku_log_upgrade_get_footprint();
engstat->upgrade_header = brt_upgrade_stat.header;
engstat->upgrade_nonleaf = brt_upgrade_stat.nonleaf;
engstat->upgrade_leaf = brt_upgrade_stat.leaf;
engstat->upgrade_header = brt_upgrade_stat.header_12;
engstat->upgrade_nonleaf = brt_upgrade_stat.nonleaf_12;
engstat->upgrade_leaf = brt_upgrade_stat.leaf_12;
}
}
return r;
......