Commit 62989886 authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul

Merge the tokudb.558 branch back to the main branch with:

{{{
svn merge -r3272:3320 https://svn.tokutek.com/tokudb/tokudb.558
}}}
No conflicts.


git-svn-id: file:///svn/tokudb@3322 c7de825b-a66e-492c-adef-691d508d4ae1
parent 22cf8597
...@@ -58,6 +58,7 @@ REGRESSION_TESTS = \ ...@@ -58,6 +58,7 @@ REGRESSION_TESTS = \
brt-test2 \ brt-test2 \
brt-test3 \ brt-test3 \
brt-test4 \ brt-test4 \
brt-test5 \
cachetable-test \ cachetable-test \
cachetable-test2 \ cachetable-test2 \
fifo-test \ fifo-test \
...@@ -75,6 +76,7 @@ REGRESSION_TESTS = \ ...@@ -75,6 +76,7 @@ REGRESSION_TESTS = \
test-gpma-blackbox \ test-gpma-blackbox \
test-gpma-glassbox \ test-gpma-glassbox \
test-gpma-glassbox \ test-gpma-glassbox \
test-gpma-leftmost-dup \
test-inc-split \ test-inc-split \
test-primes \ test-primes \
test_oexcl \ test_oexcl \
...@@ -177,7 +179,7 @@ check-fanout: ...@@ -177,7 +179,7 @@ check-fanout:
let BRT_FANOUT=BRT_FANOUT+1; \ let BRT_FANOUT=BRT_FANOUT+1; \
done done
log-test log-test2 log-test3 log-test4 log-test5 log-test6 benchmark-test brt-test brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz log-test log-test2 log-test3 log-test4 log-test5 log-test6 benchmark-test brt-test brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz
# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage # pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h gpma.h brt.h brt-search.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h mempool.h leafentry.h BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h gpma.h brt.h brt-search.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h mempool.h leafentry.h
...@@ -193,18 +195,19 @@ pma.o: gpma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../i ...@@ -193,18 +195,19 @@ pma.o: gpma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../i
test-gpma-glassbox.o: test-gpma-glassbox.c gpma.h gpma-internal.h toku_assert.h memory.h test-gpma-glassbox.o: test-gpma-glassbox.c gpma.h gpma-internal.h toku_assert.h memory.h
test-gpma-glassbox: test-gpma-glassbox.o toku_assert.o memory-debug.o gpma.o test-gpma-glassbox: test-gpma-glassbox.o toku_assert.o memory-debug.o gpma.o
test-gpma-blackbox: test-gpma-blackbox.o toku_assert.o memory.o gpma.o test-gpma-blackbox: test-gpma-blackbox.o toku_assert.o memory.o gpma.o
test-gpma-blackbox.o: test-gpma-blackbox.c gpma.h memory.h toku_assert.h
test-gpma-worstinsert: test-gpma-worstinsert.o toku_assert.o memory.o gpma.o test-gpma-worstinsert: test-gpma-worstinsert.o toku_assert.o memory.o gpma.o
test-gpma-worstinsert.o test-gpma-blackbox.o: gpma.h memory.h toku_assert.h test-gpma-leftmost-dup: test-gpma-leftmost-dup.o toku_assert.o memory.o gpma.o
test-gpma-worstinsert.o test-gpma-blackbox.o test-gpma-leftmost-dup.o: gpma.h memory.h toku_assert.h
: gpma.h memory.h toku_assert.h
gpma.o: gpma.c gpma.h gpma.o: gpma.c gpma.h
ybt.o: ybt.h brttypes.h ../include/db.h ybt.o: ybt.h brttypes.h ../include/db.h
ybt-test: ybt-test.o ybt.o memory.o toku_assert.o ybt-test: ybt-test.o ybt.o memory.o toku_assert.o
ybt-test.o: ybt.h ../include/db.h ybt-test.o: ybt.h ../include/db.h
cachetable.o: cachetable.h hashfun.h memory.h cachetable.o: cachetable.h hashfun.h memory.h
brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o gpma.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o gpma.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES) log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES)
logformat: logformat.o toku_assert.o logformat: logformat.o toku_assert.o
brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o test-brt-overflow.h brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h gpma.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES) brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o brt-test5.o test-brt-overflow.h brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h gpma.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES)
brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES) brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h
fifo.o: fifo.h brttypes.h fifo.o: fifo.h brttypes.h
......
...@@ -160,8 +160,8 @@ extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt); ...@@ -160,8 +160,8 @@ extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt);
static const BRTNODE null_brtnode=0; static const BRTNODE null_brtnode=0;
extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen); //extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
extern u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp); //extern u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp);
extern u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen); extern u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen);
extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd); extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd);
...@@ -193,25 +193,17 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, DISKOFF diskoff, enum brt_cmd_typ ...@@ -193,25 +193,17 @@ int toku_testsetup_insert_to_nonleaf (BRT brt, DISKOFF diskoff, enum brt_cmd_typ
int toku_set_func_fsync (int (*fsync_function)(int)); int toku_set_func_fsync (int (*fsync_function)(int));
/* allocate a kv pair from a kv memory pool */ // These two go together to do lookups in a brtnode using the keys in a command.
//static inline struct kv_pair *kv_pair_malloc_mempool(const void *key, int keylen, const void *val, int vallen, struct mempool *mp) { struct cmd_leafval_bessel_extra {
// struct kv_pair *kv = toku_mempool_malloc(mp, sizeof (struct kv_pair) + keylen + vallen, 4);
// if (kv)
// kv_pair_init(kv, key, keylen, val, vallen);
// return kv;
//}
static inline struct kv_pair *brtnode_malloc_kv_pair (GPMA pma, struct mempool *mp, const void *key, unsigned int keylen, const void *val, unsigned int vallen) {
struct kv_pair *kv = mempool_malloc_from_gpma(pma, mp, sizeof (struct kv_pair) + keylen + vallen);
kv_pair_init(kv, key, keylen, val, vallen);
return kv;
}
// used for the leaf compare fun
struct lc_pair {
BRT t; BRT t;
int compare_both; // compare_both is set if it is a DUPSORT database and both keys are needed (e.g, for DB_DELETE_ANY) BRT_CMD cmd;
int compare_both_keys; // Set to 1 for DUPSORT databases that are not doing a DELETE_BOTH
}; };
int toku_brtleaf_compare_fun (u_int32_t alen __attribute__((__unused__)), void *aval, u_int32_t blen __attribute__((__unused__)), void *bval, void *lc /*this is (struct lc_pair *) cast to (void*). */) ; int toku_cmd_leafval_bessel (u_int32_t dlen, void *leafentry, void *extra);
int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger);
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp);
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size);
#endif #endif
...@@ -62,9 +62,8 @@ static unsigned int toku_serialize_brtnode_size_slow(BRTNODE node) { ...@@ -62,9 +62,8 @@ static unsigned int toku_serialize_brtnode_size_slow(BRTNODE node) {
GPMA_ITERATE(node->u.l.buffer, GPMA_ITERATE(node->u.l.buffer,
idx, vlen, vdata, idx, vlen, vdata,
({ ({
struct kv_pair *p=vdata; LEAFENTRY le=vdata;
assert(vlen==sizeof(*p)+kv_pair_keylen(p)+kv_pair_vallen(p)); hsize+= PMA_ITEM_OVERHEAD + leafentry_disksize(le);
hsize+=PMA_ITEM_OVERHEAD+KEY_VALUE_OVERHEAD+kv_pair_keylen(p)+kv_pair_vallen(p);
})); }));
assert(hsize==node->u.l.n_bytes_in_buffer); assert(hsize==node->u.l.n_bytes_in_buffer);
hsize+=4; /* the PMA size */ hsize+=4; /* the PMA size */
...@@ -97,7 +96,7 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) { ...@@ -97,7 +96,7 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) {
return result; return result;
} }
void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) { void toku_serialize_brtnode_to (int fd, DISKOFF off, DISKOFF size, BRTNODE node) {
//printf("%s:%d serializing\n", __FILE__, __LINE__); //printf("%s:%d serializing\n", __FILE__, __LINE__);
struct wbuf w; struct wbuf w;
int i; int i;
...@@ -105,7 +104,7 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) ...@@ -105,7 +104,7 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node)
assert(calculated_size<=size); assert(calculated_size<=size);
//char buf[size]; //char buf[size];
char *MALLOC_N(size,buf); char *MALLOC_N(size,buf);
toku_verify_counts(node); //toku_verify_counts(node);
assert(size>0); assert(size>0);
wbuf_init(&w, buf, size); wbuf_init(&w, buf, size);
//printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]);
...@@ -174,19 +173,14 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) ...@@ -174,19 +173,14 @@ void toku_serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node)
assert(check_local_fingerprint==node->local_fingerprint); assert(check_local_fingerprint==node->local_fingerprint);
} }
} else { } else {
//printf(" n_entries=%d\n", toku_pma_n_entries(node->u.l.buffer)); //printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer));
wbuf_uint(&w, toku_gpma_n_entries(node->u.l.buffer)); wbuf_uint(&w, toku_gpma_n_entries(node->u.l.buffer));
wbuf_uint(&w, toku_gpma_index_limit(node->u.l.buffer)); wbuf_uint(&w, toku_gpma_index_limit(node->u.l.buffer));
GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata,
({ ({
struct kv_pair *p=vdata; //printf(" %s:%d idx=%d\n", __FILE__, __LINE__, idx);
assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size );
u_int32_t keylen=kv_pair_keylen(p);
u_int32_t datalen=kv_pair_vallen(p);
assert(vlen==sizeof(*p)+keylen+datalen);
wbuf_uint(&w, idx); wbuf_uint(&w, idx);
wbuf_bytes(&w, kv_pair_key(p), keylen); wbuf_LEAFENTRY(&w, vdata);
wbuf_bytes(&w, kv_pair_val(p), datalen);
})); }));
} }
assert(w.ndone<=w.size); assert(w.ndone<=w.size);
...@@ -343,7 +337,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign ...@@ -343,7 +337,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign
int diff; int diff;
bytevec key; ITEMLEN keylen; bytevec key; ITEMLEN keylen;
bytevec val; ITEMLEN vallen; bytevec val; ITEMLEN vallen;
toku_verify_counts(result); //toku_verify_counts(result);
int type = rbuf_char(&rc); int type = rbuf_char(&rc);
TXNID xid = rbuf_ulonglong(&rc); TXNID xid = rbuf_ulonglong(&rc);
rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */ rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
...@@ -387,19 +381,24 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign ...@@ -387,19 +381,24 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign
} }
u_int32_t actual_sum = 0; u_int32_t actual_sum = 0;
//printf("%s:%d node %lld, reading %d items\n", __FILE__, __LINE__, off, n_in_buf);
for (i=0; i<n_in_buf; i++) { for (i=0; i<n_in_buf; i++) {
bytevec key; ITEMLEN keylen; LEAFENTRY tmp_le;
bytevec val; ITEMLEN vallen; //printf("%s:%d reading %dth item\n", __FILE__, __LINE__, i);
int idx = rbuf_int(&rc); int idx = rbuf_int(&rc);
rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */ //printf("%s:%d idx=%d\n", __FILE__, __LINE__, idx);
rbuf_bytes(&rc, &val, &vallen); u_int32_t memsize, disksize;
result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD + PMA_ITEM_OVERHEAD; rbuf_LEAFENTRY(&rc, &memsize, &disksize, &tmp_le);
struct kv_pair *pair = brtnode_malloc_kv_pair(result->u.l.buffer, &result->u.l.buffer_mempool, key, keylen, val, vallen); LEAFENTRY le = mempool_malloc_from_gpma(result->u.l.buffer, &result->u.l.buffer_mempool, memsize);
assert(pair); assert(le);
int pairlen = kv_pair_size(pair); memcpy(le, tmp_le, memsize);
toku_gpma_set_at_index(result->u.l.buffer, idx, pairlen, pair); toku_free(tmp_le);
actual_sum += result->rand4fingerprint*toku_calccrc32_kvpair_struct(pair); assert(disksize==leafentry_disksize(le));
// printf("%s:%d rand4=%08x actual=%08x this=%08x expect=%08x\n", __FILE__, __LINE__, result->rand4fingerprint, actual_sum, toku_calccrc32_kvpair_struct(pair), result->local_fingerprint); result->u.l.n_bytes_in_buffer += disksize + PMA_ITEM_OVERHEAD;
//printf("idx=%d\n", idx);
toku_gpma_set_at_index(result->u.l.buffer, idx, memsize, le);
actual_sum += result->rand4fingerprint*toku_le_crc(le);
//printf("%s:%d rand4=%08x fp=%08x \n", __FILE__, __LINE__, result->rand4fingerprint, actual_sum);
} }
if (r!=0) goto died_21; if (r!=0) goto died_21;
...@@ -411,7 +410,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign ...@@ -411,7 +410,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign
//fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height); //fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height);
} }
toku_verify_counts(result); //toku_verify_counts(result);
} }
{ {
unsigned int n_read_so_far = rc.ndone; unsigned int n_read_so_far = rc.ndone;
...@@ -430,7 +429,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign ...@@ -430,7 +429,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, unsign
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
toku_free(rc.buf); toku_free(rc.buf);
*brtnode = result; *brtnode = result;
toku_verify_counts(result); //toku_verify_counts(result);
return 0; return 0;
} }
...@@ -444,8 +443,8 @@ void toku_verify_counts (BRTNODE node) { ...@@ -444,8 +443,8 @@ void toku_verify_counts (BRTNODE node) {
GPMA_ITERATE(node->u.l.buffer, idx, dlen, ddata, GPMA_ITERATE(node->u.l.buffer, idx, dlen, ddata,
({ ({
count++; count++;
sum+=(PMA_ITEM_OVERHEAD + dlen); sum+= PMA_ITEM_OVERHEAD + leafentry_disksize(ddata); // use the disk size, not the memory size.
fp += toku_calccrc32_kvpair_struct(ddata); fp += toku_le_crc(ddata);
})); }));
assert(count==toku_gpma_n_entries(node->u.l.buffer)); assert(count==toku_gpma_n_entries(node->u.l.buffer));
assert(sum==node->u.l.n_bytes_in_buffer); assert(sum==node->u.l.n_bytes_in_buffer);
......
...@@ -76,27 +76,38 @@ int toku_testsetup_insert_to_leaf (BRT brt, DISKOFF diskoff, char *key, int keyl ...@@ -76,27 +76,38 @@ int toku_testsetup_insert_to_leaf (BRT brt, DISKOFF diskoff, char *key, int keyl
toku_verify_counts(node); toku_verify_counts(node);
assert(node->height==0); assert(node->height==0);
struct kv_pair *kv = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, key, keylen, val, vallen); u_int32_t lesize, disksize;
struct lc_pair lc = {brt, node->flags & TOKU_DB_DUPSORT}; LEAFENTRY tmp_leafentry;
r = le_committed(keylen, key, vallen, val, &lesize, &disksize, &tmp_leafentry);
LEAFENTRY leafentry = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, lesize);
memcpy(leafentry, tmp_leafentry, lesize);
toku_free(tmp_leafentry);
u_int32_t storedlen; u_int32_t storedlen;
void *storeddata; void *storeddata;
u_int32_t idx; u_int32_t idx;
r = toku_gpma_lookup_item(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, &storedlen, &storeddata, &idx); DBT keydbt,valdbt;
BRT_CMD_S cmd = {BRT_INSERT, 0, .u.id={toku_fill_dbt(&keydbt, key, keylen),
toku_fill_dbt(&valdbt, val, vallen)}};
struct cmd_leafval_bessel_extra be = {brt, &cmd, node->flags & TOKU_DB_DUPSORT};
r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, &storedlen, &storeddata, &idx);
if (r==0) { if (r==0) {
// It's already there. So now we have to remove it and put the new one back in. // It's already there. So now we have to remove it and put the new one back in.
node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + storedlen; node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(storeddata);
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair_struct(storeddata); node->local_fingerprint -= node->rand4fingerprint*toku_le_crc(storeddata);
toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen); toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen);
// Now put the new kv in. // Now put the new kv in.
toku_gpma_set_at_index(node->u.l.buffer, idx, kv_pair_size(kv), kv); toku_gpma_set_at_index(node->u.l.buffer, idx, lesize, leafentry);
} else { } else {
r = toku_gpma_insert(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, 0, 0, 0); r = toku_gpma_insert_bessel(node->u.l.buffer, lesize, leafentry, toku_cmd_leafval_bessel, &be, 0, 0, 0);
assert(r==0); assert(r==0);
} }
node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + kv_pair_size(kv); node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + disksize;
node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair_struct(kv); node->local_fingerprint += node->rand4fingerprint*toku_le_crc(leafentry);
node->dirty=1; node->dirty=1;
*subtree_fingerprint = node->local_fingerprint; *subtree_fingerprint = node->local_fingerprint;
......
...@@ -21,53 +21,6 @@ ...@@ -21,53 +21,6 @@
static TOKUTXN const null_txn = 0; static TOKUTXN const null_txn = 0;
static DB * const null_db = 0; static DB * const null_db = 0;
// Randomized insert/lookup test.
// Phase 1 inserts limit/2 pairs "key<rk>" -> "val<rv>" with random rk and rv, remembering the
// latest rv for each rk in values[].  Phase 2 picks limit/2 random keys and, for every key that
// was written in phase 1, asserts that the lookup succeeds and returns the remembered value.
static void test5 (void) {
int r;
BRT t;
int limit=100000;
// values[rk] holds the last rv inserted under key rk, or -1 if rk was never inserted.
int *values;
int i;
CACHETABLE ct;
char fname[]="testbrt.brt";
toku_memory_check_all_free();
MALLOC_N(limit,values);
for (i=0; i<limit; i++) values[i]=-1;
// Remove any file left over under this name before opening the tree on it.
unlink(fname);
r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = toku_open_brt(fname, 0, 1, &t, 1<<12, ct, null_txn, toku_default_compare_fun, null_db); assert(r==0);
// Phase 1: random writes.  The same rk can be drawn more than once; values[] keeps only the
// most recent rv, which is what phase 2 expects the tree to return.
for (i=0; i<limit/2; i++) {
char key[100],val[100];
int rk = random()%limit;
int rv = random();
// Progress marker every 1000 iterations when running verbosely.
if (i%1000==0 && verbose) { printf("w"); fflush(stdout); }
values[rk] = rv;
snprintf(key, 100, "key%d", rk);
snprintf(val, 100, "val%d", rv);
DBT k,v;
// Lengths are 1+strlen(...), so the terminating NUL is stored as part of the key and the value.
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
}
if (verbose) printf("\n");
// Phase 2: random reads.  Keys that were never written (values[rk] is still -1) are skipped.
for (i=0; i<limit/2; i++) {
int rk = random()%limit;
if (values[rk]>=0) {
char key[100], valexpected[100];
DBT k,v;
if (i%1000==0 && verbose) { printf("r"); fflush(stdout); }
snprintf(key, 100, "key%d", rk);
snprintf(valexpected, 100, "val%d", values[rk]);
r = toku_brt_lookup(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_init_dbt(&v));
assert(r==0);
// The returned value must match the expected string byte for byte, NUL included.
assert(v.size==(1+strlen(valexpected)));
assert(memcmp(v.data,valexpected,v.size)==0);
}
}
if (verbose) printf("\n");
toku_free(values);
r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free();
}
static void test_dump_empty_db (void) { static void test_dump_empty_db (void) {
BRT t; BRT t;
CACHETABLE ct; CACHETABLE ct;
...@@ -1518,8 +1471,6 @@ static void brt_blackbox_test (void) { ...@@ -1518,8 +1471,6 @@ static void brt_blackbox_test (void) {
toku_memory_check_all_free(); toku_memory_check_all_free();
test_multiple_dbs(); test_multiple_dbs();
toku_memory_check_all_free(); toku_memory_check_all_free();
if (verbose) printf("test5\n");
test5();
if (verbose) printf("test_multiple_files\n"); if (verbose) printf("test_multiple_files\n");
test_multiple_files(); test_multiple_files();
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
static TOKUTXN const null_txn = 0; static TOKUTXN const null_txn = 0;
static DB * const null_db = 0; static DB * const null_db = 0;
static void test2 (int memcheck) { static void test2 (int memcheck, int limit) {
BRT t; BRT t;
int r; int r;
int i; int i;
...@@ -24,7 +24,7 @@ static void test2 (int memcheck) { ...@@ -24,7 +24,7 @@ static void test2 (int memcheck) {
r = toku_open_brt(fname, 0, 1, &t, 1024, ct, null_txn, toku_default_compare_fun, null_db); r = toku_open_brt(fname, 0, 1, &t, 1024, ct, null_txn, toku_default_compare_fun, null_db);
if (verbose) printf("%s:%d did setup\n", __FILE__, __LINE__); if (verbose) printf("%s:%d did setup\n", __FILE__, __LINE__);
assert(r==0); assert(r==0);
for (i=0; i<4096; i++) { for (i=0; i<limit; i++) { // 4096
DBT k,v; DBT k,v;
char key[100],val[100]; char key[100],val[100];
snprintf(key,100,"hello%d",i); snprintf(key,100,"hello%d",i);
...@@ -42,6 +42,7 @@ static void test2 (int memcheck) { ...@@ -42,6 +42,7 @@ static void test2 (int memcheck) {
} }
} }
if (verbose) printf("%s:%d inserted\n", __FILE__, __LINE__); if (verbose) printf("%s:%d inserted\n", __FILE__, __LINE__);
r = toku_verify_brt(t); assert(r==0);
r = toku_close_brt(t); assert(r==0); r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0); r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free(); toku_memory_check_all_free();
...@@ -50,10 +51,12 @@ static void test2 (int memcheck) { ...@@ -50,10 +51,12 @@ static void test2 (int memcheck) {
int main (int argc , const char *argv[]) { int main (int argc , const char *argv[]) {
default_parse_args(argc, argv); default_parse_args(argc, argv);
if (verbose) printf("test2 checking memory\n"); // if (verbose) printf("test2 checking memory\n");
// test2(1); // test2(1);
if (verbose) printf("test2 faster\n"); if (verbose) printf("test2 faster\n");
test2(0); test2(0, 2);
test2(0, 212);
test2(0, 4096);
toku_malloc_cleanup(); toku_malloc_cleanup();
if (verbose) printf("test1 ok\n"); if (verbose) printf("test1 ok\n");
return 0; return 0;
......
...@@ -42,6 +42,7 @@ static void test3 (int nodesize, int count, int memcheck) { ...@@ -42,6 +42,7 @@ static void test3 (int nodesize, int count, int memcheck) {
snprintf(val,100,"there%d",i); snprintf(val,100,"there%d",i);
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn); toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
} }
r = toku_verify_brt(t); assert(r==0);
r = toku_close_brt(t); assert(r==0); r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0); r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free(); toku_memory_check_all_free();
......
...@@ -42,6 +42,7 @@ static void test4 (int nodesize, int count, int memcheck) { ...@@ -42,6 +42,7 @@ static void test4 (int nodesize, int count, int memcheck) {
snprintf(val,100,"there%d",i); snprintf(val,100,"there%d",i);
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn); toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
} }
r = toku_verify_brt(t); assert(r==0);
r = toku_close_brt(t); assert(r==0); r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0); r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free(); toku_memory_check_all_free();
...@@ -53,6 +54,9 @@ static void test4 (int nodesize, int count, int memcheck) { ...@@ -53,6 +54,9 @@ static void test4 (int nodesize, int count, int memcheck) {
} }
static void brt_blackbox_test (void) { static void brt_blackbox_test (void) {
test4(2048, 1<<14, 1);
return;
if (verbose) printf("test4 slow\n"); if (verbose) printf("test4 slow\n");
test4(2048, 1<<15, 1); test4(2048, 1<<15, 1);
......
...@@ -19,15 +19,6 @@ ...@@ -19,15 +19,6 @@
#include "toku_assert.h" #include "toku_assert.h"
#include "kv-pair.h" #include "kv-pair.h"
// Recomputes a fingerprint over the entries of `pma` and asserts that it equals `fingerprint`.
// The recomputed value is the sum, over every entry GPMA_ITERATE visits, of
// rand4fingerprint * toku_calccrc32_kvpair_struct(entry), accumulated in a u_int32_t.
// The entries are passed to toku_calccrc32_kvpair_struct as-is, so they are presumably
// struct kv_pair objects (NOTE(review): confirm against the GPMA's producers).
static void gpma_verify_fingerprint (GPMA pma, u_int32_t rand4fingerprint, u_int32_t fingerprint) {
u_int32_t actual_fingerprint=0;
GPMA_ITERATE(pma, idx, len, val,
actual_fingerprint+=rand4fingerprint*toku_calccrc32_kvpair_struct(val)
);
assert(actual_fingerprint==fingerprint);
}
static void verify_local_fingerprint (BRTNODE node) { static void verify_local_fingerprint (BRTNODE node) {
u_int32_t fp=0; u_int32_t fp=0;
int i; int i;
...@@ -39,8 +30,33 @@ static void verify_local_fingerprint (BRTNODE node) { ...@@ -39,8 +30,33 @@ static void verify_local_fingerprint (BRTNODE node) {
})); }));
assert(fp==node->local_fingerprint); assert(fp==node->local_fingerprint);
} else { } else {
gpma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint); toku_verify_counts(node);
}
}
static int compare_pairs (BRT brt, struct kv_pair *a, struct kv_pair *b) {
DBT x,y;
int cmp = brt->compare_fun(brt->db,
toku_fill_dbt(&x, kv_pair_key(a), kv_pair_keylen(a)),
toku_fill_dbt(&y, kv_pair_key(b), kv_pair_keylen(b)));
if (cmp==0 && (brt->flags & TOKU_DB_DUPSORT)) {
cmp = brt->dup_compare(brt->db,
toku_fill_dbt(&x, kv_pair_val(a), kv_pair_vallen(a)),
toku_fill_dbt(&y, kv_pair_val(b), kv_pair_vallen(b)));
} }
return cmp;
}
static int compare_leafentries (BRT brt, LEAFENTRY a, LEAFENTRY b) {
DBT x,y;
int cmp = brt->compare_fun(brt->db,
toku_fill_dbt(&x, le_any_key(a), le_any_keylen(a)),
toku_fill_dbt(&y, le_any_key(b), le_any_keylen(b)));
if (cmp==0 && (brt->flags & TOKU_DB_DUPSORT)) {
cmp = brt->dup_compare(brt->db,
toku_fill_dbt(&x, le_any_val(a), le_any_vallen(a)),
toku_fill_dbt(&y, le_any_val(b), le_any_vallen(b)));
}
return cmp;
} }
int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) { int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse) {
...@@ -56,7 +72,7 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b ...@@ -56,7 +72,7 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b
verify_local_fingerprint(node); verify_local_fingerprint(node);
if (node->height>0) { if (node->height>0) {
int i; int i;
for (i=0; i< node->u.n.n_children-1; i++) { for (i=0; i< node->u.n.n_children; i++) {
bytevec thislorange,thishirange; bytevec thislorange,thishirange;
ITEMLEN thislolen, thishilen; ITEMLEN thislolen, thishilen;
if (node->u.n.n_children==0 || i==0) { if (node->u.n.n_children==0 || i==0) {
...@@ -89,8 +105,14 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b ...@@ -89,8 +105,14 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b
toku_fifo_iterate(BNC_BUFFER(node,i), verify_pair, 0); toku_fifo_iterate(BNC_BUFFER(node,i), verify_pair, 0);
} }
} }
//if (lorange) printf("%s:%d lorange=%s\n", __FILE__, __LINE__, (char*)lorange);
//if (hirange) printf("%s:%d lorange=%s\n", __FILE__, __LINE__, (char*)hirange);
for (i=0; i<node->u.n.n_children-2; i++) {
assert(compare_pairs(brt, node->u.n.childkeys[i], node->u.n.childkeys[i+1])<0);
}
for (i=0; i<node->u.n.n_children; i++) { for (i=0; i<node->u.n.n_children; i++) {
if (i>0) { if (i>0) {
//printf(" %s:%d i=%d %p v=%s\n", __FILE__, __LINE__, i, node->u.n.childkeys[i-1], (char*)kv_pair_key(node->u.n.childkeys[i-1]));
if (lorange) assert(toku_keycompare(lorange,lolen, kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]))<0); if (lorange) assert(toku_keycompare(lorange,lolen, kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]))<0);
if (hirange) assert(toku_keycompare(kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]), hirange, hilen)<=0); if (hirange) assert(toku_keycompare(kv_pair_key(node->u.n.childkeys[i-1]), toku_brt_pivot_key_len(brt, node->u.n.childkeys[i-1]), hirange, hilen)<=0);
} }
...@@ -103,6 +125,16 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b ...@@ -103,6 +125,16 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b
recurse); recurse);
} }
} }
} else {
// Make sure that they are in increasing order.
void *prev=0;
GPMA_ITERATE(node->u.l.buffer, idx, dlen, data,
({
if (prev==0)
prev=data;
else
assert(compare_leafentries(brt, prev, data)<0);
}));
} }
if ((r = toku_cachetable_unpin(brt->cf, off, 0, 0))) return r; if ((r = toku_cachetable_unpin(brt->cf, off, 0, 0))) return r;
return result; return result;
......
...@@ -40,6 +40,13 @@ ...@@ -40,6 +40,13 @@
#include "mempool.h" #include "mempool.h"
#include "leafentry.h" #include "leafentry.h"
// VERIFY_NODE(n) is a debugging hook: when SLOW is defined it calls toku_verify_counts(n),
// otherwise it expands to a no-op expression.  Uncomment the next line to enable the check.
//#define SLOW
#ifdef SLOW
#define VERIFY_NODE(n) toku_verify_counts(n)
#else
#define VERIFY_NODE(n) ((void)0)
#endif
extern long long n_items_malloced; extern long long n_items_malloced;
static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER); static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER);
...@@ -210,7 +217,7 @@ int toku_unpin_brtnode (BRT brt, BRTNODE node) { ...@@ -210,7 +217,7 @@ int toku_unpin_brtnode (BRT brt, BRTNODE node) {
// node->log_lsn = toku_txn_get_last_lsn(txn); // node->log_lsn = toku_txn_get_last_lsn(txn);
// //if (node->log_lsn.lsn>33320) printf("%s:%d node%lld lsn=%lld\n", __FILE__, __LINE__, node->thisnodename, node->log_lsn.lsn); // //if (node->log_lsn.lsn>33320) printf("%s:%d node%lld lsn=%lld\n", __FILE__, __LINE__, node->thisnodename, node->log_lsn.lsn);
// } // }
//toku_verify_counts(node); VERIFY_NODE(node);
return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
} }
...@@ -221,16 +228,6 @@ typedef struct kvpair { ...@@ -221,16 +228,6 @@ typedef struct kvpair {
unsigned int vallen; unsigned int vallen;
} *KVPAIR; } *KVPAIR;
#if 0
// Disabled (compiled out by the enclosing #if 0).
// Comparator over two KVPAIR arguments passed as const void *: returns the result of
// toku_keycompare on the two keys and their lengths.  Values are not compared.
// NOTE(review): the (const void *, const void *) signature looks intended for qsort/bsearch -- confirm.
int kvpair_compare (const void *av, const void *bv) {
const KVPAIR a = (const KVPAIR)av;
const KVPAIR b = (const KVPAIR)bv;
int r = toku_keycompare(a->key, a->keylen, b->key, b->keylen);
//printf("keycompare(%s,\n %s)-->%d\n", a->key, b->key, r);
return r;
}
#endif
/* Forgot to handle the case where there is something in the freelist. */ /* Forgot to handle the case where there is something in the freelist. */
static int malloc_diskblock_header_is_in_memory (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) { static int malloc_diskblock_header_is_in_memory (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) {
DISKOFF result = brt->h->unused_memory; DISKOFF result = brt->h->unused_memory;
...@@ -314,7 +311,7 @@ int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logg ...@@ -314,7 +311,7 @@ int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logg
*result = n; *result = n;
assert(n->nodesize>0); assert(n->nodesize>0);
// n->brt = t; // n->brt = t;
//printf("%s:%d putting %p (%lld) parent=%p\n", __FILE__, __LINE__, n, n->thisnodename, parent_brtnode); //printf("%s:%d putting %p (%lld)\n", __FILE__, __LINE__, n, n->thisnodename);
r=toku_cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n), r=toku_cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n),
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t); toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t);
assert(r==0); assert(r==0);
...@@ -348,13 +345,13 @@ int move_between_mempools (u_int32_t len, void *odata, void **ndata, void *extra ...@@ -348,13 +345,13 @@ int move_between_mempools (u_int32_t len, void *odata, void **ndata, void *extra
struct move_struct *ms=extra; struct move_struct *ms=extra;
assert(ms->from->height==0); assert(ms->from->height==0);
assert(ms->to->height==0); assert(ms->to->height==0);
assert(len==(unsigned)kv_pair_size(odata)); assert(len==(unsigned)leafentry_memsize(odata));
void *newitem=mempool_malloc_from_gpma(ms->to->u.l.buffer, &ms->to->u.l.buffer_mempool, len); void *newitem=mempool_malloc_from_gpma(ms->to->u.l.buffer, &ms->to->u.l.buffer_mempool, len);
assert(newitem); assert(newitem);
memcpy(newitem, odata, len); memcpy(newitem, odata, len);
toku_mempool_mfree(&ms->from->u.l.buffer_mempool, odata, len); toku_mempool_mfree(&ms->from->u.l.buffer_mempool, odata, len);
*ndata = newitem; *ndata = newitem;
assert(len==(unsigned)kv_pair_size(newitem)); assert(len==(unsigned)leafentry_memsize(newitem));
return 0; return 0;
} }
...@@ -395,8 +392,8 @@ static int note_move_items_between (u_int32_t nitems, u_int32_t *froms, u_int32_ ...@@ -395,8 +392,8 @@ static int note_move_items_between (u_int32_t nitems, u_int32_t *froms, u_int32_
u_int32_t diffsize = 0; u_int32_t diffsize = 0;
u_int32_t diff_fp = 0; u_int32_t diff_fp = 0;
for (i=0; i<nitems; i++) { for (i=0; i<nitems; i++) {
diffsize += PMA_ITEM_OVERHEAD + items[i].len; diffsize += PMA_ITEM_OVERHEAD + leafentry_disksize(items[i].data);
diff_fp += toku_calccrc32_kvpair_struct(items[i].data); diff_fp += toku_le_crc(items[i].data);
} }
ms->from->local_fingerprint -= ms->from->rand4fingerprint * diff_fp; ms->from->local_fingerprint -= ms->from->rand4fingerprint * diff_fp;
ms->to->local_fingerprint += ms->to->rand4fingerprint * diff_fp; ms->to->local_fingerprint += ms->to->rand4fingerprint * diff_fp;
...@@ -409,16 +406,18 @@ struct delete_struct { ...@@ -409,16 +406,18 @@ struct delete_struct {
BRTNODE node; BRTNODE node;
}; };
#if 0
static int brt_leaf_delete_callback (u_int32_t slotnum, u_int32_t len, void *data, void *extra) { static int brt_leaf_delete_callback (u_int32_t slotnum, u_int32_t len, void *data, void *extra) {
struct delete_struct *d = extra; struct delete_struct *d = extra;
d->node->local_fingerprint -= d->node->rand4fingerprint*toku_calccrc32_kvpair_struct(data); d->node->local_fingerprint -= d->node->rand4fingerprint*toku_calccrc32_kvpair_struct(data);
d->node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + len; d->node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data);
toku_mempool_mfree(&d->node->u.l.buffer_mempool, data, len); toku_mempool_mfree(&d->node->u.l.buffer_mempool, data, len);
d->node->dirty=1; d->node->dirty=1;
// Should use slotnum for logging // Should use slotnum for logging
slotnum=slotnum; //???? slotnum=slotnum; //????
return 0; return 0;
} }
#endif
static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) { static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) {
BRTNODE B; BRTNODE B;
...@@ -438,9 +437,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod ...@@ -438,9 +437,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod
//toku_verify_gpma(node->u.l.buffer); //toku_verify_gpma(node->u.l.buffer);
GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata,
({ ({
struct kv_pair *p=vdata; char *p=vdata;
//printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p); //printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p);
assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); assert((char*)node->u.l.buffer_mempool.base<= p && p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size );
})); }));
r = toku_gpma_split(node->u.l.buffer, B->u.l.buffer, PMA_ITEM_OVERHEAD, r = toku_gpma_split(node->u.l.buffer, B->u.l.buffer, PMA_ITEM_OVERHEAD,
move_between_mempools, move_between_mempools,
...@@ -449,28 +448,28 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod ...@@ -449,28 +448,28 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod
&ms); &ms);
GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata, GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata,
({ ({
struct kv_pair *p=vdata; char *p=vdata;
//printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p); //printf("%s:%d %d:%p ", __FILE__, __LINE__, idx, p);
assert((char*)node->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); assert((char*)node->u.l.buffer_mempool.base<= p && p < (char*)node->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size );
})); }));
GPMA_ITERATE(B->u.l.buffer, idx, vlen, vdata, GPMA_ITERATE(B->u.l.buffer, idx, vlen, vdata,
({ ({
struct kv_pair *p=vdata; char *p=vdata;
//printf("%s:%d %d:%p\n", __FILE__, __LINE__, idx, p); //printf("%s:%d %d:%p\n", __FILE__, __LINE__, idx, p);
assert((char*)B->u.l.buffer_mempool.base<= (char*)p && (char*)p < (char*)B->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size ); assert((char*)B->u.l.buffer_mempool.base<= p && p < (char*)B->u.l.buffer_mempool.base+node->u.l.buffer_mempool.size );
})); }));
//toku_verify_gpma(node->u.l.buffer); //toku_verify_gpma(node->u.l.buffer);
//toku_verify_gpma(B->u.l.buffer); //toku_verify_gpma(B->u.l.buffer);
if (splitk) { if (splitk) {
memset(splitk, 0, sizeof *splitk); memset(splitk, 0, sizeof *splitk);
struct kv_pair *kp=ms.last_pair_remaining_in_from.data; LEAFENTRY le=ms.last_pair_remaining_in_from.data;
if (node->flags&TOKU_DB_DUPSORT) { if (node->flags&TOKU_DB_DUPSORT) {
splitk->size = kv_pair_keylen(kp)+kv_pair_vallen(kp); splitk->size = le_any_keylen(le)+le_any_vallen(le);
splitk->data = kv_pair_malloc(kv_pair_key(kp), kv_pair_keylen(kp), kv_pair_val(kp), kv_pair_vallen(kp)); splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), le_any_val(le), le_any_vallen(le));
} else { } else {
splitk->size = kv_pair_keylen(kp); splitk->size = le_any_keylen(le);
splitk->data = kv_pair_malloc(kv_pair_key(kp), kv_pair_keylen(kp), 0, 0); splitk->data = kv_pair_malloc(le_any_key(le), le_any_keylen(le), 0, 0);
} }
splitk->flags=0; splitk->flags=0;
} }
...@@ -486,9 +485,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod ...@@ -486,9 +485,9 @@ static int brtleaf_split (TOKULOGGER logger, FILENUM filenum, BRT t, BRTNODE nod
return 0; return 0;
} }
#define MAX_PATHLEN_TO_ROOT 40 //#define MAX_PATHLEN_TO_ROOT 40
static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int childnum, TXNID xid, int type, const char *key, int keylen, const char *data, int datalen, u_int32_t *fingerprint, DISKOFFARRAY path_to_parent) { static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int childnum, TXNID xid, int type, const char *key, int keylen, const char *data, int datalen, u_int32_t *fingerprint) {
BYTESTRING keybs = {.len=keylen, .data=(char*)key}; BYTESTRING keybs = {.len=keylen, .data=(char*)key};
BYTESTRING databs = {.len=datalen, .data=(char*)data}; BYTESTRING databs = {.len=datalen, .data=(char*)data};
u_int32_t old_fingerprint = *fingerprint; u_int32_t old_fingerprint = *fingerprint;
...@@ -498,19 +497,11 @@ static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int child ...@@ -498,19 +497,11 @@ static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int child
*fingerprint = new_fingerprint; *fingerprint = new_fingerprint;
int r = toku_log_brtenq(logger, (LSN*)0, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, xid, type, keybs, databs, old_fingerprint, new_fingerprint); int r = toku_log_brtenq(logger, (LSN*)0, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, xid, type, keybs, databs, old_fingerprint, new_fingerprint);
if (r!=0) return r; if (r!=0) return r;
TOKUTXN txn;
if (0==toku_txnid2txn(logger, xid, &txn) && txn) {
DISKOFFARRAY path = path_to_parent;
path.array = toku_memdup(path.array, sizeof(path.array[0])*(1+path.len));
if (path.array==0) return errno;
r = toku_logger_save_rollback_xactiontouchednonleaf(txn, toku_cachefile_filenum(t->cf), path, node->thisnodename);
if (r!=0) return r;
}
return 0; return 0;
} }
/* Side effect: sets splitk->data pointer to a malloc'd value */ /* Side effect: sets splitk->data pointer to a malloc'd value */
static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, TOKULOGGER logger) {
int old_n_children = node->u.n.n_children; int old_n_children = node->u.n.n_children;
int n_children_in_a = old_n_children/2; int n_children_in_a = old_n_children/2;
int n_children_in_b = old_n_children-n_children_in_a; int n_children_in_b = old_n_children-n_children_in_a;
...@@ -538,9 +529,6 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -538,9 +529,6 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
BNC_SUBTREE_FINGERPRINT(B,i)=0; BNC_SUBTREE_FINGERPRINT(B,i)=0;
} }
assert(path_to_parent.len<MAX_PATHLEN_TO_ROOT);
path_to_parent.array[path_to_parent.len++]=node->thisnodename; // Don't have to restore it since path_to_parent is passed by value, and this one not used again except in this loop.
for (i=n_children_in_a; i<old_n_children; i++) { for (i=n_children_in_a; i<old_n_children; i++) {
int targchild = i-n_children_in_a; int targchild = i-n_children_in_a;
...@@ -569,7 +557,7 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -569,7 +557,7 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
if (r!=0) return r; if (r!=0) return r;
r = toku_log_brtdeq(logger, (LSN*)0, 0, fnum, node->thisnodename, n_children_in_a, xid, type, keybs, databs, old_from_fingerprint, new_from_fingerprint); r = toku_log_brtdeq(logger, (LSN*)0, 0, fnum, node->thisnodename, n_children_in_a, xid, type, keybs, databs, old_from_fingerprint, new_from_fingerprint);
if (r!=0) return r; if (r!=0) return r;
r = log_and_save_brtenq(logger, t, B, targchild, xid, type, key, keylen, data, datalen, &B->local_fingerprint, path_to_parent); r = log_and_save_brtenq(logger, t, B, targchild, xid, type, key, keylen, data, datalen, &B->local_fingerprint);
r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, xid); r = toku_fifo_enq(to_htab, key, keylen, data, datalen, type, xid);
if (r!=0) return r; if (r!=0) return r;
toku_fifo_deq(from_htab); toku_fifo_deq(from_htab);
...@@ -656,14 +644,13 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -656,14 +644,13 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *split, DBT *split,
int debug, int debug,
TOKULOGGER, DISKOFFARRAY path_to_parent); TOKULOGGER);
/* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */ /* key is not in the buffer. Either put the key-value pair in the child, or put it in the node. */
static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child, static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRTNODE node, BRTNODE child,
BRT_CMD cmd, BRT_CMD cmd,
int childnum_of_node, int childnum_of_node,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
assert(node->height>0); /* Not a leaf. */ assert(node->height>0); /* Not a leaf. */
DBT *k = cmd->u.id.key; DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val; DBT *v = cmd->u.id.val;
...@@ -695,13 +682,10 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT ...@@ -695,13 +682,10 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT
DBT againk; DBT againk;
toku_init_dbt(&againk); toku_init_dbt(&againk);
//printf("%s:%d hello!\n", __FILE__, __LINE__); //printf("%s:%d hello!\n", __FILE__, __LINE__);
assert(path_to_parent.len<MAX_PATHLEN_TO_ROOT);
path_to_parent.array[path_to_parent.len++]=node->thisnodename;
r = brtnode_put_cmd(t, child, cmd, r = brtnode_put_cmd(t, child, cmd,
&again_split, &againa, &againb, &againk, &again_split, &againa, &againb, &againk,
0, 0,
logger, logger);
path_to_parent);
if (r!=0) return r; if (r!=0) return r;
assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */ assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
} else { } else {
...@@ -715,19 +699,15 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum ...@@ -715,19 +699,15 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
BRT_CMD cmd, BRT_CMD cmd,
int *child_did_split, BRTNODE *childa, BRTNODE *childb, int *child_did_split, BRTNODE *childa, BRTNODE *childb,
DBT *childsplitk, DBT *childsplitk,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, ""); //if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__); //printf("%s:%d hello!\n", __FILE__, __LINE__);
assert(node->height>0); assert(node->height>0);
{ {
assert(path_to_parent.len<MAX_PATHLEN_TO_ROOT);
path_to_parent.array[path_to_parent.len++]=node->thisnodename;
int r = brtnode_put_cmd(t, child, cmd, int r = brtnode_put_cmd(t, child, cmd,
child_did_split, childa, childb, childsplitk, child_did_split, childa, childb, childsplitk,
0, 0,
logger, logger);
path_to_parent);
if (r!=0) return r; if (r!=0) return r;
} }
...@@ -765,7 +745,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum ...@@ -765,7 +745,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
return 0; return 0;
} }
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent); static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger);
static int split_count=0; static int split_count=0;
...@@ -781,8 +761,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -781,8 +761,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
DBT *childsplitk, /* the data in the childsplitk is alloc'd and is consumed by this call. */ DBT *childsplitk, /* the data in the childsplitk is alloc'd and is consumed by this call. */
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk, DBT *splitk,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
assert(node->height>0); assert(node->height>0);
assert(0 <= childnum && childnum < node->u.n.n_children); assert(0 <= childnum && childnum < node->u.n.n_children);
FIFO old_h = BNC_BUFFER(node,childnum); FIFO old_h = BNC_BUFFER(node,childnum);
...@@ -875,15 +854,20 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -875,15 +854,20 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
switch (type) { switch (type) {
case BRT_INSERT: case BRT_INSERT:
case BRT_DELETE_BOTH: case BRT_DELETE_BOTH:
case BRT_DELETE: case BRT_DELETE_ANY:
if (type!=BRT_DELETE || 0==(t->flags&TOKU_DB_DUPSORT)) { case BRT_ABORT_BOTH:
case BRT_ABORT_ANY:
case BRT_COMMIT_BOTH:
case BRT_COMMIT_ANY:
if ((type!=BRT_DELETE_ANY && type!=BRT_ABORT_ANY && type!=BRT_COMMIT_ANY) || 0==(t->flags&TOKU_DB_DUPSORT)) {
// If it's an INSERT or DELETE_BOTH or there are no duplicates then we just put the command into one subtree // If it's an INSERT or DELETE_BOTH or there are no duplicates then we just put the command into one subtree
int cmp = brt_compare_pivot(t, &skd, &svd, childsplitk->data); int cmp = brt_compare_pivot(t, &skd, &svd, childsplitk->data);
if (cmp <= 0) pusha = 1; if (cmp <= 0) pusha = 1;
else pushb = 1; else pushb = 1;
} else { } else {
assert(type==BRT_DELETE && t->flags&TOKU_DB_DUPSORT); assert((type==BRT_DELETE_ANY || type==BRT_ABORT_ANY || type==BRT_COMMIT_ANY) && t->flags&TOKU_DB_DUPSORT);
// It is a DELETE and it's a DUPSORT database, in which case if the comparison function comes up 0 we must write the command to both children. (See #201) // It is a DELETE or ABORT_ANY and it's a DUPSORT database,
// in which case if the comparison function comes up 0 we must write the command to both children. (See #201)
int cmp = brt_compare_pivot(t, &skd, 0, childsplitk->data); int cmp = brt_compare_pivot(t, &skd, 0, childsplitk->data);
if (cmp<=0) pusha=1; if (cmp<=0) pusha=1;
if (cmp>=0) pushb=1; // Could be that both pusha and pushb are set if (cmp>=0) pushb=1; // Could be that both pusha and pushb are set
...@@ -891,7 +875,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -891,7 +875,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (pusha) { if (pusha) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if (toku_fifo_n_entries(BNC_BUFFER(node,childnum))==0) { if (toku_fifo_n_entries(BNC_BUFFER(node,childnum))==0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger, path_to_parent); r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, logger);
} else { } else {
r=insert_to_buffer_in_nonleaf(node, childnum, &skd, &svd, type, xid); r=insert_to_buffer_in_nonleaf(node, childnum, &skd, &svd, type, xid);
} }
...@@ -899,7 +883,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -899,7 +883,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (pushb) { if (pushb) {
// If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order. // If we already have something in the buffer, we must add the new command to the buffer so that commands don't get out of order.
if (toku_fifo_n_entries(BNC_BUFFER(node,childnum+1))==0) { if (toku_fifo_n_entries(BNC_BUFFER(node,childnum+1))==0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger, path_to_parent); r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, logger);
} else { } else {
r=insert_to_buffer_in_nonleaf(node, childnum+1, &skd, &svd, type, xid); r=insert_to_buffer_in_nonleaf(node, childnum+1, &skd, &svd, type, xid);
} }
...@@ -926,9 +910,9 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -926,9 +910,9 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
//verify_local_fingerprint_nonleaf(childb); //verify_local_fingerprint_nonleaf(childb);
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
//toku_verify_counts(node); VERIFY_NODE(node);
//toku_verify_counts(childa); VERIFY_NODE(childa);
//toku_verify_counts(childb); VERIFY_NODE(childb);
r=toku_unpin_brtnode(t, childa); r=toku_unpin_brtnode(t, childa);
assert(r==0); assert(r==0);
...@@ -937,7 +921,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -937,7 +921,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (node->u.n.n_children>TREE_FANOUT) { if (node->u.n.n_children>TREE_FANOUT) {
//printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs); //printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs);
r=brt_nonleaf_split(t, node, nodea, nodeb, splitk, logger, path_to_parent); r=brt_nonleaf_split(t, node, nodea, nodeb, splitk, logger);
if (r!=0) return r; if (r!=0) return r;
//printf("%s:%d did split\n", __FILE__, __LINE__); //printf("%s:%d did split\n", __FILE__, __LINE__);
split_count++; split_count++;
...@@ -957,7 +941,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -957,7 +941,7 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
if (toku_serialize_brtnode_size(node) > node->nodesize) { if (toku_serialize_brtnode_size(node) > node->nodesize) {
/* lighten the node by pushing down its buffers. this may cause /* lighten the node by pushing down its buffers. this may cause
the current node to split and go away */ the current node to split and go away */
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger, path_to_parent); r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, 0, logger);
assert(r == 0); assert(r == 0);
} }
if (*did_split == 0) assert(toku_serialize_brtnode_size(node)<=node->nodesize); if (*did_split == 0) assert(toku_serialize_brtnode_size(node)<=node->nodesize);
...@@ -969,8 +953,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -969,8 +953,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk, DBT *splitk,
int debug, int debug,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
void *childnode_v; void *childnode_v;
BRTNODE child; BRTNODE child;
int r; int r;
...@@ -983,7 +966,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -983,7 +966,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
//printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v); //printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v);
child=childnode_v; child=childnode_v;
//verify_local_fingerprint_nonleaf(child); //verify_local_fingerprint_nonleaf(child);
//toku_verify_counts(child); VERIFY_NODE(child);
//printf("%s:%d height=%d n_bytes_in_buffer = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_buffer[0], child->n_bytes_in_buffer[1], child->n_bytes_in_buffer[2]); //printf("%s:%d height=%d n_bytes_in_buffer = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_buffer[0], child->n_bytes_in_buffer[1], child->n_bytes_in_buffer[2]);
if (child->height>0 && child->u.n.n_children>0) assert(BNC_DISKOFF(child, child->u.n.n_children-1)!=0); if (child->height>0 && child->u.n.n_children>0) assert(BNC_DISKOFF(child, child->u.n.n_children-1)!=0);
if (debug) printf("%s:%d %*spush_some_brt_cmds_down to %lld\n", __FILE__, __LINE__, debug, "", child->thisnodename); if (debug) printf("%s:%d %*spush_some_brt_cmds_down to %lld\n", __FILE__, __LINE__, debug, "", child->thisnodename);
...@@ -1018,8 +1001,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -1018,8 +1001,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
&brtcmd, &brtcmd,
&child_did_split, &childa, &childb, &child_did_split, &childa, &childb,
&childsplitk, &childsplitk,
logger, logger);
path_to_parent);
if (0){ if (0){
unsigned int sum=0; unsigned int sum=0;
...@@ -1037,8 +1019,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -1037,8 +1019,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
r=handle_split_of_child (t, node, childnum, r=handle_split_of_child (t, node, childnum,
childa, childb, &childsplitk, childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
logger, logger);
path_to_parent);
//if (*did_split) { //if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea); // verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb); // verify_local_fingerprint_nonleaf(*nodeb);
...@@ -1061,7 +1042,7 @@ static int debugp1 (int debug) { ...@@ -1061,7 +1042,7 @@ static int debugp1 (int debug) {
return debug ? debug+1 : 0; return debug ? debug+1 : 0;
} }
static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int debug, TOKULOGGER logger)
/* If the buffer is too full, then push down. Possibly the child will split. That may make us split. */ /* If the buffer is too full, then push down. Possibly the child will split. That may make us split. */
{ {
assert(node->height>0); assert(node->height>0);
...@@ -1077,7 +1058,7 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE ...@@ -1077,7 +1058,7 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
find_heaviest_child(node, &childnum); find_heaviest_child(node, &childnum);
if (0) printf("%s:%d %*spush some down from %lld into %lld (child %d)\n", __FILE__, __LINE__, debug, "", node->thisnodename, BNC_DISKOFF(node, childnum), childnum); if (0) printf("%s:%d %*spush some down from %lld into %lld (child %d)\n", __FILE__, __LINE__, debug, "", node->thisnodename, BNC_DISKOFF(node, childnum), childnum);
assert(BNC_DISKOFF(node, childnum)!=0); assert(BNC_DISKOFF(node, childnum)!=0);
int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); int r = push_some_brt_cmds_down(t, node, childnum, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
if (r!=0) return r; if (r!=0) return r;
assert(*did_split==0 || *did_split==1); assert(*did_split==0 || *did_split==1);
if (debug) printf("%s:%d %*sdid push_some_brt_cmds_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split); if (debug) printf("%s:%d %*sdid push_some_brt_cmds_down did_split=%d\n", __FILE__, __LINE__, debug, "", *did_split);
...@@ -1107,71 +1088,393 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE ...@@ -1107,71 +1088,393 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
return 0; return 0;
} }
int toku_brtleaf_compare_fun (u_int32_t alen __attribute__((__unused__)), void *aval, u_int32_t blen __attribute__((__unused__)), void *bval, void *extra) { int leafval_bessel_le_committed (u_int32_t klen, void *kval,
struct lc_pair *p = extra; u_int32_t dlen, void *dval,
BRT t = p->t; struct cmd_leafval_bessel_extra *be) {
DBT k1,k2; BRT t = be->t;
int cmp = t->compare_fun (t->db, DBT dbt;
toku_fill_dbt(&k1, kv_pair_key(aval), kv_pair_keylen(aval)), int cmp = t->compare_fun(t->db,
toku_fill_dbt(&k2, kv_pair_key(bval), kv_pair_keylen(bval))); toku_fill_dbt(&dbt, kval, klen),
if (cmp == 0 && p->compare_both ) { be->cmd->u.id.key);
if (cmp == 0 && be->compare_both_keys && be->cmd->u.id.val->data) {
return t->dup_compare(t->db, return t->dup_compare(t->db,
toku_fill_dbt(&k1, kv_pair_val(aval), kv_pair_vallen(aval)), toku_fill_dbt(&dbt, dval, dlen),
toku_fill_dbt(&k2, kv_pair_val(bval), kv_pair_vallen(bval))); be->cmd->u.id.val);
} else { } else {
return cmp; return cmp;
} }
} }
/* Comparison hook for a leaf entry that carries both a committed value (cval)
 * and a provisional value (pval).  The transaction id and the provisional
 * value are ignored; the key and the committed value are handed to
 * leafval_bessel_le_committed() and its result is returned unchanged.
 * NOTE(review): the comparison uses the committed value, not the provisional
 * one -- confirm that this is the intended ordering for "both" entries. */
int leafval_bessel_le_both (TXNID xid __attribute__((__unused__)),
                            u_int32_t klen,
                            void *kval,
                            u_int32_t clen,
                            void *cval,
                            u_int32_t plen __attribute__((__unused__)),
                            void *pval __attribute__((__unused__)),
                            struct cmd_leafval_bessel_extra *be)
{
    int cmp = leafval_bessel_le_committed(klen, kval, clen, cval, be);
    return cmp;
}
/* Comparison hook for a leaf entry whose committed value (cval) has a
 * provisional delete pending.  The transaction id is ignored; the key and the
 * committed value are compared via leafval_bessel_le_committed(). */
int leafval_bessel_le_provdel (TXNID xid __attribute__((__unused__)),
                               u_int32_t klen,
                               void *kval,
                               u_int32_t clen,
                               void *cval,
                               struct cmd_leafval_bessel_extra *be)
{
    int cmp = leafval_bessel_le_committed(klen, kval, clen, cval, be);
    return cmp;
}
/* Comparison hook for a leaf entry that holds only a provisional pair.
 * The transaction id is ignored; the provisional value (pval) takes the place
 * of the committed value in the call to leafval_bessel_le_committed(). */
int leafval_bessel_le_provpair (TXNID xid __attribute__((__unused__)),
                                u_int32_t klen,
                                void *kval,
                                u_int32_t plen,
                                void *pval,
                                struct cmd_leafval_bessel_extra *be)
{
    int cmp = leafval_bessel_le_committed(klen, kval, plen, pval, be);
    return cmp;
}
/* Compare a stored leaf entry against the command described by EXTRA.
 *   dlen   length of the stored item; unused here.
 *   dval   the stored item, treated as a LEAFENTRY.
 *   extra  really a struct cmd_leafval_bessel_extra * supplying the comparison context.
 * NOTE(review): LESWITCHCALL is defined elsewhere; it presumably switches on
 * the kind of leaf entry and returns the result of the matching
 * leafval_bessel_le_<kind>() helper.  There is no return statement after the
 * macro, so it must return on every path -- confirm against its definition. */
int toku_cmd_leafval_bessel (u_int32_t dlen __attribute__((__unused__)), void *dval, void *extra) {
    struct cmd_leafval_bessel_extra *be = extra;
    LEAFENTRY le = dval;
    LESWITCHCALL(le, leafval_bessel, be);
}
// Whenever anything provisional is happening, its XID must match the cmd's.
/* Apply CMD to a leaf entry that currently holds only a committed pair
 * (kval -> dval).  The entry's key must be byte-for-byte the command's key;
 * both the length and the bytes are asserted.
 *
 * Outcome by command type:
 *   BRT_INSERT                  le_both():      committed value kept, the command's value added
 *   BRT_DELETE_ANY / _BOTH      le_provdel():   committed value kept under the command's xid
 *   BRT_ABORT_* / BRT_COMMIT_*  le_committed(): the same committed pair is rebuilt
 *
 * newlen, disksize and new_data are passed straight to the le_* constructor,
 * whose return code becomes this function's return code.  BRT_NONE is not
 * handled and trips assert(0). */
static int apply_cmd_to_le_committed (u_int32_t klen, void *kval,
                                      u_int32_t dlen, void *dval,
                                      BRT_CMD cmd,
                                      u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data)
{
    assert(cmd->u.id.key->size == klen);
    assert(memcmp(cmd->u.id.key->data, kval, klen)==0);

    switch (cmd->type) {
    case BRT_NONE:
        break; // falls out to the assert below

    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
        // Nothing provisional to resolve: hand back a copy of the committed record.
        return le_committed(klen, kval, dlen, dval, newlen, disksize, new_data);

    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        return le_provdel(cmd->xid, klen, kval, dlen, dval, newlen, disksize, new_data);

    case BRT_INSERT: {
        DBT *newval = cmd->u.id.val;
        return le_both(cmd->xid, klen, kval, dlen, dval,
                       newval->size, newval->data,
                       newlen, disksize, new_data);
    }
    }
    assert(0);
    return 0;
}
/* Apply CMD to a leaf entry that holds a committed value (cval) and a
 * provisional value (pval) written by transaction XID.
 *
 * Preconditions (asserted): the command belongs to the same transaction as the
 * provisional value, and the command's key equals the entry's key.
 *
 * Outcome by command type:
 *   BRT_INSERT              le_both():      committed kept (for rollback), provisional replaced by the command's value
 *   BRT_DELETE_ANY / _BOTH  le_provdel():   committed kept (for rollback)
 *   BRT_ABORT_*             le_committed(): with the committed value -- the provisional one is dropped
 *   BRT_COMMIT_*            le_committed(): with the provisional value -- it becomes the committed one
 *
 * Returns the le_* constructor's return code.  BRT_NONE trips assert(0). */
static int apply_cmd_to_le_both (TXNID xid,
                                 u_int32_t klen, void *kval,
                                 u_int32_t clen, void *cval,
                                 u_int32_t plen, void *pval,
                                 BRT_CMD cmd,
                                 u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data)
{
    // Provisional data must belong to the command's transaction; anything else should already be committed or aborted.
    assert(cmd->xid == xid);
    assert(cmd->u.id.key->size == klen);
    assert(memcmp(cmd->u.id.key->data, kval, klen)==0);

    switch (cmd->type) {
    case BRT_NONE:
        break; // falls out to the assert below

    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
        return le_committed(klen, kval, plen, pval, newlen, disksize, new_data);

    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        return le_committed(klen, kval, clen, cval, newlen, disksize, new_data);

    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        return le_provdel(cmd->xid, klen, kval, clen, cval, newlen, disksize, new_data);

    case BRT_INSERT: {
        DBT *newval = cmd->u.id.val;
        return le_both(cmd->xid, klen, kval, clen, cval,
                       newval->size, newval->data,
                       newlen, disksize, new_data);
    }
    }
    assert(0);
    return 0;
}
/* Apply CMD to a leaf entry whose committed value (cval) has a provisional
 * delete pending from transaction XID.
 *
 * Preconditions (asserted): the command belongs to the same transaction, and
 * the command's key equals the entry's key.
 *
 * Outcome by command type:
 *   BRT_INSERT              le_both():      committed kept (for rollback), the command's value added
 *   BRT_DELETE_ANY / _BOTH  le_provdel():   an equivalent entry is rebuilt
 *   BRT_ABORT_*             le_committed(): the committed value is restored
 *   BRT_COMMIT_*            *new_data = 0:  no entry remains
 *
 * Returns the le_* constructor's return code, or 0 on the commit path.
 * BRT_NONE trips assert(0). */
static int apply_cmd_to_le_provdel (TXNID xid,
                                    u_int32_t klen, void *kval,
                                    u_int32_t clen, void *cval,
                                    BRT_CMD cmd,
                                    u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data)
{
    // Provisional data must belong to the command's transaction; anything else should already be committed or aborted.
    assert(cmd->xid == xid);
    assert(cmd->u.id.key->size == klen);
    assert(memcmp(cmd->u.id.key->data, kval, klen)==0);

    switch (cmd->type) {
    case BRT_NONE:
        break; // falls out to the assert below

    case BRT_COMMIT_ANY:
    case BRT_COMMIT_BOTH:
        // NOTE(review): *newlen and *disksize are not written on this path.
        *new_data = 0;
        return 0;

    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
        return le_committed(klen, kval, clen, cval, newlen, disksize, new_data);

    case BRT_DELETE_BOTH:
    case BRT_DELETE_ANY:
        // Deleting an already-deleted item could hand back the same entry, but
        // building a new one is simpler: otherwise the caller would have to
        // notice that it must not free() the old item.
        return le_provdel(cmd->xid, klen, kval, clen, cval, newlen, disksize, new_data);

    case BRT_INSERT: {
        DBT *newval = cmd->u.id.val;
        return le_both(cmd->xid, klen, kval, clen, cval,
                       newval->size, newval->data,
                       newlen, disksize, new_data);
    }
    }
    assert(0);
    return 0;
}
/* Apply CMD to a leaf entry that is in the "provisional pair" state: key KVAL/KLEN
 * with value PVAL/PLEN written by transaction XID, and no committed value.
 *
 * NEWLEN, DISKSIZE and NEW_DATA are handed to whichever le_* constructor builds the
 * replacement entry.  When the command leaves nothing behind (delete or abort)
 * *NEW_DATA is set to 0 and NEWLEN/DISKSIZE are not written.
 *
 * Returns the constructor's return code, or 0 when no entry is produced. */
static int apply_cmd_to_le_provpair (TXNID xid,
                                     u_int32_t klen, void *kval,
                                     u_int32_t plen , void *pval,
                                     BRT_CMD cmd,
                                     u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    // Provisional state must belong to the command's transaction; anything else
    // should already have been committed or aborted.
    assert(xid == cmd->xid);
    // The command must be addressed to exactly this key.
    assert(klen == cmd->u.id.key->size);
    assert(0 == memcmp(cmd->u.id.key->data, kval, klen));

    switch (cmd->type) {
    case BRT_NONE:
        break; // not a real command: handled by the assert below

    case BRT_COMMIT_BOTH:
    case BRT_COMMIT_ANY:
        // The provisional value becomes the committed value.
        return le_committed(klen, kval,
                            plen, pval,
                            newlen, disksize, new_data);

    case BRT_ABORT_ANY:
    case BRT_ABORT_BOTH:
    case BRT_DELETE_ANY:
    case BRT_DELETE_BOTH:
        // Deleting or aborting a purely provisional pair leaves nothing.
        *new_data = 0;
        return 0;

    case BRT_INSERT: {
        // Still a provisional pair; the previous provisional value is dropped.
        DBT *inserted = cmd->u.id.val;
        return le_provpair(cmd->xid,
                           klen, kval,
                           inserted->size, inserted->data,
                           newlen, disksize, new_data);
    }
    }
    assert(0);
    return 0;
}
/* Compute the leaf entry that results from applying CMD to the entry currently
 * stored for its key.
 *
 * STORED_DATA is the existing entry (OLDLEN bytes in memory), or NULL if the key has
 * no entry.  On success *NEW_DATA is the replacement entry, or 0 if nothing should be
 * stored; NEWLEN and DISKSIZE are passed to the routine that builds the entry.
 *
 * Returns 0 on success or the error code from the entry constructor. */
static int apply_cmd_to_leaf (BRT_CMD cmd,
                              u_int32_t oldlen, void *stored_data, // NULL if there was no stored data.
                              u_int32_t *newlen, u_int32_t *disksize, LEAFENTRY *new_data) {
    if (stored_data!=0) {
        // An entry exists.  LESWITCHCALL is a macro defined elsewhere; presumably it
        // dispatches to the matching apply_cmd_to_le_* routine -- confirm against its definition.
        assert(oldlen==leafentry_memsize(stored_data));
        LESWITCHCALL(stored_data, apply_cmd_to, cmd,
                     newlen, disksize, new_data);
    } else {
        switch (cmd->type) {
        case BRT_NONE:
            break; // not a real command: handled by the assert below

        case BRT_COMMIT_ANY:
        case BRT_COMMIT_BOTH:
        case BRT_ABORT_ANY:
        case BRT_ABORT_BOTH:
        case BRT_DELETE_ANY:
        case BRT_DELETE_BOTH:
            // Nothing stored and nothing to add.
            *new_data = 0;
            return 0;

        case BRT_INSERT:
            {
                // First value for this key: it starts out as a provisional pair.
                LEAFENTRY fresh;
                int rc = le_provpair(cmd->xid,
                                     cmd->u.id.key->size, cmd->u.id.key->data,
                                     cmd->u.id.val->size, cmd->u.id.val->data,
                                     newlen, disksize, &fresh);
                if (rc!=0) return rc; // leave *new_data untouched on failure
                *new_data = fresh;
                return 0;
            }
        }
        assert(0);
        return 0;
    }
}
/* Decide whether locating the target of CMD in NODE must match on the value as well
 * as the key.
 *
 * Returns nonzero to compare both key and value, 0 to compare the key only:
 *   - the *_BOTH commands always compare both;
 *   - the *_ANY commands never do;
 *   - BRT_INSERT does so only when the node has TOKU_DB_DUPSORT set (the returned
 *     value is then the flag bit itself, not necessarily 1). */
int should_compare_both_keys (BRTNODE node, BRT_CMD cmd) {
    switch (cmd->type) {
    case BRT_NONE:
        break; // not a real command: handled by the assert below

    case BRT_COMMIT_ANY:
    case BRT_ABORT_ANY:
    case BRT_DELETE_ANY:
        return 0;

    case BRT_COMMIT_BOTH:
    case BRT_ABORT_BOTH:
    case BRT_DELETE_BOTH:
        return 1;

    case BRT_INSERT:
        return node->flags & TOKU_DB_DUPSORT;
    }
    assert(0);
    return 0;
}
/* Apply CMD to a single entry of leaf NODE.
 *
 * LE is the entry currently stored for the command's key (STOREDLEN bytes, found at
 * index IDX), or NULL if there is none.  The replacement entry is computed with
 * apply_cmd_to_leaf(); then the old entry (if any) is logged, removed from the gpma
 * and released to the mempool, and the new entry (if any) is copied into the mempool,
 * inserted into the gpma and logged.  The node's n_bytes_in_buffer and
 * local_fingerprint are adjusted for both steps.
 *
 * The temporary entry returned by apply_cmd_to_leaf() is owned by this function and
 * is released on every path after it has been obtained, including the error paths.
 *
 * Returns 0 on success, otherwise the first nonzero code returned by
 * apply_cmd_to_leaf(), the logger, or the gpma insert. */
static int brt_leaf_apply_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
                                    u_int32_t idx, u_int32_t storedlen, LEAFENTRY le) {
    FILENUM filenum = toku_cachefile_filenum(t->cf);
    u_int32_t newlen, newdisksize;
    LEAFENTRY newdata;
    int r = apply_cmd_to_leaf(cmd, storedlen, le, &newlen, &newdisksize, &newdata);
    if (r!=0) return r; // newdata is not known to be set on failure, so there is nothing to release
    if (newdata) assert(newdisksize == leafentry_disksize(newdata));

    if (le) {
        // It's there, note that it's gone and remove it from the mempool
        node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(le);
        node->local_fingerprint     -= node->rand4fingerprint * toku_le_crc(le);
        r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx, le);
        if (r!=0) goto cleanup;

        // Locate the entry to delete by the key from CMD and the value currently stored in LE.
        BRT_CMD_S cmd2 = *cmd;
        DBT val_from_lekey;
        cmd2.u.id.val = toku_fill_dbt(&val_from_lekey, le_latest_val(le), le_latest_vallen(le));
        struct cmd_leafval_bessel_extra be = {t, &cmd2, 1}; // always compare both in this mode, if the value is there
        struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node};
        toku_gpma_delete_bessel(node->u.l.buffer,
                                toku_cmd_leafval_bessel, &be,
                                0, 0,
                                note_move_items_within, &ms);
        toku_mempool_mfree(&node->u.l.buffer_mempool, 0, storedlen); // Must pass 0, since le may be no good any more.
    }

    if (newdata) {
        struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node};
        struct cmd_leafval_bessel_extra be = {t, cmd, node->flags & TOKU_DB_DUPSORT};
        // The gpma stores a mempool-resident copy; newdata itself stays a temporary.
        LEAFENTRY new_le = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, newlen);
        memcpy(new_le, newdata, newlen);
        r = toku_gpma_insert_bessel(node->u.l.buffer, newlen, new_le, toku_cmd_leafval_bessel, &be, note_move_items_within, &ms, &idx);
        if (r!=0) goto cleanup;
        r = toku_log_insertleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx, newdata);
        if (r!=0) goto cleanup;
        assert(newdisksize == leafentry_disksize(newdata));
        node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + newdisksize;
        node->local_fingerprint += node->rand4fingerprint*toku_le_crc(newdata);
    }
    // printf("%s:%d rand4=%08x local_fingerprint=%08x this=%08x\n", __FILE__, __LINE__, node->rand4fingerprint, node->local_fingerprint, toku_calccrc32_kvpair_struct(kv));
    r = 0;

 cleanup:
    // Single release point, so the error exits above no longer leak the temporary entry.
    if (newdata) toku_free(newdata);
    return r;
}
static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, int debug,
TOKULOGGER logger) { TOKULOGGER logger) {
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint); // toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
VERIFY_NODE(node);
assert(node->height==0); assert(node->height==0);
FILENUM filenum = toku_cachefile_filenum(t->cf); FILENUM filenum = toku_cachefile_filenum(t->cf);
if (cmd->type == BRT_INSERT) {
DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val;
struct kv_pair *kv = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, k->data, k->size, v->data, v->size);
assert(kv);
u_int32_t storedlen; u_int32_t storedlen;
void *storeddata; void *storeddata;
u_int32_t idx; u_int32_t idx;
struct lc_pair lc = {t, node->flags & TOKU_DB_DUPSORT}; // for put operations we compare both keys if they are both there int r;
int r = toku_gpma_lookup_item(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, &storedlen, &storeddata, &idx); int compare_both = should_compare_both_keys(node, cmd);
struct cmd_leafval_bessel_extra be = {t, cmd, compare_both};
if (r==0) {
// It's already there. Note that it's gone and remove it from the mempool. switch (cmd->type) {
node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + storedlen; case BRT_INSERT:
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair_struct(storeddata); r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be,
BYTESTRING okbs = { kv_pair_keylen(storeddata), kv_pair_key(storeddata) }; &storedlen, &storeddata, &idx);
BYTESTRING odbs = { kv_pair_vallen(storeddata), kv_pair_val(storeddata) }; if (r==DB_NOTFOUND) {
r = toku_log_deleteinleaf(logger, &node->log_lsn, 0, cmd->xid, filenum, node->thisnodename, idx, okbs, odbs); storeddata = 0;
toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen); } else if (r!=0)
return r;
// Now put the new kv in.
toku_gpma_set_at_index(node->u.l.buffer, idx, kv_pair_size(kv), kv); r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata);
} else {
// Insert it.
struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node};
r = toku_gpma_insert(node->u.l.buffer, kv_pair_size(kv), kv, toku_brtleaf_compare_fun, &lc, note_move_items_within, &ms, &idx);
if (r!=0) return r; if (r!=0) return r;
} break;
{ case BRT_DELETE_BOTH:
BYTESTRING kbs = { kv_pair_keylen(kv), kv_pair_key(kv) }; case BRT_ABORT_BOTH:
BYTESTRING dbs = { kv_pair_vallen(kv), kv_pair_val(kv) }; case BRT_COMMIT_BOTH:
r = toku_log_insertinleaf(logger, &node->log_lsn, 0, cmd->xid, filenum, node->thisnodename, idx, kbs, dbs);
// Delete the one item
r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be,
&storedlen, &storeddata, &idx);
if (r == DB_NOTFOUND) break;
if (r != 0) return r;
VERIFY_NODE(node);
static int count=0;
count++;
r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata);
if (r!=0) return r;
VERIFY_NODE(node);
break;
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
// Delete all the matches
r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be,
&storedlen, &storeddata, &idx);
if (r == DB_NOTFOUND) break;
if (r != 0) return r;
while (1) {
int vallen = le_any_vallen(storeddata);
void *save_val = toku_memdup(le_any_val(storeddata), storedlen);
r = brt_leaf_apply_cmd_once(t, node, cmd, logger, idx, storedlen, storeddata);
if (r!=0) return r; if (r!=0) return r;
// Now we must find the next one.
DBT valdbt;
BRT_CMD_S ncmd = { cmd->type, cmd->xid, .u.id={cmd->u.id.key, toku_fill_dbt(&valdbt, save_val, vallen)}};
struct cmd_leafval_bessel_extra nbe = {t, &ncmd, 1};
r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, +1, &nbe,
&storedlen, &storeddata, &idx);
toku_free(save_val);
if (r!=0) break;
{ // Continue only if the next record that we found has the same key.
DBT adbt;
if (t->compare_fun(t->db,
toku_fill_dbt(&adbt, le_any_key(storeddata), le_any_keylen(storeddata)),
cmd->u.id.key) != 0)
break;
} }
node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + kv_pair_size(kv); }
node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair_struct(kv);
// printf("%s:%d rand4=%08x local_fingerprint=%08x this=%08x\n", __FILE__, __LINE__, node->rand4fingerprint, node->local_fingerprint, toku_calccrc32_kvpair_struct(kv)); break;
case BRT_NONE: return EINVAL;
}
/// All done doing the work
node->dirty = 1; node->dirty = 1;
// toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint); // toku_pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
VERIFY_NODE(node);
// If it doesn't fit, then split the leaf. // If it doesn't fit, then split the leaf.
if (toku_serialize_brtnode_size(node) > node->nodesize) { if (toku_serialize_brtnode_size(node) > node->nodesize) {
r = brtleaf_split (logger, filenum, t, node, nodea, nodeb, splitk); r = brtleaf_split (logger, filenum, t, node, nodea, nodeb, splitk);
...@@ -1182,32 +1485,12 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1182,32 +1485,12 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize); if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize);
assert(toku_serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); assert(toku_serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(toku_serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); assert(toku_serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
// toku_pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint); VERIFY_NODE(*nodea);
// toku_pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint); VERIFY_NODE(*nodeb);
} else { } else {
*did_split = 0; *did_split = 0;
} }
return 0; return 0;
} else if (cmd->type == BRT_DELETE || cmd->type == BRT_DELETE_BOTH) {
DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val;
struct kv_pair *kv = kv_pair_malloc(k->data, k->size, v->data, v->size);
struct lc_pair lc = {t, (cmd->type == BRT_DELETE_BOTH) };
struct move_struct ms = {.logger=logger, .filenum=filenum, .from=node, .to=node};
struct delete_struct dp = {node};
int r = toku_gpma_delete_item(node->u.l.buffer,
kv_pair_size(kv), kv,
toku_brtleaf_compare_fun, &lc,
brt_leaf_delete_callback, &dp,
note_move_items_within, &ms);
toku_free(kv);
*did_split = 0;
if (r==DB_NOTFOUND) return 0;
return r;
} else {
return EINVAL;
}
} }
/* find the leftmost child that may contain the key */ /* find the leftmost child that may contain the key */
...@@ -1226,8 +1509,7 @@ unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) { ...@@ -1226,8 +1509,7 @@ unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) {
/* put a cmd into a nodes child */ /* put a cmd into a nodes child */
static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger, int childnum, int maybe, int debug, TOKULOGGER logger, int childnum, int maybe) {
DISKOFFARRAY path_to_parent) {
int r; int r;
void *child_v; void *child_v;
BRTNODE child; BRTNODE child;
...@@ -1248,12 +1530,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1248,12 +1530,8 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
child = child_v; child = child_v;
child_did_split = 0; child_did_split = 0;
assert(path_to_parent.len<MAX_PATHLEN_TO_ROOT);
path_to_parent.array[path_to_parent.len++]=node->thisnodename;
r = brtnode_put_cmd(t, child, cmd, r = brtnode_put_cmd(t, child, cmd,
&child_did_split, &childa, &childb, &childsplitk, debug, logger, &child_did_split, &childa, &childb, &childsplitk, debug, logger);
path_to_parent);
path_to_parent.len--;
if (r != 0) { if (r != 0) {
/* putting to the child failed for some reason, so unpin the child and return the error code */ /* putting to the child failed for some reason, so unpin the child and return the error code */
int rr = toku_unpin_brtnode(t, child); int rr = toku_unpin_brtnode(t, child);
...@@ -1266,8 +1544,7 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1266,8 +1544,7 @@ static int brt_nonleaf_put_cmd_child_node (BRT t, BRTNODE node, BRT_CMD cmd,
r = handle_split_of_child(t, node, childnum, r = handle_split_of_child(t, node, childnum,
childa, childb, &childsplitk, childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
logger, logger);
path_to_parent);
assert(r == 0); assert(r == 0);
} else { } else {
//verify_local_fingerprint_nonleaf(child); //verify_local_fingerprint_nonleaf(child);
...@@ -1283,13 +1560,12 @@ int toku_brt_do_push_cmd = 1; ...@@ -1283,13 +1560,12 @@ int toku_brt_do_push_cmd = 1;
/* put a cmd into a node at childnum */ /* put a cmd into a node at childnum */
static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down, int debug, TOKULOGGER logger, unsigned int childnum, int can_push, int *do_push_down) {
DISKOFFARRAY path_to_parent) {
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
/* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */ /* try to push the cmd to the subtree if the buffer is empty and pushes are enabled */
if (BNC_NBYTESINBUF(node, childnum) == 0 && can_push && toku_brt_do_push_cmd) { if (BNC_NBYTESINBUF(node, childnum) == 0 && can_push && toku_brt_do_push_cmd) {
int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, path_to_parent); int r = brt_nonleaf_put_cmd_child_node(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1);
if (r == 0) if (r == 0)
return r; return r;
} }
...@@ -1301,7 +1577,7 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1301,7 +1577,7 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
DBT *k = cmd->u.id.key; DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val; DBT *v = cmd->u.id.val;
int r = log_and_save_brtenq(logger, t, node, childnum, cmd->xid, type, k->data, k->size, v->data, v->size, &node->local_fingerprint, path_to_parent); int r = log_and_save_brtenq(logger, t, node, childnum, cmd->xid, type, k->data, k->size, v->data, v->size, &node->local_fingerprint);
if (r!=0) return r; if (r!=0) return r;
int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD; int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid); r=toku_fifo_enq(BNC_BUFFER(node,childnum), k->data, k->size, v->data, v->size, type, cmd->xid);
...@@ -1314,9 +1590,9 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1314,9 +1590,9 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD cmd,
return 0; return 0;
} }
static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_nonleaf_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { int debug, TOKULOGGER logger) {
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
unsigned int childnum; unsigned int childnum;
int r; int r;
...@@ -1326,14 +1602,14 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1326,14 +1602,14 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
/* put the cmd in the subtree */ /* put the cmd in the subtree */
int do_push_down = 0; int do_push_down = 0;
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down, path_to_parent); r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, childnum, 1, &do_push_down);
if (r != 0) return r; if (r != 0) return r;
/* maybe push down */ /* maybe push down */
if (do_push_down) { if (do_push_down) {
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
if (r!=0) return r; if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) { if (*did_split) {
...@@ -1357,18 +1633,17 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1357,18 +1633,17 @@ static int brt_nonleaf_insert_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
} }
/* delete in all subtrees starting from the left most one which contains the key */ /* delete in all subtrees starting from the left most one which contains the key */
static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, static int brt_nonleaf_cmd_many (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, int debug,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
int r; int r;
/* find all children that need a delete cmd */ /* find all children that need a copy of the command */
int delchild[TREE_FANOUT], delidx = 0; int sendchild[TREE_FANOUT], delidx = 0;
inline void delchild_append(int i) { inline void sendchild_append(int i) {
if (delidx == 0 || delchild[delidx-1] != i) if (delidx == 0 || sendchild[delidx-1] != i)
delchild[delidx++] = i; sendchild[delidx++] = i;
} }
int i; int i;
for (i = 0; i < node->u.n.n_children-1; i++) { for (i = 0; i < node->u.n.n_children-1; i++) {
...@@ -1376,24 +1651,24 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1376,24 +1651,24 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
if (cmp > 0) { if (cmp > 0) {
continue; continue;
} else if (cmp < 0) { } else if (cmp < 0) {
delchild_append(i); sendchild_append(i);
break; break;
} else if (t->flags & TOKU_DB_DUPSORT) { } else if (t->flags & TOKU_DB_DUPSORT) {
delchild_append(i); sendchild_append(i);
delchild_append(i+1); sendchild_append(i+1);
} else { } else {
delchild_append(i); sendchild_append(i);
break; break;
} }
} }
if (delidx == 0) if (delidx == 0)
delchild_append(node->u.n.n_children-1); sendchild_append(node->u.n.n_children-1);
/* issue the delete cmd to all of the children found previously */ /* issue the to all of the children found previously */
int do_push_down = 0; int do_push_down = 0;
for (i=0; i<delidx; i++) { for (i=0; i<delidx; i++) {
r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, delchild[i], delidx == 1, &do_push_down, path_to_parent); r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, sendchild[i], delidx == 1, &do_push_down);
assert(r == 0); assert(r == 0);
} }
...@@ -1401,7 +1676,7 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1401,7 +1676,7 @@ static int brt_nonleaf_delete_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
/* maybe push down */ /* maybe push down */
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger, path_to_parent); r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), logger);
if (r!=0) return r; if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) { if (*did_split) {
...@@ -1427,13 +1702,22 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1427,13 +1702,22 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, int *did_split, BRTNODE *nodea, BRTNODE *nodeb,
DBT *splitk, DBT *splitk,
int debug, int debug,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) { switch (cmd->type) {
if (cmd->type == BRT_INSERT || cmd->type == BRT_DELETE_BOTH) { case BRT_INSERT:
return brt_nonleaf_insert_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent); case BRT_DELETE_BOTH:
} else if (cmd->type == BRT_DELETE) { case BRT_ABORT_BOTH:
return brt_nonleaf_delete_cmd(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger, path_to_parent); case BRT_COMMIT_BOTH:
} else do_once:
return brt_nonleaf_cmd_once(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger);
case BRT_DELETE_ANY:
case BRT_ABORT_ANY:
case BRT_COMMIT_ANY:
if (0 == (node->flags & TOKU_DB_DUPSORT)) goto do_once; // nondupsort delete_any is just do once.
return brt_nonleaf_cmd_many(t, node, cmd, did_split, nodea, nodeb, splitk, debug, logger);
case BRT_NONE:
break;
}
return EINVAL; return EINVAL;
} }
...@@ -1453,8 +1737,7 @@ static void verify_local_fingerprint_nonleaf (BRTNODE node) { ...@@ -1453,8 +1737,7 @@ static void verify_local_fingerprint_nonleaf (BRTNODE node) {
static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, int debug,
TOKULOGGER logger, TOKULOGGER logger) {
DISKOFFARRAY path_to_parent) {
//static int counter=0; // FOO //static int counter=0; // FOO
//static int oldcounter=0; //static int oldcounter=0;
//int tmpcounter; //int tmpcounter;
...@@ -1469,7 +1752,7 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd, ...@@ -1469,7 +1752,7 @@ static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD cmd,
} else { } else {
r = brt_nonleaf_put_cmd(t, node, cmd, r = brt_nonleaf_put_cmd(t, node, cmd,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
debug, logger, path_to_parent); debug, logger);
} }
//oldcounter=tmpcounter; //oldcounter=tmpcounter;
// Watch out. If did_split then the original node is no longer allocated. // Watch out. If did_split then the original node is no longer allocated.
...@@ -1693,6 +1976,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char ...@@ -1693,6 +1976,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char
if ((r=toku_log_fheader(toku_txn_logger(txn), (LSN*)0, 0, toku_txn_get_txnid(txn), toku_cachefile_filenum(t->cf), lh))) { goto died6; } if ((r=toku_log_fheader(toku_txn_logger(txn), (LSN*)0, 0, toku_txn_get_txnid(txn), toku_cachefile_filenum(t->cf), lh))) { goto died6; }
} }
if ((r=setup_initial_brt_root_node(t, t->nodesize, toku_txn_logger(txn)))!=0) { died6: if (dbname) goto died5; else goto died2; } if ((r=setup_initial_brt_root_node(t, t->nodesize, toku_txn_logger(txn)))!=0) { died6: if (dbname) goto died5; else goto died2; }
//printf("%s:%d putting %p (%d)\n", __FILE__, __LINE__, t->h, 0);
if ((r=toku_cachetable_put(t->cf, 0, t->h, 0, toku_brtheader_flush_callback, toku_brtheader_fetch_callback, 0))) { goto died6; } if ((r=toku_cachetable_put(t->cf, 0, t->h, 0, toku_brtheader_flush_callback, toku_brtheader_fetch_callback, 0))) { goto died6; }
} }
else if (r!=0) { else if (r!=0) {
...@@ -1914,14 +2198,14 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, ...@@ -1914,14 +2198,14 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
if (r!=0) return r; if (r!=0) return r;
r = toku_unpin_brtnode(brt, nodeb); r = toku_unpin_brtnode(brt, nodeb);
if (r!=0) return r; if (r!=0) return r;
//printf("%s:%d put %lld\n", __FILE__, __LINE__, brt->root); //printf("%s:%d put %lld\n", __FILE__, __LINE__, newroot_diskoff);
toku_cachetable_put(brt->cf, newroot_diskoff, newroot, brtnode_size(newroot), toku_cachetable_put(brt->cf, newroot_diskoff, newroot, brtnode_size(newroot),
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
*newrootp = newroot; *newrootp = newroot;
return 0; return 0;
} }
static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
void *node_v; void *node_v;
BRTNODE node; BRTNODE node;
CACHEKEY *rootp; CACHEKEY *rootp;
...@@ -1948,8 +2232,7 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA ...@@ -1948,8 +2232,7 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA
result = brtnode_put_cmd(brt, node, cmd, result = brtnode_put_cmd(brt, node, cmd,
&did_split, &nodea, &nodeb, &splitk, &did_split, &nodea, &nodeb, &splitk,
debug, debug,
logger, logger);
path_to_parent);
if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__); if (debug) printf("%s:%d did_insert\n", __FILE__, __LINE__);
if (did_split) { if (did_split) {
// node is unpinned, so now we have to proceed to update the root with a new node. // node is unpinned, so now we have to proceed to update the root with a new node.
...@@ -1974,10 +2257,15 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA ...@@ -1974,10 +2257,15 @@ static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger, DISKOFFARRA
int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
int r; int r;
if (txn) {
BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)};
BYTESTRING databs = {val->size, toku_memdup(val->data, val->size)};
r = toku_logger_save_rollback_cmdinsert(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs, databs);
if (r!=0) return r;
}
BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id={key,val}}; BRT_CMD_S brtcmd = { BRT_INSERT, toku_txn_get_txnid(txn), .u.id={key,val}};
DISKOFF path[MAX_PATHLEN_TO_ROOT]; r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFFARRAY path_to_parent = {0, path}; if (r!=0) return r;
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r; return r;
} }
...@@ -1998,20 +2286,27 @@ int toku_brt_lookup (BRT brt, DBT *k, DBT *v) { ...@@ -1998,20 +2286,27 @@ int toku_brt_lookup (BRT brt, DBT *k, DBT *v) {
int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) { int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
int r; int r;
if (txn) {
BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)};
r = toku_logger_save_rollback_cmddelete(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs);
if (r!=0) return r;
}
DBT val; DBT val;
BRT_CMD_S brtcmd = { BRT_DELETE, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}}; BRT_CMD_S brtcmd = { BRT_DELETE_ANY, toku_txn_get_txnid(txn), .u.id={key, toku_init_dbt(&val)}};
DISKOFF path[MAX_PATHLEN_TO_ROOT]; r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFFARRAY path_to_parent = {0, path};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r; return r;
} }
int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) { int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
int r; int r;
if (txn) {
BYTESTRING keybs = {key->size, toku_memdup(key->data, key->size)};
BYTESTRING databs = {val->size, toku_memdup(val->data, val->size)};
r = toku_logger_save_rollback_cmddeleteboth(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs, databs);
if (r!=0) return r;
}
BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id={key,val}}; BRT_CMD_S brtcmd = { BRT_DELETE_BOTH, toku_txn_get_txnid(txn), .u.id={key,val}};
DISKOFF path[MAX_PATHLEN_TO_ROOT]; r = toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
DISKOFFARRAY path_to_parent = {0, path};
r = brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn), path_to_parent);
return r; return r;
} }
...@@ -2149,15 +2444,15 @@ static inline void brt_split_init(BRT_SPLIT *split) { ...@@ -2149,15 +2444,15 @@ static inline void brt_split_init(BRT_SPLIT *split) {
toku_init_dbt(&split->splitk); toku_init_dbt(&split->splitk);
} }
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent); static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger);
/* search in a node's child */ /* search in a node's child */
static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
int r, rr; int r, rr;
/* if the child's buffer is not empty then try to empty it */ /* if the child's buffer is not empty then try to empty it */
if (BNC_NBYTESINBUF(node, childnum) > 0) { if (BNC_NBYTESINBUF(node, childnum) > 0) {
rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger, path_to_parent); rr = push_some_brt_cmds_down(brt, node, childnum, &split->did_split, &split->nodea, &split->nodeb, &split->splitk, 0, logger);
assert(rr == 0); assert(rr == 0);
/* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */ /* push down may cause a child split, so childnum may not be appropriate, and the node itself may split, so retry */
return EAGAIN; return EAGAIN;
...@@ -2167,16 +2462,13 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s ...@@ -2167,16 +2462,13 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s
rr = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node,childnum), &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt); rr = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node,childnum), &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
assert(rr == 0); assert(rr == 0);
assert(path_to_parent.len<MAX_PATHLEN_TO_ROOT);
path_to_parent.array[path_to_parent.len++]=node->thisnodename;
for (;;) { for (;;) {
BRTNODE childnode = node_v; BRTNODE childnode = node_v;
BRT_SPLIT childsplit; brt_split_init(&childsplit); BRT_SPLIT childsplit; brt_split_init(&childsplit);
r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger, path_to_parent); r = brt_search_node(brt, childnode, search, newkey, newval, &childsplit, logger);
if (childsplit.did_split) { if (childsplit.did_split) {
rr = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk, rr = handle_split_of_child(brt, node, childnum, childsplit.nodea, childsplit.nodeb, &childsplit.splitk,
&split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger, path_to_parent); &split->did_split, &split->nodea, &split->nodeb, &split->splitk, logger);
assert(rr == 0); assert(rr == 0);
break; break;
} else { } else {
...@@ -2191,7 +2483,7 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s ...@@ -2191,7 +2483,7 @@ static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *s
return r; return r;
} }
static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
int r = DB_NOTFOUND; int r = DB_NOTFOUND;
int c; int c;
...@@ -2209,7 +2501,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, ...@@ -2209,7 +2501,7 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search,
if (search->compare(search, if (search->compare(search,
toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)), toku_fill_dbt(&pivotkey, kv_pair_key(pivot), kv_pair_keylen(pivot)),
brt->flags & TOKU_DB_DUPSORT ? toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot)): 0)) { brt->flags & TOKU_DB_DUPSORT ? toku_fill_dbt(&pivotval, kv_pair_val(pivot), kv_pair_vallen(pivot)): 0)) {
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger);
if (r == 0 || r == EAGAIN) if (r == 0 || r == EAGAIN)
break; break;
} }
...@@ -2217,27 +2509,18 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search, ...@@ -2217,27 +2509,18 @@ static int brt_search_nonleaf_node(BRT brt, BRTNODE node, brt_search_t *search,
/* check the first (left) or last (right) node if nothing has been found */ /* check the first (left) or last (right) node if nothing has been found */
if (r == DB_NOTFOUND && c == node->u.n.n_children-1) if (r == DB_NOTFOUND && c == node->u.n.n_children-1)
r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger, path_to_parent); r = brt_search_child(brt, node, child[c], search, newkey, newval, split, logger);
return r; return r;
} }
struct bessel_from_search_struct { int pair_leafval_bessel_le_committed (u_int32_t klen, void *kval,
brt_search_t *search; u_int32_t dlen, void *dval,
}; brt_search_t *search) {
static int bessel_from_search_t (u_int32_t len __attribute__((__unused__)), void *data, void *extra) {
struct bessel_from_search_struct *bs = extra;
brt_search_t *search=bs->search;
DBT x,y; DBT x,y;
struct kv_pair *kv = data;
int cmp = search->compare(search, int cmp = search->compare(search,
search->k ? toku_fill_dbt(&x, kv_pair_key(kv), kv_pair_keylen(kv)) : 0, search->k ? toku_fill_dbt(&x, kval, klen) : 0,
search->v ? toku_fill_dbt(&y, kv_pair_val(kv), kv_pair_vallen(kv)) : 0); search->v ? toku_fill_dbt(&y, dval, dlen) : 0);
// For a left-to-right search, the search compare function returns 0 for all pairs < kv. We want the first value that is 1.
// To convert it to a bessel, we have to convert the 0 to a -1.
// For a right-to-left search, the search compare function returns 0 for all pairs > kv, and 1 for lesser values. We want the last value that is 1.
// To convert it to a bessel, we have to convert 0 to +1, and 1 to -1.
switch (search->direction) { switch (search->direction) {
case BRT_SEARCH_LEFT: return cmp==0 ? -1 : +1; case BRT_SEARCH_LEFT: return cmp==0 ? -1 : +1;
case BRT_SEARCH_RIGHT: return cmp==0 ? +1 : -1; // Because the comparison runs backwards for right searches. case BRT_SEARCH_RIGHT: return cmp==0 ? +1 : -1; // Because the comparison runs backwards for right searches.
...@@ -2246,9 +2529,37 @@ static int bessel_from_search_t (u_int32_t len __attribute__((__unused__)), void ...@@ -2246,9 +2529,37 @@ static int bessel_from_search_t (u_int32_t len __attribute__((__unused__)), void
return 0; return 0;
} }
// Bessel adapter for the "_le_both" leaf-entry case.
// Ignores the transaction id and the (plen, pval) pair and evaluates the search on
// (key, cval) by forwarding to pair_leafval_bessel_le_committed(); its result is returned unchanged.
// Presumably cval is the committed value and pval the provisional one (cf. struct contents_both) -- confirm.
// NOTE(review): the search is evaluated against cval rather than pval here; confirm this is the
// intended value to compare when an entry carries both.
int pair_leafval_bessel_le_both (TXNID xid __attribute__((__unused__)),
u_int32_t klen, void *kval,
u_int32_t clen, void *cval,
u_int32_t plen __attribute__((__unused__)), void *pval __attribute__((__unused__)),
brt_search_t *search) {
return pair_leafval_bessel_le_committed(klen, kval, clen, cval, search);
}
// Bessel adapter for the "_le_provdel" leaf-entry case.
// The transaction id is ignored; the search (passed as `be`) is evaluated on (key, cval) by
// forwarding to pair_leafval_bessel_le_committed(), whose result is returned unchanged.
int pair_leafval_bessel_le_provdel (TXNID xid __attribute__((__unused__)),
u_int32_t klen, void *kval,
u_int32_t clen, void *cval,
brt_search_t *be) {
return pair_leafval_bessel_le_committed(klen, kval, clen, cval, be);
}
// Bessel adapter for the "_le_provpair" leaf-entry case.
// The transaction id is ignored; the search (passed as `be`) is evaluated on (key, pval) by
// forwarding to pair_leafval_bessel_le_committed(), whose result is returned unchanged.
int pair_leafval_bessel_le_provpair (TXNID xid __attribute__((__unused__)),
u_int32_t klen, void *kval,
u_int32_t plen, void *pval,
brt_search_t *be) {
return pair_leafval_bessel_le_committed(klen, kval, plen, pval, be);
}
// Callback handed to toku_gpma_lookup_bessel(): turns a brt_search_t into a bessel function.
//   dlen    - length of the stored item (unused).
//   leafval - the stored item, treated as a leaf entry.
//   extra   - the brt_search_t* that the caller passed as the lookup's `extra` argument.
// LESWITCHCALL switches on the leaf entry's state and returns the result of the matching
// pair_leafval_bessel_le_* function, so the return statement comes from the macro expansion.
static int bessel_from_search_t (u_int32_t dlen __attribute__((__unused__)), void *leafval, void *extra) {
brt_search_t *search = extra;
LESWITCHCALL(leafval, pair_leafval_bessel, search);
}
static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval) { static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval) {
// Now we have to convert from brt_search_t to the bessel function with a direction. What a pain... // Now we have to convert from brt_search_t to the bessel function with a direction. What a pain...
struct bessel_from_search_struct bs = {search};
int direction; int direction;
switch (search->direction) { switch (search->direction) {
case BRT_SEARCH_LEFT: direction = +1; goto ok; case BRT_SEARCH_LEFT: direction = +1; goto ok;
...@@ -2262,25 +2573,29 @@ static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT ...@@ -2262,25 +2573,29 @@ static int brt_search_leaf_node(BRT brt, BRTNODE node, brt_search_t *search, DBT
int r = toku_gpma_lookup_bessel(node->u.l.buffer, int r = toku_gpma_lookup_bessel(node->u.l.buffer,
bessel_from_search_t, bessel_from_search_t,
direction, direction,
&bs, search,
&len, &data, &idx); &len, &data, &idx);
if (r!=0) return r; if (r!=0) return r;
struct kv_pair *kv = data; LEAFENTRY le = data;
if (le_is_provdel(le)) {
// Provisionally deleted stuff is gone.
return DB_NOTFOUND;
}
if (newkey) { if (newkey) {
r = toku_dbt_set_value(newkey, kv_pair_key(kv), kv_pair_keylen(kv), &brt->skey); r = toku_dbt_set_value(newkey, le_latest_key(le), le_latest_keylen(le), &brt->skey);
if (r!=0) return r; if (r!=0) return r;
} }
if (newval) { if (newval) {
r = toku_dbt_set_value(newval, kv_pair_val(kv), kv_pair_vallen(kv), &brt->sval); r = toku_dbt_set_value(newval, le_latest_val(le), le_latest_vallen(le), &brt->sval);
if (r!=0) return r; if (r!=0) return r;
} }
return 0; return 0;
} }
static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger, DISKOFFARRAY path_to_parent) { static int brt_search_node(BRT brt, BRTNODE node, brt_search_t *search, DBT *newkey, DBT *newval, BRT_SPLIT *split, TOKULOGGER logger) {
if (node->height > 0) if (node->height > 0)
return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger, path_to_parent); return brt_search_nonleaf_node(brt, node, search, newkey, newval, split, logger);
else else
return brt_search_leaf_node(brt, node, search, newkey, newval); return brt_search_leaf_node(brt, node, search, newkey, newval);
} }
...@@ -2302,9 +2617,7 @@ int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOK ...@@ -2302,9 +2617,7 @@ int toku_brt_search(BRT brt, brt_search_t *search, DBT *newkey, DBT *newval, TOK
for (;;) { for (;;) {
BRT_SPLIT split; brt_split_init(&split); BRT_SPLIT split; brt_split_init(&split);
DISKOFF path[MAX_PATHLEN_TO_ROOT]; r = brt_search_node(brt, node, search, newkey, newval, &split, logger);
DISKOFFARRAY path_to_parent = {0, path};
r = brt_search_node(brt, node, search, newkey, newval, &split, logger, path_to_parent);
if (split.did_split) { if (split.did_split) {
rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node); rr = brt_init_new_root(brt, split.nodea, split.nodeb, split.splitk, rootp, 0, &node);
...@@ -2729,3 +3042,39 @@ int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid) { ...@@ -2729,3 +3042,39 @@ int toku_brt_nonleaf_expunge_xaction(BRT brt, DISKOFF diskoff, TXNID xid) {
int r2 = toku_cachetable_unpin(brt->cf, diskoff, 1, toku_serialize_brtnode_size(node)); int r2 = toku_cachetable_unpin(brt->cf, diskoff, 1, toku_serialize_brtnode_size(node));
return r ? r : r2; return r ? r : r2;
} }
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp);
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size);
/* Rebuild the memory pool `memp` that holds the items referenced by `pma`.
 * Every item the GPMA iterator visits is copied into a freshly allocated pool of the same
 * size, the GPMA slot is re-pointed at the copy, and the old pool storage is freed.
 *
 * Returns  0 on success (`*memp` now describes the new pool),
 *         -1 if toku_mempool_get_frag_size() reports 0 (nothing is done),
 *         -2 if the replacement storage cannot be allocated (nothing is changed).
 */
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp) {
    if (0 == toku_mempool_get_frag_size(memp)) {
        return -1;
    }

    void *fresh_base = toku_malloc(memp->size);
    if (!fresh_base) {
        return -2;
    }

    struct mempool compacted;
    toku_mempool_init(&compacted, fresh_base, memp->size);

    /* Move each item across; the new pool is as large as the old one, so the copy must fit. */
    GPMA_ITERATE(pma, idx, len, data,
                 ({
                     void *moved = toku_mempool_malloc(&compacted, (size_t)len, 4);
                     assert(moved);
                     memcpy(moved, data, (size_t)len);
                     toku_gpma_set_at_index(pma, idx, len, moved);
                 }));

    /* Release the old storage only after every slot points into the new pool. */
    toku_free(memp->base);
    *memp = compacted;
    return 0;
}
/* Allocate `size` bytes from `mp`; if the pool cannot satisfy the request, compress the pool
 * (see toku_gpma_compress_kvspace) and try once more.
 *
 * Returns the allocated block, or 0 when the first attempt fails and compression does not
 * succeed.  After a successful compression the retry is asserted to succeed.
 */
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size) {
    void *block = toku_mempool_malloc(mp, size, 4);
    if (block != 0) {
        return block;
    }
    if (toku_gpma_compress_kvspace(pma, mp) != 0) {
        return block;   /* still 0: compression was not possible */
    }
    block = toku_mempool_malloc(mp, size, 4);
    assert(block);
    return block;
}
...@@ -86,8 +86,12 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) { ...@@ -86,8 +86,12 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) {
switch ((enum brt_cmd_type)typ) { switch ((enum brt_cmd_type)typ) {
case BRT_NONE: printf("NONE"); goto ok; case BRT_NONE: printf("NONE"); goto ok;
case BRT_INSERT: printf("INSERT"); goto ok; case BRT_INSERT: printf("INSERT"); goto ok;
case BRT_DELETE: printf("DELETE"); goto ok; case BRT_DELETE_ANY: printf("DELETE_ANY"); goto ok;
case BRT_DELETE_BOTH: printf("DELETE_BOTH"); goto ok; case BRT_DELETE_BOTH: printf("DELETE_BOTH"); goto ok;
case BRT_ABORT_ANY: printf("ABORT_ANY"); goto ok;
case BRT_ABORT_BOTH: printf("ABORT_BOTH"); goto ok;
case BRT_COMMIT_ANY: printf("COMMIT_ANY"); goto ok;
case BRT_COMMIT_BOTH: printf("COMMIT_BOTH"); goto ok;
} }
printf("HUH?"); printf("HUH?");
ok: ok:
...@@ -106,10 +110,7 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) { ...@@ -106,10 +110,7 @@ void dump_node (int f, DISKOFF off, struct brt_header *h) {
printf(" items_in_buffer =%d\n", toku_gpma_n_entries(n->u.l.buffer)); printf(" items_in_buffer =%d\n", toku_gpma_n_entries(n->u.l.buffer));
GPMA_ITERATE(n->u.l.buffer, idx, len, data, GPMA_ITERATE(n->u.l.buffer, idx, len, data,
({ ({
printf("%d: ", idx); print_leafentry(stdout, data);
print_item(kv_pair_key(data), kv_pair_keylen(data));
printf(" ");
print_item(kv_pair_val(data), kv_pair_vallen(data));
printf("\n"); printf("\n");
})); }));
} }
......
...@@ -27,11 +27,6 @@ typedef struct { ...@@ -27,11 +27,6 @@ typedef struct {
char *data; char *data;
} BYTESTRING; } BYTESTRING;
typedef struct {
int len;
DISKOFF *array;
} DISKOFFARRAY;
/* Make the LSN be a struct instead of an integer so that we get better type checking. */ /* Make the LSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_lsn { u_int64_t lsn; } LSN; typedef struct __toku_lsn { u_int64_t lsn; } LSN;
#define ZERO_LSN ((LSN){0}) #define ZERO_LSN ((LSN){0})
...@@ -79,8 +74,12 @@ typedef struct cachefile *CACHEFILE; ...@@ -79,8 +74,12 @@ typedef struct cachefile *CACHEFILE;
enum brt_cmd_type { enum brt_cmd_type {
BRT_NONE = 0, BRT_NONE = 0,
BRT_INSERT = 1, BRT_INSERT = 1,
BRT_DELETE = 2, BRT_DELETE_ANY = 2, // Delete any matching key. This used to be called BRT_DELETE.
BRT_DELETE_BOTH = 3, BRT_DELETE_BOTH = 3,
BRT_ABORT_ANY = 4, // Abort any commands on any matching key.
BRT_ABORT_BOTH = 5, // Abort commands that match both the key and the value
BRT_COMMIT_ANY = 6,
BRT_COMMIT_BOTH = 7
}; };
/* tree commands */ /* tree commands */
......
...@@ -62,7 +62,12 @@ struct fileid { ...@@ -62,7 +62,12 @@ struct fileid {
struct cachefile { struct cachefile {
CACHEFILE next; CACHEFILE next;
int refcount; /* CACHEFILEs are shared. Use a refcount to decide when to really close it. */ u_int64_t refcount; /* CACHEFILEs are shared. Use a refcount to decide when to really close it.
* The reference count is one for every open DB,
* plus one for every commit/rollback record. (It would be harder to keep a count for every open transaction,
* because then we'd have to figure out whether the transaction was already counted. If we simply use a count for
* every record in the transaction, we'll be OK.) Hence we use a 64-bit counter to make sure we don't run out.
*/
int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */ int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */
CACHETABLE cachetable; CACHETABLE cachetable;
struct fileid fileid; struct fileid fileid;
......
...@@ -17,7 +17,8 @@ static inline u_int32_t toku_calc_more_crc32_kvpair (u_int32_t crc, const void * ...@@ -17,7 +17,8 @@ static inline u_int32_t toku_calc_more_crc32_kvpair (u_int32_t crc, const void *
return crc; return crc;
} }
u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) { #if 0
u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) {
return toku_calc_more_crc32_kvpair(toku_null_crc, key, keylen, val, vallen); return toku_calc_more_crc32_kvpair(toku_null_crc, key, keylen, val, vallen);
} }
...@@ -25,6 +26,7 @@ u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp) { ...@@ -25,6 +26,7 @@ u_int32_t toku_calccrc32_kvpair_struct (const struct kv_pair *kvp) {
return toku_calccrc32_kvpair(kv_pair_key_const(kvp), kv_pair_keylen(kvp), return toku_calccrc32_kvpair(kv_pair_key_const(kvp), kv_pair_keylen(kvp),
kv_pair_val_const(kvp), kv_pair_vallen(kvp)); kv_pair_val_const(kvp), kv_pair_vallen(kvp));
} }
#endif
u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) { u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int32_t keylen, const void *val, u_int32_t vallen) {
unsigned char type_c = type; unsigned char type_c = type;
...@@ -38,10 +40,18 @@ u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int3 ...@@ -38,10 +40,18 @@ u_int32_t toku_calccrc32_cmd (u_int32_t type, TXNID xid, const void *key, u_int3
} }
u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd) { u_int32_t toku_calccrc32_cmdstruct (BRT_CMD cmd) {
if (cmd->type <= BRT_DELETE_BOTH) switch (cmd->type) {
case BRT_INSERT:
case BRT_DELETE_ANY:
case BRT_DELETE_BOTH:
case BRT_COMMIT_ANY:
case BRT_COMMIT_BOTH:
case BRT_ABORT_ANY:
case BRT_ABORT_BOTH:
return toku_calccrc32_cmd (cmd->type, cmd->xid, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size); return toku_calccrc32_cmd (cmd->type, cmd->xid, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size);
else { case BRT_NONE:
assert(0); /* Should not have come here. */
return 0; return 0;
} }
assert(0); /* Should not have come here. */
return 0;
} }
...@@ -86,11 +86,14 @@ u_int32_t toku_gpma_index_limit(GPMA pma) { ...@@ -86,11 +86,14 @@ u_int32_t toku_gpma_index_limit(GPMA pma) {
} }
// If direction==0 then find any match for which the bessel gives 0. *found is set to 1 iff something with 0. The return value is the place where the zero is (if found), or the place where it would go (if there's a value there, then that value goes after the zero.) // If direction==0 then find any match for which the bessel gives 0. *found is set to 1 iff something with 0. The return value is the place where the zero is (if found), or the place where it would go (if there's a value there, then that value goes after the zero.)
// If more than one value returns 0, return the left most such value.
// If direction>0 then find the first match for which bessel gives >0. *found is set to 1 iff something with >0. The return value is the index of the leftmost such value (if found). In the not-found case, all items are <=0 and the return value is pma->N. // If direction>0 then find the first match for which bessel gives >0. *found is set to 1 iff something with >0. The return value is the index of the leftmost such value (if found). In the not-found case, all items are <=0 and the return value is pma->N.
// If direction<0 then find the last match for which bessel gives <0. *found is set to 1 iff something with <0. The return value is the index of the rightmost such value (if found). In the not-found case, all items are >=0 and the return value is 0. // If direction<0 then find the last match for which bessel gives <0. *found is set to 1 iff something with <0. The return value is the index of the rightmost such value (if found). In the not-found case, all items are >=0 and the return value is 0.
u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int direction, void *extra, int *found) { u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int direction, void *extra, int *found) {
if (direction==0) { if (direction==0) {
int lo=0, hi=pma->N; int lo=0, hi=pma->N;
int foundone = 0;
u_int32_t foundidx = 0;
while (lo<hi) { while (lo<hi) {
int mi = (lo+hi)/2; int mi = (lo+hi)/2;
int look = mi; int look = mi;
...@@ -102,8 +105,10 @@ u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int directi ...@@ -102,8 +105,10 @@ u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int directi
int cmp = besf(pma->items[look].len, pma->items[look].data, extra); int cmp = besf(pma->items[look].len, pma->items[look].data, extra);
if (cmp==0) { if (cmp==0) {
/* We found a match. */ /* We found a match. */
*found=1; foundone = 1;
return look; foundidx=look;
/* But keep looking to the left. */
hi=mi;
} else if (cmp>0) { } else if (cmp>0) {
hi=mi; hi=mi;
} else { } else {
...@@ -111,8 +116,9 @@ u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int directi ...@@ -111,8 +116,9 @@ u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int directi
} }
} }
} }
*found = 0; *found = foundone;
return lo; if (foundone) return foundidx;
else return lo;
} else if (direction<0) { } else if (direction<0) {
// Find the rightmost negative value. // Find the rightmost negative value.
...@@ -371,15 +377,12 @@ int toku_make_space_at (GPMA pma, u_int32_t idx, u_int32_t *newidx, gpma_renumbe ...@@ -371,15 +377,12 @@ int toku_make_space_at (GPMA pma, u_int32_t idx, u_int32_t *newidx, gpma_renumbe
return toku_gpma_smooth_region (pma, lo, hi, count, idx, newidx, rcall, extra, pma->N); return toku_gpma_smooth_region (pma, lo, hi, count, idx, newidx, rcall, extra, pma->N);
} }
int toku_gpma_insert(GPMA pma, static int finish_insert (GPMA pma,
u_int32_t len, void*data, u_int32_t len, void*data,
gpma_compare_fun_t compare, void *extra_for_compare,
gpma_renumber_callback_t rcall, void*extra_for_rcall, // if anything gets renumbered, let the caller know gpma_renumber_callback_t rcall, void*extra_for_rcall, // if anything gets renumbered, let the caller know
u_int32_t *idxp u_int32_t idx,
u_int32_t *idxp // store idx into *idxp (but only do it when we succeed.)
) { ) {
int found;
u_int32_t idx = toku_gpma_find_index(pma, len, data, compare, extra_for_compare, &found);
if (found) return DB_KEYEXIST;
assert(idx<=toku_gpma_index_limit(pma)); assert(idx<=toku_gpma_index_limit(pma));
if (idx==toku_gpma_index_limit(pma) || pma->items[idx].data) { if (idx==toku_gpma_index_limit(pma) || pma->items[idx].data) {
u_int32_t newidx; u_int32_t newidx;
...@@ -395,6 +398,32 @@ int toku_gpma_insert(GPMA pma, ...@@ -395,6 +398,32 @@ int toku_gpma_insert(GPMA pma,
return 0; return 0;
} }
/* Insert (len, data) into `pma`, locating the slot with the comparison function `compare`.
 *
 * Returns DB_KEYEXIST, without modifying the pma, if toku_gpma_find_index() reports a match.
 * Otherwise returns the result of finish_insert(), which receives `rcall`/`extra_for_rcall`
 * for renumbering notifications and `idxp` for reporting the final index.
 */
int toku_gpma_insert(GPMA pma,
                     u_int32_t len, void *data,
                     gpma_compare_fun_t compare, void *extra_for_compare,
                     gpma_renumber_callback_t rcall, void *extra_for_rcall,
                     u_int32_t *idxp) {
    int already_present;
    u_int32_t slot = toku_gpma_find_index(pma, len, data, compare, extra_for_compare, &already_present);
    if (!already_present) {
        return finish_insert(pma, len, data, rcall, extra_for_rcall, slot, idxp);
    }
    return DB_KEYEXIST;
}
/* Insert (len, data) into `pma`, locating the slot with the bessel function `besf`
 * (searched with direction 0, i.e. looking for an item on which `besf` yields 0).
 *
 * Returns DB_KEYEXIST, without modifying the pma, if such an item is found.
 * Otherwise returns the result of finish_insert(), which receives
 * `renumberf`/`extra_for_renumberf` for renumbering notifications and `indexp` for
 * reporting the final index.
 */
int toku_gpma_insert_bessel (GPMA pma,
                             u_int32_t len, void *data,
                             gpma_besselfun_t besf, void *extra_for_besself,
                             gpma_renumber_callback_t renumberf, void *extra_for_renumberf,
                             u_int32_t *indexp) {
    int zero_exists;
    u_int32_t slot = toku_gpma_find_index_bes(pma, besf, 0, extra_for_besself, &zero_exists);
    if (!zero_exists) {
        return finish_insert(pma, len, data, renumberf, extra_for_renumberf, slot, indexp);
    }
    return DB_KEYEXIST;
}
inline int toku_max_int (int a, int b) { inline int toku_max_int (int a, int b) {
return a<b ? b : a; return a<b ? b : a;
} }
...@@ -520,9 +549,13 @@ int toku_gpma_delete_bessel (GPMA pma, ...@@ -520,9 +549,13 @@ int toku_gpma_delete_bessel (GPMA pma,
// Now we know the range and how many items will be deleted. // Now we know the range and how many items will be deleted.
for (i=minidx; i<=maxidx; i++) { for (i=minidx; i<=maxidx; i++) {
if (pma->items[i].data) { if (pma->items[i].data) {
if (deletef) {
r = deletef(i, pma->items[i].len, pma->items[i].data, extra_for_deletef); r = deletef(i, pma->items[i].len, pma->items[i].data, extra_for_deletef);
pma->items[i].data = 0; pma->items[i].data = 0;
if (r!=0) return r; if (r!=0) return r;
} else {
pma->items[i].data = 0;
}
} }
} }
// Now we must find a region that is sufficiently densely packed and spread things out. // Now we must find a region that is sufficiently densely packed and spread things out.
...@@ -566,10 +599,10 @@ int toku_gpma_lookup_item (GPMA pma, ...@@ -566,10 +599,10 @@ int toku_gpma_lookup_item (GPMA pma,
int toku_gpma_lookup_bessel(GPMA pma, gpma_besselfun_t besf, int direction, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idxp) { int toku_gpma_lookup_bessel(GPMA pma, gpma_besselfun_t besf, int direction, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idxp) {
int found; int found;
u_int32_t idx = toku_gpma_find_index_bes(pma, besf, direction, extra, &found); u_int32_t idx = toku_gpma_find_index_bes(pma, besf, direction, extra, &found);
if (idxp) *idxp=idx;
if (found) { if (found) {
*resultlen =pma->items[idx].len; *resultlen =pma->items[idx].len;
*resultdata=pma->items[idx].data; *resultdata=pma->items[idx].data;
if (idxp) *idxp=idx;
return 0; return 0;
} else { } else {
return DB_NOTFOUND; return DB_NOTFOUND;
...@@ -699,7 +732,7 @@ void toku_gpma_set_at_index (GPMA pma, u_int32_t idx, u_int32_t len, void *data) ...@@ -699,7 +732,7 @@ void toku_gpma_set_at_index (GPMA pma, u_int32_t idx, u_int32_t len, void *data)
void toku_gpma_clear_at_index (GPMA pma, u_int32_t idx) { void toku_gpma_clear_at_index (GPMA pma, u_int32_t idx) {
assert(idx<pma->N); assert(idx<pma->N);
if (pma->items[idx].data==0) { if (pma->items[idx].data) {
pma->n_items_present--; pma->n_items_present--;
} }
pma->items[idx].data = 0; pma->items[idx].data = 0;
......
...@@ -42,6 +42,15 @@ int toku_gpma_insert (GPMA, ...@@ -42,6 +42,15 @@ int toku_gpma_insert (GPMA,
gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know
u_int32_t *indexp // Where did the item get stored? u_int32_t *indexp // Where did the item get stored?
); );
// Use a bessel function to determine where to insert the data.
// Puts the new value between the rightmost item for which the bessel function is negative and the leftmost item for which it is positive.
// Requires: Nothing in the pma returns 0.  (If some item does make the bessel function return 0, nothing is inserted and DB_KEYEXIST is returned.)
// On success the index at which the item was stored is written to *indexp.
int toku_gpma_insert_bessel (GPMA pma,
u_int32_t len, void *data,
gpma_besselfun_t, void *extra_for_besself,
gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know
u_int32_t *indexp // Where did the item get stored?
);
// Delete anything for which the besselfun is zero. The besselfun must be monotonically increasing compared to the comparison function. // Delete anything for which the besselfun is zero. The besselfun must be monotonically increasing compared to the comparison function.
// That is, if two othings compare to be < then their besselfun's must yield <=, and if the compare to be = their besselfuns must be =, and if they are > then their besselfuns must be >= // That is, if two othings compare to be < then their besselfun's must yield <=, and if the compare to be = their besselfuns must be =, and if they are > then their besselfuns must be >=
...@@ -69,7 +78,8 @@ int toku_gpma_delete_item (GPMA, ...@@ -69,7 +78,8 @@ int toku_gpma_delete_item (GPMA,
int toku_gpma_lookup_item (GPMA, u_int32_t len, void *data, gpma_compare_fun_t compf, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idx); int toku_gpma_lookup_item (GPMA, u_int32_t len, void *data, gpma_compare_fun_t compf, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idx);
// Lookup something according to the besselfun. // Lookup something according to the besselfun.
// If direction==0 then return something for which the besselfun is zero (or return DB_NOTFOUND). // If direction==0 then return something for which the besselfun is zero (or return DB_NOTFOUND and set the idx to point at the spot where the item would go. That spot may already have an element in it, or it may be off the end.)
// If more than one value is zero, return the leftmost such value.
// If direction>0 then return the first thing for which the besselfun is positive (or return DB_NOTFOUND). // If direction>0 then return the first thing for which the besselfun is positive (or return DB_NOTFOUND).
// If direction<0 then return the last thing for which the besselfun is negative (or return DB_NOTFOUND). // If direction<0 then return the last thing for which the besselfun is negative (or return DB_NOTFOUND).
int toku_gpma_lookup_bessel (GPMA, gpma_besselfun_t, int direction, void*extra, u_int32_t *len, void **data, u_int32_t *idx); int toku_gpma_lookup_bessel (GPMA, gpma_besselfun_t, int direction, void*extra, u_int32_t *len, void **data, u_int32_t *idx);
......
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
#include "brttypes.h" #include "brttypes.h"
#include "crc.h" #include "crc.h"
#include "leafentry.h" #include "leafentry.h"
#include "memory.h" #include "memory.h"
#include "toku_assert.h" #include "toku_assert.h"
#include "log.h"
#include "wbuf.h"
#include <arpa/inet.h> #include <arpa/inet.h>
#include <inttypes.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
enum le_state { LE_COMMITTED=1, // A committed pair.
LE_BOTH, // A committed pair and a provisional pair.
LE_PROVDEL, // A committed pair that has been provisionally deleted
LE_PROVPAIR }; // No committed value, but a provisional pair.
struct leafentry { struct leafentry {
enum typ_tag tag; // Delete this later
char state; char state;
char contents[0]; char contents[0];
} __attribute__((packed)); } __attribute__((packed));
...@@ -21,7 +22,7 @@ struct leafentry { ...@@ -21,7 +22,7 @@ struct leafentry {
struct contents_committed { struct contents_committed {
u_int32_t keylen; u_int32_t keylen;
u_int32_t vallen; u_int32_t vallen;
char *data[0]; char data[0];
} __attribute__((packed)); } __attribute__((packed));
struct contents_both { struct contents_both {
...@@ -29,112 +30,92 @@ struct contents_both { ...@@ -29,112 +30,92 @@ struct contents_both {
u_int32_t keylen; u_int32_t keylen;
u_int32_t committed_vallen; u_int32_t committed_vallen;
u_int32_t prov_vallen; u_int32_t prov_vallen;
char *data[0]; char data[0];
} __attribute__((packed)); } __attribute__((packed));
struct contents_provdelorpair { // Te PROVDEL or PROVPAIR cases struct contents_provdelorpair { // The PROVDEL or PROVPAIR cases
TXNID xid; TXNID xid;
u_int32_t keylen; u_int32_t keylen;
u_int32_t vallen; u_int32_t vallen;
char *data[0]; char data[0];
} __attribute__((packed)); } __attribute__((packed));
static u_int32_t committed_keylen (void*cev) { enum le_state get_le_state(LEAFENTRY le) {
return le->state;
}
// Return a pointer to the start of the leaf entry's `contents` array (the bytes after the fixed fields).
// How those bytes are laid out presumably depends on le->state (see the contents_* structs) -- confirm.
void* get_le_contents(LEAFENTRY le) {
return &le->contents[0];
}
// Return the leaf entry's `tag` field (the struct marks that field "Delete this later").
enum typ_tag get_le_tag(LEAFENTRY le) {
return le->tag;
}
u_int32_t committed_keylen (void*cev) {
struct contents_committed *ce=cev; struct contents_committed *ce=cev;
return ce->keylen; return ce->keylen;
} }
static void* committed_key (void*cev) { void* committed_key (void*cev) {
struct contents_committed *ce=cev; struct contents_committed *ce=cev;
return &ce->data[0]; return &ce->data[0];
} }
static u_int32_t committed_vallen (struct contents_committed *ce) { u_int32_t committed_vallen (struct contents_committed *ce) {
return ce->vallen; return ce->vallen;
} }
static void* committed_val (struct contents_committed *ce) { void* committed_val (struct contents_committed *ce) {
return &ce->data[ce->keylen]; return &ce->data[ce->keylen];
} }
static TXNID both_xid (struct contents_both *ce) { TXNID both_xid (struct contents_both *ce) {
return ce->xid; return ce->xid;
} }
static u_int32_t both_keylen (struct contents_both *ce) { u_int32_t both_keylen (struct contents_both *ce) {
return ce->keylen; return ce->keylen;
} }
static u_int32_t both_committed_vallen (struct contents_both *ce) { u_int32_t both_committed_vallen (struct contents_both *ce) {
return ce->committed_vallen; return ce->committed_vallen;
} }
static u_int32_t both_prov_vallen (struct contents_both *ce) { u_int32_t both_prov_vallen (struct contents_both *ce) {
return ce->prov_vallen; return ce->prov_vallen;
} }
static void* both_key (struct contents_both *ce) { void* both_key (struct contents_both *ce) {
return &ce->data[0]; return &ce->data[0];
} }
static void* both_committed_val (struct contents_both *ce) { void* both_committed_val (struct contents_both *ce) {
return &ce->data[ce->keylen]; return &ce->data[ce->keylen];
} }
static void* both_prov_val (struct contents_both*ce) { void* both_prov_val (struct contents_both*ce) {
return &ce->data[ce->keylen+ce->committed_vallen]; return &ce->data[ce->keylen+ce->committed_vallen];
} }
static TXNID provdelorpair_xid (struct contents_provdelorpair *ce) { TXNID provdelorpair_xid (struct contents_provdelorpair *ce) {
return ce->xid; return ce->xid;
} }
static u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce) { u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce) {
return ce->keylen; return ce->keylen;
} }
static u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce) { u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce) {
return ce->vallen; return ce->vallen;
} }
static void* provdelorpair_key (struct contents_provdelorpair *ce) { void* provdelorpair_key (struct contents_provdelorpair *ce) {
return &ce->data[0]; return &ce->data[0];
} }
static void* provdelorpair_val (struct contents_provdelorpair *ce) { void* provdelorpair_val (struct contents_provdelorpair *ce) {
return &ce->data[ce->keylen]; return &ce->data[ce->keylen];
} }
#define LESWITCHCALL(le,funname, ...) ({ \
switch((enum le_state)((le)->state)) { \
case LE_COMMITTED: return funname ## _le_committed( committed_keylen((struct contents_committed*)&(le)->contents), \
committed_key((struct contents_committed*)&(le)->contents), \
committed_vallen((struct contents_committed*)&(le)->contents), \
committed_val((struct contents_committed*)&(le)->contents), \
## __VA_ARGS__); \
case LE_BOTH: return funname ## _le_both( both_xid((struct contents_both*)&(le)->contents), \
both_keylen((struct contents_both*)&(le)->contents), \
both_key((struct contents_both*)&(le)->contents), \
both_committed_vallen((struct contents_both*)&(le)->contents), \
both_committed_val((struct contents_both*)&(le)->contents), \
both_prov_vallen((struct contents_both*)&(le)->contents), \
both_prov_val((struct contents_both*)&(le)->contents), \
## __VA_ARGS__); \
case LE_PROVDEL: return funname ## _le_provdel ( provdelorpair_xid((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_keylen((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_key((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_vallen((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_val((struct contents_provdelorpair*)&(le)->contents), \
## __VA_ARGS__); \
case LE_PROVPAIR: return funname ## _le_provpair(provdelorpair_xid((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_keylen((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_key((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_vallen((struct contents_provdelorpair*)&(le)->contents), \
provdelorpair_val((struct contents_provdelorpair*)&(le)->contents), \
## __VA_ARGS__); \
} abort(); })
static u_int32_t crc_uint32_t (u_int32_t crc, u_int32_t v) { static u_int32_t crc_uint32_t (u_int32_t crc, u_int32_t v) {
u_int32_t i = htonl(v); u_int32_t i = htonl(v);
return toku_crc32(crc, &i, 4); return toku_crc32(crc, &i, 4);
...@@ -183,53 +164,463 @@ u_int32_t toku_le_crc(LEAFENTRY v) { ...@@ -183,53 +164,463 @@ u_int32_t toku_le_crc(LEAFENTRY v) {
LESWITCHCALL(v, crc, crc); LESWITCHCALL(v, crc, crc);
} }
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp) { int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) {
if (toku_mempool_get_frag_size(memp) == 0) struct contents_committed *ce;
return -1; LEAFENTRY le;
void *newmem = toku_malloc(memp->size); size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen;
if (newmem == 0) le=toku_malloc(size);
return -2; le->tag = TYP_LEAFENTRY;
struct mempool new_kvspace; le->state= LE_COMMITTED;
toku_mempool_init(&new_kvspace, newmem, memp->size); ce=(struct contents_committed*)&le->contents[0];
GPMA_ITERATE(pma, idx, len, data, ce->keylen = klen;
({ ce->vallen = dlen;
void *newdata = toku_mempool_malloc(&new_kvspace, (size_t)len, 4); memcpy(&ce->data[0], kval, (size_t)klen);
assert(newdata); memcpy(&ce->data[klen], dval, (size_t)dlen);
memcpy(newdata, data, (size_t)len); *resultsize=size;
toku_gpma_set_at_index(pma, idx, len, newdata); *disksize = 1 + 4 + 4 + klen + dlen;
// toku_verify_gpma(pma); *result=le;
}));
toku_free(memp->base);
*memp = new_kvspace;
// toku_verify_gpma(pma);
return 0; return 0;
} }
int le_both (TXNID xid, u_int32_t klen, void* kval, u_int32_t clen, void* cval, u_int32_t plen, void* pval,
u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) {
struct contents_both *ce;
LEAFENTRY le;
size_t size = sizeof(*le)+sizeof(*ce)+klen+plen+clen;
le=toku_malloc(size);
le->tag = TYP_LEAFENTRY;
le->state= LE_BOTH;
ce=(struct contents_both*)&le->contents[0];
ce->xid = xid;
ce->keylen = klen;
ce->committed_vallen = clen;
ce->prov_vallen = plen;
memcpy(&ce->data[0], kval, (size_t)klen);
memcpy(&ce->data[klen], cval, (size_t)clen);
memcpy(&ce->data[klen+clen], pval, (size_t)plen);
*resultsize=size;
*disksize = 1 + 8 + 4*3 + klen + clen + plen;
*result=le;
return 0;
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size) {
void *v = toku_mempool_malloc(mp, size, 4);
if (v==0) {
if (0 == toku_gpma_compress_kvspace(pma, mp)) {
v = toku_mempool_malloc(mp, size, 4);
assert(v);
}
}
return v;
} }
int le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
int le_committed (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, GPMA pma, struct mempool *mp, LEAFENTRY *result) { u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result) {
struct contents_committed *ce; struct contents_provdelorpair *ce;
LEAFENTRY le=mempool_malloc_from_gpma(pma, mp, sizeof(*le)+sizeof(*ce)+klen+dlen); LEAFENTRY le;
le->state=LE_COMMITTED; size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen;
ce=(struct contents_committed*)&le->contents[0]; le=toku_malloc(size);
le->tag = TYP_LEAFENTRY;
le->state= LE_PROVDEL;
ce=(struct contents_provdelorpair*)&le->contents[0];
ce->xid = xid;
ce->keylen = klen;
ce->vallen = dlen;
memcpy(&ce->data[0], kval, (size_t)klen);
memcpy(&ce->data[klen], dval, (size_t)dlen);
*memsize=size;
*disksize = 1 + 4 + 4 + 8 + klen + dlen;
*result=le;
return 0;
}
int le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result) {
struct contents_provdelorpair *ce;
LEAFENTRY le;
size_t size = sizeof(*le)+sizeof(*ce)+klen+dlen;
le=toku_malloc(size);
le->tag = TYP_LEAFENTRY;
le->state= LE_PROVPAIR;
ce=(struct contents_provdelorpair*)&le->contents[0];
ce->xid = xid;
ce->keylen = klen; ce->keylen = klen;
ce->vallen = dlen; ce->vallen = dlen;
memcpy(&ce->data[0], kval, (size_t)klen); memcpy(&ce->data[0], kval, (size_t)klen);
memcpy(&ce->data[klen], dval, (size_t)dlen); memcpy(&ce->data[klen], dval, (size_t)dlen);
*resultsize=size;
*disksize = 1 + 4 + 4 + 8 + klen + dlen;
*result=le; *result=le;
return 0; return 0;
} }
int le_both (ITEMLEN cklen, bytevec ckval, ITEMLEN cdlen, bytevec cdval, ITEMLEN pdlen, bytevec pdval,
struct mempool *mp, LEAFENTRY *result); static u_int32_t memsize_le_committed (u_int32_t keylen, void *key __attribute__((__unused__)),
int le_provdel (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); u_int32_t vallen, void *val __attribute__((__unused__))) {
int le_provpair (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); return sizeof(struct leafentry) + sizeof(struct contents_committed) + keylen + vallen;
}
// In-memory size of an LE_BOTH leafentry: the leafentry header, the contents_both
// struct, and the key, committed-value and provisional-value bytes that follow it.
// Only the three lengths are used; the other arguments exist to match LESWITCHCALL.
static u_int32_t memsize_le_both (TXNID txnid __attribute__((__unused__)),
                                  u_int32_t klen,
                                  void *kval __attribute__((__unused__)),
                                  u_int32_t clen,
                                  void *cval __attribute__((__unused__)),
                                  u_int32_t plen,
                                  void *pval __attribute__((__unused__))) {
    size_t fixed_part = sizeof(struct leafentry) + sizeof(struct contents_both);
    return fixed_part + klen + clen + plen;
}
// In-memory size of an LE_PROVDEL leafentry: the leafentry header, the
// contents_provdelorpair struct, and the key and committed-value bytes.
static u_int32_t memsize_le_provdel (TXNID txnid __attribute__((__unused__)),
                                     u_int32_t klen,
                                     void *kval __attribute__((__unused__)),
                                     u_int32_t clen,
                                     void *cval __attribute__((__unused__))) {
    size_t fixed_part = sizeof(struct leafentry) + sizeof(struct contents_provdelorpair);
    return fixed_part + klen + clen;
}
// In-memory size of an LE_PROVPAIR leafentry: the leafentry header, the
// contents_provdelorpair struct, and the key and provisional-value bytes.
static u_int32_t memsize_le_provpair (TXNID txnid __attribute__((__unused__)),
                                      u_int32_t klen,
                                      void *kval __attribute__((__unused__)),
                                      u_int32_t plen,
                                      void *pval __attribute__((__unused__))) {
    size_t fixed_part = sizeof(struct leafentry) + sizeof(struct contents_provdelorpair);
    return fixed_part + klen + plen;
}
// Return the size of a leafentry in memory.
// LESWITCHCALL expands to a switch on the entry's state that returns the result of the
// matching memsize_le_* helper (and aborts on an unknown state), so no explicit return is needed here.
u_int32_t leafentry_memsize (LEAFENTRY le) {
LESWITCHCALL(le, memsize);
}
// Serialized size of an LE_COMMITTED leafentry: 1 + 4 + 4 fixed bytes plus the key and value bytes.
// NOTE(review): the fixed bytes are presumably the state byte and two 32-bit length prefixes; confirm against wbuf_LEAFENTRY.
static u_int32_t disksize_le_committed (u_int32_t keylen,
                                        void *key __attribute__((__unused__)),
                                        u_int32_t vallen,
                                        void *val __attribute__((__unused__))) {
    const u_int32_t fixed_overhead = 1 + 4 + 4;
    return fixed_overhead + keylen + vallen;
}
// Serialized size of an LE_BOTH leafentry: 1 + 8 + 3*4 fixed bytes plus the key,
// committed-value and provisional-value bytes.
// NOTE(review): the 8 is presumably the transaction id and each 4 a length prefix; confirm against wbuf_LEAFENTRY.
static u_int32_t disksize_le_both (TXNID txnid __attribute__((__unused__)),
                                   u_int32_t klen,
                                   void *kval __attribute__((__unused__)),
                                   u_int32_t clen,
                                   void *cval __attribute__((__unused__)),
                                   u_int32_t plen,
                                   void *pval __attribute__((__unused__))) {
    const u_int32_t fixed_overhead = 1 + 8 + 4*3;
    return fixed_overhead + klen + clen + plen;
}
// Serialized size of an LE_PROVDEL leafentry: 1 + 8 + 4 + 4 fixed bytes plus the key and committed-value bytes.
static u_int32_t disksize_le_provdel (TXNID txnid __attribute__((__unused__)),
                                      u_int32_t klen,
                                      void *kval __attribute__((__unused__)),
                                      u_int32_t clen,
                                      void *cval __attribute__((__unused__))) {
    const u_int32_t fixed_overhead = 1 + 8 + 4 + 4;
    return fixed_overhead + klen + clen;
}
// Serialized size of an LE_PROVPAIR leafentry: 1 + 8 + 4 + 4 fixed bytes plus the key and provisional-value bytes.
static u_int32_t disksize_le_provpair (TXNID txnid __attribute__((__unused__)),
                                       u_int32_t klen,
                                       void *kval __attribute__((__unused__)),
                                       u_int32_t plen,
                                       void *pval __attribute__((__unused__))) {
    const u_int32_t fixed_overhead = 1 + 8 + 4 + 4;
    return fixed_overhead + klen + plen;
}
// Return the serialized size of a leafentry.
// LESWITCHCALL returns the result of the disksize_le_* helper matching the entry's state
// (and aborts on an unknown state), so no explicit return is needed here.
u_int32_t leafentry_disksize (LEAFENTRY le) {
LESWITCHCALL(le, disksize);
}
// Size of a leafentry when written to the log; identical to its serialized size.
u_int32_t toku_logsizeof_LEAFENTRY (LEAFENTRY le) {
    u_int32_t serialized_bytes = leafentry_disksize(le);
    return serialized_bytes;
}
// Read one leafentry from a log file.
//   f    - stream to read from.
//   le   - out: receives a newly allocated leafentry on success.
//   crc, len - passed through to every toku_fread_* call.
// Returns 0 on success, the first nonzero error from a read or constructor,
// or DB_BADFORMAT if the state byte is not a known le_state.
// Every read is checked (including the transaction id), and byte strings that were
// already read are released if a later step fails.
// NOTE(review): assumes a failing toku_fread_BYTESTRING leaves nothing to free — confirm.
int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, u_int32_t *crc, u_int32_t *len) {
    u_int8_t state;
    int r = toku_fread_u_int8_t (f, &state, crc, len);
    if (r!=0) return r;
    TXNID xid;
    BYTESTRING a,b,c;
    u_int32_t memsize, disksize; // demanded by the le_* constructors; not reported to our caller
    switch ((enum le_state)state) {
    case LE_COMMITTED:
        r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) goto free_a;
        r = le_committed(a.len, a.data, b.len, b.data,
                         &memsize, &disksize, le);
        goto free_b;
    case LE_BOTH:
        r = toku_fread_TXNID(f, &xid, crc, len);    if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) goto free_a;
        r = toku_fread_BYTESTRING(f, &c, crc, len); if (r!=0) goto free_b;
        r = le_both(xid, a.len, a.data, b.len, b.data, c.len, c.data,
                    &memsize, &disksize, le);
        toku_free_BYTESTRING(c);
        goto free_b;
    case LE_PROVDEL:
        r = toku_fread_TXNID(f, &xid, crc, len);    if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) goto free_a;
        r = le_provdel(xid, a.len, a.data, b.len, b.data,
                       &memsize, &disksize, le);
        goto free_b;
    case LE_PROVPAIR:
        r = toku_fread_TXNID(f, &xid, crc, len);    if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &a, crc, len); if (r!=0) return r;
        r = toku_fread_BYTESTRING(f, &b, crc, len); if (r!=0) goto free_a;
        r = le_provpair(xid, a.len, a.data, b.len, b.data,
                        &memsize, &disksize, le);
        goto free_b;
    }
    return DB_BADFORMAT;

    // Shared cleanup: the constructors copy the bytes, so the temporaries are always released.
 free_b:
    toku_free_BYTESTRING(b);
 free_a:
    toku_free_BYTESTRING(a);
    return r;
}
// Print an LE_COMMITTED leafentry as "{C: <key><val>}" to outf. Always returns 0.
static int print_le_committed (u_int32_t keylen, void *key, u_int32_t vallen, void *val, FILE *outf) {
    fputs("{C: ", outf);
    toku_print_BYTESTRING(outf, keylen, key);
    toku_print_BYTESTRING(outf, vallen, val);
    fputs("}", outf);
    return 0;
}
// Print an LE_BOTH leafentry to outf: the transaction id, the key, the committed value and
// the provisional value, each with a label. Always returns 0.
// The committed value now carries a " committed=" label, as print_le_provdel already does;
// without it the committed value was printed directly after the key.
static int print_le_both (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, u_int32_t plen, void *pval, FILE *outf) {
    fprintf(outf, "{B: ");
    fprintf(outf, " xid=%" PRId64, xid);
    fprintf(outf, " key=");
    toku_print_BYTESTRING(outf, klen, kval);
    fprintf(outf, " committed=");
    toku_print_BYTESTRING(outf, clen, cval);
    fprintf(outf, " provisional=");
    toku_print_BYTESTRING(outf, plen, pval);
    fprintf(outf, "}");
    return 0;
}
// Print an LE_PROVDEL leafentry to outf: the transaction id, the key and the committed value. Always returns 0.
static int print_le_provdel (TXNID xid, u_int32_t klen, void *kval, u_int32_t clen, void *cval, FILE *outf) {
    fputs("{D: ", outf);
    fprintf(outf, " xid=%" PRId64, xid);
    fputs(" key=", outf);
    toku_print_BYTESTRING(outf, klen, kval);
    fputs(" committed=", outf);
    toku_print_BYTESTRING(outf, clen, cval);
    fputs("}", outf);
    return 0;
}
// Print an LE_PROVPAIR leafentry to outf: the transaction id, the key and the provisional value. Always returns 0.
static int print_le_provpair (TXNID xid, u_int32_t klen, void *kval, u_int32_t plen, void *pval, FILE *outf) {
    fputs("{P: ", outf);
    fprintf(outf, " xid=%" PRId64, xid);
    fputs(" key=", outf);
    toku_print_BYTESTRING(outf, klen, kval);
    fputs(" provisional=", outf);
    toku_print_BYTESTRING(outf, plen, pval);
    fputs("}", outf);
    return 0;
}
// Print a leafentry to outf in human-readable form.
// A NULL leafentry prints nothing and returns 0. Otherwise LESWITCHCALL returns the
// result of the print_le_* helper matching the entry's state (and aborts on an unknown state).
int print_leafentry (FILE *outf, LEAFENTRY v) {
if (!v) return 0;
LESWITCHCALL(v, print, outf);
}
// Read one leafentry from inf and print it to outf as " <fieldname>=<entry>".
// Returns the read error if reading fails, otherwise 0. The entry is freed before returning.
int toku_logprint_LEAFENTRY (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format __attribute__((__unused__))) {
    LEAFENTRY entry;
    int read_status = toku_fread_LEAFENTRY(inf, &entry, crc, len);
    if (read_status != 0) {
        return read_status;
    }
    fprintf(outf, " %s=", fieldname);
    print_leafentry(outf, entry);
    toku_free(entry);
    return 0;
}
// Append the body of an LE_COMMITTED leafentry to w: the key bytes, then the value bytes.
// The state byte is written by the caller. Always returns 0.
static int wbuf_le_committed (u_int32_t keylen,
                              void *key,
                              u_int32_t vallen,
                              void *val,
                              struct wbuf *w) {
    wbuf_bytes(w, key, keylen);
    wbuf_bytes(w, val, vallen);
    return 0;
}
// Append the body of an LE_BOTH leafentry to w: the transaction id, the key,
// the committed value, then the provisional value. Always returns 0.
static int wbuf_le_both (TXNID xid,
                         u_int32_t klen,
                         void *kval,
                         u_int32_t clen,
                         void *cval,
                         u_int32_t plen,
                         void *pval,
                         struct wbuf *w) {
    wbuf_TXNID(w, xid);
    wbuf_bytes(w, kval, klen);
    wbuf_bytes(w, cval, clen);
    wbuf_bytes(w, pval, plen);
    return 0;
}
// Append the body of an LE_PROVDEL leafentry to w: the transaction id, the key,
// then the committed value. Always returns 0.
static int wbuf_le_provdel (TXNID xid,
                            u_int32_t klen,
                            void *kval,
                            u_int32_t clen,
                            void *cval,
                            struct wbuf *w) {
    wbuf_TXNID(w, xid);
    wbuf_bytes(w, kval, klen);
    wbuf_bytes(w, cval, clen);
    return 0;
}
// Append the body of an LE_PROVPAIR leafentry to w: the transaction id, the key,
// then the provisional value. Always returns 0.
static int wbuf_le_provpair (TXNID xid,
                             u_int32_t klen,
                             void *kval,
                             u_int32_t plen,
                             void *pval,
                             struct wbuf *w) {
    wbuf_TXNID(w, xid);
    wbuf_bytes(w, kval, klen);
    wbuf_bytes(w, pval, plen);
    return 0;
}
// Write the state-specific body of a leafentry to w.
// LESWITCHCALL returns the result of the wbuf_le_* helper matching the entry's state
// (and aborts on an unknown state). The state byte itself is not written here.
static int do_wbuf_le (struct wbuf *w, LEAFENTRY le) {
LESWITCHCALL(le, wbuf, w);
}
// Serialize a leafentry into w: first the state as a single char, then the state-specific body.
void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le) {
    unsigned int state_byte = (unsigned int)le->state;
    wbuf_char(w, state_byte);
    do_wbuf_le(w, le);
}
// Deserialize one leafentry from r.
// Reads the state char, then the fields that belong to that state, and builds a new
// leafentry through the matching le_* constructor, which fills *resultsize, *disksize and *le.
// An unrecognized state falls out of the switch and trips assert(0).
void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *le) {
    enum le_state state = rbuf_char(r);
    switch (state) {
    case LE_COMMITTED: {
        // committed: key, value
        bytevec key_bytes;
        bytevec val_bytes;
        u_int32_t key_size;
        u_int32_t val_size;
        rbuf_bytes(r, &key_bytes, &key_size);
        rbuf_bytes(r, &val_bytes, &val_size);
        le_committed(key_size, (void*)key_bytes, val_size, (void*)val_bytes, resultsize, disksize, le);
        return;
    }
    case LE_BOTH: {
        // both: xid, key, committed value, provisional value
        bytevec key_bytes;
        bytevec committed_bytes;
        bytevec prov_bytes;
        u_int32_t key_size;
        u_int32_t committed_size;
        u_int32_t prov_size;
        TXNID txn = rbuf_ulonglong(r);
        rbuf_bytes(r, &key_bytes, &key_size);
        rbuf_bytes(r, &committed_bytes, &committed_size);
        rbuf_bytes(r, &prov_bytes, &prov_size);
        le_both(txn, key_size, (void*)key_bytes, committed_size, (void*)committed_bytes, prov_size, (void*)prov_bytes, resultsize, disksize, le);
        return;
    }
    case LE_PROVDEL: {
        // provisional delete: xid, key, committed value
        bytevec key_bytes;
        bytevec committed_bytes;
        u_int32_t key_size;
        u_int32_t committed_size;
        TXNID txn = rbuf_ulonglong(r);
        rbuf_bytes(r, &key_bytes, &key_size);
        rbuf_bytes(r, &committed_bytes, &committed_size);
        le_provdel(txn, key_size, (void*)key_bytes, committed_size, (void*)committed_bytes, resultsize, disksize, le);
        return;
    }
    case LE_PROVPAIR: {
        // provisional pair: xid, key, provisional value
        bytevec key_bytes;
        bytevec prov_bytes;
        u_int32_t key_size;
        u_int32_t prov_size;
        TXNID txn = rbuf_ulonglong(r);
        rbuf_bytes(r, &key_bytes, &key_size);
        rbuf_bytes(r, &prov_bytes, &prov_size);
        le_provpair(txn, key_size, (void*)key_bytes, prov_size, (void*)prov_bytes, resultsize, disksize, le);
        return;
    }
    }
    assert(0);
}
// Release a leafentry by handing it to toku_free().
void toku_free_LEAFENTRY(LEAFENTRY le)
{
    toku_free(le);
}
// Return 1 if the leafentry is a provisional delete (state LE_PROVDEL), otherwise 0.
int le_is_provdel(LEAFENTRY le) {
    if (le->state == LE_PROVDEL) {
        return 1;
    }
    return 0;
}
// Latest key of a committed entry: the committed key.
void* latest_key_le_committed (u_int32_t UU(keylen),
                               void *key,
                               u_int32_t UU(vallen),
                               void *UU(val)) {
    return key;
}
// Latest key of a BOTH entry: the entry's single key.
void* latest_key_le_both (TXNID UU(xid),
                          u_int32_t UU(klen),
                          void *kval,
                          u_int32_t UU(clen),
                          void *UU(cval),
                          u_int32_t UU(plen),
                          void *UU(pval)) {
    return kval;
}
// A provisionally deleted entry has no latest key, so this yields a null pointer.
void* latest_key_le_provdel (TXNID UU(xid),
                             u_int32_t UU(klen),
                             void *UU(kval),
                             u_int32_t UU(clen),
                             void *UU(cval)) {
    return 0;
}
// Latest key of a provisional pair: its key.
void* latest_key_le_provpair (TXNID UU(xid),
                              u_int32_t UU(klen),
                              void *kval,
                              u_int32_t UU(plen),
                              void *UU(pval)) {
    return kval;
}
// Return the latest key of a leafentry (NULL for a provisional delete).
// LESWITCHCALL returns the result of the latest_key_le_* helper matching the entry's state.
void* le_latest_key (LEAFENTRY le) {
LESWITCHCALL(le, latest_key);
}
// Length of the latest key of a committed entry: the committed key length.
u_int32_t latest_keylen_le_committed (u_int32_t keylen,
                                      void *UU(key),
                                      u_int32_t UU(vallen),
                                      void *UU(val)) {
    return keylen;
}
// Length of the latest key of a BOTH entry: the entry's key length.
u_int32_t latest_keylen_le_both (TXNID UU(xid),
                                 u_int32_t klen,
                                 void *UU(kval),
                                 u_int32_t UU(clen),
                                 void *UU(cval),
                                 u_int32_t UU(plen),
                                 void *UU(pval)) {
    return klen;
}
// A provisionally deleted entry has no latest key, so its latest key length is reported as 0.
u_int32_t latest_keylen_le_provdel (TXNID UU(xid),
                                    u_int32_t UU(klen),
                                    void *UU(kval),
                                    u_int32_t UU(clen),
                                    void *UU(cval)) {
    return 0;
}
// Length of the latest key of a provisional pair: its key length.
u_int32_t latest_keylen_le_provpair (TXNID UU(xid),
                                     u_int32_t klen,
                                     void *UU(kval),
                                     u_int32_t UU(plen),
                                     void *UU(pval)) {
    return klen;
}
// Return the length of the latest key of a leafentry (0 for a provisional delete).
// LESWITCHCALL returns the result of the latest_keylen_le_* helper matching the entry's state.
u_int32_t le_latest_keylen (LEAFENTRY le) {
LESWITCHCALL(le, latest_keylen);
}
// Latest value of a committed entry: the committed value.
void* latest_val_le_committed (u_int32_t UU(keylen),
                               void *UU(key),
                               u_int32_t UU(vallen),
                               void *val) {
    return val;
}
// Latest value of a BOTH entry: the provisional value.
// This must pair with latest_vallen_le_both, which reports the provisional length (plen);
// returning the committed value here would hand callers a pointer and a length that
// describe two different values.
void* latest_val_le_both (TXNID UU(xid),
                          u_int32_t UU(klen),
                          void *UU(kval),
                          u_int32_t UU(clen),
                          void *UU(cval),
                          u_int32_t UU(plen),
                          void *pval) {
    return pval;
}
// A provisionally deleted entry has no latest value, so this yields a null pointer.
void* latest_val_le_provdel (TXNID UU(xid),
                             u_int32_t UU(klen),
                             void *UU(kval),
                             u_int32_t UU(clen),
                             void *UU(cval)) {
    return 0;
}
// Latest value of a provisional pair: the provisional value.
void* latest_val_le_provpair (TXNID UU(xid),
                              u_int32_t UU(klen),
                              void *UU(kval),
                              u_int32_t UU(plen),
                              void *pval) {
    return pval;
}
// Return the latest value of a leafentry (NULL for a provisional delete).
// LESWITCHCALL returns the result of the latest_val_le_* helper matching the entry's state.
void* le_latest_val (LEAFENTRY le) {
LESWITCHCALL(le, latest_val);
}
// Length of the latest value of a committed entry: the committed value length.
u_int32_t latest_vallen_le_committed (u_int32_t UU(keylen),
                                      void *UU(key),
                                      u_int32_t vallen,
                                      void *UU(val)) {
    return vallen;
}
// Length of the latest value of a BOTH entry: the provisional value length.
u_int32_t latest_vallen_le_both (TXNID UU(xid),
                                 u_int32_t UU(klen),
                                 void *UU(kval),
                                 u_int32_t UU(clen),
                                 void *UU(cval),
                                 u_int32_t plen,
                                 void *UU(pval)) {
    return plen;
}
// A provisionally deleted entry has no latest value, so its latest value length is reported as 0.
u_int32_t latest_vallen_le_provdel (TXNID UU(xid),
                                    u_int32_t UU(klen),
                                    void *UU(kval),
                                    u_int32_t UU(clen),
                                    void *UU(cval)) {
    return 0;
}
// Length of the latest value of a provisional pair: the provisional value length.
u_int32_t latest_vallen_le_provpair (TXNID UU(xid),
                                     u_int32_t UU(klen),
                                     void *UU(kval),
                                     u_int32_t plen,
                                     void *UU(pval)) {
    return plen;
}
// Return the length of the latest value of a leafentry (0 for a provisional delete).
// LESWITCHCALL returns the result of the latest_vallen_le_* helper matching the entry's state.
u_int32_t le_latest_vallen (LEAFENTRY le) {
LESWITCHCALL(le, latest_vallen);
}
// Any key of a committed entry: the committed key.
void* any_key_le_committed (u_int32_t UU(keylen),
                            void *key,
                            u_int32_t UU(vallen),
                            void *UU(val)) {
    return key;
}
// Any key of a BOTH entry: the entry's single key.
void* any_key_le_both (TXNID UU(xid),
                       u_int32_t UU(klen),
                       void *kval,
                       u_int32_t UU(clen),
                       void *UU(cval),
                       u_int32_t UU(plen),
                       void *UU(pval)) {
    return kval;
}
// Any key of a provisionally deleted entry: the key is still returned, unlike the "latest" accessors.
void* any_key_le_provdel (TXNID UU(xid),
                          u_int32_t UU(klen),
                          void *kval,
                          u_int32_t UU(clen),
                          void *UU(cval)) {
    return kval;
}
// Any key of a provisional pair: its key.
void* any_key_le_provpair (TXNID UU(xid),
                           u_int32_t UU(klen),
                           void *kval,
                           u_int32_t UU(plen),
                           void *UU(pval)) {
    return kval;
}
// Return a key for the leafentry in every state, including provisional deletes.
// LESWITCHCALL returns the result of the any_key_le_* helper matching the entry's state.
void* le_any_key (LEAFENTRY le) {
LESWITCHCALL(le, any_key);
}
// Key length of a committed entry.
u_int32_t any_keylen_le_committed (u_int32_t keylen,
                                   void *UU(key),
                                   u_int32_t UU(vallen),
                                   void *UU(val)) {
    return keylen;
}
// Key length of a BOTH entry.
u_int32_t any_keylen_le_both (TXNID UU(xid),
                              u_int32_t klen,
                              void *UU(kval),
                              u_int32_t UU(clen),
                              void *UU(cval),
                              u_int32_t UU(plen),
                              void *UU(pval)) {
    return klen;
}
// Key length of a provisionally deleted entry; still reported, unlike the "latest" accessors.
u_int32_t any_keylen_le_provdel (TXNID UU(xid),
                                 u_int32_t klen,
                                 void *UU(kval),
                                 u_int32_t UU(clen),
                                 void *UU(cval)) {
    return klen;
}
// Key length of a provisional pair.
u_int32_t any_keylen_le_provpair (TXNID UU(xid),
                                  u_int32_t klen,
                                  void *UU(kval),
                                  u_int32_t UU(plen),
                                  void *UU(pval)) {
    return klen;
}
// Return the key length of the leafentry in every state, including provisional deletes.
// LESWITCHCALL returns the result of the any_keylen_le_* helper matching the entry's state.
u_int32_t le_any_keylen (LEAFENTRY le) {
LESWITCHCALL(le, any_keylen);
}
// Any value of a committed entry: the committed value.
void* any_val_le_committed (u_int32_t UU(keylen),
                            void *UU(key),
                            u_int32_t UU(vallen),
                            void *val) {
    return val;
}
// Any value of a BOTH entry: this accessor hands back the committed value.
void* any_val_le_both (TXNID UU(xid),
                       u_int32_t UU(klen),
                       void *UU(kval),
                       u_int32_t UU(clen),
                       void *cval,
                       u_int32_t UU(plen),
                       void *UU(pval)) {
    return cval;
}
// Any value of a provisionally deleted entry: the committed value.
void* any_val_le_provdel (TXNID UU(xid),
                          u_int32_t UU(klen),
                          void *UU(kval),
                          u_int32_t UU(clen),
                          void *cval) {
    return cval;
}
// Any value of a provisional pair: the provisional value.
void* any_val_le_provpair (TXNID UU(xid),
                           u_int32_t UU(klen),
                           void *UU(kval),
                           u_int32_t UU(plen),
                           void *pval) {
    return pval;
}
// Return a value for the leafentry in every state, including provisional deletes.
// LESWITCHCALL returns the result of the any_val_le_* helper matching the entry's state.
void* le_any_val (LEAFENTRY le) {
LESWITCHCALL(le, any_val);
}
// Value length of a committed entry.
u_int32_t any_vallen_le_committed (u_int32_t UU(keylen),
                                   void *UU(key),
                                   u_int32_t vallen,
                                   void *UU(val)) {
    return vallen;
}
// Value length of a BOTH entry, for the value that any_val_le_both returns.
// any_val_le_both hands back the committed value, so the matching length is the
// committed length; reporting the provisional length would pair a pointer with the
// length of a different value.
u_int32_t any_vallen_le_both (TXNID UU(xid),
                              u_int32_t UU(klen),
                              void *UU(kval),
                              u_int32_t clen,
                              void *UU(cval),
                              u_int32_t UU(plen),
                              void *UU(pval)) {
    return clen;
}
// Value length of a provisionally deleted entry: the committed value length,
// matching the committed value returned by any_val_le_provdel.
u_int32_t any_vallen_le_provdel (TXNID UU(xid),
                                 u_int32_t UU(klen),
                                 void *UU(kval),
                                 u_int32_t clen,
                                 void *UU(cval)) {
    return clen;
}
// Value length of a provisional pair: the provisional value length.
u_int32_t any_vallen_le_provpair (TXNID UU(xid),
                                  u_int32_t UU(klen),
                                  void *UU(kval),
                                  u_int32_t plen,
                                  void *UU(pval)) {
    return plen;
}
// Return a value length for the leafentry in every state, including provisional deletes.
// LESWITCHCALL returns the result of the any_vallen_le_* helper matching the entry's state.
u_int32_t le_any_vallen (LEAFENTRY le) {
LESWITCHCALL(le, any_vallen);
}
...@@ -30,19 +30,104 @@ ...@@ -30,19 +30,104 @@
#include "mempool.h" #include "mempool.h"
#include "brttypes.h" #include "brttypes.h"
#include "gpma.h" #include "gpma.h"
#include "rbuf.h"
typedef struct leafentry *LEAFENTRY; typedef struct leafentry *LEAFENTRY;
u_int32_t le_crc(LEAFENTRY v); u_int32_t toku_le_crc(LEAFENTRY v);
int le_committed (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, GPMA pma, struct mempool *mp, LEAFENTRY *result); int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result);
int le_both (ITEMLEN cklen, bytevec ckval, ITEMLEN cdlen, bytevec cdval, ITEMLEN pdlen, bytevec pdval, int le_both (TXNID xid, u_int32_t cklen, void* ckval, u_int32_t cdlen, void* cdval, u_int32_t pdlen, void* pdval,
struct mempool *mp, LEAFENTRY *result); u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result);
int le_provdel (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); int le_provdel (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
int le_provpair (ITEMLEN klen, bytevec kval, ITEMLEN dlen, bytevec dval, struct mempool *mp, LEAFENTRY *result); u_int32_t *resultsize, u_int32_t *memsize, LEAFENTRY *result);
int le_provpair (TXNID xid, u_int32_t klen, void* kval, u_int32_t dlen, void* dval,
u_int32_t *memsize, u_int32_t *disksize, LEAFENTRY *result);
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp); enum le_state { LE_COMMITTED=1, // A committed pair.
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size); LE_BOTH, // A committed pair and a provisional pair.
LE_PROVDEL, // A committed pair that has been provisionally deleted
LE_PROVPAIR }; // No committed value, but a provisional pair.
struct contents_committed;
struct contents_both;
struct contents_provdelorpair;
u_int32_t leafentry_memsize (LEAFENTRY);
enum le_state get_le_state(LEAFENTRY);
void *get_le_contents(LEAFENTRY);
enum typ_tag get_le_tag(LEAFENTRY);
u_int32_t committed_keylen (void*cev);
void* committed_key (void*cev);
u_int32_t committed_vallen (struct contents_committed *ce);
void* committed_val (struct contents_committed *ce);
TXNID both_xid (struct contents_both *ce);
u_int32_t both_keylen (struct contents_both *ce);
u_int32_t both_committed_vallen (struct contents_both *ce);
u_int32_t both_prov_vallen (struct contents_both *ce);
void* both_key (struct contents_both *ce);
void* both_committed_val (struct contents_both *ce);
void* both_prov_val (struct contents_both*ce);
TXNID provdelorpair_xid (struct contents_provdelorpair *ce);
u_int32_t provdelorpair_keylen (struct contents_provdelorpair *ce);
u_int32_t provdelorpair_vallen (struct contents_provdelorpair *ce);
void* provdelorpair_key (struct contents_provdelorpair *ce);
void* provdelorpair_val (struct contents_provdelorpair *ce);
// LESWITCHCALL(le, funname, extra args...): dispatch on the state of leafentry `le`.
// Asserts that the tag is TYP_LEAFENTRY, then RETURNS from the enclosing function the result of
// funname_le_committed, funname_le_both, funname_le_provdel or funname_le_provpair, called with
// the fields unpacked from the entry's contents followed by the extra arguments.
// If the state matches no case, abort() is called.
// Because the expansion contains `return` statements, use it only as the final statement of a
// function whose return type matches the helpers'.
// Relies on GCC extensions: a ({ ... }) statement expression and `## __VA_ARGS__`.
#define LESWITCHCALL(le,funname, ...) ({ \
assert(get_le_tag(le)==TYP_LEAFENTRY); \
switch(get_le_state(le)) { \
case LE_COMMITTED: return funname ## _le_committed( committed_keylen((struct contents_committed*)(get_le_contents(le))), \
committed_key((struct contents_committed*)(get_le_contents(le))), \
committed_vallen((struct contents_committed*)(get_le_contents(le))), \
committed_val((struct contents_committed*)(get_le_contents(le))), \
## __VA_ARGS__); \
case LE_BOTH: return funname ## _le_both( both_xid((struct contents_both*)(get_le_contents(le))), \
both_keylen((struct contents_both*)(get_le_contents(le))), \
both_key((struct contents_both*)(get_le_contents(le))), \
both_committed_vallen((struct contents_both*)(get_le_contents(le))), \
both_committed_val((struct contents_both*)(get_le_contents(le))), \
both_prov_vallen((struct contents_both*)(get_le_contents(le))), \
both_prov_val((struct contents_both*)(get_le_contents(le))), \
## __VA_ARGS__); \
case LE_PROVDEL: return funname ## _le_provdel ( provdelorpair_xid((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_keylen((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_key((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_vallen((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_val((struct contents_provdelorpair*)(get_le_contents(le))), \
## __VA_ARGS__); \
case LE_PROVPAIR: return funname ## _le_provpair(provdelorpair_xid((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_keylen((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_key((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_vallen((struct contents_provdelorpair*)(get_le_contents(le))), \
provdelorpair_val((struct contents_provdelorpair*)(get_le_contents(le))), \
## __VA_ARGS__); \
} abort(); })
u_int32_t leafentry_memsize (LEAFENTRY le); // the size of a leafentry in memory.
u_int32_t leafentry_disksize (LEAFENTRY le); // this is the same as logsizeof_LEAFENTRY. The size of a leafentry on disk.
u_int32_t toku_logsizeof_LEAFENTRY(LEAFENTRY le);
void wbuf_LEAFENTRY(struct wbuf *w, LEAFENTRY le);
void rbuf_LEAFENTRY(struct rbuf *r, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *le);
int toku_fread_LEAFENTRY(FILE *f, LEAFENTRY *le, u_int32_t *crc, u_int32_t *len); // read a leafentry from a log
int toku_logprint_LEAFENTRY(FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format); // read a leafentry from a log and then print it in human-readable form.
void toku_free_LEAFENTRY(LEAFENTRY le);
int print_leafentry (FILE *outf, LEAFENTRY v); // Print a leafentry out in human-readable form.
int le_is_provdel(LEAFENTRY le); // Return true if it is a provisional delete.
void* le_latest_key (LEAFENTRY le); // Return the latest key (return NULL for provisional deletes)
u_int32_t le_latest_keylen (LEAFENTRY le); // Return the latest keylen.
void* le_latest_val (LEAFENTRY le); // Return the latest val (return NULL for provisional deletes)
u_int32_t le_latest_vallen (LEAFENTRY le); // Return the latest vallen. Returns 0 for provisional deletes.
// Return any key or value (even if it's only provisional)
void* le_any_key (LEAFENTRY le);
u_int32_t le_any_keylen (LEAFENTRY le);
void* le_any_val (LEAFENTRY le);
u_int32_t le_any_vallen (LEAFENTRY le);
#endif #endif
...@@ -343,10 +343,19 @@ int toku_logger_finish (TOKULOGGER logger, struct logbytes *logbytes, struct wbu ...@@ -343,10 +343,19 @@ int toku_logger_finish (TOKULOGGER logger, struct logbytes *logbytes, struct wbu
} }
int toku_logger_commit (TOKUTXN txn, int nosync) { int toku_logger_commit (TOKUTXN txn, int nosync) {
// printf("%s:%d committing\n", __FILE__, __LINE__);
// panic handled in log_commit // panic handled in log_commit
int r = toku_log_commit(txn->logger, (LSN*)0, (txn->parent==0) && !nosync, txn->txnid64); // exits holding neither of the tokulogger locks. int r = toku_log_commit(txn->logger, (LSN*)0, (txn->parent==0) && !nosync, txn->txnid64); // exits holding neither of the tokulogger locks.
if (r!=0) goto free_and_return; if (r!=0) {
if (txn->parent!=0) { struct roll_entry *item;
broken:
while ((item=txn->newest_logentry)) {
txn->newest_logentry = item->prev;
rolltype_dispatch(item, toku_free_rolltype_);
toku_free(item);
}
r = 0;
} else if (txn->parent!=0) {
// Append the list to the front. // Append the list to the front.
if (txn->oldest_logentry) { if (txn->oldest_logentry) {
// There are some entries, so link them in. // There are some entries, so link them in.
...@@ -357,18 +366,23 @@ int toku_logger_commit (TOKUTXN txn, int nosync) { ...@@ -357,18 +366,23 @@ int toku_logger_commit (TOKUTXN txn, int nosync) {
txn->parent->oldest_logentry = txn->oldest_logentry; txn->parent->oldest_logentry = txn->oldest_logentry;
} }
txn->newest_logentry = txn->oldest_logentry = 0; txn->newest_logentry = txn->oldest_logentry = 0;
} r = 0;
free_and_return: } else {
{ // do the commit calls and free everything
// we do the commit calls in reverse order too.
struct roll_entry *item; struct roll_entry *item;
//printf("%s:%d abort\n", __FILE__, __LINE__);
while ((item=txn->newest_logentry)) { while ((item=txn->newest_logentry)) {
txn->newest_logentry = item->prev; txn->newest_logentry = item->prev;
rolltype_dispatch_assign(item, toku_commit_, r, txn);
if (r!=0) goto broken;
rolltype_dispatch(item, toku_free_rolltype_); rolltype_dispatch(item, toku_free_rolltype_);
toku_free(item); toku_free(item);
} }
r = 0;
}
list_remove(&txn->live_txns_link); list_remove(&txn->live_txns_link);
toku_free(txn); toku_free(txn);
}
return r; return r;
} }
...@@ -402,7 +416,7 @@ int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, int mode) { ...@@ -402,7 +416,7 @@ int toku_logger_log_fcreate (TOKUTXN txn, const char *fname, int mode) {
BYTESTRING bs = { .len=strlen(fname), .data = strdup(fname) }; BYTESTRING bs = { .len=strlen(fname), .data = strdup(fname) };
int r = toku_log_fcreate (txn->logger, (LSN*)0, 0, toku_txn_get_txnid(txn), bs, mode); int r = toku_log_fcreate (txn->logger, (LSN*)0, 0, toku_txn_get_txnid(txn), bs, mode);
if (r!=0) return r; if (r!=0) return r;
r = toku_logger_save_rollback_fcreate(txn, bs); r = toku_logger_save_rollback_fcreate(txn, toku_txn_get_txnid(txn), bs);
return r; return r;
} }
...@@ -569,23 +583,30 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, u_int ...@@ -569,23 +583,30 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, u_int
return 0; return 0;
} }
int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format __attribute__((__unused__))) {
BYTESTRING bs; void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data) {
int r = toku_fread_BYTESTRING(inf, &bs, crc, len); fprintf(outf, "{len=%d data=\"", len);
if (r!=0) return r;
fprintf(outf, " %s={len=%d data=\"", fieldname, bs.len);
u_int32_t i; u_int32_t i;
for (i=0; i<bs.len; i++) { for (i=0; i<len; i++) {
switch (bs.data[i]) { switch (data[i]) {
case '"': fprintf(outf, "\\\""); break; case '"': fprintf(outf, "\\\""); break;
case '\\': fprintf(outf, "\\\\"); break; case '\\': fprintf(outf, "\\\\"); break;
case '\n': fprintf(outf, "\\n"); break; case '\n': fprintf(outf, "\\n"); break;
default: default:
if (isprint(bs.data[i])) fprintf(outf, "%c", bs.data[i]); if (isprint(data[i])) fprintf(outf, "%c", data[i]);
else fprintf(outf, "\\%03o", bs.data[i]); else fprintf(outf, "\\%03o", (unsigned char)(data[i]));
} }
} }
fprintf(outf, "\"}"); fprintf(outf, "\"}");
}
int toku_logprint_BYTESTRING (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *format __attribute__((__unused__))) {
BYTESTRING bs;
int r = toku_fread_BYTESTRING(inf, &bs, crc, len);
if (r!=0) return r;
fprintf(outf, " %s=", fieldname);
toku_print_BYTESTRING(outf, bs.len, bs.data);
toku_free(bs.data); toku_free(bs.data);
return 0; return 0;
} }
...@@ -671,6 +692,7 @@ int toku_abort_logentry_commit (struct logtype_commit *le __attribute__((__unuse ...@@ -671,6 +692,7 @@ int toku_abort_logentry_commit (struct logtype_commit *le __attribute__((__unuse
} }
int toku_logger_abort(TOKUTXN txn) { int toku_logger_abort(TOKUTXN txn) {
//printf("%s:%d aborting\n", __FILE__, __LINE__);
// Must undo everything. Must undo it all in reverse order. // Must undo everything. Must undo it all in reverse order.
// Build the reverse list // Build the reverse list
struct roll_entry *item; struct roll_entry *item;
...@@ -770,7 +792,8 @@ int toku_logger_log_archive (TOKULOGGER logger, char ***logs_p, int flags) { ...@@ -770,7 +792,8 @@ int toku_logger_log_archive (TOKULOGGER logger, char ***logs_p, int flags) {
//printf("%s:%d file=%s firstlsn=%lld checkpoint_lsns={%lld %lld}\n", __FILE__, __LINE__, all_logs[i], (long long)earliest_lsn_seen.lsn, (long long)logger->checkpoint_lsns[0].lsn, (long long)logger->checkpoint_lsns[1].lsn); //printf("%s:%d file=%s firstlsn=%lld checkpoint_lsns={%lld %lld}\n", __FILE__, __LINE__, all_logs[i], (long long)earliest_lsn_seen.lsn, (long long)logger->checkpoint_lsns[0].lsn, (long long)logger->checkpoint_lsns[1].lsn);
if ((earliest_lsn_seen.lsn <= logger->checkpoint_lsns[0].lsn)&& if ((earliest_lsn_seen.lsn <= logger->checkpoint_lsns[0].lsn)&&
(earliest_lsn_seen.lsn <= logger->checkpoint_lsns[1].lsn)) { (earliest_lsn_seen.lsn <= logger->checkpoint_lsns[1].lsn)&&
(earliest_lsn_seen.lsn <= oldest_live_txn_lsn.lsn)) {
break; break;
} }
} }
......
...@@ -70,6 +70,9 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname, ...@@ -70,6 +70,9 @@ int toku_logprint_u_int32_t (FILE *outf, FILE *inf, const char *fieldname,
int toku_logprint_LOGGEDBRTHEADER (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *); int toku_logprint_LOGGEDBRTHEADER (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *);
int toku_logprint_INTPAIRARRAY (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *); int toku_logprint_INTPAIRARRAY (FILE *outf, FILE *inf, const char *fieldname, u_int32_t *crc, u_int32_t *len, const char *);
// Useful thing for printing a bytestring.
void toku_print_BYTESTRING (FILE *outf, u_int32_t len, char *data);
int toku_read_and_print_logmagic (FILE *f, u_int32_t *version); int toku_read_and_print_logmagic (FILE *f, u_int32_t *version);
TXNID toku_txn_get_txnid (TOKUTXN); TXNID toku_txn_get_txnid (TOKUTXN);
...@@ -110,9 +113,6 @@ static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) { ...@@ -110,9 +113,6 @@ static inline int toku_copy_BYTESTRING(BYTESTRING *target, BYTESTRING val) {
static inline void toku_free_BYTESTRING(BYTESTRING val) { static inline void toku_free_BYTESTRING(BYTESTRING val) {
toku_free(val.data); toku_free(val.data);
} }
static inline void toku_free_DISKOFFARRAY(DISKOFFARRAY val) {
toku_free(val.array);
}
static inline int toku_copy_LOGGEDBRTHEADER(LOGGEDBRTHEADER *target, LOGGEDBRTHEADER val) { static inline int toku_copy_LOGGEDBRTHEADER(LOGGEDBRTHEADER *target, LOGGEDBRTHEADER val) {
*target = val; *target = val;
......
...@@ -39,23 +39,38 @@ struct logtype { ...@@ -39,23 +39,38 @@ struct logtype {
int logformat_version_number = 0; int logformat_version_number = 0;
const struct logtype rollbacks[] = { const struct logtype rollbacks[] = {
{"fcreate", 'F', FA{{"BYTESTRING", "fname", 0}, {"fcreate", 'F', FA{{"TXNID", "xid", 0},
{"BYTESTRING", "fname", 0},
NULLFIELD}}, NULLFIELD}},
// {"fclose", 'c', FA{{"FILENUM", "filenum", 0}, {"cmdinsert", 'i', FA{{"TXNID", "xid", 0},
// {"BYTESTRING", "fname", 0}, {"FILENUM", "filenum", 0},
// NULLFIELD}},
{"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. The delete takes place in a leaf.
{"BYTESTRING", "key", 0}, {"BYTESTRING", "key", 0},
{"BYTESTRING", "data", 0}, {"BYTESTRING", "data", 0},
NULLFIELD}}, NULLFIELD}},
{"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf. {"cmddeleteboth", 'D', FA{{"TXNID", "xid", 0},
{"FILENUM", "filenum", 0},
{"BYTESTRING", "key", 0}, {"BYTESTRING", "key", 0},
{"BYTESTRING", "data", 0}, {"BYTESTRING", "data", 0},
NULLFIELD}}, NULLFIELD}},
{"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0}, {"cmddelete", 'd', FA{{"TXNID", "xid", 0},
{"DISKOFFARRAY", "parents", 0}, {"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0}, {"BYTESTRING", "key", 0},
NULLFIELD}}, NULLFIELD}},
// {"fclose", 'c', FA{{"FILENUM", "filenum", 0},
// {"BYTESTRING", "fname", 0},
// NULLFIELD}},
// {"deleteatleaf", 'd', FA{{"FILENUM", "filenum", 0}, // Note a delete for rollback. The delete takes place in a leaf.
// {"BYTESTRING", "key", 0},
// {"BYTESTRING", "data", 0},
// NULLFIELD}},
// {"insertatleaf", 'i', FA{{"FILENUM", "filenum", 0}, // Note an insert for rollback. The insert takes place in a leaf.
// {"BYTESTRING", "key", 0},
// {"BYTESTRING", "data", 0},
// NULLFIELD}},
// {"xactiontouchednonleaf", 'n', FA{{"FILENUM", "filenum", 0},
// {"DISKOFFARRAY", "parents", 0},
// {"DISKOFF", "diskoff", 0},
// NULLFIELD}},
{0,0,FA{NULLFIELD}} {0,0,FA{NULLFIELD}}
}; };
...@@ -152,12 +167,28 @@ const struct logtype logtypes[] = { ...@@ -152,12 +167,28 @@ const struct logtype logtypes[] = {
{"u_int32_t", "oldfingerprint", "%08x"}, {"u_int32_t", "oldfingerprint", "%08x"},
{"u_int32_t", "newfingerprint", "%08x"}, {"u_int32_t", "newfingerprint", "%08x"},
NULLFIELD}}, NULLFIELD}},
{"insertinleaf", 'I', FA{{"TXNID", "txnid", 0}, // {"insertinleaf", 'I', FA{{"TXNID", "txnid", 0},
{"FILENUM", "filenum", 0}, // {"FILENUM", "filenum", 0},
// {"DISKOFF", "diskoff", 0},
// {"u_int32_t", "pmaidx", 0},
// {"BYTESTRING", "key", 0},
// {"BYTESTRING", "data", 0},
// NULLFIELD}},
// {"replaceleafentry", 'L', FA{{"FILENUM", "filenum", 0},
// {"DISKOFF", "diskoff", 0},
// {"u_int32_t", "pmaidx", 0},
// {"LEAFENTRY", "oldleafentry", 0},
// {"LEAFENTRY", "newleafentry", 0},
// NULLFIELD}},
{"insertleafentry", 'I', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0}, {"DISKOFF", "diskoff", 0},
{"u_int32_t", "pmaidx", 0}, {"u_int32_t", "pmaidx", 0},
{"BYTESTRING", "key", 0}, {"LEAFENTRY", "newleafentry", 0},
{"BYTESTRING", "data", 0}, NULLFIELD}},
{"deleteleafentry", 'D', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0},
{"u_int32_t", "pmaidx", 0},
{"LEAFENTRY", "oldleafentry", 0},
NULLFIELD}}, NULLFIELD}},
{"deleteinleaf", 'd', FA{{"TXNID", "txnid", 0}, {"deleteinleaf", 'd', FA{{"TXNID", "txnid", 0},
{"FILENUM", "filenum", 0}, {"FILENUM", "filenum", 0},
...@@ -259,6 +290,9 @@ void generate_log_struct (void) { ...@@ -259,6 +290,9 @@ void generate_log_struct (void) {
fprintf(hf, "int toku_rollback_%s (", lt->name); fprintf(hf, "int toku_rollback_%s (", lt->name);
DO_FIELDS(ft, lt, fprintf(hf, "%s %s,", ft->type, ft->name)); DO_FIELDS(ft, lt, fprintf(hf, "%s %s,", ft->type, ft->name));
fprintf(hf, "TOKUTXN txn);\n"); fprintf(hf, "TOKUTXN txn);\n");
fprintf(hf, "int toku_commit_%s (", lt->name);
DO_FIELDS(ft, lt, fprintf(hf, "%s %s,", ft->type, ft->name));
fprintf(hf, "TOKUTXN txn);\n");
})); }));
fprintf(hf, "struct log_entry {\n"); fprintf(hf, "struct log_entry {\n");
fprintf(hf, " enum lt_cmd cmd;\n"); fprintf(hf, " enum lt_cmd cmd;\n");
......
...@@ -10,12 +10,13 @@ ...@@ -10,12 +10,13 @@
/* Generally: errno is set to 0 or a value to indicate problems. */ /* Generally: errno is set to 0 or a value to indicate problems. */
enum typ_tag { TYP_BRTNODE = 0xdead0001, enum typ_tag { TYP_BRTNODE = 3735879681, //0xdead0001,
TYP_CACHETABLE, TYP_PAIR, /* for cachetables */ TYP_CACHETABLE, TYP_PAIR, /* for cachetables */
TYP_PMA, TYP_PMA,
TYP_GPMA, TYP_GPMA,
TYP_TOKULOGGER, TYP_TOKULOGGER,
TYP_TOKUTXN TYP_TOKUTXN,
TYP_LEAFENTRY
}; };
/* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */ /* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */
......
...@@ -45,6 +45,7 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) { ...@@ -45,6 +45,7 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) {
assert(mp->free_offset <= mp->size); assert(mp->free_offset <= mp->size);
void *vp; void *vp;
size_t offset = (mp->free_offset + (alignment-1)) & ~(alignment-1); size_t offset = (mp->free_offset + (alignment-1)) & ~(alignment-1);
//printf("mempool_malloc size=%ld base=%p free_offset=%ld mp->size=%ld offset=%ld\n", size, mp->base, mp->free_offset, mp->size, offset);
if (offset + size > mp->size) { if (offset + size > mp->size) {
vp = 0; vp = 0;
} else { } else {
...@@ -54,11 +55,14 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) { ...@@ -54,11 +55,14 @@ void *toku_mempool_malloc(struct mempool *mp, size_t size, int alignment) {
assert(mp->free_offset <= mp->size); assert(mp->free_offset <= mp->size);
assert(((long)vp & (alignment-1)) == 0); assert(((long)vp & (alignment-1)) == 0);
assert(vp == 0 || (mp->base <= vp && vp + size <= mp->base + mp->size)); assert(vp == 0 || (mp->base <= vp && vp + size <= mp->base + mp->size));
//printf("mempool returning %p\n", vp);
return vp; return vp;
} }
// if vp is null then we are freeing something, but not specifying what. The data won't be freed until compression is done.
void toku_mempool_mfree(struct mempool *mp, void *vp, int size) { void toku_mempool_mfree(struct mempool *mp, void *vp, int size) {
assert(size >= 0 && mp->base <= vp && vp + size <= mp->base + mp->size); assert(size >= 0);
if (vp) assert(toku_mempool_inrange(mp, vp, size));
mp->frag_size += size; mp->frag_size += size;
assert(mp->frag_size <= mp->size); assert(mp->frag_size <= mp->size);
} }
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include <assert.h> #include "toku_assert.h"
struct rbuf { struct rbuf {
unsigned char *buf; unsigned char *buf;
...@@ -34,19 +34,19 @@ static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned ...@@ -34,19 +34,19 @@ static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned
} }
/* Return a pointer into the middle of the buffer. */ /* Return a pointer into the middle of the buffer. */
static void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes) static inline void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes)
{ {
*n_bytes = rbuf_int(r); *n_bytes = rbuf_int(r);
rbuf_literal_bytes(r, bytes, *n_bytes); rbuf_literal_bytes(r, bytes, *n_bytes);
} }
static unsigned long long rbuf_ulonglong (struct rbuf *r) { static inline unsigned long long rbuf_ulonglong (struct rbuf *r) {
unsigned i0 = rbuf_int(r); unsigned i0 = rbuf_int(r);
unsigned i1 = rbuf_int(r); unsigned i1 = rbuf_int(r);
return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
} }
static DISKOFF rbuf_diskoff (struct rbuf *r) { static inline DISKOFF rbuf_diskoff (struct rbuf *r) {
unsigned i0 = rbuf_int(r); unsigned i0 = rbuf_int(r);
unsigned i1 = rbuf_int(r); unsigned i1 = rbuf_int(r);
return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#define DO_VERIFY_COUNTS //#define DO_VERIFY_COUNTS
#ifdef DO_VERIFY_COUNTS #ifdef DO_VERIFY_COUNTS
#define VERIFY_COUNTS(n) toku_verify_counts(n) #define VERIFY_COUNTS(n) toku_verify_counts(n)
#else #else
...@@ -390,7 +390,7 @@ void toku_recover_fopen (LSN UU(lsn), TXNID UU(txnid), BYTESTRING fname, FILENUM ...@@ -390,7 +390,7 @@ void toku_recover_fopen (LSN UU(lsn), TXNID UU(txnid), BYTESTRING fname, FILENUM
toku_free_BYTESTRING(fname); toku_free_BYTESTRING(fname);
} }
void toku_recover_insertinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, BYTESTRING keybs, BYTESTRING databs) { void toku_recover_insertleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY newleafentry) {
struct cf_pair *pair = NULL; struct cf_pair *pair = NULL;
int r = find_cachefile(filenum, &pair); int r = find_cachefile(filenum, &pair);
assert(r==0); assert(r==0);
...@@ -401,25 +401,84 @@ void toku_recover_insertinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO ...@@ -401,25 +401,84 @@ void toku_recover_insertinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO
BRTNODE node = node_v; BRTNODE node = node_v;
assert(node->height==0); assert(node->height==0);
VERIFY_COUNTS(node); VERIFY_COUNTS(node);
struct kv_pair *kvp = brtnode_malloc_kv_pair(node->u.l.buffer, &node->u.l.buffer_mempool, keybs.data, keybs.len, databs.data, databs.len); node->log_lsn = lsn;
assert(pair); {
toku_gpma_set_at_index(node->u.l.buffer, pmaidx, kv_pair_size(kvp), kvp); int memsize = leafentry_memsize(newleafentry);
node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len); void *mem = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, memsize);
// printf("%s:%d local_fingerprint=%08x (this=%08x)\n", __FILE__, __LINE__, node->local_fingerprint, toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len)); memcpy(mem, newleafentry, memsize);
node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keybs.len + databs.len; toku_gpma_set_at_index(node->u.l.buffer, pmaidx, memsize, mem);
node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + leafentry_disksize(newleafentry);
// PMA_ITERATE_IDX(node->u.l.buffer, idx, skey, keylen __attribute__((__unused__)), sdata, datalen __attribute__((__unused__)), node->local_fingerprint += node->rand4fingerprint * toku_le_crc(newleafentry);
// printf("%d: %s %s\n", idx, (char*)skey, (char*)sdata)); }
r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node));
assert(r==0);
toku_free_LEAFENTRY(newleafentry);
}
void toku_recover_deleteleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY oldleafentry) {
struct cf_pair *pair = NULL;
int r = find_cachefile(filenum, &pair);
assert(r==0);
void *node_v;
assert(pair->brt);
r = toku_cachetable_get_and_pin(pair->cf, diskoff, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, pair->brt);
assert(r==0);
BRTNODE node = node_v;
assert(node->height==0);
VERIFY_COUNTS(node); VERIFY_COUNTS(node);
node->log_lsn = lsn; node->log_lsn = lsn;
{
u_int32_t len; void *data;
r=toku_gpma_get_from_index(node->u.l.buffer, pmaidx, &len, &data);
assert(r==0);
assert(len==leafentry_memsize(oldleafentry));
assert(memcmp(oldleafentry, data, len)==0);
node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data);
node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(data);
toku_mempool_mfree(&node->u.l.buffer_mempool, data, len);
toku_gpma_clear_at_index(node->u.l.buffer, pmaidx);
}
r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node)); r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node));
assert(r==0); assert(r==0);
toku_free_BYTESTRING(keybs); toku_free_LEAFENTRY(oldleafentry);
toku_free_BYTESTRING(databs);
} }
//void toku_recover_replaceleafentry (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, LEAFENTRY oldleafentry, LEAFENTRY newleafentry) {
// struct cf_pair *pair = NULL;
// int r = find_cachefile(filenum, &pair);
// assert(r==0);
// void *node_v;
// assert(pair->brt);
// r = toku_cachetable_get_and_pin(pair->cf, diskoff, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, pair->brt);
// assert(r==0);
// BRTNODE node = node_v;
// assert(node->height==0);
// VERIFY_COUNTS(node);
// node->log_lsn = lsn;
// {
// u_int32_t len; void *data;
// r=toku_gpma_get_from_index(node->u.l.buffer, pmaidx, &len, &data);
// assert(r==0);
// assert(len==leafentry_memsize(oldleafentry));
// assert(memcmp(oldleafentry, data, len)==0);
// node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(data);
// node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(data);
// toku_mempool_mfree(&node->u.l.buffer_mempool, data, len);
// }
// {
// int memsize = leafentry_memsize(newleafentry);
// void *mem = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, memsize);
// memcpy(mem, newleafentry, memsize);
// toku_gpma_set_at_index(node->u.l.buffer, pmaidx, memsize, mem);
// node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + leafentry_disksize(newleafentry);
// node->local_fingerprint += node->rand4fingerprint * toku_le_crc(newleafentry);
// }
// r = toku_cachetable_unpin(pair->cf, diskoff, 1, toku_serialize_brtnode_size(node));
// assert(r==0);
// toku_free_LEAFENTRY(oldleafentry);
// toku_free_LEAFENTRY(newleafentry);
//}
void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, BYTESTRING keybs, BYTESTRING databs) { void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKOFF diskoff, u_int32_t pmaidx, BYTESTRING keybs, BYTESTRING databs) {
struct cf_pair *pair = NULL; struct cf_pair *pair = NULL;
int r = find_cachefile(filenum, &pair); int r = find_cachefile(filenum, &pair);
...@@ -440,7 +499,8 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO ...@@ -440,7 +499,8 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO
} }
} }
toku_gpma_clear_at_index(node->u.l.buffer, pmaidx); toku_gpma_clear_at_index(node->u.l.buffer, pmaidx);
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len); assert(!"kvpair");
//node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_kvpair(keybs.data, keybs.len, databs.data, databs.len);
node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keybs.len + databs.len; node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + KEY_VALUE_OVERHEAD + keybs.len + databs.len;
VERIFY_COUNTS(node); VERIFY_COUNTS(node);
node->log_lsn = lsn; node->log_lsn = lsn;
...@@ -449,7 +509,6 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO ...@@ -449,7 +509,6 @@ void toku_recover_deleteinleaf (LSN lsn, TXNID UU(txnid), FILENUM filenum, DISKO
toku_free_BYTESTRING(keybs); toku_free_BYTESTRING(keybs);
toku_free_BYTESTRING(databs); toku_free_BYTESTRING(databs);
} }
// a newbrtnode should have been done before this // a newbrtnode should have been done before this
void toku_recover_resizepma (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t oldsize __attribute__((__unused__)), u_int32_t newsize) { void toku_recover_resizepma (LSN lsn, FILENUM filenum, DISKOFF diskoff, u_int32_t oldsize __attribute__((__unused__)), u_int32_t newsize) {
struct cf_pair *pair = NULL; struct cf_pair *pair = NULL;
...@@ -490,9 +549,9 @@ int move_indices (GPMA from, struct mempool *from_mempool, ...@@ -490,9 +549,9 @@ int move_indices (GPMA from, struct mempool *from_mempool,
struct gitem item = from->items[idx]; struct gitem item = from->items[idx];
items[i]=item; items[i]=item;
from->items[idx].data = 0; from->items[idx].data = 0;
fp += toku_calccrc32_kvpair_struct(item.data); fp += toku_le_crc(item.data);
sizediff += PMA_ITEM_OVERHEAD + item.len; sizediff += PMA_ITEM_OVERHEAD + leafentry_disksize(item.data);
assert(kv_pair_size(item.data)==item.len); assert(leafentry_memsize(item.data)==item.len);
} }
from->n_items_present -= fromto.size; from->n_items_present -= fromto.size;
...@@ -512,7 +571,7 @@ int move_indices (GPMA from, struct mempool *from_mempool, ...@@ -512,7 +571,7 @@ int move_indices (GPMA from, struct mempool *from_mempool,
to->items[to_idx] = (struct gitem){items[i].len, new_data}; to->items[to_idx] = (struct gitem){items[i].len, new_data};
toku_mempool_mfree(from_mempool, items[i].data, items[i].len); toku_mempool_mfree(from_mempool, items[i].data, items[i].len);
} }
assert(kv_pair_size(to->items[to_idx].data)==to->items[to_idx].len); assert(leafentry_memsize(to->items[to_idx].data)==to->items[to_idx].len);
} }
to->n_items_present += fromto.size; to->n_items_present += fromto.size;
*a_fp -= a_rand * fp; *a_fp -= a_rand * fp;
...@@ -520,8 +579,8 @@ int move_indices (GPMA from, struct mempool *from_mempool, ...@@ -520,8 +579,8 @@ int move_indices (GPMA from, struct mempool *from_mempool,
*a_nbytes -= sizediff; *a_nbytes -= sizediff;
*b_nbytes += sizediff; *b_nbytes += sizediff;
toku_free(items); toku_free(items);
toku_verify_gpma(from); //toku_verify_gpma(from);
toku_verify_gpma(to); //toku_verify_gpma(to);
return 0; return 0;
} }
......
...@@ -12,7 +12,14 @@ ...@@ -12,7 +12,14 @@
#include "cachetable.h" #include "cachetable.h"
#include "key.h" #include "key.h"
int toku_rollback_fcreate (BYTESTRING bs_fname, int toku_commit_fcreate (TXNID xid __attribute__((__unused__)),
BYTESTRING bs_fname __attribute__((__unused__)),
TOKUTXN txn __attribute__((__unused__))) {
return 0;
}
int toku_rollback_fcreate (TXNID xid __attribute__((__unused__)),
BYTESTRING bs_fname,
TOKUTXN txn __attribute__((__unused__))) { TOKUTXN txn __attribute__((__unused__))) {
char *fname = fixup_fname(&bs_fname); char *fname = fixup_fname(&bs_fname);
char *directory = txn->logger->directory; char *directory = txn->logger->directory;
...@@ -26,84 +33,62 @@ int toku_rollback_fcreate (BYTESTRING bs_fname, ...@@ -26,84 +33,62 @@ int toku_rollback_fcreate (BYTESTRING bs_fname,
return 0; return 0;
} }
#if 0 int toku_commit_cmdinsert (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) {
int toku_rollback_fclose (FILENUM filenum, BYTESTRING bs_fname, TOKUTXN txn) { CACHEFILE cf;
abort(); BRT brt;
filenum=filenum; //printf("%s:%d committing insert %s %s\n", __FILE__, __LINE__, key.data, data.data);
bs_fname=bs_fname; int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt);
txn=txn;
#if 0
char *fixedfname = fixup_fname(&bs_fname);
int fd = open(fixedfname, O_RDWR, 0);
assert(fd>=0);
BRT MALLOC(brt);
assert(errno==0 && brt!=0);
brt->database_name = fixedfname;
brt->h=0;
list_init(&brt->cursors);
brt->compare_fun = 0;
brt->dup_compare = 0;
brt->db = 0;
CACHETABLE cf;
int r = toku_cachetable_openfd(&cf, /*ct*/0, fd, brt);
assert(r==0); assert(r==0);
brt->skey = brt->sval = 0; DBT key_dbt,data_dbt;
brt->cf = cf; BRT_CMD_S brtcmd = { BRT_COMMIT_BOTH, xid,
toku_recover_note_cachefile(filenum, cf, brt); .u.id={toku_fill_dbt(&key_dbt, key.data, key.len),
toku_fill_dbt(&data_dbt, data.data, data.len)}};
printf("%s:%d Must remember to close the file again after txn %p finishes aborting\n", __FILE__, __LINE__, txn); return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
return 0;
#endif
} }
#endif
//int toku_rollback_newbrtnode (struct logtype_newbrtnode *le, TOKUTXN txn) {
// // All that must be done is to put the node on the freelist.
// // Since we don't have a freelist right now, we don't have anything to do.
// // We'll fix this later (See #264)
// le=le;
// txn=txn;
// return 0;
//}
int toku_rollback_insertatleaf (FILENUM filenum, BYTESTRING key,BYTESTRING data, TOKUTXN txn) { int toku_rollback_cmdinsert (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) {
CACHEFILE cf; CACHEFILE cf;
BRT brt; BRT brt;
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt);
assert(r==0); assert(r==0);
//printf("%s:%d aborting insert %s %s\n", __FILE__, __LINE__, key.data, data.data);
DBT key_dbt,data_dbt; DBT key_dbt,data_dbt;
r = toku_brt_delete_both(brt, BRT_CMD_S brtcmd = { BRT_ABORT_BOTH, xid,
toku_fill_dbt(&key_dbt, key.data, key.len), .u.id={toku_fill_dbt(&key_dbt, key.data, key.len),
toku_fill_dbt(&data_dbt, data.data, data.len), toku_fill_dbt(&data_dbt, data.data, data.len)}};
0); return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
return r; }
int toku_commit_cmddeleteboth (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) {
return toku_commit_cmdinsert(xid, filenum, key, data, txn);
}
int toku_rollback_cmddeleteboth (TXNID xid, FILENUM filenum, BYTESTRING key,BYTESTRING data,TOKUTXN txn) {
return toku_rollback_cmdinsert(xid, filenum, key, data, txn);
} }
int toku_rollback_deleteatleaf (FILENUM filenum, BYTESTRING key, BYTESTRING data,TOKUTXN txn) { int toku_commit_cmddelete (TXNID xid, FILENUM filenum, BYTESTRING key,TOKUTXN txn) {
CACHEFILE cf; CACHEFILE cf;
BRT brt; BRT brt;
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt);
assert(r==0); assert(r==0);
//printf("%s:%d aborting delete %s %s\n", __FILE__, __LINE__, key.data, data.data);
DBT key_dbt,data_dbt; DBT key_dbt,data_dbt;
r = toku_brt_insert(brt, BRT_CMD_S brtcmd = { BRT_COMMIT_ANY, xid,
toku_fill_dbt(&key_dbt, key.data, key.len), .u.id={toku_fill_dbt(&key_dbt, key.data, key.len),
toku_fill_dbt(&data_dbt, data.data, data.len), toku_init_dbt(&data_dbt)}};
0); // Do the insertion unconditionally return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
return r;
} }
int toku_rollback_xactiontouchednonleaf(FILENUM filenum, DISKOFFARRAY array __attribute__((__unused__)), DISKOFF diskoff, TOKUTXN txn) { int toku_rollback_cmddelete (TXNID xid, FILENUM filenum, BYTESTRING key,TOKUTXN txn) {
CACHEFILE cf; CACHEFILE cf;
BRT brt; BRT brt;
int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt); int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf, &brt);
assert(r==0); assert(r==0);
r = toku_brt_nonleaf_expunge_xaction(brt, diskoff, txn->txnid64); //printf("%s:%d aborting delete %s %s\n", __FILE__, __LINE__, key.data, data.data);
assert(r==0); DBT key_dbt,data_dbt;
//printf("%s:%d node=%lld has Rollback parents = {", __FILE__, __LINE__, (long long)diskoff); BRT_CMD_S brtcmd = { BRT_ABORT_ANY, xid,
//int i; for (i=0; i<array.len; i++) printf(" %lld", array.array[i]); .u.id={toku_fill_dbt(&key_dbt, key.data, key.len),
//printf("}\n"); toku_init_dbt(&data_dbt)}};
if (array.len!=0) printf("%s:%d array.len!=0 and we didn't fix up the fingerprints.\n", __FILE__, __LINE__); return toku_brt_root_put_cmd(brt, &brtcmd, toku_txn_logger(txn));
return 0;
} }
...@@ -44,7 +44,7 @@ void doit (void) { ...@@ -44,7 +44,7 @@ void doit (void) {
assert(r==0); assert(r==0);
u_int32_t fingerprint=0; u_int32_t fingerprint=0;
r = toku_testsetup_insert_to_nonleaf(t, nodeb, BRT_DELETE, "hello", 6, 0, 0, &fingerprint); r = toku_testsetup_insert_to_nonleaf(t, nodeb, BRT_DELETE_ANY, "hello", 6, 0, 0, &fingerprint);
assert(r==0); assert(r==0);
r = toku_testsetup_root(t, nodeb); r = toku_testsetup_root(t, nodeb);
......
...@@ -51,10 +51,8 @@ TDB_CPPFLAGS = -I../../include ...@@ -51,10 +51,8 @@ TDB_CPPFLAGS = -I../../include
SRCS = $(sort $(wildcard *.c)) SRCS = $(sort $(wildcard *.c))
TDB_TESTS = $(patsubst %.c,%.tdb,$(SRCS)) TDB_TESTS = $(patsubst %.c,%.tdb,$(SRCS))
BDB_DONTRUN = bug627 BDB_DONTRUN = bug627 test_abort1
BDB_TESTS = $(patsubst %.c,%.bdb,$(filter-out $(patsubst %,%.c,$(BDB_DONTRUN)),$(SRCS))) BDB_TESTS = $(patsubst %.c,%.bdb,$(filter-out $(patsubst %,%.c,$(BDB_DONTRUN)),$(SRCS)))
foobdb:
echo $(BDB_TESTS)
ALL_TESTS = $(TDB_TESTS) $(BDB_TESTS) ALL_TESTS = $(TDB_TESTS) $(BDB_TESTS)
......
...@@ -96,10 +96,12 @@ void test_db_put_aborts (void) { ...@@ -96,10 +96,12 @@ void test_db_put_aborts (void) {
key.size=4; key.size=4;
data.data="now"; data.data="now";
data.size=4; data.size=4;
r=db->put(db, tid, &key, &data, 0); r=db->put(db, tid2, &key, &data, 0);
CKERR(r); CKERR(r);
} }
//printf("%s:%d aborting\n", __FILE__, __LINE__);
r=tid->abort(tid); assert(r==0); r=tid->abort(tid); assert(r==0);
//printf("%s:%d committing\n", __FILE__, __LINE__);
r=tid2->commit(tid2,0); assert(r==0); r=tid2->commit(tid2,0); assert(r==0);
} }
// The database should exist // The database should exist
...@@ -109,6 +111,7 @@ void test_db_put_aborts (void) { ...@@ -109,6 +111,7 @@ void test_db_put_aborts (void) {
assert(r==0); assert(r==0);
} }
// But the item should not be in it. // But the item should not be in it.
if (1)
{ {
DB_TXN *tid; DB_TXN *tid;
r=env->txn_begin(env, 0, &tid, 0); assert(r==0); r=env->txn_begin(env, 0, &tid, 0); assert(r==0);
...@@ -122,6 +125,15 @@ void test_db_put_aborts (void) { ...@@ -122,6 +125,15 @@ void test_db_put_aborts (void) {
assert(r!=0); assert(r!=0);
assert(r==DB_NOTFOUND); assert(r==DB_NOTFOUND);
} }
{
DBT key,data;
memset(&key, 0, sizeof(key));
memset(&data, 0, sizeof(data));
key.data="bye";
key.size=4;
r=db->get(db, tid, &key, &data, 0);
CKERR(r);
}
r=tid->commit(tid,0); assert(r==0); r=tid->commit(tid,0); assert(r==0);
} }
......
...@@ -49,7 +49,7 @@ void do_test_abort2 (void) { ...@@ -49,7 +49,7 @@ void do_test_abort2 (void) {
r=db->close(db, 0); CKERR(r); r=db->close(db, 0); CKERR(r);
r=env->close(env, 0); CKERR(r); r=env->close(env, 0); CKERR(r);
printf("%s:%d\n", __FILE__, __LINE__); //printf("%s:%d\n", __FILE__, __LINE__);
// Now do a few inserts that abort. // Now do a few inserts that abort.
r=db_env_create(&env, 0); assert(r==0); r=db_env_create(&env, 0); assert(r==0);
...@@ -78,17 +78,17 @@ void do_test_abort2 (void) { ...@@ -78,17 +78,17 @@ void do_test_abort2 (void) {
r=txn->abort(txn); CKERR(r); r=txn->abort(txn); CKERR(r);
printf("%s:%d\n", __FILE__, __LINE__); //printf("%s:%d\n", __FILE__, __LINE__);
//r=db->close(db,0); CKERR(r); r=env->close(env, 0); CKERR(r); return; //r=db->close(db,0); CKERR(r); r=env->close(env, 0); CKERR(r); return;
// Don't do a lookup on "hello7", because that will force things out of the buffer. // Don't do a lookup on "hello7", because that will force things out of the buffer.
r=db->close(db, 0); CKERR(r); r=db->close(db, 0); CKERR(r);
printf("%s:%d\n", __FILE__, __LINE__); //printf("%s:%d\n", __FILE__, __LINE__);
r=db_create(&db, env, 0); CKERR(r); r=db_create(&db, env, 0); CKERR(r);
r=env->txn_begin(env, 0, &txn, 0); assert(r==0); r=env->txn_begin(env, 0, &txn, 0); assert(r==0);
r=db->open(db, txn, "foo.db", 0, DB_BTREE, 0, 0777); CKERR(r); r=db->open(db, txn, "foo.db", 0, DB_BTREE, 0, 0777); CKERR(r);
r=txn->abort(txn); CKERR(r); r=txn->commit(txn, 0); CKERR(r);
printf("%s:%d\n", __FILE__, __LINE__); //printf("%s:%d\n", __FILE__, __LINE__);
r=env->txn_begin(env, 0, &txn, 0); assert(r==0); r=env->txn_begin(env, 0, &txn, 0); assert(r==0);
{ {
...@@ -96,7 +96,7 @@ void do_test_abort2 (void) { ...@@ -96,7 +96,7 @@ void do_test_abort2 (void) {
memset(&data, 0, sizeof(data)); memset(&data, 0, sizeof(data));
r = db->get(db, txn, dbt_init(&key, "hello7", strlen("hello7")+1), &data, 0); r = db->get(db, txn, dbt_init(&key, "hello7", strlen("hello7")+1), &data, 0);
CKERR(r); CKERR(r);
printf("data is %s\n", (char*)data.data); //printf("data is %s\n", (char*)data.data);
assert(((char*)data.data)[0]=='0'); assert(((char*)data.data)[0]=='0');
} }
r=txn->abort(txn); CKERR(r); r=txn->abort(txn); CKERR(r);
......
...@@ -106,7 +106,7 @@ void test_dup_delete(int n, int dup_mode) { ...@@ -106,7 +106,7 @@ void test_dup_delete(int n, int dup_mode) {
int k = htonl(n/2); int k = htonl(n/2);
DBT key, val; DBT key, val;
r = db->get(db, null_txn, dbt_init(&key, &k, sizeof k), dbt_init_malloc(&val), 0); r = db->get(db, null_txn, dbt_init(&key, &k, sizeof k), dbt_init_malloc(&val), 0);
assert(r != 0); assert(r == DB_NOTFOUND);
} }
/* verify all dups are removed using a cursor */ /* verify all dups are removed using a cursor */
......
...@@ -178,6 +178,7 @@ static void verify_items (DB_ENV *env, DB *db) { ...@@ -178,6 +178,7 @@ static void verify_items (DB_ENV *env, DB *db) {
snprintf(hello, sizeof(hello), "hello%d.%d", kv, dv); snprintf(hello, sizeof(hello), "hello%d.%d", kv, dv);
snprintf(there, sizeof(hello), "there%d", dv); snprintf(there, sizeof(hello), "there%d", dv);
k2.data = hello; k2.size=strlen(hello)+1; k2.data = hello; k2.size=strlen(hello)+1;
printf("kv=%d dv=%d\n", kv, dv);
r=db->get(db, txn, &k2, &v2, 0); r=db->get(db, txn, &k2, &v2, 0);
assert(r==0); assert(r==0);
assert(strcmp(v2.data, there)==0); assert(strcmp(v2.data, there)==0);
......
...@@ -2663,6 +2663,9 @@ char *db_strerror(int error) { ...@@ -2663,6 +2663,9 @@ char *db_strerror(int error) {
if (error==DB_BADFORMAT) { if (error==DB_BADFORMAT) {
return "Database Bad Format (probably a corrupted database)"; return "Database Bad Format (probably a corrupted database)";
} }
if (error==DB_NOTFOUND) {
return "Not found";
}
static char unknown_result[100]; // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string. static char unknown_result[100]; // Race condition if two threads call this at the same time. However even in a bad case, it should be some sort of null-terminated string.
errorstr = unknown_result; errorstr = unknown_result;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment