Commit 33b25277 authored by Bradley C. Kuszmaul

Merge in the OMT and its integration into the main line.

{{{
$ cd tokudb
$ svn merge -r3533:3571 https://svn.tokutek.com/tokudb/tokudb.724
}}}
 
There remain a few problems with recovery, but the main line has problems with recovery, so let's bite the bullet and move forward.

Fixes #729, #724.


git-svn-id: file:///svn/tokudb@3572 c7de825b-a66e-492c-adef-691d508d4ae1
parent 0a37b673
......@@ -72,22 +72,18 @@ REGRESSION_TESTS = \
test-brt-delete-both \
test-brt-overflow \
test-del-inorder \
test-gpma-blackbox \
test-gpma-glassbox \
test-gpma-glassbox \
test-gpma-leftmost-dup \
test-inc-split \
test-primes \
test_oexcl \
test_toku_malloc_plain_free \
ybt-test \
log-test \
omt-test \
# This line intentionally kept commented so I can have a \ on the end of the previous line
# Add in the binaries that must be run in various ways.
BINS = $(REGRESSION_TESTS) \
benchmark-test \
test-gpma-worstinsert \
brtdump \
randbrt \
randdb4 \
......@@ -99,7 +95,7 @@ OFILES = \
brt-verify.o \
brt.o \
cachetable.o \
fifo.o gpma.o \
fifo.o \
fingerprint.o \
key.o \
leafentry.o \
......@@ -107,6 +103,7 @@ OFILES = \
log_code.o \
memory.o \
mempool.o \
omt.o \
primes.o \
recover.o \
roll.o \
......@@ -128,7 +125,7 @@ tdb-recover: LDFLAGS+=-lz
recover.o: log_header.h log-internal.h log.h yerror.h brttypes.h kv-pair.h memory.h key.h cachetable.h
tdb-recover: $(OFILES)
roll.o: log_header.h log-internal.h log.h yerror.h brttypes.h kv-pair.h memory.h key.h cachetable.h gpma.h
roll.o: log_header.h log-internal.h log.h yerror.h brttypes.h kv-pair.h memory.h key.h cachetable.h omt.h
log_code.o: log_header.h wbuf.h log-internal.h rbuf.h
log_header.h: log_code.c
......@@ -141,9 +138,6 @@ bins: $(BINS)
# Put the benchmarktest_256 first since it takes the longest (and we want to use parallelism in the make)
CHECKS = \
benchmarktest_256 \
test-gpma-worstinsert-a \
test-gpma-worstinsert-b \
test-gpma-worstinsert-c \
$(REGRESSION_TESTS) \
# This line intentionally kept commented so I can have a \ on the previous line
......@@ -151,12 +145,6 @@ CHECKS = \
check: bins $(patsubst %,check_%,$(CHECKS))
check_benchmarktest_256: benchmark-test
$(VGRIND) ./benchmark-test $(VERBVERBOSE) --valsize 256 --verify 1
check_test-gpma-worstinsert-a: test-gpma-worstinsert
$(VGRIND) ./test-gpma-worstinsert $(VERBVERBOSE) -a
check_test-gpma-worstinsert-b: test-gpma-worstinsert
$(VGRIND) ./test-gpma-worstinsert $(VERBVERBOSE) -b
check_test-gpma-worstinsert-c: test-gpma-worstinsert
$(VGRIND) ./test-gpma-worstinsert $(VERBVERBOSE) -c
check_test-assert: test-assert
@# no arguments, should err
......@@ -179,34 +167,24 @@ check-fanout:
done
log-test log-test2 log-test3 log-test4 log-test5 log-test6 benchmark-test brt-test brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder cachetable-test cachetable-test2: LDFLAGS+=-lz
# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h gpma.h brt.h brt-search.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h mempool.h leafentry.h
HFILES = $(wildcard *.h)
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h omt.h brt.h brt-search.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h mempool.h leafentry.h
key.o: brttypes.h key.h
list-test: list-test.o toku_assert.o
test-brt-delete-both: ybt.o brt.o fifo.o gpma.o memory.o leafentry.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
test-brt-delete-both: ybt.o brt.o fifo.o omt.o memory.o leafentry.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
test-inc-split: $(TEST_OFILES)
brt-test-helpers.o: $(BRT_INTERNAL_H_INCLUDES) toku_assert.h
test-del-inorder: $(TEST_OFILES)
# pma-test.o: $(BRT_INTERNAL_H_INCLUDES) pma-internal.h gpma.h list.h mempool.h
# pma-test: pma.o memory.o key.o ybt.o log.o mempool.o fingerprint.o brt-serialize.o fifo.o primes.o toku_assert.o log_code.o roll.o brt.o cachetable.o brt-verify.o
pma.o: gpma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../include/db.h log_header.h
test-gpma-glassbox.o: test-gpma-glassbox.c gpma.h gpma-internal.h toku_assert.h memory.h
test-gpma-glassbox: test-gpma-glassbox.o toku_assert.o memory-debug.o gpma.o
test-gpma-blackbox: test-gpma-blackbox.o toku_assert.o memory.o gpma.o
test-gpma-worstinsert: test-gpma-worstinsert.o toku_assert.o memory.o gpma.o
test-gpma-leftmost-dup: test-gpma-leftmost-dup.o toku_assert.o memory.o gpma.o
test-gpma-worstinsert.o test-gpma-blackbox.o test-gpma-leftmost-dup.o: gpma.h memory.h toku_assert.h
: gpma.h memory.h toku_assert.h
gpma.o: gpma.c gpma.h
omt.o: $(HFILES)
ybt.o: ybt.h brttypes.h ../include/db.h
ybt-test: ybt-test.o ybt.o memory.o toku_assert.o
ybt-test.o: ybt.h ../include/db.h
cachetable.o: brttypes.h cachetable.h hashfun.h memory.h primes.h toku_assert.h $(BRT_INTERNAL_H_INCLUDES) log_header.h
brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o gpma.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
brt-test0 brt-test1 brt-test2 brt-test3 brt-test4 brt-test5 test-brt-overflow brt-test-named-db brt-test-cursor brt-test-cursor-2 brt-test: ybt.o brt.o fifo.o omt.o leafentry.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES)
logformat: logformat.o toku_assert.o
brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o brt-test5.o test-brt-overflow.h brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h gpma.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES)
# Header prerequisites shared by the brt test objects and brt.o.
# Every target here is an object file; test-brt-overflow was previously listed as
# "test-brt-overflow.h", which left test-brt-overflow.o without these prerequisites.
brt-test0.o brt-test1.o brt-test2.o brt-test3.o brt-test4.o brt-test5.o test-brt-overflow.o brt-test-named-db.o brt-test-cursor.o brt-test-cursor-2.o brt-test.o brt.o: brt.h brt-search.h ../include/db.h fifo.h omt.h brttypes.h cachetable.h memory.h $(BRT_INTERNAL_H_INCLUDES)
brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h
fifo.o: fifo.h brttypes.h
......@@ -214,16 +192,18 @@ memory.o: memory.h
primes.o: primes.h toku_assert.h
fifo-test: fifo.o memory.o toku_assert.o ybt.o
brt-serialize.o: $(BRT_INTERNAL_H_INCLUDES) key.h wbuf.h rbuf.h
brt-bigtest: memory.o ybt.o brt.o gpma.o cachetable.o key.o fifo.o brt-serialize.o
brt-bigtest: memory.o ybt.o brt.o omt.o cachetable.o key.o fifo.o brt-serialize.o
brt-bigtest.o: brt.h brt-search.h ../include/db.h
log-test6 log-test5 log-test4 log-test3 log-test2 log-test: log.o memory.o leafentry.o toku_assert.o roll.o log_code.o brt-serialize.o brt.o cachetable.o gpma.o ybt.o fifo.o key.o fingerprint.o brt-verify.o mempool.o primes.o
log-test6 log-test5 log-test4 log-test3 log-test2 log-test: log.o memory.o leafentry.o toku_assert.o roll.o log_code.o brt-serialize.o brt.o cachetable.o omt.o ybt.o fifo.o key.o fingerprint.o brt-verify.o mempool.o primes.o
brt-verify.o: $(BRT_INTERNAL_H_INCLUDES)
fingerprint.o: $(BRT_INTERNAL_H_INCLUDES)
mempool.o: toku_assert.h mempool.h
leafentry.o: brttypes.h crc.h leafentry.h memory.h toku_assert.h
toku_assert.o: toku_assert.h
omt-test.o: toku_assert.h memory.h toku_assert.h ../include/db.h brttypes.h
omt-test: omt-test.o omt.o memory.o toku_assert.o
brt-serialize-test: brt-serialize-test.o brt-serialize.o leafentry.o memory.o fifo.o gpma.o key.o ybt.o brt.o cachetable.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
brt-serialize-test: brt-serialize-test.o brt-serialize.o leafentry.o memory.o fifo.o omt.o key.o ybt.o brt.o cachetable.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
test_toku_malloc_plain_free: memory.o toku_assert.o
......
......@@ -7,7 +7,6 @@
#include "cachetable.h"
#include "fifo.h"
#include "yerror.h"
#include "gpma.h"
#include "brt.h"
#include "crc.h"
#include "list.h"
......@@ -15,15 +14,21 @@
#include "kv-pair.h"
#include "leafentry.h"
typedef LEAFENTRY OMTVALUE;
#include "omt.h"
#ifndef BRT_FANOUT
#define BRT_FANOUT 16
#endif
enum { TREE_FANOUT = BRT_FANOUT };
enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
enum { PMA_ITEM_OVERHEAD = 4 };
enum { OMT_ITEM_OVERHEAD = 0 }; /* No overhead for the OMT item. The PMA needed to know the idx, but the OMT doesn't. */
enum { BRT_CMD_OVERHEAD = (1 // the type
+ 8) // the xid
};
enum { LE_OVERHEAD_BOUND = 9 }; // the type and xid
enum { BRT_DEFAULT_NODE_SIZE = 1 << 20 };
struct nodeheader_in_file {
......@@ -57,7 +62,7 @@ struct brtnode {
// When we checkpoint: Create a checkpoint record, and cause every dirty node to be written to disk. The new checkpoint record is *not* incorporated into the disk_lsn of the written nodes.
// While we are checkpointing, someone may modify a dirty node that has not yet been written. In that case, when we unpin the node, we make the new copy (because the disk_lsn<checkpoint_lsn), just as we would usually.
//
int layout_version; // What version of the data structure? (version 2 adds the xid to the brt cmds)
int layout_version; // What version of the data structure? (version 2 adds the xid to the brt cmds)
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
u_int32_t rand4fingerprint;
u_int32_t local_fingerprint; /* For leaves this is everything in the buffer. For nonleaves, this is everything in the buffers, but does not include child subtree fingerprints. */
......@@ -82,7 +87,7 @@ struct brtnode {
However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */
} n;
struct leaf {
GPMA buffer;
OMT buffer;
unsigned int n_bytes_in_buffer; /* How many bytes to represent the buffer (including the per-key overheads, but not including the overheads for the node). */
struct mempool buffer_mempool;
} l;
......@@ -186,6 +191,7 @@ struct brt_cursor {
void *skey, *sval;
};
// logs the memory allocation, but not the creation of the new node
int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger);
int toku_unpin_brtnode (BRT brt, BRTNODE node) ;
unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t);
......@@ -206,12 +212,12 @@ struct cmd_leafval_bessel_extra {
BRT_CMD cmd;
int compare_both_keys; // Set to 1 for DUPSORT databases that are not doing a DELETE_BOTH
};
int toku_cmd_leafval_bessel (u_int32_t dlen, void *leafentry, void *extra);
int toku_cmd_leafval_bessel (LEAFENTRY leafentry, void *extra);
int toku_brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger);
int toku_cachefile_root_put_cmd (CACHEFILE cf, BRT_CMD cmd, TOKULOGGER logger);
int toku_gpma_compress_kvspace (GPMA pma, struct mempool *memp);
void *mempool_malloc_from_gpma(GPMA pma, struct mempool *mp, size_t size);
int toku_omt_compress_kvspace (OMT omt, struct mempool *memp);
void *mempool_malloc_from_omt(OMT omt, struct mempool *mp, size_t size);
#endif
......@@ -27,7 +27,7 @@ static void test_serialize(void) {
sn.thisnodename = sn.nodesize*20;
sn.disk_lsn.lsn = 789;
sn.log_lsn.lsn = 123456;
sn.layout_version = 4;
sn.layout_version = 5;
sn.height = 1;
sn.rand4fingerprint = randval;
sn.local_fingerprint = 0;
......@@ -57,7 +57,7 @@ static void test_serialize(void) {
assert(dn->thisnodename==nodesize*20);
assert(dn->disk_lsn.lsn==123456);
assert(dn->layout_version ==4);
assert(dn->layout_version ==5);
assert(dn->height == 1);
assert(dn->rand4fingerprint==randval);
assert(dn->u.n.n_children==2);
......
......@@ -3,7 +3,6 @@
#define _XOPEN_SOURCE 500
//#include "pma.h"
#include "toku_assert.h"
#include "brt-internal.h"
#include "key.h"
......@@ -56,14 +55,15 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
return size+hsize+csize;
} else {
unsigned int hsize=0;
GPMA_ITERATE(node->u.l.buffer,
idx, vlen, vdata,
({
LEAFENTRY le=vdata;
hsize+= PMA_ITEM_OVERHEAD + leafentry_disksize(le);
}));
/* Iteration callback: adds OMT_ITEM_OVERHEAD + leafentry_disksize(le) to the unsigned int that vp points to.
 * The index argument is unused.  Always returns 0. */
int addupsize (LEAFENTRY le, u_int32_t UU(idx), void *vp) {
unsigned int *ip=vp;
(*ip) += OMT_ITEM_OVERHEAD + leafentry_disksize(le);
return 0;
}
toku_omt_iterate(node->u.l.buffer,
addupsize,
&hsize);
assert(hsize<=node->u.l.n_bytes_in_buffer);
hsize+=4; /* the PMA size */
hsize+=4; /* add n entries in buffer table. */
return size+hsize;
}
......@@ -81,8 +81,7 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) {
result+=(8+4+4)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, and the subtree fingerprint. */
result+=node->u.n.n_bytes_in_buffers;
} else {
result+=(4 /* n_entries in buffer table. */
+4); /* the pma size */
result+=4; /* n_entries in buffer table. */
result+=node->u.l.n_bytes_in_buffer;
if (toku_memory_check) {
unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
......@@ -177,14 +176,13 @@ void toku_serialize_brtnode_to (int fd, DISKOFF off, BRTNODE node) {
}
} else {
//printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer));
wbuf_uint(&w, toku_gpma_n_entries(node->u.l.buffer));
wbuf_uint(&w, toku_gpma_index_limit(node->u.l.buffer));
GPMA_ITERATE(node->u.l.buffer, idx, vlen, vdata,
({
//printf(" %s:%d idx=%d\n", __FILE__, __LINE__, idx);
wbuf_uint(&w, idx);
wbuf_LEAFENTRY(&w, vdata);
}));
wbuf_uint(&w, toku_omt_size(node->u.l.buffer));
int wbufwriteleafentry (LEAFENTRY le, u_int32_t UU(idx), void *v) {
struct wbuf *thisw=v;
wbuf_LEAFENTRY(thisw, le);
return 0;
}
toku_omt_iterate(node->u.l.buffer, wbufwriteleafentry, &w);
}
assert(w.ndone<=w.size);
#ifdef CRC_ATEND
......@@ -266,7 +264,7 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode) {
}
}
result->layout_version = rbuf_int(&rc);
if (result->layout_version!=4) {
if (result->layout_version!=5) {
r=DB_BADFORMAT;
goto died1;
}
......@@ -368,11 +366,10 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode) {
}
} else {
int n_in_buf = rbuf_int(&rc);
int index_limit = rbuf_int(&rc);
result->u.l.n_bytes_in_buffer = 0;
r=toku_gpma_create(&result->u.l.buffer, index_limit);
r=toku_omt_create(&result->u.l.buffer);
if (r!=0) {
if (0) { died_21: toku_gpma_free(&result->u.l.buffer, 0, 0); }
if (0) { died_21: toku_omt_destroy(&result->u.l.buffer); }
goto died1;
}
//printf("%s:%d r PMA= %p\n", __FILE__, __LINE__, result->u.l.buffer);
......@@ -388,18 +385,15 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode) {
for (i=0; i<n_in_buf; i++) {
LEAFENTRY tmp_le;
//printf("%s:%d reading %dth item\n", __FILE__, __LINE__, i);
int idx = rbuf_int(&rc);
//printf("%s:%d idx=%d\n", __FILE__, __LINE__, idx);
u_int32_t memsize, disksize;
rbuf_LEAFENTRY(&rc, &memsize, &disksize, &tmp_le);
LEAFENTRY le = mempool_malloc_from_gpma(result->u.l.buffer, &result->u.l.buffer_mempool, memsize);
LEAFENTRY le = mempool_malloc_from_omt(result->u.l.buffer, &result->u.l.buffer_mempool, memsize);
assert(le);
memcpy(le, tmp_le, memsize);
toku_free(tmp_le);
assert(disksize==leafentry_disksize(le));
result->u.l.n_bytes_in_buffer += disksize + PMA_ITEM_OVERHEAD;
//printf("idx=%d\n", idx);
toku_gpma_set_at_index(result->u.l.buffer, idx, memsize, le);
result->u.l.n_bytes_in_buffer += disksize + OMT_ITEM_OVERHEAD;
toku_omt_insert_at(result->u.l.buffer, le, i);
actual_sum += result->rand4fingerprint*toku_le_crc(le);
//printf("%s:%d rand4=%08x fp=%08x \n", __FILE__, __LINE__, result->rand4fingerprint, actual_sum);
}
......@@ -440,18 +434,26 @@ void toku_verify_counts (BRTNODE node) {
/*foo*/
if (node->height==0) {
assert(node->u.l.buffer);
unsigned int sum=0;
unsigned int count=0;
u_int32_t fp=0;
GPMA_ITERATE(node->u.l.buffer, idx, dlen, ddata,
({
count++;
sum+= PMA_ITEM_OVERHEAD + leafentry_disksize(ddata); // use the disk size, not the memory size.
fp += toku_le_crc(ddata);
}));
assert(count==toku_gpma_n_entries(node->u.l.buffer));
assert(sum==node->u.l.n_bytes_in_buffer);
u_int32_t fps = node->rand4fingerprint *fp;
struct sum_info {
unsigned int dsum;
unsigned int msum;
unsigned int count;
u_int32_t fp;
} sum_info = {0,0,0,0};
int sum_item (LEAFENTRY le, u_int32_t UU(idx), void *vsi) {
struct sum_info *si = vsi;
si->count++;
si->dsum += OMT_ITEM_OVERHEAD + leafentry_disksize(le);
si->msum += leafentry_memsize(le);
si->fp += toku_le_crc(le);
return 0;
}
toku_omt_iterate(node->u.l.buffer, sum_item, &sum_info);
assert(sum_info.count==toku_omt_size(node->u.l.buffer));
assert(sum_info.dsum==node->u.l.n_bytes_in_buffer);
assert(sum_info.msum == node->u.l.buffer_mempool.free_offset - node->u.l.buffer_mempool.frag_size);
u_int32_t fps = node->rand4fingerprint * sum_info.fp;
assert(fps==node->local_fingerprint);
} else {
unsigned int sum = 0;
......
......@@ -80,33 +80,32 @@ int toku_testsetup_insert_to_leaf (BRT brt, DISKOFF diskoff, char *key, int keyl
LEAFENTRY tmp_leafentry;
r = le_committed(keylen, key, vallen, val, &lesize, &disksize, &tmp_leafentry);
LEAFENTRY leafentry = mempool_malloc_from_gpma(node->u.l.buffer, &node->u.l.buffer_mempool, lesize);
LEAFENTRY leafentry = mempool_malloc_from_omt(node->u.l.buffer, &node->u.l.buffer_mempool, lesize);
memcpy(leafentry, tmp_leafentry, lesize);
toku_free(tmp_leafentry);
u_int32_t storedlen;
void *storeddata;
LEAFENTRY storeddata;
u_int32_t idx;
DBT keydbt,valdbt;
BRT_CMD_S cmd = {BRT_INSERT, 0, .u.id={toku_fill_dbt(&keydbt, key, keylen),
toku_fill_dbt(&valdbt, val, vallen)}};
struct cmd_leafval_bessel_extra be = {brt, &cmd, node->flags & TOKU_DB_DUPSORT};
r = toku_gpma_lookup_bessel(node->u.l.buffer, toku_cmd_leafval_bessel, 0, &be, &storedlen, &storeddata, &idx);
r = toku_omt_find_zero(node->u.l.buffer, toku_cmd_leafval_bessel, &be, &storeddata, &idx);
if (r==0) {
// It's already there. So now we have to remove it and put the new one back in.
node->u.l.n_bytes_in_buffer -= PMA_ITEM_OVERHEAD + leafentry_disksize(storeddata);
node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(storeddata);
node->local_fingerprint -= node->rand4fingerprint*toku_le_crc(storeddata);
toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, storedlen);
toku_mempool_mfree(&node->u.l.buffer_mempool, storeddata, leafentry_memsize(storeddata));
// Now put the new kv in.
toku_gpma_set_at_index(node->u.l.buffer, idx, lesize, leafentry);
toku_omt_set_at(node->u.l.buffer, leafentry, idx);
} else {
r = toku_gpma_insert_bessel(node->u.l.buffer, lesize, leafentry, toku_cmd_leafval_bessel, &be, 0, 0, 0);
r = toku_omt_insert(node->u.l.buffer, leafentry, toku_cmd_leafval_bessel, &be, 0);
assert(r==0);
}
node->u.l.n_bytes_in_buffer += PMA_ITEM_OVERHEAD + disksize;
node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + disksize;
node->local_fingerprint += node->rand4fingerprint*toku_le_crc(leafentry);
node->dirty=1;
......
......@@ -3,7 +3,6 @@
#include "brt.h"
#include "key.h"
#include "gpma.h"
#include "brt-internal.h"
#include "memory.h"
#include "toku_assert.h"
......
......@@ -30,6 +30,7 @@ static void test2 (int memcheck, int limit) {
snprintf(key,100,"hello%d",i);
snprintf(val,100,"there%d",i);
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
r = toku_verify_brt(t); assert(r==0);
//printf("%s:%d did insert %d\n", __FILE__, __LINE__, i);
if (0) {
brt_flush(t);
......
......@@ -127,14 +127,15 @@ int toku_verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, b
}
} else {
// Make sure that they are in increasing order.
void *prev=0;
GPMA_ITERATE(node->u.l.buffer, idx, dlen, data,
({
if (prev==0)
prev=data;
else
assert(compare_leafentries(brt, prev, data)<0);
}));
// Iteration callback: for every entry except the one at idx 0, asserts that the previously
// visited entry compares strictly less than this one (compare_leafentries(...)<0).
// vprevp points to a LEAFENTRY holding the previously visited entry; it is overwritten with v on every call.
// Always returns 0.
int check_increasing (LEAFENTRY v, u_int32_t idx, void *vprevp) {
LEAFENTRY *prevp = vprevp;
if (idx>0)
assert(compare_leafentries(brt, *prevp, v)<0);
*prevp=v;
return 0;
}
LEAFENTRY prev=0;
toku_omt_iterate(node->u.l.buffer, check_increasing, &prev);
}
if ((r = toku_cachetable_unpin(brt->cf, off, 0, 0))) return r;
return result;
......
This diff is collapsed.
......@@ -219,7 +219,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height)
n->thisnodename = nodename;
n->disk_lsn.lsn = 0; // a new one can always be 0.
n->log_lsn = n->disk_lsn;
n->layout_version = 4;
n->layout_version = 5;
n->height = height;
n->rand4fingerprint = random();
n->local_fingerprint = 0;
......
......@@ -107,12 +107,13 @@ void dump_node (int f, DISKOFF off) {
}
} else {
printf(" n_bytes_in_buffer=%d\n", n->u.l.n_bytes_in_buffer);
printf(" items_in_buffer =%d\n", toku_gpma_n_entries(n->u.l.buffer));
GPMA_ITERATE(n->u.l.buffer, idx, len, data,
({
print_leafentry(stdout, data);
printf("\n");
}));
printf(" items_in_buffer =%d\n", toku_omt_size(n->u.l.buffer));
int print_le(LEAFENTRY le, u_int32_t UU(idx), void *UU(v)) {
print_leafentry(stdout, le);
printf("\n");
return 0;
}
toku_omt_iterate(n->u.l.buffer, print_le, 0);
}
}
......
......@@ -98,4 +98,6 @@ typedef struct brt_cmd BRT_CMD_S, *BRT_CMD;
#define UU(x) x __attribute__((__unused__))
typedef struct leafentry *LEAFENTRY;
#endif
#include "memory.h"
struct gpma {
enum typ_tag tag;
unsigned int N; /* How long is the array? Always a power of two >= 4. */
u_int32_t n_items_present; /* How many array elements are non-null. */
struct gitem *items; /* A malloced array. If any item's DATA is null, then it's not in use. */
double udt_step; /* upper density threshold step */
/* Each doubling decreases the density by density step.
* For example if array_len=256 and uplgN=8 then there are 5 doublings.
* Regions of size 8 are full. Regions of size 16 are 90% full.
* Regions of size 32 are 80% full. Regions of size 64 are 70% full.
* Regions of size 128 are 60% full. Regions of size 256 are 50% full.
* The density step is 0.10. */
double ldt_step; /* lower density threshold step */
};
#define GPMA_MIN_ARRAY_SIZE 4
/* density thresholds */
#define GPMA_LDT_HIGH 0.25
#define GPMA_LDT_LOW 0.40
#define GPMA_UDT_HIGH 1.00
#define GPMA_UDT_LOW 0.50
/* Expose these for testing purposes */
u_int32_t toku_gpma_find_index_bes (GPMA pma, gpma_besselfun_t besf, int direction, void *extra, int *found);
u_int32_t toku_gpma_find_index (GPMA pma, u_int32_t len, void *data, gpma_compare_fun_t compare, void *extra, int *found);
int toku_lg (unsigned int n);
u_int32_t toku_hyperceil (u_int32_t v);
int toku_max_int (int, int);
int toku_gpma_smooth_region (GPMA pma,
u_int32_t lo, u_int32_t hi,
u_int32_t count, // The number of nonnull values
u_int32_t idx, u_int32_t *newidxp, gpma_renumber_callback_t rcall, void *extra,
u_int32_t old_N);
int toku_make_space_at (GPMA pma, u_int32_t idx, u_int32_t *newidx, gpma_renumber_callback_t rcall, void *extra);
void toku_gpma_distribute (GPMA pma,
u_int32_t lo, u_int32_t hi,
u_int32_t count,
struct gitem *items, // some of these may be NULL data, but we leave space for them anyway.
/*out*/ u_int32_t *tos // the indices where the values end up (we fill this in)
);
int toku_smooth_deleted_region (GPMA pma, u_int32_t minidx, u_int32_t maxidx, gpma_renumber_callback_t renumberf, void *extra_for_renumberf);
This diff is collapsed.
#ifndef GPMA_H
#define GPMA_H
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
// Need this to get the u_int32_t types and so forth
#include <sys/types.h>
typedef struct gpma *GPMA;
struct gitem {
u_int32_t len;
void *data;
};
typedef int (*gpma_compare_fun_t)(u_int32_t alen, void *aval, u_int32_t blen, void *bval, void*extra);
typedef int (*gpma_besselfun_t)(u_int32_t dlen, void *dval, void *extra); // return a number, not an error code.
typedef int (*gpma_delete_callback_t)(u_int32_t slotnum, u_int32_t deletelen, void*deletedata, void*extra); // return 0 if OK.
// If the pma moves things around and/or changes the size of the pma, it calls this function to indicate what happened.
typedef int (*gpma_renumber_callback_t)(u_int32_t nitems, // How many things moved
u_int32_t *froms, // An array of indices indicating where things moved from
u_int32_t *tos, // An array of indices indicating where things moved to
struct gitem *items, // The actual items that were moved
u_int32_t old_N, // The old size of the target array
u_int32_t new_N, // The new size of the target array
void *extra); // Context
typedef void (*gpma_free_callback_t)(u_int32_t len, void*freeme, void*extra);
// initial_index_limit must be zero or a power of two.
int toku_gpma_create (GPMA*, int initial_index_limit);
/* Sets the referenced GPMA to NULL.  Returns nothing (the function is declared void).
 * NOTE(review): the callback and its extra argument are presumably applied to each stored item so the caller can release it -- confirm against gpma.c. */
void toku_gpma_free (GPMA*, gpma_free_callback_t, void*);
// How many items are present
u_int32_t toku_gpma_n_entries (GPMA);
// What is the maximum index limit
u_int32_t toku_gpma_index_limit (GPMA);
// Require that the item not be already present, according to the compare function
// The data in the DBT is passed in.
int toku_gpma_insert (GPMA,
u_int32_t len, void*data,
gpma_compare_fun_t comparef, void*extra_for_comparef,
gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know
u_int32_t *indexp // Where did the item get stored?
);
// Use a bessel function to determine where to insert the data.
// Puts the new value between the rightmost -1 and the leftmost +1.
// Requires: Nothing in the pma returns 0.
int toku_gpma_insert_bessel (GPMA pma,
u_int32_t len, void *data,
gpma_besselfun_t, void *extra_for_besself,
gpma_renumber_callback_t renumberf, void*extra_for_renumberf, // if anything gets renumbered, let the caller know
u_int32_t *indexp // Where did the item get stored?
);
// Delete a particular index, and rebalance the tree.
int toku_gpma_delete_at_index (GPMA pma, u_int32_t index,
gpma_renumber_callback_t renumberf,
void *extra_for_renumberf);
// Delete anything for which the besselfun is zero. The besselfun must be monotonically increasing compared to the comparison function.
// That is, if two things compare to be < then their besselfuns must yield <=, and if they compare to be = their besselfuns must be =, and if they are > then their besselfuns must be >=
// Note the delete_callback would be responsible for calling free on the object.
int toku_gpma_delete_bessel (GPMA,
gpma_besselfun_t,
void*extra_for_besself,
gpma_delete_callback_t,
void*extra_for_deletef,
gpma_renumber_callback_t, // if anything gets renumbered, let the caller know
void*extra_for_renumberf);
// Delete any items for which the compare function says things are zero.
// For each item deleted, invoke deletef.
// For any items moved around, invoke renumberf.
int toku_gpma_delete_item (GPMA,
u_int32_t len, void *data,
gpma_compare_fun_t comparef, void *extra_for_comparef,
gpma_delete_callback_t deletef, void *extra_for_deletef,
gpma_renumber_callback_t renumberf, void *extra_for_renumberf);
// Look up a particular item, using the compare function. Find some X such that compf(len,data, X.len, X.data)==0
// (Note that the len and data passed here are always passed as the first pair of arguments to compf. )
// The item being looked up is the second pair of arguments.
int toku_gpma_lookup_item (GPMA, u_int32_t len, void *data, gpma_compare_fun_t compf, void*extra, u_int32_t *resultlen, void **resultdata, u_int32_t *idx);
// Lookup something according to the besselfun.
// If direction==0 then return something for which the besselfun is zero (or return DB_NOTFOUND and set the idx to point at the spot where the item would go. That spot may already have an element in it, or it may be off the end.)
// If more than one value is zero, return the leftmost such value.
// If direction>0 then return the first thing for which the besselfun is positive (or return DB_NOTFOUND).
// If direction<0 then return the last thing for which the besselfun is negative (or return DB_NOTFOUND).
int toku_gpma_lookup_bessel (GPMA, gpma_besselfun_t, int direction, void*extra, u_int32_t *len, void **data, u_int32_t *idx);
void toku_gpma_iterate (GPMA, void(*)(u_int32_t len, void*data, void*extra), void*extra);
// Run BODY once for every index in [0, toku_gpma_index_limit(table)) at which toku_gpma_get_from_index() returns 0.
// IDX, VALLEN and VAL are the names of the variables the macro declares for BODY to use:
// the current index, and the length and data pointer fetched for that index.
// The expansion is a ({ ... }) statement expression, and the index limit is re-read on every pass of the loop.
#define GPMA_ITERATE(table,idx,vallen,val,body) ({ \
u_int32_t idx; \
for (idx=0; idx<toku_gpma_index_limit(table); idx++) { \
u_int32_t vallen; void*val; \
if (0==toku_gpma_get_from_index(table, idx, &vallen, &val)) { \
body; \
} } })
int toku_gpma_valididx (GPMA, u_int32_t idx);
int toku_gpma_get_from_index (GPMA, u_int32_t idx, u_int32_t *len, void **data);
// Whatever is in the slot gets overwritten. Watch out that you free the thing before overwriting it.
void toku_gpma_set_at_index (GPMA, u_int32_t idx, u_int32_t len, void*data);
// Clears the item at a particular index without rebalancing the PMA.
void toku_gpma_clear_at_index (GPMA, u_int32_t idx);
int toku_gpma_move_inside_pma_by_renumbering (GPMA,
u_int32_t nitems,
u_int32_t *froms, u_int32_t *tos);
int toku_gpma_split (GPMA pma, GPMA newpma, u_int32_t overhead,
int (*realloc_data)(u_int32_t len, void *odata, void **ndata, void *extra),
void *extra_realloc,
gpma_renumber_callback_t rcall,
void *extra_rcall,
gpma_renumber_callback_t rcall_across_pmas, // This one is called for everything that moved. It is called first (before the rcall). The old_N is the size of pma before resizing.
void *extra_rcall_across);
void toku_verify_gpma (GPMA pma);
// Change the size of the PMA. Anything beyond the oldsize is discarded (if the newsize is smaller) or zerod (if the newsize is larger)
int toku_resize_gpma_exactly (GPMA pma, u_int32_t newsize);
#endif
......@@ -27,13 +27,9 @@
* The case of a committed pair and a provisional pair can be represented by a committed pair, since it doesn't matter whether the transction aborts or commits, the value is the same.
*/
#include "mempool.h"
#include "brttypes.h"
#include "gpma.h"
#include "rbuf.h"
typedef struct leafentry *LEAFENTRY;
u_int32_t toku_le_crc(LEAFENTRY v);
int le_committed (u_int32_t klen, void* kval, u_int32_t dlen, void* dval, u_int32_t *resultsize, u_int32_t *disksize, LEAFENTRY *result);
......
......@@ -203,33 +203,23 @@ const struct logtype logtypes[] = {
NULLFIELD}},
{"insertleafentry", 'I', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0},
{"u_int32_t", "pmaidx", 0},
{"u_int32_t", "idx", 0},
{"LEAFENTRY", "newleafentry", 0},
NULLFIELD}},
{"deleteleafentry", 'D', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0},
{"u_int32_t", "pmaidx", 0},
{"u_int32_t", "idx", 0},
{"LEAFENTRY", "oldleafentry", 0},
NULLFIELD}},
{"deleteinleaf", 'd', FA{{"TXNID", "txnid", 0},
{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0},
{"u_int32_t", "pmaidx", 0},
{"BYTESTRING", "key", 0},
{"BYTESTRING", "data", 0},
NULLFIELD}},
{"resizepma", 'R', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "diskoff", 0},
{"u_int32_t", "oldsize", 0},
{"u_int32_t", "newsize", 0},
NULLFIELD}},
{"pmadistribute", 'M', FA{{"FILENUM", "filenum", 0},
{"DISKOFF", "old_diskoff", 0},
{"DISKOFF", "new_diskoff", 0},
{"INTPAIRARRAY", "fromto", 0},
{"u_int32_t", "old_N", 0},
{"u_int32_t", "new_N", 0},
NULLFIELD}},
{"leafsplit", 's', FA{{"FILENUM", "filenum", 0}, // log the creation of a new node by splitting stuff out of an old node
{"DISKOFF", "old_diskoff", 0},
{"DISKOFF", "new_diskoff", 0},
{"u_int32_t", "old_n", 0},
{"u_int32_t", "split_at", 0},
{"u_int32_t", "new_nodesize", 0},
{"u_int32_t", "new_rand4", "%08x"},
{"u_int8_t", "is_dupsort", 0},
NULLFIELD}},
{0,0,FA{NULLFIELD}}
};
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
// Find out if the leftmost value is returned when the besselfun returns 0 for more than one thing.
#include "gpma.h"
#include "memory.h"
#include "toku_assert.h"
#include "../include/db.h"
#include <stdio.h>
#include <string.h>
// Verbosity flag: main() sets it to 1 for -v/--verbose and to 0 for -q/--quiet.
int verbose;
// Comparison callback for NUL-terminated string items.
// alen/blen must equal the string lengths including the terminating NUL (asserted).
// Returns whatever strcmp() returns for the two strings; extra is ignored.
static int compare_strings(u_int32_t alen, void *aval, u_int32_t blen, void *bval, void *extra __attribute__((__unused__))) {
    const char *lhs = aval;
    const char *rhs = bval;
    assert(strlen(lhs)+1 == alen);
    assert(strlen(rhs)+1 == blen);
    return strcmp(lhs, rhs);
}
// Callback passed to toku_gpma_insert() by this test.
// It ignores every argument and always reports success (0).
static int rcall_ok (u_int32_t nitems __attribute__((__unused__)),
                     u_int32_t *froms __attribute__((__unused__)),
                     u_int32_t *tos __attribute__((__unused__)),
                     struct gitem *items __attribute__((__unused__)),
                     u_int32_t old_N __attribute__((__unused__)),
                     u_int32_t new_N __attribute__((__unused__)),
                     void *extra __attribute__((__unused__)))
{
    return 0;
}
// Look up an item in pma with a bessel function that returns 0 for every item whose
// first character lies in [minc, maxc] inclusive, and assert that the item found
// has length strlens and starts with expectc.
static void lookfor (GPMA pma, u_int32_t strlens, int/*char*/ minc, int /*char*/ maxc, int /*char*/ expectc) {
    // Nested function (GCC extension) so the callback can see strlens, minc and maxc.
    // Returns -1 below the range, +1 above it, and 0 inside it.
    int range_bessel (u_int32_t dlen, void *dval, void *extra) {
        assert(dlen==strlens);
        assert(extra==0);
        char first = ((char*)dval)[0];
        if (first<minc) return -1;
        if (first>maxc) return +1;
        return 0;
    }
    u_int32_t found_len, found_idx;
    void *found;
    int r = toku_gpma_lookup_bessel(pma, range_bessel, 0, 0, &found_len, &found, &found_idx);
    assert(r==0);
    assert(found_len==strlens);
    //printf("Got %c, expect %c\n", ((char*)found)[0], expectc);
    assert(((char*)found)[0]==expectc);
}
// Insert the single-digit strings "0".."8" into a fresh GPMA, then for every
// inclusive range [lo,hi] of first characters check (via lookfor) that the
// lookup returns the item starting with '0'+lo.
static void test_leftmost (void) {
    enum { N = 9, strlens=2 };
    char *strings[N];
    GPMA pma;
    int i;
    int r = toku_gpma_create(&pma, 0);
    assert(r==0);
    for (i=0; i<N; i++) {
        char digit[strlens];
        assert(N<10); // Or we need to fix our format string
        snprintf(digit, strlens, "%d", i);
        strings[i]=strdup(digit);
        r = toku_gpma_insert(pma, 1+strlen(strings[i]), strings[i], compare_strings, 0, rcall_ok, strings[i], 0);
        assert(r==0);
    }
    int lo = 0;
    while (lo<N) {
        int hi;
        for (hi=lo; hi<N; hi++) {
            lookfor(pma, strlens, '0'+lo, '0'+hi, '0'+lo);
        }
        lo++;
    }
    // Other tests go here.  Check when -1 for 0, 0 for 1 and 2, 1 for 3 that we get 1
    i = 0;
    while (i<N) {
        toku_free(strings[i]);
        i++;
    }
    toku_gpma_free(&pma, 0, 0);
}
// Entry point: -v/--verbose sets verbose, -q/--quiet clears it, other arguments
// are ignored.  Runs test_leftmost() and then toku_malloc_cleanup().
int main (int argc, const char *argv[]) {
    int argi = 1;
    while (argi < argc) {
        const char *flag = argv[argi++];
        if (strcmp(flag, "-v") == 0 || strcmp(flag, "--verbose") == 0) {
            verbose = 1;
            continue;
        }
        if (strcmp(flag, "-q") == 0 || strcmp(flag, "--quiet") == 0) {
            verbose = 0;
        }
    }
    test_leftmost();
    toku_malloc_cleanup();
    return 0;
}
/* Worst-case insert patterns. */
#include "gpma.h"
#include "toku_assert.h"
#include "memory.h"
#include <string.h>
#include <stdio.h>
// Verbosity flag set by -v/--verbose and cleared by -q/--quiet in main(); it gates the
// size report printed by test_worst_insert().  Its address is also the `extra` cookie
// that free_callback() asserts it receives.
int verbose;
// Reset to 0 at the start of test_worst_insert() and asserted to be 0 just before the GPMA is freed.
static int count_frees=0;
static void free_callback (u_int32_t len __attribute__((__unused__)), void*freeme, void *extra) {
assert(extra==(void*)&verbose);
toku_free(freeme);
}
// Comparison callback for NUL-terminated string items.
// alen/blen must equal the string lengths including the terminating NUL (asserted).
// Returns whatever strcmp() returns for the two strings; extra is ignored.
static int compare_strings(u_int32_t alen, void *aval, u_int32_t blen, void *bval, void *extra __attribute__((__unused__))) {
    const char *lhs = aval;
    const char *rhs = bval;
    assert(strlen(lhs)+1 == alen);
    assert(strlen(rhs)+1 == blen);
    return strcmp(lhs, rhs);
}
// Callback passed to toku_gpma_insert() by this test.
// It ignores every argument and always reports success (0).
static int rcall_ok (u_int32_t nitems __attribute__((__unused__)),
                     u_int32_t *froms __attribute__((__unused__)),
                     u_int32_t *tos __attribute__((__unused__)),
                     struct gitem *items __attribute__((__unused__)),
                     u_int32_t old_N __attribute__((__unused__)),
                     u_int32_t new_N __attribute__((__unused__)),
                     void *extra __attribute__((__unused__)))
{
    return 0;
}
// Callback passed to toku_gpma_delete_item() by delete_n().
// Asserts that the item is a NUL-terminated string of the reported length and
// equals the string passed as extra, then releases it with toku_free().
// Always returns 0; slotnum is ignored.
static int delete_callback (u_int32_t slotnum __attribute__((__unused__)), u_int32_t len, void *data, void *extra) {
    const char *stored = data;
    const char *wanted = extra;
    assert(len==strlen(stored)+1);
    assert(0==strcmp(stored, wanted));
    toku_free(data);
    return 0;
}
// Number of insert-only steps test_worst_insert() performs before it starts deleting.
static const int initial_N=1000;
// Step at which test_worst_insert() switches from its mostly-growing phase to its
// mostly-shrinking phase (which runs until step 2*N); inum() also mirrors item numbers around 2*N-1.
static const int N=100000;
// Width of the zero-padded decimal keys built by insert_n() and delete_n().
static const int w=6;
// Insert the key for item number n into pma.
// The key is n formatted as a zero-padded decimal of width w; a heap copy made
// with strdup() is what gets stored (the test's delete/free callbacks release it
// with toku_free()).
// Asserts that formatting produced exactly w characters, that the copy was
// allocated, and that the insert returned 0.
static void insert_n (GPMA pma, int n) {
    char buf[w+1];
    int l = snprintf(buf, sizeof(buf), "%0*d", w, n);
    assert(l==w);
    char *key = strdup(buf);
    // Fail here rather than hand a NULL item to the GPMA, which would only
    // crash later inside compare_strings().
    assert(key!=NULL);
    int r = toku_gpma_insert(pma, strlen(key)+1, key, compare_strings, 0, rcall_ok, 0, 0);
    assert(r==0);
}
// Delete the key for item number n from pma.
// The key is rebuilt as a zero-padded decimal of width w and is also passed as
// the `extra` of delete_callback, which checks that the stored item matches it.
// Asserts that formatting produced exactly w characters and that the delete returned 0.
static void delete_n (GPMA pma, int n) {
    char buf[w+1];
    int l = snprintf(buf, sizeof(buf), "%0*d", w, n);
    assert(l==w);
    int r = toku_gpma_delete_item(pma,
                                  strlen(buf)+1, buf,
                                  compare_strings, 0,
                                  delete_callback, buf,
                                  0, 0);
    // Report which item failed before the assert aborts.  The old text said
    // "deleted %d", which read as success on the failure path.
    if (r!=0) printf("failed to delete %d (r=%d)\n", n, r);
    assert(r==0);
}
// Map the itemnum-th operation to a key number according to the insertion pattern:
//   direction ==  1 : left to right (itemnum itself)
//   direction == -1 : right to left (counting down from 2*N-1)
//   direction ==  0 : alternating at the outer edges (odd itemnum from the left,
//                     even itemnum from the right)
// Any other direction trips assert(0).
static int inum (int direction, int itemnum) {
    const int rightmost = 2*N-1;
    if (direction==1) {
        // Insert things from left to right
        return itemnum;
    }
    if (direction==-1) {
        // Insert things from right to left
        return rightmost-itemnum;
    }
    if (direction==0) {
        // Insert things at the outer edges
        return (itemnum%2) ? itemnum/2 : rightmost-itemnum/2;
    }
    assert(0);
    return 0;
}
// Drive a GPMA through an insert/delete workload whose key order is chosen by
// `direction` (see inum()):
//   phase 1: initial_N inserts;
//   phase 2: up to step N, insert every step and delete on all but every 10th step;
//   phase 3: up to step 2*N, delete every step and insert on all but every 20th step,
//            tracking the largest toku_gpma_index_limit() seen.
// Finally checks count_frees is still 0, optionally prints the sizes, and frees the GPMA.
static void test_worst_insert(int direction) {
    GPMA pma;
    int n_inserted = 0;
    int n_deleted = 0;
    int largest_limit = 0;
    int step;
    int r = toku_gpma_create(&pma, 0);
    assert(r==0);
    count_frees=0;

    for (step=0; step<initial_N; step++) {
        insert_n(pma, inum(direction, n_inserted));
        n_inserted++;
    }
    for (; step<N; step++) {
        insert_n(pma, inum(direction, n_inserted));
        n_inserted++;
        if (step%10!=0) { // Make the table get slowly larger
            delete_n(pma, inum(direction, n_deleted));
            n_deleted++;
        }
    }
    for (; step<2*N; step++) {
        int limit_now = toku_gpma_index_limit(pma);
        if (largest_limit<limit_now) largest_limit=limit_now;
        delete_n(pma, inum(direction, n_deleted));
        n_deleted++;
        if (step%20!=0) { // Make the table get slowly smaller
            insert_n(pma, inum(direction, n_inserted));
            n_inserted++;
        }
    }
    assert(count_frees==0);
    if (verbose) printf("size=%d max_size=%d\n", toku_gpma_index_limit(pma), largest_limit);
    toku_gpma_free(&pma, free_callback, &verbose);
}
// Entry point.  -v/--verbose and -q/--quiet set or clear verbose; -a, -b and -c
// select a single pattern (left-to-right, right-to-left, outer edges respectively).
// With no selector all three patterns run.  Unrecognized arguments are ignored.
int main (int argc, const char *argv[]) {
    static const int directions[3] = { +1, -1, 0 };
    int which = 0;
    int argi;
    for (argi = 1; argi < argc; argi++) {
        const char *arg = argv[argi];
        if (strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0) {
            verbose = 1;
        } else if (strcmp(arg, "-q") == 0 || strcmp(arg, "--quiet") == 0) {
            verbose = 0;
        } else if (arg[0]=='-' && arg[1]>='a' && arg[1]<='c' && arg[2]=='\0') {
            // "-a" -> 1, "-b" -> 2, "-c" -> 3
            which = arg[1]-'a'+1;
        }
    }
    int k;
    for (k = 0; k < 3; k++) {
        if (which==0 || which==k+1) test_worst_insert(directions[k]);
    }
    return 0;
}
......@@ -70,7 +70,7 @@ ydbtrace.o tdbtrace.o: tdbtrace.h
# Tracing variant of ydb.o: compile ydb.c with -DTOKUTRACE into ydbtrace.o.
# The recipe line must begin with a hard TAB, otherwise make stops with "missing separator".
ydbtrace.o: ydb.c
	$(CC) $(CFLAGS) $(CPPFLAGS) -DTOKUTRACE -c -o $@ $<
# Object files that make up the database library.
# NOTE(review): DBBINS is assigned twice below; the two lists differ only in
# ../newbrt/gpma.o versus ../newbrt/omt.o, and the later assignment is the one make uses.
DBBINS = ydb.o errors.o elocks.o ../newbrt/brt.o ../newbrt/brt-serialize.o ../newbrt/brt-verify.o ../newbrt/cachetable.o ../newbrt/fifo.o ../newbrt/key.o ../newbrt/leafentry.o ../newbrt/memory.o ../newbrt/mempool.o ../newbrt/gpma.o ../newbrt/ybt.o ../newbrt/primes.o ../newbrt/log.o ../newbrt/fingerprint.o ../newbrt/log_code.o ../newbrt/roll.o ../newbrt/toku_assert.o ../newbrt/recover.o
DBBINS = ydb.o errors.o elocks.o ../newbrt/brt.o ../newbrt/brt-serialize.o ../newbrt/brt-verify.o ../newbrt/cachetable.o ../newbrt/fifo.o ../newbrt/key.o ../newbrt/leafentry.o ../newbrt/memory.o ../newbrt/mempool.o ../newbrt/omt.o ../newbrt/ybt.o ../newbrt/primes.o ../newbrt/log.o ../newbrt/fingerprint.o ../newbrt/log_code.o ../newbrt/roll.o ../newbrt/toku_assert.o ../newbrt/recover.o
# Tracing build: tdbtrace.o plus the DBBINS list with ydb.o swapped for ydbtrace.o.
TDBBINS = tdbtrace.o $(patsubst ydb.o,ydbtrace.o,$(DBBINS))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment