Commit 48f0ad74 authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul

Up

git-svn-id: file:///svn/tokudb@519 c7de825b-a66e-492c-adef-691d508d4ae1
parent d5153759
...@@ -12,7 +12,7 @@ FPICFLAGS = -fPIC ...@@ -12,7 +12,7 @@ FPICFLAGS = -fPIC
DTOOL = valgrind --quiet --error-exitcode=1 DTOOL = valgrind --quiet --error-exitcode=1
endif endif
CFLAGS = -Wall -W $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS) CFLAGS = -Wall -W $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS) -Wshadow
LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS)
CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
...@@ -31,10 +31,10 @@ REGRESSION_TESTS = \ ...@@ -31,10 +31,10 @@ REGRESSION_TESTS = \
ybt-test \ ybt-test \
pma-test \ pma-test \
brt-serialize-test \ brt-serialize-test \
brt-test \
cachetable-test \ cachetable-test \
cachetable-test2 \ cachetable-test2 \
hashtest \ hashtest \
brt-test \
# This line intentionally kept commented so I can have a \ on the end of the previous line # This line intentionally kept commented so I can have a \ on the end of the previous line
BINS = $(REGRESSION_TESTS) \ BINS = $(REGRESSION_TESTS) \
...@@ -46,7 +46,6 @@ BINS = $(REGRESSION_TESTS) \ ...@@ -46,7 +46,6 @@ BINS = $(REGRESSION_TESTS) \
libs: log.o libs: log.o
bins: $(BINS) bins: $(BINS)
check: bins check: bins
./benchmark-test --valsize 256 --verify 1
$(DTOOL) ./ybt-test $(DTOOL) ./ybt-test
$(DTOOL) ./pma-test $(DTOOL) ./pma-test
$(DTOOL) ./cachetable-test $(DTOOL) ./cachetable-test
...@@ -54,6 +53,7 @@ check: bins ...@@ -54,6 +53,7 @@ check: bins
$(DTOOL) ./brt-serialize-test $(DTOOL) ./brt-serialize-test
$(DTOOL) ./brt-test $(DTOOL) ./brt-test
$(DTOOL) ./hashtest $(DTOOL) ./hashtest
./benchmark-test --valsize 256 --verify 1
# ./mdict-test # ./mdict-test
check-fanout: check-fanout:
...@@ -63,33 +63,40 @@ check-fanout: ...@@ -63,33 +63,40 @@ check-fanout:
let BRT_FANOUT=BRT_FANOUT+1; \ let BRT_FANOUT=BRT_FANOUT+1; \
done done
pma-test benchmark-test brt-test brt-serialize-test: LDFLAGS+=-lz
# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage # pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h hashtable.h pma.h brt.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h
key.o: brttypes.h key.h key.o: brttypes.h key.h
pma-test.o: pma-internal.h pma.h yerror.h memory.h ../include/db.h list.h kv-pair.h brttypes.h ybt.h yerror.h pma-test.o: $(BRT_INTERNAL_H_INCLUDES) pma-internal.h pma.h list.h mempool.h
pma-test: pma.o memory.o key.o ybt.o log.o mempool.o pma-test: pma.o memory.o key.o ybt.o log.o mempool.o fingerprint.o
pma.o: pma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../include/db.h pma.o: pma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../include/db.h
ybt.o: ybt.h brttypes.h ../include/db.h ybt.o: ybt.h brttypes.h ../include/db.h
ybt-test: ybt-test.o ybt.o memory.o ybt-test: ybt-test.o ybt.o memory.o
ybt-test.o: ybt.h ../include/db.h ybt-test.o: ybt.h ../include/db.h
cachetable.o: cachetable.h hashfun.h cachetable.o: cachetable.h hashfun.h
brt-test: ybt.o brt.o hashtable.o pma.o memory.o brt-serialize.o cachetable.o header-io.o ybt.o key.o primes.o log.o mempool.o brt-test: ybt.o brt.o hashtable.o pma.o memory.o brt-serialize.o cachetable.o header-io.o ybt.o key.o primes.o log.o mempool.o brt-verify.o fingerprint.o
log.o: log-internal.h log.h log.o: log-internal.h log.h wbuf.h crc.h
brt-test.o brt.o: brt.h ../include/db.h hashtable.h pma.h brttypes.h cachetable.h brt-test.o brt.o: brt.h ../include/db.h hashtable.h pma.h brttypes.h cachetable.h
brt-serialize-test.o: pma.h yerror.h brt.h ../include/db.h memory.h hashtable.h brttypes.h brt-internal.h brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
brt.o: brt.h ../include/db.h mdict.h pma.h brttypes.h memory.h brt-internal.h cachetable.h hashtable.h brt.o: $(BRT_INTERNAL_H_INCLUDES)
mdict.o: pma.h mdict.o: pma.h
hashtable.o: hashtable.h brttypes.h memory.h key.h yerror.h ../include/db.h hashfun.h hashtable.o: hashtable.h brttypes.h memory.h key.h yerror.h ../include/db.h hashfun.h
memory.o: memory.h memory.o: memory.h
primes.o: primes.h primes.o: primes.h
hashtest: hashtable.o memory.o primes.o hashtest: hashtable.o memory.o primes.o
brt-serialize.o: brt.h ../include/db.h cachetable.h memory.h mdict.h pma.h brttypes.h brt-internal.h hashtable.h wbuf.h rbuf.h brt-serialize.o: $(BRT_INTERNAL_H_INCLUDES) key.h wbuf.h rbuf.h
header-io.o: brttypes.h brt-internal.h brt.h ../include/db.h memory.h header-io.o: $(BRT_INTERNAL_H_INCLUDES)
mdict-test: hashtable.o pma.o memory.o mdict-test: hashtable.o pma.o memory.o
brt-bigtest: memory.o ybt.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o brt-bigtest: memory.o ybt.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o
brt-bigtest.o: brt.h ../include/db.h brt-bigtest.o: brt.h ../include/db.h
log-test: log.o memory.o log-test: log.o memory.o
brt-verify.o: $(BRT_INTERNAL_H_INCLUDES)
fingerprint.o: $(BRT_INTERNAL_H_INCLUDES)
brt-serialize-test: brt-serialize-test.o brt-serialize.o memory.o hashtable.o pma.o key.o ybt.o brt.o cachetable.o primes.o log.o mempool.o brt-verify.o fingerprint.o
brt-serialize-test: brt-serialize-test.o brt-serialize.o memory.o hashtable.o pma.o key.o ybt.o brt.o cachetable.o primes.o log.o mempool.o
cachetable-test.o: cachetable.h memory.h cachetable-test.o: cachetable.h memory.h
cachetable-test: cachetable.o memory.o cachetable-test.o primes.o cachetable-test: cachetable.o memory.o cachetable-test.o primes.o
...@@ -97,7 +104,7 @@ cachetable-test: cachetable.o memory.o cachetable-test.o primes.o ...@@ -97,7 +104,7 @@ cachetable-test: cachetable.o memory.o cachetable-test.o primes.o
cachetable-test2.o: cachetable.h memory.h cachetable-test2.o: cachetable.h memory.h
cachetable-test2: cachetable.o memory.o cachetable-test2.o primes.o cachetable-test2: cachetable.o memory.o cachetable-test2.o primes.o
benchmark-test: benchmark-test.o ybt.o memory.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o primes.o log.o mempool.o benchmark-test: benchmark-test.o ybt.o memory.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o primes.o log.o mempool.o brt-verify.o fingerprint.o
benchmark-test.o: brt.h ../include/db.h benchmark-test.o: brt.h ../include/db.h
clean: clean:
......
...@@ -30,7 +30,7 @@ BRT t; ...@@ -30,7 +30,7 @@ BRT t;
void setup (void) { void setup (void) {
int r; int r;
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0); r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0);
} }
...@@ -69,6 +69,7 @@ long long llrandom (void) { ...@@ -69,6 +69,7 @@ long long llrandom (void) {
void random_insert_below (long long below) { void random_insert_below (long long below) {
long long i; long long i;
assert(0 < below);
for (i=0; i<ITEMS_TO_INSERT_PER_ITERATION; i++) { for (i=0; i<ITEMS_TO_INSERT_PER_ITERATION; i++) {
insert(llrandom()%below); insert(llrandom()%below);
} }
...@@ -79,7 +80,7 @@ double tdiff (struct timeval *a, struct timeval *b) { ...@@ -79,7 +80,7 @@ double tdiff (struct timeval *a, struct timeval *b) {
} }
void biginsert (long long n_elements, struct timeval *starttime) { void biginsert (long long n_elements, struct timeval *starttime) {
long i; long long i;
struct timeval t1,t2; struct timeval t1,t2;
int iteration; int iteration;
for (i=0, iteration=0; i<n_elements; i+=ITEMS_TO_INSERT_PER_ITERATION, iteration++) { for (i=0, iteration=0; i<n_elements; i+=ITEMS_TO_INSERT_PER_ITERATION, iteration++) {
......
/* Apply a command to the root node of BRT.
 * Steps visible here: pin the header, compute the root pointer, pin the root
 * node through the cachetable, pass the command to brtnode_put_cmd_XY(), and,
 * if the root is a nonleaf whose child count equals TREE_FANOUT, split it and
 * install a new root.
 * Returns a nonzero error code from the failing call on the error paths.
 *
 * NOTE(review): this function looks unfinished as shown:
 *  - node_v, node, result, nodea, nodeb and splitk are used but not declared here;
 *  - the parameter is named `md` but the body passes `cmd`;
 *  - the normal path falls off the end without returning a value or unpinning;
 *  - the final size check has an empty body.
 */
static int brt_root_put_cmd_XY (BRT brt, BRT_CMD *md, TOKUTXN txn) {
int r;
if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
// died0 is only reached by goto from later failures: it unpins the header and
// then returns r.  The if(0) keeps the unpin from running when the pin itself failed.
if (0) { died0: toku_unpin_brt_header(brt); }
return r;
}
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
// Pin the root node; the header's nodesize is passed as the extra argument.
if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) {
goto died0;
}
node=node_v;
// died1 is only reached by goto: unpin the root node, then continue with the died0 cleanup.
if (0) {
died1:
cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnodesize(node));
goto died0;
}
// The root is given a null parent pointer.
node->parent_brtnode = 0;
// NOTE(review): `result` is assigned here but never examined in the code shown.
result = brtnode_put_cmd_XY(brt, node, cmd, txn);
// It's still pinned, and it may be too big or the fanout may be too large.
if (node->height>0 && node->u.n.n_children==TREE_FANOUT) {
// Must split it.
r = do_split_node(node, &nodea, &nodeb, &splitk); // On error: node is unmodified
if (r!=0) goto died1;
// node is garbage, and nodea and nodeb are pinned
r = brt_init_new_root(brt, nodea, nodeb, splitk, rootp); // On error: root is unmodified and nodea and nodeb are both unpinned
if (r!=0) goto died0;
// nodea and nodeb are unpinned, and the root has been fixed
// up to point at a new node (*rootp) containing two children
// (nodea and nodeb). nodea and nodeb are unpinned. *rootp is still pinned
// NOTE(review): rootp is a CACHEKEY*, so *rootp is a key, while node is
// dereferenced as a node pointer elsewhere in this function -- confirm what is intended.
node = *rootp;
}
// Now the fanout is small enough.
// But the node could still be too large.
// NOTE(review): nothing is done yet when the serialized size exceeds nodesize.
if (serialize_brtnode_size(node)>node->nodesize) {
}
}
...@@ -2,14 +2,15 @@ ...@@ -2,14 +2,15 @@
#include "hashtable.h" #include "hashtable.h"
#include "pma.h" #include "pma.h"
#include "brt.h" #include "brt.h"
//#include "pma.h" #include "crc.h"
#ifndef BRT_FANOUT #ifndef BRT_FANOUT
#define BRT_FANOUT 16 #define BRT_FANOUT 16
#endif #endif
enum { TREE_FANOUT = BRT_FANOUT }; //, NODESIZE=1<<20 }; enum { TREE_FANOUT = BRT_FANOUT };
enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */ enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
enum { BRT_CMD_OVERHEAD = 1 }; enum { BRT_CMD_OVERHEAD = 1 };
enum { BRT_DEFAULT_NODE_SIZE = 1 << 20 };
struct nodeheader_in_file { struct nodeheader_in_file {
int n_in_buffer; int n_in_buffer;
...@@ -22,21 +23,28 @@ typedef struct brtnode *BRTNODE; ...@@ -22,21 +23,28 @@ typedef struct brtnode *BRTNODE;
/* Internal nodes. */ /* Internal nodes. */
struct brtnode { struct brtnode {
enum typ_tag tag; enum typ_tag tag;
BRT brt; // The containing BRT
unsigned int nodesize; unsigned int nodesize;
diskoff thisnodename; DISKOFF thisnodename; // The size of the node allocated on disk. Not all is necessarily in use.
LSN lsn; // Need the LSN as of the most recent modification.
int layout_version; // What version of the data structure?
BRTNODE parent_brtnode; /* Invariant: The parent of an in-memory node must be in main memory. This is so we can find and update the down pointer when we change the diskoff of a node. */ BRTNODE parent_brtnode; /* Invariant: The parent of an in-memory node must be in main memory. This is so we can find and update the down pointer when we change the diskoff of a node. */
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */ int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
u_int32_t rand4fingerprint;
u_int32_t local_fingerprint; /* For leaves this is everything in the buffer. For nonleaves, this is everything in the hash tables, but does not include child subtree fingerprints. */
int dirty; int dirty;
union node { union node {
struct nonleaf { struct nonleaf {
// Don't actually store the subtree fingerprint in the in-memory data structure.
int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */ int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */
u_int32_t child_subtree_fingerprints[TREE_FANOUT+1];
bytevec childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. bytevec childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Note: It is possible that Child 1's keys are == to child 0's key's, so it is Note: It is possible that Child 1's keys are == to child 0's key's, so it is
not necessarily true that child 1's keys are > childkeys[0]. not necessarily true that child 1's keys are > childkeys[0].
However, in the absence of duplicate keys, child 1's keys *are* > childkeys[0]. */ However, in the absence of duplicate keys, child 1's keys *are* > childkeys[0]. */
unsigned int childkeylens[TREE_FANOUT]; unsigned int childkeylens[TREE_FANOUT];
unsigned int totalchildkeylens; unsigned int totalchildkeylens;
diskoff children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */ DISKOFF children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */
HASHTABLE htables[TREE_FANOUT+1]; HASHTABLE htables[TREE_FANOUT+1];
unsigned int n_bytes_in_hashtable[TREE_FANOUT+1]; /* how many bytes are in each hashtable (including overheads) */ unsigned int n_bytes_in_hashtable[TREE_FANOUT+1]; /* how many bytes are in each hashtable (including overheads) */
unsigned int n_bytes_in_hashtables; unsigned int n_bytes_in_hashtables;
...@@ -52,12 +60,13 @@ struct brtnode { ...@@ -52,12 +60,13 @@ struct brtnode {
struct brt_header { struct brt_header {
int dirty; int dirty;
unsigned int nodesize; unsigned int nodesize;
diskoff freelist; DISKOFF freelist;
diskoff unused_memory; DISKOFF unused_memory;
diskoff unnamed_root; DISKOFF unnamed_root;
int n_named_roots; /* -1 if the only one is unnamed */ int n_named_roots; /* -1 if the only one is unnamed */
char **names; char **names;
diskoff *roots; DISKOFF *roots;
unsigned int flags;
}; };
...@@ -69,21 +78,24 @@ struct brt { ...@@ -69,21 +78,24 @@ struct brt {
BRT_CURSOR cursors_head, cursors_tail; BRT_CURSOR cursors_head, cursors_tail;
unsigned int nodesize;
unsigned int flags;
int (*compare_fun)(DB*,const DBT*,const DBT*); int (*compare_fun)(DB*,const DBT*,const DBT*);
int (*dup_compare)(DB*,const DBT*,const DBT*);
void *skey,*sval; /* Used for DBT return values. */ void *skey,*sval; /* Used for DBT return values. */
}; };
/* serialization code */ /* serialization code */
void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node); void serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node);
int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize); int deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int nodesize);
unsigned int serialize_brtnode_size(BRTNODE node); /* How much space will it take? */ unsigned int serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
void verify_counts(BRTNODE); void verify_counts(BRTNODE);
int serialize_brt_header_to (int fd, struct brt_header *h); int serialize_brt_header_to (int fd, struct brt_header *h);
int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth); int deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth);
/* return the size of a tree node */ /* return the size of a tree node */
long brtnode_size (BRTNODE node); long brtnode_size (BRTNODE node);
...@@ -169,3 +181,21 @@ struct brt_cmd { ...@@ -169,3 +181,21 @@ struct brt_cmd {
}; };
typedef struct brt_cmd BRT_CMD; typedef struct brt_cmd BRT_CMD;
/* Groups a cachetable handle, a logger handle and a counter. */
struct brtenv {
CACHETABLE ct;
TOKULOGGER logger;
long long checksum_number; // NOTE(review): purpose is not visible here; possibly a checkpoint counter -- confirm
// SPINLOCK checkpointing;
};
extern cachetable_flush_func_t brtnode_flush_callback;
extern cachetable_fetch_func_t brtnode_fetch_callback;
extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header);
extern int toku_unpin_brt_header (BRT brt);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt);
static const BRTNODE null_brtnode=0;
extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
extern u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen);
extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd);
#include "brt.h"
#include "memory.h"
#include "brt-internal.h" #include "brt-internal.h"
#include <fcntl.h> #include <fcntl.h>
#include <assert.h> #include <assert.h>
#include <string.h> #include <string.h>
#include <zlib.h>
#include <arpa/inet.h>
#include <stdlib.h>
void test_serialize(void) { void test_serialize(void) {
// struct brt source_brt; // struct brt source_brt;
...@@ -12,41 +13,59 @@ void test_serialize(void) { ...@@ -12,41 +13,59 @@ void test_serialize(void) {
struct brtnode sn, *dn; struct brtnode sn, *dn;
int fd = open("brt-serialize-test.brt", O_RDWR|O_CREAT, 0777); int fd = open("brt-serialize-test.brt", O_RDWR|O_CREAT, 0777);
int r; int r;
const u_int32_t randval = random();
assert(fd>=0); assert(fd>=0);
// source_brt.fd=fd; // source_brt.fd=fd;
char *hello_string; char *hello_string;
sn.nodesize = nodesize; sn.nodesize = nodesize;
sn.thisnodename = sn.nodesize*20; sn.thisnodename = sn.nodesize*20;
sn.lsn.lsn = 123456;
sn.layout_version = 0;
sn.height = 1; sn.height = 1;
sn.rand4fingerprint = randval;
sn.local_fingerprint = 0;
sn.u.n.n_children = 2; sn.u.n.n_children = 2;
sn.u.n.childkeys[0] = hello_string = toku_strdup("hello"); sn.u.n.childkeys[0] = hello_string = toku_strdup("hello");
sn.u.n.childkeylens[0] = 6; sn.u.n.childkeylens[0] = 6;
sn.u.n.totalchildkeylens = 6; sn.u.n.totalchildkeylens = 6;
sn.u.n.children[0] = sn.nodesize*30; sn.u.n.children[0] = sn.nodesize*30;
sn.u.n.children[1] = sn.nodesize*35; sn.u.n.children[1] = sn.nodesize*35;
sn.u.n.child_subtree_fingerprints[0] = random();
sn.u.n.child_subtree_fingerprints[1] = random();
r = toku_hashtable_create(&sn.u.n.htables[0]); assert(r==0); r = toku_hashtable_create(&sn.u.n.htables[0]); assert(r==0);
r = toku_hashtable_create(&sn.u.n.htables[1]); assert(r==0); r = toku_hashtable_create(&sn.u.n.htables[1]); assert(r==0);
r = toku_hash_insert(sn.u.n.htables[0], "a", 2, "aval", 5, BRT_NONE); assert(r==0); r = toku_hash_insert(sn.u.n.htables[0], "a", 2, "aval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "a", 2, "aval", 5);
r = toku_hash_insert(sn.u.n.htables[0], "b", 2, "bval", 5, BRT_NONE); assert(r==0); r = toku_hash_insert(sn.u.n.htables[0], "b", 2, "bval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "b", 2, "bval", 5);
r = toku_hash_insert(sn.u.n.htables[1], "x", 2, "xval", 5, BRT_NONE); assert(r==0); r = toku_hash_insert(sn.u.n.htables[1], "x", 2, "xval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "x", 2, "xval", 5);
sn.u.n.n_bytes_in_hashtables = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5); sn.u.n.n_bytes_in_hashtables = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); assert(r==0); serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); assert(r==0);
r = deserialize_brtnode_from(fd, nodesize*20, &dn, nodesize); r = deserialize_brtnode_from(fd, nodesize*20, &dn, nodesize);
assert(r==0);
assert(dn->thisnodename==nodesize*20); assert(dn->thisnodename==nodesize*20);
assert(dn->lsn.lsn==123456);
assert(dn->layout_version ==0);
assert(dn->height == 1); assert(dn->height == 1);
assert(dn->rand4fingerprint==randval);
assert(dn->u.n.n_children==2); assert(dn->u.n.n_children==2);
assert(strcmp(dn->u.n.childkeys[0], "hello")==0); assert(strcmp(dn->u.n.childkeys[0], "hello")==0);
assert(dn->u.n.childkeylens[0]==6); assert(dn->u.n.childkeylens[0]==6);
assert(dn->u.n.totalchildkeylens==6); assert(dn->u.n.totalchildkeylens==6);
assert(dn->u.n.children[0]==nodesize*30); assert(dn->u.n.children[0]==nodesize*30);
assert(dn->u.n.children[1]==nodesize*35); assert(dn->u.n.children[1]==nodesize*35);
{
int i;
for (i=0; i<2; i++) {
assert(dn->u.n.child_subtree_fingerprints[i]==sn.u.n.child_subtree_fingerprints[i]);
}
assert(dn->local_fingerprint==sn.local_fingerprint);
}
{ {
bytevec data; ITEMLEN datalen; int type; bytevec data; ITEMLEN datalen; int type;
int r = toku_hash_find(dn->u.n.htables[0], "a", 2, &data, &datalen, &type); r = toku_hash_find(dn->u.n.htables[0], "a", 2, &data, &datalen, &type);
assert(r==0); assert(r==0);
assert(strcmp(data,"aval")==0); assert(strcmp(data,"aval")==0);
assert(datalen==5); assert(datalen==5);
...@@ -64,7 +83,7 @@ void test_serialize(void) { ...@@ -64,7 +83,7 @@ void test_serialize(void) {
assert(datalen==5); assert(datalen==5);
assert(type == BRT_NONE); assert(type == BRT_NONE);
} }
// brtnode_free(&dn); brtnode_free(&dn);
toku_free(hello_string); toku_free(hello_string);
toku_hashtable_free(&sn.u.n.htables[0]); toku_hashtable_free(&sn.u.n.htables[0]);
......
#define _XOPEN_SOURCE 500 #define _XOPEN_SOURCE 500
#include "brt.h"
#include "memory.h"
//#include "pma.h" //#include "pma.h"
#include "brt-internal.h" #include "brt-internal.h"
#include "key.h" #include "key.h"
#include "rbuf.h" #include "rbuf.h"
#include "wbuf.h" #include "wbuf.h"
#include <assert.h> #include <assert.h>
#include <unistd.h> #include <unistd.h>
#include <stdio.h> #include <stdio.h>
#include <arpa/inet.h> #include <arpa/inet.h>
// Fixed number of bytes that every serialized node carries in addition to its
// contents.  The terms sum to 40.
// The labels below name what serialize_brtnode_to() emits for each term; the
// write order there is magic, layout version, LSN, size, height, rand, local
// fingerprint, so the terms here are not listed in on-disk order.
const int brtnode_header_overhead = (8+ // magic "tokunode" or "tokuleaf"
8+ // LSN (node->lsn.lsn)
4+ // layout version
4+ // calculated size of the serialized node
4+ // height
4+ // random for fingerprint
4+ // localfingerprint
4); // crc32 at the end (only written when CRC_ATEND or CRC_INCR is defined)
static unsigned int serialize_brtnode_size_slow(BRTNODE node) { static unsigned int serialize_brtnode_size_slow(BRTNODE node) {
unsigned int size=4+4; /* size+height */ unsigned int size=brtnode_header_overhead;
if (node->height>0) { if (node->height>0) {
unsigned int hsize=0; unsigned int hsize=0;
unsigned int csize=0; unsigned int csize=0;
int i; int i;
size+=4; /* n_children */ size+=4; /* n_children */
size+=4; /* subtree fingerprint. */
for (i=0; i<node->u.n.n_children-1; i++) { for (i=0; i<node->u.n.n_children-1; i++) {
size+=4; size+=4;
csize+=node->u.n.childkeylens[i]; csize+=node->u.n.childkeylens[i];
} }
for (i=0; i<node->u.n.n_children; i++) { for (i=0; i<node->u.n.n_children; i++) {
size+=8; size+=8; // diskoff
size+=4; // subsum
} }
int n_hashtables = node->u.n.n_bytes_in_hashtables; int n_hashtables = node->u.n.n_children;
size+=4; /* n_entries */ size+=4; /* n_entries */
assert(0 <= n_hashtables && n_hashtables < TREE_FANOUT+1);
for (i=0; i< n_hashtables; i++) { for (i=0; i< n_hashtables; i++) {
HASHTABLE_ITERATE(node->u.n.htables[i], HASHTABLE_ITERATE(node->u.n.htables[i],
key __attribute__((__unused__)), keylen, key __attribute__((__unused__)), keylen,
...@@ -53,13 +63,14 @@ static unsigned int serialize_brtnode_size_slow(BRTNODE node) { ...@@ -53,13 +63,14 @@ static unsigned int serialize_brtnode_size_slow(BRTNODE node) {
} }
unsigned int serialize_brtnode_size (BRTNODE node) { unsigned int serialize_brtnode_size (BRTNODE node) {
unsigned int result = 4+4; /* size+height */ unsigned int result =brtnode_header_overhead;
assert(sizeof(off_t)==8); assert(sizeof(off_t)==8);
if (node->height>0) { if (node->height>0) {
result+=4; /* n_children */ result+=4; /* n_children */
result+=4; /* subtree fingerpirnt */
result+=4*(node->u.n.n_children-1); /* key lengths */ result+=4*(node->u.n.n_children-1); /* key lengths */
result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */ result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result+=(8+4)*(node->u.n.n_children); /* For each child, a child offset and a count for the number of hash table entries. */ result+=(8+4+4)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, and the subtree fingerprint. */
result+=node->u.n.n_bytes_in_hashtables; result+=node->u.n.n_bytes_in_hashtables;
} else { } else {
result+=4; /* n_entries in buffer table. */ result+=4; /* n_entries in buffer table. */
...@@ -73,7 +84,8 @@ unsigned int serialize_brtnode_size (BRTNODE node) { ...@@ -73,7 +84,8 @@ unsigned int serialize_brtnode_size (BRTNODE node) {
return result; return result;
} }
void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) { void serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) {
//printf("%s:%d serializing\n", __FILE__, __LINE__);
struct wbuf w; struct wbuf w;
int i; int i;
unsigned int calculated_size = serialize_brtnode_size(node); unsigned int calculated_size = serialize_brtnode_size(node);
...@@ -82,11 +94,33 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) { ...@@ -82,11 +94,33 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
assert(size>0); assert(size>0);
wbuf_init(&w, buf, size); wbuf_init(&w, buf, size);
//printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]); //printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]);
wbuf_literal_bytes(&w, "toku", 4);
if (node->height==0) wbuf_literal_bytes(&w, "leaf", 4);
else wbuf_literal_bytes(&w, "node", 4);
wbuf_int(&w, node->layout_version);
wbuf_ulonglong(&w, node->lsn.lsn);
//printf("%s:%d %lld.calculated_size=%d\n", __FILE__, __LINE__, off, calculated_size);
wbuf_int(&w, calculated_size); wbuf_int(&w, calculated_size);
wbuf_int(&w, node->height); wbuf_int(&w, node->height);
//printf("%s:%d %lld rand=%08x sum=%08x height=%d\n", __FILE__, __LINE__, node->thisnodename, node->rand4fingerprint, node->subtree_fingerprint, node->height);
wbuf_int(&w, node->rand4fingerprint);
wbuf_int(&w, node->local_fingerprint);
//printf("%s:%d local_fingerprint=%8x\n", __FILE__, __LINE__, node->local_fingerprint);
//printf("%s:%d w.ndone=%d n_children=%d\n", __FILE__, __LINE__, w.ndone, node->n_children); //printf("%s:%d w.ndone=%d n_children=%d\n", __FILE__, __LINE__, w.ndone, node->n_children);
if (node->height>0) { if (node->height>0) {
// The subtree fingerprint is not actually stored while in main memory. Must calculate it.
// Add the child subtree fingerprints to the local fingerprint to get the subtree fingerprint.
{
u_int32_t subtree_fingerprint = node->local_fingerprint;
for (i=0; i<node->u.n.n_children; i++) {
subtree_fingerprint += node->u.n.child_subtree_fingerprints[i];
}
wbuf_int(&w, subtree_fingerprint);
}
wbuf_int(&w, node->u.n.n_children); wbuf_int(&w, node->u.n.n_children);
for (i=0; i<node->u.n.n_children; i++) {
wbuf_int(&w, node->u.n.child_subtree_fingerprints[i]);
}
//printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone); //printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone);
for (i=0; i<node->u.n.n_children-1; i++) { for (i=0; i<node->u.n.n_children-1; i++) {
wbuf_bytes(&w, node->u.n.childkeys[i], node->u.n.childkeylens[i]); wbuf_bytes(&w, node->u.n.childkeys[i], node->u.n.childkeylens[i]);
...@@ -99,21 +133,37 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) { ...@@ -99,21 +133,37 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
{ {
int n_hash_tables = node->u.n.n_children; int n_hash_tables = node->u.n.n_children;
u_int32_t check_local_fingerprint = 0;
for (i=0; i< n_hash_tables; i++) { for (i=0; i< n_hash_tables; i++) {
//printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i])); //printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i]));
wbuf_int(&w, toku_hashtable_n_entries(node->u.n.htables[i])); wbuf_int(&w, toku_hashtable_n_entries(node->u.n.htables[i]));
HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type, HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
(wbuf_char(&w, type), wbuf_bytes(&w, key, keylen), ({
wbuf_bytes(&w, data, datalen))); wbuf_char(&w, type);
wbuf_bytes(&w, key, keylen);
wbuf_bytes(&w, data, datalen);
check_local_fingerprint+=node->rand4fingerprint*toku_calccrc32_cmd(type, key, keylen, data, datalen);
}));
} }
//printf("%s:%d check_local_fingerprint=%8x\n", __FILE__, __LINE__, check_local_fingerprint);
assert(check_local_fingerprint==node->local_fingerprint);
} }
} else { } else {
//printf(" n_entries=%d\n", pma_n_entries(node->u.l.buffer));
wbuf_int(&w, pma_n_entries(node->u.l.buffer)); wbuf_int(&w, pma_n_entries(node->u.l.buffer));
PMA_ITERATE(node->u.l.buffer, key, keylen, data, datalen, PMA_ITERATE(node->u.l.buffer, key, keylen, data, datalen,
(wbuf_bytes(&w, key, keylen), (wbuf_bytes(&w, key, keylen),
wbuf_bytes(&w, data, datalen))); wbuf_bytes(&w, data, datalen)));
} }
assert(w.ndone<=w.size); assert(w.ndone<=w.size);
#ifdef CRC_ATEND
wbuf_int(&w, crc32(toku_null_crc, w.buf, w.ndone));
#endif
#ifdef CRC_INCR
wbuf_int(&w, w.crc32);
#endif
//write_now: printf("%s:%d Writing %d bytes\n", __FILE__, __LINE__, w.ndone);
{ {
ssize_t r=pwrite(fd, w.buf, w.ndone, off); ssize_t r=pwrite(fd, w.buf, w.ndone, off);
if (r<0) printf("r=%ld errno=%d\n", (long)r, errno); if (r<0) printf("r=%ld errno=%d\n", (long)r, errno);
...@@ -128,11 +178,11 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) { ...@@ -128,11 +178,11 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
toku_free(buf); toku_free(buf);
} }
int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize) { int deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int nodesize) {
TAGMALLOC(BRTNODE, result); TAGMALLOC(BRTNODE, result);
struct rbuf rc; struct rbuf rc;
int i; int i;
uint32_t datasize; u_int32_t datasize;
int r; int r;
if (errno!=0) { if (errno!=0) {
r=errno; r=errno;
...@@ -140,8 +190,8 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -140,8 +190,8 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
return r; return r;
} }
{ {
uint32_t datasize_n; u_int32_t datasize_n;
r = pread(fd, &datasize_n, sizeof(datasize_n), off); r = pread(fd, &datasize_n, sizeof(datasize_n), off +8+4+8);
//printf("%s:%d r=%d the datasize=%d\n", __FILE__, __LINE__, r, ntohl(datasize_n)); //printf("%s:%d r=%d the datasize=%d\n", __FILE__, __LINE__, r, ntohl(datasize_n));
if (r!=sizeof(datasize_n)) { if (r!=sizeof(datasize_n)) {
if (r==-1) r=errno; if (r==-1) r=errno;
...@@ -152,6 +202,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -152,6 +202,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
if (datasize<=0 || datasize>(1<<30)) { r = DB_BADFORMAT; goto died0; } if (datasize<=0 || datasize>(1<<30)) { r = DB_BADFORMAT; goto died0; }
} }
rc.buf=toku_malloc(datasize); rc.buf=toku_malloc(datasize);
//printf("%s:%d errno=%d\n", __FILE__, __LINE__, errno);
if (errno!=0) { if (errno!=0) {
if (0) { died1: toku_free(rc.buf); } if (0) { died1: toku_free(rc.buf); }
r=errno; r=errno;
...@@ -162,10 +213,30 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -162,10 +213,30 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
rc.ndone=0; rc.ndone=0;
//printf("Deserializing %lld datasize=%d\n", off, datasize); //printf("Deserializing %lld datasize=%d\n", off, datasize);
{ {
ssize_t r=pread(fd, rc.buf, datasize, off); ssize_t rlen=pread(fd, rc.buf, datasize, off);
if ((size_t)r!=datasize) { r=errno; goto died1; } //printf("%s:%d pread->%d datasize=%d\n", __FILE__, __LINE__, r, datasize);
if ((size_t)rlen!=datasize) {
//printf("%s:%d size messed up\n", __FILE__, __LINE__);
r=errno;
goto died1;
}
//printf("Got %d %d %d %d\n", rc.buf[0], rc.buf[1], rc.buf[2], rc.buf[3]); //printf("Got %d %d %d %d\n", rc.buf[0], rc.buf[1], rc.buf[2], rc.buf[3]);
} }
{
bytevec tmp;
rbuf_literal_bytes(&rc, &tmp, 8);
if (memcmp(tmp, "tokuleaf", 8)!=0
&& memcmp(tmp, "tokunode", 8)!=0) {
r = DB_BADFORMAT;
goto died1;
}
}
result->layout_version = rbuf_int(&rc);
if (result->layout_version!=0) {
r=DB_BADFORMAT;
goto died1;
}
result->lsn.lsn = rbuf_ulonglong(&rc);
{ {
unsigned int stored_size = rbuf_int(&rc); unsigned int stored_size = rbuf_int(&rc);
if (stored_size!=datasize) { r=DB_BADFORMAT; goto died1; } if (stored_size!=datasize) { r=DB_BADFORMAT; goto died1; }
...@@ -173,11 +244,14 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -173,11 +244,14 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->nodesize = nodesize; // How to compute the nodesize? result->nodesize = nodesize; // How to compute the nodesize?
result->thisnodename = off; result->thisnodename = off;
result->height = rbuf_int(&rc); result->height = rbuf_int(&rc);
result->rand4fingerprint = rbuf_int(&rc);
result->local_fingerprint = rbuf_int(&rc);
result->dirty = 0; result->dirty = 0;
//printf("height==%d\n", result->height); //printf("height==%d\n", result->height);
if (result->height>0) { if (result->height>0) {
result->u.n.totalchildkeylens=0; result->u.n.totalchildkeylens=0;
for (i=0; i<TREE_FANOUT; i++) { for (i=0; i<TREE_FANOUT; i++) {
result->u.n.child_subtree_fingerprints[i]=0;
result->u.n.childkeys[i]=0; result->u.n.childkeys[i]=0;
result->u.n.childkeylens[i]=0; result->u.n.childkeylens[i]=0;
} }
...@@ -187,9 +261,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -187,9 +261,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->u.n.n_bytes_in_hashtable[i]=0; result->u.n.n_bytes_in_hashtable[i]=0;
result->u.n.n_cursors[i]=0; result->u.n.n_cursors[i]=0;
} }
u_int32_t subtree_fingerprint = rbuf_int(&rc);
u_int32_t check_subtree_fingerprint = 0;
result->u.n.n_children = rbuf_int(&rc); result->u.n.n_children = rbuf_int(&rc);
//printf("n_children=%d\n", result->n_children); //printf("n_children=%d\n", result->n_children);
assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT); assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT);
for (i=0; i<result->u.n.n_children; i++) {
u_int32_t childfp = rbuf_int(&rc);
result->u.n.child_subtree_fingerprints[i]= childfp;
check_subtree_fingerprint += childfp;
}
for (i=0; i<result->u.n.n_children-1; i++) { for (i=0; i<result->u.n.n_children-1; i++) {
bytevec childkeyptr; bytevec childkeyptr;
rbuf_bytes(&rc, &childkeyptr, &result->u.n.childkeylens[i]); /* Returns a pointer into the rbuf. */ rbuf_bytes(&rc, &childkeyptr, &result->u.n.childkeylens[i]); /* Returns a pointer into the rbuf. */
...@@ -206,7 +287,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -206,7 +287,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
} }
result->u.n.n_bytes_in_hashtables = 0; result->u.n.n_bytes_in_hashtables = 0;
for (i=0; i<result->u.n.n_children; i++) { for (i=0; i<result->u.n.n_children; i++) {
int r=toku_hashtable_create(&result->u.n.htables[i]); r=toku_hashtable_create(&result->u.n.htables[i]);
if (r!=0) { if (r!=0) {
int j; int j;
if (0) { died_12: j=result->u.n.n_bytes_in_hashtables; } if (0) { died_12: j=result->u.n.n_bytes_in_hashtables; }
...@@ -216,6 +297,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -216,6 +297,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
} }
{ {
int cnum; int cnum;
u_int32_t check_local_fingerprint = 0;
for (cnum=0; cnum<result->u.n.n_children; cnum++) { for (cnum=0; cnum<result->u.n.n_children; cnum++) {
int n_in_this_hash = rbuf_int(&rc); int n_in_this_hash = rbuf_int(&rc);
//printf("%d in hash\n", n_in_hash); //printf("%d in hash\n", n_in_hash);
...@@ -228,9 +310,10 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -228,9 +310,10 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
type = rbuf_char(&rc); type = rbuf_char(&rc);
rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */ rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
rbuf_bytes(&rc, &val, &vallen); rbuf_bytes(&rc, &val, &vallen);
check_local_fingerprint += result->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, val, vallen);
//printf("Found %s,%s\n", (char*)key, (char*)val); //printf("Found %s,%s\n", (char*)key, (char*)val);
{ {
int r=toku_hash_insert(result->u.n.htables[cnum], key, keylen, val, vallen, type); /* Copies the data into the hash table. */ r=toku_hash_insert(result->u.n.htables[cnum], key, keylen, val, vallen, type); /* Copies the data into the hash table. */
if (r!=0) { goto died_12; } if (r!=0) { goto died_12; }
} }
diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD; diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
...@@ -239,11 +322,19 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -239,11 +322,19 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
//printf("Inserted\n"); //printf("Inserted\n");
} }
} }
if (check_local_fingerprint != result->local_fingerprint) {
fprintf(stderr, "%s:%d local fingerprint is wrong (found %8x calcualted %8x\n", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint);
return DB_BADFORMAT;
}
if (check_subtree_fingerprint+check_local_fingerprint != subtree_fingerprint) {
fprintf(stderr, "%s:%d subtree fingerprint is wrong\n", __FILE__, __LINE__);
return DB_BADFORMAT;
}
} }
} else { } else {
int n_in_buf = rbuf_int(&rc); int n_in_buf = rbuf_int(&rc);
result->u.l.n_bytes_in_buffer = 0; result->u.l.n_bytes_in_buffer = 0;
int r=pma_create(&result->u.l.buffer, default_compare_fun, nodesize); r=pma_create(&result->u.l.buffer, default_compare_fun, nodesize);
if (r!=0) { if (r!=0) {
if (0) { died_21: pma_free(&result->u.l.buffer); } if (0) { died_21: pma_free(&result->u.l.buffer); }
goto died1; goto died1;
...@@ -253,7 +344,6 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -253,7 +344,6 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
#if BRT_USE_PMA_BULK_INSERT #if BRT_USE_PMA_BULK_INSERT
{ {
DBT keys[n_in_buf], vals[n_in_buf]; DBT keys[n_in_buf], vals[n_in_buf];
int r;
for (i=0; i<n_in_buf; i++) { for (i=0; i<n_in_buf; i++) {
bytevec key; ITEMLEN keylen; bytevec key; ITEMLEN keylen;
...@@ -266,8 +356,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -266,8 +356,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD; result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD;
} }
if (n_in_buf > 0) { if (n_in_buf > 0) {
r = pma_bulk_insert(result->u.l.buffer, keys, vals, n_in_buf); u_int32_t actual_sum = 0;
r = pma_bulk_insert(result->u.l.buffer, keys, vals, n_in_buf, result->rand4fingerprint, &actual_sum);
if (r!=0) goto died_21; if (r!=0) goto died_21;
if (actual_sum!=result->local_fingerprint) {
//fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf);
return DB_BADFORMAT;
goto died_21;
} else {
//fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height);
}
} }
} }
#else #else
...@@ -279,13 +377,27 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz ...@@ -279,13 +377,27 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
rbuf_bytes(&rc, &val, &vallen); rbuf_bytes(&rc, &val, &vallen);
{ {
DBT k,v; DBT k,v;
int r = pma_insert(result->u.l.buffer, fill_dbt(&k, key, keylen), fill_dbt(&v, val, vallen), 0); r = pma_insert(result->u.l.buffer, fill_dbt(&k, key, keylen), fill_dbt(&v, val, vallen), 0);
if (r!=0) goto died_21; if (r!=0) goto died_21;
} }
result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD; result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD;
} }
#endif #endif
} }
{
unsigned int n_read_so_far = rc.ndone;
if (n_read_so_far+4!=rc.size) {
r = DB_BADFORMAT; goto died_21;
}
uint32_t crc = toku_crc32(toku_null_crc, rc.buf, n_read_so_far);
uint32_t storedcrc = rbuf_int(&rc);
if (crc!=storedcrc) {
printf("Bad CRC\n");
assert(0);//this is wrong!!!
r = DB_BADFORMAT;
goto died_21;
}
}
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children); //printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
toku_free(rc.buf); toku_free(rc.buf);
*brtnode = result; *brtnode = result;
...@@ -302,6 +414,8 @@ void verify_counts (BRTNODE node) { ...@@ -302,6 +414,8 @@ void verify_counts (BRTNODE node) {
int i; int i;
for (i=0; i<node->u.n.n_children; i++) for (i=0; i<node->u.n.n_children; i++)
sum += node->u.n.n_bytes_in_hashtable[i]; sum += node->u.n.n_bytes_in_hashtable[i];
// We don't really care if the later hashtables have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized.
// But for now the code always initializes the later tables so they are 0.
for (; i<TREE_FANOUT+1; i++) { for (; i<TREE_FANOUT+1; i++) {
assert(node->u.n.n_bytes_in_hashtable[i]==0); assert(node->u.n.n_bytes_in_hashtable[i]==0);
} }
...@@ -313,7 +427,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) { ...@@ -313,7 +427,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
struct wbuf w; struct wbuf w;
int i; int i;
unsigned int size=0; /* I don't want to mess around calculating it exactly. */ unsigned int size=0; /* I don't want to mess around calculating it exactly. */
size += 4+4+8+8+4; /* this size, the tree's nodesize, freelist, unused_memory, nnamed_rootse. */ size += 4+4+4+8+8+4; /* this size, flags, the tree's nodesize, freelist, unused_memory, nnamed_rootse. */
if (h->n_named_roots<0) { if (h->n_named_roots<0) {
size+=8; size+=8;
} else { } else {
...@@ -321,10 +435,9 @@ int serialize_brt_header_to (int fd, struct brt_header *h) { ...@@ -321,10 +435,9 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
size+=12 + 1 + strlen(h->names[i]); size+=12 + 1 + strlen(h->names[i]);
} }
} }
w.buf = toku_malloc(size); wbuf_init(&w, toku_malloc(size), size);
w.size = size;
w.ndone = 0;
wbuf_int (&w, size); wbuf_int (&w, size);
wbuf_int (&w, h->flags);
wbuf_int (&w, h->nodesize); wbuf_int (&w, h->nodesize);
wbuf_diskoff(&w, h->freelist); wbuf_diskoff(&w, h->freelist);
wbuf_diskoff(&w, h->unused_memory); wbuf_diskoff(&w, h->unused_memory);
...@@ -350,7 +463,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) { ...@@ -350,7 +463,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
return 0; return 0;
} }
int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) { int deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth) {
//printf("%s:%d calling MALLOC\n", __FILE__, __LINE__); //printf("%s:%d calling MALLOC\n", __FILE__, __LINE__);
struct brt_header *MALLOC(h); struct brt_header *MALLOC(h);
struct rbuf rc; struct rbuf rc;
...@@ -376,6 +489,7 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) { ...@@ -376,6 +489,7 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
h->dirty=0; h->dirty=0;
sizeagain = rbuf_int(&rc); sizeagain = rbuf_int(&rc);
assert(sizeagain==size); assert(sizeagain==size);
h->flags = rbuf_int(&rc);
h->nodesize = rbuf_int(&rc); h->nodesize = rbuf_int(&rc);
h->freelist = rbuf_diskoff(&rc); h->freelist = rbuf_diskoff(&rc);
h->unused_memory = rbuf_diskoff(&rc); h->unused_memory = rbuf_diskoff(&rc);
...@@ -403,3 +517,4 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) { ...@@ -403,3 +517,4 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
*brth = h; *brth = h;
return 0; return 0;
} }
...@@ -24,7 +24,7 @@ static void test0 (void) { ...@@ -24,7 +24,7 @@ static void test0 (void) {
printf("%s:%d test0\n", __FILE__, __LINE__); printf("%s:%d test0\n", __FILE__, __LINE__);
memory_check=1; memory_check=1;
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
printf("%s:%d test0\n", __FILE__, __LINE__); printf("%s:%d test0\n", __FILE__, __LINE__);
unlink(fname); unlink(fname);
...@@ -47,7 +47,7 @@ static void test1 (void) { ...@@ -47,7 +47,7 @@ static void test1 (void) {
DBT k,v; DBT k,v;
memory_check=1; memory_check=1;
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
...@@ -74,7 +74,7 @@ static void test2 (int memcheck) { ...@@ -74,7 +74,7 @@ static void test2 (int memcheck) {
memory_check=memcheck; memory_check=memcheck;
printf("%s:%d checking\n", __FILE__, __LINE__); printf("%s:%d checking\n", __FILE__, __LINE__);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
printf("%s:%d did setup\n", __FILE__, __LINE__); printf("%s:%d did setup\n", __FILE__, __LINE__);
...@@ -112,7 +112,7 @@ static void test3 (int nodesize, int count, int memcheck) { ...@@ -112,7 +112,7 @@ static void test3 (int nodesize, int count, int memcheck) {
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
memory_check=memcheck; memory_check=memcheck;
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
gettimeofday(&t0, 0); gettimeofday(&t0, 0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun);
...@@ -145,7 +145,7 @@ static void test4 (int nodesize, int count, int memcheck) { ...@@ -145,7 +145,7 @@ static void test4 (int nodesize, int count, int memcheck) {
unlink(fname); unlink(fname);
memory_check=memcheck; memory_check=memcheck;
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0); r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0);
for (i=0; i<count; i++) { for (i=0; i<count; i++) {
char key[100],val[100]; char key[100],val[100];
...@@ -177,7 +177,7 @@ static void test5 (void) { ...@@ -177,7 +177,7 @@ static void test5 (void) {
MALLOC_N(limit,values); MALLOC_N(limit,values);
for (i=0; i<limit; i++) values[i]=-1; for (i=0; i<limit; i++) values[i]=-1;
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(fname, 0, 1, &t, 1<<12, ct, default_compare_fun); assert(r==0);
for (i=0; i<limit/2; i++) { for (i=0; i<limit/2; i++) {
char key[100],val[100]; char key[100],val[100];
...@@ -218,7 +218,7 @@ static void test_dump_empty_db (void) { ...@@ -218,7 +218,7 @@ static void test_dump_empty_db (void) {
int r; int r;
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
memory_check=1; memory_check=1;
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
...@@ -240,7 +240,7 @@ static void test_multiple_files_of_size (int size) { ...@@ -240,7 +240,7 @@ static void test_multiple_files_of_size (int size) {
unlink(n0); unlink(n0);
unlink(n1); unlink(n1);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, 0, 1, &t0, size, ct, default_compare_fun); assert(r==0); r = open_brt(n0, 0, 1, &t0, size, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, 0, 1, &t1, size, ct, default_compare_fun); assert(r==0); r = open_brt(n1, 0, 1, &t1, size, ct, default_compare_fun); assert(r==0);
for (i=0; i<10000; i++) { for (i=0; i<10000; i++) {
...@@ -264,7 +264,7 @@ static void test_multiple_files_of_size (int size) { ...@@ -264,7 +264,7 @@ static void test_multiple_files_of_size (int size) {
memory_check_all_free(); memory_check_all_free();
/* Now see if the data is all there. */ /* Now see if the data is all there. */
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, 0, 0, &t0, 1<<12, ct, default_compare_fun); r = open_brt(n0, 0, 0, &t0, 1<<12, ct, default_compare_fun);
printf("%s:%d r=%d\n", __FILE__, __LINE__,r); printf("%s:%d r=%d\n", __FILE__, __LINE__,r);
assert(r==0); assert(r==0);
...@@ -309,7 +309,7 @@ static void test_named_db (void) { ...@@ -309,7 +309,7 @@ static void test_named_db (void) {
unlink(n0); unlink(n0);
unlink(n1); unlink(n1);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
...@@ -320,7 +320,7 @@ static void test_named_db (void) { ...@@ -320,7 +320,7 @@ static void test_named_db (void) {
memory_check_all_free(); memory_check_all_free();
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
{ {
...@@ -346,7 +346,7 @@ static void test_multiple_dbs (void) { ...@@ -346,7 +346,7 @@ static void test_multiple_dbs (void) {
unlink(n0); unlink(n0);
unlink(n1); unlink(n1);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, "db2", 1, &t1, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n1, "db2", 1, &t1, 1<<12, ct, default_compare_fun); assert(r==0);
...@@ -359,7 +359,7 @@ static void test_multiple_dbs (void) { ...@@ -359,7 +359,7 @@ static void test_multiple_dbs (void) {
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, "db2", 0, &t1, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n1, "db2", 0, &t1, 1<<12, ct, default_compare_fun); assert(r==0);
...@@ -399,7 +399,7 @@ static void test_multiple_dbs_many (void) { ...@@ -399,7 +399,7 @@ static void test_multiple_dbs_many (void) {
printf("test_multiple_dbs_many:\n"); printf("test_multiple_dbs_many:\n");
memory_check_all_free(); memory_check_all_free();
unlink(name); unlink(name);
r = brt_create_cachetable(&ct, MANYN+4); assert(r==0); r = brt_create_cachetable(&ct, MANYN+4, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<MANYN; i++) { for (i=0; i<MANYN; i++) {
char dbname[20]; char dbname[20];
snprintf(dbname, 20, "db%d", i); snprintf(dbname, 20, "db%d", i);
...@@ -430,7 +430,7 @@ static void test_multiple_brts_one_db_one_file (void) { ...@@ -430,7 +430,7 @@ static void test_multiple_brts_one_db_one_file (void) {
printf("test_multiple_brts_one_db_one_file:"); printf("test_multiple_brts_one_db_one_file:");
memory_check_all_free(); memory_check_all_free();
unlink(name); unlink(name);
r = brt_create_cachetable(&ct, 32); assert(r==0); r = brt_create_cachetable(&ct, 32, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<MANYN; i++) { for (i=0; i<MANYN; i++) {
r = open_brt(name, 0, (i==0), &trees[i], 1<<12, ct, default_compare_fun); r = open_brt(name, 0, (i==0), &trees[i], 1<<12, ct, default_compare_fun);
assert(r==0); assert(r==0);
...@@ -468,14 +468,13 @@ static void test_read_what_was_written (void) { ...@@ -468,14 +468,13 @@ static void test_read_what_was_written (void) {
BRT brt; BRT brt;
int r; int r;
const int NVALS=10000; const int NVALS=10000;
DBT k,v;
printf("test_read_what_was_written(): "); fflush(stdout); printf("test_read_what_was_written(): "); fflush(stdout);
unlink(n); unlink(n);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
r = close_brt(brt); assert(r==0); r = close_brt(brt); assert(r==0);
r = cachetable_close(&ct); assert(r==0); r = cachetable_close(&ct); assert(r==0);
...@@ -483,11 +482,14 @@ static void test_read_what_was_written (void) { ...@@ -483,11 +482,14 @@ static void test_read_what_was_written (void) {
memory_check_all_free(); memory_check_all_free();
/* Now see if we can read an empty tree in. */ /* Now see if we can read an empty tree in. */
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
/* See if we can put something in it. */ /* See if we can put something in it. */
{
DBT k,v;
brt_insert(brt, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), null_db, null_txn); brt_insert(brt, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), null_db, null_txn);
}
r = close_brt(brt); assert(r==0); r = close_brt(brt); assert(r==0);
r = cachetable_close(&ct); assert(r==0); r = cachetable_close(&ct); assert(r==0);
...@@ -495,10 +497,11 @@ static void test_read_what_was_written (void) { ...@@ -495,10 +497,11 @@ static void test_read_what_was_written (void) {
memory_check_all_free(); memory_check_all_free();
/* Now see if we can read it in and get the value. */ /* Now see if we can read it in and get the value. */
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
{ {
DBT k,v;
r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0); r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0);
assert(r==0); assert(r==0);
assert(v.size==6); assert(v.size==6);
...@@ -507,7 +510,7 @@ static void test_read_what_was_written (void) { ...@@ -507,7 +510,7 @@ static void test_read_what_was_written (void) {
assert(verify_brt(brt)==0); assert(verify_brt(brt)==0);
/* Now put a bunch (VALS) of things in. */ /* Now put a bunch (NVALS) of things in. */
{ {
int i; int i;
for (i=0; i<NVALS; i++) { for (i=0; i<NVALS; i++) {
...@@ -554,6 +557,7 @@ static void test_read_what_was_written (void) { ...@@ -554,6 +557,7 @@ static void test_read_what_was_written (void) {
int i; int i;
for (i=0; i<NVALS; i++) { for (i=0; i<NVALS; i++) {
char key[100],expectedval[100]; char key[100],expectedval[100];
DBT k,v;
snprintf(key, 100, "key%d", i); snprintf(key, 100, "key%d", i);
snprintf(expectedval, 100, "val%d", i); snprintf(expectedval, 100, "val%d", i);
r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0); r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0);
...@@ -569,10 +573,11 @@ static void test_read_what_was_written (void) { ...@@ -569,10 +573,11 @@ static void test_read_what_was_written (void) {
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
{ {
DBT k,v;
r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0); r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0);
assert(r==0); assert(r==0);
assert(v.size==6); assert(v.size==6);
...@@ -582,6 +587,7 @@ static void test_read_what_was_written (void) { ...@@ -582,6 +587,7 @@ static void test_read_what_was_written (void) {
int i; int i;
for (i=0; i<NVALS; i++) { for (i=0; i<NVALS; i++) {
char key[100],expectedval[100]; char key[100],expectedval[100];
DBT k,v;
snprintf(key, 100, "key%d", i); snprintf(key, 100, "key%d", i);
snprintf(expectedval, 100, "val%d", i); snprintf(expectedval, 100, "val%d", i);
r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0); r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0);
...@@ -614,7 +620,7 @@ void test_cursor_last_empty(void) { ...@@ -614,7 +620,7 @@ void test_cursor_last_empty(void) {
unlink(n); unlink(n);
memory_check_all_free(); memory_check_all_free();
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
...@@ -646,7 +652,7 @@ void test_cursor_next (void) { ...@@ -646,7 +652,7 @@ void test_cursor_next (void) {
unlink(n); unlink(n);
memory_check_all_free(); memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0); r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
...@@ -692,7 +698,9 @@ DB nonce_db; ...@@ -692,7 +698,9 @@ DB nonce_db;
// Test helper: fill x from (key, keylen) via fill_dbt() and return x.
// When USE_DBT_APP_PRIVATE is enabled, x->app_private is additionally set to the address of nonce.
DBT *fill_b(DBT *x, unsigned char *key, unsigned int keylen) { DBT *fill_b(DBT *x, unsigned char *key, unsigned int keylen) {
fill_dbt(x, key, keylen); fill_dbt(x, key, keylen);
#if USE_DBT_APP_PRIVATE
x->app_private = &nonce; x->app_private = &nonce;
#endif
return x; return x;
} }
...@@ -702,7 +710,9 @@ int wrong_compare_fun(DB *db, const DBT *a, const DBT *b) { ...@@ -702,7 +710,9 @@ int wrong_compare_fun(DB *db, const DBT *a, const DBT *b) {
unsigned char *bd=b->data; unsigned char *bd=b->data;
unsigned int siz=a->size; unsigned int siz=a->size;
assert(a->size==b->size); assert(a->size==b->size);
#if USE_DBT_APP_PRIVATE
assert(a->app_private == &nonce); // a must have the nonce in it, but I don't care if b does. assert(a->app_private == &nonce); // a must have the nonce in it, but I don't care if b does.
#endif
assert(db==&nonce_db); // make sure the db was passed down correctly assert(db==&nonce_db); // make sure the db was passed down correctly
for (i=0; i<siz; i++) { for (i=0; i<siz; i++) {
if (ad[siz-1-i]<bd[siz-1-i]) return -1; if (ad[siz-1-i]<bd[siz-1-i]) return -1;
...@@ -732,8 +742,8 @@ static void test_wrongendian_compare (int wrong_p, unsigned int N) { ...@@ -732,8 +742,8 @@ static void test_wrongendian_compare (int wrong_p, unsigned int N) {
assert(wrong_compare_fun(&nonce_db, fill_dbt_ap(&at, b, 4, &nonce), fill_dbt(&bt, a, 4))<0); assert(wrong_compare_fun(&nonce_db, fill_dbt_ap(&at, b, 4, &nonce), fill_dbt(&bt, a, 4))<0);
} }
r = brt_create_cachetable(&ct, 0); assert(r==0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
printf("%s:%d WRONG=%d\n", __FILE__, __LINE__, wrong_p); //printf("%s:%d WRONG=%d\n", __FILE__, __LINE__, wrong_p);
if (0) { // ???? Why is this commented out? if (0) { // ???? Why is this commented out?
r = open_brt(n, 0, 1, &brt, 1<<20, ct, wrong_p ? wrong_compare_fun : default_compare_fun); assert(r==0); r = open_brt(n, 0, 1, &brt, 1<<20, ct, wrong_p ? wrong_compare_fun : default_compare_fun); assert(r==0);
...@@ -832,7 +842,9 @@ void clear_test_db() { ...@@ -832,7 +842,9 @@ void clear_test_db() {
// Key-comparison callback used by the cursor tests.
// Asserts that db is test_db and, when USE_DBT_APP_PRIVATE is enabled, that a->app_private is test_app_private;
// then returns the result of keycompare() on the data and size of a and b.
int test_brt_cursor_keycompare(DB *db, const DBT *a, const DBT *b) { int test_brt_cursor_keycompare(DB *db, const DBT *a, const DBT *b) {
assert(db == test_db); assert(db == test_db);
#if USE_DBT_APP_PRIVATE
assert(a->app_private == test_app_private); assert(a->app_private == test_app_private);
#endif
return keycompare(a->data, a->size, b->data, b->size); return keycompare(a->data, a->size, b->data, b->size);
} }
...@@ -844,7 +856,7 @@ void assert_cursor_notfound(BRT brt, int position, DB *db, void *app_private) { ...@@ -844,7 +856,7 @@ void assert_cursor_notfound(BRT brt, int position, DB *db, void *app_private) {
r = brt_cursor(brt, &cursor); r = brt_cursor(brt, &cursor);
assert(r==0); assert(r==0);
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn);
assert(r == DB_NOTFOUND); assert(r == DB_NOTFOUND);
...@@ -863,7 +875,7 @@ void assert_cursor_value(BRT brt, int position, long long value, DB *db, void *a ...@@ -863,7 +875,7 @@ void assert_cursor_value(BRT brt, int position, long long value, DB *db, void *a
assert(r==0); assert(r==0);
if (test_cursor_debug) printf("key: "); if (test_cursor_debug) printf("key: ");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn);
assert(r == 0); assert(r == 0);
...@@ -889,7 +901,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db ...@@ -889,7 +901,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db
assert(r==0); assert(r==0);
if (test_cursor_debug) printf("first key: "); if (test_cursor_debug) printf("first key: ");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_FIRST, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_FIRST, db, null_txn);
assert(r == 0); assert(r == 0);
...@@ -902,7 +914,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db ...@@ -902,7 +914,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db
if (test_cursor_debug) printf("\n"); if (test_cursor_debug) printf("\n");
if (test_cursor_debug) printf("last key:"); if (test_cursor_debug) printf("last key:");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_LAST, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_LAST, db, null_txn);
assert(r == 0); assert(r == 0);
...@@ -931,7 +943,7 @@ void test_brt_cursor_first(int n, DB *db) { ...@@ -931,7 +943,7 @@ void test_brt_cursor_first(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -977,7 +989,7 @@ void test_brt_cursor_last(int n, DB *db) { ...@@ -977,7 +989,7 @@ void test_brt_cursor_last(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1023,7 +1035,7 @@ void test_brt_cursor_first_last(int n, DB *db) { ...@@ -1023,7 +1035,7 @@ void test_brt_cursor_first_last(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1070,7 +1082,7 @@ void test_brt_cursor_rfirst(int n, DB *db) { ...@@ -1070,7 +1082,7 @@ void test_brt_cursor_rfirst(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1116,7 +1128,7 @@ void assert_cursor_walk(BRT brt, int n, DB *db, void *app_private) { ...@@ -1116,7 +1128,7 @@ void assert_cursor_walk(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt; DBT kbt, vbt;
long long v; long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0) if (r != 0)
...@@ -1148,7 +1160,7 @@ void test_brt_cursor_walk(int n, DB *db) { ...@@ -1148,7 +1160,7 @@ void test_brt_cursor_walk(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1192,7 +1204,7 @@ void assert_cursor_rwalk(BRT brt, int n, DB *db, void *app_private) { ...@@ -1192,7 +1204,7 @@ void assert_cursor_rwalk(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt; DBT kbt, vbt;
long long v; long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_PREV, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_PREV, db, null_txn);
if (r != 0) if (r != 0)
...@@ -1224,7 +1236,7 @@ void test_brt_cursor_rwalk(int n, DB *db) { ...@@ -1224,7 +1236,7 @@ void test_brt_cursor_rwalk(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1270,7 +1282,7 @@ void assert_cursor_walk_inorder(BRT brt, int n, DB *db, void *app_private) { ...@@ -1270,7 +1282,7 @@ void assert_cursor_walk_inorder(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt; DBT kbt, vbt;
long long v; long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0) if (r != 0)
...@@ -1306,7 +1318,7 @@ void test_brt_cursor_rand(int n, DB *db) { ...@@ -1306,7 +1318,7 @@ void test_brt_cursor_rand(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1362,7 +1374,7 @@ void test_brt_cursor_split(int n, DB *db) { ...@@ -1362,7 +1374,7 @@ void test_brt_cursor_split(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1385,7 +1397,7 @@ void test_brt_cursor_split(int n, DB *db) { ...@@ -1385,7 +1397,7 @@ void test_brt_cursor_split(int n, DB *db) {
if (test_cursor_debug) printf("key: "); if (test_cursor_debug) printf("key: ");
for (i=0; i<n/2; i++) { for (i=0; i<n/2; i++) {
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = &my_app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, &my_app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
assert(r==0); assert(r==0);
...@@ -1408,7 +1420,7 @@ void test_brt_cursor_split(int n, DB *db) { ...@@ -1408,7 +1420,7 @@ void test_brt_cursor_split(int n, DB *db) {
if (test_cursor_debug) printf("key: "); if (test_cursor_debug) printf("key: ");
for (;;) { for (;;) {
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = &my_app_private; init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, &my_app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC; init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn); r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0) if (r != 0)
...@@ -1444,7 +1456,7 @@ void test_multiple_brt_cursors(int n, DB *db) { ...@@ -1444,7 +1456,7 @@ void test_multiple_brt_cursors(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1498,7 +1510,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) { ...@@ -1498,7 +1510,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
int nodesize = 1<<12; int nodesize = 1<<12;
int h = log16(n); int h = log16(n);
int cachesize = 2 * h * ncursors * nodesize; int cachesize = 2 * h * ncursors * nodesize;
r = brt_create_cachetable_size(&ct, 127, cachesize); r = brt_create_cachetable(&ct, cachesize, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1527,7 +1539,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) { ...@@ -1527,7 +1539,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
/* point cursor i / cursor_gap to the current last key i */ /* point cursor i / cursor_gap to the current last key i */
if ((i % cursor_gap) == 0) { if ((i % cursor_gap) == 0) {
c = i / cursor_gap; c = i / cursor_gap;
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private; init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC; init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursors[c], &key, &val, DB_LAST, db, null_txn); r = brt_cursor_get(cursors[c], &key, &val, DB_LAST, db, null_txn);
assert(r == 0); assert(r == 0);
...@@ -1539,7 +1551,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) { ...@@ -1539,7 +1551,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
/* walk the cursors by cursor_gap */ /* walk the cursors by cursor_gap */
for (i=0; i<cursor_gap; i++) { for (i=0; i<cursor_gap; i++) {
for (c=0; c<ncursors; c++) { for (c=0; c<ncursors; c++) {
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private; init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC; init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursors[c], &key, &val, DB_NEXT, db, null_txn); r = brt_cursor_get(cursors[c], &key, &val, DB_NEXT, db, null_txn);
if (r == DB_NOTFOUND) { if (r == DB_NOTFOUND) {
...@@ -1584,7 +1596,7 @@ void test_brt_cursor_set(int n, int cursor_op, DB *db) { ...@@ -1584,7 +1596,7 @@ void test_brt_cursor_set(int n, int cursor_op, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1659,7 +1671,7 @@ void test_brt_cursor_set_range(int n, DB *db) { ...@@ -1659,7 +1671,7 @@ void test_brt_cursor_set_range(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1730,7 +1742,7 @@ void test_brt_cursor_delete(int n, DB *db) { ...@@ -1730,7 +1742,7 @@ void test_brt_cursor_delete(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
error = brt_create_cachetable(&ct, 0); error = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(error == 0); assert(error == 0);
error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1755,7 +1767,7 @@ void test_brt_cursor_delete(int n, DB *db) { ...@@ -1755,7 +1767,7 @@ void test_brt_cursor_delete(int n, DB *db) {
/* walk the tree and delete under the cursor */ /* walk the tree and delete under the cursor */
for (;;) { for (;;) {
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private; init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC; init_dbt(&val); val.flags = DB_DBT_MALLOC;
error = brt_cursor_get(cursor, &key, &val, DB_NEXT, db, null_txn); error = brt_cursor_get(cursor, &key, &val, DB_NEXT, db, null_txn);
if (error == DB_NOTFOUND) if (error == DB_NOTFOUND)
...@@ -1796,7 +1808,7 @@ void test_brt_cursor_get_both(int n, DB *db) { ...@@ -1796,7 +1808,7 @@ void test_brt_cursor_get_both(int n, DB *db) {
set_test_db_app(db, &my_app_private); set_test_db_app(db, &my_app_private);
unlink(fname); unlink(fname);
error = brt_create_cachetable(&ct, 0); error = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(error == 0); assert(error == 0);
error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare); error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
...@@ -1900,9 +1912,6 @@ int test_brt_cursor_limit = 10000; ...@@ -1900,9 +1912,6 @@ int test_brt_cursor_limit = 10000;
void test_brt_cursor(DB *db) { void test_brt_cursor(DB *db) {
int n; int n;
int old_brt_do_push_cmd = brt_do_push_cmd;
brt_do_push_cmd = 0;
test_multiple_brt_cursors(1, db); test_multiple_brt_cursors(1, db);
test_multiple_brt_cursors(2, db); test_multiple_brt_cursors(2, db);
test_multiple_brt_cursors(3, db); test_multiple_brt_cursors(3, db);
...@@ -1943,8 +1952,6 @@ void test_brt_cursor(DB *db) { ...@@ -1943,8 +1952,6 @@ void test_brt_cursor(DB *db) {
test_multiple_brt_cursor_walk(10000, db); memory_check_all_free(); test_multiple_brt_cursor_walk(10000, db); memory_check_all_free();
test_multiple_brt_cursor_walk(100000, db); memory_check_all_free(); test_multiple_brt_cursor_walk(100000, db); memory_check_all_free();
test_brt_cursor_get_both(1000, db); memory_check_all_free(); test_brt_cursor_get_both(1000, db); memory_check_all_free();
brt_do_push_cmd = old_brt_do_push_cmd;
} }
void test_large_kv(int bsize, int ksize, int vsize) { void test_large_kv(int bsize, int ksize, int vsize) {
...@@ -1955,7 +1962,7 @@ void test_large_kv(int bsize, int ksize, int vsize) { ...@@ -1955,7 +1962,7 @@ void test_large_kv(int bsize, int ksize, int vsize) {
printf("test_large_kv: %d %d %d\n", bsize, ksize, vsize); printf("test_large_kv: %d %d %d\n", bsize, ksize, vsize);
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, bsize, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, bsize, ct, default_compare_fun);
...@@ -2002,7 +2009,7 @@ void test_brt_delete_empty() { ...@@ -2002,7 +2009,7 @@ void test_brt_delete_empty() {
CACHETABLE ct; CACHETABLE ct;
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
...@@ -2012,7 +2019,7 @@ void test_brt_delete_empty() { ...@@ -2012,7 +2019,7 @@ void test_brt_delete_empty() {
int k = htonl(1); int k = htonl(1);
fill_dbt(&key, &k, sizeof k); fill_dbt(&key, &k, sizeof k);
r = brt_delete(t, &key, 0); r = brt_delete(t, &key, 0);
assert(r != 0); assert(r == 0);
r = close_brt(t); assert(r==0); r = close_brt(t); assert(r==0);
r = cachetable_close(&ct); assert(r==0); r = cachetable_close(&ct); assert(r==0);
...@@ -2031,7 +2038,7 @@ void test_brt_delete_present(int n) { ...@@ -2031,7 +2038,7 @@ void test_brt_delete_present(int n) {
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
int i; int i;
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
...@@ -2093,7 +2100,7 @@ void test_brt_delete_not_present(int n) { ...@@ -2093,7 +2100,7 @@ void test_brt_delete_not_present(int n) {
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
int i; int i;
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
...@@ -2140,7 +2147,7 @@ void test_brt_delete_cursor_first(int n) { ...@@ -2140,7 +2147,7 @@ void test_brt_delete_cursor_first(int n) {
char fname[]="testbrt.brt"; char fname[]="testbrt.brt";
int i; int i;
r = brt_create_cachetable(&ct, 0); r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun); r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
...@@ -2158,12 +2165,30 @@ void test_brt_delete_cursor_first(int n) { ...@@ -2158,12 +2165,30 @@ void test_brt_delete_cursor_first(int n) {
assert(r == 0); assert(r == 0);
} }
/* lookups 0 .. n-1 should succeed */
for (i=0; i<n; i++) {
k = htonl(i);
fill_dbt(&key, &k, sizeof k);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_lookup(t, &key, &val, 0);
assert(r == 0);
assert(val.size == sizeof (int));
int vv;
memcpy(&vv, val.data, val.size);
assert(vv == i);
toku_free(val.data);
}
/* delete 0 .. n-2 */ /* delete 0 .. n-2 */
for (i=0; i<n-1; i++) { for (i=0; i<n-1; i++) {
k = htonl(i); k = htonl(i);
fill_dbt(&key, &k, sizeof k); fill_dbt(&key, &k, sizeof k);
r = brt_delete(t, &key, 0); r = brt_delete(t, &key, 0);
assert(r == 0); assert(r == 0);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_lookup(t, &key, &val, 0);
assert(r == DB_NOTFOUND);
} }
/* lookup of 0 .. n-2 should all fail */ /* lookup of 0 .. n-2 should all fail */
...@@ -2199,6 +2224,56 @@ void test_brt_delete_cursor_first(int n) { ...@@ -2199,6 +2224,56 @@ void test_brt_delete_cursor_first(int n) {
r = cachetable_close(&ct); assert(r==0); r = cachetable_close(&ct); assert(r==0);
} }
/* Regression test for a bug in which a delete removed an insert command that was
   buffered in a nonleaf node, yet a later lookup still found that insert command.
   Inserts keys 0 .. n-1 (the intent is a 2 level tree whose last insertion is still
   buffered), then deletes key n-1 and expects a lookup of n-1 to return DB_NOTFOUND.
   Keys are the htonl() of the index; values are the index itself. */
void test_insert_delete_lookup(int n) {
    char fname[]="testbrt.brt";
    CACHETABLE ct;
    BRT brt;
    DBT key, val;
    int netkey, value;
    int idx;
    int r;

    printf("test_insert_delete_lookup:%d\n", n);

    /* Fresh cachetable and a fresh tree file with 4096-byte nodes. */
    r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
    assert(r==0);
    unlink(fname);
    r = open_brt(fname, 0, 1, &brt, 4096, ct, default_compare_fun);
    assert(r==0);

    /* insert 0 .. n-1 */
    for (idx=0; idx<n; idx++) {
        netkey = htonl(idx);
        value = idx;
        fill_dbt(&key, &netkey, sizeof netkey);
        fill_dbt(&val, &value, sizeof value);
        r = brt_insert(brt, &key, &val, 0, 0);
        assert(r == 0);
    }

    if (n > 0) {
        const int last = n-1;

        /* delete the most recently inserted key ... */
        netkey = htonl(last);
        fill_dbt(&key, &netkey, sizeof netkey);
        r = brt_delete(brt, &key, 0);
        assert(r == 0);

        /* ... and make sure a lookup no longer sees it */
        netkey = htonl(last);
        fill_dbt(&key, &netkey, sizeof netkey);
        init_dbt(&val);
        val.flags = DB_DBT_MALLOC;
        r = brt_lookup(brt, &key, &val, 0);
        assert(r == DB_NOTFOUND);
    }

    r = close_brt(brt);
    assert(r==0);
    r = cachetable_close(&ct);
    assert(r==0);
}
void test_brt_delete() { void test_brt_delete() {
test_brt_delete_empty(); memory_check_all_free(); test_brt_delete_empty(); memory_check_all_free();
test_brt_delete_present(1); memory_check_all_free(); test_brt_delete_present(1); memory_check_all_free();
...@@ -2210,6 +2285,8 @@ void test_brt_delete() { ...@@ -2210,6 +2285,8 @@ void test_brt_delete() {
test_brt_delete_cursor_first(1); memory_check_all_free(); test_brt_delete_cursor_first(1); memory_check_all_free();
test_brt_delete_cursor_first(100); memory_check_all_free(); test_brt_delete_cursor_first(100); memory_check_all_free();
test_brt_delete_cursor_first(500); memory_check_all_free(); test_brt_delete_cursor_first(500); memory_check_all_free();
test_brt_delete_cursor_first(10000); memory_check_all_free();
test_insert_delete_lookup(512); memory_check_all_free();
} }
static void brt_blackbox_test (void) { static void brt_blackbox_test (void) {
...@@ -2272,6 +2349,14 @@ static void brt_blackbox_test (void) { ...@@ -2272,6 +2349,14 @@ static void brt_blackbox_test (void) {
test_brt_delete(); test_brt_delete();
int old_brt_do_push_cmd = brt_do_push_cmd;
brt_do_push_cmd = 0;
test_brt_delete();
test_brt_cursor(db);
brt_do_push_cmd = old_brt_do_push_cmd;
// test3(1<<19, 1<<20, 0); // test3(1<<19, 1<<20, 0);
// test4(1<<19, 1<<20, 0); // test4(1<<19, 1<<20, 0);
......
/* Verify a BRT. */
/* Check:
* the fingerprint of every node (local check)
* the child's fingerprint matches the parent's copy
* the tree is of uniform depth (and the height is correct at every node)
* For non-dup trees: the values to the left are < the values to the right
* and < the pivot
* For dup trees: the values to the left are <= the values to the right
* the pivots are < or <= left values (according to the PresentL bit)
* the pivots are > or >= right values (according to the PresentR bit)
*
* Note: We don't yet have DUP trees, so the checks on duplicate trees are unimplemented. (Nov 1 2007)
*/
#include "brt-internal.h"
#include <assert.h>
/* Check the fingerprint that NODE stores for its own contents.
 * Leaf (height not > 0): delegate to pma_verify_fingerprint, passing the leaf's
 *   buffer, the node's rand4fingerprint and the stored local_fingerprint.
 * Nonleaf: recompute the sum of rand4fingerprint * crc32(command) over every entry
 *   in every child buffer and assert that it equals node->local_fingerprint. */
static void verify_local_fingerprint (BRTNODE node) {
    if (!(node->height>0)) {
        pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint);
        return;
    }

    u_int32_t computed=0;
    int childnum;
    for (childnum=0; childnum<node->u.n.n_children; childnum++) {
        HASHTABLE_ITERATE(node->u.n.htables[childnum], key, keylen, data, datalen, type,
                          ({
                              computed += node->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, data, datalen);
                          }));
    }
    assert(computed==node->local_fingerprint);
}
/* Check that the parent's recorded subtree fingerprint for NODE is right.
 * The expected value is NODE's local fingerprint plus, for a nonleaf node, the
 * subtree fingerprints it records for each of its children.
 * If NODE has a parent, find the parent's child slot whose block name equals
 * node->thisnodename and assert the fingerprint stored there matches.
 * Asserts if the parent is not a nonleaf node or has no slot naming NODE.
 * Does nothing further when NODE has no parent. */
static void verify_parent_fingerprint (BRTNODE node) {
    BRTNODE parent=node->parent_brtnode;
    u_int32_t expected=node->local_fingerprint;
    int childnum;

    if (node->height>0) {
        for (childnum=0; childnum<node->u.n.n_children; childnum++) {
            expected += node->u.n.child_subtree_fingerprints[childnum];
        }
    }

    if (!parent) return;

    assert(parent->height>0);
    for (childnum=0; childnum<parent->u.n.n_children; childnum++) {
        if (parent->u.n.children[childnum]!=node->thisnodename) continue;
        assert(parent->u.n.child_subtree_fingerprints[childnum]==expected);
        return;
    }
    assert(0); // no parent matches
}
/* Verify the node stored at OFF and, if RECURSE is nonzero, its whole subtree.
 *
 * brt            - tree being verified; its cachefile and header nodesize are used to pin the node.
 * off            - block name of the node to verify.
 * lorange/lolen  - exclusive lower bound inherited from the ancestors (0 means unbounded).
 * hirange/hilen  - inclusive upper bound inherited from the ancestors (0 means unbounded).
 * recurse        - nonzero to descend into the children of a nonleaf node.
 * parent_brtnode - in-memory parent of this node (or null_brtnode for the root); it is
 *                  stored into node->parent_brtnode before the fingerprint checks run.
 *
 * Checks performed:
 *   - the node's local fingerprint and the parent's copy of its subtree fingerprint;
 *   - for a nonleaf node, every key buffered for child i lies in the range
 *     (pivot[i-1], pivot[i]], with the inherited bounds standing in at either end;
 *   - for a nonleaf node, every pivot lies in (lorange, hirange].
 * A violated lower bound, pivot bound or fingerprint trips an assert. A buffered key
 * above its upper bound is reported on stdout and makes the function return nonzero.
 *
 * Returns the error from pinning or unpinning the node if either fails; otherwise 0
 * when everything checked out, or a nonzero value accumulated from this node and
 * the recursive calls.
 */
int verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode) {
    int result=0;
    BRTNODE node;
    void *node_v;
    int r;
    if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
                                    brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize)))
        return r;
    //printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
    node=node_v;
    node->parent_brtnode = parent_brtnode;
    verify_local_fingerprint(node);
    verify_parent_fingerprint(node);
    if (node->height>0) {
        int i;
        /* Range-check the commands buffered for every child.  The loop must cover all
         * n_children buffers: the last child has no right-hand pivot, so its upper
         * bound is the one inherited from above. */
        for (i=0; i<node->u.n.n_children; i++) {
            bytevec thislorange,thishirange;
            ITEMLEN thislolen,  thishilen;
            if (i==0) {
                thislorange=lorange;
                thislolen  =lolen;
            } else {
                thislorange=node->u.n.childkeys[i-1];
                thislolen  =node->u.n.childkeylens[i-1];
            }
            if (i+1>=node->u.n.n_children) {
                thishirange=hirange;
                thishilen  =hilen;
            } else {
                thishirange=node->u.n.childkeys[i];
                thishilen  =node->u.n.childkeylens[i];
            }
            {
                /* GCC nested function: it reads this iteration's bounds and sets
                 * `result` in the enclosing frame. */
                void verify_pair (bytevec key, unsigned int keylen,
                                  bytevec data __attribute__((__unused__)),
                                  unsigned int datalen __attribute__((__unused__)),
                                  int type __attribute__((__unused__)),
                                  void *ignore __attribute__((__unused__))) {
                    if (thislorange) assert(keycompare(thislorange,thislolen,key,keylen)<0);
                    if (thishirange && keycompare(key,keylen,thishirange,thishilen)>0) {
                        /* Keys are length-delimited byte strings with no terminating NUL,
                         * so bound each %s by its length. */
                        printf("%s:%d in buffer %d key %.*s is bigger than %.*s\n", __FILE__, __LINE__, i,
                               (int)keylen, (const char*)key, (int)thishilen, (const char*)thishirange);
                        result=1;
                    }
                }
                toku_hashtable_iterate(node->u.n.htables[i], verify_pair, 0);
            }
        }
        /* Check each pivot against the inherited bounds, then descend. */
        for (i=0; i<node->u.n.n_children; i++) {
            if (i>0) {
                if (lorange) assert(keycompare(lorange,lolen, node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1])<0);
                if (hirange) assert(keycompare(node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1], hirange, hilen)<=0);
            }
            if (recurse) {
                result|=verify_brtnode(brt, node->u.n.children[i],
                                       (i==0) ? lorange : node->u.n.childkeys[i-1],
                                       (i==0) ? lolen   : node->u.n.childkeylens[i-1],
                                       (i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i],
                                       (i==node->u.n.n_children-1) ? hilen   : node->u.n.childkeylens[i],
                                       recurse,
                                       node);
            }
        }
    }
    if ((r = cachetable_unpin(brt->cf, off, 0, 0))) return r;
    return result;
}
/* Verify the whole tree BRT.
 * Pins the tree header, looks up the root block through
 * toku_calculate_root_offset_pointer, and runs verify_brtnode on the root with no
 * key bounds, recursion enabled and no parent.  The header is unpinned on every
 * path after a successful pin.
 * Returns 0 on success.  Otherwise returns the first nonzero value produced by
 * pinning the header, verifying the tree, or unpinning the header; when
 * verification fails, the result of the unpin is not examined. */
int verify_brt (BRT brt) {
    int r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    if (r!=0) return r;

    CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
    r = verify_brtnode(brt, *rootp, 0, 0, 0, 0, 1, null_brtnode);
    if (r!=0) {
        toku_unpin_brt_header(brt);
        return r;
    }

    return toku_unpin_brt_header(brt);
}
...@@ -22,11 +22,8 @@ ...@@ -22,11 +22,8 @@
* *
*/ */
#include "brttypes.h"
#include "brt.h"
#include "memory.h"
#include "brt-internal.h" #include "brt-internal.h"
#include "cachetable.h" #include "key.h"
#include <stdlib.h> #include <stdlib.h>
#include <assert.h> #include <assert.h>
...@@ -35,10 +32,11 @@ ...@@ -35,10 +32,11 @@
#include <stdio.h> #include <stdio.h>
#include <errno.h> #include <errno.h>
const BRTNODE null_brtnode=0;
extern long long n_items_malloced; extern long long n_items_malloced;
static DISKOFF malloc_diskblock (BRT brt, int size);
//static void verify_local_fingerprint_nonleaf (BRTNODE node);
/* Frees a node, including all the stuff in the hash table. */ /* Frees a node, including all the stuff in the hash table. */
void brtnode_free (BRTNODE *nodep) { void brtnode_free (BRTNODE *nodep) {
BRTNODE node=*nodep; BRTNODE node=*nodep;
...@@ -102,14 +100,32 @@ void fix_up_parent_pointers_of_children_now_that_parent_is_gone (CACHEFILE cf, B ...@@ -102,14 +100,32 @@ void fix_up_parent_pointers_of_children_now_that_parent_is_gone (CACHEFILE cf, B
} }
} }
/* Refresh NODE's record of the subtree fingerprint of its child number
 * CHILDNUM_OF_NODE, using the in-memory CHILD.
 * The subtree fingerprint is the child's local fingerprint plus, when the child is
 * itself a nonleaf node, the subtree fingerprints it records for its own children.
 * NODE is always marked dirty: callers are expected to invoke this only when they
 * have reason to believe the child's fingerprint changed, so no attempt is made to
 * detect an unchanged value. */
static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE child) {
    u_int32_t subtree_fp = child->local_fingerprint;
    if (child->height>0) {
        int grandchild;
        for (grandchild=0; grandchild<child->u.n.n_children; grandchild++) {
            subtree_fp += child->u.n.child_subtree_fingerprints[grandchild];
        }
    }
    node->u.n.child_subtree_fingerprints[childnum_of_node] = subtree_fp;
    node->dirty = 1;
}
void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnode_v, long size __attribute((unused)), int write_me, int keep_me) { void brtnode_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p __attribute__((__unused__))) {
BRTNODE brtnode = brtnode_v; BRTNODE brtnode = brtnode_v;
// if ((write_me || keep_me) && (brtnode->height==0)) {
// pma_verify_fingerprint(brtnode->u.l.buffer, brtnode->rand4fingerprint, brtnode->subtree_fingerprint);
// }
if (0) { if (0) {
printf("%s:%d brtnode_flush_callback %p keep_me=%d height=%d", __FILE__, __LINE__, brtnode, keep_me, brtnode->height); printf("%s:%d brtnode_flush_callback %p keep_me=%d height=%d", __FILE__, __LINE__, brtnode, keep_me, brtnode->height);
if (brtnode->height==0) printf(" pma=%p", brtnode->u.l.buffer); if (brtnode->height==0) printf(" pma=%p", brtnode->u.l.buffer);
printf("\n"); printf("\n");
} }
if (modified_lsn.lsn > brtnode->lsn.lsn) brtnode->lsn=modified_lsn;
fix_up_parent_pointers_of_children_now_that_parent_is_gone(cachefile, brtnode); fix_up_parent_pointers_of_children_now_that_parent_is_gone(cachefile, brtnode);
assert(brtnode->thisnodename==nodename); assert(brtnode->thisnodename==nodename);
{ {
...@@ -124,7 +140,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod ...@@ -124,7 +140,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod
assert(parent->u.n.n_children<=TREE_FANOUT+1); assert(parent->u.n.n_children<=TREE_FANOUT+1);
for (i=0; i<parent->u.n.n_children; i++) { for (i=0; i<parent->u.n.n_children; i++) {
//printf(" %lld\n", parent->u.n.children[i]); //printf(" %lld\n", parent->u.n.children[i]);
if (parent->u.n.children[i]==nodename) goto ok; if (parent->u.n.children[i]==nodename) {
// Rename the block, informing the parent of the new block
if (rename_p) {
DISKOFF newnodename = malloc_diskblock(brtnode->brt, brtnode->nodesize);
int r=tokulogger_log_block_rename(cachefile_logger(cachefile), cachefile_filenum(cachefile), nodename, newnodename, parent->thisnodename, i);
assert(r!=0); // !!! This error should be handled better (e.g., what if the disk fills up)
// !!! Don't forget to free the old node (sometime after some future checkpoint. TODO!!!)
brtnode->thisnodename=newnodename;
parent->u.n.children[i] = newnodename;
cachetable_rename(cachefile, nodename, newnodename);
}
goto ok;
}
} }
printf("%s:%d Whoops, the parent of %p (%p) isn't right\n", __FILE__, __LINE__, brtnode, parent); printf("%s:%d Whoops, the parent of %p (%p) isn't right\n", __FILE__, __LINE__, brtnode, parent);
assert(0); assert(0);
...@@ -143,18 +171,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod ...@@ -143,18 +171,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod
//printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced); //printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced);
} }
int brtnode_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **brtnode_pv, long *sizep __attribute__((unused)), void*extraargs) { int brtnode_fetch_callback (CACHEFILE cachefile, DISKOFF nodename, void **brtnode_pv, long *sizep __attribute__((unused)), void*extraargs, LSN *written_lsn) {
long nodesize=(long)extraargs; long nodesize=(long)extraargs;
BRTNODE *result=(BRTNODE*)brtnode_pv; BRTNODE *result=(BRTNODE*)brtnode_pv;
int r = deserialize_brtnode_from(cachefile_fd(cachefile), nodename, result, nodesize); int r = deserialize_brtnode_from(cachefile_fd(cachefile), nodename, result, nodesize);
if (r == 0) if (r == 0)
*sizep = brtnode_size(*result); *sizep = brtnode_size(*result);
*written_lsn = (*result)->lsn;
//(*result)->parent_brtnode = 0; /* Don't know it right now. */ //(*result)->parent_brtnode = 0; /* Don't know it right now. */
//printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename); //printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
return r; return r;
} }
void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *header_v, long size __attribute((unused)), int write_me, int keep_me) { void brtheader_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *header_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
struct brt_header *h = header_v; struct brt_header *h = header_v;
assert(nodename==0); assert(nodename==0);
assert(!h->dirty); // shouldn't be dirty once it is unpinned. assert(!h->dirty); // shouldn't be dirty once it is unpinned.
...@@ -174,14 +203,15 @@ void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *head ...@@ -174,14 +203,15 @@ void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *head
} }
} }
int brtheader_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **headerp_v, long *sizep __attribute__((unused)), void*extraargs __attribute__((__unused__))) { int brtheader_fetch_callback (CACHEFILE cachefile, DISKOFF nodename, void **headerp_v, long *sizep __attribute__((unused)), void*extraargs __attribute__((__unused__)), LSN *written_lsn) {
struct brt_header **h = (struct brt_header **)headerp_v; struct brt_header **h = (struct brt_header **)headerp_v;
assert(nodename==0); assert(nodename==0);
int r = deserialize_brtheader_from(cachefile_fd(cachefile), nodename, h); int r = deserialize_brtheader_from(cachefile_fd(cachefile), nodename, h);
written_lsn->lsn = 0; // !!! WRONG. This should be stored or kept redundantly or something.
return r; return r;
} }
int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) { int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) {
void *header_p; void *header_p;
//fprintf(stderr, "%s:%d read_and_pin_brt_header(...)\n", __FILE__, __LINE__); //fprintf(stderr, "%s:%d read_and_pin_brt_header(...)\n", __FILE__, __LINE__);
int r = cachetable_get_and_pin(cf, 0, &header_p, NULL, int r = cachetable_get_and_pin(cf, 0, &header_p, NULL,
...@@ -191,7 +221,7 @@ int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) { ...@@ -191,7 +221,7 @@ int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) {
return 0; return 0;
} }
int unpin_brt_header (BRT brt) { int toku_unpin_brt_header (BRT brt) {
int r = cachetable_unpin(brt->cf, 0, brt->h->dirty, 0); int r = cachetable_unpin(brt->cf, 0, brt->h->dirty, 0);
brt->h->dirty=0; brt->h->dirty=0;
brt->h=0; brt->h=0;
...@@ -215,18 +245,18 @@ int kvpair_compare (const void *av, const void *bv) { ...@@ -215,18 +245,18 @@ int kvpair_compare (const void *av, const void *bv) {
} }
/* Forgot to handle the case where there is something in the freelist. */ /* Forgot to handle the case where there is something in the freelist. */
diskoff malloc_diskblock_header_is_in_memory (BRT brt, int size) { static DISKOFF malloc_diskblock_header_is_in_memory (BRT brt, int size) {
diskoff result = brt->h->unused_memory; DISKOFF result = brt->h->unused_memory;
brt->h->unused_memory+=size; brt->h->unused_memory+=size;
return result; return result;
} }
diskoff malloc_diskblock (BRT brt, int size) { DISKOFF malloc_diskblock (BRT brt, int size) {
#if 0 #if 0
int r = read_and_pin_brt_header(brt->fd, &brt->h); int r = read_and_pin_brt_header(brt->fd, &brt->h);
assert(r==0); assert(r==0);
{ {
diskoff result = malloc_diskblock_header_is_in_memory(brt, size); DISKOFF result = malloc_diskblock_header_is_in_memory(brt, size);
r = write_brt_header(brt->fd, &brt->h); r = write_brt_header(brt->fd, &brt->h);
assert(r==0); assert(r==0);
return result; return result;
...@@ -236,26 +266,32 @@ diskoff malloc_diskblock (BRT brt, int size) { ...@@ -236,26 +266,32 @@ diskoff malloc_diskblock (BRT brt, int size) {
#endif #endif
} }
static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height) { static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) {
int i; int i;
n->tag = TYP_BRTNODE; n->tag = TYP_BRTNODE;
n->brt = t;
n->nodesize = t->h->nodesize; n->nodesize = t->h->nodesize;
n->thisnodename = nodename; n->thisnodename = nodename;
n->lsn.lsn = 0; // a new one can always be 0.
n->layout_version = 0;
n->height = height; n->height = height;
n->rand4fingerprint = random();
n->local_fingerprint = 0;
brtnode_set_dirty(n); brtnode_set_dirty(n);
assert(height>=0); assert(height>=0);
if (height>0) { if (height>0) {
n->u.n.n_children = 0; n->u.n.n_children = 0;
for (i=0; i<TREE_FANOUT; i++) { for (i=0; i<TREE_FANOUT; i++) {
n->u.n.childkeys[i] = 0; // n->u.n.childkeys[i] = 0;
n->u.n.childkeylens[i] = 0; // n->u.n.childkeylens[i] = 0;
} }
n->u.n.totalchildkeylens = 0; n->u.n.totalchildkeylens = 0;
for (i=0; i<TREE_FANOUT+1; i++) { for (i=0; i<TREE_FANOUT+1; i++) {
n->u.n.children[i] = 0; n->u.n.child_subtree_fingerprints[i] = 0;
n->u.n.htables[i] = 0; // n->u.n.children[i] = 0;
// n->u.n.htables[i] = 0;
n->u.n.n_bytes_in_hashtable[i] = 0; n->u.n.n_bytes_in_hashtable[i] = 0;
n->u.n.n_cursors[i] = 0; n->u.n.n_cursors[i] = 0; // This one is simpler to initialize properly
} }
n->u.n.n_bytes_in_hashtables = 0; n->u.n.n_bytes_in_hashtables = 0;
} else { } else {
...@@ -271,7 +307,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height) ...@@ -271,7 +307,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height)
static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE parent_brtnode) { static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE parent_brtnode) {
TAGMALLOC(BRTNODE, n); TAGMALLOC(BRTNODE, n);
int r; int r;
diskoff name = malloc_diskblock(t, t->h->nodesize); DISKOFF name = malloc_diskblock(t, t->h->nodesize);
assert(n); assert(n);
assert(t->h->nodesize>0); assert(t->h->nodesize>0);
//printf("%s:%d malloced %lld (and malloc again=%lld)\n", __FILE__, __LINE__, name, malloc_diskblock(t, t->nodesize)); //printf("%s:%d malloced %lld (and malloc again=%lld)\n", __FILE__, __LINE__, name, malloc_diskblock(t, t->nodesize));
...@@ -279,6 +315,7 @@ static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE pare ...@@ -279,6 +315,7 @@ static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE pare
*result = n; *result = n;
assert(n->nodesize>0); assert(n->nodesize>0);
n->parent_brtnode = parent_brtnode; n->parent_brtnode = parent_brtnode;
n->brt = t;
//printf("%s:%d putting %p (%lld) parent=%p\n", __FILE__, __LINE__, n, n->thisnodename, parent_brtnode); //printf("%s:%d putting %p (%lld) parent=%p\n", __FILE__, __LINE__, n, n->thisnodename, parent_brtnode);
r=cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n), r=cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n),
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize); brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize);
...@@ -324,6 +361,7 @@ static int insert_to_hash_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v ...@@ -324,6 +361,7 @@ static int insert_to_hash_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v
unsigned int n_bytes_added = BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + k->size + v->size; unsigned int n_bytes_added = BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + k->size + v->size;
int r = toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type); int r = toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type);
if (r!=0) return r; if (r!=0) return r;
node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_cmd(type, k->data, k->size, v->data, v->size);
node->u.n.n_bytes_in_hashtable[childnum] += n_bytes_added; node->u.n.n_bytes_in_hashtable[childnum] += n_bytes_added;
node->u.n.n_bytes_in_hashtables += n_bytes_added; node->u.n.n_bytes_in_hashtables += n_bytes_added;
brtnode_set_dirty(node); brtnode_set_dirty(node);
...@@ -350,8 +388,8 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -350,8 +388,8 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
int r; int r;
r = pma_split(node->u.l.buffer, &node->u.l.n_bytes_in_buffer, r = pma_split(node->u.l.buffer, &node->u.l.n_bytes_in_buffer,
A->u.l.buffer, &A->u.l.n_bytes_in_buffer, A->u.l.buffer, &A->u.l.n_bytes_in_buffer, A->rand4fingerprint, &A->local_fingerprint,
B->u.l.buffer, &B->u.l.n_bytes_in_buffer); B->u.l.buffer, &B->u.l.n_bytes_in_buffer, B->rand4fingerprint, &B->local_fingerprint);
assert(r == 0); assert(r == 0);
r = pma_get_last(A->u.l.buffer, splitk, 0); r = pma_get_last(A->u.l.buffer, splitk, 0);
...@@ -393,6 +431,14 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -393,6 +431,14 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
return 0; return 0;
} }
static void brt_update_fingerprint_when_moving_hashtable (BRTNODE oldnode, BRTNODE newnode, HASHTABLE table_being_moved) {
u_int32_t sum = 0;
HASHTABLE_ITERATE(table_being_moved, key, keylen, data, datalen, type,
sum += toku_calccrc32_cmd(type, key, keylen, data, datalen));
oldnode->local_fingerprint -= oldnode->rand4fingerprint * sum;
newnode->local_fingerprint += newnode->rand4fingerprint * sum;
}
/* Side effect: sets splitk->data pointer to a malloc'd value */ /* Side effect: sets splitk->data pointer to a malloc'd value */
void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) { void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) {
int n_children_in_a = node->u.n.n_children/2; int n_children_in_a = node->u.n.n_children/2;
...@@ -412,23 +458,31 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT ...@@ -412,23 +458,31 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT
* The splitter key is key number n_children_in_a */ * The splitter key is key number n_children_in_a */
int i; int i;
for (i=0; i<n_children_in_a; i++) { for (i=0; i<n_children_in_a; i++) {
HASHTABLE htab = node->u.n.htables[i];
A->u.n.children[i] = node->u.n.children[i]; A->u.n.children[i] = node->u.n.children[i];
A->u.n.htables[i] = node->u.n.htables[i]; A->u.n.htables[i] = htab;
A->u.n.n_bytes_in_hashtables += (A->u.n.n_bytes_in_hashtable[i] = node->u.n.n_bytes_in_hashtable[i]); A->u.n.n_bytes_in_hashtables += (A->u.n.n_bytes_in_hashtable[i] = node->u.n.n_bytes_in_hashtable[i]);
A->u.n.child_subtree_fingerprints[i] = node->u.n.child_subtree_fingerprints[i];
node->u.n.htables[i] = 0; node->u.n.htables[i] = 0;
node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i]; node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i];
node->u.n.n_bytes_in_hashtable[i] = 0; node->u.n.n_bytes_in_hashtable[i] = 0;
brt_update_fingerprint_when_moving_hashtable(node, A, htab);
} }
for (i=n_children_in_a; i<node->u.n.n_children; i++) { for (i=n_children_in_a; i<node->u.n.n_children; i++) {
int targchild = i-n_children_in_a; int targchild = i-n_children_in_a;
HASHTABLE htab = node->u.n.htables[i];
B->u.n.children[targchild] = node->u.n.children[i]; B->u.n.children[targchild] = node->u.n.children[i];
B->u.n.htables[targchild] = node->u.n.htables[i]; B->u.n.htables[targchild] = htab;
B->u.n.n_bytes_in_hashtables += (B->u.n.n_bytes_in_hashtable[targchild] = node->u.n.n_bytes_in_hashtable[i]); B->u.n.n_bytes_in_hashtables += (B->u.n.n_bytes_in_hashtable[targchild] = node->u.n.n_bytes_in_hashtable[i]);
B->u.n.child_subtree_fingerprints[targchild] = node->u.n.child_subtree_fingerprints[i];
node->u.n.htables[i] = 0; node->u.n.htables[i] = 0;
node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i]; node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i];
node->u.n.n_bytes_in_hashtable[i] = 0; node->u.n.n_bytes_in_hashtable[i] = 0;
brt_update_fingerprint_when_moving_hashtable(node, B, htab);
} }
for (i=0; i<n_children_in_a-1; i++) { for (i=0; i<n_children_in_a-1; i++) {
A->u.n.childkeys[i] = node->u.n.childkeys[i]; A->u.n.childkeys[i] = node->u.n.childkeys[i];
...@@ -456,6 +510,8 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT ...@@ -456,6 +510,8 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT
fix_up_parent_pointers_of_children(t, A); fix_up_parent_pointers_of_children(t, A);
fix_up_parent_pointers_of_children(t, B); fix_up_parent_pointers_of_children(t, B);
//verify_local_fingerprint_nonleaf(A);
//verify_local_fingerprint_nonleaf(B);
} }
{ {
...@@ -523,22 +579,23 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT ...@@ -523,22 +579,23 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT
printf("\n"); printf("\n");
} }
} }
int r;
if (to_child) { if (to_child) {
int again_split=-1; BRTNODE againa,againb; int again_split=-1; BRTNODE againa,againb;
DBT againk; DBT againk;
init_dbt(&againk); init_dbt(&againk);
//printf("%s:%d hello!\n", __FILE__, __LINE__); //printf("%s:%d hello!\n", __FILE__, __LINE__);
int r = brtnode_put_cmd(t, child, cmd, r = brtnode_put_cmd(t, child, cmd,
&again_split, &againa, &againb, &againk, &again_split, &againa, &againb, &againk,
0, 0,
txn); txn);
if (r!=0) return r; if (r!=0) return r;
assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */ assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
return r;
} else { } else {
int r=insert_to_hash_in_nonleaf(node, childnum_of_node, k, v, cmd->type); r=insert_to_hash_in_nonleaf(node, childnum_of_node, k, v, cmd->type);
return r;
} }
fixup_child_fingerprint(node, childnum_of_node, child);
return r;
} }
static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum, static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum,
...@@ -549,7 +606,6 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum ...@@ -549,7 +606,6 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, ""); //if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__); //printf("%s:%d hello!\n", __FILE__, __LINE__);
assert(node->height>0); assert(node->height>0);
{ {
int r = brtnode_put_cmd(t, child, cmd, int r = brtnode_put_cmd(t, child, cmd,
child_did_split, childa, childb, childsplitk, child_did_split, childa, childb, childsplitk,
...@@ -561,6 +617,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum ...@@ -561,6 +617,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
DBT *k = cmd->u.id.key; DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val; DBT *v = cmd->u.id.val;
//if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split); //if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split);
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_cmdstruct(cmd);
{ {
int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size); // Must delete after doing the insert, to avoid operating on freed' key int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size); // Must delete after doing the insert, to avoid operating on freed' key
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r); //printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
...@@ -572,7 +629,12 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum ...@@ -572,7 +629,12 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
node->u.n.n_bytes_in_hashtable[childnum] -= n_bytes_removed; node->u.n.n_bytes_in_hashtable[childnum] -= n_bytes_removed;
brtnode_set_dirty(node); brtnode_set_dirty(node);
} }
if (*child_did_split) {
fixup_child_fingerprint(node, childnum, *childa);
fixup_child_fingerprint(node, childnum+1, *childb);
} else {
fixup_child_fingerprint(node, childnum, child);
}
return 0; return 0;
} }
...@@ -611,19 +673,29 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -611,19 +673,29 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
brtnode_set_dirty(node); brtnode_set_dirty(node);
//verify_local_fingerprint_nonleaf(node);
// Slide the children over. // Slide the children over.
for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) { for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) {
node->u.n.children[cnum] = node->u.n.children[cnum-1]; node->u.n.children[cnum] = node->u.n.children[cnum-1];
node->u.n.htables[cnum] = node->u.n.htables[cnum-1]; node->u.n.htables[cnum] = node->u.n.htables[cnum-1];
node->u.n.child_subtree_fingerprints[cnum] = node->u.n.child_subtree_fingerprints[cnum-1];
node->u.n.n_bytes_in_hashtable[cnum] = node->u.n.n_bytes_in_hashtable[cnum-1]; node->u.n.n_bytes_in_hashtable[cnum] = node->u.n.n_bytes_in_hashtable[cnum-1];
node->u.n.n_cursors[cnum] = node->u.n.n_cursors[cnum-1]; node->u.n.n_cursors[cnum] = node->u.n.n_cursors[cnum-1];
} }
node->u.n.children[childnum] = childa->thisnodename; node->u.n.children[childnum] = childa->thisnodename;
node->u.n.children[childnum+1] = childb->thisnodename; node->u.n.children[childnum+1] = childb->thisnodename;
fixup_child_fingerprint(node, childnum, childa);
fixup_child_fingerprint(node, childnum+1, childb);
toku_hashtable_create(&node->u.n.htables[childnum]); toku_hashtable_create(&node->u.n.htables[childnum]);
toku_hashtable_create(&node->u.n.htables[childnum+1]); toku_hashtable_create(&node->u.n.htables[childnum+1]);
node->u.n.n_bytes_in_hashtable[childnum] = 0; node->u.n.n_bytes_in_hashtable[childnum] = 0;
node->u.n.n_bytes_in_hashtable[childnum+1] = 0; node->u.n.n_bytes_in_hashtable[childnum+1] = 0;
// Remove all the cmds from the local fingerprint. Some may get added in again when we try to push to the child.
HASHTABLE_ITERATE(old_h, skey, skeylen, sval, svallen, type,
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_cmd(type, skey, skeylen, sval, svallen));
// Slide the keys over // Slide the keys over
for (cnum=node->u.n.n_children-1; cnum>childnum; cnum--) { for (cnum=node->u.n.n_children-1; cnum>childnum; cnum--) {
node->u.n.childkeys[cnum] = node->u.n.childkeys[cnum-1]; node->u.n.childkeys[cnum] = node->u.n.childkeys[cnum-1];
...@@ -651,26 +723,31 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -651,26 +723,31 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
fill_dbt(&svd, sval, svallen); fill_dbt(&svd, sval, svallen);
BRT_CMD brtcmd; BRT_CMD brtcmd;
brtcmd.type = type; brtcmd.u.id.key = &skd; brtcmd.u.id.val = &svd; brtcmd.u.id.db = db; brtcmd.type = type; brtcmd.u.id.key = &skd; brtcmd.u.id.val = &svd; brtcmd.u.id.db = db;
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if (t->compare_fun(db, &skd, childsplitk)<=0) { if (t->compare_fun(db, &skd, childsplitk)<=0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, txn); r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, txn);
} else { } else {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, txn); r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, txn);
} }
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if (r!=0) return r; if (r!=0) return r;
})); }));
toku_hashtable_free(&old_h); toku_hashtable_free(&old_h);
r=cachetable_unpin(t->cf, childa->thisnodename, childa->dirty, brtnode_size(childa)); //verify_local_fingerprint_nonleaf(childa);
assert(r==0); //verify_local_fingerprint_nonleaf(childb);
r=cachetable_unpin(t->cf, childb->thisnodename, childb->dirty, brtnode_size(childb)); //verify_local_fingerprint_nonleaf(node);
assert(r==0);
verify_counts(node); verify_counts(node);
verify_counts(childa); verify_counts(childa);
verify_counts(childb); verify_counts(childb);
r=cachetable_unpin(t->cf, childa->thisnodename, childa->dirty, brtnode_size(childa));
assert(r==0);
r=cachetable_unpin(t->cf, childb->thisnodename, childb->dirty, brtnode_size(childb));
assert(r==0);
if (node->u.n.n_children>TREE_FANOUT) { if (node->u.n.n_children>TREE_FANOUT) {
//printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs); //printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs);
brt_nonleaf_split(t, node, nodea, nodeb, splitk); brt_nonleaf_split(t, node, nodea, nodeb, splitk);
...@@ -685,6 +762,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -685,6 +762,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0); assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0);
assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
} else { } else {
*did_split=0; *did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize); assert(serialize_brtnode_size(node)<=node->nodesize);
...@@ -703,7 +782,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -703,7 +782,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
BRTNODE child; BRTNODE child;
int r; int r;
assert(node->height>0); assert(node->height>0);
diskoff targetchild = node->u.n.children[childnum]; DISKOFF targetchild = node->u.n.children[childnum];
assert(targetchild>=0 && targetchild<t->h->unused_memory); // This assertion could fail in a concurrent setting since another process might have bumped unused memory. assert(targetchild>=0 && targetchild<t->h->unused_memory); // This assertion could fail in a concurrent setting since another process might have bumped unused memory.
r = cachetable_get_and_pin(t->cf, targetchild, &childnode_v, NULL, r = cachetable_get_and_pin(t->cf, targetchild, &childnode_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize); brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize);
...@@ -711,6 +790,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -711,6 +790,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
//printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v); //printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v);
child=childnode_v; child=childnode_v;
child->parent_brtnode = node; child->parent_brtnode = node;
//verify_local_fingerprint_nonleaf(child);
verify_counts(child); verify_counts(child);
//printf("%s:%d height=%d n_bytes_in_hashtable = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_hashtable[0], child->n_bytes_in_hashtable[1], child->n_bytes_in_hashtable[2]); //printf("%s:%d height=%d n_bytes_in_hashtable = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_hashtable[0], child->n_bytes_in_hashtable[1], child->n_bytes_in_hashtable[2]);
if (child->height>0 && child->u.n.n_children>0) assert(child->u.n.children[child->u.n.n_children-1]!=0); if (child->height>0 && child->u.n.n_children>0) assert(child->u.n.children[child->u.n.n_children-1]!=0);
...@@ -746,8 +826,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -746,8 +826,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
//printf("%s:%d random_picked\n", __FILE__, __LINE__); //printf("%s:%d random_picked\n", __FILE__, __LINE__);
init_dbt(&childsplitk); init_dbt(&childsplitk);
childsplitk.app_private = splitk->app_private; dbt_set_app_private(&childsplitk, dbt_get_app_private(splitk));
if (debug) printf("%s:%d %*spush down %s\n", __FILE__, __LINE__, debug, "", (char*)key); if (debug) printf("%s:%d %*spush down %s\n", __FILE__, __LINE__, debug, "", (char*)key);
r = push_a_brt_cmd_down (t, node, child, childnum, r = push_a_brt_cmd_down (t, node, child, childnum,
&brtcmd, &brtcmd,
...@@ -757,7 +836,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -757,7 +836,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
if (0){ if (0){
unsigned int sum=0; unsigned int sum=0;
HASHTABLE_ITERATE(node->u.n.htables[childnum], hk __attribute__((__unused__)), hkl, hd __attribute__((__unused__)), hdl, type __attribute__((__unused__)), HASHTABLE_ITERATE(node->u.n.htables[childnum], subhk __attribute__((__unused__)), hkl, hd __attribute__((__unused__)), hdl, subtype __attribute__((__unused__)),
sum+=hkl+hdl+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD); sum+=hkl+hdl+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD);
printf("%s:%d sum=%d\n", __FILE__, __LINE__, sum); printf("%s:%d sum=%d\n", __FILE__, __LINE__, sum);
assert(sum==node->u.n.n_bytes_in_hashtable[childnum]); assert(sum==node->u.n.n_bytes_in_hashtable[childnum]);
...@@ -772,16 +851,21 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum, ...@@ -772,16 +851,21 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
childa, childb, &childsplitk, childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
app_private, db, txn); app_private, db, txn);
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//}
return r; /* Don't do any more pushing if the child splits. */ return r; /* Don't do any more pushing if the child splits. */
} }
} }
if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__); if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__);
} }
if (debug) printf("%s:%d %*sdone push_some_brt_cmds_down, unpinning %lld\n", __FILE__, __LINE__, debug, "", targetchild); if (debug) printf("%s:%d %*sdone push_some_brt_cmds_down, unpinning %lld\n", __FILE__, __LINE__, debug, "", targetchild);
assert(serialize_brtnode_size(node)<=node->nodesize);
//verify_local_fingerprint_nonleaf(node);
r=cachetable_unpin(t->cf, targetchild, child->dirty, brtnode_size(child)); r=cachetable_unpin(t->cf, targetchild, child->dirty, brtnode_size(child));
if (r!=0) return r; if (r!=0) return r;
*did_split=0; *did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize);
return 0; return 0;
} }
...@@ -816,6 +900,8 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE ...@@ -816,6 +900,8 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
assert((*nodeb)->u.n.n_children>0); assert((*nodeb)->u.n.n_children>0);
assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0); assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0);
assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0); assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0);
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
} else { } else {
assert(serialize_brtnode_size(node)<=node->nodesize); assert(serialize_brtnode_size(node)<=node->nodesize);
} }
...@@ -824,6 +910,12 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE ...@@ -824,6 +910,12 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
*did_split=0; *did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize); assert(serialize_brtnode_size(node)<=node->nodesize);
} }
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return 0; return 0;
} }
...@@ -833,13 +925,14 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -833,13 +925,14 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, int debug,
TOKUTXN txn) { TOKUTXN txn) {
// pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
if (cmd->type == BRT_INSERT) { if (cmd->type == BRT_INSERT) {
DBT *k = cmd->u.id.key; DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val; DBT *v = cmd->u.id.val;
DB *db = cmd->u.id.db; DB *db = cmd->u.id.db;
#ifdef INSERT_ALL_AT_ONCE #ifdef INSERT_ALL_AT_ONCE
int replaced_v_size; int replaced_v_size;
enum pma_errors pma_status = pma_insert_or_replace(node->u.l.buffer, k, v, &replaced_v_size, db, txn, node->thisnodename); enum pma_errors pma_status = pma_insert_or_replace(node->u.l.buffer, k, v, &replaced_v_size, db, txn, node->thisnodename, node->rand4fingerprint, &node->local_fingerprint);
assert(pma_status==BRT_OK); assert(pma_status==BRT_OK);
//printf("replaced_v_size=%d\n", replaced_v_size); //printf("replaced_v_size=%d\n", replaced_v_size);
if (replaced_v_size>=0) { if (replaced_v_size>=0) {
...@@ -859,9 +952,12 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -859,9 +952,12 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
node->u.l.n_bytes_in_buffer += k->size + v->size + KEY_VALUE_OVERHEAD; node->u.l.n_bytes_in_buffer += k->size + v->size + KEY_VALUE_OVERHEAD;
#endif #endif
brtnode_set_dirty(node); brtnode_set_dirty(node);
// pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
// If it doesn't fit, then split the leaf. // If it doesn't fit, then split the leaf.
if (serialize_brtnode_size(node) > node->nodesize) { if (serialize_brtnode_size(node) > node->nodesize) {
int r = brtleaf_split (t, node, nodea, nodeb, splitk, k->app_private, db); int r = brtleaf_split (t, node, nodea, nodeb, splitk, dbt_get_app_private(k), db);
if (r!=0) return r; if (r!=0) return r;
//printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey); //printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey);
split_count++; split_count++;
...@@ -870,6 +966,8 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -870,6 +966,8 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize); if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize);
assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize); assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize); assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
// pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint);
// pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint);
} else { } else {
*did_split = 0; *did_split = 0;
} }
...@@ -884,13 +982,13 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -884,13 +982,13 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
init_dbt(&val); init_dbt(&val);
r = pma_lookup(node->u.l.buffer, cmd->u.id.key, &val, cmd->u.id.db); r = pma_lookup(node->u.l.buffer, cmd->u.id.key, &val, cmd->u.id.db);
if (r == 0) { if (r == 0) {
r = pma_delete(node->u.l.buffer, cmd->u.id.key, cmd->u.id.db); r = pma_delete(node->u.l.buffer, cmd->u.id.key, cmd->u.id.db, node->rand4fingerprint, &node->local_fingerprint);
assert(r == BRT_OK); assert(r == BRT_OK);
node->u.l.n_bytes_in_buffer -= cmd->u.id.key->size + val.size + KEY_VALUE_OVERHEAD; node->u.l.n_bytes_in_buffer -= cmd->u.id.key->size + val.size + KEY_VALUE_OVERHEAD;
brtnode_set_dirty(node); brtnode_set_dirty(node);
} }
*did_split = 0; *did_split = 0;
return r; return BRT_OK;
} }
/* unknown message */ /* unknown message */
...@@ -950,9 +1048,11 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -950,9 +1048,11 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD *cmd,
r = handle_split_of_child(t, node, childnum, r = handle_split_of_child(t, node, childnum,
childa, childb, &childsplitk, childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
k->app_private, db, txn); dbt_get_app_private(k), db, txn);
assert(r == 0); assert(r == 0);
} else { } else {
//verify_local_fingerprint_nonleaf(child);
fixup_child_fingerprint(node, childnum, child);
int rr = cachetable_unpin(t->cf, child->thisnodename, child->dirty, brtnode_size(child)); int rr = cachetable_unpin(t->cf, child->thisnodename, child->dirty, brtnode_size(child));
assert(rr == 0); assert(rr == 0);
} }
...@@ -966,8 +1066,7 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -966,8 +1066,7 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
DBT *splitk, DBT *splitk,
int debug, int debug,
TOKUTXN txn) { TOKUTXN txn) {
bytevec olddata; //verify_local_fingerprint_nonleaf(node);
ITEMLEN olddatalen;
unsigned int childnum; unsigned int childnum;
int found; int found;
...@@ -982,16 +1081,30 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -982,16 +1081,30 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
if (node->u.n.n_cursors[childnum] > 0) { if (node->u.n.n_cursors[childnum] > 0) {
assert(node->u.n.n_bytes_in_hashtable[childnum] == 0); assert(node->u.n.n_bytes_in_hashtable[childnum] == 0);
int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 0); int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 0);
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return r; return r;
} }
//verify_local_fingerprint_nonleaf(node);
{
int anytype;
bytevec olddata;
ITEMLEN olddatalen;
found = !toku_hash_find(node->u.n.htables[childnum], k->data, k->size, &olddata, &olddatalen, &anytype);
found = !toku_hash_find(node->u.n.htables[childnum], k->data, k->size, &olddata, &olddatalen, &type); //verify_local_fingerprint_nonleaf(node);
if (debug) printf("%s:%d %*sDoing hash_insert\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDoing hash_insert\n", __FILE__, __LINE__, debug, "");
verify_counts(node); verify_counts(node);
if (found) { if (found) {
//printf("%s:%d found and deleting\n", __FILE__, __LINE__);
node->local_fingerprint -= node->rand4fingerprint * toku_calccrc32_cmd(anytype, k->data, k->size, olddata, olddatalen);
int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size); int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size);
/* Be careful, olddata is now invalid because of the delete. */
int diff = k->size + olddatalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD; int diff = k->size + olddatalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
assert(r==0); assert(r==0);
node->u.n.n_bytes_in_hashtables -= diff; node->u.n.n_bytes_in_hashtables -= diff;
...@@ -999,24 +1112,30 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -999,24 +1112,30 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
brtnode_set_dirty(node); brtnode_set_dirty(node);
//printf("%s:%d deleted %d bytes\n", __FILE__, __LINE__, diff); //printf("%s:%d deleted %d bytes\n", __FILE__, __LINE__, diff);
} }
}
//verify_local_fingerprint_nonleaf(node);
/* if the child is in the cache table then push the cmd to it /* if the child is in the cache table then push the cmd to it
otherwise just put it into this node's buffer */ otherwise just put it into this node's buffer */
if (brt_do_push_cmd) { if (brt_do_push_cmd) {
int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 1); int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 1);
if (r == 0) if (r == 0) {
//printf("%s:%d\n", __FILE__, __LINE__);
return r; return r;
} }
}
//verify_local_fingerprint_nonleaf(node);
{ {
int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD; int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
int r=toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type); int r=toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type);
assert(r==0); assert(r==0);
node->local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(type, k->data, k->size, v->data, v->size);
node->u.n.n_bytes_in_hashtables += diff; node->u.n.n_bytes_in_hashtables += diff;
node->u.n.n_bytes_in_hashtable[childnum] += diff; node->u.n.n_bytes_in_hashtable[childnum] += diff;
brtnode_set_dirty(node); brtnode_set_dirty(node);
} }
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
int r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), k->app_private, db, txn); //verify_local_fingerprint_nonleaf(node);
int r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), dbt_get_app_private(k), db, txn);
if (r!=0) return r; if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, ""); if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) { if (*did_split) {
...@@ -1032,39 +1151,80 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, ...@@ -1032,39 +1151,80 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
assert(serialize_brtnode_size(node)<=node->nodesize); assert(serialize_brtnode_size(node)<=node->nodesize);
verify_counts(node); verify_counts(node);
} }
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return 0; return 0;
} }
//static void verify_local_fingerprint_nonleaf (BRTNODE node) {
// u_int32_t fp=0;
// int i;
// if (node->height==0) return;
// for (i=0; i<node->u.n.n_children; i++)
// HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
// ({
// fp += node->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, data, datalen);
// }));
// assert(fp==node->local_fingerprint);
//}
static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd, static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk, int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug, int debug,
TOKUTXN txn) { TOKUTXN txn) {
//static int counter=0; // FOO
//static int oldcounter=0;
//int tmpcounter;
//u_int32_t oldfingerprint=node->local_fingerprint;
int r;
//counter++; tmpcounter=counter;
if (node->height==0) { if (node->height==0) {
return brt_leaf_put_cmd(t, node, cmd, // pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
r = brt_leaf_put_cmd(t, node, cmd,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
debug, txn); debug, txn);
} else { } else {
return brt_nonleaf_put_cmd(t, node, cmd, r = brt_nonleaf_put_cmd(t, node, cmd,
did_split, nodea, nodeb, splitk, did_split, nodea, nodeb, splitk,
debug, txn); debug, txn);
} }
//oldcounter=tmpcounter;
// Watch out. If did_split then the original node is no longer allocated.
if (*did_split) {
assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
// if ((*nodea)->height==0) {
// pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint);
// pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint);
// }
} else {
assert(serialize_brtnode_size(node)<=node->nodesize);
// if (node->height==0) {
// pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint);
// } else {
// verify_local_fingerprint_nonleaf(node);
// }
}
//if (node->local_fingerprint==3522421844U) {
// if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
// }
return r;
} }
int brt_create_cachetable_size(CACHETABLE *ct, int hashsize, long cachesize) { int brt_create_cachetable(CACHETABLE *ct, long cachesize, LSN initial_lsn, TOKULOGGER logger) {
return create_cachetable(ct, hashsize, cachesize); if (cachesize == 0)
} cachesize = 128*1024*1024;
return create_cachetable(ct, cachesize, initial_lsn, logger);
//enum {n_nodes_in_cache =64};
enum {n_nodes_in_cache =127};
int brt_create_cachetable (CACHETABLE *ct, int cachelines) {
if (cachelines==0) cachelines=n_nodes_in_cache;
assert(cachelines>0);
return brt_create_cachetable_size(ct, cachelines, (cachelines+1)*1024*1024);
} }
static int setup_brt_root_node (BRT t, diskoff offset) { static int setup_brt_root_node (BRT t, DISKOFF offset) {
int r; int r;
TAGMALLOC(BRTNODE, node); TAGMALLOC(BRTNODE, node);
assert(node); assert(node);
...@@ -1073,6 +1233,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) { ...@@ -1073,6 +1233,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
offset, /* the location is one nodesize offset from 0. */ offset, /* the location is one nodesize offset from 0. */
0); 0);
node->parent_brtnode=0; node->parent_brtnode=0;
node->brt = t;
if (0) { if (0) {
printf("%s:%d for tree %p node %p mdict_create--> %p\n", __FILE__, __LINE__, t, node, node->u.l.buffer); printf("%s:%d for tree %p node %p mdict_create--> %p\n", __FILE__, __LINE__, t, node, node->u.l.buffer);
printf("%s:%d put root at %lld\n", __FILE__, __LINE__, offset); printf("%s:%d put root at %lld\n", __FILE__, __LINE__, offset);
...@@ -1086,6 +1247,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) { ...@@ -1086,6 +1247,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
} }
//printf("%s:%d created %lld\n", __FILE__, __LINE__, node->thisnodename); //printf("%s:%d created %lld\n", __FILE__, __LINE__, node->thisnodename);
verify_counts(node); verify_counts(node);
// verify_local_fingerprint_nonleaf(node);
r=cachetable_unpin(t->cf, node->thisnodename, node->dirty, brtnode_size(node)); r=cachetable_unpin(t->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (r!=0) { if (r!=0) {
toku_free(node); toku_free(node);
...@@ -1101,23 +1263,49 @@ static int setup_brt_root_node (BRT t, diskoff offset) { ...@@ -1101,23 +1263,49 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
#define WHEN_BRTTRACE(x) ((void)0) #define WHEN_BRTTRACE(x) ((void)0)
#endif #endif
int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable, int brt_create(BRT *brt_ptr) {
int (*compare_fun)(DB*,const DBT*,const DBT*)) { BRT brt = toku_malloc(sizeof *brt);
if (brt == 0)
return ENOMEM;
memset(brt, 0, sizeof *brt);
brt->flags = 0;
brt->nodesize = BRT_DEFAULT_NODE_SIZE;
brt->compare_fun = default_compare_fun;
brt->dup_compare = default_compare_fun;
*brt_ptr = brt;
return 0;
}
/*
 * Store the caller-supplied flag word in the BRT handle.
 *
 *   brt   - handle whose `flags` field is overwritten
 *   flags - new flag word; stored as-is, no validation is performed here
 *
 * Always returns 0.
 */
int brt_set_flags(BRT brt, int flags)
{
    /* Nothing in this setter can fail, so the status is fixed up front. */
    const int status = 0;

    brt->flags = flags;
    return status;
}
/*
 * Store the requested node size in the BRT handle.
 *
 *   brt      - handle whose `nodesize` field is overwritten
 *   nodesize - new node size; stored as-is, no range check is performed here
 *
 * Always returns 0.
 */
int brt_set_nodesize(BRT brt, int nodesize)
{
    /* Nothing in this setter can fail, so the status is fixed up front. */
    const int status = 0;

    brt->nodesize = nodesize;
    return status;
}
/*
 * Install the key comparison callback on the BRT handle.
 *
 *   brt        - handle whose `compare_fun` field is overwritten
 *   bt_compare - callback taking (DB *, const DBT *, const DBT *) and
 *                returning int; the pointer is stored without a NULL check
 *
 * Always returns 0.
 */
int brt_set_bt_compare(BRT brt, int (*bt_compare)(DB *, const DBT*, const DBT*))
{
    /* Nothing in this setter can fail, so the status is fixed up front. */
    const int status = 0;

    brt->compare_fun = bt_compare;
    return status;
}
/*
 * Install the duplicate comparison callback on the BRT handle.
 *
 *   brt         - handle whose `dup_compare` field is overwritten
 *   dup_compare - callback taking (DB *, const DBT *, const DBT *) and
 *                 returning int; the pointer is stored without a NULL check
 *
 * Always returns 0.
 */
int brt_set_dup_compare(BRT brt, int (*dup_compare)(DB *, const DBT*, const DBT*))
{
    /* Nothing in this setter can fail, so the status is fixed up front. */
    const int status = 0;

    brt->dup_compare = dup_compare;
    return status;
}
int brt_open(BRT t, const char *fname, const char *dbname, int is_create, CACHETABLE cachetable) {
/* If dbname is NULL then we setup to hold a single tree. Otherwise we setup an array. */ /* If dbname is NULL then we setup to hold a single tree. Otherwise we setup an array. */
int r; int r;
BRT t;
char *malloced_name=0; char *malloced_name=0;
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE: %s:%d open_brt(%s, \"%s\", %d, %p, %d, %p)\n", WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE: %s:%d open_brt(%s, \"%s\", %d, %p, %d, %p)\n",
__FILE__, __LINE__, fname, dbname, is_create, newbrt, nodesize, cachetable)); __FILE__, __LINE__, fname, dbname, is_create, newbrt, nodesize, cachetable));
if ((MALLOC(t))==0) { if (0) { died0: assert(r); return r; }
assert(errno==ENOMEM);
r = ENOMEM;
if (0) { died0: toku_free(t); }
return r;
}
t->compare_fun = compare_fun;
t->skey = t->sval = 0;
if (dbname) { if (dbname) {
malloced_name = toku_strdup(dbname); malloced_name = toku_strdup(dbname);
if (malloced_name==0) { if (malloced_name==0) {
...@@ -1130,12 +1318,13 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, ...@@ -1130,12 +1318,13 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
r=cachetable_openf(&t->cf, cachetable, fname, O_RDWR | (is_create ? O_CREAT : 0), 0777); r=cachetable_openf(&t->cf, cachetable, fname, O_RDWR | (is_create ? O_CREAT : 0), 0777);
if (r!=0) { if (r!=0) {
if (0) { died1: cachefile_close(&t->cf); } if (0) { died1: cachefile_close(&t->cf); }
t->database_name = 0;
goto died0a; goto died0a;
} }
assert(nodesize>0); assert(t->nodesize>0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items(); //printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
if (is_create) { if (is_create) {
r = read_and_pin_brt_header(t->cf, &t->h); r = toku_read_and_pin_brt_header(t->cf, &t->h);
if (r==-1) { if (r==-1) {
/* construct a new header. */ /* construct a new header. */
if ((MALLOC(t->h))==0) { if ((MALLOC(t->h))==0) {
...@@ -1145,23 +1334,24 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, ...@@ -1145,23 +1334,24 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
goto died1; goto died1;
} }
t->h->dirty=1; t->h->dirty=1;
t->h->nodesize=nodesize; t->h->flags = t->flags;
t->h->nodesize=t->nodesize;
t->h->freelist=-1; t->h->freelist=-1;
t->h->unused_memory=2*nodesize; t->h->unused_memory=2*t->nodesize;
if (dbname) { if (dbname) {
t->h->unnamed_root = -1; t->h->unnamed_root = -1;
t->h->n_named_roots = 1; t->h->n_named_roots = 1;
if ((MALLOC_N(1, t->h->names))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died3: toku_free(t->h->names); } goto died2; } if ((MALLOC_N(1, t->h->names))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died3: toku_free(t->h->names); } goto died2; }
if ((MALLOC_N(1, t->h->roots))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died4: toku_free(t->h->roots); } goto died3; } if ((MALLOC_N(1, t->h->roots))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died4: toku_free(t->h->roots); } goto died3; }
if ((t->h->names[0] = toku_strdup(dbname))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died5: toku_free(t->h->names[0]); } goto died4; } if ((t->h->names[0] = toku_strdup(dbname))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died5: toku_free(t->h->names[0]); } goto died4; }
t->h->roots[0] = nodesize; t->h->roots[0] = t->nodesize;
} else { } else {
t->h->unnamed_root = nodesize; t->h->unnamed_root = t->nodesize;
t->h->n_named_roots = -1; t->h->n_named_roots = -1;
t->h->names=0; t->h->names=0;
t->h->roots=0; t->h->roots=0;
} }
if ((r=setup_brt_root_node(t, nodesize))!=0) { if (dbname) goto died5; else goto died2; } if ((r=setup_brt_root_node(t, t->nodesize))!=0) { if (dbname) goto died5; else goto died2; }
if ((r=cachetable_put(t->cf, 0, t->h, 0, brtheader_flush_callback, brtheader_fetch_callback, 0))) { if (dbname) goto died5; else goto died2; } if ((r=cachetable_put(t->cf, 0, t->h, 0, brtheader_flush_callback, brtheader_fetch_callback, 0))) { if (dbname) goto died5; else goto died2; }
} else { } else {
int i; int i;
...@@ -1178,18 +1368,18 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, ...@@ -1178,18 +1368,18 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
if ((t->h->roots = toku_realloc(t->h->roots, (1+t->h->n_named_roots)*sizeof(*t->h->roots))) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; } if ((t->h->roots = toku_realloc(t->h->roots, (1+t->h->n_named_roots)*sizeof(*t->h->roots))) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; }
t->h->n_named_roots++; t->h->n_named_roots++;
if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; } if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; }
printf("%s:%d t=%p\n", __FILE__, __LINE__, t); //printf("%s:%d t=%p\n", __FILE__, __LINE__, t);
t->h->roots[t->h->n_named_roots-1] = malloc_diskblock_header_is_in_memory(t, t->h->nodesize); t->h->roots[t->h->n_named_roots-1] = malloc_diskblock_header_is_in_memory(t, t->h->nodesize);
t->h->dirty = 1; t->h->dirty = 1;
if ((r=setup_brt_root_node(t, t->h->roots[t->h->n_named_roots-1]))!=0) goto died1; if ((r=setup_brt_root_node(t, t->h->roots[t->h->n_named_roots-1]))!=0) goto died1;
} }
} else { } else {
if ((r = read_and_pin_brt_header(t->cf, &t->h))!=0) goto died1; if ((r = toku_read_and_pin_brt_header(t->cf, &t->h))!=0) goto died1;
if (!dbname) { if (!dbname) {
if (t->h->n_named_roots!=-1) { r = -2; /* invalid args??? */; goto died1; } if (t->h->n_named_roots!=-1) { r = -2; /* invalid args??? */; goto died1; }
} else { } else {
int i; int i;
printf("%s:%d n_roots=%d\n", __FILE__, __LINE__, t->h->n_named_roots); // printf("%s:%d n_roots=%d\n", __FILE__, __LINE__, t->h->n_named_roots);
for (i=0; i<t->h->n_named_roots; i++) { for (i=0; i<t->h->n_named_roots; i++) {
if (strcmp(t->h->names[i], dbname)==0) { if (strcmp(t->h->names[i], dbname)==0) {
goto found_it; goto found_it;
...@@ -1199,17 +1389,39 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, ...@@ -1199,17 +1389,39 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
r=ENOENT; /* the database doesn't exist */ r=ENOENT; /* the database doesn't exist */
goto died1; goto died1;
} }
found_it: ; found_it:
t->nodesize = t->h->nodesize; /* inherit the pagesize from the file */
if (t->flags != t->h->flags) { /* flags must match */
r = EINVAL; goto died1;
}
} }
assert(t->h); assert(t->h);
if ((r = unpin_brt_header(t)) !=0) goto died1; if ((r = toku_unpin_brt_header(t)) !=0) goto died1;
assert(t->h==0); assert(t->h==0);
WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE -> %p\n", t)); WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE -> %p\n", t));
t->cursors_head = t->cursors_tail = 0;
*newbrt = t;
return 0; return 0;
} }
/*
 * Convenience wrapper: create a BRT handle, apply the node size and key
 * comparison function, and open it on the given file.
 *
 *   fname       - file name, passed through to brt_open()
 *   dbname      - database name within the file, passed through to brt_open()
 *   is_create   - passed through to brt_open()
 *   newbrt      - out: receives the opened handle; written only on success
 *   nodesize    - node size installed with brt_set_nodesize()
 *   cachetable  - cache table, passed through to brt_open()
 *   compare_fun - key comparison callback installed with brt_set_bt_compare()
 *
 * Returns 0 on success, otherwise the nonzero error from brt_create() or
 * brt_open().  On failure *newbrt is left untouched and the handle allocated
 * here is released.
 */
int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable,
              int (*compare_fun)(DB*,const DBT*,const DBT*)) {
    BRT brt;
    int r;

    r = brt_create(&brt);
    if (r != 0)
        return r;

    /* Both setters return 0 unconditionally, so their results are not checked. */
    brt_set_nodesize(brt, nodesize);
    brt_set_bt_compare(brt, compare_fun);

    r = brt_open(brt, fname, dbname, is_create, cachetable);
    if (r != 0) {
        /* brt_open() does not free the handle on its error path, and the
         * caller never sees the pointer, so it must be released here or it
         * leaks. */
        toku_free(brt);
        return r;
    }

    *newbrt = brt;
    return r;
}
int close_brt (BRT brt) { int close_brt (BRT brt) {
int r; int r;
while (brt->cursors_head) { while (brt->cursors_head) {
...@@ -1217,9 +1429,11 @@ int close_brt (BRT brt) { ...@@ -1217,9 +1429,11 @@ int close_brt (BRT brt) {
r=brt_cursor_close(c); r=brt_cursor_close(c);
if (r!=0) return r; if (r!=0) return r;
} }
if (brt->cf) {
assert(0==cachefile_count_pinned(brt->cf, 1)); assert(0==cachefile_count_pinned(brt->cf, 1));
//printf("%s:%d closing cachetable\n", __FILE__, __LINE__); //printf("%s:%d closing cachetable\n", __FILE__, __LINE__);
if ((r = cachefile_close(&brt->cf))!=0) return r; if ((r = cachefile_close(&brt->cf))!=0) return r;
}
if (brt->database_name) toku_free(brt->database_name); if (brt->database_name) toku_free(brt->database_name);
if (brt->skey) { toku_free(brt->skey); } if (brt->skey) { toku_free(brt->skey); }
if (brt->sval) { toku_free(brt->sval); } if (brt->sval) { toku_free(brt->sval); }
...@@ -1229,7 +1443,7 @@ int close_brt (BRT brt) { ...@@ -1229,7 +1443,7 @@ int close_brt (BRT brt) {
int brt_debug_mode = 0;//strcmp(key,"hello387")==0; int brt_debug_mode = 0;//strcmp(key,"hello387")==0;
CACHEKEY* calculate_root_offset_pointer (BRT brt) { CACHEKEY* toku_calculate_root_offset_pointer (BRT brt) {
if (brt->database_name==0) { if (brt->database_name==0) {
return &brt->h->unnamed_root; return &brt->h->unnamed_root;
} else { } else {
...@@ -1246,7 +1460,7 @@ CACHEKEY* calculate_root_offset_pointer (BRT brt) { ...@@ -1246,7 +1460,7 @@ CACHEKEY* calculate_root_offset_pointer (BRT brt) {
int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *rootp) { int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *rootp) {
TAGMALLOC(BRTNODE, newroot); TAGMALLOC(BRTNODE, newroot);
int r; int r;
diskoff newroot_diskoff=malloc_diskblock(brt, brt->h->nodesize); DISKOFF newroot_diskoff=malloc_diskblock(brt, brt->h->nodesize);
assert(newroot); assert(newroot);
*rootp=newroot_diskoff; *rootp=newroot_diskoff;
brt->h->dirty=1; brt->h->dirty=1;
...@@ -1260,9 +1474,15 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE ...@@ -1260,9 +1474,15 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE
newroot->u.n.totalchildkeylens=splitk.size; newroot->u.n.totalchildkeylens=splitk.size;
newroot->u.n.children[0]=nodea->thisnodename; newroot->u.n.children[0]=nodea->thisnodename;
newroot->u.n.children[1]=nodeb->thisnodename; newroot->u.n.children[1]=nodeb->thisnodename;
nodea->parent_brtnode = newroot;
nodeb->parent_brtnode = newroot;
fixup_child_fingerprint(newroot, 0, nodea);
fixup_child_fingerprint(newroot, 1, nodeb);
r=toku_hashtable_create(&newroot->u.n.htables[0]); if (r!=0) return r; r=toku_hashtable_create(&newroot->u.n.htables[0]); if (r!=0) return r;
r=toku_hashtable_create(&newroot->u.n.htables[1]); if (r!=0) return r; r=toku_hashtable_create(&newroot->u.n.htables[1]); if (r!=0) return r;
verify_counts(newroot); verify_counts(newroot);
//verify_local_fingerprint_nonleaf(nodea);
//verify_local_fingerprint_nonleaf(nodeb);
r=cachetable_unpin(brt->cf, nodea->thisnodename, nodea->dirty, brtnode_size(nodea)); r=cachetable_unpin(brt->cf, nodea->thisnodename, nodea->dirty, brtnode_size(nodea));
if (r!=0) return r; if (r!=0) return r;
r=cachetable_unpin(brt->cf, nodeb->thisnodename, nodeb->dirty, brtnode_size(nodeb)); r=cachetable_unpin(brt->cf, nodeb->thisnodename, nodeb->dirty, brtnode_size(nodeb));
...@@ -1274,7 +1494,7 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE ...@@ -1274,7 +1494,7 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE
return 0; return 0;
} }
int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) { static int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
void *node_v; void *node_v;
BRTNODE node; BRTNODE node;
CACHEKEY *rootp; CACHEKEY *rootp;
...@@ -1284,11 +1504,11 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) { ...@@ -1284,11 +1504,11 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
DBT splitk; DBT splitk;
int debug = brt_debug_mode;//strcmp(key,"hello387")==0; int debug = brt_debug_mode;//strcmp(key,"hello387")==0;
//assert(0==cachetable_assert_all_unpinned(brt->cachetable)); //assert(0==cachetable_assert_all_unpinned(brt->cachetable));
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: unpin_brt_header(brt); } if (0) { died0: toku_unpin_brt_header(brt); }
return r; return r;
} }
rootp = calculate_root_offset_pointer(brt); rootp = toku_calculate_root_offset_pointer(brt);
if (debug) printf("%s:%d Getting %lld\n", __FILE__, __LINE__, *rootp); if (debug) printf("%s:%d Getting %lld\n", __FILE__, __LINE__, *rootp);
if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL, if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) { brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) {
...@@ -1324,7 +1544,7 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) { ...@@ -1324,7 +1544,7 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
size = brtnode_size(node); size = brtnode_size(node);
} }
cachetable_unpin(brt->cf, *rootp, dirty, size); cachetable_unpin(brt->cf, *rootp, dirty, size);
r = unpin_brt_header(brt); r = toku_unpin_brt_header(brt);
assert(r == 0); assert(r == 0);
//assert(0==cachetable_assert_all_unpinned(brt->cachetable)); //assert(0==cachetable_assert_all_unpinned(brt->cachetable));
return result; return result;
...@@ -1342,7 +1562,7 @@ int brt_insert (BRT brt, DBT *key, DBT *val, DB* db, TOKUTXN txn) { ...@@ -1342,7 +1562,7 @@ int brt_insert (BRT brt, DBT *key, DBT *val, DB* db, TOKUTXN txn) {
return r; return r;
} }
int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE parent_brtnode) { int brt_lookup_node (BRT brt, DISKOFF off, DBT *k, DBT *v, DB *db, BRTNODE parent_brtnode) {
int result; int result;
void *node_v; void *node_v;
int r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL, int r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
...@@ -1361,6 +1581,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren ...@@ -1361,6 +1581,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
if (node->height==0) { if (node->height==0) {
result = pma_lookup(node->u.l.buffer, k, v, db); result = pma_lookup(node->u.l.buffer, k, v, db);
//printf("%s:%d looked up something, got answerlen=%d\n", __FILE__, __LINE__, answerlen); //printf("%s:%d looked up something, got answerlen=%d\n", __FILE__, __LINE__, answerlen);
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0); r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0); assert(r == 0);
return result; return result;
...@@ -1383,6 +1604,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren ...@@ -1383,6 +1604,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
assert(0); assert(0);
result = -1; // some versions of gcc complain result = -1; // some versions of gcc complain
} }
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0); r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0); assert(r == 0);
return result; return result;
...@@ -1390,6 +1612,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren ...@@ -1390,6 +1612,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
} }
result = brt_lookup_node(brt, node->u.n.children[childnum], k, v, db, node); result = brt_lookup_node(brt, node->u.n.children[childnum], k, v, db, node);
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0); r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0); assert(r == 0);
return result; return result;
...@@ -1400,20 +1623,20 @@ int brt_lookup (BRT brt, DBT *k, DBT *v, DB *db) { ...@@ -1400,20 +1623,20 @@ int brt_lookup (BRT brt, DBT *k, DBT *v, DB *db) {
int r; int r;
CACHEKEY *rootp; CACHEKEY *rootp;
assert(0==cachefile_count_pinned(brt->cf, 1)); assert(0==cachefile_count_pinned(brt->cf, 1));
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
printf("%s:%d\n", __FILE__, __LINE__); printf("%s:%d\n", __FILE__, __LINE__);
if (0) { died0: unpin_brt_header(brt); } if (0) { died0: toku_unpin_brt_header(brt); }
// printf("%s:%d returning %d\n", __FILE__, __LINE__, r); // printf("%s:%d returning %d\n", __FILE__, __LINE__, r);
assert(0==cachefile_count_pinned(brt->cf, 1)); assert(0==cachefile_count_pinned(brt->cf, 1));
return r; return r;
} }
rootp = calculate_root_offset_pointer(brt); rootp = toku_calculate_root_offset_pointer(brt);
if ((r = brt_lookup_node(brt, *rootp, k, v, db, 0))) { if ((r = brt_lookup_node(brt, *rootp, k, v, db, 0))) {
// printf("%s:%d\n", __FILE__, __LINE__); // printf("%s:%d\n", __FILE__, __LINE__);
goto died0; goto died0;
} }
//printf("%s:%d r=%d", __FILE__, __LINE__, r); if (r==0) printf(" vallen=%d", *vallen); printf("\n"); //printf("%s:%d r=%d", __FILE__, __LINE__, r); if (r==0) printf(" vallen=%d", *vallen); printf("\n");
if ((r = unpin_brt_header(brt))!=0) return r; if ((r = toku_unpin_brt_header(brt))!=0) return r;
assert(0==cachefile_count_pinned(brt->cf, 1)); assert(0==cachefile_count_pinned(brt->cf, 1));
return 0; return 0;
} }
...@@ -1433,9 +1656,9 @@ int brt_delete(BRT brt, DBT *key, DB *db) { ...@@ -1433,9 +1656,9 @@ int brt_delete(BRT brt, DBT *key, DB *db) {
return r; return r;
} }
int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode); int verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode);
int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, BRTNODE parent_brtnode) { int dump_brtnode (BRT brt, DISKOFF off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, BRTNODE parent_brtnode) {
int result=0; int result=0;
BRTNODE node; BRTNODE node;
void *node_v; void *node_v;
...@@ -1491,18 +1714,18 @@ int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lole ...@@ -1491,18 +1714,18 @@ int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lole
int dump_brt (BRT brt) { int dump_brt (BRT brt) {
int r; int r;
CACHEKEY *rootp; CACHEKEY *rootp;
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: unpin_brt_header(brt); } if (0) { died0: toku_unpin_brt_header(brt); }
return r; return r;
} }
rootp = calculate_root_offset_pointer(brt); rootp = toku_calculate_root_offset_pointer(brt);
printf("split_count=%d\n", split_count); printf("split_count=%d\n", split_count);
if ((r = dump_brtnode(brt, *rootp, 0, 0, 0, 0, 0, null_brtnode))) goto died0; if ((r = dump_brtnode(brt, *rootp, 0, 0, 0, 0, 0, null_brtnode))) goto died0;
if ((r = unpin_brt_header(brt))!=0) return r; if ((r = toku_unpin_brt_header(brt))!=0) return r;
return 0; return 0;
} }
int show_brtnode_blocknumbers (BRT brt, diskoff off, BRTNODE parent_brtnode) { int show_brtnode_blocknumbers (BRT brt, DISKOFF off, BRTNODE parent_brtnode) {
BRTNODE node; BRTNODE node;
void *node_v; void *node_v;
int i,r; int i,r;
...@@ -1528,93 +1751,15 @@ int show_brtnode_blocknumbers (BRT brt, diskoff off, BRTNODE parent_brtnode) { ...@@ -1528,93 +1751,15 @@ int show_brtnode_blocknumbers (BRT brt, diskoff off, BRTNODE parent_brtnode) {
int show_brt_blocknumbers (BRT brt) { int show_brt_blocknumbers (BRT brt) {
int r; int r;
CACHEKEY *rootp; CACHEKEY *rootp;
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) { if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: unpin_brt_header(brt); } if (0) { died0: toku_unpin_brt_header(brt); }
return r; return r;
} }
rootp = calculate_root_offset_pointer(brt); rootp = toku_calculate_root_offset_pointer(brt);
printf("BRT %p has blocks:", brt); printf("BRT %p has blocks:", brt);
if ((r=show_brtnode_blocknumbers (brt, *rootp, 0))) goto died0; if ((r=show_brtnode_blocknumbers (brt, *rootp, 0))) goto died0;
printf("\n"); printf("\n");
if ((r = unpin_brt_header(brt))!=0) return r; if ((r = toku_unpin_brt_header(brt))!=0) return r;
return 0;
}
/* Check the key-ordering invariants of the node stored at OFF and, when RECURSE is nonzero, of its subtree.
 *
 * The node is pinned through the cachetable for the duration of the check and
 * PARENT_BRTNODE is stored into node->parent_brtnode.  Only nodes with height>0 are inspected:
 *   - keys buffered in the per-child hashtables are compared against the pivot keys
 *     bounding that child (or against LORANGE/HIRANGE at the edges);
 *   - every pivot key is asserted to satisfy keycompare(lorange,pivot)<0 and keycompare(pivot,hirange)<=0;
 *   - with RECURSE set, each child is verified with the correspondingly narrowed range.
 * A null LORANGE or HIRANGE disables the check on that side.
 *
 * Returns the pin or unpin error if either is nonzero.  Otherwise returns RESULT, which is
 * set to 1 when a buffered key compares greater than its upper bound and is OR-ed with the
 * return values of the recursive calls.  Lower-bound and pivot violations abort via assert().
 */
int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode) {
int result=0;
BRTNODE node;
void *node_v;
int r;
/* Pin the node; the node size from the header is passed as the callbacks' extra argument. */
if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize)))
return r;
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
node=node_v;
node->parent_brtnode = parent_brtnode;
if (node->height>0) {
int i;
/* Pass 1: check the keys buffered in htables[i] against the range of child i.
 * NOTE(review): the bound n_children-1 means htables[n_children-1] is never iterated, and the
 * `i+1>=n_children` branch below cannot be taken inside this loop -- confirm whether
 * `i<n_children` was intended. */
for (i=0; i< node->u.n.n_children-1; i++) {
bytevec thislorange,thishirange;
ITEMLEN thislolen, thishilen;
/* Lower bound for child i: the caller's lorange for the first child, else pivot i-1. */
if (node->u.n.n_children==0 || i==0) {
thislorange=lorange;
thislolen =lolen;
} else {
thislorange=node->u.n.childkeys[i-1];
thislolen =node->u.n.childkeylens[i-1];
}
/* Upper bound for child i: the caller's hirange for the last child, else pivot i. */
if (node->u.n.n_children==0 || i+1>=node->u.n.n_children) {
thishirange=hirange;
thishilen =hilen;
} else {
thishirange=node->u.n.childkeys[i];
thishilen =node->u.n.childkeylens[i];
}
{
/* Nested function (GNU C extension) so the callback can read the bounds, i and result
 * from the enclosing scope.  Only the key is examined; the other arguments are ignored. */
void verify_pair (bytevec key, unsigned int keylen,
bytevec data __attribute__((__unused__)),
unsigned int datalen __attribute__((__unused__)),
int type __attribute__((__unused__)),
void *ignore __attribute__((__unused__))) {
/* A lower-bound violation aborts; an upper-bound violation is printed and recorded in result. */
if (thislorange) assert(keycompare(thislorange,thislolen,key,keylen)<0);
if (thishirange && keycompare(key,keylen,thishirange,thishilen)>0) {
printf("%s:%d in buffer %d key %s is bigger than %s\n", __FILE__, __LINE__, i, (char*)key, (char*)thishirange);
result=1;
}
}
toku_hashtable_iterate(node->u.n.htables[i], verify_pair, 0);
}
}
/* Pass 2: check each pivot key against the caller's range, then optionally descend. */
for (i=0; i<node->u.n.n_children; i++) {
if (i>0) {
if (lorange) assert(keycompare(lorange,lolen, node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1])<0);
if (hirange) assert(keycompare(node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1], hirange, hilen)<=0);
}
if (recurse) {
/* Child i is bounded by pivot i-1 below and pivot i above; the outermost
 * children inherit the caller's lorange/hirange.  This node becomes the child's parent. */
result|=verify_brtnode(brt, node->u.n.children[i],
(i==0) ? lorange : node->u.n.childkeys[i-1],
(i==0) ? lolen : node->u.n.childkeylens[i-1],
(i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i],
(i==node->u.n.n_children-1) ? hilen : node->u.n.childkeylens[i],
recurse,
node);
}
}
}
/* An unpin failure takes precedence over the verification result. */
if ((r = cachetable_unpin(brt->cf, off, 0, 0))) return r;
return result;
}
int verify_brt (BRT brt) {
int r;
CACHEKEY *rootp;
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: unpin_brt_header(brt); }
return r;
}
rootp = calculate_root_offset_pointer(brt);
if ((r=verify_brtnode(brt, *rootp, 0, 0, 0, 0, 1, null_brtnode))) goto died0;
if ((r = unpin_brt_header(brt))!=0) return r;
return 0; return 0;
} }
...@@ -1658,7 +1803,7 @@ void brt_flush_child(BRT t, BRTNODE node, int childnum, BRT_CURSOR cursor, void ...@@ -1658,7 +1803,7 @@ void brt_flush_child(BRT t, BRTNODE node, int childnum, BRT_CURSOR cursor, void
if (0) printf("child_did_split %lld %lld\n", childa->thisnodename, childb->thisnodename); if (0) printf("child_did_split %lld %lld\n", childa->thisnodename, childb->thisnodename);
if (i == 0) { if (i == 0) {
CACHEKEY *rootp = calculate_root_offset_pointer(t); CACHEKEY *rootp = toku_calculate_root_offset_pointer(t);
r = brt_init_new_root(t, childa, childb, child_splitk, rootp); r = brt_init_new_root(t, childa, childb, child_splitk, rootp);
assert(r == 0); assert(r == 0);
r = cachetable_unpin(t->cf, *rootp, CACHETABLE_DIRTY, 0); r = cachetable_unpin(t->cf, *rootp, CACHETABLE_DIRTY, 0);
...@@ -1802,6 +1947,7 @@ void brt_cursor_leaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE le ...@@ -1802,6 +1947,7 @@ void brt_cursor_leaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE le
if (0) printf("brt_cursor_leaf_split %p oldnode %lld newnode %lld\n", cursor, if (0) printf("brt_cursor_leaf_split %p oldnode %lld newnode %lld\n", cursor,
oldnode->thisnodename, newnode->thisnodename); oldnode->thisnodename, newnode->thisnodename);
//verify_local_fingerprint_nonleaf(oldnode);
r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode)); r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode));
assert(r == 0); assert(r == 0);
r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v); r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v);
...@@ -1877,6 +2023,7 @@ void brt_cursor_nonleaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE ...@@ -1877,6 +2023,7 @@ void brt_cursor_nonleaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE
if (0) printf("brt_cursor_nonleaf_split %p oldnode %lld newnode %lld\n", if (0) printf("brt_cursor_nonleaf_split %p oldnode %lld newnode %lld\n",
cursor, oldnode->thisnodename, newnode->thisnodename); cursor, oldnode->thisnodename, newnode->thisnodename);
// The oldnode is probably dead. But we say it is dirty? ???
r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode)); r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode));
assert(r == 0); assert(r == 0);
r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v); r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v);
...@@ -1949,7 +2096,7 @@ void brt_cursor_print(BRT_CURSOR cursor) { ...@@ -1949,7 +2096,7 @@ void brt_cursor_print(BRT_CURSOR cursor) {
printf("\n"); printf("\n");
} }
int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) { int brtcurs_set_position_last (BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt=cursor->brt; BRT brt=cursor->brt;
void *node_v; void *node_v;
...@@ -1973,7 +2120,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, ...@@ -1973,7 +2120,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db,
cursor->pathcnum[cursor->path_len-1] = childnum; cursor->pathcnum[cursor->path_len-1] = childnum;
brt_node_add_cursor(node, childnum, cursor); brt_node_add_cursor(node, childnum, cursor);
if (node->u.n.n_bytes_in_hashtable[childnum] > 0) { if (node->u.n.n_bytes_in_hashtable[childnum] > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
/* /*
* the flush may have been partially successfull. it may have also * the flush may have been partially successfull. it may have also
* changed the tree such that the current node have expanded or been * changed the tree such that the current node have expanded or been
...@@ -2011,7 +2158,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, ...@@ -2011,7 +2158,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db,
} }
} }
int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) { int brtcurs_set_position_first (BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt=cursor->brt; BRT brt=cursor->brt;
void *node_v; void *node_v;
...@@ -2035,7 +2182,7 @@ int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db ...@@ -2035,7 +2182,7 @@ int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db
cursor->pathcnum[cursor->path_len-1] = childnum; cursor->pathcnum[cursor->path_len-1] = childnum;
brt_node_add_cursor(node, childnum, cursor); brt_node_add_cursor(node, childnum, cursor);
if (node->u.n.n_bytes_in_hashtable[childnum] > 0) { if (node->u.n.n_bytes_in_hashtable[childnum] > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
/* /*
* the flush may have been partially successfull. it may have also * the flush may have been partially successfull. it may have also
* changed the tree such that the current node have expanded or been * changed the tree such that the current node have expanded or been
...@@ -2086,6 +2233,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn) ...@@ -2086,6 +2233,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
cursor->path_len -= 1; cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (brt_cursor_path_empty(cursor)) if (brt_cursor_path_empty(cursor))
...@@ -2105,7 +2253,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn) ...@@ -2105,7 +2253,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
more = node->u.n.n_bytes_in_hashtable[childnum]; more = node->u.n.n_bytes_in_hashtable[childnum];
if (more == 0) if (more == 0)
break; break;
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
} }
...@@ -2146,6 +2294,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn) ...@@ -2146,6 +2294,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
cursor->path_len -= 1; cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (brt_cursor_path_empty(cursor)) if (brt_cursor_path_empty(cursor))
...@@ -2165,7 +2314,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn) ...@@ -2165,7 +2314,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
more = node->u.n.n_bytes_in_hashtable[childnum]; more = node->u.n.n_bytes_in_hashtable[childnum];
if (more == 0) if (more == 0)
break; break;
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
} }
...@@ -2192,7 +2341,7 @@ int brtcurs_set_position_prev (BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn) ...@@ -2192,7 +2341,7 @@ int brtcurs_set_position_prev (BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
return 0; return 0;
} }
int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) { int brtcurs_set_key(BRT_CURSOR cursor, DISKOFF off, DBT *key, DBT *val, int flag, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt = cursor->brt; BRT brt = cursor->brt;
void *node_v; void *node_v;
int r; int r;
...@@ -2214,7 +2363,7 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag ...@@ -2214,7 +2363,7 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag
brt_node_add_cursor(node, childnum, cursor); brt_node_add_cursor(node, childnum, cursor);
int more = node->u.n.n_bytes_in_hashtable[childnum]; int more = node->u.n.n_bytes_in_hashtable[childnum];
if (more > 0) { if (more > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
brt_node_remove_cursor(node, childnum, cursor); brt_node_remove_cursor(node, childnum, cursor);
...@@ -2248,12 +2397,13 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag ...@@ -2248,12 +2397,13 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag
if (r != 0) { if (r != 0) {
cursor->path_len -= 1; cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node)); cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node));
} }
return r; return r;
} }
int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) { int brtcurs_set_range(BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt = cursor->brt; BRT brt = cursor->brt;
void *node_v; void *node_v;
int r; int r;
...@@ -2277,7 +2427,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN ...@@ -2277,7 +2427,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN
brt_node_add_cursor(node, childnum, cursor); brt_node_add_cursor(node, childnum, cursor);
int more = node->u.n.n_bytes_in_hashtable[childnum]; int more = node->u.n.n_bytes_in_hashtable[childnum];
if (more > 0) { if (more > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn); brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1]; node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1]; childnum = cursor->pathcnum[cursor->path_len-1];
brt_node_remove_cursor(node, childnum, cursor); brt_node_remove_cursor(node, childnum, cursor);
...@@ -2311,6 +2461,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN ...@@ -2311,6 +2461,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN
if (r != 0) { if (r != 0) {
cursor->path_len -= 1; cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node)); cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node));
} }
return r; return r;
...@@ -2323,6 +2474,7 @@ static int unpin_cursor (BRT_CURSOR cursor) { ...@@ -2323,6 +2474,7 @@ static int unpin_cursor (BRT_CURSOR cursor) {
for (i=0; i<cursor->path_len; i++) { for (i=0; i<cursor->path_len; i++) {
BRTNODE node = cursor->path[i]; BRTNODE node = cursor->path[i];
brt_node_remove_cursor(node, cursor->pathcnum[i], cursor); brt_node_remove_cursor(node, cursor->pathcnum[i], cursor);
//verify_local_fingerprint_nonleaf(node);
int r2 = cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); int r2 = cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (r==0) r=r2; if (r==0) r=r2;
} }
...@@ -2359,11 +2511,11 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO ...@@ -2359,11 +2511,11 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO
//dump_brt(cursor->brt); //dump_brt(cursor->brt);
//fprintf(stderr, "%s:%d in brt_c_get(...)\n", __FILE__, __LINE__); //fprintf(stderr, "%s:%d in brt_c_get(...)\n", __FILE__, __LINE__);
if ((r = read_and_pin_brt_header(cursor->brt->cf, &cursor->brt->h))) { if ((r = toku_read_and_pin_brt_header(cursor->brt->cf, &cursor->brt->h))) {
if (0) { died0: unpin_brt_header(cursor->brt); } if (0) { died0: toku_unpin_brt_header(cursor->brt); }
return r; return r;
} }
rootp = calculate_root_offset_pointer(cursor->brt); rootp = toku_calculate_root_offset_pointer(cursor->brt);
if (flags&DB_RMW) { if (flags&DB_RMW) {
do_rmw=1; do_rmw=1;
flags &= ~DB_RMW; flags &= ~DB_RMW;
...@@ -2427,7 +2579,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO ...@@ -2427,7 +2579,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO
abort(); abort();
} }
//printf("%s:%d unpinning header\n", __FILE__, __LINE__); //printf("%s:%d unpinning header\n", __FILE__, __LINE__);
if ((r = unpin_brt_header(cursor->brt))!=0) return r; if ((r = toku_unpin_brt_header(cursor->brt))!=0) return r;
return 0; return 0;
} }
......
...@@ -11,8 +11,15 @@ ...@@ -11,8 +11,15 @@
#include "log.h" #include "log.h"
typedef struct brt *BRT; typedef struct brt *BRT;
int open_brt (const char *fname, const char *dbname, int is_create, BRT *, int nodesize, CACHETABLE, int(*)(DB*,const DBT*,const DBT*)); int open_brt (const char *fname, const char *dbname, int is_create, BRT *, int nodesize, CACHETABLE, int(*)(DB*,const DBT*,const DBT*));
//int brt_create (BRT **, int nodesize, int n_nodes_in_cache); /* the nodesize and n_nodes in cache really should be separately configured. */
//int brt_open (BRT *, char *fname, char *dbname); int brt_create(BRT *);
int brt_set_flags(BRT, int flags);
int brt_set_nodesize(BRT, int nodesize);
int brt_set_bt_compare(BRT, int (*bt_compare)(DB *, const DBT*, const DBT*));
int brt_set_dup_compare(BRT, int (*dup_compare)(DB *, const DBT*, const DBT*));
int brt_set_cachetable(BRT, CACHETABLE);
int brt_open(BRT, const char *fname, const char *dbname, int is_create, CACHETABLE ct);
int brt_insert (BRT, DBT *, DBT *, DB*, TOKUTXN); int brt_insert (BRT, DBT *, DBT *, DB*, TOKUTXN);
int brt_lookup (BRT brt, DBT *k, DBT *v, DB*db); int brt_lookup (BRT brt, DBT *k, DBT *v, DB*db);
int brt_delete (BRT brt, DBT *k, DB *db); int brt_delete (BRT brt, DBT *k, DB *db);
...@@ -22,12 +29,11 @@ void brt_fsync (BRT); /* fsync, but don't clear the caches. */ ...@@ -22,12 +29,11 @@ void brt_fsync (BRT); /* fsync, but don't clear the caches. */
void brt_flush (BRT); /* fsync and clear the caches. */ void brt_flush (BRT); /* fsync and clear the caches. */
int brt_create_cachetable (CACHETABLE *t, int n_cachlines /* Pass 0 if you want the default. */);
/* create and initialize a cache table /* create and initialize a cache table
hashsize is the initialize size of the lookup table cachesize is the upper limit on the size of the size of the values in the table
cachesize is the upper limit on the size of the size of the values in the table */ pass 0 if you want the default */
int brt_create_cachetable_size (CACHETABLE *t, int hashsize, long cachesize);
int brt_create_cachetable(CACHETABLE *t, long cachesize, LSN initial_lsn, TOKULOGGER);
extern int brt_debug_mode; extern int brt_debug_mode;
int verify_brt (BRT brt); int verify_brt (BRT brt);
...@@ -40,4 +46,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int brtc_flags, DB *d ...@@ -40,4 +46,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int brtc_flags, DB *d
int brt_cursor_delete(BRT_CURSOR cursor, int flags); int brt_cursor_delete(BRT_CURSOR cursor, int flags);
int brt_cursor_close (BRT_CURSOR curs); int brt_cursor_close (BRT_CURSOR curs);
typedef struct brtenv *BRTENV;
int brtenv_checkpoint (BRTENV env);
#endif #endif
#ifndef BRTTYPES_H #ifndef BRTTYPES_H
#define BRTTYPES_H #define BRTTYPES_H
#include <sys/types.h>
#define _XOPEN_SOURCE 500 #define _XOPEN_SOURCE 500
#define _FILE_OFFSET_BITS 64 #define _FILE_OFFSET_BITS 64
typedef unsigned int ITEMLEN; typedef unsigned int ITEMLEN;
typedef const void *bytevec; typedef const void *bytevec;
//typedef const void *bytevec; //typedef const void *bytevec;
typedef long long diskoff; /* Offset in a disk. -1 is the NULL pointer. */ typedef long long DISKOFF; /* Offset in a disk. -1 is the NULL pointer. */
typedef long long TXNID; typedef long long TXNID;
/* Make the LSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_lsn { u_int64_t lsn; } LSN;
#define ZERO_LSN ((LSN){0})
/* Make the FILENUM a struct for the same reason. */
typedef struct __toku_fileid { u_int32_t fileid; } FILENUM;
typedef enum __toku_bool { FALSE=0, TRUE=1} BOOL;
typedef struct tokulogger *TOKULOGGER;
#define NULL_LOGGER ((TOKULOGGER)0)
typedef struct tokutxn *TOKUTXN;
#endif #endif
/* -*- mode: C; c-basic-offset: 4 -*- */
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
...@@ -28,7 +30,7 @@ static void expectN(CACHEKEY key) { ...@@ -28,7 +30,7 @@ static void expectN(CACHEKEY key) {
CACHEFILE expect_f; CACHEFILE expect_f;
static void flush (CACHEFILE f, CACHEKEY key, void*value, long size __attribute__((__unused__)), int write_me __attribute__((__unused__)), int keep_mee __attribute__((__unused__))) { static void flush (CACHEFILE f, CACHEKEY key, void*value, long size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
struct item *it = value; struct item *it = value;
int i; int i;
...@@ -60,12 +62,13 @@ struct item *make_item (CACHEKEY key) { ...@@ -60,12 +62,13 @@ struct item *make_item (CACHEKEY key) {
} }
CACHEKEY did_fetch=-1; CACHEKEY did_fetch=-1;
int fetch (CACHEFILE f, CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs) { int fetch (CACHEFILE f, CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
printf("Fetch %lld\n", key); printf("Fetch %lld\n", key);
assert (expect_f==f); assert (expect_f==f);
assert((long)extraargs==23); assert((long)extraargs==23);
*value = make_item(key); *value = make_item(key);
did_fetch=key; did_fetch=key;
written_lsn->lsn = 0;
return 0; return 0;
} }
...@@ -76,7 +79,7 @@ void test0 (void) { ...@@ -76,7 +79,7 @@ void test0 (void) {
CACHEFILE f; CACHEFILE f;
int r; int r;
char fname[] = "test.dat"; char fname[] = "test.dat";
r=create_cachetable(&t, 5, 5); r=create_cachetable(&t, 5, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777); r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
...@@ -177,15 +180,17 @@ void test0 (void) { ...@@ -177,15 +180,17 @@ void test0 (void) {
} }
static void flush_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), void *value, static void flush_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), void *value,
long size __attribute__((__unused__)), int write_me __attribute__((__unused__)), long size __attribute__((__unused__)),
int keep_me __attribute__((__unused__))) { BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute ((__unused__))) {
int *v = value; int *v = value;
assert(*v==0); assert(*v==0);
} }
static int fetch_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), static int fetch_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)),
void**value, long *sizep __attribute__((__unused__)), void*extraargs) { void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==42); assert((long)extraargs==42);
*value=0; *value=0;
written_lsn->lsn = 0;
return 0; return 0;
} }
...@@ -198,7 +203,7 @@ void test_nested_pin (void) { ...@@ -198,7 +203,7 @@ void test_nested_pin (void) {
int r; int r;
void *vv; void *vv;
char fname[] = "test_ct.dat"; char fname[] = "test_ct.dat";
r = create_cachetable(&t, 1, 1); r = create_cachetable(&t, 1, ZERO_LSN, NULL_LOGGER);
assert(r==0); assert(r==0);
unlink(fname); unlink(fname);
r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777); r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
...@@ -231,17 +236,21 @@ void null_flush (CACHEFILE cf __attribute__((__unused__)), ...@@ -231,17 +236,21 @@ void null_flush (CACHEFILE cf __attribute__((__unused__)),
CACHEKEY k __attribute__((__unused__)), CACHEKEY k __attribute__((__unused__)),
void *v __attribute__((__unused__)), void *v __attribute__((__unused__)),
long size __attribute__((__unused__)), long size __attribute__((__unused__)),
int write_me __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)),
int keep_me __attribute__((__unused__))) { BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)),
BOOL rename_p __attribute__((__unused__))) {
} }
int add123_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs) { int add123_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==123); assert((long)extraargs==123);
*value = (void*)((unsigned long)key+123L); *value = (void*)((unsigned long)key+123L);
written_lsn->lsn = 0;
return 0; return 0;
} }
int add222_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs) { int add222_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==222); assert((long)extraargs==222);
*value = (void*)((unsigned long)key+222L); *value = (void*)((unsigned long)key+222L);
written_lsn->lsn = 0;
return 0; return 0;
} }
...@@ -257,7 +266,7 @@ void test_multi_filehandles (void) { ...@@ -257,7 +266,7 @@ void test_multi_filehandles (void) {
unlink(fname1); unlink(fname1);
unlink(fname2); unlink(fname2);
r = create_cachetable(&t, 4, 4); assert(r==0); r = create_cachetable(&t, 4, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = cachetable_openf(&f1, t, fname1, O_RDWR|O_CREAT, 0777); assert(r==0); r = cachetable_openf(&f1, t, fname1, O_RDWR|O_CREAT, 0777); assert(r==0);
r = link(fname1, fname2); assert(r==0); r = link(fname1, fname2); assert(r==0);
r = cachetable_openf(&f2, t, fname2, O_RDWR|O_CREAT, 0777); assert(r==0); r = cachetable_openf(&f2, t, fname2, O_RDWR|O_CREAT, 0777); assert(r==0);
...@@ -282,12 +291,13 @@ void test_multi_filehandles (void) { ...@@ -282,12 +291,13 @@ void test_multi_filehandles (void) {
r = cachetable_close(&t); assert(r==0); r = cachetable_close(&t); assert(r==0);
} }
void test_dirty_flush(CACHEFILE f, CACHEKEY key, void *value, long size, int write, int keep) { void test_dirty_flush(CACHEFILE f, CACHEKEY key, void *value, long size, BOOL do_write, BOOL keep, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
printf("test_dirty_flush %p %lld %p %ld %d %d\n", f, key, value, size, write, keep); printf("test_dirty_flush %p %lld %p %ld %d %d\n", f, key, value, size, do_write, keep);
} }
int test_dirty_fetch(CACHEFILE f, CACHEKEY key, void **value_ptr, long *size_ptr, void *arg) { int test_dirty_fetch(CACHEFILE f, CACHEKEY key, void **value_ptr, long *size_ptr, void *arg, LSN *written_lsn) {
*value_ptr = arg; *value_ptr = arg;
written_lsn->lsn = 0;
printf("test_dirty_fetch %p %lld %p %ld %p\n", f, key, *value_ptr, *size_ptr, arg); printf("test_dirty_fetch %p %lld %p %ld %p\n", f, key, *value_ptr, *size_ptr, arg);
return 0; return 0;
} }
...@@ -301,7 +311,7 @@ void test_dirty() { ...@@ -301,7 +311,7 @@ void test_dirty() {
int dirty; long long pinned; long entry_size; int dirty; long long pinned; long entry_size;
int r; int r;
r = create_cachetable(&t, 4, 4); r = create_cachetable(&t, 4, ZERO_LSN, NULL_LOGGER);
assert(r == 0); assert(r == 0);
char *fname = "test.dat"; char *fname = "test.dat";
...@@ -393,8 +403,8 @@ void test_dirty() { ...@@ -393,8 +403,8 @@ void test_dirty() {
int test_size_debug; int test_size_debug;
CACHEKEY test_size_flush_key; CACHEKEY test_size_flush_key;
void test_size_flush_callback(CACHEFILE f, CACHEKEY key, void *value, long size, int write, int keep) { void test_size_flush_callback(CACHEFILE f, CACHEKEY key, void *value, long size, BOOL do_write, BOOL keep, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
if (test_size_debug) printf("test_size_flush %p %lld %p %ld %d %d\n", f, key, value, size, write, keep); if (test_size_debug) printf("test_size_flush %p %lld %p %ld %d %d\n", f, key, value, size, do_write, keep);
assert(write != 0); assert(write != 0);
test_size_flush_key = key; test_size_flush_key = key;
} }
...@@ -409,7 +419,7 @@ void test_size_resize() { ...@@ -409,7 +419,7 @@ void test_size_resize() {
int n = 3; int n = 3;
long size = 1; long size = 1;
r = create_cachetable(&t, n, n*size); r = create_cachetable(&t, n*size, ZERO_LSN, NULL_LOGGER);
assert(r == 0); assert(r == 0);
char *fname = "test.dat"; char *fname = "test.dat";
...@@ -460,7 +470,7 @@ void test_size_flush() { ...@@ -460,7 +470,7 @@ void test_size_flush() {
const int n = 8; const int n = 8;
long long size = 1*1024*1024; long long size = 1*1024*1024;
r = create_cachetable(&t, 3, n*size); r = create_cachetable(&t, n*size, ZERO_LSN, NULL_LOGGER);
assert(r == 0); assert(r == 0);
char *fname = "test.dat"; char *fname = "test.dat";
...@@ -509,7 +519,99 @@ void test_size_flush() { ...@@ -509,7 +519,99 @@ void test_size_flush() {
assert(r == 0); assert(r == 0);
} }
/* Bookkeeping for test_rename and its flush callback r_flush.
 * KEYLIMIT is the capacity of keys[]/vals[] and is also the size limit test_rename passes to create_cachetable.
 * TRIALLIMIT is the number of random operations test_rename performs.
 */
enum { KEYLIMIT = 4, TRIALLIMIT=64 };
/* keys[i] and vals[i] (for i<n_keys) are the key/value pairs the test believes are in the cachetable. */
CACHEKEY keys[KEYLIMIT];
void* vals[KEYLIMIT];
/* Number of slots of keys[]/vals[] currently in use. */
int n_keys=0;
/* Flush callback for test_rename.
 * Looks up K in the keys[]/vals[] bookkeeping and checks that VALUE is the value
 * recorded for it.  When the entry is being dropped (KEEP_ME false) the slot is
 * released by moving the last tracked pair into it.  When KEEP_ME is true the
 * entry remains tracked and the bookkeeping is left untouched.
 * A flush for a key that is not tracked at all is a test failure: it prints
 * "Whoops" and aborts.
 */
static void r_flush (CACHEFILE f __attribute__((__unused__)),
                     CACHEKEY k, void *value,
                     long size __attribute__((__unused__)),
                     BOOL write_me __attribute__((__unused__)),
                     BOOL keep_me,
                     LSN modified_lsn __attribute__((__unused__)),
                     BOOL rename_p __attribute__((__unused__))) {
    int i;
    //printf("Flush\n");
    for (i=0; i<n_keys; i++) {
        if (keys[i]!=k) continue;
        assert(vals[i]==value);
        if (!keep_me) {
            /* Order of the tracking arrays does not matter, so fill the hole with the last pair. */
            keys[i]=keys[n_keys-1];
            vals[i]=vals[n_keys-1];
            n_keys--;
        }
        /* The key was found: previously a keep_me flush fell through to the abort below. */
        return;
    }
    fprintf(stderr, "Whoops\n");
    abort();
}
/* Fetch callback for test_rename.
 * test_rename only looks up keys it has already put, so this is not expected to run;
 * if it does, it reports that on stderr.  None of the output arguments are written,
 * and 0 is returned.
 */
int r_fetch (CACHEFILE f __attribute__((__unused__)),
             CACHEKEY key __attribute__((__unused__)),
             void**value __attribute__((__unused__)),
             long *sizep __attribute__((__unused__)),
             void*extraargs __attribute__((__unused__)),
             LSN *modified_lsn __attribute__((__unused__))) {
    fputs("Whoops, this should never be called", stderr);
    return 0;
}
/* Randomized test of cachetable_rename.
 * Creates a cachetable with size limit KEYLIMIT over a fresh file, then performs
 * TRIALLIMIT random operations: roughly two out of three insert a new random
 * key/value pair of size 1, the rest rename a randomly chosen tracked key to a
 * new random key.  keys[]/vals[]/n_keys mirror what should be in the table, and
 * r_flush removes pairs from that mirror as entries are flushed out.
 * After closing the file and the table, every tracked pair must have been flushed
 * (n_keys == 0).
 */
void test_rename (void) {
    CACHETABLE t;
    CACHEFILE f;
    int i;
    int r;
    const char fname[] = "ct-test-rename.dat";
    r=create_cachetable(&t, KEYLIMIT, ZERO_LSN, NULL_LOGGER); assert(r==0);
    unlink(fname);
    r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
    assert(r==0);

    for (i=0; i<TRIALLIMIT; i++) {
        int ra = random()%3;
        if (ra<=1) {
            // Insert something
            CACHEKEY nkey = random();
            long nval = random();
            //printf("n_keys=%d Insert %08llx\n", n_keys, nkey);
            r = cachetable_put(f, nkey, (void*)nval, 1,
                               r_flush, r_fetch, 0);
            assert(r==0);
            /* There must be a free tracking slot for the new pair
             * (presumably the put has already flushed an old entry via r_flush when the table was full). */
            assert(n_keys<KEYLIMIT);
            keys[n_keys] = nkey;
            vals[n_keys] = (void*)nval;
            n_keys++;
            r = cachetable_unpin(f, nkey, CACHETABLE_DIRTY, 1);
            assert(r==0);
        } else if (ra==2 && n_keys>0) {
            // Rename something
            int objnum = random()%n_keys;
            CACHEKEY okey = keys[objnum];
            CACHEKEY nkey = random();
            void *current_value;
            long current_size;
            /* Track the pair under its new key before touching the table. */
            keys[objnum]=nkey;
            //printf("Rename %llx to %llx\n", okey, nkey);
            r = cachetable_get_and_pin(f, okey, &current_value, &current_size, r_flush, r_fetch, 0);
            assert(r==0);
            r = cachetable_rename(f, okey, nkey);
            assert(r==0);
            /* The entry is now unpinned under its new name; check the result like every other call here. */
            r = cachetable_unpin(f, nkey, CACHETABLE_DIRTY, 1);
            assert(r==0);
        }
    }

    r = cachefile_close(&f);
    assert(r == 0);
    r = cachetable_close(&t);
    assert(r == 0);
    assert(n_keys == 0);
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
test_rename();
test0(); test0();
test_nested_pin(); test_nested_pin();
test_multi_filehandles (); test_multi_filehandles ();
......
...@@ -58,7 +58,14 @@ static void file_is_not_present(CACHEFILE cf) { ...@@ -58,7 +58,14 @@ static void file_is_not_present(CACHEFILE cf) {
} }
static void flush_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void *value, long size __attribute__((__unused__)), int write_me __attribute__((__unused__)), int keep_me __attribute__((__unused__))) { static void flush_forchain (CACHEFILE f __attribute__((__unused__)),
CACHEKEY key,
void *value,
long size __attribute__((__unused__)),
BOOL write_me __attribute__((__unused__)),
BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)),
BOOL rename_p __attribute__((__unused__))) {
int *v = value; int *v = value;
//cachetable_print_state(ct); //cachetable_print_state(ct);
//printf("Flush %lld %d\n", key, (int)value); //printf("Flush %lld %d\n", key, (int)value);
...@@ -67,9 +74,10 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY ke ...@@ -67,9 +74,10 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY ke
//print_ints(); //print_ints();
} }
static int fetch_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs) { static int fetch_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==(long)key); assert((long)extraargs==(long)key);
*value = (void*)(long)key; *value = (void*)(long)key;
written_lsn->lsn = 0;
return 0; return 0;
} }
...@@ -93,9 +101,9 @@ void test_chaining (void) { ...@@ -93,9 +101,9 @@ void test_chaining (void) {
char fname[N_FILES][FILENAME_LEN]; char fname[N_FILES][FILENAME_LEN];
int r; int r;
long i, trial; long i, trial;
r = create_cachetable(&ct, N_PRESENT_LIMIT, N_PRESENT_LIMIT); assert(r==0); r = create_cachetable(&ct, N_PRESENT_LIMIT, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<N_FILES; i++) { for (i=0; i<N_FILES; i++) {
int r = snprintf(fname[i], FILENAME_LEN, "cachetabletest2.%ld.dat", i); r = snprintf(fname[i], FILENAME_LEN, "cachetabletest2.%ld.dat", i);
assert(r>0 && r<FILENAME_LEN); assert(r>0 && r<FILENAME_LEN);
unlink(fname[i]); unlink(fname[i]);
r = cachetable_openf(&f[i], ct, fname[i], O_RDWR|O_CREAT, 0777); assert(r==0); r = cachetable_openf(&f[i], ct, fname[i], O_RDWR|O_CREAT, 0777); assert(r==0);
......
...@@ -29,12 +29,15 @@ struct ctpair { ...@@ -29,12 +29,15 @@ struct ctpair {
PAIR next,prev; // In LRU list. PAIR next,prev; // In LRU list.
PAIR hash_chain; PAIR hash_chain;
CACHEFILE cachefile; CACHEFILE cachefile;
cachetable_flush_func_t flush_callback; CACHETABLE_FLUSH_FUNC_T flush_callback;
cachetable_fetch_func_t fetch_callback; CACHETABLE_FETCH_FUNC_T fetch_callback;
void*extraargs; void *extraargs;
int verify_flag; /* Used in verify_cachetable() */ int verify_flag; /* Used in verify_cachetable() */
LSN modified_lsn; // What was the LSN when modified (undefined if not dirty)
LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch)
}; };
// The cachetable is as close to an ENV as we get.
struct cachetable { struct cachetable {
enum typ_tag tag; enum typ_tag tag;
int n_in_table; int n_in_table;
...@@ -44,6 +47,8 @@ struct cachetable { ...@@ -44,6 +47,8 @@ struct cachetable {
CACHEFILE cachefiles; CACHEFILE cachefiles;
long size_current, size_limit; long size_current, size_limit;
int primeidx; int primeidx;
LSN lsn_of_checkpoint; // the most recent checkpoint in the log.
TOKULOGGER logger;
}; };
struct fileid { struct fileid {
...@@ -57,9 +62,10 @@ struct cachefile { ...@@ -57,9 +62,10 @@ struct cachefile {
int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */ int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */
CACHETABLE cachetable; CACHETABLE cachetable;
struct fileid fileid; struct fileid fileid;
FILENUM filenum;
}; };
int create_cachetable(CACHETABLE *result, int table_size __attribute__((unused)), long size_limit) { int create_cachetable(CACHETABLE *result, long size_limit, LSN initial_lsn, TOKULOGGER logger) {
TAGMALLOC(CACHETABLE, t); TAGMALLOC(CACHETABLE, t);
int i; int i;
t->n_in_table = 0; t->n_in_table = 0;
...@@ -74,6 +80,8 @@ int create_cachetable(CACHETABLE *result, int table_size __attribute__((unused)) ...@@ -74,6 +80,8 @@ int create_cachetable(CACHETABLE *result, int table_size __attribute__((unused))
t->cachefiles = 0; t->cachefiles = 0;
t->size_current = 0; t->size_current = 0;
t->size_limit = size_limit; t->size_limit = size_limit;
t->lsn_of_checkpoint = initial_lsn;
t->logger = logger;
*result = t; *result = t;
return 0; return 0;
} }
...@@ -257,13 +265,25 @@ static PAIR remove_from_hash_chain (PAIR remove_me, PAIR list) { ...@@ -257,13 +265,25 @@ static PAIR remove_from_hash_chain (PAIR remove_me, PAIR list) {
return list; return list;
} }
// Does pair P have to be given a new name when it is written?
// A node is renamed the first time it is written after a checkpoint, so all three of
// these must hold:
//   - it is dirty,
//   - it was modified at or after the current checkpoint (non-strict comparison),
//   - it was last written before the current checkpoint (strict comparison).
static BOOL need_to_rename_p (CACHETABLE t, PAIR p) {
    if (!p->dirty) return 0;
    if (p->modified_lsn.lsn < t->lsn_of_checkpoint.lsn) return 0;   // not modified in this checkpoint regime
    return p->written_lsn.lsn < t->lsn_of_checkpoint.lsn;           // last written in an earlier regime?
}
static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) { static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) {
lru_remove(t, remove_me); lru_remove(t, remove_me);
//printf("flush_callback(%lld,%p)\n", remove_me->key, remove_me->value); //printf("flush_callback(%lld,%p)\n", remove_me->key, remove_me->value);
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=%d, 0)\n", __FILE__, __LINE__, remove_me->key, remove_me->value, remove_me->dirty && write_me)); WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=%d, 0)\n", __FILE__, __LINE__, remove_me->key, remove_me->value, remove_me->dirty && write_me));
//printf("%s:%d TAG=%x p=%p\n", __FILE__, __LINE__, remove_me->tag, remove_me); //printf("%s:%d TAG=%x p=%p\n", __FILE__, __LINE__, remove_me->tag, remove_me);
//printf("%s:%d dirty=%d\n", __FILE__, __LINE__, remove_me->dirty); //printf("%s:%d dirty=%d\n", __FILE__, __LINE__, remove_me->dirty);
remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->size, remove_me->dirty && write_me, 0); remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->size, remove_me->dirty && write_me, 0,
t->lsn_of_checkpoint, need_to_rename_p(t, remove_me));
t->n_in_table--; t->n_in_table--;
// Remove it from the hash chain. // Remove it from the hash chain.
{ {
...@@ -274,14 +294,6 @@ static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) { ...@@ -274,14 +294,6 @@ static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) {
toku_free(remove_me); toku_free(remove_me);
} }
static void flush_and_keep (PAIR flush_me) {
if (flush_me->dirty) {
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=1, 0)\n", __FILE__, __LINE__, flush_me->key, flush_me->value));
flush_me->flush_callback(flush_me->cachefile, flush_me->key, flush_me->value, flush_me->size, 1, 1);
flush_me->dirty=0;
}
}
static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) { static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
int r = 0; int r = 0;
again: again:
...@@ -309,7 +321,8 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) { ...@@ -309,7 +321,8 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void *value, long size, static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void *value, long size,
cachetable_flush_func_t flush_callback, cachetable_flush_func_t flush_callback,
cachetable_fetch_func_t fetch_callback, cachetable_fetch_func_t fetch_callback,
void *extraargs, int dirty) { void *extraargs, int dirty,
LSN written_lsn) {
TAGMALLOC(PAIR, p); TAGMALLOC(PAIR, p);
p->pinned = 1; p->pinned = 1;
p->dirty = dirty; p->dirty = dirty;
...@@ -322,6 +335,8 @@ static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void * ...@@ -322,6 +335,8 @@ static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void *
p->flush_callback = flush_callback; p->flush_callback = flush_callback;
p->fetch_callback = fetch_callback; p->fetch_callback = fetch_callback;
p->extraargs = extraargs; p->extraargs = extraargs;
p->modified_lsn.lsn = 0;
p->written_lsn = written_lsn;
CACHETABLE ct = cachefile->cachetable; CACHETABLE ct = cachefile->cachetable;
lru_add_to_list(ct, p); lru_add_to_list(ct, p);
p->hash_chain = ct->table[h]; p->hash_chain = ct->table[h];
...@@ -352,7 +367,7 @@ int cachetable_put(CACHEFILE cachefile, CACHEKEY key, void*value, long size, ...@@ -352,7 +367,7 @@ int cachetable_put(CACHEFILE cachefile, CACHEKEY key, void*value, long size,
if (maybe_flush_some(cachefile->cachetable, size)) if (maybe_flush_some(cachefile->cachetable, size))
return -2; return -2;
// flushing could change the result from hashit() // flushing could change the result from hashit()
int r = cachetable_insert_at(cachefile, hashit(cachefile->cachetable, key), key, value, size, flush_callback, fetch_callback, extraargs, 1); int r = cachetable_insert_at(cachefile, hashit(cachefile->cachetable, key), key, value, size, flush_callback, fetch_callback, extraargs, 1, ZERO_LSN);
return r; return r;
} }
...@@ -377,10 +392,11 @@ int cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, void**value, long ...@@ -377,10 +392,11 @@ int cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, void**value, long
void *toku_value; void *toku_value;
long size = 1; // compat long size = 1; // compat
int r; int r;
LSN written_lsn;
WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key)); WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key));
if ((r=fetch_callback(cachefile, key, &toku_value, &size, extraargs))) if ((r=fetch_callback(cachefile, key, &toku_value, &size, extraargs, &written_lsn)))
return r; return r;
cachetable_insert_at(cachefile, hashit(t,key), key, toku_value, size, flush_callback, fetch_callback, extraargs, 0); cachetable_insert_at(cachefile, hashit(t,key), key, toku_value, size, flush_callback, fetch_callback, extraargs, 0, written_lsn);
*value = toku_value; *value = toku_value;
if (sizep) if (sizep)
*sizep = size; *sizep = size;
...@@ -428,6 +444,26 @@ int cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, int dirty, long size) { ...@@ -428,6 +444,26 @@ int cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, int dirty, long size) {
return 0; return 0;
} }
// Effect:   Re-key the object stored under (cachefile, oldkey) so that it is stored under
//           (cachefile, newkey): unlink it from oldkey's hash chain and push it on the
//           front of newkey's chain.
// Requires: The object is pinned in the table.
// Returns:  0 on success, -1 if no object with oldkey in this cachefile was found.
int cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey) {
    CACHETABLE ct = cachefile->cachetable;
    PAIR *link = &ct->table[hashit(ct, oldkey)]; // the pointer that points at the pair under inspection
    while (*link) {
        PAIR pair = *link;
        if (pair->key!=oldkey || pair->cachefile!=cachefile) {
            link = &pair->hash_chain;
            continue;
        }
        // Splice the pair out of the old chain...
        *link = pair->hash_chain;
        pair->key = newkey;
        // ...and onto the head of the chain for its new key.
        int new_bucket = hashit(ct, newkey);
        pair->hash_chain = ct->table[new_bucket];
        ct->table[new_bucket] = pair;
        return 0;
    }
    return -1;
}
int cachetable_flush (CACHETABLE t) { int cachetable_flush (CACHETABLE t) {
int i; int i;
for (i=0; i<t->table_size; i++) { for (i=0; i<t->table_size; i++) {
...@@ -559,6 +595,15 @@ int cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) { ...@@ -559,6 +595,15 @@ int cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) {
return 0; return 0;
} }
#if 0
// Write FLUSH_ME out through its flush callback if it is dirty, keeping it in the cache
// (write_me=1, keep_me=1), and then mark it clean.
// NOTE(review): this code is compiled out by the surrounding #if 0, and the call below
// passes six arguments; it would need updating if the flush callback type takes more
// (e.g. modified_lsn and rename_p) before being re-enabled -- confirm against cachetable.h.
static void flush_and_keep (PAIR flush_me) {
if (flush_me->dirty) {
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=1, 0)\n", __FILE__, __LINE__, flush_me->key, flush_me->value));
flush_me->flush_callback(flush_me->cachefile, flush_me->key, flush_me->value, flush_me->size, 1, 1);
flush_me->dirty=0;
}
}
static int cachetable_fsync_pairs (CACHETABLE t, PAIR p) { static int cachetable_fsync_pairs (CACHETABLE t, PAIR p) {
if (p) { if (p) {
int r = cachetable_fsync_pairs(t, p->hash_chain); int r = cachetable_fsync_pairs(t, p->hash_chain);
...@@ -577,6 +622,7 @@ int cachetable_fsync (CACHETABLE t) { ...@@ -577,6 +622,7 @@ int cachetable_fsync (CACHETABLE t) {
} }
return 0; return 0;
} }
#endif
#if 0 #if 0
int cachefile_pwrite (CACHEFILE cf, const void *buf, size_t count, off_t offset) { int cachefile_pwrite (CACHEFILE cf, const void *buf, size_t count, off_t offset) {
...@@ -643,3 +689,54 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr, ...@@ -643,3 +689,54 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr,
} }
return 1; return 1;
} }
// Single threaded checkpoint.
// In future: for multithreaded checkpoint we should not proceed if the previous checkpoint has not finished.
// Requires: Everything is unpinned.  (In the multithreaded version we have to wait for things to get unpinned and then
//  grab them (or else the unpinner has to do something.)
// Algorithm:  Write a checkpoint record to the log, noting the LSN of that record.
//  Note the LSN of the previous checkpoint (stored in lsn_of_checkpoint)
//  For every (unpinned) dirty node in which the LSN is newer than the prev checkpoint LSN:
//      flush the node (giving it a new nodeid, and fixing up the downpointer in the parent)
// Watch out since evicting the node modifies the hash table.
// Returns 0 on success, -1 if the scratch array cannot be allocated.
//?? This is a skeleton.  It does not log the checkpoint yet.
int cachetable_checkpoint (CACHETABLE ct) {
    //?? log_the_checkpoint();
    int n_saved=0;
    int n_in_table = ct->n_in_table;
    // Snapshot of what must be flushed.  We collect it first (read-only pass) because the
    // flush callbacks may modify the table while we would otherwise be walking it.
    struct save_something {
        CACHEFILE cf;
        DISKOFF   key;
        void     *value;
        long      size;
        LSN       modified_lsn;
        CACHETABLE_FLUSH_FUNC_T flush_callback;
    } *MALLOC_N(n_in_table, info);
    if (n_in_table>0 && info==0) return -1;
    {
        PAIR pair;
        for (pair=ct->head; pair; pair=pair->next) {
            assert(!pair->pinned);
            if (pair->dirty && pair->modified_lsn.lsn>ct->lsn_of_checkpoint.lsn) {
                // Record the pair.  (Previously only the counter was bumped, so the
                // flush loop below called through uninitialized function pointers.)
                assert(n_saved<n_in_table);
                info[n_saved].cf             = pair->cachefile;
                info[n_saved].key            = pair->key;
                info[n_saved].value          = pair->value;
                info[n_saved].size           = pair->size;
                info[n_saved].modified_lsn   = pair->modified_lsn;
                info[n_saved].flush_callback = pair->flush_callback;
                n_saved++;
            }
        }
    }
    {
        int i;
        for (i=0; i<n_saved; i++) {
            // write_me=1, keep_me=1, rename_p=0.
            // NOTE(review): the pairs' dirty bits are not cleared here -- confirm who is responsible for that.
            info[i].flush_callback(info[i].cf, info[i].key, info[i].value, info[i].size, 1, 1, info[i].modified_lsn, 0);
        }
    }
    toku_free(info);
    return 0;
}
// Return the logger of the cachetable that CF belongs to.
TOKULOGGER cachefile_logger (CACHEFILE cf) {
    CACHETABLE owner = cf->cachetable;
    return owner->logger;
}
// Return the FILENUM stored in CF.
FILENUM cachefile_filenum (CACHEFILE cf) {
    FILENUM result = cf->filenum;
    return result;
}
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define CACHETABLE_H #define CACHETABLE_H
#include <fcntl.h> #include <fcntl.h>
#include "brttypes.h"
/* Implement the cache table. */ /* Implement the cache table. */
...@@ -22,14 +23,16 @@ typedef struct cachefile *CACHEFILE; ...@@ -22,14 +23,16 @@ typedef struct cachefile *CACHEFILE;
* table_size is the initial size of the cache table hash table (in number of entries) * table_size is the initial size of the cache table hash table (in number of entries)
* size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes) * size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes)
*/ */
int create_cachetable(CACHETABLE */*result*/, int table_size, long size_limit); int create_cachetable(CACHETABLE */*result*/, long size_limit, LSN initial_lsn, TOKULOGGER);
int cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode); int cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode);
typedef void (*cachetable_flush_func_t)(CACHEFILE, CACHEKEY key, void*value, long size, int write_me, int keep_me); typedef void (cachetable_flush_func_t)(CACHEFILE, CACHEKEY key, void*value, long size, BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p);
typedef cachetable_flush_func_t *CACHETABLE_FLUSH_FUNC_T;
/* If we are asked to fetch something, get it by calling this back. */ /* If we are asked to fetch something, get it by calling this back. */
typedef int (*cachetable_fetch_func_t)(CACHEFILE, CACHEKEY key, void **value, long *sizep, void *extraargs); typedef int (cachetable_fetch_func_t)(CACHEFILE, CACHEKEY key, void **value, long *sizep, void *extraargs, LSN *written_lsn);
typedef cachetable_fetch_func_t *CACHETABLE_FETCH_FUNC_T;
/* Error if already present. On success, pin the value. */ /* Error if already present. On success, pin the value. */
int cachetable_put(CACHEFILE cf, CACHEKEY key, void* value, long size, int cachetable_put(CACHEFILE cf, CACHEKEY key, void* value, long size,
...@@ -51,6 +54,9 @@ int cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing somet ...@@ -51,6 +54,9 @@ int cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing somet
int cachetable_assert_all_unpinned (CACHETABLE); int cachetable_assert_all_unpinned (CACHETABLE);
int cachefile_count_pinned (CACHEFILE, int /*printthem*/ ); int cachefile_count_pinned (CACHEFILE, int /*printthem*/ );
/* Rename whatever is at oldkey to be newkey. Requires that the object be pinned. */
int cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey);
//int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */ //int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */
int cachetable_close (CACHETABLE*); /* Flushes everything to disk, and destroys the cachetable. */ int cachetable_close (CACHETABLE*); /* Flushes everything to disk, and destroys the cachetable. */
...@@ -72,4 +78,7 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr, ...@@ -72,4 +78,7 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr,
void cachefile_verify (CACHEFILE cf); // Verify the whole cachetable that the CF is in. Slow. void cachefile_verify (CACHEFILE cf); // Verify the whole cachetable that the CF is in. Slow.
void cachetable_verify (CACHETABLE t); // Slow... void cachetable_verify (CACHETABLE t); // Slow...
TOKULOGGER cachefile_logger (CACHEFILE);
FILENUM cachefile_filenum (CACHEFILE);
#endif #endif
CFLAGS = -O2 -Wall -W -Werror -g
LDFLAGS = -lz -lssl -g
adler32:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <zlib.h>
#include <openssl/md2.h>
#include <openssl/md4.h>
#include <openssl/md5.h>
const unsigned int prime = 2000000011;

// Karp-Rabin style fingerprint of DATAC[0..N): treats the buffer as N/4 native-endian
// unsigned ints w_0, w_1, ... and computes result = result*prime + w_i with ordinary
// unsigned wraparound.
// Requires: N is a multiple of 4.
// The words are loaded with memcpy rather than through a cast to unsigned int*, so the
// buffer need not be int-aligned and no aliasing rule is violated.
unsigned int karprabin (unsigned char *datac, int N) {
    assert(N%4==0);
    int n_words = N/4;
    unsigned int result=0;
    int i;
    for (i=0; i<n_words; i++) {
        unsigned int word;
        memcpy(&word, datac + (size_t)i*sizeof word, sizeof word);
        result=(result*prime)+word;
    }
    return result;
}
// According to
//  P. L'Ecuyer, "Tables of Linear Congruential Generators of
//  Different Sizes and Good Lattice Structure", Mathematics of
//  Computation 68:225, 249--260 (1999).
// m=2^{32}-5  a=1588635695 is good.
const unsigned int mkr = 4294967291U;
const unsigned int akr = 1588635695U;

// Like karprabin(), but reduces modulo the prime mkr at every step:
//   result = (result*akr + w_i) % mkr
// over the N/4 native-endian unsigned ints in DATAC.  The running value is kept in an
// unsigned long long so the product cannot overflow before the reduction.
// Requires: N is a multiple of 4.
// But this is slower (than karprabin).
// Words are loaded with memcpy so that unaligned buffers are fine and no aliasing rule is violated.
unsigned int karprabinP (unsigned char *datac, int N) {
    assert(N%4==0);
    int n_words = N/4;
    unsigned long long result=0;
    int i;
    for (i=0; i<n_words; i++) {
        unsigned int word;
        memcpy(&word, datac + (size_t)i*sizeof word, sizeof word);
        result=((result*akr)+word)%mkr;
    }
    return result;
}
// Elapsed time from *start to *end, in seconds.
float tdiff (struct timeval *start, struct timeval *end) {
    double whole_seconds  = end->tv_sec - start->tv_sec;
    double fraction       = 1e-6*(end->tv_usec - start->tv_usec);
    return whole_seconds + fraction;
}
// Print "NAME=<hex digest> time=...s ...ns/b" for one timed digest computation over n_bytes bytes.
static void print_digest_timing (const char *name, const unsigned char *digest, int digest_len, float tm, int n_bytes) {
    int j;
    printf("%s=", name);
    for (j=0; j<digest_len; j++) {
        printf("%02x", digest[j]);
    }
    printf(" time=%9.6fs %9.6fns/b\n", tm, 1e9*tm/n_bytes);
}

// Benchmark several checksum/hash functions over the same 2MiB buffer of random bytes.
// Each function is run three times; every run prints the result and the elapsed time,
// both in seconds and in nanoseconds per byte.
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    struct timeval start, end;
    const int N=2<<20;
    unsigned char *data=malloc(N);
    int i;
    assert(data);
    for (i=0; i<N; i++) data[i]=random();

    // adler32 (the running value is carried from one pass into the next)
    {
        uLong a32 = adler32(0L, Z_NULL, 0);
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            a32 = adler32(a32, data, N);
            gettimeofday(&end,   0);
            float tm = tdiff(&start, &end);
            printf("adler32=%lu, time=%9.6fs %9.6fns/b\n", a32, tm, 1e9*tm/N);
        }
    }

    // crc32 (the running value is carried from one pass into the next)
    {
        uLong c32 = crc32(0L, Z_NULL, 0);
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            c32 = crc32(c32, data, N);
            gettimeofday(&end,   0);
            float tm = tdiff(&start, &end);
            printf("crc32=%lu, time=%9.6fs %9.6fns/b\n", c32, tm, 1e9*tm/N);
        }
    }

    // MD2
    {
        unsigned char buf[MD2_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD2(data, N, buf);
            gettimeofday(&end,   0);
            print_digest_timing("md2", buf, MD2_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // MD4
    {
        unsigned char buf[MD4_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD4(data, N, buf);
            gettimeofday(&end,   0);
            print_digest_timing("md4", buf, MD4_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // MD5
    {
        unsigned char buf[MD5_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD5(data, N, buf);
            gettimeofday(&end,   0);
            print_digest_timing("md5", buf, MD5_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // karp rabin
    {
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            unsigned int kr = karprabin(data, N);
            gettimeofday(&end,   0);
            float tm = tdiff(&start, &end);
            // "%u", not "%ud": the latter printed a stray 'd' after the number.
            printf("kr=%u time=%9.6fs %9.6fns/b\n", kr, tm, 1e9*tm/N);
        }
    }
    free(data);
    return 0;
}
#ifndef TOKU_CRC_H
#define TOKU_CRC_H
#include <zlib.h>
// Wrapper around zlib's crc32() that leaves the running CRC untouched for empty input:
// when len==0 it returns oldcrc32 without calling crc32() at all (the original note here
// says zlib's crc32 returns 0 in that situation instead of oldcrc32).
static inline u_int32_t toku_crc32 (u_int32_t oldcrc32, const void *data, u_int32_t len) {
    return (len==0) ? oldcrc32 : crc32(oldcrc32, data, len);
}
static const u_int32_t toku_null_crc = 0;
// Don't use crc32, use toku_crc32 to avoid that bug.
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)) __attribute__((deprecated));
#endif
#include <arpa/inet.h>
#include <assert.h>
#include "brt-internal.h"
// Extend the running fingerprint CRC with one key/value pair.
// The bytes fed to the CRC are, in order: keylen (4 bytes, network byte order), the key,
// vallen (4 bytes, network byte order), the value.
static inline u_int32_t toku_calc_more_crc32_kvpair (u_int32_t crc, const void *key, int keylen, const void *val, int vallen) {
    int keylen_net = htonl(keylen);
    int vallen_net = htonl(vallen);
    crc = toku_crc32(crc, (void*)&keylen_net, 4);
    crc = toku_crc32(crc, key, keylen);
    crc = toku_crc32(crc, (void*)&vallen_net, 4);
    return toku_crc32(crc, val, vallen);
}
// Fingerprint of a single key/value pair, starting from the null CRC.
u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) {
    u_int32_t seed = toku_null_crc;
    return toku_calc_more_crc32_kvpair(seed, key, keylen, val, vallen);
}
// Fingerprint of a command: the command type (truncated to one byte) is fed to the CRC
// first, followed by the key/value pair as in toku_calc_more_crc32_kvpair().
u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen) {
    unsigned char type_byte = type;
    u_int32_t crc_of_type = toku_crc32(toku_null_crc, &type_byte, 1);
    return toku_calc_more_crc32_kvpair(crc_of_type, key, keylen, val, vallen);
}
// Fingerprint of a BRT_CMD.  All currently handled command types (BRT_NONE, BRT_INSERT,
// BRT_DELETE) carry a key and a value in cmd->u.id and are fingerprinted with
// toku_calccrc32_cmd().  Any other type is a programming error.
u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd) {
    switch (cmd->type) {
    case BRT_NONE:
    case BRT_INSERT:
    case BRT_DELETE:
        return toku_calccrc32_cmd (cmd->type, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size);
    }
    assert(0); /* Should not have come here. */
    return 0;  /* Not reached; keeps the function well defined when assert() is compiled out (NDEBUG). */
}
#include "brttypes.h"
#include "brt-internal.h" #include "brt-internal.h"
#include "memory.h"
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h> #include <unistd.h>
#include <assert.h> #include <assert.h>
...@@ -32,7 +30,7 @@ int write_int (int fd, unsigned int v) { ...@@ -32,7 +30,7 @@ int write_int (int fd, unsigned int v) {
return 0; return 0;
} }
int read_diskoff (int fd, diskoff *result) { int read_diskoff (int fd, DISKOFF *result) {
unsigned int i0,i1; unsigned int i0,i1;
int r; int r;
r = read_uint(fd, &i0); if(r!=0) return r; r = read_uint(fd, &i0); if(r!=0) return r;
...@@ -41,7 +39,7 @@ int read_diskoff (int fd, diskoff *result) { ...@@ -41,7 +39,7 @@ int read_diskoff (int fd, diskoff *result) {
return 0; return 0;
} }
int write_diskoff (int fd, diskoff v) { int write_diskoff (int fd, DISKOFF v) {
int r; int r;
r = write_int(fd, (unsigned int)(v>>32)); if (r!=0) return r; r = write_int(fd, (unsigned int)(v>>32)); if (r!=0) return r;
r = write_int(fd, (unsigned int)(v&0xffffffff)); if (r!=0) return r; r = write_int(fd, (unsigned int)(v&0xffffffff)); if (r!=0) return r;
...@@ -97,14 +95,14 @@ int read_brt_header (int fd, struct brt_header *header) { ...@@ -97,14 +95,14 @@ int read_brt_header (int fd, struct brt_header *header) {
return 0; return 0;
} }
int read_brt_h_unused_memory (int fd, diskoff *unused_memory) { int read_brt_h_unused_memory (int fd, DISKOFF *unused_memory) {
off_t r = lseek(fd, 12, SEEK_SET); off_t r = lseek(fd, 12, SEEK_SET);
assert(r==12); assert(r==12);
r = read_diskoff(fd, unused_memory); r = read_diskoff(fd, unused_memory);
return r; return r;
} }
int write_brt_h_unused_memory (int fd, diskoff unused_memory) { int write_brt_h_unused_memory (int fd, DISKOFF unused_memory) {
off_t r = lseek(fd, 12, SEEK_SET); off_t r = lseek(fd, 12, SEEK_SET);
assert(r==12); assert(r==12);
r = write_diskoff(fd, unused_memory); r = write_diskoff(fd, unused_memory);
......
// This list is intended to be embedded in other data structures.
struct list { struct list {
struct list *next, *prev; struct list *next, *prev;
}; };
......
#if defined(__x86_64) || defined(__i386)
/* Execute the x86 `mfence` instruction.  The "memory" clobber also tells the compiler
 * that memory may have changed, so it does not keep memory values cached in registers
 * across this call. */
static inline void mfence (void) {
__asm__ volatile ("mfence":::"memory");
}
/* Read (load) fence.  x86 has no `rfence` instruction -- the load fence is spelled
 * `lfence` -- so emit that; the previous "rfence" mnemonic could not be assembled.
 * The "memory" clobber keeps the compiler from caching memory values across the fence. */
static inline void rfence (void) {
    __asm__ volatile ("lfence":::"memory");
}
/* Execute the x86 `sfence` instruction (spin_unlock() uses it to order earlier stores
 * before the store that releases the lock).  The "memory" clobber keeps the compiler
 * from moving memory accesses across it. */
static inline void sfence (void) {
__asm__ volatile ("sfence":::"memory");
}
/* According to the Intel Architecture Software Developer's
 * Manual, Volume 3: System Programming Guide
 * (http://www.intel.com/design/pro/manuals/243192.htm), page 7-6,
 * "For the P6 family processors, locked operations serialize all
 * outstanding load and store operations (that is, wait for them to
 * complete)."
 *
 * Bradley found that the fence instructions are faster on an opteron:
 *   mfence takes 8ns on a 1.5GHZ AMD64 (maybe this is an 801)
 *   sfence takes 5ns
 *   lfence takes 3ns
 *   xchgl  takes 14ns
 */
/* Atomically store X into *PTR and return the value *PTR held before.
 * The return type is spelled out (it used to be an implicit int, which is not valid C99),
 * and the memory operand is declared read-write ("+m") because xchg both reads and
 * writes it.  `volatile` keeps the exchange from being dropped when the result is unused. */
static inline int lock_xchgl(volatile int *ptr, int x)
{
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (*ptr) : : "memory");
    return x;
}
/* NOTE: there is deliberately no #endif here.  The architecture #if above must stay open
 * until the `#else / #error / #endif` at the end of this file; closing it here left that
 * #else without a matching #if. */

/* A spin lock is a one-element array of volatile int (0 = free, 1 = held), so that it is
 * passed to the spin_* functions by reference. */
typedef volatile int SPINLOCK[1];
/* Put the lock into the free state (0), then fence. */
static inline void spin_init (SPINLOCK v) {
    *v = 0;
    mfence();
}
/* Acquire the lock: try to swap in 1; if the old value was 0 we own the lock.
 * Otherwise wait, using only reads, until the lock looks free and then try again.
 * (It would be better to use MCS locks, but spinning on reads reduces bus traffic.) */
static inline void spin_lock (SPINLOCK v) {
    for (;;) {
        if (lock_xchgl((int*)v, 1)==0) return;
        while (v[0]) { /* spin */ }
    }
}
/* Release the lock.  The store fence comes first because we want all previous stores to
 * take place before the store that marks the lock free. */
static inline void spin_unlock (SPINLOCK v) {
    sfence();
    *v = 0;
}
#else
#error Need to define architectur-specific stuff for other machines.
#endif
CFLAGS=-O2 -Wall -W -Werror
LDFLAGS=-lpthread
trylock:
/* Time {m,l,s}fence vs.xchgl for a memory barrier. */
/* Timing numbers:
* Intel T2500 2GHZ
do1 9.0ns/loop
mfence: 29.0ns/loop (marginal cost= 20.0ns)
sfence: 17.3ns/loop (marginal cost= 8.3ns)
lfence: 23.6ns/loop (marginal cost= 14.6ns)
xchgl: 35.8ns/loop (marginal cost= 26.8ns)
* AMD Athlon 64 X2 Dual Core Processor 4200+
Timings are more crazy
do1 20.6ns/loop
mfence: 12.9ns/loop (marginal cost= -7.6ns)
sfence: 8.4ns/loop (marginal cost= -12.1ns)
lfence: 20.2ns/loop (marginal cost= -0.3ns)
xchgl: 16.6ns/loop (marginal cost= -3.9ns)
do1 13.0ns/loop
mfence: 25.6ns/loop (marginal cost= 12.6ns)
sfence: 21.0ns/loop (marginal cost= 8.1ns)
lfence: 12.9ns/loop (marginal cost= -0.1ns)
xchgl: 29.3ns/loop (marginal cost= 16.3ns)
*/
#include <sys/time.h>
#include <stdio.h>
enum { COUNT = 100000000 };
/* Use an xchgl against a local variable as a memory barrier.
 *
 * According to the Intel Architecture Software Developer's
 * Manual, Volume 3: System Programming Guide
 * (http://www.intel.com/design/pro/manuals/243192.htm), page
 * 7-6, "For the P6 family processors, locked operations
 * serialize all outstanding load and store operations (that
 * is, wait for them to complete)."
 * Since xchg is locked by default, it is one way to do membar.
 *
 * y is initialized (it used to be exchanged while indeterminate) and is declared as a
 * read-write memory operand because xchg writes it.  __asm__ is used instead of asm so
 * the file also compiles in strict ISO C modes.
 */
static inline void xchgl (void) {
    int x=0, y=0;
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (y) : : "memory");
}
/* Execute the x86 `mfence` instruction; the "memory" clobber keeps the compiler from
 * moving memory accesses across it.  One of the barriers timed by this benchmark. */
static inline void mfence (void) {
asm volatile ("mfence":::"memory");
}
/* Execute the x86 `lfence` instruction; the "memory" clobber keeps the compiler from
 * moving memory accesses across it.  One of the barriers timed by this benchmark. */
static inline void lfence (void) {
asm volatile ("lfence":::"memory");
}
/* Execute the x86 `sfence` instruction; the "memory" clobber keeps the compiler from
 * moving memory accesses across it.  One of the barriers timed by this benchmark. */
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
/* Average cost of one loop iteration, in nanoseconds: the elapsed time from *start to
 * *end (in seconds) divided by COUNT iterations and scaled by 1e9.
 * The microsecond fields are subtracted; they used to be added, which skewed every result. */
double tdiff (struct timeval *start, struct timeval *end) {
    double elapsed_seconds = (end->tv_sec - start->tv_sec) + 1e-6*(end->tv_usec - start->tv_usec);
    return (elapsed_seconds/COUNT)*1e9;
}
double nop_cost;
/* Baseline measurement: run the COUNT-iteration loop of four volatile increments with
 * no barrier in it, remember the per-iteration cost in nop_cost, and print it. */
void do1 (volatile int *x) {
    struct timeval t_begin, t_end;
    int iter = 0;
    gettimeofday(&t_begin, 0);
    while (iter < COUNT) {
        x[0]++;
        x[1]++;
        x[2]++;
        x[3]++;
        iter++;
    }
    gettimeofday(&t_end, 0);
    nop_cost = tdiff(&t_begin, &t_end);
    printf("do1 %6.1fns/loop\n", nop_cost);
}
/* doit(name) defines a function do<name>(volatile int *x) that runs the same
 * COUNT-iteration loop as do1(), but with a call to name() between the second and third
 * increments.  It prints the per-iteration cost and the difference from nop_cost (the
 * baseline that do1() stored), labelled with the name of the barrier. */
#define doit(name) void do ##name (volatile int *x) { \
int i; \
struct timeval start, end; \
gettimeofday(&start, 0); \
for (i=0; i<COUNT; i++) { \
x[0]++; \
x[1]++; \
name(); \
x[2]++; \
x[3]++; \
} \
gettimeofday(&end, 0); \
double this_cost = tdiff(&start, &end); \
printf("%6s:%6.1fns/loop (marginal cost=%6.1fns)\n", #name, this_cost, this_cost-nop_cost); \
}
/* Instantiate domfence(), dolfence(), dosfence() and doxchgl(). */
doit(mfence)
doit(lfence)
doit(sfence)
doit(xchgl)
/* Run the baseline and each barrier benchmark twice, printing the timings. */
int main (int argc __attribute__((__unused__)),
          char *argv[] __attribute__((__unused__))) {
    int x[4] = {0}; /* The loops increment these, so they must start out initialized. */
    int i;
    for (i=0; i<2; i++) {
        do1(x);
        domfence(x);
        dosfence(x);
        dolfence(x);
        doxchgl(x);
    }
    return 0;
}
/* How expensive is
* - Obtaining a read-only lock for the first obtainer.
* - Obtaining it for the second one?
* - The third one? */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
/* Elapsed time from *start to *end, in microseconds. */
float tdiff (struct timeval *start, struct timeval *end) {
    double from_seconds      = 1e6*(end->tv_sec-start->tv_sec);
    double from_microseconds = end->tv_usec - start->tv_usec;
    return from_seconds + from_microseconds;
}
/* My own rwlock implementation. */
struct brwl {
// Guards updates to state: brwl_rlock() swaps in 1 with xchg() until it sees the old value 0, and stores 0 again when done.
int mutex;
int state; // 0 for unlocked, -1 for a writer, otherwise many readers
};
/* Atomically store X into *PTR and return the value *PTR held before.
 * The asm is volatile so the exchange is not discarded when the caller ignores the
 * result, and the memory operand is read-write ("+m") because xchg both reads and writes it. */
static inline int xchg(volatile int *ptr, int x)
{
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (*ptr) : : "memory");
    return x;
}
/* Execute the x86 `sfence` instruction; the "memory" clobber keeps the compiler from
 * moving memory accesses across it.  brwl_rlock() uses it before releasing the mutex. */
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
/* Take a read lock on L: grab the internal mutex, count one more reader in l->state,
 * and release the mutex.
 * NOTE(review): state is incremented unconditionally; nothing here checks for a writer
 * (state == -1).  That is enough for this benchmark, which never takes a write lock. */
static inline void brwl_rlock (struct brwl *l) {
while (xchg(&l->mutex, 1)) ; // Spin until the previous value was 0, i.e. we now hold the mutex.
l->state++;
#if 1
// Release with a store fence followed by a plain store...
sfence();
l->mutex=0;
#else
// ...or (disabled alternative) with another exchange.
xchg(&l->mutex, 0);
#endif
}
enum {K=1000};
pthread_rwlock_t rwlocks[K];
struct brwl blocks[K];
/* Unlock and destroy every lock in rwlocks[] so that the next round may legally
 * initialize them again. */
static void release_rwlocks (void) {
    int i;
    for (i=0; i<K; i++) {
        int r = pthread_rwlock_unlock(&rwlocks[i]);
        assert(r==0);
        r = pthread_rwlock_destroy(&rwlocks[i]);
        assert(r==0);
    }
}

/* Time how long it takes to obtain K uncontended read locks, three rounds each, using
 *   - pthread_rwlock_tryrdlock(),
 *   - pthread_rwlock_rdlock(),
 *   - the home-grown brwl_rlock().
 * Each round prints the total time, the time per lock, and the rate.
 * After each pthread round the locks are unlocked and destroyed: re-initializing a
 * read-write lock that is still initialized (and held) has undefined results. */
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    int j;
    int i;
    int r;
    struct timeval start, end;
    for (j=0; j<3; j++) {
        for (i=0; i<K; i++) {
            r=pthread_rwlock_init(&rwlocks[i], NULL);
            assert(r==0);
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            r = pthread_rwlock_tryrdlock(&rwlocks[i]);
            assert(r==0);
        }
        gettimeofday(&end, 0);
        printf("pthread_rwlock_tryrdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
        release_rwlocks();
    }
    for (j=0; j<3; j++) {
        for (i=0; i<K; i++) {
            r=pthread_rwlock_init(&rwlocks[i], NULL);
            assert(r==0);
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            r = pthread_rwlock_rdlock(&rwlocks[i]);
            assert(r==0);
        }
        gettimeofday(&end, 0);
        printf("pthread_rwlock_rdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
        release_rwlocks();
    }
    for (j=0; j<3; j++) {
        for (i=0; i<K; i++) {
            blocks[i].state=0;
            blocks[i].mutex=0;
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            brwl_rlock(&blocks[i]);
        }
        gettimeofday(&end, 0);
        printf("brwl_rlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
    }
    return 0;
}
#define _MULTI_THREADED
#include <pthread.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
/* Microseconds elapsed between the two gettimeofday() samples *start and *end. */
float tdiff (struct timeval *start, struct timeval *end) {
    /* Second difference converted to microseconds; 1e6 makes this a double. */
    double whole_part_us = 1e6*(end->tv_sec-start->tv_sec);
    /* Add the (possibly negative) difference of the microsecond fields. */
    return whole_part_us + (end->tv_usec - start->tv_usec);
}
/* Return-code check used after every pthread call in this program.
   A zero rc is success and the function simply returns.  For any other rc it
   prints string (the label of the call that failed) together with rc and
   terminates the process with EXIT_FAILURE.
*/
static void compResults(char *string, int rc) {
    if (rc == 0)
	return;
    printf("Error on : %s, rc=%d", string, rc);
    exit(EXIT_FAILURE);
}
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
/* Thread body: try to take the global rwlock for reading without blocking.
 *
 * Each pthread_rwlock_tryrdlock() attempt is timed and reported.  While the
 * lock is busy (EBUSY) the thread sleeps one second and retries; after 10
 * retries it gives up and exits the whole process.  Any other nonzero return
 * code is fatal via compResults().  Once the lock is held the thread keeps
 * it for two seconds, releases it, and prints how long the unlock took.
 *
 * arg is not used.  Always returns NULL. */
void *rdlockThread(void *arg __attribute__((__unused__)))
{
    int rc;
    int count=0;
    struct timeval start, end;
    printf("Entered thread, getting read lock with mp wait\n");
    for (;;) {
	gettimeofday(&start, 0);
	rc = pthread_rwlock_tryrdlock(&rwlock);
	gettimeofday(&end, 0);
	printf("pthread_rwlock_tryrdlock took %9.3fus\n", tdiff(&start,&end));
	if (rc != EBUSY) break;
	if (count >= 10) {
	    printf("Retried too many times, failure!\n");
	    exit(EXIT_FAILURE);
	}
	++count;
	printf("Could not get lock, do other work, then RETRY...\n");
	sleep(1);
    }
    compResults("pthread_rwlock_tryrdlock() 1\n", rc);
    sleep(2);
    printf("unlock the read lock\n");
    gettimeofday(&start, 0);
    rc = pthread_rwlock_unlock(&rwlock);
    gettimeofday(&end, 0);
    compResults("pthread_rwlock_unlock()\n", rc);
    /* tv_sec and tv_usec are not int; cast to long to match the conversion,
     * and zero-pad the microseconds so "sec.usec" reads as a decimal fraction. */
    printf("%ld.%06ld to %ld.%06ld is %9.2f\n",
	   (long)start.tv_sec, (long)start.tv_usec,
	   (long)end.tv_sec, (long)end.tv_usec,
	   tdiff(&start, &end));
    printf("Secondary thread complete\n");
    return NULL;
}
/* Driver for the tryrdlock demonstration.
 *
 *  1. Time two back-to-back gettimeofday() calls (the "nop" cost).
 *  2. Time 1000 uncontended write-lock/unlock pairs on the global rwlock.
 *  3. Take the write lock, start rdlockThread, hold the lock for five seconds
 *     so the thread sees EBUSY, then release it and join the thread.
 *  4. Destroy the rwlock.
 *
 * Every pthread return code is checked with compResults(), which exits on failure. */
int main(int argc __attribute__((__unused__)), char **argv)
{
    int rc=0;
    pthread_t thread;
    struct timeval start, end;
    printf("Enter Testcase - %s\n", argv[0]);
    gettimeofday(&start, 0);
    gettimeofday(&end, 0);
    printf("nop Took %9.2f\n", tdiff(&start, &end));
    {
	int N=1000;
	int i;
	/* Remember the first failure from the loop: rc is overwritten on every
	 * call, so checking only the final rc would miss earlier errors. */
	int first_err=0;
	printf("Main, get and release the write lock %d times\n", N);
	gettimeofday(&start, 0);
	for (i=0; i<N; i++) {
	    rc = pthread_rwlock_wrlock(&rwlock);
	    if (rc!=0 && first_err==0) first_err = rc;
	    rc = pthread_rwlock_unlock(&rwlock);
	    if (rc!=0 && first_err==0) first_err = rc;
	}
	gettimeofday(&end, 0);
	compResults("pthread_rwlock_wrlock()\n", first_err);
	/* tdiff is in microseconds; *1000/N gives nanoseconds per lock+unlock pair. */
	printf("Took %9.2fns/op\n", 1000*tdiff(&start, &end)/N);
    }
    printf("Main, get the write lock\n");
    gettimeofday(&start, 0);
    rc = pthread_rwlock_wrlock(&rwlock);
    gettimeofday(&end, 0);
    compResults("pthread_rwlock_wrlock()\n", rc);
    printf("Took %9.2f\n", tdiff(&start, &end));
    printf("Main, create the try read lock thread\n");
    rc = pthread_create(&thread, NULL, rdlockThread, NULL);
    compResults("pthread_create\n", rc);
    printf("Main, wait a bit holding the write lock\n");
    sleep(5);
    printf("Main, Now unlock the write lock\n");
    gettimeofday(&start, 0);
    rc = pthread_rwlock_unlock(&rwlock);
    gettimeofday(&end, 0);
    compResults("pthread_rwlock_unlock()\n", rc);
    printf("Took %9.2f\n", tdiff(&start, &end));
    printf("Main, wait for the thread to end\n");
    rc = pthread_join(thread, NULL);
    compResults("pthread_join\n", rc);
    rc = pthread_rwlock_destroy(&rwlock);
    compResults("pthread_rwlock_destroy()\n", rc);
    printf("Main completed\n");
    return 0;
}
...@@ -11,13 +11,20 @@ struct tokulogger { ...@@ -11,13 +11,20 @@ struct tokulogger {
int fd; int fd;
int n_in_file; int n_in_file;
long long next_log_file_number; long long next_log_file_number;
LSN lsn;
char buf[LOGGER_BUF_SIZE]; char buf[LOGGER_BUF_SIZE];
int n_in_buf; int n_in_buf;
}; };
int tokulogger_find_next_unused_log_file(const char *directory, long long *result); int tokulogger_find_next_unused_log_file(const char *directory, long long *result);
enum { LT_INSERT_WITH_NO_OVERWRITE = 'I', LT_DELETE = 'D', LT_COMMIT = 'C' }; enum {
LT_COMMIT = 'C',
LT_DELETE = 'D',
LT_INSERT_WITH_NO_OVERWRITE = 'I',
LT_CHECKPOINT = 'P',
LT_BLOCK_RENAME = 'R'
};
struct tokutxn { struct tokutxn {
u_int64_t txnid64; u_int64_t txnid64;
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
#include "log-internal.h" #include "log-internal.h"
#include "wbuf.h" #include "wbuf.h"
#include "memory.h" #include "memory.h"
#include "../src/ydb-internal.h"
#include <dirent.h> #include <dirent.h>
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
...@@ -11,6 +10,7 @@ ...@@ -11,6 +10,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/uio.h> #include <sys/uio.h>
#include "../src/ydb-internal.h"
int tokulogger_find_next_unused_log_file(const char *directory, long long *result) { int tokulogger_find_next_unused_log_file(const char *directory, long long *result) {
DIR *d=opendir(directory); DIR *d=opendir(directory);
...@@ -44,6 +44,9 @@ int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *result ...@@ -44,6 +44,9 @@ int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *result
result->fd = -1; result->fd = -1;
result->next_log_file_number = nexti; result->next_log_file_number = nexti;
result->n_in_buf = 0; result->n_in_buf = 0;
result->lsn.lsn = 0; // WRONG!!! This should actually be calculated by looking at the log file.
*resultp=result; *resultp=result;
return tokulogger_log_bytes(result, 0, ""); return tokulogger_log_bytes(result, 0, "");
} }
...@@ -85,26 +88,6 @@ int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes) { ...@@ -85,26 +88,6 @@ int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes) {
return 0; return 0;
} }
// Log an insertion of a key-value pair into a particular node of the tree.
int tokulogger_log_brt_insert_with_no_overwrite (TOKULOGGER logger,
TXNID txnid,
diskoff diskoff,
unsigned char *key,
int keylen,
unsigned char *val,
int vallen) {
int buflen=30+keylen+vallen;
unsigned char buf[buflen];
struct wbuf wbuf;
wbuf_init(&wbuf, buf, buflen) ;
wbuf_char(&wbuf, LT_INSERT_WITH_NO_OVERWRITE);
wbuf_txnid(&wbuf, txnid);
wbuf_diskoff(&wbuf, diskoff);
wbuf_bytes(&wbuf, key, keylen);
wbuf_bytes(&wbuf, val, vallen);
return tokulogger_log_bytes(logger, wbuf.ndone, wbuf.buf);
}
int tokulogger_log_close(TOKULOGGER *loggerp) { int tokulogger_log_close(TOKULOGGER *loggerp) {
TOKULOGGER logger = *loggerp; TOKULOGGER logger = *loggerp;
int r = 0; int r = 0;
...@@ -133,57 +116,116 @@ n ...@@ -133,57 +116,116 @@ n
} }
#endif #endif
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, diskoff diskoff, int is_add, const struct kv_pair *pair) { int tokulogger_fsync (TOKULOGGER logger) {
//return 0;/// NO TXN
//fprintf(stderr, "%s:%d syncing log\n", __FILE__, __LINE__);
if (logger->n_in_buf>0) {
int r = write(logger->fd, logger->buf, logger->n_in_buf);
if (r==-1) return errno;
logger->n_in_buf=0;
}
{
int r = fsync(logger->fd);
if (r!=0) return errno;
}
return 0;
}
/* Terminate the log record that has been built in wbuf and hand it to the logger.
 * Appends toku_crc32() computed over the wbuf->ndone bytes written so far,
 * then appends the value 4+wbuf->ndone, and finally passes the buffer to
 * tokulogger_log_bytes().  Callers in this file reserve 8 bytes ("crc and len")
 * in their buffers for these two fields.
 * Returns whatever tokulogger_log_bytes() returns. */
static int tokulogger_finish (TOKULOGGER logger, struct wbuf *wbuf) {
wbuf_int(wbuf, toku_crc32(0, wbuf->buf, wbuf->ndone));
// NOTE(review): presumably wbuf_int() has advanced ndone past the crc here, which would make 4+ndone the total record length including this length field -- confirm against wbuf.h.
wbuf_int(wbuf, 4+wbuf->ndone);
return tokulogger_log_bytes(logger, wbuf->ndone, wbuf->buf);
}
// Write an LT_INSERT_WITH_NO_OVERWRITE record describing the insertion of a
// key-value pair into a particular node of the tree.
// Field order in the record: command byte, lsn, txnid, fileid, diskoff, key, val,
// followed by the trailer that tokulogger_finish() appends.
// The logger's lsn is consumed (post-incremented) by this call.
// Returns the result of tokulogger_finish().
int tokulogger_log_brt_insert_with_no_overwrite (TOKULOGGER logger,
						 TXNID txnid,
						 FILENUM fileid,
						 DISKOFF diskoff,
						 unsigned char *key,
						 int keylen,
						 unsigned char *val,
						 int vallen) {
    const int kv_space      = keylen+vallen+4+4; // key and value (each is budgeted 4 extra bytes)
    const int header_space  = 1   // command
			      +8  // lsn
			      +8  // txnid
			      +4  // fileid
			      +8; // diskoff
    const int trailer_space = 8;  // crc and len
    const int record_space  = kv_space + header_space + trailer_space;
    unsigned char record[record_space];
    struct wbuf w;

    wbuf_init(&w, record, record_space);
    wbuf_char(&w, LT_INSERT_WITH_NO_OVERWRITE);
    wbuf_lsn(&w, logger->lsn);
    logger->lsn.lsn++;
    wbuf_txnid(&w, txnid);
    wbuf_filenum(&w, fileid);
    wbuf_diskoff(&w, diskoff);
    wbuf_bytes(&w, key, keylen);
    wbuf_bytes(&w, val, vallen);
    return tokulogger_finish(logger, &w);
}
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, DISKOFF diskoff, int is_add, const struct kv_pair *pair) {
if (txn==0) return 0; if (txn==0) return 0;
assert(db);
int keylen = pair->keylen; int keylen = pair->keylen;
int vallen = pair->vallen; int vallen = pair->vallen;
int buflen=(keylen+vallen+4+4 // the key and value const int buflen=(keylen+vallen+4+4 // the key and value
+1 // log command +1 // log command
+8 // lsn
+8 // txnid +8 // txnid
+8 // fileid +8 // fileid
+8 // diskoff +8 // diskoff
+8 // crc & len
); );
unsigned char buf[buflen]; unsigned char buf[buflen];
struct wbuf wbuf; struct wbuf wbuf;
wbuf_init(&wbuf, buf, buflen) ; wbuf_init(&wbuf, buf, buflen) ;
wbuf_char(&wbuf, is_add ? LT_INSERT_WITH_NO_OVERWRITE : LT_DELETE); wbuf_char(&wbuf, is_add ? LT_INSERT_WITH_NO_OVERWRITE : LT_DELETE);
wbuf_lsn (&wbuf, txn->logger->lsn);
txn->logger->lsn.lsn++;
wbuf_txnid(&wbuf, txn->txnid64); wbuf_txnid(&wbuf, txn->txnid64);
wbuf_fileid(&wbuf, db->i->fileid); wbuf_filenum(&wbuf, db->i->fileid);
wbuf_diskoff(&wbuf, diskoff); wbuf_diskoff(&wbuf, diskoff);
wbuf_bytes(&wbuf, kv_pair_key_const(pair), keylen); wbuf_bytes(&wbuf, kv_pair_key_const(pair), keylen);
wbuf_bytes(&wbuf, kv_pair_val_const(pair), vallen); wbuf_bytes(&wbuf, kv_pair_val_const(pair), vallen);
return tokulogger_log_bytes(txn->logger, wbuf.ndone, wbuf.buf); return tokulogger_finish(txn->logger, &wbuf);
}
int tokulogger_fsync (TOKULOGGER logger) {
//return 0;/// NO TXN
//fprintf(stderr, "%s:%d syncing log\n", __FILE__, __LINE__);
if (logger->n_in_buf>0) {
int r = write(logger->fd, logger->buf, logger->n_in_buf);
if (r==-1) return errno;
logger->n_in_buf=0;
}
{
int r = fsync(logger->fd);
if (r!=0) return errno;
}
return 0;
} }
int tokulogger_log_commit (TOKUTXN txn) { int tokulogger_log_commit (TOKUTXN txn) {
struct wbuf wbuf; struct wbuf wbuf;
int buflen =30; const int buflen = (1 // log command
+8 // lsn
+8 // txnid
+8 // crc & len
);
unsigned char buf[buflen]; unsigned char buf[buflen];
wbuf_init(&wbuf, buf, buflen); wbuf_init(&wbuf, buf, buflen);
wbuf_char(&wbuf, LT_COMMIT); wbuf_char(&wbuf, LT_COMMIT);
wbuf_lsn (&wbuf, txn->logger->lsn);
txn->logger->lsn.lsn++;
wbuf_txnid(&wbuf, txn->txnid64); wbuf_txnid(&wbuf, txn->txnid64);
int r = tokulogger_log_bytes(txn->logger, wbuf.ndone, wbuf.buf); int r = tokulogger_finish(txn->logger, &wbuf);
if (r!=0) return r; if (r!=0) return r;
if (txn->parent) return 0; if (txn->parent) return 0;
else return tokulogger_fsync(txn->logger); else return tokulogger_fsync(txn->logger);
} }
/* Write an LT_CHECKPOINT record and report its LSN.
 * The record is: command byte, lsn, then the crc/length trailer added by
 * tokulogger_finish() -- the same framing every other record writer in this
 * file uses (previously this record was written without the trailer).
 * *lsn receives the LSN stored in the record; the logger's LSN is then advanced.
 * Returns the result of tokulogger_finish(). */
int tokulogger_log_checkpoint (TOKULOGGER logger, LSN *lsn) {
    struct wbuf wbuf;
    const int buflen = (1 // log command
			+8 // lsn
			+8 // crc & len
			);
    unsigned char buf[buflen];
    wbuf_init(&wbuf, buf, buflen);
    wbuf_char(&wbuf, LT_CHECKPOINT);
    wbuf_lsn (&wbuf, logger->lsn);
    *lsn = logger->lsn;
    logger->lsn.lsn++;
    return tokulogger_finish(logger, &wbuf);
}
int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKULOGGER logger) { int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKULOGGER logger) {
TAGMALLOC(TOKUTXN, result); TAGMALLOC(TOKUTXN, result);
if (result==0) return errno; if (result==0) return errno;
...@@ -194,3 +236,35 @@ int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKU ...@@ -194,3 +236,35 @@ int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKU
return 0; return 0;
} }
/* Write an LT_BLOCK_RENAME record.
 * Field order: command byte, lsn, fileid, olddiskoff, newdiskoff,
 * parentdiskoff, childnum, followed by the trailer that tokulogger_finish()
 * appends.  The logger's lsn is consumed (post-incremented) by this call.
 * Returns the result of tokulogger_finish(). */
int tokulogger_log_block_rename (TOKULOGGER logger, FILENUM fileid, DISKOFF olddiskoff, DISKOFF newdiskoff, DISKOFF parentdiskoff, int childnum) {
    enum {
	RENAME_RECORD_SPACE = 1   // log command
			      +8  // lsn
			      +8  // fileid
			      +8  // olddiskoff
			      +8  // newdiskoff
			      +8  // parentdiskoff
			      +4  // childnum
			      +8  // crc & len
    };
    unsigned char record[RENAME_RECORD_SPACE];
    struct wbuf w;

    wbuf_init(&w, record, RENAME_RECORD_SPACE);
    wbuf_char(&w, LT_BLOCK_RENAME);
    wbuf_lsn(&w, logger->lsn);
    logger->lsn.lsn++;
    wbuf_filenum(&w, fileid);
    wbuf_diskoff(&w, olddiskoff);
    wbuf_diskoff(&w, newdiskoff);
    wbuf_diskoff(&w, parentdiskoff);
    wbuf_int(&w, childnum);
    return tokulogger_finish(logger, &w);
}
/*
int brtenv_checkpoint (BRTENV env) {
init the checkpointing lock
acquire_spinlock(&env->checkpointing);
release_spinlock(&env->checkpointing);
return -1;
}
*/
...@@ -3,16 +3,17 @@ ...@@ -3,16 +3,17 @@
#include "../include/db.h" #include "../include/db.h"
#include "brttypes.h" #include "brttypes.h"
#include "kv-pair.h" #include "kv-pair.h"
typedef struct tokulogger *TOKULOGGER;
typedef struct tokutxn *TOKUTXN;
int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *resultp); int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *resultp);
int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes); int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes);
int tokulogger_log_close(TOKULOGGER *logger); int tokulogger_log_close(TOKULOGGER *logger);
int tokulogger_log_checkpoint (TOKULOGGER, LSN*);
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, diskoff diskoff, int is_add, const struct kv_pair *pair); int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, DISKOFF diskoff, int is_add, const struct kv_pair *pair);
int tokulogger_log_commit (TOKUTXN txn); int tokulogger_log_commit (TOKUTXN txn);
int tokulogger_log_block_rename (TOKULOGGER logger, FILENUM fileid, DISKOFF olddiskoff, DISKOFF newdiskoff, DISKOFF parentdiskoff, int childnum);
int tokutxn_begin (TOKUTXN /*parent*/,TOKUTXN *, TXNID txnid64, TOKULOGGER logger); int tokutxn_begin (TOKUTXN /*parent*/,TOKUTXN *, TXNID txnid64, TOKULOGGER logger);
#endif #endif
...@@ -47,4 +47,8 @@ void *mempool_malloc(struct mempool *mp, int size, int alignment); ...@@ -47,4 +47,8 @@ void *mempool_malloc(struct mempool *mp, int size, int alignment);
pool does not keep track of the locations of the free chunks */ pool does not keep track of the locations of the free chunks */
void mempool_mfree(struct mempool *mp, void *vp, int size); void mempool_mfree(struct mempool *mp, void *vp, int size);
/* Report whether the size bytes starting at vp lie entirely inside the pool:
 * nonzero when vp is at or after mp->base and vp+size does not go past
 * mp->base+mp->size, zero otherwise.
 * NOTE(review): vp is a void*, so "vp + size" relies on the GNU C extension
 * that allows arithmetic on void pointers -- confirm this header is only
 * compiled with gcc-compatible compilers. */
static inline int mempool_inrange(struct mempool *mp, void *vp, int size) {
return mp->base <= vp && vp + size <= mp->base + mp->size;
}
#endif #endif
...@@ -10,6 +10,7 @@ struct pma_cursor { ...@@ -10,6 +10,7 @@ struct pma_cursor {
struct pma { struct pma {
enum typ_tag tag; enum typ_tag tag;
int dup_mode;
int N; /* How long is the array? Always a power of two >= 4. */ int N; /* How long is the array? Always a power of two >= 4. */
int n_pairs_present; /* How many array elements are non-null. */ int n_pairs_present; /* How many array elements are non-null. */
struct kv_pair **pairs; struct kv_pair **pairs;
...@@ -23,7 +24,8 @@ struct pma { ...@@ -23,7 +24,8 @@ struct pma {
* The density step is 0.10. */ * The density step is 0.10. */
double ldt_step; /* lower density threshold step */ double ldt_step; /* lower density threshold step */
struct list cursors; struct list cursors;
int (*compare_fun)(DB*,const DBT*,const DBT*); pma_compare_fun_t compare_fun;
pma_compare_fun_t dup_compare_fun;
void *skey, *sval; /* used in dbts */ void *skey, *sval; /* used in dbts */
struct mempool kvspace; struct mempool kvspace;
}; };
...@@ -36,49 +38,6 @@ int pmainternal_make_space_at (PMA pma, int idx); ...@@ -36,49 +38,6 @@ int pmainternal_make_space_at (PMA pma, int idx);
int pmainternal_find (PMA pma, DBT *, DB*); // The DB is so the comparison fuction can be called. int pmainternal_find (PMA pma, DBT *, DB*); // The DB is so the comparison fuction can be called.
void print_pma (PMA pma); /* useful for debugging, so keep the name short. I.e., not pmainternal_print_pma() */ void print_pma (PMA pma); /* useful for debugging, so keep the name short. I.e., not pmainternal_print_pma() */
/*
* resize the pma array to asksize. zero all array entries starting from startx.
*/
int __pma_resize_array(PMA pma, int asksize, int startx);
/*
* extract pairs from the pma in the window delimited by lo and hi.
*/
struct kv_pair_tag *__pma_extract_pairs(PMA pma, int count, int lo, int hi);
/*
* update the cursors in a cursor set given a set of tagged pairs.
*/
void __pma_update_cursors(PMA pma, struct list *cursorset, struct kv_pair_tag *tpairs, int n);
/*
* update this pma's cursors given a set of tagged pairs.
*/
void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n);
/*
* a deletion occured at index "here" in the pma. rebalance the windows around "here". if
* necessary, shrink the pma.
*/
void __pma_delete_at(PMA pma, int here);
/*
* if the pma entry at here is deleted and there are no more references to it
* then finish the deletion
*/
void __pma_delete_resume(PMA pma, int here);
/*
* finish a deletion from the pma. called when there are no cursor references
* to the kv pair.
*/
void __pma_delete_finish(PMA pma, int here);
/*
* count the number of cursors that reference a pma pair
*/
int __pma_count_cursor_refs(PMA pma, int here);
/* density thresholds */ /* density thresholds */
#define PMA_LDT_HIGH 0.25 #define PMA_LDT_HIGH 0.25
#define PMA_LDT_LOW 0.40 #define PMA_LDT_LOW 0.40
......
#include "../include/db.h" #include "brt-internal.h"
#include "memory.h"
#include "key.h" #include "key.h"
#include <assert.h> #include <assert.h>
#include <string.h> #include <string.h>
...@@ -7,12 +6,11 @@ ...@@ -7,12 +6,11 @@
#include <stdio.h> #include <stdio.h>
#include <arpa/inet.h> #include <arpa/inet.h>
#include "list.h" #include "list.h"
#include "kv-pair.h"
#include "pma-internal.h" #include "pma-internal.h"
TOKUTXN const null_txn = 0; TOKUTXN const null_txn = 0;
DB * const null_db = 0; DB * const null_db = 0;
const diskoff null_diskoff = -1; const DISKOFF null_diskoff = -1;
#define NULL_ARGS null_db, null_txn, null_diskoff #define NULL_ARGS null_db, null_txn, null_diskoff
...@@ -253,33 +251,62 @@ static void test_count_region (void) { ...@@ -253,33 +251,62 @@ static void test_count_region (void) {
kv_pair_free(pairs[i]); kv_pair_free(pairs[i]);
} }
// Fold one key/value pair into the expected fingerprint and insist that the
// result equals the fingerprint the PMA actually reported.
// The pair contributes rand4fingerprint * toku_calccrc32_kvpair(key, klen, data, dlen);
// passing a negated rand4fingerprint therefore removes a pair's contribution.
void add_fingerprint_and_check(u_int32_t rand4fingerprint, u_int32_t actual_fingerprint, u_int32_t *expect_fingerprint, const void *key, int klen, const void *data, int dlen) {
    u_int32_t contribution = rand4fingerprint*toku_calccrc32_kvpair(key, klen, data, dlen);
    u_int32_t updated = *expect_fingerprint + contribution;
    *expect_fingerprint = updated;
    assert(updated==actual_fingerprint);
}
// Test helper: insert (key,data) into pma and keep the fingerprint bookkeeping honest.
// Checks that *sum and *expect_fingerprint agree on entry, requires pma_insert to
// return BRT_OK, adds the pair to *expect_fingerprint (which must then match *sum),
// and finally has the PMA verify its own fingerprint.
static void do_insert (PMA pma, const void *key, int keylen, const void *data, int datalen, u_int32_t rand4fingerprint, u_int32_t *sum, u_int32_t *expect_fingerprint) {
    DBT key_dbt, val_dbt;
    int rc;

    assert(*sum==*expect_fingerprint);

    fill_dbt(&key_dbt, key, keylen);
    fill_dbt(&val_dbt, data, datalen);
    rc = pma_insert(pma, &key_dbt, &val_dbt, NULL_ARGS, rand4fingerprint, sum);
    assert(rc==BRT_OK);

    add_fingerprint_and_check(rand4fingerprint, *sum, expect_fingerprint, key, keylen, data, datalen);
    pma_verify_fingerprint(pma, rand4fingerprint, *sum);
}
// Test helper: delete key from pma and keep the fingerprint bookkeeping honest.
// data/datalen are the value currently stored under key; they are needed only to
// take the pair's contribution back out of *expect_fingerprint.
// Checks that *sum and *expect_fingerprint agree on entry, requires pma_delete to
// return BRT_OK, and finally has the PMA verify its own fingerprint.
static void do_delete (PMA pma, const void *key, int keylen, const void *data, int datalen, u_int32_t rand4fingerprint, u_int32_t *sum, u_int32_t *expect_fingerprint) {
    DBT key_dbt;
    int rc;

    assert(*sum==*expect_fingerprint);

    fill_dbt(&key_dbt, key, keylen);
    rc = pma_delete(pma, &key_dbt, 0, rand4fingerprint, sum);
    assert(rc==BRT_OK);

    // Negating the unsigned multiplier makes the helper subtract the pair's contribution.
    add_fingerprint_and_check(-rand4fingerprint, *sum, expect_fingerprint, key, keylen, data, datalen);
    pma_verify_fingerprint(pma, rand4fingerprint, *sum);
}
static void test_pma_random_pick (void) { static void test_pma_random_pick (void) {
PMA pma; PMA pma;
int r = pma_create(&pma, default_compare_fun, 0); int r = pma_create(&pma, default_compare_fun, 0);
bytevec key,val; bytevec key,val;
ITEMLEN keylen,vallen; ITEMLEN keylen,vallen;
DBT k,v; DBT k;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
assert(r==0); assert(r==0);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen); r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==DB_NOTFOUND); assert(r==DB_NOTFOUND);
r = pma_insert(pma, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), NULL_ARGS); do_insert(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK); pma_verify_fingerprint(pma, rand4fingerprint, sum);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen); r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0); assert(r==0);
assert(keylen==6); assert(vallen==6); assert(keylen==6); assert(vallen==6);
assert(strcmp(key,"hello")==0); assert(strcmp(key,"hello")==0);
assert(strcmp(val,"there")==0); assert(strcmp(val,"there")==0);
r = pma_delete(pma, fill_dbt(&k, "nothello", 9), 0); r = pma_delete(pma, fill_dbt(&k, "nothello", 9), 0, rand4fingerprint, &sum);
assert(r==DB_NOTFOUND); assert(r==DB_NOTFOUND);
r = pma_delete(pma, fill_dbt(&k, "hello", 6), 0); assert(sum==expect_fingerprint); // didn't change because nothing was deleted.
assert(r==BRT_OK);
do_delete(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen); r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==DB_NOTFOUND); assert(r==DB_NOTFOUND);
r = pma_insert(pma, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), NULL_ARGS); do_insert(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen); r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0); assert(r==0);
...@@ -287,26 +314,29 @@ static void test_pma_random_pick (void) { ...@@ -287,26 +314,29 @@ static void test_pma_random_pick (void) {
assert(strcmp(key,"hello")==0); assert(strcmp(key,"hello")==0);
assert(strcmp(val,"there")==0); assert(strcmp(val,"there")==0);
r = pma_insert(pma, fill_dbt(&k, "aaa", 4), fill_dbt(&v, "athere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aaa", 4, "athere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aab", 4), fill_dbt(&v, "bthere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aab", 4, "bthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aac", 4), fill_dbt(&v, "cthere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aac", 4, "cthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aad", 4), fill_dbt(&v, "dthere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aad", 4, "dthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aae", 4), fill_dbt(&v, "ethere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aae", 4, "ethere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aaf", 4), fill_dbt(&v, "fthere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aaf", 4, "fthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "aag", 4), fill_dbt(&v, "gthere", 7), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aag", 4, "gthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "aaa", 4), 0); assert(r==BRT_OK); pma_verify_fingerprint(pma, rand4fingerprint, sum);
r = pma_delete(pma, fill_dbt(&k, "aab", 4), 0); assert(r==BRT_OK); do_delete(pma, "aaa", 4, "athere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "aac", 4), 0); assert(r==BRT_OK); do_delete(pma, "aab", 4, "bthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "aad", 4), 0); assert(r==BRT_OK); do_delete(pma, "aac", 4, "cthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "aae", 4), 0); assert(r==BRT_OK); do_delete(pma, "aad", 4, "dthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "aag", 4), 0); assert(r==BRT_OK); do_delete(pma, "aae", 4, "ethere", 7, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, fill_dbt(&k, "hello", 6), 0); assert(r==BRT_OK); /* don't delete aaf */
do_delete(pma, "aag", 4, "gthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen); r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0); assert(r==0);
assert(keylen==4); assert(vallen==7); assert(keylen==4); assert(vallen==7);
assert(strcmp(key,"aaf")==0); assert(strcmp(key,"aaf")==0);
assert(strcmp(val,"fthere")==0); assert(strcmp(val,"fthere")==0);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
r=pma_free(&pma); assert(r==0); r=pma_free(&pma); assert(r==0);
assert(pma==0); assert(pma==0);
} }
...@@ -315,12 +345,17 @@ static void test_find_insert (void) { ...@@ -315,12 +345,17 @@ static void test_find_insert (void) {
PMA pma; PMA pma;
int r; int r;
DBT k,v; DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
pma_create(&pma, default_compare_fun, 0); pma_create(&pma, default_compare_fun, 0);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0); r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
assert(r==DB_NOTFOUND); assert(r==DB_NOTFOUND);
r=pma_insert(pma, fill_dbt(&k, "aaa", 3), fill_dbt(&v, "aaadata", 7), NULL_ARGS); do_insert(pma, "aaa", 3, "aaadata", 7, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
init_dbt(&v); init_dbt(&v);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0); r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
...@@ -329,8 +364,7 @@ static void test_find_insert (void) { ...@@ -329,8 +364,7 @@ static void test_find_insert (void) {
assert(keycompare(v.data,v.size,"aaadata", 7)==0); assert(keycompare(v.data,v.size,"aaadata", 7)==0);
//toku_free(v.data); v.data=0; //toku_free(v.data); v.data=0;
r=pma_insert(pma, fill_dbt(&k, "bbb", 4), fill_dbt(&v, "bbbdata", 8), NULL_ARGS); do_insert(pma, "bbb", 4, "bbbdata", 8, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
init_dbt(&v); init_dbt(&v);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0); r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
...@@ -344,14 +378,16 @@ static void test_find_insert (void) { ...@@ -344,14 +378,16 @@ static void test_find_insert (void) {
assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL); assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL);
r=pma_insert(pma, fill_dbt(&k, "00000", 6), fill_dbt(&v, "d0", 3), NULL_ARGS); do_insert(pma, "00000", 6, "d0", 3, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL); assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL);
r=pma_free(&pma); assert(r==0); assert(pma==0); r=pma_free(&pma); assert(r==0); assert(pma==0);
pma_create(&pma, default_compare_fun, 0); assert(pma!=0); pma_create(&pma, default_compare_fun, 0); assert(pma!=0);
rand4fingerprint = random();
sum = expect_fingerprint = 0;
{ {
int i; int i;
for (i=0; i<100; i++) { for (i=0; i<100; i++) {
...@@ -359,9 +395,8 @@ static void test_find_insert (void) { ...@@ -359,9 +395,8 @@ static void test_find_insert (void) {
char dstring[10]; char dstring[10];
snprintf(string,10,"%05d",i); snprintf(string,10,"%05d",i);
snprintf(dstring,10,"d%d", i); snprintf(dstring,10,"d%d", i);
printf("Inserting %d: string=%s dstring=%s\n", i, string, dstring); //printf("Inserting %d: string=%s dstring=%s (before sum=%08x) \n", i, string, dstring, sum);
r=pma_insert(pma, fill_dbt(&k, string, strlen(string)+1), fill_dbt(&v, dstring, strlen(dstring)+1), NULL_ARGS); do_insert(pma, string, strlen(string)+1, dstring, strlen(dstring)+1, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
} }
} }
r=pma_free(&pma); assert(r==0); assert(pma==0); r=pma_free(&pma); assert(r==0); assert(pma==0);
...@@ -386,14 +421,16 @@ static void test_pma_iterate_internal (PMA pma, int expected_k, int expected_v) ...@@ -386,14 +421,16 @@ static void test_pma_iterate_internal (PMA pma, int expected_k, int expected_v)
static void test_pma_iterate (void) { static void test_pma_iterate (void) {
PMA pma; PMA pma;
int r; int r;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
pma_create(&pma, default_compare_fun, 0); pma_create(&pma, default_compare_fun, 0);
r=pma_insert(pma, fill_dbt(&k, "42", 3), fill_dbt(&v, "-19", 4), NULL_ARGS); do_insert(pma, "42", 3, "-19", 4, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
test_pma_iterate_internal(pma, 42, -19); test_pma_iterate_internal(pma, 42, -19);
r=pma_insert(pma, fill_dbt(&k, "12", 3), fill_dbt(&v, "-100", 5), NULL_ARGS); do_insert(pma, "12", 3, "-100", 5, rand4fingerprint, &sum, &expect_fingerprint);
assert(r==BRT_OK);
test_pma_iterate_internal(pma, 42+12, -19-100); test_pma_iterate_internal(pma, 42+12, -19-100);
r=pma_free(&pma); assert(r==0); assert(pma==0); r=pma_free(&pma); assert(r==0); assert(pma==0);
} }
...@@ -403,12 +440,20 @@ static void test_pma_iterate2 (void) { ...@@ -403,12 +440,20 @@ static void test_pma_iterate2 (void) {
int r; int r;
int sum=0; int sum=0;
int n_items=0; int n_items=0;
DBT k,v;
u_int32_t rand4fingerprint0 = random();
u_int32_t sum0 = 0;
u_int32_t expect_fingerprint0 = 0;
u_int32_t rand4fingerprint1 = random();
u_int32_t sum1 = 0;
u_int32_t expect_fingerprint1 = 0;
r=pma_create(&pma0, default_compare_fun, 0); assert(r==0); r=pma_create(&pma0, default_compare_fun, 0); assert(r==0);
r=pma_create(&pma1, default_compare_fun, 0); assert(r==0); r=pma_create(&pma1, default_compare_fun, 0); assert(r==0);
pma_insert(pma0, fill_dbt(&k, "a", 2), fill_dbt(&v, "aval", 5), NULL_ARGS); do_insert(pma0, "a", 2, "aval", 5, rand4fingerprint0, &sum0, &expect_fingerprint0);
pma_insert(pma0, fill_dbt(&k, "b", 2), fill_dbt(&v, "bval", 5), NULL_ARGS); do_insert(pma0, "b", 2, "bval", 5, rand4fingerprint0, &sum0, &expect_fingerprint0);
pma_insert(pma1, fill_dbt(&k, "x", 2), fill_dbt(&v, "xval", 5), NULL_ARGS); do_insert(pma1, "x", 2, "xval", 5, rand4fingerprint1, &sum1, &expect_fingerprint1);
PMA_ITERATE(pma0,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)),dl, (n_items++,sum+=kl+dl)); PMA_ITERATE(pma0,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)),dl, (n_items++,sum+=kl+dl));
PMA_ITERATE(pma1,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)), dl, (n_items++,sum+=kl+dl)); PMA_ITERATE(pma1,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)), dl, (n_items++,sum+=kl+dl));
assert(sum==21); assert(sum==21);
...@@ -483,11 +528,15 @@ void test_pma_cursor_3 (void) { ...@@ -483,11 +528,15 @@ void test_pma_cursor_3 (void) {
PMA_CURSOR c=0; PMA_CURSOR c=0;
int r; int r;
DBT key,val; DBT key,val;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r=pma_create(&pma, default_compare_fun, 0); assert(r==0); r=pma_create(&pma, default_compare_fun, 0); assert(r==0);
r=pma_insert(pma, fill_dbt(&k, "x", 2), fill_dbt(&v, "xx", 3), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "x", 2, "xx", 3, rand4fingerprint, &sum, &expect_fingerprint);
r=pma_insert(pma, fill_dbt(&k, "m", 2), fill_dbt(&v, "mm", 3), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "m", 2, "mm", 3, rand4fingerprint, &sum, &expect_fingerprint);
r=pma_insert(pma, fill_dbt(&k, "aa", 3), fill_dbt(&v,"a", 2), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "aa", 3, "a", 2, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&key); key.flags=DB_DBT_REALLOC; init_dbt(&key); key.flags=DB_DBT_REALLOC;
init_dbt(&val); val.flags=DB_DBT_REALLOC; init_dbt(&val); val.flags=DB_DBT_REALLOC;
r=pma_cursor(pma, &c); assert(r==0); assert(c!=0); r=pma_cursor(pma, &c); assert(r==0); assert(c!=0);
...@@ -545,21 +594,20 @@ void test_pma_cursor_4 (void) { ...@@ -545,21 +594,20 @@ void test_pma_cursor_4 (void) {
PMA_CURSOR cursora, cursorb, cursorc; PMA_CURSOR cursora, cursorb, cursorc;
int i; int i;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_cursor_4\n"); printf("test_pma_cursor_4\n");
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
for (i=1; i<=4; i += 1) { for (i=1; i<=4; i += 1) {
DBT dbtk, dbtv;
char k[5]; int v; char k[5]; int v;
sprintf(k, "%4.4d", i); sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i; v = i;
fill_dbt(&dbtv, &v, sizeof v); do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_insert(pma, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
} }
assert(pma_n_entries(pma) == 4); assert(pma_n_entries(pma) == 4);
printf("a:"); print_pma(pma); printf("a:"); print_pma(pma);
...@@ -586,16 +634,11 @@ void test_pma_cursor_4 (void) { ...@@ -586,16 +634,11 @@ void test_pma_cursor_4 (void) {
assert_cursor_val(cursorc, 4); assert_cursor_val(cursorc, 4);
for (i=5; i<=8; i += 1) { for (i=5; i<=8; i += 1) {
DBT dbtk, dbtv;
char k[5]; int v; char k[5]; int v;
sprintf(k, "%4.4d", i); sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i; v = i;
fill_dbt(&dbtv, &v, sizeof v); do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_insert(pma, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
} }
assert(pma_n_entries(pma) == 8); assert(pma_n_entries(pma) == 8);
printf("a:"); print_pma(pma); printf("a:"); print_pma(pma);
...@@ -621,18 +664,19 @@ void test_pma_cursor_delete(int n) { ...@@ -621,18 +664,19 @@ void test_pma_cursor_delete(int n) {
PMA pma; PMA pma;
int error; int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
/* insert 1 -> 42 */ /* insert 1 -> 42 */
DBT key, val; int k, v; int k, v;
int i; int i;
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
k = i; v = -i; k = i; v = -i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
/* point the cursor to the first kv */ /* point the cursor to the first kv */
...@@ -660,11 +704,9 @@ void test_pma_cursor_delete(int n) { ...@@ -660,11 +704,9 @@ void test_pma_cursor_delete(int n) {
toku_free(cursorkey.data); toku_free(cursorkey.data);
toku_free(cursorval.data); toku_free(cursorval.data);
/* delete the first key */ /* delete the first key, which is (int)(0) with value (0) */
k = 0; k = 0;
fill_dbt(&key, &k, sizeof k); do_delete(pma, &k, sizeof k, &k, sizeof k, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_delete(pma, &key, 0);
assert(error == 0);
/* cursor get should fail */ /* cursor get should fail */
init_dbt(&cursorkey); cursorkey.flags = DB_DBT_MALLOC; init_dbt(&cursorkey); cursorkey.flags = DB_DBT_MALLOC;
...@@ -729,12 +771,16 @@ void test_pma_compare_fun (int wrong_endian_p) { ...@@ -729,12 +771,16 @@ void test_pma_compare_fun (int wrong_endian_p) {
char *right_endian_expected_keys[] = {"00", "01", "10", "11"}; char *right_endian_expected_keys[] = {"00", "01", "10", "11"};
char **expected_keys = wrong_endian_p ? wrong_endian_expected_keys : right_endian_expected_keys; char **expected_keys = wrong_endian_p ? wrong_endian_expected_keys : right_endian_expected_keys;
int i; int i;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, wrong_endian_p ? wrong_endian_compare_fun : default_compare_fun, 0); assert(r==0); r = pma_create(&pma, wrong_endian_p ? wrong_endian_compare_fun : default_compare_fun, 0); assert(r==0);
r = pma_insert(pma, fill_dbt(&k, "10", 3), fill_dbt(&v, "10v", 4), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "10", 3, "10v", 4, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "00", 3), fill_dbt(&v, "00v", 4), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "00", 3, "00v", 4, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "01", 3), fill_dbt(&v, "01v", 4), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "01", 3, "01v", 4, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, fill_dbt(&k, "11", 3), fill_dbt(&v, "11v", 4), NULL_ARGS); assert(r==BRT_OK); do_insert(pma, "11", 3, "11v", 4, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&key); key.flags=DB_DBT_REALLOC; init_dbt(&key); key.flags=DB_DBT_REALLOC;
init_dbt(&val); val.flags=DB_DBT_REALLOC; init_dbt(&val); val.flags=DB_DBT_REALLOC;
r=pma_cursor(pma, &c); assert(r==0); assert(c!=0); r=pma_cursor(pma, &c); assert(r==0); assert(c!=0);
...@@ -767,6 +813,15 @@ void test_pma_split_n(int n) { ...@@ -767,6 +813,15 @@ void test_pma_split_n(int n) {
int i; int i;
int na, nb, nc; int na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_n:%d\n", n); printf("test_pma_split_n:%d\n", n);
error = pma_create(&pmaa, default_compare_fun, 0); error = pma_create(&pmaa, default_compare_fun, 0);
...@@ -778,22 +833,24 @@ void test_pma_split_n(int n) { ...@@ -778,22 +833,24 @@ void test_pma_split_n(int n) {
/* insert some kv pairs */ /* insert some kv pairs */
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
DBT dbtk, dbtv;
char k[5]; int v; char k[5]; int v;
sprintf(k, "%4.4d", i); sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i; v = i;
fill_dbt(&dbtv, &v, sizeof v); do_insert(pmaa, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS); pma_verify(pmaa, null_db);
assert(error == BRT_OK);
} }
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0); error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0); assert(error == 0);
pma_verify(pmaa, null_db);
pma_verify(pmab, null_db);
pma_verify(pmac, null_db);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa); na = pma_n_entries(pmaa);
...@@ -821,6 +878,15 @@ void test_pma_split_varkey(void) { ...@@ -821,6 +878,15 @@ void test_pma_split_varkey(void) {
int i; int i;
int n, na, nb, nc; int n, na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_varkey\n"); printf("test_pma_split_varkey\n");
error = pma_create(&pmaa, default_compare_fun, 0); error = pma_create(&pmaa, default_compare_fun, 0);
...@@ -832,22 +898,20 @@ void test_pma_split_varkey(void) { ...@@ -832,22 +898,20 @@ void test_pma_split_varkey(void) {
/* insert some kv pairs */ /* insert some kv pairs */
for (i=0; keys[i]; i++) { for (i=0; keys[i]; i++) {
DBT dbtk, dbtv; char v = i;
char v; do_insert(pmaa, keys[i], strlen(keys[i])+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&dbtk, keys[i], strlen(keys[i])+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
} }
n = i; n = i;
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0); error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0); assert(error == 0);
pma_verify(pmaa, null_db);
pma_verify(pmab, null_db);
pma_verify(pmac, null_db);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa); na = pma_n_entries(pmaa);
...@@ -931,6 +995,16 @@ void test_pma_split_cursor(void) { ...@@ -931,6 +995,16 @@ void test_pma_split_cursor(void) {
int i; int i;
int na, nb, nc; int na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_cursor\n"); printf("test_pma_split_cursor\n");
error = pma_create(&pmaa, default_compare_fun, 0); error = pma_create(&pmaa, default_compare_fun, 0);
...@@ -942,16 +1016,12 @@ void test_pma_split_cursor(void) { ...@@ -942,16 +1016,12 @@ void test_pma_split_cursor(void) {
/* insert some kv pairs */ /* insert some kv pairs */
for (i=1; i<=16; i += 1) { for (i=1; i<=16; i += 1) {
DBT dbtk, dbtv;
char k[11]; int v; char k[11]; int v;
snprintf(k, sizeof k, "%.10d", i); snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i; v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS); do_insert(pmaa, k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(error == BRT_OK);
} }
assert(pma_n_entries(pmaa) == 16); assert(pma_n_entries(pmaa) == 16);
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
...@@ -979,9 +1049,12 @@ void test_pma_split_cursor(void) { ...@@ -979,9 +1049,12 @@ void test_pma_split_cursor(void) {
// print_cursor("cursorc", cursorc); // print_cursor("cursorc", cursorc);
assert_cursor_val(cursorc, 16); assert_cursor_val(cursorc, 16);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0); error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0); assert(error == 0);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa); printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa); na = pma_n_entries(pmaa);
assert(na == 0); assert(na == 0);
...@@ -1045,6 +1118,10 @@ void test_pma_bulk_insert_n(int n) { ...@@ -1045,6 +1118,10 @@ void test_pma_bulk_insert_n(int n) {
int i; int i;
DBT *keys, *vals; DBT *keys, *vals;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_bulk_insert_n: %d\n", n); printf("test_pma_bulk_insert_n: %d\n", n);
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
...@@ -1074,11 +1151,16 @@ void test_pma_bulk_insert_n(int n) { ...@@ -1074,11 +1151,16 @@ void test_pma_bulk_insert_n(int n) {
assert(v); assert(v);
*v = i; *v = i;
fill_dbt(&vals[i], v, vlen); fill_dbt(&vals[i], v, vlen);
expect_fingerprint += rand4fingerprint*toku_calccrc32_kvpair (k, klen, v, vlen);
} }
/* bulk insert n kv pairs */ /* bulk insert n kv pairs */
error = pma_bulk_insert(pma, keys, vals, n); error = pma_bulk_insert(pma, keys, vals, n, rand4fingerprint, &sum);
assert(error == 0); assert(error == 0);
assert(sum==expect_fingerprint);
pma_verify(pma, null_db);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
/* verify */ /* verify */
if (0) print_pma(pma); if (0) print_pma(pma);
...@@ -1122,16 +1204,21 @@ void test_pma_insert_or_replace(void) { ...@@ -1122,16 +1204,21 @@ void test_pma_insert_or_replace(void) {
int r; int r;
DBT dbtk, dbtv; DBT dbtk, dbtv;
int n_diff=-2; int n_diff=-2;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, default_compare_fun, 0); r = pma_create(&pma, default_compare_fun, 0);
assert(r==0); assert(r==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "aaa", 4), fill_dbt(&dbtv, "zzz", 4), &n_diff, NULL_ARGS); r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "aaa", 4), fill_dbt(&dbtv, "zzz", 4), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==-1); assert(r==0); assert(n_diff==-1);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "aaa", 4, "zzz", 4);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0); r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0); assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "ww", 3), &n_diff, NULL_ARGS); r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "ww", 3), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==-1); assert(r==0); assert(n_diff==-1);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "bbbb", 5, "ww", 3);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0); r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0); assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
...@@ -1139,8 +1226,11 @@ void test_pma_insert_or_replace(void) { ...@@ -1139,8 +1226,11 @@ void test_pma_insert_or_replace(void) {
r = pma_lookup(pma, fill_dbt(&dbtk, "bbbb", 5), init_dbt(&dbtv), 0); r = pma_lookup(pma, fill_dbt(&dbtk, "bbbb", 5), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==3); assert(memcmp(dbtv.data, "ww", 3)==0); assert(r==0); assert(dbtv.size==3); assert(memcmp(dbtv.data, "ww", 3)==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "xxxx", 5), &n_diff, NULL_ARGS); // replace bbbb
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "xxxx", 5), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==3); assert(r==0); assert(n_diff==3);
expect_fingerprint -= rand4fingerprint*toku_calccrc32_kvpair("bbbb", 5, "ww", 3);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "bbbb", 5, "xxxx", 5);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0); r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0); assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
...@@ -1160,6 +1250,10 @@ void test_pma_delete_shrink(int n) { ...@@ -1160,6 +1250,10 @@ void test_pma_delete_shrink(int n) {
int r; int r;
int i; int i;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_delete_shrink:%d\n", n); printf("test_pma_delete_shrink:%d\n", n);
r = pma_create(&pma, default_compare_fun, n*(8 + 11 + sizeof (int))); r = pma_create(&pma, default_compare_fun, n*(8 + 11 + sizeof (int)));
...@@ -1169,25 +1263,20 @@ void test_pma_delete_shrink(int n) { ...@@ -1169,25 +1263,20 @@ void test_pma_delete_shrink(int n) {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
int v; int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", i); snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
v = i; v = i;
fill_dbt(&val, &v, sizeof v);
r = pma_insert(pma, &key, &val, NULL_ARGS); do_insert(pma, k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(r == 0);
} }
/* delete */ /* delete */
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
DBT key; int v=i;
snprintf(k, sizeof k, "%.10d", i); snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1); do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, &key, 0);
assert(r == 0);
} }
assert(pma->N == PMA_MIN_ARRAY_SIZE); assert(pma->N == PMA_MIN_ARRAY_SIZE);
...@@ -1205,6 +1294,10 @@ void test_pma_delete_random(int n) { ...@@ -1205,6 +1294,10 @@ void test_pma_delete_random(int n) {
int i; int i;
int keys[n]; int keys[n];
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_delete_random:%d\n", n); printf("test_pma_delete_random:%d\n", n);
r = pma_create(&pma, default_compare_fun, n * (8 + 11 + sizeof (int))); r = pma_create(&pma, default_compare_fun, n * (8 + 11 + sizeof (int)));
...@@ -1218,25 +1311,20 @@ void test_pma_delete_random(int n) { ...@@ -1218,25 +1311,20 @@ void test_pma_delete_random(int n) {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
int v; int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", keys[i]); snprintf(k, sizeof k, "%.10d", keys[i]);
fill_dbt(&key, k, strlen(k)+1);
v = keys[i]; v = keys[i];
fill_dbt(&val, &v, sizeof v);
r = pma_insert(pma, &key, &val, NULL_ARGS); do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(r == 0);
} }
/* delete */ /* delete */
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
DBT key; int v = keys[i];
snprintf(k, sizeof k, "%.10d", keys[i]); snprintf(k, sizeof k, "%.10d", keys[i]);
fill_dbt(&key, k, strlen(k)+1); do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, &key, 0);
assert(r == 0);
} }
assert(pma->N == PMA_MIN_ARRAY_SIZE); assert(pma->N == PMA_MIN_ARRAY_SIZE);
...@@ -1282,6 +1370,10 @@ void test_pma_delete_cursor(int n) { ...@@ -1282,6 +1370,10 @@ void test_pma_delete_cursor(int n) {
PMA pma; PMA pma;
int r; int r;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, default_compare_fun, 0); r = pma_create(&pma, default_compare_fun, 0);
assert(r == 0); assert(r == 0);
...@@ -1289,14 +1381,10 @@ void test_pma_delete_cursor(int n) { ...@@ -1289,14 +1381,10 @@ void test_pma_delete_cursor(int n) {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
int v; int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", i); snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
v = i; v = i;
fill_dbt(&val, &v, sizeof v); do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_insert(pma, &key, &val, NULL_ARGS);
assert(r == 0);
} }
PMA_CURSOR pmacursor; PMA_CURSOR pmacursor;
...@@ -1311,12 +1399,10 @@ void test_pma_delete_cursor(int n) { ...@@ -1311,12 +1399,10 @@ void test_pma_delete_cursor(int n) {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
char k[11]; char k[11];
DBT key; int v=i;
snprintf(k, sizeof k, "%.10d", i); snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1); do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_delete(pma, &key, 0);
assert(r == 0);
if (i == n-1) if (i == n-1)
assert_cursor_nokey(pmacursor); assert_cursor_nokey(pmacursor);
else else
...@@ -1347,6 +1433,10 @@ void test_pma_delete_insert() { ...@@ -1347,6 +1433,10 @@ void test_pma_delete_insert() {
PMA pma; PMA pma;
int error; int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
...@@ -1359,19 +1449,14 @@ void test_pma_delete_insert() { ...@@ -1359,19 +1449,14 @@ void test_pma_delete_insert() {
int k, v; int k, v;
k = 1; v = 1; k = 1; v = 1;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
error = pma_cursor_set_position_first(pmacursor); error = pma_cursor_set_position_first(pmacursor);
assert(error == 0); assert(error == 0);
assert_cursor_equal(pmacursor, 1); assert_cursor_equal(pmacursor, 1);
k = 1; k = 1; v = 1;
fill_dbt(&key, &k, sizeof k); do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_delete(pma, &key, 0);
assert(error == 0);
assert_cursor_nokey(pmacursor); assert_cursor_nokey(pmacursor);
k = 1; k = 1;
...@@ -1381,10 +1466,7 @@ void test_pma_delete_insert() { ...@@ -1381,10 +1466,7 @@ void test_pma_delete_insert() {
assert(error != 0); assert(error != 0);
k = 1; v = 2; k = 1; v = 2;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
assert_cursor_equal(pmacursor, 2); assert_cursor_equal(pmacursor, 2);
error = pma_cursor_free(&pmacursor); error = pma_cursor_free(&pmacursor);
...@@ -1400,6 +1482,10 @@ void test_pma_double_delete() { ...@@ -1400,6 +1482,10 @@ void test_pma_double_delete() {
PMA pma; PMA pma;
int error; int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
...@@ -1408,29 +1494,25 @@ void test_pma_double_delete() { ...@@ -1408,29 +1494,25 @@ void test_pma_double_delete() {
error = pma_cursor(pma, &pmacursor); error = pma_cursor(pma, &pmacursor);
assert(error == 0); assert(error == 0);
DBT key, val; DBT key;
int k, v; int k, v;
k = 1; v = 1; k = 1; v = 1;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
error = pma_cursor_set_position_first(pmacursor); error = pma_cursor_set_position_first(pmacursor);
assert(error == 0); assert(error == 0);
assert_cursor_equal(pmacursor, 1); assert_cursor_equal(pmacursor, 1);
k = 1; k = 1; v = 1;
fill_dbt(&key, &k, sizeof k); do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_delete(pma, &key, 0);
assert(error == 0);
assert_cursor_nokey(pmacursor); assert_cursor_nokey(pmacursor);
k = 1; k = 1;
fill_dbt(&key, &k, sizeof k); fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0); error = pma_delete(pma, &key, 0, rand4fingerprint, &sum);
assert(error == DB_NOTFOUND); assert(error == DB_NOTFOUND);
assert(sum == expect_fingerprint);
error = pma_cursor_free(&pmacursor); error = pma_cursor_free(&pmacursor);
assert(error == 0); assert(error == 0);
...@@ -1445,20 +1527,20 @@ void test_pma_cursor_first_delete_last() { ...@@ -1445,20 +1527,20 @@ void test_pma_cursor_first_delete_last() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
DBT key, val;
int k, v; int k, v;
int i; int i;
for (i=1; i<=2; i++) { for (i=1; i<=2; i++) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
assert(pma_n_entries(pma) == 2); assert(pma_n_entries(pma) == 2);
...@@ -1471,9 +1553,8 @@ void test_pma_cursor_first_delete_last() { ...@@ -1471,9 +1553,8 @@ void test_pma_cursor_first_delete_last() {
assert(error == 0); assert(error == 0);
k = htonl(1); k = htonl(1);
fill_dbt(&key, &k, sizeof k); v = 1;
error = pma_delete(pma, &key, 0); do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(error == 0);
assert(pma_n_entries(pma) == 2); assert(pma_n_entries(pma) == 2);
error = pma_cursor_set_position_last(pmacursor); error = pma_cursor_set_position_last(pmacursor);
...@@ -1493,20 +1574,20 @@ void test_pma_cursor_last_delete_first() { ...@@ -1493,20 +1574,20 @@ void test_pma_cursor_last_delete_first() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
DBT key, val;
int k, v; int k, v;
int i; int i;
for (i=1; i<=2; i++) { for (i=1; i<=2; i++) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
assert(pma_n_entries(pma) == 2); assert(pma_n_entries(pma) == 2);
...@@ -1519,9 +1600,8 @@ void test_pma_cursor_last_delete_first() { ...@@ -1519,9 +1600,8 @@ void test_pma_cursor_last_delete_first() {
assert(error == 0); assert(error == 0);
k = htonl(2); k = htonl(2);
fill_dbt(&key, &k, sizeof k); v = 2;
error = pma_delete(pma, &key, 0); do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(error == 0);
assert(pma_n_entries(pma) == 2); assert(pma_n_entries(pma) == 2);
error = pma_cursor_set_position_first(pmacursor); error = pma_cursor_set_position_first(pmacursor);
...@@ -1551,6 +1631,9 @@ void test_pma_already_there() { ...@@ -1551,6 +1631,9 @@ void test_pma_already_there() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
...@@ -1560,10 +1643,12 @@ void test_pma_already_there() { ...@@ -1560,10 +1643,12 @@ void test_pma_already_there() {
k = 1; v = 1; k = 1; v = 1;
fill_dbt(&key, &k, sizeof k); fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v); fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS); error = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);
assert(error == 0); assert(error == 0);
error = pma_insert(pma, &key, &val, NULL_ARGS); u_int32_t savesum = sum;
error = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);
assert(error == BRT_ALREADY_THERE); assert(error == BRT_ALREADY_THERE);
assert(sum==savesum);
error = pma_free(&pma); error = pma_free(&pma);
assert(error == 0); assert(error == 0);
...@@ -1581,15 +1666,16 @@ void test_pma_cursor_set_key() { ...@@ -1581,15 +1666,16 @@ void test_pma_cursor_set_key() {
DBT key, val; DBT key, val;
int k, v; int k, v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 100; const int n = 100;
int i; int i;
for (i=0; i<n; i += 10) { for (i=0; i<n; i += 10) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
PMA_CURSOR cursor; PMA_CURSOR cursor;
...@@ -1630,6 +1716,10 @@ void test_pma_cursor_set_range() { ...@@ -1630,6 +1716,10 @@ void test_pma_cursor_set_range() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0); error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0); assert(error == 0);
...@@ -1642,10 +1732,7 @@ void test_pma_cursor_set_range() { ...@@ -1642,10 +1732,7 @@ void test_pma_cursor_set_range() {
for (i=smallest_key; i<=largest_key; i += 10) { for (i=smallest_key; i<=largest_key; i += 10) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
PMA_CURSOR cursor; PMA_CURSOR cursor;
...@@ -1687,6 +1774,10 @@ void test_pma_cursor_delete_under() { ...@@ -1687,6 +1774,10 @@ void test_pma_cursor_delete_under() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 1000; const int n = 1000;
error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int))); error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
...@@ -1711,10 +1802,7 @@ void test_pma_cursor_delete_under() { ...@@ -1711,10 +1802,7 @@ void test_pma_cursor_delete_under() {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
for (i=0;;i++) { for (i=0;;i++) {
...@@ -1758,6 +1846,10 @@ void test_pma_cursor_set_both() { ...@@ -1758,6 +1846,10 @@ void test_pma_cursor_set_both() {
int error; int error;
PMA pma; PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 1000; const int n = 1000;
error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int))); error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
...@@ -1776,10 +1868,7 @@ void test_pma_cursor_set_both() { ...@@ -1776,10 +1868,7 @@ void test_pma_cursor_set_both() {
for (i=0; i<n; i++) { for (i=0; i<n; i++) {
k = htonl(i); k = htonl(i);
v = i; v = i;
fill_dbt(&key, &k, sizeof k); do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
} }
/* verify key not in pma fails */ /* verify key not in pma fails */
...@@ -1827,6 +1916,399 @@ void test_pma_cursor_set_both() { ...@@ -1827,6 +1916,399 @@ void test_pma_cursor_set_both() {
assert(error == 0); assert(error == 0);
} }
/* Try to insert n pairs that all share a single key into a PMA on which
 * pma_set_dup_mode is never called.  The first insert must succeed; every
 * later insert must be refused, and the running fingerprint sum must still
 * match the value expected after that first insert. */
void test_nodup_key_insert(int n) {
    printf("test_nodup_key_insert:%d\n", n);

    PMA pma;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    int r = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);

    /* the key never changes, only the value does: 0->0, 0->1, .. 0->n-1 */
    int k = htonl(0);
    int i = 0;
    while (i < n) {
        DBT key, val;
        int v = i;

        fill_dbt(&key, &k, sizeof k);
        fill_dbt(&val, &v, sizeof v);
        r = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);

        if (i != 0) {
            /* the key is already present: the insert is rejected and the sum is unchanged */
            assert(r != 0);
            assert(sum==expect_fingerprint);
        } else {
            /* first pair goes in; presumably the helper folds this pair into
             * expect_fingerprint and compares it with sum -- confirm against its definition */
            assert(r == 0);
            add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, &k, sizeof k, &v, sizeof v);
        }
        i++;
    }

    r = pma_free(&pma);
    assert(r == 0);
}
/* With DB_DUP mode set, insert n values (0 .. n-1) under one key that sits
 * between two other keys, then walk a cursor across that key and check that
 * the values come back as 0, 1, .. n-1. */
void test_dup_key_insert(int n) {
    printf("test_dup_key_insert:%d\n", n);

    PMA pma;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    int r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);
    pma_verify(pma, null_db);

    r = pma_set_dup_mode(pma, DB_DUP);
    assert(r == 0);

    DBT key, val;
    int k, v;

    /* bracket the duplicated key with 1->1 below and 3->3 above */
    k = htonl(1); v = 1;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);
    k = htonl(3); v = 3;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    /* the duplicates themselves: 2->0, 2->1, .. 2->n-1 */
    int i;
    for (i = 0; i < n; i += 1) {
        k = htonl(2);
        v = i;
        do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
        pma_verify(pma, null_db);
    }

    /* position a cursor on key 2 and walk forward over its values */
    PMA_CURSOR cursor;
    r = pma_cursor(pma, &cursor);
    assert(r == 0);

    k = htonl(2);
    fill_dbt(&key, &k, sizeof k);
    r = pma_cursor_set_key(cursor, &key, 0);
    if (r == 0) {
        i = 0;
        for (;;) {
            init_dbt(&key); key.flags = DB_DBT_MALLOC;
            init_dbt(&val); val.flags = DB_DBT_MALLOC;
            r = pma_cursor_get_current(cursor, &key, &val);
            assert(r == 0);

            int kk;
            assert(key.size == sizeof kk);
            memcpy(&kk, key.data, key.size);
            int still_on_key = (k == kk);

            if (still_on_key) {
                /* the i-th pair seen under this key must carry value i */
                int vv;
                assert(val.size == sizeof vv);
                memcpy(&vv, val.data, val.size);
                assert(vv == i);
            }

            /* both buffers were requested with DB_DBT_MALLOC and are released here */
            toku_free(key.data);
            toku_free(val.data);

            if (!still_on_key)
                break;      /* walked past the last duplicate */
            i += 1;
            r = pma_cursor_set_position_next(cursor);
            if (r != 0)
                break;      /* the cursor could not advance */
        }
        assert(i == n);
    } else {
        /* the key can only be missing when nothing was inserted under it */
        assert(n == 0);
    }

    r = pma_cursor_free(&cursor);
    assert(r == 0);

    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * Insert n values under key 2 (bracketed by the pairs 1->1 and 3->3) into a
 * pma using the given duplicate mode, delete key 2, and check that
 *   - the fingerprint is back to what it was before the duplicates went in,
 *   - a cursor walk from the first pair sees exactly 1->1 followed by 3->3.
 */
void test_dup_key_delete(int n, int mode) {
    printf("test_dup_key_delete:%d %x\n", n, mode);

    PMA pma;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;
    int r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);
    pma_verify(pma, null_db);

    r = pma_set_dup_mode(pma, mode);
    assert(r == 0);
    if (mode & DB_DUPSORT) {
        r = pma_set_dup_compare(pma, default_compare_fun);
        assert(r == 0);
    }

    /* the two pairs that must survive the delete: 1->1 and 3->3 */
    int netkey, value;
    netkey = htonl(1);
    value = 1;
    do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    netkey = htonl(3);
    value = 3;
    do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    /* remember the fingerprint before any duplicate is added */
    u_int32_t sum_before_all_the_duplicates = sum;

    /* the duplicates: 2->0, 2->1, .. 2->n-1 */
    int nth;
    for (nth = 0; nth < n; nth++) {
        netkey = htonl(2);
        value = nth;
        do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
        pma_verify(pma, null_db);
    }

    /* one delete of key 2; it may only fail when nothing was inserted under it */
    DBT key, val;
    netkey = htonl(2);
    r = pma_delete(pma, fill_dbt(&key, &netkey, sizeof netkey), null_db, rand4fingerprint, &sum);
    if (r != 0)
        assert(n == 0);
    expect_fingerprint = sum_before_all_the_duplicates;
    assert(sum == expect_fingerprint);
    pma_verify(pma, null_db);
    pma_verify_fingerprint(pma, rand4fingerprint, sum);

    /* cursor walk should find keys 1, 3 and nothing in between */
    PMA_CURSOR walker;
    r = pma_cursor(pma, &walker);
    assert(r == 0);
    r = pma_cursor_set_position_first(walker);
    assert(r == 0);

    const int survivors[2] = { 1, 3 };
    int which;
    for (which = 0; which < 2; which++) {
        if (which > 0) {
            /* step from the first survivor to the second */
            r = pma_cursor_set_position_next(walker);
            assert(r == 0);
        }
        netkey = htonl(survivors[which]);
        value = survivors[which];

        init_dbt(&key); key.flags = DB_DBT_MALLOC;
        init_dbt(&val); val.flags = DB_DBT_MALLOC;
        r = pma_cursor_get_current(walker, &key, &val);
        assert(r == 0);

        int got_key, got_value;
        assert(key.size == sizeof got_key);
        memcpy(&got_key, key.data, key.size);
        assert(netkey == got_key);
        assert(val.size == sizeof got_value);
        memcpy(&got_value, val.data, val.size);
        assert(value == got_value);

        toku_free(key.data);
        toku_free(val.data);
    }

    r = pma_cursor_free(&walker);
    assert(r == 0);
    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * qsort comparator used by test_dupsort_key_insert: orders two ints by their
 * raw bytes.  The test stores its values through htonl(), and it expects the
 * pma to hand them back in this byte-wise order (presumably the order that
 * default_compare_fun imposes -- confirm against its definition).
 */
static int dupsort_value_bytes_cmp(const void *a, const void *b) {
    return memcmp(a, b, sizeof (int));
}

/*
 * Insert n random values under a single key into a DB_DUP+DB_DUPSORT pma
 * (bracketed by the pairs 1->1 and 3->3), then walk a cursor over that key
 * and check that the values come back in sorted order.
 */
void test_dupsort_key_insert(int n) {
    /* label the output with this test's own name so logs are unambiguous */
    printf("test_dupsort_key_insert:%d\n", n);
    PMA pma;
    int r;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);
    pma_verify(pma, null_db);

    r = pma_set_dup_mode(pma, DB_DUP+DB_DUPSORT);
    assert(r == 0);
    r = pma_set_dup_compare(pma, default_compare_fun);
    assert(r == 0);

    DBT key, val;
    int k, v;

    /* insert 1->1, 3->3 */
    k = htonl(1); v = 1;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);
    k = htonl(3); v = 3;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    /* a variable length array must have at least one element, even for n == 0 */
    int values[n > 0 ? n : 1];
    int i;

    /* insert 2->random, n times, remembering every value that went in */
    for (i=0; i<n; i++) {
        k = htonl(2);
        values[i] = htonl(random());
        do_insert(pma, &k, sizeof k, &values[i], sizeof values[i], rand4fingerprint, &sum, &expect_fingerprint);
        pma_verify(pma, null_db);
    }

    /* cursor walk from key 2 should find the inserted values in sorted order */
    PMA_CURSOR cursor;
    r = pma_cursor(pma, &cursor);
    assert(r == 0);
    k = htonl(2);
    fill_dbt(&key, &k, sizeof k);
    r = pma_cursor_set_key(cursor, &key, 0);
    if (r != 0) {
        /* the key can only be missing when no duplicates were inserted */
        assert(n == 0);
    } else {
        /* sort our copy the same way the walk is expected to be ordered */
        qsort(values, n, sizeof (int), dupsort_value_bytes_cmp);
        i = 0;
        while (1) {
            init_dbt(&key); key.flags = DB_DBT_MALLOC;
            init_dbt(&val); val.flags = DB_DBT_MALLOC;
            r = pma_cursor_get_current(cursor, &key, &val);
            assert(r == 0);
            int kk;
            assert(key.size == sizeof kk);
            memcpy(&kk, key.data, key.size);
            if (k != kk) {
                /* walked past the last duplicate of key 2 */
                toku_free(key.data);
                toku_free(val.data);
                break;
            }
            int vv;
            assert(val.size == sizeof vv);
            memcpy(&vv, val.data, val.size);
            assert(vv == values[i]);
            toku_free(key.data);
            toku_free(val.data);
            i += 1;
            r = pma_cursor_set_position_next(cursor);
            if (r != 0)
                break;
        }
        assert(i == n);
    }

    r = pma_cursor_free(&cursor);
    assert(r == 0);
    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * Insert n values under key 2 (bracketed by the pairs 1->1 and 3->3) into a
 * pma using the given duplicate mode, then check that a plain lookup of
 * key 2 yields the value 0, i.e. the first one inserted, which is also the
 * smallest one.
 */
void test_dup_key_lookup(int n, int mode) {
    printf("test_dup_lookup:%d %d\n", n, mode);

    PMA pma;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;
    int r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);
    pma_verify(pma, null_db);

    r = pma_set_dup_mode(pma, mode);
    assert(r == 0);
    if (mode & DB_DUPSORT) {
        r = pma_set_dup_compare(pma, default_compare_fun);
        assert(r == 0);
    }

    /* neighbours on either side of the duplicated key: 1->1 and 3->3 */
    int netkey, value;
    netkey = htonl(1);
    value = 1;
    do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    netkey = htonl(3);
    value = 3;
    do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    /* the duplicates: 2->htonl(0), 2->htonl(1), .. 2->htonl(n-1) */
    int nth;
    for (nth = 0; nth < n; nth++) {
        netkey = htonl(2);
        value = htonl(nth);
        do_insert(pma, &netkey, sizeof netkey, &value, sizeof value, rand4fingerprint, &sum, &expect_fingerprint);
        pma_verify(pma, null_db);
    }

    /* lookup should find the first insert and smallest value */
    DBT key, val;
    netkey = htonl(2);
    DBT *keyp = fill_dbt(&key, &netkey, sizeof netkey);
    DBT *valp = fill_dbt(&val, &value, sizeof value);
    r = pma_lookup(pma, keyp, valp, null_db);
    assert(r == 0);

    int got_key;
    assert(key.size == sizeof netkey);
    memcpy(&got_key, key.data, key.size);
    assert((unsigned int) got_key == htonl(2));

    int got_value;
    assert(val.size == sizeof value);
    memcpy(&got_value, val.data, val.size);
    assert(got_value == 0);

    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * Run every duplicate-key test in a fixed order.  After each test,
 * memory_check_all_free() is called before the next one starts.
 */
void test_dup() {
    /* duplicates rejected when no dup mode is set */
    test_nodup_key_insert(2);
    memory_check_all_free();

    /* unsorted duplicates: insert */
    test_dup_key_insert(0);
    memory_check_all_free();
    test_dup_key_insert(2);
    memory_check_all_free();
    test_dup_key_insert(1000);
    memory_check_all_free();

    /* unsorted duplicates: delete */
    test_dup_key_delete(0, DB_DUP);
    memory_check_all_free();
    test_dup_key_delete(1000, DB_DUP);
    memory_check_all_free();

    /* sorted duplicates: insert */
    test_dupsort_key_insert(2);
    memory_check_all_free();
    test_dupsort_key_insert(1000);
    memory_check_all_free();

    /* sorted duplicates: delete */
    test_dup_key_delete(0, DB_DUP+DB_DUPSORT);
    memory_check_all_free();
    test_dup_key_delete(1000, DB_DUP+DB_DUPSORT);
    memory_check_all_free();

    /* lookup in both duplicate modes */
    test_dup_key_lookup(32, DB_DUP);
    memory_check_all_free();
    test_dup_key_lookup(32, DB_DUP+DB_DUPSORT);
    memory_check_all_free();
}
void pma_tests (void) { void pma_tests (void) {
memory_check=1; memory_check=1;
test_keycompare(); memory_check_all_free(); test_keycompare(); memory_check_all_free();
...@@ -1840,8 +2322,10 @@ void pma_tests (void) { ...@@ -1840,8 +2322,10 @@ void pma_tests (void) {
test_pma_find(); memory_check_all_free(); test_pma_find(); memory_check_all_free();
test_calculate_parameters(); memory_check_all_free(); test_calculate_parameters(); memory_check_all_free();
test_count_region(); memory_check_all_free(); test_count_region(); memory_check_all_free();
test_pma_random_pick(); memory_check_all_free(); test_pma_random_pick(); memory_check_all_free();
test_pma_cursor(); memory_check_all_free(); test_pma_cursor(); memory_check_all_free();
test_pma_split(); memory_check_all_free(); test_pma_split(); memory_check_all_free();
test_pma_bulk_insert(); memory_check_all_free(); test_pma_bulk_insert(); memory_check_all_free();
test_pma_insert_or_replace(); memory_check_all_free(); test_pma_insert_or_replace(); memory_check_all_free();
...@@ -1851,6 +2335,7 @@ void pma_tests (void) { ...@@ -1851,6 +2335,7 @@ void pma_tests (void) {
test_pma_cursor_set_range(); memory_check_all_free(); test_pma_cursor_set_range(); memory_check_all_free();
test_pma_cursor_delete_under(); memory_check_all_free(); test_pma_cursor_delete_under(); memory_check_all_free();
test_pma_cursor_set_both(); memory_check_all_free(); test_pma_cursor_set_both(); memory_check_all_free();
test_dup();
} }
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
......
...@@ -19,6 +19,54 @@ ...@@ -19,6 +19,54 @@
/* get KEY_VALUE_OVERHEAD */ /* get KEY_VALUE_OVERHEAD */
#include "brt-internal.h" #include "brt-internal.h"
/**************************** static functions forward declarations. *********************/
/*
 * Finish a deletion from the pma.  Called when there are no cursor
 * references to the kv pair at index "here".
 */
static void __pma_delete_finish(PMA pma, int here);
/*
 * Resize the pma array to asksize.  Zero all array entries starting from
 * index startx.
 */
static int __pma_resize_array(PMA pma, int asksize, int startx);
/*
 * Extract pairs from the pma in the window delimited by lo and hi.
 */
static struct kv_pair_tag *__pma_extract_pairs(PMA pma, int count, int lo, int hi);
/*
 * Update the cursors in a cursor set, given a set of n tagged pairs.
 */
static void __pma_update_cursors(PMA pma, struct list *cursorset, struct kv_pair_tag *tpairs, int n);
/*
 * Update this pma's own cursors, given a set of n tagged pairs.
 */
static void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n);
/*
 * A deletion occurred at index "here" in the pma.  Rebalance the windows
 * around "here" and, if necessary, shrink the pma.
 */
static void __pma_delete_at(PMA pma, int here);
/*
 * If the pma entry at "here" is marked deleted and there are no more cursor
 * references to it, finish the deletion.
 */
static void __pma_delete_resume(PMA pma, int here);
/*
 * Count the number of cursors that reference the pma pair at index "here".
 */
static int __pma_count_cursor_refs(PMA pma, int here);
/**************************** end of static functions forward declarations. *********************/
#ifndef PMA_USE_MEMPOOL #ifndef PMA_USE_MEMPOOL
#define PMA_USE_MEMPOOL 1 #define PMA_USE_MEMPOOL 1
#endif #endif
...@@ -179,6 +227,130 @@ void pma_show_stats (void) { ...@@ -179,6 +227,130 @@ void pma_show_stats (void) {
printf("%d finds, %d divides, %d scans\n", pma_count_finds, pma_count_divides, pma_count_scans); printf("%d finds, %d divides, %d scans\n", pma_count_finds, pma_count_divides, pma_count_scans);
} }
/*
 * Binary search of pma->pairs[lo, hi) for any pair whose key compares equal
 * to k under pma->compare_fun.
 *
 * On a match, *found is set to 1 and the index of the matching pair is
 * returned.  Otherwise *found is set to 0 and the lower bound at which the
 * search window became empty is returned.
 */
static int __pma_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi) {
            /* nothing left to look at */
            *found = 0;
            return lo;
        }

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* the midpoint may be an empty slot: slide right to the next pair in use */
        int slot = mi;
        while (slot < hi && !kv_pair_inuse(pma->pairs[slot]))
            slot++;
        if (slot >= hi) {
            /* [mi, hi) holds no pairs at all, so only [lo, mi) is left */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[slot]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp == 0) {
            *found = 1;
            return slot;
        }
        if (cmp > 0)
            lo = slot + 1;      /* k sorts after this pair */
        else
            hi = slot;          /* k sorts before this pair */
    }
}
/*
 * Binary search of pma->pairs[lo, hi) for the rightmost pair whose key
 * compares equal to k under pma->compare_fun.
 *
 * On a match, *found is set to 1 and the index of the rightmost matching
 * pair is returned.  Otherwise *found is set to 0 and the lower bound at
 * which the search window became empty is returned.
 */
static int __pma_right_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    int matched = 0;        /* has any matching pair been seen? */
    int rightmost = 0;      /* index of the latest match; valid only if matched */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* the midpoint may be an empty slot: slide right to the next pair in use */
        int slot = mi;
        while (slot < hi && !kv_pair_inuse(pma->pairs[slot]))
            slot++;
        if (slot >= hi) {
            /* [mi, hi) holds no pairs at all, so only [lo, mi) is left */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[slot]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp < 0) {
            hi = slot;
            continue;
        }
        /* on a match, remember it and keep looking further to the right */
        if (cmp == 0) {
            matched = 1;
            rightmost = slot;
        }
        lo = slot + 1;
    }

    *found = matched;
    return matched ? rightmost : lo;
}
/*
 * Binary search of pma->pairs[lo, hi) for the leftmost pair whose key
 * compares equal to k under pma->compare_fun.
 *
 * On a match, *found is set to 1 and the index of the leftmost matching
 * pair is returned.  Otherwise *found is set to 0 and the lower bound at
 * which the search window became empty is returned.
 */
static int __pma_left_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    int matched = 0;        /* has any matching pair been seen? */
    int leftmost = 0;       /* index of the latest match; valid only if matched */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* the midpoint may be an empty slot: slide right to the next pair in use */
        int slot = mi;
        while (slot < hi && !kv_pair_inuse(pma->pairs[slot]))
            slot++;
        if (slot >= hi) {
            /* [mi, hi) holds no pairs at all, so only [lo, mi) is left */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[slot]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp > 0) {
            lo = slot + 1;
            continue;
        }
        /* on a match, remember it and keep looking further to the left */
        if (cmp == 0) {
            matched = 1;
            leftmost = slot;
        }
        hi = slot;
    }

    *found = matched;
    return matched ? leftmost : lo;
}
/*
 * Binary search of pma->pairs[lo, hi) for the rightmost pair that matches
 * both the key k (under pma->compare_fun) and the value v (under
 * pma->dup_compare_fun, consulted only when the keys compare equal).
 *
 * On a match, *found is set to 1 and the index of the rightmost matching
 * pair is returned.  Otherwise *found is set to 0 and the lower bound at
 * which the search window became empty is returned.
 */
static int __pma_dup_search(PMA pma, DBT *k, DBT *v, DB *db, int lo, int hi, int *found) {
    int matched = 0;        /* has any matching pair been seen? */
    int rightmost = 0;      /* index of the latest match; valid only if matched */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* the midpoint may be an empty slot: slide right to the next pair in use */
        int slot = mi;
        while (slot < hi && !kv_pair_inuse(pma->pairs[slot]))
            slot++;
        if (slot >= hi) {
            /* [mi, hi) holds no pairs at all, so only [lo, mi) is left */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[slot]);
        DBT k2, v2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp == 0)
            /* equal keys: the values decide the order */
            cmp = pma->dup_compare_fun(db, v, fill_dbt(&v2, kv_pair_val(kv), kv_pair_vallen(kv)));
        if (cmp < 0) {
            hi = slot;
            continue;
        }
        /* on a match, remember it and keep looking further to the right */
        if (cmp == 0) {
            matched = 1;
            rightmost = slot;
        }
        lo = slot + 1;
    }

    *found = matched;
    return matched ? rightmost : lo;
}
// Return the smallest index such that no lower index contains a larger key. // Return the smallest index such that no lower index contains a larger key.
// This will be in the range 0 (inclusive) to pma_index_limit(pma) (inclusive). // This will be in the range 0 (inclusive) to pma_index_limit(pma) (inclusive).
// Thus the returned index may not be a valid index into the array if it is == pma_index_limit(pma) // Thus the returned index may not be a valid index into the array if it is == pma_index_limit(pma)
...@@ -186,6 +358,7 @@ void pma_show_stats (void) { ...@@ -186,6 +358,7 @@ void pma_show_stats (void) {
// For example: if the array is full of small keys, that means we return pma_index_limit(pma), which is off the end of the array. // For example: if the array is full of small keys, that means we return pma_index_limit(pma), which is off the end of the array.
// For example: if the array is full of large keys, then we return 0. // For example: if the array is full of large keys, then we return 0.
int pmainternal_find (PMA pma, DBT *k, DB *db) { int pmainternal_find (PMA pma, DBT *k, DB *db) {
#if 1
int lo=0, hi=pma_index_limit(pma); int lo=0, hi=pma_index_limit(pma);
/* lo and hi are the minimum and maximum values (inclusive) that we could possibly return. */ /* lo and hi are the minimum and maximum values (inclusive) that we could possibly return. */
pma_count_finds++; pma_count_finds++;
...@@ -233,6 +406,17 @@ int pmainternal_find (PMA pma, DBT *k, DB *db) { ...@@ -233,6 +406,17 @@ int pmainternal_find (PMA pma, DBT *k, DB *db) {
} }
#endif #endif
return lo; return lo;
#else
int found, lo;
lo = __pma_search(pma, k, db, 0, pma->N, &found);
if (lo>0 && lo < pma_index_limit(pma) && pma->pairs[lo]) {
//printf("lo=%d\n", lo);
DBT k2;
assert(0 >= pma->compare_fun(db, k, fill_dbt(&k2, pma->pairs[lo]->key, pma->pairs[lo]->keylen)));
}
return lo;
#endif
} }
//int min (int i, int j) { if (i<j) return i; else return j; } //int min (int i, int j) { if (i<j) return i; else return j; }
...@@ -264,7 +448,7 @@ void print_pma (PMA pma) { ...@@ -264,7 +448,7 @@ void print_pma (PMA pma) {
} }
/* Smooth the data, and return the location of the null. */ /* Smooth the data, and return the location of the null. */
int distribute_data (struct kv_pair *destpairs[], int dcount, static int distribute_data (struct kv_pair *destpairs[], int dcount,
struct kv_pair_tag sourcepairs[], int scount, PMA pma) { struct kv_pair_tag sourcepairs[], int scount, PMA pma) {
assert(scount<=dcount); assert(scount<=dcount);
if (scount==0) { if (scount==0) {
...@@ -335,7 +519,7 @@ int pmainternal_smooth_region (struct kv_pair *pairs[], int n, int idx, int base ...@@ -335,7 +519,7 @@ int pmainternal_smooth_region (struct kv_pair *pairs[], int n, int idx, int base
} }
} }
int lg (int n) { int toku_lg (int n) {
int result=0; int result=0;
int two_to_result = 1; int two_to_result = 1;
while (two_to_result<n) { while (two_to_result<n) {
...@@ -348,7 +532,7 @@ int lg (int n) { ...@@ -348,7 +532,7 @@ int lg (int n) {
/* Calculate densitysteps and uplgN, given N. */ /* Calculate densitysteps and uplgN, given N. */
void pmainternal_calculate_parameters (PMA pma) { void pmainternal_calculate_parameters (PMA pma) {
int N = pma_index_limit(pma); int N = pma_index_limit(pma);
int lgN = lg(N); int lgN = toku_lg(N);
int n_divisions=0; int n_divisions=0;
//printf("N=%d lgN=%d\n", N, lgN); //printf("N=%d lgN=%d\n", N, lgN);
while (N/2>=lgN) { while (N/2>=lgN) {
...@@ -371,10 +555,11 @@ int pmainternal_count_region (struct kv_pair *pairs[], int lo, int hi) { ...@@ -371,10 +555,11 @@ int pmainternal_count_region (struct kv_pair *pairs[], int lo, int hi) {
return n; return n;
} }
int pma_create(PMA *pma, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxsize) { int pma_create(PMA *pma, pma_compare_fun_t compare_fun, int maxsize) {
int error; int error;
TAGMALLOC(PMA, result); TAGMALLOC(PMA, result);
if (result==0) return -1; if (result==0) return -1;
result->dup_mode = 0;
result->n_pairs_present = 0; result->n_pairs_present = 0;
result->pairs = 0; result->pairs = 0;
list_init(&result->cursors); list_init(&result->cursors);
...@@ -401,19 +586,18 @@ int pma_create(PMA *pma, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxs ...@@ -401,19 +586,18 @@ int pma_create(PMA *pma, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxs
} }
/* find the smallest power of 2 >= n */ /* find the smallest power of 2 >= n */
int __pma_array_size(PMA pma __attribute__((unused)), int asksize) { static int __pma_array_size(PMA pma __attribute__((unused)), int asksize) {
int n = PMA_MIN_ARRAY_SIZE; int n = PMA_MIN_ARRAY_SIZE;
while (n < asksize) while (n < asksize)
n *= 2; n *= 2;
return n; return n;
} }
int __pma_resize_array(PMA pma, int asksize, int startz) { static int __pma_resize_array(PMA pma, int asksize, int startz) {
int i; int i;
int n; int n;
n = __pma_array_size(pma, asksize); n = __pma_array_size(pma, asksize);
// printf("pma_resize %d -> %d\n", pma->N, n);
pma->N = n; pma->N = n;
if (pma->pairs == 0) if (pma->pairs == 0)
pma->pairs = toku_malloc((1 + pma->N) * sizeof (struct kv_pair *)); pma->pairs = toku_malloc((1 + pma->N) * sizeof (struct kv_pair *));
...@@ -430,6 +614,18 @@ int __pma_resize_array(PMA pma, int asksize, int startz) { ...@@ -430,6 +614,18 @@ int __pma_resize_array(PMA pma, int asksize, int startz) {
return 0; return 0;
} }
/*
 * Set the duplicate-key mode of the pma.
 * dup_mode must be 0, DB_DUP, or DB_DUP+DB_DUPSORT; any other value trips
 * the assertion.  Always returns 0.
 */
int pma_set_dup_mode(PMA pma, int dup_mode) {
assert(dup_mode == 0 || dup_mode == DB_DUP || dup_mode == (DB_DUP+DB_DUPSORT));
pma->dup_mode = dup_mode;
return 0;
}
/*
 * Install the comparison function stored in pma->dup_compare_fun.
 * The pma's dup_mode must already include DB_DUPSORT, so pma_set_dup_mode
 * has to be called first; otherwise the assertion trips.  Always returns 0.
 */
int pma_set_dup_compare(PMA pma, pma_compare_fun_t dup_compare_fun) {
assert(pma->dup_mode & DB_DUPSORT);
pma->dup_compare_fun = dup_compare_fun;
return 0;
}
int pma_cursor (PMA pma, PMA_CURSOR *cursp) { int pma_cursor (PMA pma, PMA_CURSOR *cursp) {
PMA_CURSOR MALLOC(curs); PMA_CURSOR MALLOC(curs);
assert(curs!=0); assert(curs!=0);
...@@ -530,7 +726,11 @@ int pma_cursor_get_current(PMA_CURSOR c, DBT *key, DBT *val) { ...@@ -530,7 +726,11 @@ int pma_cursor_get_current(PMA_CURSOR c, DBT *key, DBT *val) {
int pma_cursor_set_key(PMA_CURSOR c, DBT *key, DB *db) { int pma_cursor_set_key(PMA_CURSOR c, DBT *key, DB *db) {
PMA pma = c->pma; PMA pma = c->pma;
int here = pmainternal_find(pma, key, db); int here, found;
if (pma->dup_mode & DB_DUP) {
here = __pma_left_search(pma, key, db, 0, pma->N, &found);
} else
here = pmainternal_find(pma, key, db);
assert(0<=here ); assert(here<=pma_index_limit(pma)); assert(0<=here ); assert(here<=pma_index_limit(pma));
int r = DB_NOTFOUND; int r = DB_NOTFOUND;
if (here < pma->N) { if (here < pma->N) {
...@@ -567,7 +767,11 @@ int pma_cursor_set_both(PMA_CURSOR c, DBT *key, DBT *val, DB *db) { ...@@ -567,7 +767,11 @@ int pma_cursor_set_both(PMA_CURSOR c, DBT *key, DBT *val, DB *db) {
int pma_cursor_set_range(PMA_CURSOR c, DBT *key, DB *db) { int pma_cursor_set_range(PMA_CURSOR c, DBT *key, DB *db) {
PMA pma = c->pma; PMA pma = c->pma;
int here = pmainternal_find(pma, key, db); int here, found;
if (pma->dup_mode & DB_DUP)
here = __pma_left_search(pma, key, db, 0, pma->N, &found);
else
here = pmainternal_find(pma, key, db);
assert(0<=here ); assert(here<=pma_index_limit(pma)); assert(0<=here ); assert(here<=pma_index_limit(pma));
/* find the first valid pair where key[here] >= key */ /* find the first valid pair where key[here] >= key */
...@@ -649,21 +853,13 @@ int pmainternal_make_space_at (PMA pma, int idx) { ...@@ -649,21 +853,13 @@ int pmainternal_make_space_at (PMA pma, int idx) {
break; break;
if (lo==0 && hi==pma_index_limit(pma)) { if (lo==0 && hi==pma_index_limit(pma)) {
/* The array needs to be doubled in size. */ /* The array needs to be doubled in size. */
#if 0
int i;
#endif
assert(size==pma_index_limit(pma)); assert(size==pma_index_limit(pma));
size*=2; size*=2;
#if 0
pma->pairs = toku_realloc(pma->pairs, (1+size)*sizeof(struct kv_pair *));
for (i=hi; i<size; i++) pma->pairs[i]=0;
pma->pairs[size] = (void*)0xdeadbeefL;
pma->N=size;
pmainternal_calculate_parameters(pma);
#else
// printf("pma_make_space_realloc %d to %d hi %d\n", pma->N, size, hi); // printf("pma_make_space_realloc %d to %d hi %d\n", pma->N, size, hi);
__pma_resize_array(pma, size, hi); __pma_resize_array(pma, size, hi);
#endif
hi=size; hi=size;
//printf("doubled N\n"); //printf("doubled N\n");
break; break;
...@@ -681,12 +877,16 @@ int pmainternal_make_space_at (PMA pma, int idx) { ...@@ -681,12 +877,16 @@ int pmainternal_make_space_at (PMA pma, int idx) {
} }
enum pma_errors pma_lookup (PMA pma, DBT *k, DBT *v, DB *db) { enum pma_errors pma_lookup (PMA pma, DBT *k, DBT *v, DB *db) {
int here, found;
if (pma->dup_mode & DB_DUP) {
here = __pma_left_search(pma, k, db, 0, pma->N, &found);
} else
here = pmainternal_find(pma, k, db);
assert(0<=here ); assert(here<=pma_index_limit(pma));
if (here==pma_index_limit(pma)) return DB_NOTFOUND;
DBT k2; DBT k2;
struct kv_pair *pair; struct kv_pair *pair;
int l = pmainternal_find(pma, k, db); pair = pma->pairs[here];
assert(0<=l ); assert(l<=pma_index_limit(pma));
if (l==pma_index_limit(pma)) return DB_NOTFOUND;
pair = pma->pairs[l];
if (kv_pair_valid(pair) && pma->compare_fun(db, k, fill_dbt(&k2, pair->key, pair->keylen))==0) { if (kv_pair_valid(pair) && pma->compare_fun(db, k, fill_dbt(&k2, pair->key, pair->keylen))==0) {
return ybt_set_value(v, pair->key + pair->keylen, pair->vallen, &pma->sval); return ybt_set_value(v, pair->key + pair->keylen, pair->vallen, &pma->sval);
} else { } else {
...@@ -727,8 +927,20 @@ int pma_free (PMA *pmap) { ...@@ -727,8 +927,20 @@ int pma_free (PMA *pmap) {
} }
/* Copies keylen and datalen */ /* Copies keylen and datalen */
int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, diskoff diskoff) { /* returns an error if the key is already present. */
int idx = pmainternal_find(pma, k, db); int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, DISKOFF diskoff, u_int32_t rand4fingerprint, u_int32_t *fingerprint) {
int found, idx;
if (pma->dup_mode & DB_DUPSORT) {
idx = __pma_dup_search(pma, k, v, db, 0, pma->N, &found);
if (found)
idx += 1;
} else if (pma->dup_mode & DB_DUP) {
idx = __pma_right_search(pma, k, db, 0, pma->N, &found);
if (found)
idx += 1;
} else {
idx = pmainternal_find(pma, k, db);
if (idx < pma_index_limit(pma) && pma->pairs[idx]) { if (idx < pma_index_limit(pma) && pma->pairs[idx]) {
DBT k2; DBT k2;
struct kv_pair *kv = kv_pair_ptr(pma->pairs[idx]); struct kv_pair *kv = kv_pair_ptr(pma->pairs[idx]);
...@@ -737,43 +949,95 @@ int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, diskoff diskoff) { ...@@ -737,43 +949,95 @@ int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, diskoff diskoff) {
pma_mfree_kv_pair(pma, pma->pairs[idx]); pma_mfree_kv_pair(pma, pma->pairs[idx]);
pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size); pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size);
assert(pma->pairs[idx]); assert(pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
int r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]); int r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]);
return r; return r;
} else } else
return BRT_ALREADY_THERE; /* It is already here. Return an error. */ return BRT_ALREADY_THERE; /* It is already here. Return an error. */
} }
} }
}
if (kv_pair_inuse(pma->pairs[idx])) { if (kv_pair_inuse(pma->pairs[idx])) {
idx = pmainternal_make_space_at (pma, idx); /* returns the new idx. */ idx = pmainternal_make_space_at (pma, idx); /* returns the new idx. */
} }
assert(0 <= idx && idx < pma->N);
assert(!kv_pair_inuse(pma->pairs[idx])); assert(!kv_pair_inuse(pma->pairs[idx]));
pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size); pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size);
assert(pma->pairs[idx]); assert(pma->pairs[idx]);
pma->n_pairs_present++; pma->n_pairs_present++;
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]); return tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]);
} }
int pma_delete (PMA pma, DBT *k, DB *db) { /* find the next matching key in the pma starting from index here */
int l; static int pma_next_key(PMA pma, DBT *k, DB *db, int here, int n, int *found) {
assert(0 <= here);
*found = 0;
while (here < n && !kv_pair_inuse(pma->pairs[here]))
here += 1;
if (here < n) {
struct kv_pair *kv = kv_pair_ptr(pma->pairs[here]);
DBT k2;
if (0 == pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv))))
*found = 1;
}
return here;
}
/*
 * Delete every pair whose key matches k from a pma that allows duplicates.
 *
 * Starting at the leftmost match, each still-valid matching pair has its
 * contribution subtracted from *fingerprint and is marked deleted; if no
 * cursor references it, its memory is released and the slot is cleared.
 * Afterwards __pma_delete_at is called once on the midpoint between the
 * leftmost match and the index where the scan stopped.
 *
 * Returns BRT_OK when the left search found a matching key, DB_NOTFOUND
 * otherwise.
 */
static int pma_delete_dup (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
    /* find the left most matching key in the pma */
    int found;
    int first = __pma_left_search(pma, k, db, 0, pma->N, &found);
    if (!found)
        return DB_NOTFOUND;

    int pos = first;
    int more;
    do {
        struct kv_pair *pair = pma->pairs[pos];
        if (kv_pair_valid(pair)) {
            /* take the pair out of the fingerprint and mark it as deleted */
            *fingerprint -= rand4sem*toku_calccrc32_kvpair (kv_pair_key_const(pair), kv_pair_keylen(pair),
                                                            kv_pair_val_const(pair), kv_pair_vallen(pair));
            pma->pairs[pos] = kv_pair_set_deleted(pair);
            if (__pma_count_cursor_refs(pma, pos) == 0) {
                /* no cursor is parked on it: the slot can be emptied right away */
                pma_mfree_kv_pair(pma, pair);
                pma->pairs[pos] = 0;
                pma->n_pairs_present--;
            }
        }
        /* move on to the next pair in use; stop once its key no longer matches */
        pos = pma_next_key(pma, k, db, pos+1, pma->N, &more);
    } while (more);

    /* check the density of the region centered around the deleted pairs */
    __pma_delete_at(pma, (first + pos) / 2);
    return BRT_OK;
}
l = pmainternal_find(pma, k, db); static int pma_delete_nodup (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
struct kv_pair *kv = pma->pairs[l]; int idx = pmainternal_find(pma, k, db);
struct kv_pair *kv = pma->pairs[idx];
if (!kv_pair_valid(kv)) { if (!kv_pair_valid(kv)) {
if (0) printf("%s:%d l=%d r=%d\n", __FILE__, __LINE__, l, DB_NOTFOUND); if (0) printf("%s:%d l=%d r=%d\n", __FILE__, __LINE__, idx, DB_NOTFOUND);
return DB_NOTFOUND; return DB_NOTFOUND;
} }
pma->pairs[l] = kv_pair_set_deleted(kv); *fingerprint -= rand4sem*toku_calccrc32_kvpair (kv_pair_key_const(kv), kv_pair_keylen(kv), kv_pair_val_const(kv), kv_pair_vallen(kv));
if (__pma_count_cursor_refs(pma, l) == 0) pma->pairs[idx] = kv_pair_set_deleted(kv);
__pma_delete_finish(pma, l); if (__pma_count_cursor_refs(pma, idx) == 0)
__pma_delete_finish(pma, idx);
return BRT_OK; return BRT_OK;
} }
/*
 * Delete key k from the pma, adjusting *fingerprint for what is removed.
 * Dispatches on the pma's duplicate mode: with DB_DUP set the work is done
 * by pma_delete_dup, otherwise by pma_delete_nodup.  The result of the
 * chosen helper is returned unchanged.
 */
int pma_delete (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
    if (!(pma->dup_mode & DB_DUP))
        return pma_delete_nodup(pma, k, db, rand4sem, fingerprint);
    return pma_delete_dup(pma, k, db, rand4sem, fingerprint);
}
void __pma_delete_resume(PMA pma, int here) { void __pma_delete_resume(PMA pma, int here) {
if (here >= 0 && kv_pair_deleted(pma->pairs[here]) &&__pma_count_cursor_refs(pma, here) == 0) if (here >= 0 && kv_pair_deleted(pma->pairs[here]) &&__pma_count_cursor_refs(pma, here) == 0)
__pma_delete_finish(pma, here); __pma_delete_finish(pma, here);
} }
void __pma_delete_finish(PMA pma, int here) { static void __pma_delete_finish(PMA pma, int here) {
struct kv_pair *kv = pma->pairs[here]; struct kv_pair *kv = pma->pairs[here];
if (!kv_pair_inuse(kv)) if (!kv_pair_inuse(kv))
return; return;
...@@ -783,7 +1047,7 @@ void __pma_delete_finish(PMA pma, int here) { ...@@ -783,7 +1047,7 @@ void __pma_delete_finish(PMA pma, int here) {
__pma_delete_at(pma, here); __pma_delete_at(pma, here);
} }
void __pma_delete_at(PMA pma, int here) { static void __pma_delete_at(PMA pma, int here) {
int size; int size;
int count; int count;
struct kv_pair_tag *newpairs; struct kv_pair_tag *newpairs;
...@@ -854,7 +1118,8 @@ void __pma_delete_at(PMA pma, int here) { ...@@ -854,7 +1118,8 @@ void __pma_delete_at(PMA pma, int here) {
int pma_insert_or_replace (PMA pma, DBT *k, DBT *v, int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */ int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */
DB *db, TOKUTXN txn, diskoff diskoff) { DB *db, TOKUTXN txn, DISKOFF diskoff,
u_int32_t rand4fingerprint, u_int32_t *fingerprint) {
//printf("%s:%d v->size=%d\n", __FILE__, __LINE__, v->size); //printf("%s:%d v->size=%d\n", __FILE__, __LINE__, v->size);
int idx = pmainternal_find(pma, k, db); int idx = pmainternal_find(pma, k, db);
struct kv_pair *kv; struct kv_pair *kv;
...@@ -866,6 +1131,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v, ...@@ -866,6 +1131,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
if (0==pma->compare_fun(db, k, fill_dbt(&k2, kv->key, kv->keylen))) { if (0==pma->compare_fun(db, k, fill_dbt(&k2, kv->key, kv->keylen))) {
if (!kv_pair_deleted(pma->pairs[idx])) { if (!kv_pair_deleted(pma->pairs[idx])) {
*replaced_v_size = kv->vallen; *replaced_v_size = kv->vallen;
*fingerprint -= rand4fingerprint*toku_calccrc32_kvpair(kv_pair_key_const(kv), kv_pair_keylen(kv), kv_pair_val_const(kv), kv_pair_vallen(kv));
r=tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, kv); r=tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, kv);
if (r!=0) return r; if (r!=0) return r;
} }
...@@ -877,6 +1143,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v, ...@@ -877,6 +1143,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
assert(pma->pairs[idx]); assert(pma->pairs[idx]);
} }
r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]); r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return r; return r;
} }
} }
...@@ -891,6 +1158,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v, ...@@ -891,6 +1158,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
*replaced_v_size = -1; *replaced_v_size = -1;
//printf("%s:%d txn=%p\n", __FILE__, __LINE__, txn); //printf("%s:%d txn=%p\n", __FILE__, __LINE__, txn);
r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]); r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return r; return r;
} }
...@@ -920,7 +1188,7 @@ int __pma_count_cursor_refs(PMA pma, int here) { ...@@ -920,7 +1188,7 @@ int __pma_count_cursor_refs(PMA pma, int here) {
return refs; return refs;
} }
void __pma_update_cursors_position(PMA pma, struct list *cursor_set, int oldposition, int newposition) { static void __pma_update_cursors_position(PMA pma, struct list *cursor_set, int oldposition, int newposition) {
struct list *list, *nextlist; struct list *list, *nextlist;
struct pma_cursor *cursor; struct pma_cursor *cursor;
...@@ -952,7 +1220,7 @@ void __pma_update_cursors(PMA pma, struct list *cursor_set, struct kv_pair_tag * ...@@ -952,7 +1220,7 @@ void __pma_update_cursors(PMA pma, struct list *cursor_set, struct kv_pair_tag *
} }
} }
void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) { static void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) {
if (list_empty(&pma->cursors)) if (list_empty(&pma->cursors))
return; return;
...@@ -967,7 +1235,7 @@ void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) { ...@@ -967,7 +1235,7 @@ void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) {
} }
} }
struct kv_pair_tag *__pma_extract_pairs(PMA pma, int npairs, int lo, int hi) { static struct kv_pair_tag *__pma_extract_pairs(PMA pma, int npairs, int lo, int hi) {
struct kv_pair_tag *pairs; struct kv_pair_tag *pairs;
int i; int i;
int lastpair; int lastpair;
...@@ -1007,8 +1275,8 @@ static void __pma_relocate_kvpairs(PMA pma) { ...@@ -1007,8 +1275,8 @@ static void __pma_relocate_kvpairs(PMA pma) {
#endif #endif
int pma_split(PMA origpma, unsigned int *origpma_size, int pma_split(PMA origpma, unsigned int *origpma_size,
PMA leftpma, unsigned int *leftpma_size, PMA leftpma, unsigned int *leftpma_size, u_int32_t leftrand4fp, u_int32_t *leftfingerprint,
PMA rightpma, unsigned int *rightpma_size) { PMA rightpma, unsigned int *rightpma_size, u_int32_t rightrand4fp, u_int32_t *rightfingerprint) {
int error; int error;
int npairs; int npairs;
struct kv_pair_tag *pairs; struct kv_pair_tag *pairs;
...@@ -1057,6 +1325,23 @@ int pma_split(PMA origpma, unsigned int *origpma_size, ...@@ -1057,6 +1325,23 @@ int pma_split(PMA origpma, unsigned int *origpma_size,
if (!list_empty(&origpma->cursors)) if (!list_empty(&origpma->cursors))
list_move(&cursors, &origpma->cursors); list_move(&cursors, &origpma->cursors);
{
u_int32_t sum = 0;
for (i=0; i<spliti; i++) {
sum+=toku_calccrc32_kvpair(kv_pair_key_const(pairs[i].pair), kv_pair_keylen(pairs[i].pair),
kv_pair_val_const(pairs[i].pair), kv_pair_vallen(pairs[i].pair));
}
*leftfingerprint += leftrand4fp * sum;
}
{
u_int32_t sum = 0;
for (i=spliti; i<npairs; i++) {
sum+=toku_calccrc32_kvpair(kv_pair_key_const(pairs[i].pair), kv_pair_keylen(pairs[i].pair),
kv_pair_val_const(pairs[i].pair), kv_pair_vallen(pairs[i].pair));
}
*rightfingerprint += rightrand4fp * sum;
}
/* put the first half of pairs into the left pma */ /* put the first half of pairs into the left pma */
n = spliti; n = spliti;
error = __pma_resize_array(leftpma, n + n/4, 0); error = __pma_resize_array(leftpma, n + n/4, 0);
...@@ -1119,7 +1404,7 @@ int pma_get_last(PMA pma, DBT *key, DBT *val) { ...@@ -1119,7 +1404,7 @@ int pma_get_last(PMA pma, DBT *key, DBT *val) {
return 0; return 0;
} }
void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) { static void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) {
int i; int i;
for (i=0; i<n; i++) for (i=0; i<n; i++)
...@@ -1127,10 +1412,11 @@ void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) { ...@@ -1127,10 +1412,11 @@ void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) {
pma_mfree_kv_pair(pma, pairs[i].pair); pma_mfree_kv_pair(pma, pairs[i].pair);
} }
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) { int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs, u_int32_t rand4fp, u_int32_t *sum) {
struct kv_pair_tag *newpairs; struct kv_pair_tag *newpairs;
int i; int i;
int error; int error;
u_int32_t delta=0;
if (n_newpairs == 0) if (n_newpairs == 0)
return 0; return 0;
...@@ -1146,6 +1432,7 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) { ...@@ -1146,6 +1432,7 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) {
} }
for (i=0; i<n_newpairs; i++) { for (i=0; i<n_newpairs; i++) {
delta += rand4fp*toku_calccrc32_kvpair (keys[i].data, keys[i].size, vals[i].data, vals[i].size);
#if PMA_USE_MEMPOOL #if PMA_USE_MEMPOOL
newpairs[i].pair = kv_pair_malloc_mempool(keys[i].data, keys[i].size, newpairs[i].pair = kv_pair_malloc_mempool(keys[i].data, keys[i].size,
vals[i].data, vals[i].size, &pma->kvspace); vals[i].data, vals[i].size, &pma->kvspace);
...@@ -1169,6 +1456,68 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) { ...@@ -1169,6 +1456,68 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) {
pma->n_pairs_present = n_newpairs; pma->n_pairs_present = n_newpairs;
toku_free(newpairs); toku_free(newpairs);
*sum += delta;
return 0; return 0;
} }
/* Check the ordering invariants of a pma; every violation trips an assert.
 *
 * Walks the slots in index order and compares each in-use pair with the
 * in-use pair that precedes it:
 *   dup_mode == 0          keys must be strictly increasing
 *   dup_mode & DB_DUP      keys must be non-decreasing
 *   dup_mode & DB_DUPSORT  for equal keys, values must be non-decreasing
 *                          according to dup_compare_fun
 * When PMA_USE_MEMPOOL is enabled, additionally checks that every in-use pair
 * lies inside the pma's kvspace memory pool.
 *
 * db is only passed through to the comparison functions.
 */
void pma_verify(PMA pma, DB *db) {
    struct kv_pair *prev = 0;  /* most recent in-use pair seen so far */
    int have_prev = 0;         /* becomes 1 once prev holds a real pair */
    int slot;

    for (slot = 0; slot < pma->N; slot++) {
        struct kv_pair *cur = pma->pairs[slot];
        if (!kv_pair_inuse(cur))
            continue;
        cur = kv_pair_ptr(cur);

        if (have_prev) {
            DBT prev_dbt, cur_dbt;
            int cmp;

            /* order of the two neighbouring keys */
            fill_dbt(&prev_dbt, kv_pair_key(prev), kv_pair_keylen(prev));
            fill_dbt(&cur_dbt, kv_pair_key(cur), kv_pair_keylen(cur));
            cmp = pma->compare_fun(db, &prev_dbt, &cur_dbt);
            if (pma->dup_mode == 0) {
                assert(cmp < 0);
            } else if (pma->dup_mode & DB_DUP) {
                assert(cmp <= 0);
            }

            /* equal keys in a sorted-duplicates pma: the values must be ordered too */
            if (cmp == 0 && (pma->dup_mode & DB_DUPSORT)) {
                fill_dbt(&prev_dbt, kv_pair_val(prev), kv_pair_vallen(prev));
                fill_dbt(&cur_dbt, kv_pair_val(cur), kv_pair_vallen(cur));
                cmp = pma->dup_compare_fun(db, &prev_dbt, &cur_dbt);
                assert(cmp <= 0);
            }
        }

        prev = cur;
        have_prev = 1;
    }

#if PMA_USE_MEMPOOL
    /* every in-use pair must be located inside the pma's memory pool */
    for (slot = 0; slot < pma->N; slot++) {
        struct kv_pair *cur = pma->pairs[slot];
        if (kv_pair_inuse(cur)) {
            cur = kv_pair_ptr(cur);
            assert(mempool_inrange(&pma->kvspace, cur, kv_pair_size(cur)));
        }
    }
#endif
}
/* Recompute the fingerprint of a pma and assert that it matches the expected one.
 * The recomputed value is the sum, over the pairs visited by PMA_ITERATE, of
 * rand4fingerprint * toku_calccrc32_kvpair(key, keylen, val, vallen),
 * accumulated in 32-bit unsigned arithmetic.
 */
void pma_verify_fingerprint (PMA pma, u_int32_t rand4fingerprint, u_int32_t fingerprint) {
    u_int32_t recomputed = 0;
    PMA_ITERATE(pma, kv, kl, dv, dl,
                recomputed = recomputed + rand4fingerprint*toku_calccrc32_kvpair(kv,kl,dv,dl)
                );
    assert(fingerprint == recomputed);
}
...@@ -10,11 +10,26 @@ ...@@ -10,11 +10,26 @@
/* An in-memory Packed Memory Array dictionary. */ /* An in-memory Packed Memory Array dictionary. */
/* There is a built-in-cursor. */ /* There is a built-in-cursor. */
/* All functions return 0 on success. */
typedef struct pma *PMA; typedef struct pma *PMA;
typedef struct pma_cursor *PMA_CURSOR; typedef struct pma_cursor *PMA_CURSOR;
/* All functions return 0 on success. */ /* compare 2 DBT's
int pma_create(PMA *, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxsize); return a value < 0, = 0, > 0 if a < b, a == b, a > b respectively */
typedef int (*pma_compare_fun_t)(DB *, const DBT *a, const DBT *b);
int pma_create(PMA *, pma_compare_fun_t compare_fun, int maxsize);
/* set the duplicate mode
0 -> no duplications, DB_DUP, DB_DUPSORT */
int pma_set_dup_mode(PMA pma, int mode);
/* set the duplicate compare function */
int pma_set_dup_compare(PMA pma, pma_compare_fun_t dup_compare_fun);
/* verify the integrity of a pma */
void pma_verify(PMA pma, DB *db);
/* returns 0 if OK. /* returns 0 if OK.
* You must have freed all the cursors, otherwise returns nonzero and does nothing. */ * You must have freed all the cursors, otherwise returns nonzero and does nothing. */
...@@ -28,15 +43,16 @@ int pma_n_entries (PMA); ...@@ -28,15 +43,16 @@ int pma_n_entries (PMA);
/* Duplicates the key and keylen. */ /* Duplicates the key and keylen. */
//enum pma_errors pma_insert (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); //enum pma_errors pma_insert (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen);
// The DB pointer is there so that the comparison function can be called. // The DB pointer is there so that the comparison function can be called.
enum pma_errors pma_insert (PMA, DBT*, DBT*, DB*, TOKUTXN txn, diskoff); enum pma_errors pma_insert (PMA, DBT*, DBT*, DB*, TOKUTXN txn, DISKOFF, u_int32_t /*random for fingerprint */, u_int32_t */*fingerprint*/);
/* This returns an error if the key is NOT present. */ /* This returns an error if the key is NOT present. */
int pma_replace (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen); int pma_replace (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen);
/* This returns an error if the key is NOT present. */ /* This returns an error if the key is NOT present. */
int pma_delete (PMA, DBT *, DB*); int pma_delete (PMA, DBT *, DB*, u_int32_t /*random for fingerprint*/, u_int32_t */*fingerprint*/);
int pma_insert_or_replace (PMA pma, DBT *k, DBT *v, int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */ int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */
DB *db, TOKUTXN txn, diskoff); DB *db, TOKUTXN txn, DISKOFF,
u_int32_t /*random for fingerprint*/, u_int32_t */*fingerprint*/);
/* Exposes internals of the PMA by returning a pointer to the guts. /* Exposes internals of the PMA by returning a pointer to the guts.
...@@ -54,12 +70,13 @@ enum pma_errors pma_lookup (PMA, DBT*, DBT*, DB*); ...@@ -54,12 +70,13 @@ enum pma_errors pma_lookup (PMA, DBT*, DBT*, DB*);
* rightpma - the pma assigned keys > pivot key * rightpma - the pma assigned keys > pivot key
*/ */
int pma_split(PMA origpma, unsigned int *origpma_size, int pma_split(PMA origpma, unsigned int *origpma_size,
PMA leftpma, unsigned int *leftpma_size, PMA leftpma, unsigned int *leftpma_size, u_int32_t leftrand4sum, u_int32_t *leftfingerprint,
PMA rightpma, unsigned int *rightpma_size); PMA rightpma, unsigned int *rightpma_size, u_int32_t rightrand4sum, u_int32_t *rightfingerprint);
/* /*
* Insert several key value pairs into an empty pma. The keys are * Insert several key value pairs into an empty pma.
* assumed to be sorted. * Doesn't delete any existing keys (even if they are duplicates)
* Requires: The keys are sorted
* *
* pma - the pma that the key value pairs will be inserted into. * pma - the pma that the key value pairs will be inserted into.
* must be empty with no cursors. * must be empty with no cursors.
...@@ -67,7 +84,7 @@ int pma_split(PMA origpma, unsigned int *origpma_size, ...@@ -67,7 +84,7 @@ int pma_split(PMA origpma, unsigned int *origpma_size,
* vals - an array of values * vals - an array of values
* n_newpairs - the number of key value pairs * n_newpairs - the number of key value pairs
*/ */
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs); int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs, u_int32_t rand4sem, u_int32_t *fingerprint);
/* Move the cursor to the beginning or the end or to a key */ /* Move the cursor to the beginning or the end or to a key */
int pma_cursor (PMA, PMA_CURSOR *); int pma_cursor (PMA, PMA_CURSOR *);
...@@ -122,4 +139,6 @@ void pma_iterate (PMA, void(*)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*); ...@@ -122,4 +139,6 @@ void pma_iterate (PMA, void(*)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*);
int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len); int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
void pma_verify_fingerprint (PMA pma, u_int32_t rand4fingerprint, u_int32_t fingerprint);
#endif #endif
...@@ -32,6 +32,7 @@ void create_directory (void) { ...@@ -32,6 +32,7 @@ void create_directory (void) {
r=env->set_cachesize(env, 0, 512*(1<<20), 0); r=env->set_cachesize(env, 0, 512*(1<<20), 0);
assert(r==0); assert(r==0);
#if DB_VERSION_MAJOR >= 4 && DB_VERSION_MINOR >= 3
IF40((void)0, IF40((void)0,
({ ({
unsigned int gbytes,bytes; unsigned int gbytes,bytes;
...@@ -40,7 +41,7 @@ void create_directory (void) { ...@@ -40,7 +41,7 @@ void create_directory (void) {
assert(r==0); assert(r==0);
printf("Using %.2fMiB Berkeley DB Cache Size\n", gbytes*1024 + ((double)bytes/(1<<20))); printf("Using %.2fMiB Berkeley DB Cache Size\n", gbytes*1024 + ((double)bytes/(1<<20)));
})); }));
#endif
r= env->open(env, dir, DB_CREATE|DB_INIT_MPOOL,0777); // No logging. r= env->open(env, dir, DB_CREATE|DB_INIT_MPOOL,0777); // No logging.
assert(r==0); assert(r==0);
......
...@@ -25,16 +25,26 @@ static unsigned int rbuf_int (struct rbuf *r) { ...@@ -25,16 +25,26 @@ static unsigned int rbuf_int (struct rbuf *r) {
(c3<<0)); (c3<<0));
} }
/* Hand back a pointer to the next n_bytes of the read buffer (no copy is made)
 * and advance the read position past them.
 *
 * r       - the read buffer; r->ndone is advanced by n_bytes.
 * bytes   - out: set to point at r->buf[r->ndone] as it was on entry.
 * n_bytes - number of bytes to consume.
 *
 * The bounds check is made BEFORE advancing and is phrased as a comparison
 * against the space remaining.  Checking r->ndone <= r->size after the addition
 * can be fooled: a large n_bytes (e.g. a corrupt length read from the buffer)
 * can wrap the counter around to a small value that passes the check.
 * NOTE(review): assumes size/ndone are unsigned int, as in struct wbuf -- confirm.
 */
static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned int n_bytes) {
    assert(r->ndone <= r->size);             /* keeps the subtraction below from wrapping */
    assert(n_bytes <= r->size - r->ndone);   /* enough bytes left to satisfy the request */
    *bytes = &r->buf[r->ndone];
    r->ndone += n_bytes;
}
/* Return a pointer into the middle of the buffer. */ /* Return a pointer into the middle of the buffer. */
static void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes) static void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes)
{ {
*n_bytes = rbuf_int(r); *n_bytes = rbuf_int(r);
*bytes = &r->buf[r->ndone]; rbuf_literal_bytes(r, bytes, *n_bytes);
r->ndone+=*n_bytes; }
assert(r->ndone<=r->size);
/* Read a 64-bit unsigned value stored as two consecutive 32-bit integers,
 * most-significant half first. */
static unsigned long long rbuf_ulonglong (struct rbuf *r) {
    unsigned long long high_half = rbuf_int(r);  /* first int read: upper 32 bits */
    unsigned long long low_half  = rbuf_int(r);  /* second int read: lower 32 bits */
    return (high_half << 32) | low_half;
}
static diskoff rbuf_diskoff (struct rbuf *r) { static DISKOFF rbuf_diskoff (struct rbuf *r) {
unsigned i0 = rbuf_int(r); unsigned i0 = rbuf_int(r);
unsigned i1 = rbuf_int(r); unsigned i1 = rbuf_int(r);
return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1)); return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
......
/* Readers/writers locks implementation
*
*****************************************
* Overview
*****************************************
*
* TokuDB employs readers/writers locks for the ephemeral locks (e.g.,
* on BRT nodes).  Why not just use the pthread_rwlock API?
*
* 1) we need multiprocess rwlocks (not just multithreaded)
*
* 2) pthread rwlocks are very slow since they entail a system call
* (about 2000ns on a 2GHz T2500.)
*
* Related: We expect the common case to be that the lock is
* granted
*
* 3) We are willing to employ machine-specific instructions (such
* as atomic exchange, and mfence, each of which runs in about
* 10ns.)
*
* 4) We want to guarantee nonstarvation (many rwlock
* implementations can starve the writers because another reader
* comes along before all the other readers have unlocked.)
*
*****************************************
* How it works
*****************************************
*
* We arrange that the rwlock object is in the address space of both
* threads or processes. For processes we use mmap().
*
* The rwlock struct comprises the following fields
*
* a long mutex field (which is accessed using xchgl() or other
* machine-specific instructions).  This is a spin lock.
*
* a read counter (how many readers currently have the lock?)
*
* a write boolean (does a writer have the lock?)
*
* a singly linked list of semaphores for waiting requesters. This
* list is sorted oldest requester first. Each list element
* contains a semaphore (which is provided by the requestor) and a
* boolean indicating whether it is a reader or a writer.
*
* To lock a read rwlock:
*
* 1) Acquire the mutex.
*
* 2) If the linked list is not empty or the writer boolean is true
* then
*
* a) initialize your semaphore (to 0),
* b) add your list element to the end of the list (with rw="read")
* c) release the mutex
* d) wait on the semaphore
* e) when the semaphore releases, return success.
*
* 3) Otherwise increment the reader count, release the mutex, and
* return success.
*
* To lock the write rwlock is almost the same.
* 1) Acquire the mutex
* 2) If the list is not empty or the reader count is nonzero
* a) initialize semaphore
* b) add to end of list (with rw="write")
* c) release mutex
* d) wait on the semaphore
* e) return success when the semaphore releases
* 3) Otherwise set writer=TRUE, release mutex and return success.
*
* To unlock a read rwlock:
* 1) Acquire mutex
* 2) Decrement reader count
* 3) If the count is still positive or the list is empty then
* return success
* 4) Otherwise (count==zero and the list is nonempty):
* a) If the first element of the list is a reader:
* i) while the first element is a reader:
* x) pop the list
* y) increment the reader count
* z) increment the semaphore (releasing it for some waiter)
* ii) return success
* b) Else if the first element is a writer
* i) pop the list
* ii) set writer to TRUE
* iii) increment the semaphore
* iv) return success
*/
...@@ -6,6 +6,14 @@ ...@@ -6,6 +6,14 @@
#include <errno.h> #include <errno.h>
#include "memory.h" #include "memory.h"
//#define CRC_NO
#define CRC_INCR
//#define CRC_ATEND
#ifndef CRC_NO
#include "crc.h"
#endif
/* When serializing a value, write it into a buffer. */ /* When serializing a value, write it into a buffer. */
/* This code requires that the buffer be big enough to hold whatever you put into it. */ /* This code requires that the buffer be big enough to hold whatever you put into it. */
/* This abstraction doesn't do a good job of hiding its internals. /* This abstraction doesn't do a good job of hiding its internals.
...@@ -14,18 +22,27 @@ struct wbuf { ...@@ -14,18 +22,27 @@ struct wbuf {
unsigned char *buf; unsigned char *buf;
unsigned int size; unsigned int size;
unsigned int ndone; unsigned int ndone;
#ifdef CRC_INCR
u_int32_t crc32; // A 32-bit CRC of everything written so far.
#endif
}; };
static void wbuf_init (struct wbuf *w, void *buf, diskoff size) { static void wbuf_init (struct wbuf *w, void *buf, DISKOFF size) {
w->buf=buf; w->buf=buf;
w->size=size; w->size=size;
w->ndone=0; w->ndone=0;
#ifdef CRC_INCR
w->crc32 = toku_crc32(0L, Z_NULL, 0);
#endif
} }
/* Write a character. */ /* Write a character. */
static inline void wbuf_char (struct wbuf *w, int ch) { static inline void wbuf_char (struct wbuf *w, int ch) {
assert(w->ndone<w->size); assert(w->ndone<w->size);
w->buf[w->ndone++]=ch; w->buf[w->ndone++]=ch;
#ifdef CRC_INCR
w->crc32 = toku_crc32(w->crc32, &w->buf[w->ndone-1], 1);
#endif
} }
static void wbuf_int (struct wbuf *w, unsigned int i) { static void wbuf_int (struct wbuf *w, unsigned int i) {
...@@ -40,20 +57,31 @@ static void wbuf_int (struct wbuf *w, unsigned int i) { ...@@ -40,20 +57,31 @@ static void wbuf_int (struct wbuf *w, unsigned int i) {
w->buf[w->ndone+1] = i>>16; w->buf[w->ndone+1] = i>>16;
w->buf[w->ndone+2] = i>>8; w->buf[w->ndone+2] = i>>8;
w->buf[w->ndone+3] = i>>0; w->buf[w->ndone+3] = i>>0;
#ifdef CRC_INCR
w->crc32 = toku_crc32(w->crc32, &w->buf[w->ndone], 4);
#endif
w->ndone += 4; w->ndone += 4;
#endif #endif
} }
static void wbuf_bytes (struct wbuf *w, bytevec bytes_bv, int nbytes) { static inline void wbuf_literal_bytes(struct wbuf *w, bytevec bytes_bv, int nbytes) {
const unsigned char *bytes=bytes_bv; const unsigned char *bytes=bytes_bv;
wbuf_int(w, nbytes);
#if 0 #if 0
{ int i; for (i=0; i<nbytes; i++) wbuf_char(w, bytes[i]); } { int i; for (i=0; i<nbytes; i++) wbuf_char(w, bytes[i]); }
#else #else
assert(w->ndone + nbytes <= w->size); assert(w->ndone + nbytes <= w->size);
memcpy(w->buf + w->ndone, bytes, nbytes); memcpy(w->buf + w->ndone, bytes, nbytes);
#ifdef CRC_INCR
w->crc32 = toku_crc32(w->crc32, &w->buf[w->ndone], nbytes);
#endif
w->ndone += nbytes; w->ndone += nbytes;
#endif #endif
}
static void wbuf_bytes (struct wbuf *w, bytevec bytes_bv, int nbytes) {
wbuf_int(w, nbytes);
wbuf_literal_bytes(w, bytes_bv, nbytes);
} }
static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) { static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) {
...@@ -61,7 +89,7 @@ static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) { ...@@ -61,7 +89,7 @@ static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) {
wbuf_int(w, ull&0xFFFFFFFF); wbuf_int(w, ull&0xFFFFFFFF);
} }
static void wbuf_diskoff (struct wbuf *w, diskoff off) { static void wbuf_diskoff (struct wbuf *w, DISKOFF off) {
wbuf_ulonglong(w, off); wbuf_ulonglong(w, off);
} }
...@@ -69,8 +97,12 @@ static inline void wbuf_txnid (struct wbuf *w, TXNID tid) { ...@@ -69,8 +97,12 @@ static inline void wbuf_txnid (struct wbuf *w, TXNID tid) {
wbuf_ulonglong(w, tid); wbuf_ulonglong(w, tid);
} }
static inline void wbuf_fileid (struct wbuf *w, unsigned long long fileid) { static inline void wbuf_lsn (struct wbuf *w, LSN lsn) {
wbuf_ulonglong(w, fileid); wbuf_ulonglong(w, lsn.lsn);
}
static inline void wbuf_filenum (struct wbuf *w, FILENUM fileid) {
wbuf_int(w, fileid.fileid);
} }
#endif #endif
...@@ -16,9 +16,11 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len) { ...@@ -16,9 +16,11 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len) {
return dbt; return dbt;
} }
DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private) { DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private __attribute__((unused))) {
fill_dbt(dbt, k, len); fill_dbt(dbt, k, len);
#if USE_DBT_APP_PRIVATE
dbt->app_private=app_private; dbt->app_private=app_private;
#endif
return dbt; return dbt;
} }
......
...@@ -11,4 +11,22 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len); ...@@ -11,4 +11,22 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len);
DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private); DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private);
int ybt_set_value (DBT *, bytevec val, ITEMLEN vallen, void **staticptrp); int ybt_set_value (DBT *, bytevec val, ITEMLEN vallen, void **staticptrp);
#ifndef USE_DBT_APP_PRIVATE
#define USE_DBT_APP_PRIVATE 0
#endif
/* Fetch the app_private pointer of a DBT.
 * When USE_DBT_APP_PRIVATE is 0 the field is not accessed at all and 0 is returned. */
static inline void *dbt_get_app_private(DBT *dbt __attribute__((unused))) {
#if !USE_DBT_APP_PRIVATE
    return 0;
#else
    return dbt->app_private;
#endif
}
/* Store ap in the app_private field of a DBT.
 * When USE_DBT_APP_PRIVATE is 0 this does nothing. */
static inline void dbt_set_app_private(DBT *dbt __attribute__((unused)), void *ap __attribute__((unused))) {
#if !USE_DBT_APP_PRIVATE
    return;
#else
    dbt->app_private = ap;
#endif
}
#endif #endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment