Commit 6157eb7d authored by Bradley C. Kuszmaul

Up

git-svn-id: file:///svn/tokudb@519 c7de825b-a66e-492c-adef-691d508d4ae1
parent 38cf566a
......@@ -12,7 +12,7 @@ FPICFLAGS = -fPIC
DTOOL = valgrind --quiet --error-exitcode=1
endif
CFLAGS = -Wall -W $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS)
CFLAGS = -Wall -W $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS) -Wshadow
LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS)
CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
......@@ -31,10 +31,10 @@ REGRESSION_TESTS = \
ybt-test \
pma-test \
brt-serialize-test \
brt-test \
cachetable-test \
cachetable-test2 \
hashtest \
brt-test \
# This line intentionally kept commented so I can have a \ on the end of the previous line
BINS = $(REGRESSION_TESTS) \
......@@ -46,7 +46,6 @@ BINS = $(REGRESSION_TESTS) \
libs: log.o
bins: $(BINS)
check: bins
./benchmark-test --valsize 256 --verify 1
$(DTOOL) ./ybt-test
$(DTOOL) ./pma-test
$(DTOOL) ./cachetable-test
......@@ -54,6 +53,7 @@ check: bins
$(DTOOL) ./brt-serialize-test
$(DTOOL) ./brt-test
$(DTOOL) ./hashtest
./benchmark-test --valsize 256 --verify 1
# ./mdict-test
check-fanout:
......@@ -63,33 +63,40 @@ check-fanout:
let BRT_FANOUT=BRT_FANOUT+1; \
done
pma-test benchmark-test brt-test brt-serialize-test: LDFLAGS+=-lz
# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h hashtable.h pma.h brt.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h
key.o: brttypes.h key.h
pma-test.o: pma-internal.h pma.h yerror.h memory.h ../include/db.h list.h kv-pair.h brttypes.h ybt.h yerror.h
pma-test: pma.o memory.o key.o ybt.o log.o mempool.o
pma-test.o: $(BRT_INTERNAL_H_INCLUDES) pma-internal.h pma.h list.h mempool.h
pma-test: pma.o memory.o key.o ybt.o log.o mempool.o fingerprint.o
pma.o: pma.h yerror.h pma-internal.h memory.h key.h ybt.h brttypes.h log.h ../include/db.h
ybt.o: ybt.h brttypes.h ../include/db.h
ybt-test: ybt-test.o ybt.o memory.o
ybt-test.o: ybt.h ../include/db.h
cachetable.o: cachetable.h hashfun.h
brt-test: ybt.o brt.o hashtable.o pma.o memory.o brt-serialize.o cachetable.o header-io.o ybt.o key.o primes.o log.o mempool.o
log.o: log-internal.h log.h
brt-test: ybt.o brt.o hashtable.o pma.o memory.o brt-serialize.o cachetable.o header-io.o ybt.o key.o primes.o log.o mempool.o brt-verify.o fingerprint.o
log.o: log-internal.h log.h wbuf.h crc.h
brt-test.o brt.o: brt.h ../include/db.h hashtable.h pma.h brttypes.h cachetable.h
brt-serialize-test.o: pma.h yerror.h brt.h ../include/db.h memory.h hashtable.h brttypes.h brt-internal.h
brt.o: brt.h ../include/db.h mdict.h pma.h brttypes.h memory.h brt-internal.h cachetable.h hashtable.h
brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
brt.o: $(BRT_INTERNAL_H_INCLUDES)
mdict.o: pma.h
hashtable.o: hashtable.h brttypes.h memory.h key.h yerror.h ../include/db.h hashfun.h
memory.o: memory.h
primes.o: primes.h
hashtest: hashtable.o memory.o primes.o
brt-serialize.o: brt.h ../include/db.h cachetable.h memory.h mdict.h pma.h brttypes.h brt-internal.h hashtable.h wbuf.h rbuf.h
header-io.o: brttypes.h brt-internal.h brt.h ../include/db.h memory.h
brt-serialize.o: $(BRT_INTERNAL_H_INCLUDES) key.h wbuf.h rbuf.h
header-io.o: $(BRT_INTERNAL_H_INCLUDES)
mdict-test: hashtable.o pma.o memory.o
brt-bigtest: memory.o ybt.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o
brt-bigtest.o: brt.h ../include/db.h
log-test: log.o memory.o
brt-verify.o: $(BRT_INTERNAL_H_INCLUDES)
fingerprint.o: $(BRT_INTERNAL_H_INCLUDES)
brt-serialize-test: brt-serialize-test.o brt-serialize.o memory.o hashtable.o pma.o key.o ybt.o brt.o cachetable.o primes.o log.o mempool.o brt-verify.o fingerprint.o
brt-serialize-test: brt-serialize-test.o brt-serialize.o memory.o hashtable.o pma.o key.o ybt.o brt.o cachetable.o primes.o log.o mempool.o
cachetable-test.o: cachetable.h memory.h
cachetable-test: cachetable.o memory.o cachetable-test.o primes.o
......@@ -97,7 +104,7 @@ cachetable-test: cachetable.o memory.o cachetable-test.o primes.o
cachetable-test2.o: cachetable.h memory.h
cachetable-test2: cachetable.o memory.o cachetable-test2.o primes.o
benchmark-test: benchmark-test.o ybt.o memory.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o primes.o log.o mempool.o
benchmark-test: benchmark-test.o ybt.o memory.o brt.o pma.o cachetable.o key.o hashtable.o brt-serialize.o primes.o log.o mempool.o brt-verify.o fingerprint.o
benchmark-test.o: brt.h ../include/db.h
clean:
......
......@@ -30,7 +30,7 @@ BRT t;
void setup (void) {
int r;
unlink(fname);
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0);
}
......@@ -69,6 +69,7 @@ long long llrandom (void) {
void random_insert_below (long long below) {
long long i;
assert(0 < below);
for (i=0; i<ITEMS_TO_INSERT_PER_ITERATION; i++) {
insert(llrandom()%below);
}
......@@ -79,7 +80,7 @@ double tdiff (struct timeval *a, struct timeval *b) {
}
void biginsert (long long n_elements, struct timeval *starttime) {
long i;
long long i;
struct timeval t1,t2;
int iteration;
for (i=0, iteration=0; i<n_elements; i+=ITEMS_TO_INSERT_PER_ITERATION, iteration++) {
......
/* Apply a command to the root node of BRT (the _XY variant of the root put path).
 *
 * Steps visible in this function:
 *   1. Read and pin the BRT header; if that fails, return its error code.
 *   2. Look up the pointer to the root's cache key and pin the root node
 *      through the cachetable.
 *   3. Clear the root's parent pointer and pass the command to brtnode_put_cmd_XY().
 *   4. If the root is a nonleaf with exactly TREE_FANOUT children, split it and
 *      install a new root over the two halves.
 *
 * Error handling: the cleanup labels died0 and died1 live inside `if (0) { ... }`
 * blocks.  The normal path skips those blocks; a `goto` into one runs the cleanup.
 * died1 unpins the node and then jumps to died0; died0 unpins the header and
 * returns r.
 *
 * Returns: the nonzero error code r on the failure paths shown.
 *
 * NOTE(review): as shown, this function does not look finished -- confirm before relying on it:
 *   - node_v, node, result, nodea, nodeb, splitk and cmd are used but not declared
 *     in this block (the command parameter is named md, not cmd).
 *   - result is assigned but never read.
 *   - The normal path reaches the closing brace without a return statement, and
 *     nothing on that path unpins the node or the header.
 *   - The final size check has an empty body.
 */
static int brt_root_put_cmd_XY (BRT brt, BRT_CMD *md, TOKUTXN txn) {
int r;
if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
// Cleanup target: only reachable via `goto died0`; unpins the header, then returns r below.
if (0) { died0: toku_unpin_brt_header(brt); }
return r;
}
CACHEKEY *rootp = toku_calculate_root_offset_pointer(brt);
// Pin the root node; the header's nodesize is passed through as the callbacks' extra argument.
if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) {
goto died0;
}
node=node_v;
// Cleanup target: only reachable via `goto died1`; unpins the node, then continues to died0.
if (0) {
died1:
cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnodesize(node));
goto died0;
}
// The root has no parent.
node->parent_brtnode = 0;
result = brtnode_put_cmd_XY(brt, node, cmd, txn);
// It's still pinned, and it may be too big or the fanout may be too large.
if (node->height>0 && node->u.n.n_children==TREE_FANOUT) {
// Must split it.
r = do_split_node(node, &nodea, &nodeb, &splitk); // On error: node is unmodified
if (r!=0) goto died1;
// node is garbage, and nodea and nodeb are pinned
r = brt_init_new_root(brt, nodea, nodeb, splitk, rootp); // On error: root is unmodified and nodea and nodeb are both unpinned
if (r!=0) goto died0;
// nodea and nodeb are unpinned, and the root has been fixed
// up to point at a new node (*rootp) containing two children
// (nodea and nodeb). nodea and nodeb are unpinned. *rootp is still pinned
// NOTE(review): *rootp is a CACHEKEY here, while node is dereferenced as a node pointer elsewhere in this function -- verify this assignment.
node = *rootp;
}
// Now the fanout is small enough.
// But the node could still be too large.
if (serialize_brtnode_size(node)>node->nodesize) {
// NOTE(review): nothing is done yet when the serialized node exceeds nodesize.
}
}
......@@ -2,14 +2,15 @@
#include "hashtable.h"
#include "pma.h"
#include "brt.h"
//#include "pma.h"
#include "crc.h"
#ifndef BRT_FANOUT
#define BRT_FANOUT 16
#endif
enum { TREE_FANOUT = BRT_FANOUT }; //, NODESIZE=1<<20 };
enum { TREE_FANOUT = BRT_FANOUT };
enum { KEY_VALUE_OVERHEAD = 8 }; /* Must store the two lengths. */
enum { BRT_CMD_OVERHEAD = 1 };
enum { BRT_DEFAULT_NODE_SIZE = 1 << 20 };
struct nodeheader_in_file {
int n_in_buffer;
......@@ -22,21 +23,28 @@ typedef struct brtnode *BRTNODE;
/* Internal nodes. */
struct brtnode {
enum typ_tag tag;
BRT brt; // The containing BRT
unsigned int nodesize;
diskoff thisnodename;
DISKOFF thisnodename; // The size of the node allocated on disk. Not all is necessarily in use.
LSN lsn; // Need the LSN as of the most recent modification.
int layout_version; // What version of the data structure?
BRTNODE parent_brtnode; /* Invariant: The parent of an in-memory node must be in main memory. This is so we can find and update the down pointer when we change the diskoff of a node. */
int height; /* height is always >= 0. 0 for leaf, >0 for nonleaf. */
int dirty;
u_int32_t rand4fingerprint;
u_int32_t local_fingerprint; /* For leaves this is everything in the buffer. For nonleaves, this is everything in the hash tables, but does not include child subtree fingerprints. */
int dirty;
union node {
struct nonleaf {
// Don't actually store the subtree fingerprint in the in-memory data structure.
int n_children; /* if n_children==TREE_FANOUT+1 then the tree needs to be rebalanced. */
u_int32_t child_subtree_fingerprints[TREE_FANOUT+1];
bytevec childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Note: It is possible that Child 1's keys are == to child 0's keys, so it is
not necessarily true that child 1's keys are > childkeys[0].
However, in the absence of duplicate keys, child 1's keys *are* > childkeys[0]. */
unsigned int childkeylens[TREE_FANOUT];
unsigned int totalchildkeylens;
diskoff children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */
DISKOFF children[TREE_FANOUT+1]; /* unused if height==0 */ /* Note: The last element of these arrays is used only temporarily while splitting a node. */
HASHTABLE htables[TREE_FANOUT+1];
unsigned int n_bytes_in_hashtable[TREE_FANOUT+1]; /* how many bytes are in each hashtable (including overheads) */
unsigned int n_bytes_in_hashtables;
......@@ -52,12 +60,13 @@ struct brtnode {
struct brt_header {
int dirty;
unsigned int nodesize;
diskoff freelist;
diskoff unused_memory;
diskoff unnamed_root;
DISKOFF freelist;
DISKOFF unused_memory;
DISKOFF unnamed_root;
int n_named_roots; /* -1 if the only one is unnamed */
char **names;
diskoff *roots;
DISKOFF *roots;
unsigned int flags;
};
......@@ -69,21 +78,24 @@ struct brt {
BRT_CURSOR cursors_head, cursors_tail;
unsigned int nodesize;
unsigned int flags;
int (*compare_fun)(DB*,const DBT*,const DBT*);
int (*dup_compare)(DB*,const DBT*,const DBT*);
void *skey,*sval; /* Used for DBT return values. */
};
/* serialization code */
void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node);
int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize);
void serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node);
int deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int nodesize);
unsigned int serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
void verify_counts(BRTNODE);
int serialize_brt_header_to (int fd, struct brt_header *h);
int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth);
int deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth);
/* return the size of a tree node */
long brtnode_size (BRTNODE node);
......@@ -169,3 +181,21 @@ struct brt_cmd {
};
typedef struct brt_cmd BRT_CMD;
/* Groups a cachetable handle, a logger handle and a checksum counter in one struct.
 * NOTE(review): no user of struct brtenv is visible in this chunk; the field notes
 * below are based on the declared types and names only -- confirm against the callers. */
struct brtenv {
CACHETABLE ct;             // cachetable handle
TOKULOGGER logger;         // logger handle
long long checksum_number; // NOTE(review): meaning and update rule are not visible here
// SPINLOCK checkpointing;
};
extern cachetable_flush_func_t brtnode_flush_callback;
extern cachetable_fetch_func_t brtnode_fetch_callback;
extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header);
extern int toku_unpin_brt_header (BRT brt);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt);
static const BRTNODE null_brtnode=0;
extern u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen);
extern u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen);
extern u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd);
#include "brt.h"
#include "memory.h"
#include "brt-internal.h"
#include <fcntl.h>
#include <assert.h>
#include <string.h>
#include <zlib.h>
#include <arpa/inet.h>
#include <stdlib.h>
void test_serialize(void) {
// struct brt source_brt;
......@@ -12,41 +13,59 @@ void test_serialize(void) {
struct brtnode sn, *dn;
int fd = open("brt-serialize-test.brt", O_RDWR|O_CREAT, 0777);
int r;
const u_int32_t randval = random();
assert(fd>=0);
// source_brt.fd=fd;
char *hello_string;
sn.nodesize = nodesize;
sn.thisnodename = sn.nodesize*20;
sn.lsn.lsn = 123456;
sn.layout_version = 0;
sn.height = 1;
sn.rand4fingerprint = randval;
sn.local_fingerprint = 0;
sn.u.n.n_children = 2;
sn.u.n.childkeys[0] = hello_string = toku_strdup("hello");
sn.u.n.childkeylens[0] = 6;
sn.u.n.totalchildkeylens = 6;
sn.u.n.children[0] = sn.nodesize*30;
sn.u.n.children[1] = sn.nodesize*35;
sn.u.n.child_subtree_fingerprints[0] = random();
sn.u.n.child_subtree_fingerprints[1] = random();
r = toku_hashtable_create(&sn.u.n.htables[0]); assert(r==0);
r = toku_hashtable_create(&sn.u.n.htables[1]); assert(r==0);
r = toku_hash_insert(sn.u.n.htables[0], "a", 2, "aval", 5, BRT_NONE); assert(r==0);
r = toku_hash_insert(sn.u.n.htables[0], "b", 2, "bval", 5, BRT_NONE); assert(r==0);
r = toku_hash_insert(sn.u.n.htables[1], "x", 2, "xval", 5, BRT_NONE); assert(r==0);
r = toku_hash_insert(sn.u.n.htables[0], "a", 2, "aval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "a", 2, "aval", 5);
r = toku_hash_insert(sn.u.n.htables[0], "b", 2, "bval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "b", 2, "bval", 5);
r = toku_hash_insert(sn.u.n.htables[1], "x", 2, "xval", 5, BRT_NONE); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, "x", 2, "xval", 5);
sn.u.n.n_bytes_in_hashtables = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); assert(r==0);
r = deserialize_brtnode_from(fd, nodesize*20, &dn, nodesize);
assert(r==0);
assert(dn->thisnodename==nodesize*20);
assert(dn->lsn.lsn==123456);
assert(dn->layout_version ==0);
assert(dn->height == 1);
assert(dn->rand4fingerprint==randval);
assert(dn->u.n.n_children==2);
assert(strcmp(dn->u.n.childkeys[0], "hello")==0);
assert(dn->u.n.childkeylens[0]==6);
assert(dn->u.n.totalchildkeylens==6);
assert(dn->u.n.children[0]==nodesize*30);
assert(dn->u.n.children[1]==nodesize*35);
{
int i;
for (i=0; i<2; i++) {
assert(dn->u.n.child_subtree_fingerprints[i]==sn.u.n.child_subtree_fingerprints[i]);
}
assert(dn->local_fingerprint==sn.local_fingerprint);
}
{
bytevec data; ITEMLEN datalen; int type;
int r = toku_hash_find(dn->u.n.htables[0], "a", 2, &data, &datalen, &type);
r = toku_hash_find(dn->u.n.htables[0], "a", 2, &data, &datalen, &type);
assert(r==0);
assert(strcmp(data,"aval")==0);
assert(datalen==5);
......@@ -64,7 +83,7 @@ void test_serialize(void) {
assert(datalen==5);
assert(type == BRT_NONE);
}
// brtnode_free(&dn);
brtnode_free(&dn);
toku_free(hello_string);
toku_hashtable_free(&sn.u.n.htables[0]);
......
#define _XOPEN_SOURCE 500
#include "brt.h"
#include "memory.h"
//#include "pma.h"
#include "brt-internal.h"
#include "key.h"
#include "rbuf.h"
#include "wbuf.h"
#include <assert.h>
#include <unistd.h>
#include <stdio.h>
#include <arpa/inet.h>
/* Number of bytes every serialized node carries in addition to its type-specific
 * payload: the fixed header fields plus the trailing CRC.  Both
 * serialize_brtnode_size() and serialize_brtnode_size_slow() start from this value.
 *
 * The terms below add up to the fields written by serialize_brtnode_to(), but they
 * are not listed in write order: on disk the 4-byte layout version comes before the
 * 8-byte LSN.  The deserializer depends on that order when it reads the stored size
 * at offset 8+4+8.
 * (Assumes wbuf_int emits 4 bytes and wbuf_ulonglong emits 8, which is consistent
 * with that offset -- TODO confirm in wbuf.h.) */
const int brtnode_header_overhead = (8+ // magic "tokunode" or "tokuleaf"
8+ // LSN of the node (node->lsn.lsn)
4+ // layout version
4+ // serialized size of the node (calculated_size)
4+ // height
4+ // random for fingerprint
4+ // localfingerprint
4); // crc32 at the end
static unsigned int serialize_brtnode_size_slow(BRTNODE node) {
unsigned int size=4+4; /* size+height */
unsigned int size=brtnode_header_overhead;
if (node->height>0) {
unsigned int hsize=0;
unsigned int csize=0;
int i;
size+=4; /* n_children */
size+=4; /* subtree fingerprint. */
for (i=0; i<node->u.n.n_children-1; i++) {
size+=4;
csize+=node->u.n.childkeylens[i];
}
for (i=0; i<node->u.n.n_children; i++) {
size+=8;
size+=8; // diskoff
size+=4; // subsum
}
int n_hashtables = node->u.n.n_bytes_in_hashtables;
int n_hashtables = node->u.n.n_children;
size+=4; /* n_entries */
assert(0 <= n_hashtables && n_hashtables < TREE_FANOUT+1);
for (i=0; i< n_hashtables; i++) {
HASHTABLE_ITERATE(node->u.n.htables[i],
key __attribute__((__unused__)), keylen,
......@@ -53,13 +63,14 @@ static unsigned int serialize_brtnode_size_slow(BRTNODE node) {
}
unsigned int serialize_brtnode_size (BRTNODE node) {
unsigned int result = 4+4; /* size+height */
unsigned int result =brtnode_header_overhead;
assert(sizeof(off_t)==8);
if (node->height>0) {
result+=4; /* n_children */
result+=4; /* subtree fingerpirnt */
result+=4*(node->u.n.n_children-1); /* key lengths */
result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result+=(8+4)*(node->u.n.n_children); /* For each child, a child offset and a count for the number of hash table entries. */
result+=(8+4+4)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, and the subtree fingerprint. */
result+=node->u.n.n_bytes_in_hashtables;
} else {
result+=4; /* n_entries in buffer table. */
......@@ -73,7 +84,8 @@ unsigned int serialize_brtnode_size (BRTNODE node) {
return result;
}
void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
void serialize_brtnode_to(int fd, DISKOFF off, DISKOFF size, BRTNODE node) {
//printf("%s:%d serializing\n", __FILE__, __LINE__);
struct wbuf w;
int i;
unsigned int calculated_size = serialize_brtnode_size(node);
......@@ -82,11 +94,33 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
assert(size>0);
wbuf_init(&w, buf, size);
//printf("%s:%d serializing %lld w height=%d p0=%p\n", __FILE__, __LINE__, off, node->height, node->mdicts[0]);
wbuf_literal_bytes(&w, "toku", 4);
if (node->height==0) wbuf_literal_bytes(&w, "leaf", 4);
else wbuf_literal_bytes(&w, "node", 4);
wbuf_int(&w, node->layout_version);
wbuf_ulonglong(&w, node->lsn.lsn);
//printf("%s:%d %lld.calculated_size=%d\n", __FILE__, __LINE__, off, calculated_size);
wbuf_int(&w, calculated_size);
wbuf_int(&w, node->height);
//printf("%s:%d %lld rand=%08x sum=%08x height=%d\n", __FILE__, __LINE__, node->thisnodename, node->rand4fingerprint, node->subtree_fingerprint, node->height);
wbuf_int(&w, node->rand4fingerprint);
wbuf_int(&w, node->local_fingerprint);
//printf("%s:%d local_fingerprint=%8x\n", __FILE__, __LINE__, node->local_fingerprint);
//printf("%s:%d w.ndone=%d n_children=%d\n", __FILE__, __LINE__, w.ndone, node->n_children);
if (node->height>0) {
if (node->height>0) {
// The subtree fingerprint is not actually stored while in main memory. Must calculate it.
// Add the child subtree fingerprints to the local fingerprint to get the subtree fingerprint.
{
u_int32_t subtree_fingerprint = node->local_fingerprint;
for (i=0; i<node->u.n.n_children; i++) {
subtree_fingerprint += node->u.n.child_subtree_fingerprints[i];
}
wbuf_int(&w, subtree_fingerprint);
}
wbuf_int(&w, node->u.n.n_children);
for (i=0; i<node->u.n.n_children; i++) {
wbuf_int(&w, node->u.n.child_subtree_fingerprints[i]);
}
//printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone);
for (i=0; i<node->u.n.n_children-1; i++) {
wbuf_bytes(&w, node->u.n.childkeys[i], node->u.n.childkeylens[i]);
......@@ -99,21 +133,37 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
{
int n_hash_tables = node->u.n.n_children;
u_int32_t check_local_fingerprint = 0;
for (i=0; i< n_hash_tables; i++) {
//printf("%s:%d p%d=%p n_entries=%d\n", __FILE__, __LINE__, i, node->mdicts[i], mdict_n_entries(node->mdicts[i]));
wbuf_int(&w, toku_hashtable_n_entries(node->u.n.htables[i]));
HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
(wbuf_char(&w, type), wbuf_bytes(&w, key, keylen),
wbuf_bytes(&w, data, datalen)));
({
wbuf_char(&w, type);
wbuf_bytes(&w, key, keylen);
wbuf_bytes(&w, data, datalen);
check_local_fingerprint+=node->rand4fingerprint*toku_calccrc32_cmd(type, key, keylen, data, datalen);
}));
}
//printf("%s:%d check_local_fingerprint=%8x\n", __FILE__, __LINE__, check_local_fingerprint);
assert(check_local_fingerprint==node->local_fingerprint);
}
} else {
//printf(" n_entries=%d\n", pma_n_entries(node->u.l.buffer));
wbuf_int(&w, pma_n_entries(node->u.l.buffer));
PMA_ITERATE(node->u.l.buffer, key, keylen, data, datalen,
(wbuf_bytes(&w, key, keylen),
wbuf_bytes(&w, data, datalen)));
}
assert(w.ndone<=w.size);
#ifdef CRC_ATEND
wbuf_int(&w, crc32(toku_null_crc, w.buf, w.ndone));
#endif
#ifdef CRC_INCR
wbuf_int(&w, w.crc32);
#endif
//write_now: printf("%s:%d Writing %d bytes\n", __FILE__, __LINE__, w.ndone);
{
ssize_t r=pwrite(fd, w.buf, w.ndone, off);
if (r<0) printf("r=%ld errno=%d\n", (long)r, errno);
......@@ -128,11 +178,11 @@ void serialize_brtnode_to(int fd, diskoff off, diskoff size, BRTNODE node) {
toku_free(buf);
}
int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesize) {
int deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int nodesize) {
TAGMALLOC(BRTNODE, result);
struct rbuf rc;
int i;
uint32_t datasize;
u_int32_t datasize;
int r;
if (errno!=0) {
r=errno;
......@@ -140,8 +190,8 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
return r;
}
{
uint32_t datasize_n;
r = pread(fd, &datasize_n, sizeof(datasize_n), off);
u_int32_t datasize_n;
r = pread(fd, &datasize_n, sizeof(datasize_n), off +8+4+8);
//printf("%s:%d r=%d the datasize=%d\n", __FILE__, __LINE__, r, ntohl(datasize_n));
if (r!=sizeof(datasize_n)) {
if (r==-1) r=errno;
......@@ -152,6 +202,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
if (datasize<=0 || datasize>(1<<30)) { r = DB_BADFORMAT; goto died0; }
}
rc.buf=toku_malloc(datasize);
//printf("%s:%d errno=%d\n", __FILE__, __LINE__, errno);
if (errno!=0) {
if (0) { died1: toku_free(rc.buf); }
r=errno;
......@@ -162,10 +213,30 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
rc.ndone=0;
//printf("Deserializing %lld datasize=%d\n", off, datasize);
{
ssize_t r=pread(fd, rc.buf, datasize, off);
if ((size_t)r!=datasize) { r=errno; goto died1; }
ssize_t rlen=pread(fd, rc.buf, datasize, off);
//printf("%s:%d pread->%d datasize=%d\n", __FILE__, __LINE__, r, datasize);
if ((size_t)rlen!=datasize) {
//printf("%s:%d size messed up\n", __FILE__, __LINE__);
r=errno;
goto died1;
}
//printf("Got %d %d %d %d\n", rc.buf[0], rc.buf[1], rc.buf[2], rc.buf[3]);
}
{
bytevec tmp;
rbuf_literal_bytes(&rc, &tmp, 8);
if (memcmp(tmp, "tokuleaf", 8)!=0
&& memcmp(tmp, "tokunode", 8)!=0) {
r = DB_BADFORMAT;
goto died1;
}
}
result->layout_version = rbuf_int(&rc);
if (result->layout_version!=0) {
r=DB_BADFORMAT;
goto died1;
}
result->lsn.lsn = rbuf_ulonglong(&rc);
{
unsigned int stored_size = rbuf_int(&rc);
if (stored_size!=datasize) { r=DB_BADFORMAT; goto died1; }
......@@ -173,11 +244,14 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->nodesize = nodesize; // How to compute the nodesize?
result->thisnodename = off;
result->height = rbuf_int(&rc);
result->rand4fingerprint = rbuf_int(&rc);
result->local_fingerprint = rbuf_int(&rc);
result->dirty = 0;
//printf("height==%d\n", result->height);
if (result->height>0) {
result->u.n.totalchildkeylens=0;
for (i=0; i<TREE_FANOUT; i++) {
result->u.n.child_subtree_fingerprints[i]=0;
result->u.n.childkeys[i]=0;
result->u.n.childkeylens[i]=0;
}
......@@ -187,9 +261,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->u.n.n_bytes_in_hashtable[i]=0;
result->u.n.n_cursors[i]=0;
}
u_int32_t subtree_fingerprint = rbuf_int(&rc);
u_int32_t check_subtree_fingerprint = 0;
result->u.n.n_children = rbuf_int(&rc);
//printf("n_children=%d\n", result->n_children);
assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT);
for (i=0; i<result->u.n.n_children; i++) {
u_int32_t childfp = rbuf_int(&rc);
result->u.n.child_subtree_fingerprints[i]= childfp;
check_subtree_fingerprint += childfp;
}
for (i=0; i<result->u.n.n_children-1; i++) {
bytevec childkeyptr;
rbuf_bytes(&rc, &childkeyptr, &result->u.n.childkeylens[i]); /* Returns a pointer into the rbuf. */
......@@ -206,7 +287,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
}
result->u.n.n_bytes_in_hashtables = 0;
for (i=0; i<result->u.n.n_children; i++) {
int r=toku_hashtable_create(&result->u.n.htables[i]);
r=toku_hashtable_create(&result->u.n.htables[i]);
if (r!=0) {
int j;
if (0) { died_12: j=result->u.n.n_bytes_in_hashtables; }
......@@ -216,6 +297,7 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
}
{
int cnum;
u_int32_t check_local_fingerprint = 0;
for (cnum=0; cnum<result->u.n.n_children; cnum++) {
int n_in_this_hash = rbuf_int(&rc);
//printf("%d in hash\n", n_in_hash);
......@@ -228,9 +310,10 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
type = rbuf_char(&rc);
rbuf_bytes(&rc, &key, &keylen); /* Returns a pointer into the rbuf. */
rbuf_bytes(&rc, &val, &vallen);
check_local_fingerprint += result->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, val, vallen);
//printf("Found %s,%s\n", (char*)key, (char*)val);
{
int r=toku_hash_insert(result->u.n.htables[cnum], key, keylen, val, vallen, type); /* Copies the data into the hash table. */
r=toku_hash_insert(result->u.n.htables[cnum], key, keylen, val, vallen, type); /* Copies the data into the hash table. */
if (r!=0) { goto died_12; }
}
diff = keylen + vallen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
......@@ -239,11 +322,19 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
//printf("Inserted\n");
}
}
if (check_local_fingerprint != result->local_fingerprint) {
fprintf(stderr, "%s:%d local fingerprint is wrong (found %8x calcualted %8x\n", __FILE__, __LINE__, result->local_fingerprint, check_local_fingerprint);
return DB_BADFORMAT;
}
if (check_subtree_fingerprint+check_local_fingerprint != subtree_fingerprint) {
fprintf(stderr, "%s:%d subtree fingerprint is wrong\n", __FILE__, __LINE__);
return DB_BADFORMAT;
}
}
} else {
int n_in_buf = rbuf_int(&rc);
result->u.l.n_bytes_in_buffer = 0;
int r=pma_create(&result->u.l.buffer, default_compare_fun, nodesize);
r=pma_create(&result->u.l.buffer, default_compare_fun, nodesize);
if (r!=0) {
if (0) { died_21: pma_free(&result->u.l.buffer); }
goto died1;
......@@ -253,7 +344,6 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
#if BRT_USE_PMA_BULK_INSERT
{
DBT keys[n_in_buf], vals[n_in_buf];
int r;
for (i=0; i<n_in_buf; i++) {
bytevec key; ITEMLEN keylen;
......@@ -266,8 +356,16 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD;
}
if (n_in_buf > 0) {
r = pma_bulk_insert(result->u.l.buffer, keys, vals, n_in_buf);
u_int32_t actual_sum = 0;
r = pma_bulk_insert(result->u.l.buffer, keys, vals, n_in_buf, result->rand4fingerprint, &actual_sum);
if (r!=0) goto died_21;
if (actual_sum!=result->local_fingerprint) {
//fprintf(stderr, "%s:%d Corrupted checksum stored=%08x rand=%08x actual=%08x height=%d n_keys=%d\n", __FILE__, __LINE__, result->rand4fingerprint, result->local_fingerprint, actual_sum, result->height, n_in_buf);
return DB_BADFORMAT;
goto died_21;
} else {
//fprintf(stderr, "%s:%d Good checksum=%08x height=%d\n", __FILE__, __LINE__, actual_sum, result->height);
}
}
}
#else
......@@ -279,13 +377,27 @@ int deserialize_brtnode_from (int fd, diskoff off, BRTNODE *brtnode, int nodesiz
rbuf_bytes(&rc, &val, &vallen);
{
DBT k,v;
int r = pma_insert(result->u.l.buffer, fill_dbt(&k, key, keylen), fill_dbt(&v, val, vallen), 0);
r = pma_insert(result->u.l.buffer, fill_dbt(&k, key, keylen), fill_dbt(&v, val, vallen), 0);
if (r!=0) goto died_21;
}
result->u.l.n_bytes_in_buffer += keylen + vallen + KEY_VALUE_OVERHEAD;
}
#endif
}
{
unsigned int n_read_so_far = rc.ndone;
if (n_read_so_far+4!=rc.size) {
r = DB_BADFORMAT; goto died_21;
}
uint32_t crc = toku_crc32(toku_null_crc, rc.buf, n_read_so_far);
uint32_t storedcrc = rbuf_int(&rc);
if (crc!=storedcrc) {
printf("Bad CRC\n");
assert(0);//this is wrong!!!
r = DB_BADFORMAT;
goto died_21;
}
}
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
toku_free(rc.buf);
*brtnode = result;
......@@ -302,9 +414,11 @@ void verify_counts (BRTNODE node) {
int i;
for (i=0; i<node->u.n.n_children; i++)
sum += node->u.n.n_bytes_in_hashtable[i];
// We don't really care if the later hashtables have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized.
// But for now the code always initializes the later tables so they are 0.
for (; i<TREE_FANOUT+1; i++) {
assert(node->u.n.n_bytes_in_hashtable[i]==0);
}
}
assert(sum==node->u.n.n_bytes_in_hashtables);
}
}
......@@ -313,7 +427,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
struct wbuf w;
int i;
unsigned int size=0; /* I don't want to mess around calculating it exactly. */
size += 4+4+8+8+4; /* this size, the tree's nodesize, freelist, unused_memory, nnamed_rootse. */
size += 4+4+4+8+8+4; /* this size, flags, the tree's nodesize, freelist, unused_memory, nnamed_rootse. */
if (h->n_named_roots<0) {
size+=8;
} else {
......@@ -321,10 +435,9 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
size+=12 + 1 + strlen(h->names[i]);
}
}
w.buf = toku_malloc(size);
w.size = size;
w.ndone = 0;
wbuf_init(&w, toku_malloc(size), size);
wbuf_int (&w, size);
wbuf_int (&w, h->flags);
wbuf_int (&w, h->nodesize);
wbuf_diskoff(&w, h->freelist);
wbuf_diskoff(&w, h->unused_memory);
......@@ -350,7 +463,7 @@ int serialize_brt_header_to (int fd, struct brt_header *h) {
return 0;
}
int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
int deserialize_brtheader_from (int fd, DISKOFF off, struct brt_header **brth) {
//printf("%s:%d calling MALLOC\n", __FILE__, __LINE__);
struct brt_header *MALLOC(h);
struct rbuf rc;
......@@ -376,6 +489,7 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
h->dirty=0;
sizeagain = rbuf_int(&rc);
assert(sizeagain==size);
h->flags = rbuf_int(&rc);
h->nodesize = rbuf_int(&rc);
h->freelist = rbuf_diskoff(&rc);
h->unused_memory = rbuf_diskoff(&rc);
......@@ -403,3 +517,4 @@ int deserialize_brtheader_from (int fd, diskoff off, struct brt_header **brth) {
*brth = h;
return 0;
}
......@@ -24,7 +24,7 @@ static void test0 (void) {
printf("%s:%d test0\n", __FILE__, __LINE__);
memory_check=1;
memory_check_all_free();
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
printf("%s:%d test0\n", __FILE__, __LINE__);
unlink(fname);
......@@ -47,7 +47,7 @@ static void test1 (void) {
DBT k,v;
memory_check=1;
memory_check_all_free();
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
......@@ -74,7 +74,7 @@ static void test2 (int memcheck) {
memory_check=memcheck;
printf("%s:%d checking\n", __FILE__, __LINE__);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
printf("%s:%d did setup\n", __FILE__, __LINE__);
......@@ -112,7 +112,7 @@ static void test3 (int nodesize, int count, int memcheck) {
char fname[]="testbrt.brt";
memory_check=memcheck;
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
gettimeofday(&t0, 0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun);
......@@ -145,7 +145,7 @@ static void test4 (int nodesize, int count, int memcheck) {
unlink(fname);
memory_check=memcheck;
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, nodesize, ct, default_compare_fun); assert(r==0);
for (i=0; i<count; i++) {
char key[100],val[100];
......@@ -177,7 +177,7 @@ static void test5 (void) {
MALLOC_N(limit,values);
for (i=0; i<limit; i++) values[i]=-1;
unlink(fname);
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(fname, 0, 1, &t, 1<<12, ct, default_compare_fun); assert(r==0);
for (i=0; i<limit/2; i++) {
char key[100],val[100];
......@@ -218,7 +218,7 @@ static void test_dump_empty_db (void) {
int r;
char fname[]="testbrt.brt";
memory_check=1;
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 1024, ct, default_compare_fun);
......@@ -240,7 +240,7 @@ static void test_multiple_files_of_size (int size) {
unlink(n0);
unlink(n1);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, 0, 1, &t0, size, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, 0, 1, &t1, size, ct, default_compare_fun); assert(r==0);
for (i=0; i<10000; i++) {
......@@ -264,7 +264,7 @@ static void test_multiple_files_of_size (int size) {
memory_check_all_free();
/* Now see if the data is all there. */
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, 0, 0, &t0, 1<<12, ct, default_compare_fun);
printf("%s:%d r=%d\n", __FILE__, __LINE__,r);
assert(r==0);
......@@ -309,7 +309,7 @@ static void test_named_db (void) {
unlink(n0);
unlink(n1);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
......@@ -320,7 +320,7 @@ static void test_named_db (void) {
memory_check_all_free();
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
{
......@@ -346,7 +346,7 @@ static void test_multiple_dbs (void) {
unlink(n0);
unlink(n1);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 1, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, "db2", 1, &t1, 1<<12, ct, default_compare_fun); assert(r==0);
......@@ -359,7 +359,7 @@ static void test_multiple_dbs (void) {
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n0, "db1", 0, &t0, 1<<12, ct, default_compare_fun); assert(r==0);
r = open_brt(n1, "db2", 0, &t1, 1<<12, ct, default_compare_fun); assert(r==0);
......@@ -399,7 +399,7 @@ static void test_multiple_dbs_many (void) {
printf("test_multiple_dbs_many:\n");
memory_check_all_free();
unlink(name);
r = brt_create_cachetable(&ct, MANYN+4); assert(r==0);
r = brt_create_cachetable(&ct, MANYN+4, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<MANYN; i++) {
char dbname[20];
snprintf(dbname, 20, "db%d", i);
......@@ -430,7 +430,7 @@ static void test_multiple_brts_one_db_one_file (void) {
printf("test_multiple_brts_one_db_one_file:");
memory_check_all_free();
unlink(name);
r = brt_create_cachetable(&ct, 32); assert(r==0);
r = brt_create_cachetable(&ct, 32, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<MANYN; i++) {
r = open_brt(name, 0, (i==0), &trees[i], 1<<12, ct, default_compare_fun);
assert(r==0);
......@@ -468,14 +468,13 @@ static void test_read_what_was_written (void) {
BRT brt;
int r;
const int NVALS=10000;
DBT k,v;
printf("test_read_what_was_written(): "); fflush(stdout);
unlink(n);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
r = close_brt(brt); assert(r==0);
r = cachetable_close(&ct); assert(r==0);
......@@ -483,11 +482,14 @@ static void test_read_what_was_written (void) {
memory_check_all_free();
/* Now see if we can read an empty tree in. */
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
/* See if we can put something in it. */
brt_insert(brt, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), null_db, null_txn);
{
DBT k,v;
brt_insert(brt, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), null_db, null_txn);
}
r = close_brt(brt); assert(r==0);
r = cachetable_close(&ct); assert(r==0);
......@@ -495,10 +497,11 @@ static void test_read_what_was_written (void) {
memory_check_all_free();
/* Now see if we can read it in and get the value. */
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
{
DBT k,v;
r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0);
assert(r==0);
assert(v.size==6);
......@@ -507,7 +510,7 @@ static void test_read_what_was_written (void) {
assert(verify_brt(brt)==0);
/* Now put a bunch (VALS) of things in. */
/* Now put a bunch (NVALS) of things in. */
{
int i;
for (i=0; i<NVALS; i++) {
......@@ -554,6 +557,7 @@ static void test_read_what_was_written (void) {
int i;
for (i=0; i<NVALS; i++) {
char key[100],expectedval[100];
DBT k,v;
snprintf(key, 100, "key%d", i);
snprintf(expectedval, 100, "val%d", i);
r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0);
......@@ -569,10 +573,11 @@ static void test_read_what_was_written (void) {
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = open_brt(n, 0, 0, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
{
DBT k,v;
r = brt_lookup(brt, fill_dbt(&k, "hello", 6), init_dbt(&v), 0);
assert(r==0);
assert(v.size==6);
......@@ -582,6 +587,7 @@ static void test_read_what_was_written (void) {
int i;
for (i=0; i<NVALS; i++) {
char key[100],expectedval[100];
DBT k,v;
snprintf(key, 100, "key%d", i);
snprintf(expectedval, 100, "val%d", i);
r=brt_lookup(brt, fill_dbt(&k, key, strlen(key)+1), init_dbt(&v), 0);
......@@ -614,7 +620,7 @@ void test_cursor_last_empty(void) {
unlink(n);
memory_check_all_free();
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
......@@ -646,7 +652,7 @@ void test_cursor_next (void) {
unlink(n);
memory_check_all_free();
r = brt_create_cachetable(&ct, 0); assert(r==0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
r = open_brt(n, 0, 1, &brt, 1<<12, ct, default_compare_fun); assert(r==0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
......@@ -692,7 +698,9 @@ DB nonce_db;
DBT *fill_b(DBT *x, unsigned char *key, unsigned int keylen) {
fill_dbt(x, key, keylen);
#if USE_DBT_APP_PRIVATE
x->app_private = &nonce;
#endif
return x;
}
......@@ -702,7 +710,9 @@ int wrong_compare_fun(DB *db, const DBT *a, const DBT *b) {
unsigned char *bd=b->data;
unsigned int siz=a->size;
assert(a->size==b->size);
#if USE_DBT_APP_PRIVATE
assert(a->app_private == &nonce); // a must have the nonce in it, but I don't care if b does.
#endif
assert(db==&nonce_db); // make sure the db was passed down correctly
for (i=0; i<siz; i++) {
if (ad[siz-1-i]<bd[siz-1-i]) return -1;
......@@ -732,8 +742,8 @@ static void test_wrongendian_compare (int wrong_p, unsigned int N) {
assert(wrong_compare_fun(&nonce_db, fill_dbt_ap(&at, b, 4, &nonce), fill_dbt(&bt, a, 4))<0);
}
r = brt_create_cachetable(&ct, 0); assert(r==0);
printf("%s:%d WRONG=%d\n", __FILE__, __LINE__, wrong_p);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
//printf("%s:%d WRONG=%d\n", __FILE__, __LINE__, wrong_p);
if (0) { // ???? Why is this commented out?
r = open_brt(n, 0, 1, &brt, 1<<20, ct, wrong_p ? wrong_compare_fun : default_compare_fun); assert(r==0);
......@@ -832,7 +842,9 @@ void clear_test_db() {
int test_brt_cursor_keycompare(DB *db, const DBT *a, const DBT *b) {
assert(db == test_db);
#if USE_DBT_APP_PRIVATE
assert(a->app_private == test_app_private);
#endif
return keycompare(a->data, a->size, b->data, b->size);
}
......@@ -844,7 +856,7 @@ void assert_cursor_notfound(BRT brt, int position, DB *db, void *app_private) {
r = brt_cursor(brt, &cursor);
assert(r==0);
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn);
assert(r == DB_NOTFOUND);
......@@ -863,7 +875,7 @@ void assert_cursor_value(BRT brt, int position, long long value, DB *db, void *a
assert(r==0);
if (test_cursor_debug) printf("key: ");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, position, db, null_txn);
assert(r == 0);
......@@ -889,7 +901,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db
assert(r==0);
if (test_cursor_debug) printf("first key: ");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_FIRST, db, null_txn);
assert(r == 0);
......@@ -902,7 +914,7 @@ void assert_cursor_first_last(BRT brt, long long firstv, long long lastv, DB *db
if (test_cursor_debug) printf("\n");
if (test_cursor_debug) printf("last key:");
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_LAST, db, null_txn);
assert(r == 0);
......@@ -931,7 +943,7 @@ void test_brt_cursor_first(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -977,7 +989,7 @@ void test_brt_cursor_last(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1023,7 +1035,7 @@ void test_brt_cursor_first_last(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1070,7 +1082,7 @@ void test_brt_cursor_rfirst(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1116,7 +1128,7 @@ void assert_cursor_walk(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt;
long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0)
......@@ -1148,7 +1160,7 @@ void test_brt_cursor_walk(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1192,7 +1204,7 @@ void assert_cursor_rwalk(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt;
long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_PREV, db, null_txn);
if (r != 0)
......@@ -1224,7 +1236,7 @@ void test_brt_cursor_rwalk(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1270,7 +1282,7 @@ void assert_cursor_walk_inorder(BRT brt, int n, DB *db, void *app_private) {
DBT kbt, vbt;
long long v;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0)
......@@ -1306,7 +1318,7 @@ void test_brt_cursor_rand(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1362,7 +1374,7 @@ void test_brt_cursor_split(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1385,7 +1397,7 @@ void test_brt_cursor_split(int n, DB *db) {
if (test_cursor_debug) printf("key: ");
for (i=0; i<n/2; i++) {
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = &my_app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, &my_app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
assert(r==0);
......@@ -1408,7 +1420,7 @@ void test_brt_cursor_split(int n, DB *db) {
if (test_cursor_debug) printf("key: ");
for (;;) {
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; kbt.app_private = &my_app_private;
init_dbt(&kbt); kbt.flags = DB_DBT_MALLOC; dbt_set_app_private(&kbt, &my_app_private);
init_dbt(&vbt); vbt.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursor, &kbt, &vbt, DB_NEXT, db, null_txn);
if (r != 0)
......@@ -1444,7 +1456,7 @@ void test_multiple_brt_cursors(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1498,7 +1510,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
int nodesize = 1<<12;
int h = log16(n);
int cachesize = 2 * h * ncursors * nodesize;
r = brt_create_cachetable_size(&ct, 127, cachesize);
r = brt_create_cachetable(&ct, cachesize, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1527,7 +1539,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
/* point cursor i / cursor_gap to the current last key i */
if ((i % cursor_gap) == 0) {
c = i / cursor_gap;
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private;
init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursors[c], &key, &val, DB_LAST, db, null_txn);
assert(r == 0);
......@@ -1539,7 +1551,7 @@ void test_multiple_brt_cursor_walk(int n, DB *db) {
/* walk the cursors by cursor_gap */
for (i=0; i<cursor_gap; i++) {
for (c=0; c<ncursors; c++) {
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private;
init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_cursor_get(cursors[c], &key, &val, DB_NEXT, db, null_txn);
if (r == DB_NOTFOUND) {
......@@ -1584,7 +1596,7 @@ void test_brt_cursor_set(int n, int cursor_op, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1659,7 +1671,7 @@ void test_brt_cursor_set_range(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
r = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1730,7 +1742,7 @@ void test_brt_cursor_delete(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
error = brt_create_cachetable(&ct, 0);
error = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(error == 0);
error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1755,7 +1767,7 @@ void test_brt_cursor_delete(int n, DB *db) {
/* walk the tree and delete under the cursor */
for (;;) {
init_dbt(&key); key.flags = DB_DBT_MALLOC; key.app_private = &my_app_private;
init_dbt(&key); key.flags = DB_DBT_MALLOC; dbt_set_app_private(&key, &my_app_private);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
error = brt_cursor_get(cursor, &key, &val, DB_NEXT, db, null_txn);
if (error == DB_NOTFOUND)
......@@ -1796,7 +1808,7 @@ void test_brt_cursor_get_both(int n, DB *db) {
set_test_db_app(db, &my_app_private);
unlink(fname);
error = brt_create_cachetable(&ct, 0);
error = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(error == 0);
error = open_brt(fname, 0, 1, &brt, 1<<12, ct, test_brt_cursor_keycompare);
......@@ -1900,9 +1912,6 @@ int test_brt_cursor_limit = 10000;
void test_brt_cursor(DB *db) {
int n;
int old_brt_do_push_cmd = brt_do_push_cmd;
brt_do_push_cmd = 0;
test_multiple_brt_cursors(1, db);
test_multiple_brt_cursors(2, db);
test_multiple_brt_cursors(3, db);
......@@ -1943,8 +1952,6 @@ void test_brt_cursor(DB *db) {
test_multiple_brt_cursor_walk(10000, db); memory_check_all_free();
test_multiple_brt_cursor_walk(100000, db); memory_check_all_free();
test_brt_cursor_get_both(1000, db); memory_check_all_free();
brt_do_push_cmd = old_brt_do_push_cmd;
}
void test_large_kv(int bsize, int ksize, int vsize) {
......@@ -1955,7 +1962,7 @@ void test_large_kv(int bsize, int ksize, int vsize) {
printf("test_large_kv: %d %d %d\n", bsize, ksize, vsize);
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, bsize, ct, default_compare_fun);
......@@ -2002,7 +2009,7 @@ void test_brt_delete_empty() {
CACHETABLE ct;
char fname[]="testbrt.brt";
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
......@@ -2012,7 +2019,7 @@ void test_brt_delete_empty() {
int k = htonl(1);
fill_dbt(&key, &k, sizeof k);
r = brt_delete(t, &key, 0);
assert(r != 0);
assert(r == 0);
r = close_brt(t); assert(r==0);
r = cachetable_close(&ct); assert(r==0);
......@@ -2031,7 +2038,7 @@ void test_brt_delete_present(int n) {
char fname[]="testbrt.brt";
int i;
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
......@@ -2093,7 +2100,7 @@ void test_brt_delete_not_present(int n) {
char fname[]="testbrt.brt";
int i;
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
......@@ -2140,7 +2147,7 @@ void test_brt_delete_cursor_first(int n) {
char fname[]="testbrt.brt";
int i;
r = brt_create_cachetable(&ct, 0);
r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun);
......@@ -2158,12 +2165,30 @@ void test_brt_delete_cursor_first(int n) {
assert(r == 0);
}
/* lookups 0 .. n-1 should succeed */
for (i=0; i<n; i++) {
k = htonl(i);
fill_dbt(&key, &k, sizeof k);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_lookup(t, &key, &val, 0);
assert(r == 0);
assert(val.size == sizeof (int));
int vv;
memcpy(&vv, val.data, val.size);
assert(vv == i);
toku_free(val.data);
}
/* delete 0 .. n-2 */
for (i=0; i<n-1; i++) {
k = htonl(i);
fill_dbt(&key, &k, sizeof k);
r = brt_delete(t, &key, 0);
assert(r == 0);
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = brt_lookup(t, &key, &val, 0);
assert(r == DB_NOTFOUND);
}
/* lookup of 0 .. n-2 should all fail */
......@@ -2199,6 +2224,56 @@ void test_brt_delete_cursor_first(int n) {
r = cachetable_close(&ct); assert(r==0);
}
/* Regression test for a bug in which an insert command buffered in a
   nonleaf node was removed by a delete, yet a subsequent lookup still
   found the inserted pair.

   Inserts keys 0 .. n-1 (the author's intent is that this builds a
   2 level tree with the last insertion still buffered), then deletes
   key n-1 and checks that looking it up yields DB_NOTFOUND. */
void test_insert_delete_lookup(int n) {
    printf("test_insert_delete_lookup:%d\n", n);

    BRT t;
    CACHETABLE ct;
    char fname[]="testbrt.brt";
    int r;
    int i;

    r = brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
    unlink(fname);
    r = open_brt(fname, 0, 1, &t, 4096, ct, default_compare_fun); assert(r==0);

    DBT key, val;
    int k, v;

    /* insert 0 .. n-1; keys are stored in network byte order */
    for (i=0; i<n; i++) {
        k = htonl(i);
        v = i;
        fill_dbt(&key, &k, sizeof k);
        fill_dbt(&val, &v, sizeof v);
        r = brt_insert(t, &key, &val, 0, 0);
        assert(r == 0);
    }

    if (n > 0) {
        /* delete the most recently inserted key ... */
        k = htonl(n-1);
        fill_dbt(&key, &k, sizeof k);
        r = brt_delete(t, &key, 0);
        assert(r == 0);

        /* ... and make sure it can no longer be found */
        k = htonl(n-1);
        fill_dbt(&key, &k, sizeof k);
        init_dbt(&val);
        val.flags = DB_DBT_MALLOC;
        r = brt_lookup(t, &key, &val, 0);
        assert(r == DB_NOTFOUND);
    }

    r = close_brt(t);          assert(r==0);
    r = cachetable_close(&ct); assert(r==0);
}
void test_brt_delete() {
test_brt_delete_empty(); memory_check_all_free();
test_brt_delete_present(1); memory_check_all_free();
......@@ -2210,6 +2285,8 @@ void test_brt_delete() {
test_brt_delete_cursor_first(1); memory_check_all_free();
test_brt_delete_cursor_first(100); memory_check_all_free();
test_brt_delete_cursor_first(500); memory_check_all_free();
test_brt_delete_cursor_first(10000); memory_check_all_free();
test_insert_delete_lookup(512); memory_check_all_free();
}
static void brt_blackbox_test (void) {
......@@ -2272,6 +2349,14 @@ static void brt_blackbox_test (void) {
test_brt_delete();
int old_brt_do_push_cmd = brt_do_push_cmd;
brt_do_push_cmd = 0;
test_brt_delete();
test_brt_cursor(db);
brt_do_push_cmd = old_brt_do_push_cmd;
// test3(1<<19, 1<<20, 0);
// test4(1<<19, 1<<20, 0);
......
/* Verify a BRT. */
/* Check:
* the fingerprint of every node (local check)
* the child's fingerprint matches the parent's copy
* the tree is of uniform depth (and the height is correct at every node)
* For non-dup trees: the values to the left are < the values to the right
* and < the pivot
* For dup trees: the values to the left are <= the values to the right
* the pivots are < or <= left values (according to the PresentL bit)
* the pivots are > or >= right values (according to the PresentR bit)
*
* Note: We don't yet have DUP trees, so the checks on duplicate trees are unimplemented. (Nov 1 2007)
*/
#include "brt-internal.h"
#include <assert.h>
/* Check that NODE's stored local_fingerprint matches its contents.
 *  - When height is not positive, the check is delegated to
 *    pma_verify_fingerprint() on the node's buffer.
 *  - Otherwise the fingerprint is recomputed as the sum, over every entry
 *    in every child hashtable, of rand4fingerprint * toku_calccrc32_cmd(entry),
 *    and compared with the stored value.
 * A mismatch in the recomputed sum trips an assert. */
static void verify_local_fingerprint (BRTNODE node) {
    u_int32_t fp;
    int i;

    if (node->height<=0) {
        pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->local_fingerprint);
        return;
    }

    fp=0;
    for (i=0; i<node->u.n.n_children; i++) {
        HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
                          ({
                              fp += node->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, data, datalen);
                          }));
    }
    assert(fp==node->local_fingerprint);
}
/* Check that the parent's recorded subtree fingerprint for NODE is correct.
 * The expected value is NODE's local fingerprint plus, when NODE has
 * positive height, the subtree fingerprints it records for its own children.
 * Nothing is checked when NODE has no parent_brtnode.  Asserts if the
 * parent's copy differs, or if the parent has no child whose name equals
 * NODE's thisnodename. */
static void verify_parent_fingerprint (BRTNODE node) {
    BRTNODE parent=node->parent_brtnode;
    u_int32_t expected_fingerprint;
    int childnum;

    if (!parent) return; /* nothing to compare against */

    expected_fingerprint=node->local_fingerprint;
    if (node->height>0) {
        for (childnum=0; childnum<node->u.n.n_children; childnum++) {
            expected_fingerprint+=node->u.n.child_subtree_fingerprints[childnum];
        }
    }

    assert(parent->height>0);
    for (childnum=0; childnum<parent->u.n.n_children; childnum++) {
        if (parent->u.n.children[childnum]!=node->thisnodename) continue;
        assert(parent->u.n.child_subtree_fingerprints[childnum]==expected_fingerprint);
        return;
    }
    assert(0); // no parent matches
}
/* Verify the node stored at OFF and, if RECURSE is nonzero, its whole subtree.
 *
 *  brt               the tree being verified (supplies the cachefile and node size)
 *  off               disk offset (name) of the node to verify
 *  lorange, lolen    lower bound for keys under this node; a null lorange means unbounded
 *  hirange, hilen    upper bound for keys under this node; a null hirange means unbounded
 *  recurse           nonzero to descend into the children
 *  parent_brtnode    the in-memory parent, stored into the node's parent_brtnode field
 *
 * The node is pinned in the cachetable for the duration of the check.
 * Its local fingerprint and the parent's copy of its subtree fingerprint
 * are verified (failures assert).  For a node of positive height:
 *   - every buffered key in every child hashtable must be greater than the
 *     pivot to its left (assert) and not greater than the pivot to its right
 *     (reported with printf, and makes the result nonzero);
 *   - every pivot must lie within (lorange, hirange] (assert).
 *
 * Returns the error from pinning or unpinning if either fails; otherwise
 * zero when nothing was reported and nonzero when an out-of-range key was
 * found here or a recursive call returned nonzero. */
int verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode) {
    int result=0;
    BRTNODE node;
    void *node_v;
    int r;
    if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
                                    brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize)))
        return r;
    //printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
    node=node_v;
    node->parent_brtnode = parent_brtnode;
    verify_local_fingerprint(node);
    verify_parent_fingerprint(node);
    if (node->height>0) {
        int i;
        /* Check the buffer of every child.  The loop must run up to n_children
         * (not n_children-1): there is one buffer per child, and the rightmost
         * buffer is bounded above by hirange rather than by a pivot. */
        for (i=0; i<node->u.n.n_children; i++) {
            bytevec thislorange,thishirange;
            ITEMLEN thislolen,  thishilen;
            if (i==0) {
                /* leftmost buffer inherits this node's lower bound */
                thislorange=lorange;
                thislolen  =lolen;
            } else {
                thislorange=node->u.n.childkeys[i-1];
                thislolen  =node->u.n.childkeylens[i-1];
            }
            if (i+1>=node->u.n.n_children) {
                /* rightmost buffer inherits this node's upper bound */
                thishirange=hirange;
                thishilen  =hilen;
            } else {
                thishirange=node->u.n.childkeys[i];
                thishilen  =node->u.n.childkeylens[i];
            }
            {
                /* GCC nested function: it reads the bounds, i and result from the enclosing scope. */
                void verify_pair (bytevec key, unsigned int keylen,
                                  bytevec data __attribute__((__unused__)),
                                  unsigned int datalen __attribute__((__unused__)),
                                  int type __attribute__((__unused__)),
                                  void *ignore __attribute__((__unused__))) {
                    if (thislorange) assert(keycompare(thislorange,thislolen,key,keylen)<0);
                    if (thishirange && keycompare(key,keylen,thishirange,thishilen)>0) {
                        printf("%s:%d in buffer %d key %s is bigger than %s\n", __FILE__, __LINE__, i, (char*)key, (char*)thishirange);
                        result=1;
                    }
                }
                toku_hashtable_iterate(node->u.n.htables[i], verify_pair, 0);
            }
        }
        for (i=0; i<node->u.n.n_children; i++) {
            if (i>0) {
                /* pivot i-1 must lie within this node's own key range */
                if (lorange) assert(keycompare(lorange,lolen, node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1])<0);
                if (hirange) assert(keycompare(node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1], hirange, hilen)<=0);
            }
            if (recurse) {
                /* child i is bounded by the pivots on either side, or by our own bounds at the edges */
                result|=verify_brtnode(brt, node->u.n.children[i],
                                       (i==0) ? lorange : node->u.n.childkeys[i-1],
                                       (i==0) ? lolen   : node->u.n.childkeylens[i-1],
                                       (i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i],
                                       (i==node->u.n.n_children-1) ? hilen   : node->u.n.childkeylens[i],
                                       recurse,
                                       node);
            }
        }
    }
    if ((r = cachetable_unpin(brt->cf, off, 0, 0))) return r;
    return result;
}
/* Verify an entire BRT, starting from its root.
 * Pins the header, runs verify_brtnode() recursively on the root with
 * unbounded key range and no parent, then unpins the header.
 * Returns 0 on success.  If pinning the header or verifying the tree
 * fails, that nonzero value is returned (the header is still unpinned in
 * the latter case, and the unpin result is ignored); otherwise the result
 * of unpinning the header is returned. */
int verify_brt (BRT brt) {
    int r;
    CACHEKEY *rootp;

    r = toku_read_and_pin_brt_header(brt->cf, &brt->h);
    if (r!=0) return r;

    rootp = toku_calculate_root_offset_pointer(brt);
    r = verify_brtnode(brt, *rootp, 0, 0, 0, 0, 1, null_brtnode);
    if (r!=0) {
        /* report the verification failure, but don't leave the header pinned */
        toku_unpin_brt_header(brt);
        return r;
    }

    return toku_unpin_brt_header(brt);
}
......@@ -22,11 +22,8 @@
*
*/
#include "brttypes.h"
#include "brt.h"
#include "memory.h"
#include "brt-internal.h"
#include "cachetable.h"
#include "key.h"
#include <stdlib.h>
#include <assert.h>
......@@ -35,10 +32,11 @@
#include <stdio.h>
#include <errno.h>
const BRTNODE null_brtnode=0;
extern long long n_items_malloced;
static DISKOFF malloc_diskblock (BRT brt, int size);
//static void verify_local_fingerprint_nonleaf (BRTNODE node);
/* Frees a node, including all the stuff in the hash table. */
void brtnode_free (BRTNODE *nodep) {
BRTNODE node=*nodep;
......@@ -102,14 +100,32 @@ void fix_up_parent_pointers_of_children_now_that_parent_is_gone (CACHEFILE cf, B
}
}
/* Recompute the subtree fingerprint of CHILD and store it in NODE's slot
 * CHILDNUM_OF_NODE, marking NODE dirty.
 * The subtree fingerprint is CHILD's local fingerprint plus, when CHILD has
 * positive height, the subtree fingerprints CHILD records for its children. */
static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE child) {
    u_int32_t subtree_fingerprint = child->local_fingerprint;
    int grandchildnum;
    if (child->height>0) {
        for (grandchildnum=0; grandchildnum<child->u.n.n_children; grandchildnum++) {
            subtree_fingerprint += child->u.n.child_subtree_fingerprints[grandchildnum];
        }
    }
    // Don't try to get fancy about not modifying the fingerprint if it didn't change.
    // We only call this function if we have reason to believe that the child's fingerprint did change.
    node->u.n.child_subtree_fingerprints[childnum_of_node] = subtree_fingerprint;
    node->dirty = 1;
}
void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnode_v, long size __attribute((unused)), int write_me, int keep_me) {
void brtnode_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p __attribute__((__unused__))) {
BRTNODE brtnode = brtnode_v;
// if ((write_me || keep_me) && (brtnode->height==0)) {
// pma_verify_fingerprint(brtnode->u.l.buffer, brtnode->rand4fingerprint, brtnode->subtree_fingerprint);
// }
if (0) {
printf("%s:%d brtnode_flush_callback %p keep_me=%d height=%d", __FILE__, __LINE__, brtnode, keep_me, brtnode->height);
if (brtnode->height==0) printf(" pma=%p", brtnode->u.l.buffer);
printf("\n");
}
if (modified_lsn.lsn > brtnode->lsn.lsn) brtnode->lsn=modified_lsn;
fix_up_parent_pointers_of_children_now_that_parent_is_gone(cachefile, brtnode);
assert(brtnode->thisnodename==nodename);
{
......@@ -124,7 +140,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod
assert(parent->u.n.n_children<=TREE_FANOUT+1);
for (i=0; i<parent->u.n.n_children; i++) {
//printf(" %lld\n", parent->u.n.children[i]);
if (parent->u.n.children[i]==nodename) goto ok;
if (parent->u.n.children[i]==nodename) {
// Rename the block, informing the parent of the new block
if (rename_p) {
DISKOFF newnodename = malloc_diskblock(brtnode->brt, brtnode->nodesize);
int r=tokulogger_log_block_rename(cachefile_logger(cachefile), cachefile_filenum(cachefile), nodename, newnodename, parent->thisnodename, i);
assert(r!=0); // !!! This error should be handled better (e.g., what if the disk fills up)
// !!! Don't forget to free the old node (sometime after some future checkpoint. TODO!!!)
brtnode->thisnodename=newnodename;
parent->u.n.children[i] = newnodename;
cachetable_rename(cachefile, nodename, newnodename);
}
goto ok;
}
}
printf("%s:%d Whoops, the parent of %p (%p) isn't right\n", __FILE__, __LINE__, brtnode, parent);
assert(0);
......@@ -143,18 +171,19 @@ void brtnode_flush_callback (CACHEFILE cachefile, diskoff nodename, void *brtnod
//printf("%s:%d n_items_malloced=%lld\n", __FILE__, __LINE__, n_items_malloced);
}
int brtnode_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **brtnode_pv, long *sizep __attribute__((unused)), void*extraargs) {
/* Cachetable fetch callback for BRT nodes.
 *  cachefile     file to read from (its fd is used)
 *  nodename      disk offset of the node
 *  brtnode_pv    out: receives the deserialized BRTNODE
 *  sizep         out: receives brtnode_size() of the node
 *  extraargs     the node size, passed as a pointer-sized integer
 *  written_lsn   out: receives the lsn field of the deserialized node
 * Returns the result of deserialize_brtnode_from().  The out parameters
 * sizep and written_lsn are written only on success (r==0): on failure
 * *brtnode_pv cannot be assumed to hold a valid node, so it must not be
 * dereferenced. */
int brtnode_fetch_callback (CACHEFILE cachefile, DISKOFF nodename, void **brtnode_pv, long *sizep __attribute__((unused)), void*extraargs, LSN *written_lsn) {
    long nodesize=(long)extraargs;
    BRTNODE *result=(BRTNODE*)brtnode_pv;
    int r = deserialize_brtnode_from(cachefile_fd(cachefile), nodename, result, nodesize);
    if (r == 0) {
        *sizep = brtnode_size(*result);
        *written_lsn = (*result)->lsn;
    }
    //(*result)->parent_brtnode = 0; /* Don't know it right now. */
    //printf("%s:%d installed %p (offset=%lld)\n", __FILE__, __LINE__, *result, nodename);
    return r;
}
void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *header_v, long size __attribute((unused)), int write_me, int keep_me) {
void brtheader_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *header_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
struct brt_header *h = header_v;
assert(nodename==0);
assert(!h->dirty); // shouldn't be dirty once it is unpinned.
......@@ -174,14 +203,15 @@ void brtheader_flush_callback (CACHEFILE cachefile, diskoff nodename, void *head
}
}
int brtheader_fetch_callback (CACHEFILE cachefile, diskoff nodename, void **headerp_v, long *sizep __attribute__((unused)), void*extraargs __attribute__((__unused__))) {
/* Cachetable fetch callback for the BRT header.
 *
 * The header is only ever stored at disk offset 0 (asserted).  The header is
 * deserialized into *headerp_v and the deserializer's status is returned.
 * *written_lsn is always reported as 0.
 */
int brtheader_fetch_callback (CACHEFILE cachefile, DISKOFF nodename, void **headerp_v, long *sizep __attribute__((unused)), void*extraargs __attribute__((__unused__)), LSN *written_lsn) {
    assert(nodename==0);
    struct brt_header **header_out = (struct brt_header **)headerp_v;
    int status = deserialize_brtheader_from(cachefile_fd(cachefile), nodename, header_out);
    written_lsn->lsn = 0; // !!! WRONG.  This should be stored or kept redundantly or something.
    return status;
}
int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) {
int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) {
void *header_p;
//fprintf(stderr, "%s:%d read_and_pin_brt_header(...)\n", __FILE__, __LINE__);
int r = cachetable_get_and_pin(cf, 0, &header_p, NULL,
......@@ -191,7 +221,7 @@ int read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header) {
return 0;
}
int unpin_brt_header (BRT brt) {
int toku_unpin_brt_header (BRT brt) {
int r = cachetable_unpin(brt->cf, 0, brt->h->dirty, 0);
brt->h->dirty=0;
brt->h=0;
......@@ -215,18 +245,18 @@ int kvpair_compare (const void *av, const void *bv) {
}
/* Forgot to handle the case where there is something in the freelist. */
diskoff malloc_diskblock_header_is_in_memory (BRT brt, int size) {
diskoff result = brt->h->unused_memory;
/* Allocate `size` bytes of disk space by bumping the header's unused_memory
 * offset.  Requires brt->h to be the in-memory header.
 * Returns the offset of the start of the newly reserved region.
 */
static DISKOFF malloc_diskblock_header_is_in_memory (BRT brt, int size) {
    DISKOFF block_start = brt->h->unused_memory;
    brt->h->unused_memory = block_start + size;
    return block_start;
}
diskoff malloc_diskblock (BRT brt, int size) {
DISKOFF malloc_diskblock (BRT brt, int size) {
#if 0
int r = read_and_pin_brt_header(brt->fd, &brt->h);
assert(r==0);
{
diskoff result = malloc_diskblock_header_is_in_memory(brt, size);
DISKOFF result = malloc_diskblock_header_is_in_memory(brt, size);
r = write_brt_header(brt->fd, &brt->h);
assert(r==0);
return result;
......@@ -236,26 +266,32 @@ diskoff malloc_diskblock (BRT brt, int size) {
#endif
}
static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height) {
static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) {
int i;
n->tag = TYP_BRTNODE;
n->brt = t;
n->nodesize = t->h->nodesize;
n->thisnodename = nodename;
n->lsn.lsn = 0; // a new one can always be 0.
n->layout_version = 0;
n->height = height;
n->rand4fingerprint = random();
n->local_fingerprint = 0;
brtnode_set_dirty(n);
assert(height>=0);
if (height>0) {
n->u.n.n_children = 0;
for (i=0; i<TREE_FANOUT; i++) {
n->u.n.childkeys[i] = 0;
n->u.n.childkeylens[i] = 0;
// n->u.n.childkeys[i] = 0;
// n->u.n.childkeylens[i] = 0;
}
n->u.n.totalchildkeylens = 0;
for (i=0; i<TREE_FANOUT+1; i++) {
n->u.n.children[i] = 0;
n->u.n.htables[i] = 0;
n->u.n.child_subtree_fingerprints[i] = 0;
// n->u.n.children[i] = 0;
// n->u.n.htables[i] = 0;
n->u.n.n_bytes_in_hashtable[i] = 0;
n->u.n.n_cursors[i] = 0;
n->u.n.n_cursors[i] = 0; // This one is simpler to initialize properly
}
n->u.n.n_bytes_in_hashtables = 0;
} else {
......@@ -271,7 +307,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, diskoff nodename, int height)
static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE parent_brtnode) {
TAGMALLOC(BRTNODE, n);
int r;
diskoff name = malloc_diskblock(t, t->h->nodesize);
DISKOFF name = malloc_diskblock(t, t->h->nodesize);
assert(n);
assert(t->h->nodesize>0);
//printf("%s:%d malloced %lld (and malloc again=%lld)\n", __FILE__, __LINE__, name, malloc_diskblock(t, t->nodesize));
......@@ -279,6 +315,7 @@ static void create_new_brtnode (BRT t, BRTNODE *result, int height, BRTNODE pare
*result = n;
assert(n->nodesize>0);
n->parent_brtnode = parent_brtnode;
n->brt = t;
//printf("%s:%d putting %p (%lld) parent=%p\n", __FILE__, __LINE__, n, n->thisnodename, parent_brtnode);
r=cachetable_put(t->cf, n->thisnodename, n, brtnode_size(n),
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize);
......@@ -324,6 +361,7 @@ static int insert_to_hash_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v
unsigned int n_bytes_added = BRT_CMD_OVERHEAD + KEY_VALUE_OVERHEAD + k->size + v->size;
int r = toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type);
if (r!=0) return r;
node->local_fingerprint += node->rand4fingerprint*toku_calccrc32_cmd(type, k->data, k->size, v->data, v->size);
node->u.n.n_bytes_in_hashtable[childnum] += n_bytes_added;
node->u.n.n_bytes_in_hashtables += n_bytes_added;
brtnode_set_dirty(node);
......@@ -350,8 +388,8 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
int r;
r = pma_split(node->u.l.buffer, &node->u.l.n_bytes_in_buffer,
A->u.l.buffer, &A->u.l.n_bytes_in_buffer,
B->u.l.buffer, &B->u.l.n_bytes_in_buffer);
A->u.l.buffer, &A->u.l.n_bytes_in_buffer, A->rand4fingerprint, &A->local_fingerprint,
B->u.l.buffer, &B->u.l.n_bytes_in_buffer, B->rand4fingerprint, &B->local_fingerprint);
assert(r == 0);
r = pma_get_last(A->u.l.buffer, splitk, 0);
......@@ -393,6 +431,14 @@ int brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
return 0;
}
/* Transfer the fingerprint contribution of a hashtable from one node to another.
 *
 * Sums the CRC of every command stored in table_being_moved, then removes that
 * sum (scaled by oldnode's rand4fingerprint) from oldnode->local_fingerprint
 * and adds it (scaled by newnode's rand4fingerprint) to
 * newnode->local_fingerprint.  The table itself is not modified.
 */
static void brt_update_fingerprint_when_moving_hashtable (BRTNODE oldnode, BRTNODE newnode, HASHTABLE table_being_moved) {
    u_int32_t crc_total = 0;
    HASHTABLE_ITERATE(table_being_moved, hkey, hkeylen, hdata, hdatalen, htype,
                      crc_total += toku_calccrc32_cmd(htype, hkey, hkeylen, hdata, hdatalen));
    oldnode->local_fingerprint -= crc_total * oldnode->rand4fingerprint;
    newnode->local_fingerprint += crc_total * newnode->rand4fingerprint;
}
/* Side effect: sets splitk->data pointer to a malloc'd value */
void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) {
int n_children_in_a = node->u.n.n_children/2;
......@@ -412,23 +458,31 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT
* The splitter key is key number n_children_in_a */
int i;
for (i=0; i<n_children_in_a; i++) {
HASHTABLE htab = node->u.n.htables[i];
A->u.n.children[i] = node->u.n.children[i];
A->u.n.htables[i] = node->u.n.htables[i];
A->u.n.htables[i] = htab;
A->u.n.n_bytes_in_hashtables += (A->u.n.n_bytes_in_hashtable[i] = node->u.n.n_bytes_in_hashtable[i]);
A->u.n.child_subtree_fingerprints[i] = node->u.n.child_subtree_fingerprints[i];
node->u.n.htables[i] = 0;
node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i];
node->u.n.n_bytes_in_hashtable[i] = 0;
brt_update_fingerprint_when_moving_hashtable(node, A, htab);
}
for (i=n_children_in_a; i<node->u.n.n_children; i++) {
int targchild = i-n_children_in_a;
HASHTABLE htab = node->u.n.htables[i];
B->u.n.children[targchild] = node->u.n.children[i];
B->u.n.htables[targchild] = node->u.n.htables[i];
B->u.n.htables[targchild] = htab;
B->u.n.n_bytes_in_hashtables += (B->u.n.n_bytes_in_hashtable[targchild] = node->u.n.n_bytes_in_hashtable[i]);
B->u.n.child_subtree_fingerprints[targchild] = node->u.n.child_subtree_fingerprints[i];
node->u.n.htables[i] = 0;
node->u.n.n_bytes_in_hashtables -= node->u.n.n_bytes_in_hashtable[i];
node->u.n.n_bytes_in_hashtable[i] = 0;
brt_update_fingerprint_when_moving_hashtable(node, B, htab);
}
for (i=0; i<n_children_in_a-1; i++) {
A->u.n.childkeys[i] = node->u.n.childkeys[i];
......@@ -456,6 +510,8 @@ void brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT
fix_up_parent_pointers_of_children(t, A);
fix_up_parent_pointers_of_children(t, B);
//verify_local_fingerprint_nonleaf(A);
//verify_local_fingerprint_nonleaf(B);
}
{
......@@ -523,22 +579,23 @@ static int push_brt_cmd_down_only_if_it_wont_push_more_else_put_here (BRT t, BRT
printf("\n");
}
}
int r;
if (to_child) {
int again_split=-1; BRTNODE againa,againb;
DBT againk;
init_dbt(&againk);
//printf("%s:%d hello!\n", __FILE__, __LINE__);
int r = brtnode_put_cmd(t, child, cmd,
r = brtnode_put_cmd(t, child, cmd,
&again_split, &againa, &againb, &againk,
0,
txn);
if (r!=0) return r;
assert(again_split==0); /* I only did the insert if I knew it wouldn't push down, and hence wouldn't split. */
return r;
} else {
int r=insert_to_hash_in_nonleaf(node, childnum_of_node, k, v, cmd->type);
return r;
r=insert_to_hash_in_nonleaf(node, childnum_of_node, k, v, cmd->type);
}
fixup_child_fingerprint(node, childnum_of_node, child);
return r;
}
static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum,
......@@ -549,7 +606,6 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
//if (debug) printf("%s:%d %*sinserting down\n", __FILE__, __LINE__, debug, "");
//printf("%s:%d hello!\n", __FILE__, __LINE__);
assert(node->height>0);
{
int r = brtnode_put_cmd(t, child, cmd,
child_did_split, childa, childb, childsplitk,
......@@ -561,6 +617,7 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val;
//if (debug) printf("%s:%d %*sinserted down child_did_split=%d\n", __FILE__, __LINE__, debug, "", child_did_split);
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_cmdstruct(cmd);
{
int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size); // Must delete after doing the insert, to avoid operating on freed' key
//printf("%s:%d deleted status=%d\n", __FILE__, __LINE__, r);
......@@ -572,7 +629,12 @@ static int push_a_brt_cmd_down (BRT t, BRTNODE node, BRTNODE child, int childnum
node->u.n.n_bytes_in_hashtable[childnum] -= n_bytes_removed;
brtnode_set_dirty(node);
}
if (*child_did_split) {
fixup_child_fingerprint(node, childnum, *childa);
fixup_child_fingerprint(node, childnum+1, *childb);
} else {
fixup_child_fingerprint(node, childnum, child);
}
return 0;
}
......@@ -611,19 +673,29 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
brtnode_set_dirty(node);
//verify_local_fingerprint_nonleaf(node);
// Slide the children over.
for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) {
node->u.n.children[cnum] = node->u.n.children[cnum-1];
node->u.n.htables[cnum] = node->u.n.htables[cnum-1];
node->u.n.child_subtree_fingerprints[cnum] = node->u.n.child_subtree_fingerprints[cnum-1];
node->u.n.n_bytes_in_hashtable[cnum] = node->u.n.n_bytes_in_hashtable[cnum-1];
node->u.n.n_cursors[cnum] = node->u.n.n_cursors[cnum-1];
}
node->u.n.children[childnum] = childa->thisnodename;
node->u.n.children[childnum+1] = childb->thisnodename;
fixup_child_fingerprint(node, childnum, childa);
fixup_child_fingerprint(node, childnum+1, childb);
toku_hashtable_create(&node->u.n.htables[childnum]);
toku_hashtable_create(&node->u.n.htables[childnum+1]);
node->u.n.n_bytes_in_hashtable[childnum] = 0;
node->u.n.n_bytes_in_hashtable[childnum+1] = 0;
// Remove all the cmds from the local fingerprint. Some may get added in again when we try to push to the child.
HASHTABLE_ITERATE(old_h, skey, skeylen, sval, svallen, type,
node->local_fingerprint -= node->rand4fingerprint*toku_calccrc32_cmd(type, skey, skeylen, sval, svallen));
// Slide the keys over
for (cnum=node->u.n.n_children-1; cnum>childnum; cnum--) {
node->u.n.childkeys[cnum] = node->u.n.childkeys[cnum-1];
......@@ -651,26 +723,31 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
fill_dbt(&svd, sval, svallen);
BRT_CMD brtcmd;
brtcmd.type = type; brtcmd.u.id.key = &skd; brtcmd.u.id.val = &svd; brtcmd.u.id.db = db;
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if (t->compare_fun(db, &skd, childsplitk)<=0) {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childa, &brtcmd, childnum, txn);
} else {
r=push_brt_cmd_down_only_if_it_wont_push_more_else_put_here(t, node, childb, &brtcmd, childnum+1, txn);
}
//verify_local_fingerprint_nonleaf(childa); verify_local_fingerprint_nonleaf(childb);
if (r!=0) return r;
}));
toku_hashtable_free(&old_h);
r=cachetable_unpin(t->cf, childa->thisnodename, childa->dirty, brtnode_size(childa));
assert(r==0);
r=cachetable_unpin(t->cf, childb->thisnodename, childb->dirty, brtnode_size(childb));
assert(r==0);
//verify_local_fingerprint_nonleaf(childa);
//verify_local_fingerprint_nonleaf(childb);
//verify_local_fingerprint_nonleaf(node);
verify_counts(node);
verify_counts(childa);
verify_counts(childb);
r=cachetable_unpin(t->cf, childa->thisnodename, childa->dirty, brtnode_size(childa));
assert(r==0);
r=cachetable_unpin(t->cf, childb->thisnodename, childb->dirty, brtnode_size(childb));
assert(r==0);
if (node->u.n.n_children>TREE_FANOUT) {
//printf("%s:%d about to split having pushed %d out of %d keys\n", __FILE__, __LINE__, i, n_pairs);
brt_nonleaf_split(t, node, nodea, nodeb, splitk);
......@@ -685,6 +762,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0);
assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
} else {
*did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize);
......@@ -703,7 +782,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
BRTNODE child;
int r;
assert(node->height>0);
diskoff targetchild = node->u.n.children[childnum];
DISKOFF targetchild = node->u.n.children[childnum];
assert(targetchild>=0 && targetchild<t->h->unused_memory); // This assertion could fail in a concurrent setting since another process might have bumped unused memory.
r = cachetable_get_and_pin(t->cf, targetchild, &childnode_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)t->h->nodesize);
......@@ -711,6 +790,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
//printf("%s:%d pin %p\n", __FILE__, __LINE__, childnode_v);
child=childnode_v;
child->parent_brtnode = node;
//verify_local_fingerprint_nonleaf(child);
verify_counts(child);
//printf("%s:%d height=%d n_bytes_in_hashtable = {%d, %d, %d, ...}\n", __FILE__, __LINE__, child->height, child->n_bytes_in_hashtable[0], child->n_bytes_in_hashtable[1], child->n_bytes_in_hashtable[2]);
if (child->height>0 && child->u.n.n_children>0) assert(child->u.n.children[child->u.n.n_children-1]!=0);
......@@ -746,8 +826,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
//printf("%s:%d random_picked\n", __FILE__, __LINE__);
init_dbt(&childsplitk);
childsplitk.app_private = splitk->app_private;
dbt_set_app_private(&childsplitk, dbt_get_app_private(splitk));
if (debug) printf("%s:%d %*spush down %s\n", __FILE__, __LINE__, debug, "", (char*)key);
r = push_a_brt_cmd_down (t, node, child, childnum,
&brtcmd,
......@@ -757,7 +836,7 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
if (0){
unsigned int sum=0;
HASHTABLE_ITERATE(node->u.n.htables[childnum], hk __attribute__((__unused__)), hkl, hd __attribute__((__unused__)), hdl, type __attribute__((__unused__)),
HASHTABLE_ITERATE(node->u.n.htables[childnum], subhk __attribute__((__unused__)), hkl, hd __attribute__((__unused__)), hdl, subtype __attribute__((__unused__)),
sum+=hkl+hdl+KEY_VALUE_OVERHEAD+BRT_CMD_OVERHEAD);
printf("%s:%d sum=%d\n", __FILE__, __LINE__, sum);
assert(sum==node->u.n.n_bytes_in_hashtable[childnum]);
......@@ -772,16 +851,21 @@ static int push_some_brt_cmds_down (BRT t, BRTNODE node, int childnum,
childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk,
app_private, db, txn);
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//}
return r; /* Don't do any more pushing if the child splits. */
}
}
if (0) printf("%s:%d done random picking\n", __FILE__, __LINE__);
}
if (debug) printf("%s:%d %*sdone push_some_brt_cmds_down, unpinning %lld\n", __FILE__, __LINE__, debug, "", targetchild);
assert(serialize_brtnode_size(node)<=node->nodesize);
//verify_local_fingerprint_nonleaf(node);
r=cachetable_unpin(t->cf, targetchild, child->dirty, brtnode_size(child));
if (r!=0) return r;
*did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize);
return 0;
}
......@@ -816,6 +900,8 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
assert((*nodeb)->u.n.n_children>0);
assert((*nodea)->u.n.children[(*nodea)->u.n.n_children-1]!=0);
assert((*nodeb)->u.n.children[(*nodeb)->u.n.n_children-1]!=0);
//verify_local_fingerprint_nonleaf(*nodea);
//verify_local_fingerprint_nonleaf(*nodeb);
} else {
assert(serialize_brtnode_size(node)<=node->nodesize);
}
......@@ -824,6 +910,12 @@ static int brtnode_maybe_push_down(BRT t, BRTNODE node, int *did_split, BRTNODE
*did_split=0;
assert(serialize_brtnode_size(node)<=node->nodesize);
}
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return 0;
}
......@@ -833,13 +925,14 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
int debug,
TOKUTXN txn) {
// pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
if (cmd->type == BRT_INSERT) {
DBT *k = cmd->u.id.key;
DBT *v = cmd->u.id.val;
DB *db = cmd->u.id.db;
#ifdef INSERT_ALL_AT_ONCE
int replaced_v_size;
enum pma_errors pma_status = pma_insert_or_replace(node->u.l.buffer, k, v, &replaced_v_size, db, txn, node->thisnodename);
enum pma_errors pma_status = pma_insert_or_replace(node->u.l.buffer, k, v, &replaced_v_size, db, txn, node->thisnodename, node->rand4fingerprint, &node->local_fingerprint);
assert(pma_status==BRT_OK);
//printf("replaced_v_size=%d\n", replaced_v_size);
if (replaced_v_size>=0) {
......@@ -859,9 +952,12 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
node->u.l.n_bytes_in_buffer += k->size + v->size + KEY_VALUE_OVERHEAD;
#endif
brtnode_set_dirty(node);
// pma_verify_fingerprint(node->u.l.buffer, node->rand4fingerprint, node->subtree_fingerprint);
// If it doesn't fit, then split the leaf.
if (serialize_brtnode_size(node) > node->nodesize) {
int r = brtleaf_split (t, node, nodea, nodeb, splitk, k->app_private, db);
int r = brtleaf_split (t, node, nodea, nodeb, splitk, dbt_get_app_private(k), db);
if (r!=0) return r;
//printf("%s:%d splitkey=%s\n", __FILE__, __LINE__, (char*)*splitkey);
split_count++;
......@@ -870,6 +966,8 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
if (debug) printf("%s:%d %*snodeb->thisnodename=%lld nodeb->size=%d\n", __FILE__, __LINE__, debug, "", (*nodeb)->thisnodename, (*nodeb)->nodesize);
assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
// pma_verify_fingerprint((*nodea)->u.l.buffer, (*nodea)->rand4fingerprint, (*nodea)->subtree_fingerprint);
// pma_verify_fingerprint((*nodeb)->u.l.buffer, (*nodeb)->rand4fingerprint, (*nodeb)->subtree_fingerprint);
} else {
*did_split = 0;
}
......@@ -884,13 +982,13 @@ static int brt_leaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
init_dbt(&val);
r = pma_lookup(node->u.l.buffer, cmd->u.id.key, &val, cmd->u.id.db);
if (r == 0) {
r = pma_delete(node->u.l.buffer, cmd->u.id.key, cmd->u.id.db);
r = pma_delete(node->u.l.buffer, cmd->u.id.key, cmd->u.id.db, node->rand4fingerprint, &node->local_fingerprint);
assert(r == BRT_OK);
node->u.l.n_bytes_in_buffer -= cmd->u.id.key->size + val.size + KEY_VALUE_OVERHEAD;
brtnode_set_dirty(node);
}
*did_split = 0;
return r;
return BRT_OK;
}
/* unknown message */
......@@ -950,9 +1048,11 @@ static int brt_nonleaf_put_cmd_child (BRT t, BRTNODE node, BRT_CMD *cmd,
r = handle_split_of_child(t, node, childnum,
childa, childb, &childsplitk,
did_split, nodea, nodeb, splitk,
k->app_private, db, txn);
dbt_get_app_private(k), db, txn);
assert(r == 0);
} else {
//verify_local_fingerprint_nonleaf(child);
fixup_child_fingerprint(node, childnum, child);
int rr = cachetable_unpin(t->cf, child->thisnodename, child->dirty, brtnode_size(child));
assert(rr == 0);
}
......@@ -966,8 +1066,7 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
DBT *splitk,
int debug,
TOKUTXN txn) {
bytevec olddata;
ITEMLEN olddatalen;
//verify_local_fingerprint_nonleaf(node);
unsigned int childnum;
int found;
......@@ -982,41 +1081,61 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
if (node->u.n.n_cursors[childnum] > 0) {
assert(node->u.n.n_bytes_in_hashtable[childnum] == 0);
int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 0);
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return r;
}
found = !toku_hash_find(node->u.n.htables[childnum], k->data, k->size, &olddata, &olddatalen, &type);
if (debug) printf("%s:%d %*sDoing hash_insert\n", __FILE__, __LINE__, debug, "");
verify_counts(node);
if (found) {
int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size);
int diff = k->size + olddatalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
assert(r==0);
node->u.n.n_bytes_in_hashtables -= diff;
node->u.n.n_bytes_in_hashtable[childnum] -= diff;
brtnode_set_dirty(node);
//printf("%s:%d deleted %d bytes\n", __FILE__, __LINE__, diff);
//verify_local_fingerprint_nonleaf(node);
{
int anytype;
bytevec olddata;
ITEMLEN olddatalen;
found = !toku_hash_find(node->u.n.htables[childnum], k->data, k->size, &olddata, &olddatalen, &anytype);
//verify_local_fingerprint_nonleaf(node);
if (debug) printf("%s:%d %*sDoing hash_insert\n", __FILE__, __LINE__, debug, "");
verify_counts(node);
if (found) {
//printf("%s:%d found and deleting\n", __FILE__, __LINE__);
node->local_fingerprint -= node->rand4fingerprint * toku_calccrc32_cmd(anytype, k->data, k->size, olddata, olddatalen);
int r = toku_hash_delete(node->u.n.htables[childnum], k->data, k->size);
/* Be careful, olddata is now invalid because of the delete. */
int diff = k->size + olddatalen + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
assert(r==0);
node->u.n.n_bytes_in_hashtables -= diff;
node->u.n.n_bytes_in_hashtable[childnum] -= diff;
brtnode_set_dirty(node);
//printf("%s:%d deleted %d bytes\n", __FILE__, __LINE__, diff);
}
}
//verify_local_fingerprint_nonleaf(node);
/* if the child is in the cache table then push the cmd to it
otherwise just put it into this node's buffer */
if (brt_do_push_cmd) {
int r = brt_nonleaf_put_cmd_child(t, node, cmd, did_split, nodea, nodeb, splitk, debug, txn, childnum, 1);
if (r == 0)
if (r == 0) {
//printf("%s:%d\n", __FILE__, __LINE__);
return r;
}
}
//verify_local_fingerprint_nonleaf(node);
{
int diff = k->size + v->size + KEY_VALUE_OVERHEAD + BRT_CMD_OVERHEAD;
int r=toku_hash_insert(node->u.n.htables[childnum], k->data, k->size, v->data, v->size, type);
assert(r==0);
node->local_fingerprint += node->rand4fingerprint * toku_calccrc32_cmd(type, k->data, k->size, v->data, v->size);
node->u.n.n_bytes_in_hashtables += diff;
node->u.n.n_bytes_in_hashtable[childnum] += diff;
brtnode_set_dirty(node);
}
if (debug) printf("%s:%d %*sDoing maybe_push_down\n", __FILE__, __LINE__, debug, "");
int r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), k->app_private, db, txn);
//verify_local_fingerprint_nonleaf(node);
int r = brtnode_maybe_push_down(t, node, did_split, nodea, nodeb, splitk, debugp1(debug), dbt_get_app_private(k), db, txn);
if (r!=0) return r;
if (debug) printf("%s:%d %*sDid maybe_push_down\n", __FILE__, __LINE__, debug, "");
if (*did_split) {
......@@ -1032,39 +1151,80 @@ static int brt_nonleaf_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
assert(serialize_brtnode_size(node)<=node->nodesize);
verify_counts(node);
}
//if (*did_split) {
// verify_local_fingerprint_nonleaf(*nodea);
// verify_local_fingerprint_nonleaf(*nodeb);
//} else {
// verify_local_fingerprint_nonleaf(node);
//}
return 0;
}
//static void verify_local_fingerprint_nonleaf (BRTNODE node) {
// u_int32_t fp=0;
// int i;
// if (node->height==0) return;
// for (i=0; i<node->u.n.n_children; i++)
// HASHTABLE_ITERATE(node->u.n.htables[i], key, keylen, data, datalen, type,
// ({
// fp += node->rand4fingerprint * toku_calccrc32_cmd(type, key, keylen, data, datalen);
// }));
// assert(fp==node->local_fingerprint);
//}
/* Apply `cmd` to the subtree rooted at `node`.
 *
 * Dispatches on the node's height: height 0 is handled by brt_leaf_put_cmd(),
 * anything else by brt_nonleaf_put_cmd().  did_split, nodea, nodeb and splitk
 * are passed straight through to the handler, which reports through them
 * whether the node split and what it split into.
 *
 * Returns the handler's status.  On a nonzero status the split outputs are not
 * inspected, because the handlers have error returns that may not set them.
 * On success, asserts that every resulting node still serializes within its
 * nodesize.
 */
static int brtnode_put_cmd (BRT t, BRTNODE node, BRT_CMD *cmd,
                            int *did_split, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk,
                            int debug,
                            TOKUTXN txn) {
    int r;
    if (node->height==0) {
        r = brt_leaf_put_cmd(t, node, cmd,
                             did_split, nodea, nodeb, splitk,
                             debug, txn);
    } else {
        r = brt_nonleaf_put_cmd(t, node, cmd,
                                did_split, nodea, nodeb, splitk,
                                debug, txn);
    }
    // On failure *did_split, *nodea and *nodeb may never have been written; don't read them.
    if (r != 0) return r;
    // Watch out.  If did_split then the original node is no longer allocated.
    if (*did_split) {
        assert(serialize_brtnode_size(*nodea)<=(*nodea)->nodesize);
        assert(serialize_brtnode_size(*nodeb)<=(*nodeb)->nodesize);
    } else {
        assert(serialize_brtnode_size(node)<=node->nodesize);
    }
    return 0;
}
int brt_create_cachetable_size(CACHETABLE *ct, int hashsize, long cachesize) {
return create_cachetable(ct, hashsize, cachesize);
}
//enum {n_nodes_in_cache =64};
enum {n_nodes_in_cache =127};
int brt_create_cachetable (CACHETABLE *ct, int cachelines) {
if (cachelines==0) cachelines=n_nodes_in_cache;
assert(cachelines>0);
return brt_create_cachetable_size(ct, cachelines, (cachelines+1)*1024*1024);
/* Create a cachetable for use by BRTs.
 *
 * A cachesize of 0 selects the default of 128*1024*1024; any other value is
 * passed through unchanged.  initial_lsn and logger are forwarded to
 * create_cachetable(), whose status is returned.
 */
int brt_create_cachetable(CACHETABLE *ct, long cachesize, LSN initial_lsn, TOKULOGGER logger) {
    long effective_size = (cachesize == 0) ? 128*1024*1024 : cachesize;
    return create_cachetable(ct, effective_size, initial_lsn, logger);
}
static int setup_brt_root_node (BRT t, diskoff offset) {
static int setup_brt_root_node (BRT t, DISKOFF offset) {
int r;
TAGMALLOC(BRTNODE, node);
assert(node);
......@@ -1073,6 +1233,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
offset, /* the location is one nodesize offset from 0. */
0);
node->parent_brtnode=0;
node->brt = t;
if (0) {
printf("%s:%d for tree %p node %p mdict_create--> %p\n", __FILE__, __LINE__, t, node, node->u.l.buffer);
printf("%s:%d put root at %lld\n", __FILE__, __LINE__, offset);
......@@ -1086,6 +1247,7 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
}
//printf("%s:%d created %lld\n", __FILE__, __LINE__, node->thisnodename);
verify_counts(node);
// verify_local_fingerprint_nonleaf(node);
r=cachetable_unpin(t->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (r!=0) {
toku_free(node);
......@@ -1101,23 +1263,49 @@ static int setup_brt_root_node (BRT t, diskoff offset) {
#define WHEN_BRTTRACE(x) ((void)0)
#endif
int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable,
int (*compare_fun)(DB*,const DBT*,const DBT*)) {
/* Allocate a BRT handle and fill in its defaults.
 *
 * The handle is zero-filled, then given flags 0, the default node size
 * (BRT_DEFAULT_NODE_SIZE) and default_compare_fun for both key and duplicate
 * comparison.
 *
 * Returns 0 and stores the handle in *brt_ptr on success; returns ENOMEM
 * (leaving *brt_ptr untouched) if allocation fails.
 */
int brt_create(BRT *brt_ptr) {
    BRT handle = toku_malloc(sizeof *handle);
    if (!handle) {
        return ENOMEM;
    }
    memset(handle, 0, sizeof *handle);
    handle->flags       = 0;
    handle->nodesize    = BRT_DEFAULT_NODE_SIZE;
    handle->compare_fun = default_compare_fun;
    handle->dup_compare = default_compare_fun;
    *brt_ptr = handle;
    return 0;
}
/* Record `flags` on the handle.  Always returns 0. */
int brt_set_flags(BRT brt, int flags) {
    brt->flags = flags;

    return 0;
}
/* Record `nodesize` on the handle; no validation is done here.
 * Always returns 0. */
int brt_set_nodesize(BRT brt, int nodesize) {
    brt->nodesize = nodesize;

    return 0;
}
/* Install `bt_compare` as the handle's compare_fun.  Always returns 0. */
int brt_set_bt_compare(BRT brt, int (*bt_compare)(DB *, const DBT*, const DBT*)) {
    brt->compare_fun = bt_compare;

    return 0;
}
/* Install `dup_compare` as the handle's dup_compare.  Always returns 0. */
int brt_set_dup_compare(BRT brt, int (*dup_compare)(DB *, const DBT*, const DBT*)) {
    brt->dup_compare = dup_compare;

    return 0;
}
int brt_open(BRT t, const char *fname, const char *dbname, int is_create, CACHETABLE cachetable) {
/* If dbname is NULL then we setup to hold a single tree. Otherwise we setup an array. */
int r;
BRT t;
char *malloced_name=0;
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE: %s:%d open_brt(%s, \"%s\", %d, %p, %d, %p)\n",
__FILE__, __LINE__, fname, dbname, is_create, newbrt, nodesize, cachetable));
if ((MALLOC(t))==0) {
assert(errno==ENOMEM);
r = ENOMEM;
if (0) { died0: toku_free(t); }
return r;
}
t->compare_fun = compare_fun;
t->skey = t->sval = 0;
if (0) { died0: assert(r); return r; }
if (dbname) {
malloced_name = toku_strdup(dbname);
if (malloced_name==0) {
......@@ -1130,12 +1318,13 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
r=cachetable_openf(&t->cf, cachetable, fname, O_RDWR | (is_create ? O_CREAT : 0), 0777);
if (r!=0) {
if (0) { died1: cachefile_close(&t->cf); }
t->database_name = 0;
goto died0a;
}
assert(nodesize>0);
assert(t->nodesize>0);
//printf("%s:%d %d alloced\n", __FILE__, __LINE__, get_n_items_malloced()); print_malloced_items();
if (is_create) {
r = read_and_pin_brt_header(t->cf, &t->h);
r = toku_read_and_pin_brt_header(t->cf, &t->h);
if (r==-1) {
/* construct a new header. */
if ((MALLOC(t->h))==0) {
......@@ -1145,23 +1334,24 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
goto died1;
}
t->h->dirty=1;
t->h->nodesize=nodesize;
t->h->flags = t->flags;
t->h->nodesize=t->nodesize;
t->h->freelist=-1;
t->h->unused_memory=2*nodesize;
t->h->unused_memory=2*t->nodesize;
if (dbname) {
t->h->unnamed_root = -1;
t->h->n_named_roots = 1;
if ((MALLOC_N(1, t->h->names))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died3: toku_free(t->h->names); } goto died2; }
if ((MALLOC_N(1, t->h->roots))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died4: toku_free(t->h->roots); } goto died3; }
if ((t->h->names[0] = toku_strdup(dbname))==0) { assert(errno==ENOMEM); r=ENOMEM; if (0) { died5: toku_free(t->h->names[0]); } goto died4; }
t->h->roots[0] = nodesize;
t->h->roots[0] = t->nodesize;
} else {
t->h->unnamed_root = nodesize;
t->h->unnamed_root = t->nodesize;
t->h->n_named_roots = -1;
t->h->names=0;
t->h->roots=0;
}
if ((r=setup_brt_root_node(t, nodesize))!=0) { if (dbname) goto died5; else goto died2; }
if ((r=setup_brt_root_node(t, t->nodesize))!=0) { if (dbname) goto died5; else goto died2; }
if ((r=cachetable_put(t->cf, 0, t->h, 0, brtheader_flush_callback, brtheader_fetch_callback, 0))) { if (dbname) goto died5; else goto died2; }
} else {
int i;
......@@ -1178,18 +1368,18 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
if ((t->h->roots = toku_realloc(t->h->roots, (1+t->h->n_named_roots)*sizeof(*t->h->roots))) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; }
t->h->n_named_roots++;
if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0) { assert(errno==ENOMEM); r=ENOMEM; goto died1; }
printf("%s:%d t=%p\n", __FILE__, __LINE__, t);
//printf("%s:%d t=%p\n", __FILE__, __LINE__, t);
t->h->roots[t->h->n_named_roots-1] = malloc_diskblock_header_is_in_memory(t, t->h->nodesize);
t->h->dirty = 1;
if ((r=setup_brt_root_node(t, t->h->roots[t->h->n_named_roots-1]))!=0) goto died1;
}
} else {
if ((r = read_and_pin_brt_header(t->cf, &t->h))!=0) goto died1;
if ((r = toku_read_and_pin_brt_header(t->cf, &t->h))!=0) goto died1;
if (!dbname) {
if (t->h->n_named_roots!=-1) { r = -2; /* invalid args??? */; goto died1; }
} else {
int i;
printf("%s:%d n_roots=%d\n", __FILE__, __LINE__, t->h->n_named_roots);
// printf("%s:%d n_roots=%d\n", __FILE__, __LINE__, t->h->n_named_roots);
for (i=0; i<t->h->n_named_roots; i++) {
if (strcmp(t->h->names[i], dbname)==0) {
goto found_it;
......@@ -1199,17 +1389,39 @@ int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt,
r=ENOENT; /* the database doesn't exist */
goto died1;
}
found_it: ;
found_it:
t->nodesize = t->h->nodesize; /* inherit the pagesize from the file */
if (t->flags != t->h->flags) { /* flags must match */
r = EINVAL; goto died1;
}
}
assert(t->h);
if ((r = unpin_brt_header(t)) !=0) goto died1;
if ((r = toku_unpin_brt_header(t)) !=0) goto died1;
assert(t->h==0);
WHEN_BRTTRACE(fprintf(stderr, "BRTTRACE -> %p\n", t));
t->cursors_head = t->cursors_tail = 0;
*newbrt = t;
return 0;
}
/* Convenience wrapper around the create/configure/open sequence.
 * Creates a BRT handle, applies NODESIZE and COMPARE_FUN to it, and opens it
 * on FNAME/DBNAME through CACHETABLE.
 * Returns 0 and stores the handle in *newbrt on success.  Otherwise returns
 * the nonzero code from brt_create() or brt_open() and leaves *newbrt alone.
 * The return values of the two setters are not examined.
 * NOTE(review): when brt_open() fails the handle is not released here --
 * confirm whether brt_open() cleans up after itself. */
int open_brt (const char *fname, const char *dbname, int is_create, BRT *newbrt, int nodesize, CACHETABLE cachetable,
              int (*compare_fun)(DB*,const DBT*,const DBT*)) {
    BRT handle;
    int status = brt_create(&handle);
    if (status == 0) {
        brt_set_nodesize(handle, nodesize);
        brt_set_bt_compare(handle, compare_fun);
        status = brt_open(handle, fname, dbname, is_create, cachetable);
        if (status == 0) {
            *newbrt = handle;
        }
    }
    return status;
}
int close_brt (BRT brt) {
int r;
while (brt->cursors_head) {
......@@ -1217,9 +1429,11 @@ int close_brt (BRT brt) {
r=brt_cursor_close(c);
if (r!=0) return r;
}
assert(0==cachefile_count_pinned(brt->cf, 1));
//printf("%s:%d closing cachetable\n", __FILE__, __LINE__);
if ((r = cachefile_close(&brt->cf))!=0) return r;
if (brt->cf) {
assert(0==cachefile_count_pinned(brt->cf, 1));
//printf("%s:%d closing cachetable\n", __FILE__, __LINE__);
if ((r = cachefile_close(&brt->cf))!=0) return r;
}
if (brt->database_name) toku_free(brt->database_name);
if (brt->skey) { toku_free(brt->skey); }
if (brt->sval) { toku_free(brt->sval); }
......@@ -1229,7 +1443,7 @@ int close_brt (BRT brt) {
int brt_debug_mode = 0;//strcmp(key,"hello387")==0;
CACHEKEY* calculate_root_offset_pointer (BRT brt) {
CACHEKEY* toku_calculate_root_offset_pointer (BRT brt) {
if (brt->database_name==0) {
return &brt->h->unnamed_root;
} else {
......@@ -1246,7 +1460,7 @@ CACHEKEY* calculate_root_offset_pointer (BRT brt) {
int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *rootp) {
TAGMALLOC(BRTNODE, newroot);
int r;
diskoff newroot_diskoff=malloc_diskblock(brt, brt->h->nodesize);
DISKOFF newroot_diskoff=malloc_diskblock(brt, brt->h->nodesize);
assert(newroot);
*rootp=newroot_diskoff;
brt->h->dirty=1;
......@@ -1260,9 +1474,15 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE
newroot->u.n.totalchildkeylens=splitk.size;
newroot->u.n.children[0]=nodea->thisnodename;
newroot->u.n.children[1]=nodeb->thisnodename;
nodea->parent_brtnode = newroot;
nodeb->parent_brtnode = newroot;
fixup_child_fingerprint(newroot, 0, nodea);
fixup_child_fingerprint(newroot, 1, nodeb);
r=toku_hashtable_create(&newroot->u.n.htables[0]); if (r!=0) return r;
r=toku_hashtable_create(&newroot->u.n.htables[1]); if (r!=0) return r;
verify_counts(newroot);
//verify_local_fingerprint_nonleaf(nodea);
//verify_local_fingerprint_nonleaf(nodeb);
r=cachetable_unpin(brt->cf, nodea->thisnodename, nodea->dirty, brtnode_size(nodea));
if (r!=0) return r;
r=cachetable_unpin(brt->cf, nodeb->thisnodename, nodeb->dirty, brtnode_size(nodeb));
......@@ -1274,7 +1494,7 @@ int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKE
return 0;
}
int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
static int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
void *node_v;
BRTNODE node;
CACHEKEY *rootp;
......@@ -1284,11 +1504,11 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
DBT splitk;
int debug = brt_debug_mode;//strcmp(key,"hello387")==0;
//assert(0==cachetable_assert_all_unpinned(brt->cachetable));
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: unpin_brt_header(brt); }
if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
if (0) { died0: toku_unpin_brt_header(brt); }
return r;
}
rootp = calculate_root_offset_pointer(brt);
rootp = toku_calculate_root_offset_pointer(brt);
if (debug) printf("%s:%d Getting %lld\n", __FILE__, __LINE__, *rootp);
if ((r=cachetable_get_and_pin(brt->cf, *rootp, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize))) {
......@@ -1324,7 +1544,7 @@ int brt_root_put_cmd(BRT brt, BRT_CMD *cmd, TOKUTXN txn) {
size = brtnode_size(node);
}
cachetable_unpin(brt->cf, *rootp, dirty, size);
r = unpin_brt_header(brt);
r = toku_unpin_brt_header(brt);
assert(r == 0);
//assert(0==cachetable_assert_all_unpinned(brt->cachetable));
return result;
......@@ -1342,7 +1562,7 @@ int brt_insert (BRT brt, DBT *key, DBT *val, DB* db, TOKUTXN txn) {
return r;
}
int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE parent_brtnode) {
int brt_lookup_node (BRT brt, DISKOFF off, DBT *k, DBT *v, DB *db, BRTNODE parent_brtnode) {
int result;
void *node_v;
int r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
......@@ -1361,6 +1581,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
if (node->height==0) {
result = pma_lookup(node->u.l.buffer, k, v, db);
//printf("%s:%d looked up something, got answerlen=%d\n", __FILE__, __LINE__, answerlen);
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0);
return result;
......@@ -1383,6 +1604,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
assert(0);
result = -1; // some versions of gcc complain
}
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0);
return result;
......@@ -1390,6 +1612,7 @@ int brt_lookup_node (BRT brt, diskoff off, DBT *k, DBT *v, DB *db, BRTNODE paren
}
result = brt_lookup_node(brt, node->u.n.children[childnum], k, v, db, node);
//verify_local_fingerprint_nonleaf(node);
r = cachetable_unpin(brt->cf, off, 0, 0);
assert(r == 0);
return result;
......@@ -1400,20 +1623,20 @@ int brt_lookup (BRT brt, DBT *k, DBT *v, DB *db) {
int r;
CACHEKEY *rootp;
assert(0==cachefile_count_pinned(brt->cf, 1));
if ((r = read_and_pin_brt_header(brt->cf, &brt->h))) {
if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
printf("%s:%d\n", __FILE__, __LINE__);
if (0) { died0: unpin_brt_header(brt); }
if (0) { died0: toku_unpin_brt_header(brt); }
// printf("%s:%d returning %d\n", __FILE__, __LINE__, r);
assert(0==cachefile_count_pinned(brt->cf, 1));
return r;
}
rootp = calculate_root_offset_pointer(brt);
rootp = toku_calculate_root_offset_pointer(brt);
if ((r = brt_lookup_node(brt, *rootp, k, v, db, 0))) {
// printf("%s:%d\n", __FILE__, __LINE__);
goto died0;
}
//printf("%s:%d r=%d", __FILE__, __LINE__, r); if (r==0) printf(" vallen=%d", *vallen); printf("\n");
if ((r = unpin_brt_header(brt))!=0) return r;
if ((r = toku_unpin_brt_header(brt))!=0) return r;
assert(0==cachefile_count_pinned(brt->cf, 1));
return 0;
}
......@@ -1433,9 +1656,9 @@ int brt_delete(BRT brt, DBT *key, DB *db) {
return r;
}
int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode);
int verify_brtnode (BRT brt, DISKOFF off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode);
int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, BRTNODE parent_brtnode) {
int dump_brtnode (BRT brt, DISKOFF off, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, BRTNODE parent_brtnode) {
int result=0;
BRTNODE node;
void *node_v;
......@@ -1491,18 +1714,18 @@ int dump_brtnode (BRT brt, diskoff off, int depth, bytevec lorange, ITEMLEN lole
/* Print the global split_count and then dump the whole tree, starting at the
 * root selected by toku_calculate_root_offset_pointer().
 * The header is pinned for the duration of the dump.  If dump_brtnode()
 * fails, the header is unpinned (that result is ignored) and the dump error
 * is returned; otherwise the result of unpinning the header is returned.
 * Returns 0 on success. */
int dump_brt (BRT brt) {
    int r;
    CACHEKEY *rootp;
    if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
        return r;
    }
    rootp = toku_calculate_root_offset_pointer(brt);
    printf("split_count=%d\n", split_count);
    if ((r = dump_brtnode(brt, *rootp, 0, 0, 0, 0, 0, null_brtnode))) {
        /* Keep the dump error; the unpin result is deliberately dropped. */
        toku_unpin_brt_header(brt);
        return r;
    }
    if ((r = toku_unpin_brt_header(brt))!=0) return r;
    return 0;
}
int show_brtnode_blocknumbers (BRT brt, diskoff off, BRTNODE parent_brtnode) {
int show_brtnode_blocknumbers (BRT brt, DISKOFF off, BRTNODE parent_brtnode) {
BRTNODE node;
void *node_v;
int i,r;
......@@ -1528,93 +1751,15 @@ int show_brtnode_blocknumbers (BRT brt, diskoff off, BRTNODE parent_brtnode) {
/* Print the block numbers reachable from this BRT's root on one line.
 * The header is pinned while show_brtnode_blocknumbers() walks the tree.  If
 * the walk fails, the header is unpinned (that result is ignored) and the
 * walk's error is returned before the trailing newline is printed.
 * Returns 0 on success, or the nonzero code from pinning, walking or
 * unpinning. */
int show_brt_blocknumbers (BRT brt) {
    int r;
    CACHEKEY *rootp;
    if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
        return r;
    }
    rootp = toku_calculate_root_offset_pointer(brt);
    printf("BRT %p has blocks:", brt);
    if ((r=show_brtnode_blocknumbers (brt, *rootp, 0))) {
        /* Keep the walk error; the unpin result is deliberately dropped. */
        toku_unpin_brt_header(brt);
        return r;
    }
    printf("\n");
    if ((r = toku_unpin_brt_header(brt))!=0) return r;
    return 0;
}
/* Check the key-ordering invariants of the node stored at OFF.
 * [lorange,lolen] and [hirange,hilen] bound the keys allowed in this subtree;
 * a null lorange/hirange disables the corresponding check.
 * When RECURSE is nonzero the children are verified as well, each with the
 * range narrowed by the neighbouring pivot keys.
 * Returns the error from pinning or unpinning the node if either fails;
 * otherwise returns RESULT, which is 0 when nothing was flagged and nonzero
 * when a buffered key exceeded its upper bound or a recursive call returned
 * nonzero.  Lower-bound and pivot violations trip assert() instead. */
int verify_brtnode (BRT brt, diskoff off, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen, int recurse, BRTNODE parent_brtnode) {
int result=0;
BRTNODE node;
void *node_v;
int r;
if ((r = cachetable_get_and_pin(brt->cf, off, &node_v, NULL,
brtnode_flush_callback, brtnode_fetch_callback, (void*)(long)brt->h->nodesize)))
return r;
//printf("%s:%d pin %p\n", __FILE__, __LINE__, node_v);
node=node_v;
/* Side effect: the pinned node's parent pointer is overwritten. */
node->parent_brtnode = parent_brtnode;
/* Only nodes with height>0 are examined; nothing is checked for height 0. */
if (node->height>0) {
int i;
/* First pass: check the keys buffered in each child's hashtable against
 * the range that child is allowed to hold.
 * NOTE(review): the bound is n_children-1, so the last child's buffer is
 * never visited and the i+1>=n_children branch below cannot be taken --
 * confirm whether the bound was meant to be n_children. */
for (i=0; i< node->u.n.n_children-1; i++) {
bytevec thislorange,thishirange;
ITEMLEN thislolen, thishilen;
/* Lower bound for child i: the inherited bound for the first child,
 * otherwise the pivot key to its left. */
if (node->u.n.n_children==0 || i==0) {
thislorange=lorange;
thislolen =lolen;
} else {
thislorange=node->u.n.childkeys[i-1];
thislolen =node->u.n.childkeylens[i-1];
}
/* Upper bound for child i: the inherited bound for the last child,
 * otherwise the pivot key to its right. */
if (node->u.n.n_children==0 || i+1>=node->u.n.n_children) {
thishirange=hirange;
thishilen =hilen;
} else {
thishirange=node->u.n.childkeys[i];
thishilen =node->u.n.childkeylens[i];
}
{
/* GCC nested function: it reads the bounds computed above and writes
 * `result` in the enclosing frame.  A key not strictly above the lower
 * bound asserts; a key above the upper bound is reported and flagged. */
void verify_pair (bytevec key, unsigned int keylen,
bytevec data __attribute__((__unused__)),
unsigned int datalen __attribute__((__unused__)),
int type __attribute__((__unused__)),
void *ignore __attribute__((__unused__))) {
if (thislorange) assert(keycompare(thislorange,thislolen,key,keylen)<0);
if (thishirange && keycompare(key,keylen,thishirange,thishilen)>0) {
printf("%s:%d in buffer %d key %s is bigger than %s\n", __FILE__, __LINE__, i, (char*)key, (char*)thishirange);
result=1;
}
}
toku_hashtable_iterate(node->u.n.htables[i], verify_pair, 0);
}
}
/* Second pass: every pivot key must lie within the inherited range, and
 * each child is verified recursively when requested. */
for (i=0; i<node->u.n.n_children; i++) {
if (i>0) {
if (lorange) assert(keycompare(lorange,lolen, node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1])<0);
if (hirange) assert(keycompare(node->u.n.childkeys[i-1], node->u.n.childkeylens[i-1], hirange, hilen)<=0);
}
if (recurse) {
/* The child's return value is OR-ed in, so a pin/unpin error from below
 * is folded into the same nonzero result as a verification failure. */
result|=verify_brtnode(brt, node->u.n.children[i],
(i==0) ? lorange : node->u.n.childkeys[i-1],
(i==0) ? lolen : node->u.n.childkeylens[i-1],
(i==node->u.n.n_children-1) ? hirange : node->u.n.childkeys[i],
(i==node->u.n.n_children-1) ? hilen : node->u.n.childkeylens[i],
recurse,
node);
}
}
}
/* The node is unpinned with 0 for both trailing arguments. */
if ((r = cachetable_unpin(brt->cf, off, 0, 0))) return r;
return result;
}
/* Verify the whole tree: pin the header, run verify_brtnode() recursively
 * from the root with no key bounds, then unpin the header exactly once.
 * If verification returns nonzero, the header is unpinned (that result is
 * ignored) and the verification result is returned.
 * Returns 0 when the tree verifies and the header unpins cleanly. */
int verify_brt (BRT brt) {
    int r;
    CACHEKEY *rootp;
    if ((r = toku_read_and_pin_brt_header(brt->cf, &brt->h))) {
        return r;
    }
    rootp = toku_calculate_root_offset_pointer(brt);
    if ((r=verify_brtnode(brt, *rootp, 0, 0, 0, 0, 1, null_brtnode))) {
        /* Keep the verification result; the unpin result is dropped. */
        toku_unpin_brt_header(brt);
        return r;
    }
    if ((r = toku_unpin_brt_header(brt))!=0) return r;
    return 0;
}
......@@ -1658,7 +1803,7 @@ void brt_flush_child(BRT t, BRTNODE node, int childnum, BRT_CURSOR cursor, void
if (0) printf("child_did_split %lld %lld\n", childa->thisnodename, childb->thisnodename);
if (i == 0) {
CACHEKEY *rootp = calculate_root_offset_pointer(t);
CACHEKEY *rootp = toku_calculate_root_offset_pointer(t);
r = brt_init_new_root(t, childa, childb, child_splitk, rootp);
assert(r == 0);
r = cachetable_unpin(t->cf, *rootp, CACHETABLE_DIRTY, 0);
......@@ -1802,6 +1947,7 @@ void brt_cursor_leaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE le
if (0) printf("brt_cursor_leaf_split %p oldnode %lld newnode %lld\n", cursor,
oldnode->thisnodename, newnode->thisnodename);
//verify_local_fingerprint_nonleaf(oldnode);
r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode));
assert(r == 0);
r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v);
......@@ -1877,6 +2023,7 @@ void brt_cursor_nonleaf_split(BRT_CURSOR cursor, BRT t, BRTNODE oldnode, BRTNODE
if (0) printf("brt_cursor_nonleaf_split %p oldnode %lld newnode %lld\n",
cursor, oldnode->thisnodename, newnode->thisnodename);
// The oldnode is probably dead. But we say it is dirty? ???
r = cachetable_unpin(t->cf, oldnode->thisnodename, oldnode->dirty, brtnode_size(oldnode));
assert(r == 0);
r = cachetable_maybe_get_and_pin(t->cf, newnode->thisnodename, &v);
......@@ -1949,7 +2096,7 @@ void brt_cursor_print(BRT_CURSOR cursor) {
printf("\n");
}
int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
int brtcurs_set_position_last (BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt=cursor->brt;
void *node_v;
......@@ -1973,7 +2120,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db,
cursor->pathcnum[cursor->path_len-1] = childnum;
brt_node_add_cursor(node, childnum, cursor);
if (node->u.n.n_bytes_in_hashtable[childnum] > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
/*
* the flush may have been partially successful. it may have also
* changed the tree such that the current node has expanded or been
......@@ -2011,7 +2158,7 @@ int brtcurs_set_position_last (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db,
}
}
int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
int brtcurs_set_position_first (BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt=cursor->brt;
void *node_v;
......@@ -2035,7 +2182,7 @@ int brtcurs_set_position_first (BRT_CURSOR cursor, diskoff off, DBT *key, DB *db
cursor->pathcnum[cursor->path_len-1] = childnum;
brt_node_add_cursor(node, childnum, cursor);
if (node->u.n.n_bytes_in_hashtable[childnum] > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
/*
* the flush may have been partially successful. it may have also
* changed the tree such that the current node has expanded or been
......@@ -2086,6 +2233,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (brt_cursor_path_empty(cursor))
......@@ -2105,7 +2253,7 @@ int brtcurs_set_position_next2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
more = node->u.n.n_bytes_in_hashtable[childnum];
if (more == 0)
break;
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
}
......@@ -2146,6 +2294,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(cursor->brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (brt_cursor_path_empty(cursor))
......@@ -2165,7 +2314,7 @@ int brtcurs_set_position_prev2(BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
more = node->u.n.n_bytes_in_hashtable[childnum];
if (more == 0)
break;
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
}
......@@ -2192,7 +2341,7 @@ int brtcurs_set_position_prev (BRT_CURSOR cursor, DBT *key, DB *db, TOKUTXN txn)
return 0;
}
int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
int brtcurs_set_key(BRT_CURSOR cursor, DISKOFF off, DBT *key, DBT *val, int flag, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt = cursor->brt;
void *node_v;
int r;
......@@ -2214,7 +2363,7 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag
brt_node_add_cursor(node, childnum, cursor);
int more = node->u.n.n_bytes_in_hashtable[childnum];
if (more > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
brt_node_remove_cursor(node, childnum, cursor);
......@@ -2248,12 +2397,13 @@ int brtcurs_set_key(BRT_CURSOR cursor, diskoff off, DBT *key, DBT *val, int flag
if (r != 0) {
cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node));
}
return r;
}
int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
int brtcurs_set_range(BRT_CURSOR cursor, DISKOFF off, DBT *key, DB *db, TOKUTXN txn, BRTNODE parent_brtnode) {
BRT brt = cursor->brt;
void *node_v;
int r;
......@@ -2277,7 +2427,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN
brt_node_add_cursor(node, childnum, cursor);
int more = node->u.n.n_bytes_in_hashtable[childnum];
if (more > 0) {
brt_flush_child(cursor->brt, node, childnum, cursor, key->app_private, db, txn);
brt_flush_child(cursor->brt, node, childnum, cursor, dbt_get_app_private(key), db, txn);
node = cursor->path[cursor->path_len-1];
childnum = cursor->pathcnum[cursor->path_len-1];
brt_node_remove_cursor(node, childnum, cursor);
......@@ -2311,6 +2461,7 @@ int brtcurs_set_range(BRT_CURSOR cursor, diskoff off, DBT *key, DB *db, TOKUTXN
if (r != 0) {
cursor->path_len -= 1;
//verify_local_fingerprint_nonleaf(node);
cachetable_unpin(brt->cf, off, node->dirty, brtnode_size(node));
}
return r;
......@@ -2323,6 +2474,7 @@ static int unpin_cursor (BRT_CURSOR cursor) {
for (i=0; i<cursor->path_len; i++) {
BRTNODE node = cursor->path[i];
brt_node_remove_cursor(node, cursor->pathcnum[i], cursor);
//verify_local_fingerprint_nonleaf(node);
int r2 = cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
if (r==0) r=r2;
}
......@@ -2359,11 +2511,11 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO
//dump_brt(cursor->brt);
//fprintf(stderr, "%s:%d in brt_c_get(...)\n", __FILE__, __LINE__);
if ((r = read_and_pin_brt_header(cursor->brt->cf, &cursor->brt->h))) {
if (0) { died0: unpin_brt_header(cursor->brt); }
if ((r = toku_read_and_pin_brt_header(cursor->brt->cf, &cursor->brt->h))) {
if (0) { died0: toku_unpin_brt_header(cursor->brt); }
return r;
}
rootp = calculate_root_offset_pointer(cursor->brt);
rootp = toku_calculate_root_offset_pointer(cursor->brt);
if (flags&DB_RMW) {
do_rmw=1;
flags &= ~DB_RMW;
......@@ -2427,7 +2579,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int flags, DB *db, TO
abort();
}
//printf("%s:%d unpinning header\n", __FILE__, __LINE__);
if ((r = unpin_brt_header(cursor->brt))!=0) return r;
if ((r = toku_unpin_brt_header(cursor->brt))!=0) return r;
return 0;
}
......
......@@ -11,8 +11,15 @@
#include "log.h"
typedef struct brt *BRT;
int open_brt (const char *fname, const char *dbname, int is_create, BRT *, int nodesize, CACHETABLE, int(*)(DB*,const DBT*,const DBT*));
//int brt_create (BRT **, int nodesize, int n_nodes_in_cache); /* the nodesize and n_nodes in cache really should be separately configured. */
//int brt_open (BRT *, char *fname, char *dbname);
int brt_create(BRT *);
int brt_set_flags(BRT, int flags);
int brt_set_nodesize(BRT, int nodesize);
int brt_set_bt_compare(BRT, int (*bt_compare)(DB *, const DBT*, const DBT*));
int brt_set_dup_compare(BRT, int (*dup_compare)(DB *, const DBT*, const DBT*));
int brt_set_cachetable(BRT, CACHETABLE);
int brt_open(BRT, const char *fname, const char *dbname, int is_create, CACHETABLE ct);
int brt_insert (BRT, DBT *, DBT *, DB*, TOKUTXN);
int brt_lookup (BRT brt, DBT *k, DBT *v, DB*db);
int brt_delete (BRT brt, DBT *k, DB *db);
......@@ -22,12 +29,11 @@ void brt_fsync (BRT); /* fsync, but don't clear the caches. */
void brt_flush (BRT); /* fsync and clear the caches. */
int brt_create_cachetable (CACHETABLE *t, int n_cachlines /* Pass 0 if you want the default. */);
/* create and initialize a cache table
hashsize is the initialize size of the lookup table
cachesize is the upper limit on the size of the size of the values in the table */
int brt_create_cachetable_size (CACHETABLE *t, int hashsize, long cachesize);
cachesize is the upper limit on the size of the values in the table
pass 0 if you want the default */
int brt_create_cachetable(CACHETABLE *t, long cachesize, LSN initial_lsn, TOKULOGGER);
extern int brt_debug_mode;
int verify_brt (BRT brt);
......@@ -40,4 +46,7 @@ int brt_cursor_get (BRT_CURSOR cursor, DBT *kbt, DBT *vbt, int brtc_flags, DB *d
int brt_cursor_delete(BRT_CURSOR cursor, int flags);
int brt_cursor_close (BRT_CURSOR curs);
typedef struct brtenv *BRTENV;
int brtenv_checkpoint (BRTENV env);
#endif
/* Basic scalar and handle types, wrapped in an include guard. */
#ifndef BRTTYPES_H
#define BRTTYPES_H
#include <sys/types.h>
/* NOTE(review): these feature-test macros are defined after <sys/types.h> has
 * already been included; such macros normally have to precede the first
 * system header to take effect -- confirm whether they should move up. */
#define _XOPEN_SOURCE 500
#define _FILE_OFFSET_BITS 64
typedef unsigned int ITEMLEN;
/* Read-only pointer to untyped bytes. */
typedef const void *bytevec;
//typedef const void *bytevec;
/* Both spellings of the disk-offset type are declared as long long. */
typedef long long diskoff; /* Offset in a disk. -1 is the NULL pointer. */
typedef long long DISKOFF; /* Offset in a disk. -1 is the NULL pointer. */
typedef long long TXNID;
/* Make the LSN be a struct instead of an integer so that we get better type checking. */
typedef struct __toku_lsn { u_int64_t lsn; } LSN;
/* Compound literal for an LSN whose lsn field is 0. */
#define ZERO_LSN ((LSN){0})
/* Make the FILEID a struct for the same reason. */
typedef struct __toku_fileid { u_int32_t fileid; } FILENUM;
typedef enum __toku_bool { FALSE=0, TRUE=1} BOOL;
/* Handle to struct tokulogger, which is not defined in this header. */
typedef struct tokulogger *TOKULOGGER;
/* Null TOKULOGGER handle. */
#define NULL_LOGGER ((TOKULOGGER)0)
/* Handle to struct tokutxn, which is not defined in this header. */
typedef struct tokutxn *TOKUTXN;
#endif
/* -*- mode: C; c-basic-offset: 4 -*- */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
......@@ -28,7 +30,7 @@ static void expectN(CACHEKEY key) {
CACHEFILE expect_f;
static void flush (CACHEFILE f, CACHEKEY key, void*value, long size __attribute__((__unused__)), int write_me __attribute__((__unused__)), int keep_mee __attribute__((__unused__))) {
static void flush (CACHEFILE f, CACHEKEY key, void*value, long size __attribute__((__unused__)), BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)), LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
struct item *it = value;
int i;
......@@ -60,12 +62,13 @@ struct item *make_item (CACHEKEY key) {
}
CACHEKEY did_fetch=-1;
int fetch (CACHEFILE f, CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
int fetch (CACHEFILE f, CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
printf("Fetch %lld\n", key);
assert (expect_f==f);
assert((long)extraargs==23);
*value = make_item(key);
did_fetch=key;
written_lsn->lsn = 0;
return 0;
}
......@@ -76,7 +79,7 @@ void test0 (void) {
CACHEFILE f;
int r;
char fname[] = "test.dat";
r=create_cachetable(&t, 5, 5);
r=create_cachetable(&t, 5, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
......@@ -177,15 +180,17 @@ void test0 (void) {
}
static void flush_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)), void *value,
long size __attribute__((__unused__)), int write_me __attribute__((__unused__)),
int keep_me __attribute__((__unused__))) {
long size __attribute__((__unused__)),
BOOL write_me __attribute__((__unused__)), BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute ((__unused__))) {
int *v = value;
assert(*v==0);
}
static int fetch_n (CACHEFILE f __attribute__((__unused__)), CACHEKEY key __attribute__((__unused__)),
void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==42);
*value=0;
written_lsn->lsn = 0;
return 0;
}
......@@ -198,7 +203,7 @@ void test_nested_pin (void) {
int r;
void *vv;
char fname[] = "test_ct.dat";
r = create_cachetable(&t, 1, 1);
r = create_cachetable(&t, 1, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
......@@ -227,21 +232,25 @@ void test_nested_pin (void) {
}
void null_flush (CACHEFILE cf __attribute__((__unused__)),
CACHEKEY k __attribute__((__unused__)),
void *v __attribute__((__unused__)),
long size __attribute__((__unused__)),
int write_me __attribute__((__unused__)),
int keep_me __attribute__((__unused__))) {
void null_flush (CACHEFILE cf __attribute__((__unused__)),
CACHEKEY k __attribute__((__unused__)),
void *v __attribute__((__unused__)),
long size __attribute__((__unused__)),
BOOL write_me __attribute__((__unused__)),
BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)),
BOOL rename_p __attribute__((__unused__))) {
}
int add123_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
int add123_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==123);
*value = (void*)((unsigned long)key+123L);
written_lsn->lsn = 0;
return 0;
}
int add222_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs) {
int add222_fetch (CACHEFILE cf __attribute__((__unused__)), CACHEKEY key, void **value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==222);
*value = (void*)((unsigned long)key+222L);
written_lsn->lsn = 0;
return 0;
}
......@@ -257,7 +266,7 @@ void test_multi_filehandles (void) {
unlink(fname1);
unlink(fname2);
r = create_cachetable(&t, 4, 4); assert(r==0);
r = create_cachetable(&t, 4, ZERO_LSN, NULL_LOGGER); assert(r==0);
r = cachetable_openf(&f1, t, fname1, O_RDWR|O_CREAT, 0777); assert(r==0);
r = link(fname1, fname2); assert(r==0);
r = cachetable_openf(&f2, t, fname2, O_RDWR|O_CREAT, 0777); assert(r==0);
......@@ -282,12 +291,13 @@ void test_multi_filehandles (void) {
r = cachetable_close(&t); assert(r==0);
}
void test_dirty_flush(CACHEFILE f, CACHEKEY key, void *value, long size, int write, int keep) {
printf("test_dirty_flush %p %lld %p %ld %d %d\n", f, key, value, size, write, keep);
void test_dirty_flush(CACHEFILE f, CACHEKEY key, void *value, long size, BOOL do_write, BOOL keep, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
printf("test_dirty_flush %p %lld %p %ld %d %d\n", f, key, value, size, do_write, keep);
}
int test_dirty_fetch(CACHEFILE f, CACHEKEY key, void **value_ptr, long *size_ptr, void *arg) {
int test_dirty_fetch(CACHEFILE f, CACHEKEY key, void **value_ptr, long *size_ptr, void *arg, LSN *written_lsn) {
*value_ptr = arg;
written_lsn->lsn = 0;
printf("test_dirty_fetch %p %lld %p %ld %p\n", f, key, *value_ptr, *size_ptr, arg);
return 0;
}
......@@ -301,7 +311,7 @@ void test_dirty() {
int dirty; long long pinned; long entry_size;
int r;
r = create_cachetable(&t, 4, 4);
r = create_cachetable(&t, 4, ZERO_LSN, NULL_LOGGER);
assert(r == 0);
char *fname = "test.dat";
......@@ -393,8 +403,8 @@ void test_dirty() {
int test_size_debug;
CACHEKEY test_size_flush_key;
void test_size_flush_callback(CACHEFILE f, CACHEKEY key, void *value, long size, int write, int keep) {
if (test_size_debug) printf("test_size_flush %p %lld %p %ld %d %d\n", f, key, value, size, write, keep);
void test_size_flush_callback(CACHEFILE f, CACHEKEY key, void *value, long size, BOOL do_write, BOOL keep, LSN modified_lsn __attribute__((__unused__)), BOOL rename_p __attribute__((__unused__))) {
if (test_size_debug) printf("test_size_flush %p %lld %p %ld %d %d\n", f, key, value, size, do_write, keep);
assert(write != 0);
test_size_flush_key = key;
}
......@@ -409,7 +419,7 @@ void test_size_resize() {
int n = 3;
long size = 1;
r = create_cachetable(&t, n, n*size);
r = create_cachetable(&t, n*size, ZERO_LSN, NULL_LOGGER);
assert(r == 0);
char *fname = "test.dat";
......@@ -460,7 +470,7 @@ void test_size_flush() {
const int n = 8;
long long size = 1*1024*1024;
r = create_cachetable(&t, 3, n*size);
r = create_cachetable(&t, n*size, ZERO_LSN, NULL_LOGGER);
assert(r == 0);
char *fname = "test.dat";
......@@ -509,7 +519,99 @@ void test_size_flush() {
assert(r == 0);
}
enum { KEYLIMIT = 4, TRIALLIMIT=64 };
CACHEKEY keys[KEYLIMIT];
void* vals[KEYLIMIT];
int n_keys=0;
/* Flush callback for test_rename.
 * Looks K up in the keys[]/vals[] mirror.  A matching slot must hold VALUE;
 * when keep_me is false that slot is overwritten with the last tracked entry,
 * n_keys shrinks by one, and the callback returns.
 * If the scan finishes without returning -- no slot matched, or the only
 * matches were flushed with keep_me set -- "Whoops" is printed and the
 * process aborts.
 * NOTE(review): confirm that aborting on a keep_me flush is intended. */
static void r_flush (CACHEFILE f __attribute__((__unused__)),
                     CACHEKEY k, void *value,
                     long size __attribute__((__unused__)),
                     BOOL write_me __attribute__((__unused__)),
                     BOOL keep_me,
                     LSN modified_lsn __attribute__((__unused__)),
                     BOOL rename_p __attribute__((__unused__))) {
    int slot;
    //printf("Flush\n");
    for (slot = 0; slot < n_keys; slot++) {
        if (keys[slot] != k) continue;
        assert(vals[slot] == value);
        if (keep_me) continue;
        /* Drop the slot by moving the last tracked entry into it. */
        n_keys--;
        keys[slot] = keys[n_keys];
        vals[slot] = vals[n_keys];
        return;
    }
    fprintf(stderr, "Whoops\n");
    abort();
}
/* Fetch callback for test_rename.  The test does not expect it to run, so
 * it only writes a complaint to stderr; every argument is ignored and none
 * of the output parameters is written.  Always returns 0. */
int r_fetch (CACHEFILE f __attribute__((__unused__)),
             CACHEKEY key __attribute__((__unused__)),
             void**value __attribute__((__unused__)),
             long *sizep __attribute__((__unused__)),
             void*extraargs __attribute__((__unused__)),
             LSN *modified_lsn __attribute__((__unused__))) {
    fputs("Whoops, this should never be called", stderr);
    return 0;
}
/* Randomized exercise of cachetable_rename().
 * Runs TRIALLIMIT trials against a cachetable created with KEYLIMIT as its
 * size argument.  Two trials out of three insert a fresh random key/value
 * pair; the remaining ones rename a randomly chosen tracked key (when any
 * are tracked).  keys[]/vals[]/n_keys mirror what the test believes is
 * cached, and r_flush() removes entries from that mirror as they are flushed
 * without keep_me, so once the file and the table are closed the mirror must
 * be empty.  Every cachetable call is expected to return 0. */
void test_rename (void) {
    CACHETABLE t;
    CACHEFILE f;
    int i;
    int r;
    const char fname[] = "ct-test-rename.dat";
    r=create_cachetable(&t, KEYLIMIT, ZERO_LSN, NULL_LOGGER); assert(r==0);
    unlink(fname);
    r = cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, 0777);
    assert(r==0);
    for (i=0; i<TRIALLIMIT; i++) {
        int ra = random()%3;
        if (ra<=1) {
            // Insert something
            CACHEKEY nkey = random();
            long nval = random();
            //printf("n_keys=%d Insert %08llx\n", n_keys, nkey);
            r = cachetable_put(f, nkey, (void*)nval, 1,
                               r_flush, r_fetch, 0);
            assert(r==0);
            /* Checked after the put, once any flush it triggered has run. */
            assert(n_keys<KEYLIMIT);
            keys[n_keys] = nkey;
            vals[n_keys] = (void*)nval;
            n_keys++;
            r = cachetable_unpin(f, nkey, CACHETABLE_DIRTY, 1);
            assert(r==0);
        } else if (ra==2 && n_keys>0) {
            // Rename something
            int objnum = random()%n_keys;
            CACHEKEY okey = keys[objnum];
            CACHEKEY nkey = random();
            void *current_value;
            long current_size;
            /* The mirror is updated first, so it already holds the new name
             * when the cachetable calls below run. */
            keys[objnum]=nkey;
            //printf("Rename %llx to %llx\n", okey, nkey);
            r = cachetable_get_and_pin(f, okey, &current_value, &current_size, r_flush, r_fetch, 0);
            assert(r==0);
            r = cachetable_rename(f, okey, nkey);
            assert(r==0);
            r = cachetable_unpin(f, nkey, CACHETABLE_DIRTY, 1);
            /* The unpin under the new key must succeed like every other call. */
            assert(r==0);
        }
    }
    r = cachefile_close(&f);
    assert(r == 0);
    r = cachetable_close(&t);
    assert(r == 0);
    assert(n_keys == 0);
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
test_rename();
test0();
test_nested_pin();
test_multi_filehandles ();
......
......@@ -58,7 +58,14 @@ static void file_is_not_present(CACHEFILE cf) {
}
static void flush_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void *value, long size __attribute__((__unused__)), int write_me __attribute__((__unused__)), int keep_me __attribute__((__unused__))) {
static void flush_forchain (CACHEFILE f __attribute__((__unused__)),
CACHEKEY key,
void *value,
long size __attribute__((__unused__)),
BOOL write_me __attribute__((__unused__)),
BOOL keep_me __attribute__((__unused__)),
LSN modified_lsn __attribute__((__unused__)),
BOOL rename_p __attribute__((__unused__))) {
int *v = value;
//cachetable_print_state(ct);
//printf("Flush %lld %d\n", key, (int)value);
......@@ -67,9 +74,10 @@ static void flush_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY ke
//print_ints();
}
static int fetch_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs) {
static int fetch_forchain (CACHEFILE f __attribute__((__unused__)), CACHEKEY key, void**value, long *sizep __attribute__((__unused__)), void*extraargs, LSN *written_lsn) {
assert((long)extraargs==(long)key);
*value = (void*)(long)key;
written_lsn->lsn = 0;
return 0;
}
......@@ -93,9 +101,9 @@ void test_chaining (void) {
char fname[N_FILES][FILENAME_LEN];
int r;
long i, trial;
r = create_cachetable(&ct, N_PRESENT_LIMIT, N_PRESENT_LIMIT); assert(r==0);
r = create_cachetable(&ct, N_PRESENT_LIMIT, ZERO_LSN, NULL_LOGGER); assert(r==0);
for (i=0; i<N_FILES; i++) {
int r = snprintf(fname[i], FILENAME_LEN, "cachetabletest2.%ld.dat", i);
r = snprintf(fname[i], FILENAME_LEN, "cachetabletest2.%ld.dat", i);
assert(r>0 && r<FILENAME_LEN);
unlink(fname[i]);
r = cachetable_openf(&f[i], ct, fname[i], O_RDWR|O_CREAT, 0777); assert(r==0);
......
......@@ -29,12 +29,15 @@ struct ctpair {
PAIR next,prev; // In LRU list.
PAIR hash_chain;
CACHEFILE cachefile;
cachetable_flush_func_t flush_callback;
cachetable_fetch_func_t fetch_callback;
void*extraargs;
CACHETABLE_FLUSH_FUNC_T flush_callback;
CACHETABLE_FETCH_FUNC_T fetch_callback;
void *extraargs;
int verify_flag; /* Used in verify_cachetable() */
LSN modified_lsn; // What was the LSN when modified (undefined if not dirty)
LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch)
};
// The cachetable is as close to an ENV as we get.
struct cachetable {
enum typ_tag tag;
int n_in_table;
......@@ -44,6 +47,8 @@ struct cachetable {
CACHEFILE cachefiles;
long size_current, size_limit;
int primeidx;
LSN lsn_of_checkpoint; // the most recent checkpoint in the log.
TOKULOGGER logger;
};
struct fileid {
......@@ -57,9 +62,10 @@ struct cachefile {
int fd; /* Bug: If a file is opened read-only, then it is stuck in read-only. If it is opened read-write, then subsequent writers can write to it too. */
CACHETABLE cachetable;
struct fileid fileid;
FILENUM filenum;
};
int create_cachetable(CACHETABLE *result, int table_size __attribute__((unused)), long size_limit) {
int create_cachetable(CACHETABLE *result, long size_limit, LSN initial_lsn, TOKULOGGER logger) {
TAGMALLOC(CACHETABLE, t);
int i;
t->n_in_table = 0;
......@@ -74,6 +80,8 @@ int create_cachetable(CACHETABLE *result, int table_size __attribute__((unused))
t->cachefiles = 0;
t->size_current = 0;
t->size_limit = size_limit;
t->lsn_of_checkpoint = initial_lsn;
t->logger = logger;
*result = t;
return 0;
}
......@@ -257,13 +265,25 @@ static PAIR remove_from_hash_chain (PAIR remove_me, PAIR list) {
return list;
}
// Predicate: must this pair be renamed when it is next written?
// A node is renamed the first time it is written after a checkpoint, so the
// answer is yes exactly when all three of these hold:
//   - the pair is dirty,
//   - it was modified within the current checkpoint regime
//     (modified_lsn >= lsn_of_checkpoint, non-strict), and
//   - it was last written in a previous checkpoint regime
//     (written_lsn < lsn_of_checkpoint, strict).
static BOOL need_to_rename_p (CACHETABLE t, PAIR p) {
    const LSN checkpoint = t->lsn_of_checkpoint;
    if (!p->dirty) return 0;
    if (p->modified_lsn.lsn < checkpoint.lsn) return 0; // modified before the checkpoint
    return p->written_lsn.lsn < checkpoint.lsn;         // strict
}
static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) {
lru_remove(t, remove_me);
//printf("flush_callback(%lld,%p)\n", remove_me->key, remove_me->value);
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=%d, 0)\n", __FILE__, __LINE__, remove_me->key, remove_me->value, remove_me->dirty && write_me));
//printf("%s:%d TAG=%x p=%p\n", __FILE__, __LINE__, remove_me->tag, remove_me);
//printf("%s:%d dirty=%d\n", __FILE__, __LINE__, remove_me->dirty);
remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->size, remove_me->dirty && write_me, 0);
remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->size, remove_me->dirty && write_me, 0,
t->lsn_of_checkpoint, need_to_rename_p(t, remove_me));
t->n_in_table--;
// Remove it from the hash chain.
{
......@@ -274,14 +294,6 @@ static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) {
toku_free(remove_me);
}
static void flush_and_keep (PAIR flush_me) {
if (flush_me->dirty) {
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=1, 0)\n", __FILE__, __LINE__, flush_me->key, flush_me->value));
flush_me->flush_callback(flush_me->cachefile, flush_me->key, flush_me->value, flush_me->size, 1, 1);
flush_me->dirty=0;
}
}
static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
int r = 0;
again:
......@@ -309,7 +321,8 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void *value, long size,
cachetable_flush_func_t flush_callback,
cachetable_fetch_func_t fetch_callback,
void *extraargs, int dirty) {
void *extraargs, int dirty,
LSN written_lsn) {
TAGMALLOC(PAIR, p);
p->pinned = 1;
p->dirty = dirty;
......@@ -322,6 +335,8 @@ static int cachetable_insert_at(CACHEFILE cachefile, int h, CACHEKEY key, void *
p->flush_callback = flush_callback;
p->fetch_callback = fetch_callback;
p->extraargs = extraargs;
p->modified_lsn.lsn = 0;
p->written_lsn = written_lsn;
CACHETABLE ct = cachefile->cachetable;
lru_add_to_list(ct, p);
p->hash_chain = ct->table[h];
......@@ -352,7 +367,7 @@ int cachetable_put(CACHEFILE cachefile, CACHEKEY key, void*value, long size,
if (maybe_flush_some(cachefile->cachetable, size))
return -2;
// flushing could change the result from hashit()
int r = cachetable_insert_at(cachefile, hashit(cachefile->cachetable, key), key, value, size, flush_callback, fetch_callback, extraargs, 1);
int r = cachetable_insert_at(cachefile, hashit(cachefile->cachetable, key), key, value, size, flush_callback, fetch_callback, extraargs, 1, ZERO_LSN);
return r;
}
......@@ -377,10 +392,11 @@ int cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, void**value, long
void *toku_value;
long size = 1; // compat
int r;
LSN written_lsn;
WHEN_TRACE_CT(printf("%s:%d CT: fetch_callback(%lld...)\n", __FILE__, __LINE__, key));
if ((r=fetch_callback(cachefile, key, &toku_value, &size, extraargs)))
if ((r=fetch_callback(cachefile, key, &toku_value, &size, extraargs, &written_lsn)))
return r;
cachetable_insert_at(cachefile, hashit(t,key), key, toku_value, size, flush_callback, fetch_callback, extraargs, 0);
cachetable_insert_at(cachefile, hashit(t,key), key, toku_value, size, flush_callback, fetch_callback, extraargs, 0, written_lsn);
*value = toku_value;
if (sizep)
*sizep = size;
......@@ -428,6 +444,26 @@ int cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, int dirty, long size) {
return 0;
}
// effect: Re-key an object: the pair stored under (cachefile, oldkey) is
//   unlinked from its hash chain, given the key newkey, and pushed onto the
//   front of the hash chain for newkey.
// requires: The object is pinned in the table
// returns: 0 on success, -1 if no pair with oldkey exists for this cachefile.
int cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey) {
    CACHETABLE ct = cachefile->cachetable;
    // link always points at the pointer that refers to the pair being examined,
    // so unlinking is a single store through it.
    PAIR *link = &ct->table[hashit(ct, oldkey)];
    while (*link) {
        PAIR cur = *link;
        if (cur->key==oldkey && cur->cachefile==cachefile) {
            *link = cur->hash_chain;
            cur->key = newkey;
            int new_bucket = hashit(ct, newkey);
            cur->hash_chain = ct->table[new_bucket];
            ct->table[new_bucket] = cur;
            return 0;
        }
        link = &cur->hash_chain;
    }
    return -1;
}
int cachetable_flush (CACHETABLE t) {
int i;
for (i=0; i<t->table_size; i++) {
......@@ -559,6 +595,15 @@ int cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) {
return 0;
}
#if 0
// If the pair is dirty, invoke its flush callback with write_me=1 and keep_me=1
// (write it out but keep it cached), then mark the pair clean.
// NOTE(review): this call passes six arguments; it appears to predate the flush
// callback signature that also takes an LSN and a rename flag, so it will need
// updating before the surrounding disabled code is re-enabled -- confirm.
static void flush_and_keep (PAIR flush_me) {
if (flush_me->dirty) {
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=1, 0)\n", __FILE__, __LINE__, flush_me->key, flush_me->value));
flush_me->flush_callback(flush_me->cachefile, flush_me->key, flush_me->value, flush_me->size, 1, 1);
flush_me->dirty=0;
}
}
static int cachetable_fsync_pairs (CACHETABLE t, PAIR p) {
if (p) {
int r = cachetable_fsync_pairs(t, p->hash_chain);
......@@ -577,6 +622,7 @@ int cachetable_fsync (CACHETABLE t) {
}
return 0;
}
#endif
#if 0
int cachefile_pwrite (CACHEFILE cf, const void *buf, size_t count, off_t offset) {
......@@ -643,3 +689,54 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr,
}
return 1;
}
// Write out every dirty pair that was modified since the previous checkpoint.
// Returns 0.
int cachetable_checkpoint (CACHETABLE ct) {
    // Single threaded checkpoint.
    // In future: for multithreaded checkpoint we should not proceed if the previous checkpoint has not finished.
    // Requires: Everything is unpinned.  (In the multithreaded version we have to wait for things to get unpinned and then
    //  grab them (or else the unpinner has to do something.)
    // Algorithm: Write a checkpoint record to the log, noting the LSN of that record.
    //  Note the LSN of the previous checkpoint (stored in lsn_of_checkpoint)
    //  For every (unpinned) dirty node in which the LSN is newer than the prev checkpoint LSN:
    //      flush the node (giving it a new nodeid, and fixing up the downpointer in the parent)
    // Watch out since evicting the node modifies the hash table.
    //?? This is a skeleton.  Logging the checkpoint record and renaming nodes are not implemented yet.
    //?? log_the_checkpoint();
    int n_saved=0;
    int n_in_table = ct->n_in_table;
    struct save_something {
        CACHEFILE cf;
        DISKOFF   key;
        void     *value;
        long      size;
        LSN       modified_lsn;
        CACHETABLE_FLUSH_FUNC_T flush_callback;
    } *MALLOC_N(n_in_table, info);
    assert(info!=0 || n_in_table==0);
    {
        // Pass 1 (read-only): snapshot what is needed to flush each qualifying pair,
        // so that pass 2 does not walk the list while callbacks may be changing the table.
        PAIR pair;
        for (pair=ct->head; pair; pair=pair->next) {
            assert(!pair->pinned);
            if (pair->dirty && pair->modified_lsn.lsn>ct->lsn_of_checkpoint.lsn) {
                // Previously nothing was saved here, so pass 2 called through
                // uninitialized function pointers.
                assert(n_saved<n_in_table);
                info[n_saved].cf             = pair->cachefile;
                info[n_saved].key            = pair->key;
                info[n_saved].value          = pair->value;
                info[n_saved].size           = pair->size;
                info[n_saved].modified_lsn   = pair->modified_lsn;
                info[n_saved].flush_callback = pair->flush_callback;
                n_saved++;
            }
        }
    }
    {
        // Pass 2: write each saved pair (write_me=1), keeping it in the cache (keep_me=1), without renaming.
        int i;
        for (i=0; i<n_saved; i++) {
            info[i].flush_callback(info[i].cf, info[i].key, info[i].value, info[i].size, 1, 1, info[i].modified_lsn, 0);
        }
    }
    toku_free(info);
    return 0;
}
// Return the logger recorded in the cachetable that cf belongs to.
TOKULOGGER cachefile_logger (CACHEFILE cf) {
    CACHETABLE owner = cf->cachetable;
    return owner->logger;
}
// Return the file number stored in cf.
FILENUM cachefile_filenum (CACHEFILE cf) {
    FILENUM fnum = cf->filenum;
    return fnum;
}
......@@ -2,6 +2,7 @@
#define CACHETABLE_H
#include <fcntl.h>
#include "brttypes.h"
/* Implement the cache table. */
......@@ -22,14 +23,16 @@ typedef struct cachefile *CACHEFILE;
* table_size is the initial size of the cache table hash table (in number of entries)
* size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes)
*/
int create_cachetable(CACHETABLE */*result*/, int table_size, long size_limit);
int create_cachetable(CACHETABLE */*result*/, long size_limit, LSN initial_lsn, TOKULOGGER);
int cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode);
typedef void (*cachetable_flush_func_t)(CACHEFILE, CACHEKEY key, void*value, long size, int write_me, int keep_me);
typedef void (cachetable_flush_func_t)(CACHEFILE, CACHEKEY key, void*value, long size, BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p);
typedef cachetable_flush_func_t *CACHETABLE_FLUSH_FUNC_T;
/* If we are asked to fetch something, get it by calling this back. */
typedef int (*cachetable_fetch_func_t)(CACHEFILE, CACHEKEY key, void **value, long *sizep, void *extraargs);
typedef int (cachetable_fetch_func_t)(CACHEFILE, CACHEKEY key, void **value, long *sizep, void *extraargs, LSN *written_lsn);
typedef cachetable_fetch_func_t *CACHETABLE_FETCH_FUNC_T;
/* Error if already present. On success, pin the value. */
int cachetable_put(CACHEFILE cf, CACHEKEY key, void* value, long size,
......@@ -51,6 +54,9 @@ int cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing somet
int cachetable_assert_all_unpinned (CACHETABLE);
int cachefile_count_pinned (CACHEFILE, int /*printthem*/ );
/* Rename whatever is at oldkey to be newkey. Requires that the object be pinned. */
int cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey);
//int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */
int cachetable_close (CACHETABLE*); /* Flushes everything to disk, and destroys the cachetable. */
......@@ -63,7 +69,7 @@ int cachefile_close (CACHEFILE*);
int cachefile_fd (CACHEFILE);
// Useful for debugging
// Useful for debugging
void cachetable_print_state (CACHETABLE ct);
void cachetable_get_state(CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr);
int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr,
......@@ -72,4 +78,7 @@ int cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, void **value_ptr,
void cachefile_verify (CACHEFILE cf); // Verify the whole cachetable that the CF is in. Slow.
void cachetable_verify (CACHETABLE t); // Slow...
TOKULOGGER cachefile_logger (CACHEFILE);
FILENUM cachefile_filenum (CACHEFILE);
#endif
# Build the checksum/hash timing benchmark with make's implicit rule.
CFLAGS = -O2 -Wall -W -Werror -g
LDFLAGS = -g
# Libraries go in LDLIBS so that they follow the source file on the link line.
# MD2/MD4/MD5 are provided by libcrypto, so link it explicitly.
LDLIBS = -lz -lssl -lcrypto
adler32:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <zlib.h>
#include <openssl/md2.h>
#include <openssl/md4.h>
#include <openssl/md5.h>
/* Multiplier used by karprabin(). */
const unsigned int prime = 2000000011;

/* Polynomial hash over the N bytes at datac, read as N/4 unsigned ints in
 * the machine's own byte order:
 *     hash = hash*prime + word     (arithmetic wraps as unsigned int)
 * N must be a multiple of 4 (asserted).  The buffer is accessed through an
 * unsigned int pointer, so it must be suitably aligned for that. */
unsigned int karprabin (unsigned char *datac, int N) {
    assert(N%4==0);
    const unsigned int *words = (const unsigned int *)datac;
    const int n_words = N/4;
    unsigned int hash = 0;
    int w = 0;
    while (w < n_words) {
        hash = hash*prime + words[w];
        w++;
    }
    return hash;
}
// Multiplier and modulus taken from
//   P. L'Ecuyer, "Tables of Linear Congruential Generators of
//   Different Sizes and Good Lattice Structure", Mathematics of
//   Computation 68:225, 249--260 (1999),
// which lists m=2^{32}-5, a=1588635695 as a good pair.
const unsigned int mkr = 4294967291U;
const unsigned int akr = 1588635695U;

// Variant of karprabin() that reduces modulo the prime mkr at every step:
//     acc = (acc*akr + word) % mkr
// carried out in 64-bit arithmetic.  The original author measured it as slower.
// N must be a multiple of 4 (asserted); the buffer is read as unsigned ints.
unsigned int karprabinP (unsigned char *datac, int N) {
    assert(N%4==0);
    const unsigned int *words = (const unsigned int *)datac;
    const int n_words = N/4;
    unsigned long long acc = 0;
    for (int w = 0; w < n_words; w++) {
        acc = (acc*akr + words[w]) % mkr;
    }
    return acc;
}
/* Elapsed time from *start to *end, in seconds. */
float tdiff (struct timeval *start, struct timeval *end) {
    double whole_seconds = end->tv_sec - start->tv_sec;
    double fraction      = 1e-6*(end->tv_usec - start->tv_usec);
    return whole_seconds + fraction;
}
/* Print one "<name>=<hex digest> time=...s ...ns/b" result line for a digest run
 * that took tm seconds over nbytes bytes. */
static void print_digest_result (const char *name, const unsigned char *digest, int digest_len, float tm, int nbytes) {
    int j;
    printf("%s=", name);
    for (j=0; j<digest_len; j++) {
        printf("%02x", digest[j]);
    }
    printf(" time=%9.6fs %9.6fns/b\n", tm, 1e9*tm/nbytes);
}

/* Benchmark several checksums/hashes over the same 2 MiB buffer of random bytes.
 * Each algorithm is run three times; every run prints its result, the elapsed
 * time in seconds, and the cost in nanoseconds per byte. */
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    struct timeval start, end;
    const int N=2<<20;
    unsigned char *data=malloc(N);
    int i;
    assert(data);
    for (i=0; i<N; i++) data[i]=random();

    // adler32 (the running value is carried from one run into the next)
    {
        uLong a32 = adler32(0L, Z_NULL, 0);
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            a32 = adler32(a32, data, N);
            gettimeofday(&end, 0);
            float tm = tdiff(&start, &end);
            printf("adler32=%lu, time=%9.6fs %9.6fns/b\n", a32, tm, 1e9*tm/N);
        }
    }

    // crc32 (the running value is carried from one run into the next)
    {
        uLong c32 = crc32(0L, Z_NULL, 0);
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            c32 = crc32(c32, data, N);
            gettimeofday(&end, 0);
            float tm = tdiff(&start, &end);
            printf("crc32=%lu, time=%9.6fs %9.6fns/b\n", c32, tm, 1e9*tm/N);
        }
    }

    // MD2
    {
        unsigned char buf[MD2_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD2(data, N, buf);
            gettimeofday(&end, 0);
            print_digest_result("md2", buf, MD2_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // MD4
    {
        unsigned char buf[MD4_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD4(data, N, buf);
            gettimeofday(&end, 0);
            print_digest_result("md4", buf, MD4_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // MD5
    {
        unsigned char buf[MD5_DIGEST_LENGTH];
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            MD5(data, N, buf);
            gettimeofday(&end, 0);
            print_digest_result("md5", buf, MD5_DIGEST_LENGTH, tdiff(&start, &end), N);
        }
    }

    // karp rabin
    {
        for (i=0; i<3; i++) {
            gettimeofday(&start, 0);
            unsigned int kr = karprabin(data, N);
            gettimeofday(&end, 0);
            float tm = tdiff(&start, &end);
            // Was "%ud", which prints the value followed by a stray letter 'd'.
            printf("kr=%u time=%9.6fs %9.6fns/b\n", kr, tm, 1e9*tm/N);
        }
    }
    free(data);
    return 0;
}
#ifndef TOKU_CRC_H
#define TOKU_CRC_H
#include <zlib.h>
// Wrapper around zlib's crc32 that leaves the running CRC untouched for empty input.
// The original author notes that zlib's crc32 returns 0 instead of oldcrc32 when
// len==0; this wrapper returns oldcrc32 in that case and otherwise defers to crc32.
static inline u_int32_t toku_crc32 (u_int32_t oldcrc32, const void *data, u_int32_t len) {
    return (len!=0) ? crc32(oldcrc32, data, len) : oldcrc32;
}
static const u_int32_t toku_null_crc = 0;
// Don't use crc32, use toku_crc32 to avoid that bug.
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len)) __attribute__((deprecated));
#endif
#include <arpa/inet.h>
#include <assert.h>
#include "brt-internal.h"
// Extend a running CRC with one key/value pair.
// The bytes fed to the CRC are, in order: the key length (converted with htonl,
// 4 bytes), the key bytes, the value length (converted with htonl, 4 bytes),
// and the value bytes.  Returns the updated CRC.
static inline u_int32_t toku_calc_more_crc32_kvpair (u_int32_t crc, const void *key, int keylen, const void *val, int vallen) {
    int keylen_n = htonl(keylen);
    int vallen_n = htonl(vallen);
    crc = toku_crc32(crc, (void*)&keylen_n, 4);
    crc = toku_crc32(crc, key, keylen);
    crc = toku_crc32(crc, (void*)&vallen_n, 4);
    return toku_crc32(crc, val, vallen);
}
// Fingerprint of a single key/value pair, starting from the null CRC.
u_int32_t toku_calccrc32_kvpair (const void *key, int keylen, const void *val, int vallen) {
    u_int32_t seed = toku_null_crc;
    return toku_calc_more_crc32_kvpair(seed, key, keylen, val, vallen);
}
// Fingerprint of a command: the CRC covers one byte holding the command type
// (type truncated to unsigned char) followed by the key/value pair.
u_int32_t toku_calccrc32_cmd (int type, const void *key, int keylen, const void *val, int vallen) {
    unsigned char type_byte = type;
    u_int32_t crc_of_type = toku_crc32(toku_null_crc, &type_byte, 1);
    return toku_calc_more_crc32_kvpair(crc_of_type, key, keylen, val, vallen);
}
// Fingerprint of a BRT_CMD: dispatches on cmd->type and hashes the command's
// type, key and value.  All currently handled types carry their key and value
// in cmd->u.id.  An unhandled type trips the assertion.
u_int32_t toku_calccrc32_cmdstruct (BRT_CMD *cmd) {
    switch (cmd->type) {
    case BRT_NONE:
    case BRT_INSERT:
    case BRT_DELETE:
        return toku_calccrc32_cmd (cmd->type, cmd->u.id.key->data, cmd->u.id.key->size, cmd->u.id.val->data, cmd->u.id.val->size);
    }
    assert(0); /* Should not have come here. */
    return 0;  /* Only reached when assertions are compiled out; avoids falling off the end of a non-void function. */
}
#include "brttypes.h"
#include "brt-internal.h"
#include "memory.h"
#include <sys/types.h>
#include <unistd.h>
#include <assert.h>
......@@ -32,7 +30,7 @@ int write_int (int fd, unsigned int v) {
return 0;
}
int read_diskoff (int fd, diskoff *result) {
int read_diskoff (int fd, DISKOFF *result) {
unsigned int i0,i1;
int r;
r = read_uint(fd, &i0); if(r!=0) return r;
......@@ -41,7 +39,7 @@ int read_diskoff (int fd, diskoff *result) {
return 0;
}
int write_diskoff (int fd, diskoff v) {
int write_diskoff (int fd, DISKOFF v) {
int r;
r = write_int(fd, (unsigned int)(v>>32)); if (r!=0) return r;
r = write_int(fd, (unsigned int)(v&0xffffffff)); if (r!=0) return r;
......@@ -97,14 +95,14 @@ int read_brt_header (int fd, struct brt_header *header) {
return 0;
}
int read_brt_h_unused_memory (int fd, diskoff *unused_memory) {
int read_brt_h_unused_memory (int fd, DISKOFF *unused_memory) {
off_t r = lseek(fd, 12, SEEK_SET);
assert(r==12);
r = read_diskoff(fd, unused_memory);
return r;
}
int write_brt_h_unused_memory (int fd, diskoff unused_memory) {
int write_brt_h_unused_memory (int fd, DISKOFF unused_memory) {
off_t r = lseek(fd, 12, SEEK_SET);
assert(r==12);
r = write_diskoff(fd, unused_memory);
......
// Link node for a doubly-linked list (next and prev pointers only; no payload).
// This list is intended to be embedded in other data structures.
struct list {
struct list *next, *prev;
};
......
#if defined(__x86_64) || defined(__i386)

/* Full fence: mfence instruction plus a compiler memory barrier. */
static inline void mfence (void) {
    __asm__ volatile ("mfence":::"memory");
}
/* Load ("read") fence.  x86 has no "rfence" instruction; the load fence is
 * spelled lfence.  The function keeps its original name for existing callers. */
static inline void rfence (void) {
    __asm__ volatile ("lfence":::"memory");
}
/* Store fence: sfence instruction plus a compiler memory barrier. */
static inline void sfence (void) {
    __asm__ volatile ("sfence":::"memory");
}

/* According to the Intel Architecture Software Developer's
 * Manual, Volume 3: System Programming Guide
 * (http://www.intel.com/design/pro/manuals/243192.htm), page 7-6,
 * "For the P6 family processors, locked operations serialize all
 * outstanding load and store operations (that is, wait for them to
 * complete)."
 *
 * Bradley found that the fence instructions are faster on an opteron
 *  mfence takes 8ns on a 1.5GHZ AMD64 (maybe this is an 801)
 *  sfence takes 5ns
 *  lfence takes 3ns
 *  xchgl  takes 14ns
 */
/* Atomically store x into *ptr and return the value *ptr held before.
 * Both operands are read-write ("+"), and the asm is volatile so that it is
 * never dropped when the caller ignores the result. */
static inline int lock_xchgl(volatile int *ptr, int x)
{
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (*ptr) : : "memory");
    return x;
}

/* A spin lock is a one-element array so that it is passed by reference.
 * 0 means unlocked, 1 means locked. */
typedef volatile int SPINLOCK[1];

/* Put the lock into the unlocked state. */
static inline void spin_init (SPINLOCK v) {
    v[0] = 0;
    mfence();
}
/* Acquire the lock, spinning until it is available. */
static inline void spin_lock (SPINLOCK v) {
    while (lock_xchgl(v, 1)!=0) {
        while (v[0]); /* Spin using only reads.  It would be better to use MCS locks, but this reduces bus traffic. */
    }
}
/* Release the lock. */
static inline void spin_unlock (SPINLOCK v) {
    sfence(); // Want all previous stores to take place before we unlock.
    v[0]=0;
}

#else
#error Need to define architecture-specific stuff for other machines.
#endif
# Build the lock timing program with make's implicit rule.
CFLAGS=-O2 -Wall -W -Werror
# Libraries go in LDLIBS so that they follow the source file on the link line.
LDLIBS=-lpthread
trylock:
/* Time {m,l,s}fence vs.xchgl for a memory barrier. */
/* Timing numbers:
* Intel T2500 2GHZ
do1 9.0ns/loop
mfence: 29.0ns/loop (marginal cost= 20.0ns)
sfence: 17.3ns/loop (marginal cost= 8.3ns)
lfence: 23.6ns/loop (marginal cost= 14.6ns)
xchgl: 35.8ns/loop (marginal cost= 26.8ns)
* AMD Athlon 64 X2 Dual Core Processor 4200+
Timings are more crazy
do1 20.6ns/loop
mfence: 12.9ns/loop (marginal cost= -7.6ns)
sfence: 8.4ns/loop (marginal cost= -12.1ns)
lfence: 20.2ns/loop (marginal cost= -0.3ns)
xchgl: 16.6ns/loop (marginal cost= -3.9ns)
do1 13.0ns/loop
mfence: 25.6ns/loop (marginal cost= 12.6ns)
sfence: 21.0ns/loop (marginal cost= 8.1ns)
lfence: 12.9ns/loop (marginal cost= -0.1ns)
xchgl: 29.3ns/loop (marginal cost= 16.3ns)
*/
#include <sys/time.h>
#include <stdio.h>
enum { COUNT = 100000000 };
/* Execute one xchgl between a register and a stack variable, to time it as a
 * memory barrier.
 *
 * According to the Intel Architecture Software Developer's
 * Manual, Volume 3: System Programming Guide
 * (http://www.intel.com/design/pro/manuals/243192.htm), page
 * 7-6, "For the P6 family processors, locked operations
 * serialize all outstanding load and store operations (that
 * is, wait for them to complete)."
 * Since xchg is locked by default, it is one way to do membar.
 */
static inline void xchgl (void) {
    /* y is initialized because the instruction reads it, and both operands
     * are declared read-write because the instruction modifies both. */
    int x=0, y=0;
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (y) : : "memory");
}
/* Each of the three wrappers below issues the x86 fence instruction it is named
 * after; the "memory" clobber also stops the compiler from moving memory
 * accesses across the call. */
static inline void mfence (void) {
asm volatile ("mfence":::"memory");
}
static inline void lfence (void) {
asm volatile ("lfence":::"memory");
}
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
/* Average time per loop iteration, in nanoseconds, for a run of COUNT
 * iterations that began at *start and finished at *end.
 * The microsecond fields are subtracted; the previous version added them,
 * which skewed every measurement by up to two seconds' worth of time. */
double tdiff (struct timeval *start, struct timeval *end) {
    double elapsed_s = (end->tv_sec - start->tv_sec) + 1e-6*(end->tv_usec - start->tv_usec);
    return (elapsed_s/COUNT)*1e9;
}
double nop_cost;
/* Baseline measurement: time COUNT iterations of four volatile increments with
 * no fence between them, store the per-iteration cost in nop_cost, and print it. */
void do1 (volatile int *x) {
    struct timeval start, end;
    int remaining;
    gettimeofday(&start, 0);
    for (remaining=COUNT; remaining>0; remaining--) {
        x[0]++;
        x[1]++;
        x[2]++;
        x[3]++;
    }
    gettimeofday(&end, 0);
    nop_cost = tdiff(&start, &end);
    printf("do1 %6.1fns/loop\n", nop_cost);
}
/* doit(name) defines a function do<name>(volatile int *x) that runs the same
 * loop as do1() but calls name() between the second and third increment.
 * It prints the per-iteration cost and the difference from nop_cost, so do1()
 * must have run first for the "marginal cost" figure to be meaningful. */
#define doit(name) void do ##name (volatile int *x) { \
int i; \
struct timeval start, end; \
gettimeofday(&start, 0); \
for (i=0; i<COUNT; i++) { \
x[0]++; \
x[1]++; \
name(); \
x[2]++; \
x[3]++; \
} \
gettimeofday(&end, 0); \
double this_cost = tdiff(&start, &end); \
printf("%6s:%6.1fns/loop (marginal cost=%6.1fns)\n", #name, this_cost, this_cost-nop_cost); \
}
doit(mfence)
doit(lfence)
doit(sfence)
doit(xchgl)
/* Run the baseline loop and each fenced variant twice, printing the timings. */
int main (int argc __attribute__((__unused__)),
          char *argv[] __attribute__((__unused__))) {
    /* Zero-initialized: the timing loops increment these counters, and
     * incrementing an uninitialized int is undefined behavior. */
    int x[4] = {0, 0, 0, 0};
    int i;
    for (i=0; i<2; i++) {
        do1(x);
        domfence(x);
        dosfence(x);
        dolfence(x);
        doxchgl(x);
    }
    return 0;
}
/* How expensive is
* - Obtaining a read-only lock for the first obtainer.
* - Obtaining it for the second one?
* - The third one? */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
/* Elapsed time from *start to *end, in microseconds. */
float tdiff (struct timeval *start, struct timeval *end) {
    double from_seconds = 1e6*(end->tv_sec - start->tv_sec);
    double from_usecs   = end->tv_usec - start->tv_usec;
    return from_seconds + from_usecs;
}
/* My own rwlock implementation. */
struct brwl {
int mutex; // spin flag taken with xchg() while state is updated: 0 free, 1 held
int state; // 0 for unlocked, -1 for a writer, otherwise many readers
};
/* Atomically store x into *ptr and return the value *ptr held before.
 * The memory operand is declared read-write ("+m") because xchgl modifies it,
 * and the asm is volatile so that the exchange is not dropped when the caller
 * ignores the returned value. */
static inline int xchg(volatile int *ptr, int x)
{
    __asm__ volatile ("xchgl %0,%1" : "+r" (x), "+m" (*ptr) : : "memory");
    return x;
}
/* Issue the x86 sfence instruction; the "memory" clobber also keeps the
 * compiler from moving memory accesses across it. */
static inline void sfence (void) {
asm volatile ("sfence":::"memory");
}
/* Take a read lock on l: spin on l->mutex with xchg(), increment l->state, then
 * release the mutex.  The "#if 1" branch releases it with sfence() followed by a
 * plain store; the disabled branch would release it with another xchg().
 * NOTE(review): state is incremented without checking for -1 (a writer) --
 * presumably acceptable because this file only times uncontended read locks; confirm. */
static inline void brwl_rlock (struct brwl *l) {
while (xchg(&l->mutex, 1)) ;
l->state++;
#if 1
sfence();
l->mutex=0;
#else
xchg(&l->mutex, 0);
#endif
}
enum {K=1000};
pthread_rwlock_t rwlocks[K];
struct brwl blocks[K];
/* Time the first read-lock acquisition on K fresh locks, three rounds each, for
 * pthread_rwlock_tryrdlock, pthread_rwlock_rdlock and the home-made brwl_rlock.
 * tdiff() returns microseconds, so K/tdiff is millions of operations per second. */
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    int j;
    int i;
    int r;
    struct timeval start, end;
    for (j=0; j<3; j++) {
        for (i=0; i<K; i++) {
            r=pthread_rwlock_init(&rwlocks[i], NULL);
            assert(r==0);
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            r = pthread_rwlock_tryrdlock(&rwlocks[i]);
            assert(r==0);
        }
        gettimeofday(&end, 0);
        printf("pthread_rwlock_tryrdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
        /* Release and destroy (outside the timed region) so the next round does
         * not call pthread_rwlock_init on a lock that is still initialized and held. */
        for (i=0; i<K; i++) {
            r = pthread_rwlock_unlock(&rwlocks[i]);
            assert(r==0);
            r = pthread_rwlock_destroy(&rwlocks[i]);
            assert(r==0);
        }
    }
    for (j=0; j<3; j++) {
        for (i=0; i<K; i++) {
            r=pthread_rwlock_init(&rwlocks[i], NULL);
            assert(r==0);
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            r = pthread_rwlock_rdlock(&rwlocks[i]);
            assert(r==0);
        }
        gettimeofday(&end, 0);
        printf("pthread_rwlock_rdlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
        /* Same cleanup as above, again outside the timed region. */
        for (i=0; i<K; i++) {
            r = pthread_rwlock_unlock(&rwlocks[i]);
            assert(r==0);
            r = pthread_rwlock_destroy(&rwlocks[i]);
            assert(r==0);
        }
    }
    for (j=0; j<3; j++) {
        /* The home-made locks are plain ints, so resetting them is all the cleanup needed. */
        for (i=0; i<K; i++) {
            blocks[i].state=0;
            blocks[i].mutex=0;
        }
        gettimeofday(&start, 0);
        for (i=0; i<K; i++) {
            brwl_rlock(&blocks[i]);
        }
        gettimeofday(&end, 0);
        printf("brwl_rlock took %9.3fus for %d ops: %9.3fus/lock (%9.3fMops/s)\n", tdiff(&start,&end), K, tdiff(&start,&end)/K, K/tdiff(&start,&end));
    }
    return 0;
}
#define _MULTI_THREADED
#include <pthread.h>
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/time.h>
/* Microseconds elapsed between *start and *end. */
float tdiff (struct timeval *start, struct timeval *end) {
    double sec_part_us  = 1e6*(end->tv_sec - start->tv_sec);
    double usec_part_us = end->tv_usec - start->tv_usec;
    return sec_part_us + usec_part_us;
}
/* Check the return code of a library call.
 * A zero rc means success and the function simply returns.  Any other value
 * prints the description in string together with rc and terminates the
 * program with EXIT_FAILURE. */
static void compResults(char *string, int rc) {
    if (rc == 0) {
        return;
    }
    printf("Error on : %s, rc=%d", string, rc);
    exit(EXIT_FAILURE);
}
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;
/* Thread body: try to take the read lock on the global rwlock with
 * pthread_rwlock_tryrdlock, timing every attempt.  While the lock is busy it
 * sleeps one second and retries, giving up (and exiting the program) after ten
 * retries.  Once the lock is held it sleeps two seconds, unlocks, and prints how
 * long the unlock took.  Always returns NULL. */
void *rdlockThread(void *arg)
{
    int rc;
    int count=0;
    struct timeval start, end;
    (void)arg; /* unused */

    printf("Entered thread, getting read lock with mp wait\n");
 Retry:
    gettimeofday(&start, 0);
    rc = pthread_rwlock_tryrdlock(&rwlock);
    gettimeofday(&end, 0);
    printf("pthread_rwlock_tryrdlock took %9.3fus\n", tdiff(&start,&end));
    if (rc == EBUSY) {
        if (count >= 10) {
            printf("Retried too many times, failure!\n");
            exit(EXIT_FAILURE);
        }
        ++count;
        printf("Could not get lock, do other work, then RETRY...\n");
        sleep(1);
        goto Retry;
    }
    compResults("pthread_rwlock_tryrdlock() 1\n", rc);
    sleep(2);
    printf("unlock the read lock\n");
    gettimeofday(&start, 0);
    rc = pthread_rwlock_unlock(&rwlock);
    gettimeofday(&end, 0);
    compResults("pthread_rwlock_unlock()\n", rc);
    /* tv_sec and tv_usec are not ints, so cast to long and print with %ld;
     * the microsecond part is zero-padded so it reads as a decimal fraction. */
    printf("%ld.%06ld to %ld.%06ld is %9.2f\n",
           (long)start.tv_sec, (long)start.tv_usec,
           (long)end.tv_sec, (long)end.tv_usec,
           tdiff(&start, &end));
    printf("Secondary thread complete\n");
    return NULL;
}
/* Demonstration and timing of pthread rwlock operations on the global rwlock:
 *   1. time two back-to-back gettimeofday calls ("nop"),
 *   2. time N write-lock/unlock pairs,
 *   3. take the write lock, start rdlockThread, hold the lock for five seconds
 *      while the thread retries its tryrdlock, then release it,
 *   4. join the thread and destroy the lock.
 * argc is not used. */
int main(int argc, char **argv)
{
int rc=0;
pthread_t thread;
struct timeval start, end;
printf("Enter Testcase - %s\n", argv[0]);
gettimeofday(&start, 0);
gettimeofday(&end, 0);
printf("nop Took %9.2f\n", tdiff(&start, &end));
{
int N=1000;
int i;
printf("Main, get and release the write lock %d times\n", N);
gettimeofday(&start, 0);
for (i=0; i<N; i++) {
/* Return codes are deliberately not checked inside the timed loop; only the
 * rc of the final unlock reaches compResults below. */
rc = pthread_rwlock_wrlock(&rwlock);
rc = pthread_rwlock_unlock(&rwlock);
}
gettimeofday(&end, 0);
compResults("pthread_rwlock_wrlock()\n", rc);
/* tdiff is in microseconds, so 1000*tdiff/N is nanoseconds per lock/unlock pair. */
printf("Took %9.2fns/op\n", 1000*tdiff(&start, &end)/N);
}
printf("Main, get the write lock\n");
gettimeofday(&start, 0);
rc = pthread_rwlock_wrlock(&rwlock);
gettimeofday(&end, 0);
compResults("pthread_rwlock_wrlock()\n", rc);
printf("Took %9.2f\n", tdiff(&start, &end));
printf("Main, create the try read lock thread\n");
rc = pthread_create(&thread, NULL, rdlockThread, NULL);
compResults("pthread_create\n", rc);
printf("Main, wait a bit holding the write lock\n");
sleep(5);
printf("Main, Now unlock the write lock\n");
gettimeofday(&start, 0);
rc = pthread_rwlock_unlock(&rwlock);
gettimeofday(&end, 0);
compResults("pthread_rwlock_unlock()\n", rc);
printf("Took %9.2f\n", tdiff(&start, &end));
printf("Main, wait for the thread to end\n");
rc = pthread_join(thread, NULL);
compResults("pthread_join\n", rc);
rc = pthread_rwlock_destroy(&rwlock);
compResults("pthread_rwlock_destroy()\n", rc);
printf("Main completed\n");
return 0;
}
......@@ -11,13 +11,20 @@ struct tokulogger {
int fd;
int n_in_file;
long long next_log_file_number;
LSN lsn;
char buf[LOGGER_BUF_SIZE];
int n_in_buf;
};
int tokulogger_find_next_unused_log_file(const char *directory, long long *result);
enum { LT_INSERT_WITH_NO_OVERWRITE = 'I', LT_DELETE = 'D', LT_COMMIT = 'C' };
enum {
LT_COMMIT = 'C',
LT_DELETE = 'D',
LT_INSERT_WITH_NO_OVERWRITE = 'I',
LT_CHECKPOINT = 'P',
LT_BLOCK_RENAME = 'R'
};
struct tokutxn {
u_int64_t txnid64;
......
......@@ -2,7 +2,6 @@
#include "log-internal.h"
#include "wbuf.h"
#include "memory.h"
#include "../src/ydb-internal.h"
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
......@@ -11,6 +10,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/uio.h>
#include "../src/ydb-internal.h"
int tokulogger_find_next_unused_log_file(const char *directory, long long *result) {
DIR *d=opendir(directory);
......@@ -44,6 +44,9 @@ int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *result
result->fd = -1;
result->next_log_file_number = nexti;
result->n_in_buf = 0;
result->lsn.lsn = 0; // WRONG!!! This should actually be calculated by looking at the log file.
*resultp=result;
return tokulogger_log_bytes(result, 0, "");
}
......@@ -85,26 +88,6 @@ int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes) {
return 0;
}
// Log an insertion of a key-value pair into a particular node of the tree.
int tokulogger_log_brt_insert_with_no_overwrite (TOKULOGGER logger,
TXNID txnid,
diskoff diskoff,
unsigned char *key,
int keylen,
unsigned char *val,
int vallen) {
int buflen=30+keylen+vallen;
unsigned char buf[buflen];
struct wbuf wbuf;
wbuf_init(&wbuf, buf, buflen) ;
wbuf_char(&wbuf, LT_INSERT_WITH_NO_OVERWRITE);
wbuf_txnid(&wbuf, txnid);
wbuf_diskoff(&wbuf, diskoff);
wbuf_bytes(&wbuf, key, keylen);
wbuf_bytes(&wbuf, val, vallen);
return tokulogger_log_bytes(logger, wbuf.ndone, wbuf.buf);
}
int tokulogger_log_close(TOKULOGGER *loggerp) {
TOKULOGGER logger = *loggerp;
int r = 0;
......@@ -133,29 +116,6 @@ n
}
#endif
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, diskoff diskoff, int is_add, const struct kv_pair *pair) {
if (txn==0) return 0;
int keylen = pair->keylen;
int vallen = pair->vallen;
int buflen=(keylen+vallen+4+4 // the key and value
+1 // log command
+8 // txnid
+8 // fileid
+8 // diskoff
);
unsigned char buf[buflen];
struct wbuf wbuf;
wbuf_init(&wbuf, buf, buflen) ;
wbuf_char(&wbuf, is_add ? LT_INSERT_WITH_NO_OVERWRITE : LT_DELETE);
wbuf_txnid(&wbuf, txn->txnid64);
wbuf_fileid(&wbuf, db->i->fileid);
wbuf_diskoff(&wbuf, diskoff);
wbuf_bytes(&wbuf, kv_pair_key_const(pair), keylen);
wbuf_bytes(&wbuf, kv_pair_val_const(pair), vallen);
return tokulogger_log_bytes(txn->logger, wbuf.ndone, wbuf.buf);
}
int tokulogger_fsync (TOKULOGGER logger) {
//return 0;/// NO TXN
//fprintf(stderr, "%s:%d syncing log\n", __FILE__, __LINE__);
......@@ -171,19 +131,101 @@ int tokulogger_fsync (TOKULOGGER logger) {
return 0;
}
// Finish a log record held in wbuf and hand it to the logger.
// Writes toku_crc32 of the bytes already in the buffer, then writes 4 plus the
// byte count at that point, then passes the buffer to tokulogger_log_bytes and
// returns its result.
// NOTE(review): presumably wbuf_int appends 4 bytes and advances wbuf->ndone,
// which would make the second field the total record length -- confirm in wbuf.h.
static int tokulogger_finish (TOKULOGGER logger, struct wbuf *wbuf) {
wbuf_int(wbuf, toku_crc32(0, wbuf->buf, wbuf->ndone));
wbuf_int(wbuf, 4+wbuf->ndone);
return tokulogger_log_bytes(logger, wbuf->ndone, wbuf->buf);
}
// Log an insertion of a key-value pair into a particular node of the tree.
// The record is written in this order: command byte (LT_INSERT_WITH_NO_OVERWRITE),
// the logger's current LSN (which is then incremented), txnid, fileid, diskoff,
// key, value; tokulogger_finish then adds the trailing CRC and length.
// Returns the result of tokulogger_finish.
int tokulogger_log_brt_insert_with_no_overwrite (TOKULOGGER logger,
TXNID txnid,
FILENUM fileid,
DISKOFF diskoff,
unsigned char *key,
int keylen,
unsigned char *val,
int vallen) {
// Size budget for the record; each term matches one field written below.
// NOTE(review): the two "+4" terms next to the key and value presumably cover
// length prefixes written by wbuf_bytes -- confirm in wbuf.h.
int buflen=(keylen+vallen+4+4 // key and value
+1 // command
+8 // lsn
+8 // txnid
+4 // fileid
+8 // diskoff
+8 // crc and len
);
// NOTE(review): variable-length array on the stack, sized by the caller's key and value lengths.
unsigned char buf[buflen];
struct wbuf wbuf;
wbuf_init(&wbuf, buf, buflen) ;
wbuf_char(&wbuf, LT_INSERT_WITH_NO_OVERWRITE);
wbuf_lsn (&wbuf, logger->lsn); logger->lsn.lsn++;
wbuf_txnid(&wbuf, txnid);
wbuf_filenum(&wbuf, fileid);
wbuf_diskoff(&wbuf, diskoff);
wbuf_bytes(&wbuf, key, keylen);
wbuf_bytes(&wbuf, val, vallen);
return tokulogger_finish (logger, &wbuf);
}
/* Log a physical add (is_add nonzero) or delete (is_add zero) of PAIR in the
 * leaf at DISKOFF.
 *
 * When TXN is null nothing is logged and 0 is returned.  Otherwise DB must be
 * non-null (asserted).  The record carries: command byte, LSN, txnid, file
 * number, diskoff, key bytes, value bytes, and the CRC/length trailer added by
 * tokulogger_finish().  The LSN of the transaction's logger is advanced by
 * one.  Returns the result of tokulogger_finish(). */
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, DISKOFF diskoff, int is_add, const struct kv_pair *pair) {
    if (txn==0) {
	return 0;
    }
    assert(db);

    const int klen = pair->keylen;
    const int vlen = pair->vallen;
    /* Bytes needed besides the key and value payloads themselves. */
    enum { OVERHEAD = 4+4   /* key and value framing */
		      +1    /* log command */
		      +8    /* lsn */
		      +8    /* txnid */
		      +8    /* fileid */
		      +8    /* diskoff */
		      +8    /* crc & len */
    };
    const int capacity = klen + vlen + OVERHEAD;
    unsigned char scratch[capacity];
    struct wbuf w;

    wbuf_init(&w, scratch, capacity);
    wbuf_char(&w, is_add ? LT_INSERT_WITH_NO_OVERWRITE : LT_DELETE);

    /* Stamp the record with the current LSN, then advance it. */
    wbuf_lsn(&w, txn->logger->lsn);
    txn->logger->lsn.lsn++;

    wbuf_txnid(&w, txn->txnid64);
    wbuf_filenum(&w, db->i->fileid);
    wbuf_diskoff(&w, diskoff);
    wbuf_bytes(&w, kv_pair_key_const(pair), klen);
    wbuf_bytes(&w, kv_pair_val_const(pair), vlen);

    return tokulogger_finish(txn->logger, &w);
}
/* Write an LT_COMMIT record for TXN to its logger.
 *
 * Record layout: command byte, LSN, txnid, then the CRC/length trailer
 * appended by tokulogger_finish().  The logger's LSN is advanced by one.
 *
 * Returns the nonzero error from tokulogger_finish() if writing fails.
 * Otherwise: a transaction that has a parent returns 0 without syncing; a
 * transaction without a parent returns the result of tokulogger_fsync().
 *
 * The buffer is sized once and the record is emitted exactly once, through
 * tokulogger_finish(), so it always carries its trailer. */
int tokulogger_log_commit (TOKUTXN txn) {
    struct wbuf wbuf;
    const int buflen = (1   // log command
			+8  // lsn
			+8  // txnid
			+8  // crc & len
			);
    unsigned char buf[buflen];
    wbuf_init(&wbuf, buf, buflen);
    wbuf_char(&wbuf, LT_COMMIT);
    wbuf_lsn (&wbuf, txn->logger->lsn);
    txn->logger->lsn.lsn++;
    wbuf_txnid(&wbuf, txn->txnid64);
    int r = tokulogger_finish(txn->logger, &wbuf);
    if (r!=0) return r;
    if (txn->parent) return 0;
    else return tokulogger_fsync(txn->logger);
}
/* Write an LT_CHECKPOINT record to LOGGER.
 *
 * Record layout: command byte, LSN, then the CRC/length trailer appended by
 * tokulogger_finish(), so the checkpoint record is framed the same way as the
 * other record types written in this file.
 *
 * The LSN stamped into the record is stored through LSN, and the logger's LSN
 * is then advanced by one.  Returns the result of tokulogger_finish(). */
int tokulogger_log_checkpoint (TOKULOGGER logger, LSN *lsn) {
    struct wbuf wbuf;
    const int buflen = (1   // log command
			+8  // lsn
			+8  // crc & len
			);
    unsigned char buf[buflen];
    wbuf_init(&wbuf, buf, buflen);
    wbuf_char(&wbuf, LT_CHECKPOINT);
    wbuf_lsn (&wbuf, logger->lsn);
    *lsn = logger->lsn;
    logger->lsn.lsn++;
    return tokulogger_finish(logger, &wbuf);
}
int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKULOGGER logger) {
TAGMALLOC(TOKUTXN, result);
if (result==0) return errno;
......@@ -194,3 +236,35 @@ int tokutxn_begin (TOKUTXN parent_tokutxn, TOKUTXN *tokutxn, TXNID txnid64, TOKU
return 0;
}
/* Write an LT_BLOCK_RENAME record to LOGGER.
 *
 * Record layout, in write order: command byte, LSN, file number, old diskoff,
 * new diskoff, parent diskoff, child number; the CRC/length trailer is
 * appended by tokulogger_finish().  The logger's LSN is advanced by one.
 * Returns the result of tokulogger_finish(). */
int tokulogger_log_block_rename (TOKULOGGER logger, FILENUM fileid, DISKOFF olddiskoff, DISKOFF newdiskoff, DISKOFF parentdiskoff, int childnum) {
    enum { RECORD_SIZE = 1    /* log command */
			 +8   /* lsn */
			 +8   /* fileid */
			 +8   /* olddiskoff */
			 +8   /* newdiskoff */
			 +8   /* parentdiskoff */
			 +4   /* childnum */
			 +8   /* crc & len */
    };
    const int capacity = RECORD_SIZE;
    unsigned char scratch[capacity];
    struct wbuf w;

    wbuf_init(&w, scratch, capacity);
    wbuf_char(&w, LT_BLOCK_RENAME);

    /* Stamp the record with the current LSN, then advance it. */
    wbuf_lsn(&w, logger->lsn);
    logger->lsn.lsn++;

    wbuf_filenum(&w, fileid);
    wbuf_diskoff(&w, olddiskoff);
    wbuf_diskoff(&w, newdiskoff);
    wbuf_diskoff(&w, parentdiskoff);
    wbuf_int(&w, childnum);

    return tokulogger_finish(logger, &w);
}
/*
int brtenv_checkpoint (BRTENV env) {
init the checkpointing lock
acquire_spinlock(&env->checkpointing);
release_spinlock(&env->checkpointing);
return -1;
}
*/
......@@ -3,16 +3,17 @@
#include "../include/db.h"
#include "brttypes.h"
#include "kv-pair.h"
/* Handles for a logger and for a transaction. */
typedef struct tokulogger *TOKULOGGER;
typedef struct tokutxn *TOKUTXN;

int tokulogger_create_and_open_logger (const char *directory, TOKULOGGER *resultp);
int tokulogger_log_bytes(TOKULOGGER logger, int nbytes, void *bytes);
int tokulogger_log_close(TOKULOGGER *logger);

/* Write a checkpoint record; the LSN stamped into it is stored through the LSN* argument. */
int tokulogger_log_checkpoint (TOKULOGGER, LSN*);

/* Log the add (is_add nonzero) or delete (is_add zero) of a pair in the leaf at diskoff.
 * Returns 0 without logging when txn is null. */
int tokulogger_log_phys_add_or_delete_in_leaf (DB *db, TOKUTXN txn, DISKOFF diskoff, int is_add, const struct kv_pair *pair);

/* Write a commit record for txn; a transaction without a parent also fsyncs the log. */
int tokulogger_log_commit (TOKUTXN txn);

/* Write a block-rename record (old/new/parent block offsets and the child number). */
int tokulogger_log_block_rename (TOKULOGGER logger, FILENUM fileid, DISKOFF olddiskoff, DISKOFF newdiskoff, DISKOFF parentdiskoff, int childnum);

int tokutxn_begin (TOKUTXN /*parent*/,TOKUTXN *, TXNID txnid64, TOKULOGGER logger);
#endif
......@@ -47,4 +47,8 @@ void *mempool_malloc(struct mempool *mp, int size, int alignment);
pool does not keep track of the locations of the free chunks */
void mempool_mfree(struct mempool *mp, void *vp, int size);
/* Return nonzero when the SIZE bytes starting at VP lie entirely inside the
 * pool's region [mp->base, mp->base + mp->size), and zero otherwise.
 *
 * The address arithmetic is done on char pointers: adding an integer to a
 * void pointer is a GNU extension rather than ISO C, and char-pointer
 * arithmetic gives the same byte-granular result portably. */
static inline int mempool_inrange(struct mempool *mp, void *vp, int size) {
    const char *pool_begin = (const char *)mp->base;
    const char *pool_end   = pool_begin + mp->size;
    const char *obj_begin  = (const char *)vp;
    const char *obj_end    = obj_begin + size;
    return pool_begin <= obj_begin && obj_end <= pool_end;
}
#endif
......@@ -10,6 +10,7 @@ struct pma_cursor {
struct pma {
enum typ_tag tag;
int dup_mode;
int N; /* How long is the array? Always a power of two >= 4. */
int n_pairs_present; /* How many array elements are non-null. */
struct kv_pair **pairs;
......@@ -23,7 +24,8 @@ struct pma {
* The density step is 0.10. */
double ldt_step; /* lower density threshold step */
struct list cursors;
int (*compare_fun)(DB*,const DBT*,const DBT*);
pma_compare_fun_t compare_fun;
pma_compare_fun_t dup_compare_fun;
void *skey, *sval; /* used in dbts */
struct mempool kvspace;
};
......@@ -36,49 +38,6 @@ int pmainternal_make_space_at (PMA pma, int idx);
int pmainternal_find (PMA pma, DBT *, DB*); // The DB is so the comparison fuction can be called.
void print_pma (PMA pma); /* useful for debugging, so keep the name short. I.e., not pmainternal_print_pma() */
/*
* resize the pma array to asksize. zero all array entries starting from startx.
*/
int __pma_resize_array(PMA pma, int asksize, int startx);
/*
* extract pairs from the pma in the window delimited by lo and hi.
*/
struct kv_pair_tag *__pma_extract_pairs(PMA pma, int count, int lo, int hi);
/*
* update the cursors in a cursor set given a set of tagged pairs.
*/
void __pma_update_cursors(PMA pma, struct list *cursorset, struct kv_pair_tag *tpairs, int n);
/*
* update this pma's cursors given a set of tagged pairs.
*/
void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n);
/*
* a deletion occurred at index "here" in the pma. rebalance the windows around "here". if
* necessary, shrink the pma.
*/
void __pma_delete_at(PMA pma, int here);
/*
* if the pma entry at here is deleted and there are no more references to it
* then finish the deletion
*/
void __pma_delete_resume(PMA pma, int here);
/*
* finish a deletion from the pma. called when there are no cursor references
* to the kv pair.
*/
void __pma_delete_finish(PMA pma, int here);
/*
* count the number of cursors that reference a pma pair
*/
int __pma_count_cursor_refs(PMA pma, int here);
/* density thresholds */
#define PMA_LDT_HIGH 0.25
#define PMA_LDT_LOW 0.40
......
#include "../include/db.h"
#include "memory.h"
#include "brt-internal.h"
#include "key.h"
#include <assert.h>
#include <string.h>
......@@ -7,12 +6,11 @@
#include <stdio.h>
#include <arpa/inet.h>
#include "list.h"
#include "kv-pair.h"
#include "pma-internal.h"
/* Null placeholders passed where a test has no transaction, no DB and no
 * disk offset; NULL_ARGS expands to the three of them in that order.
 * null_diskoff is defined once, with the DISKOFF type. */
TOKUTXN const null_txn = 0;
DB * const null_db = 0;
const DISKOFF null_diskoff = -1;
#define NULL_ARGS null_db, null_txn, null_diskoff
......@@ -253,33 +251,62 @@ static void test_count_region (void) {
kv_pair_free(pairs[i]);
}
/* Fold one key/value pair into an expected fingerprint sum and check it
 * against the actual sum.
 *
 * The pair contributes rand4fingerprint * toku_calccrc32_kvpair(key, klen,
 * data, dlen), added to *expect_fingerprint in unsigned 32-bit arithmetic.
 * Callers pass the negated multiplier to take a pair's contribution back out.
 * After the update, *expect_fingerprint must equal actual_fingerprint
 * (asserted). */
void add_fingerprint_and_check(u_int32_t rand4fingerprint, u_int32_t actual_fingerprint, u_int32_t *expect_fingerprint, const void *key, int klen, const void *data, int dlen) {
    const u_int32_t contribution = rand4fingerprint*toku_calccrc32_kvpair(key, klen, data, dlen);
    const u_int32_t updated = *expect_fingerprint + contribution;

    *expect_fingerprint = updated;
    assert(updated==actual_fingerprint);
}
/* Test helper: insert (key, data) into PMA and check the fingerprint bookkeeping.
 *
 * On entry *sum and *expect_fingerprint must agree.  pma_insert() must return
 * BRT_OK; it is given rand4fingerprint and sum so it can update the running
 * sum.  The pair is then added to *expect_fingerprint, which must match *sum,
 * and finally pma_verify_fingerprint() is run on the pma. */
static void do_insert (PMA pma, const void *key, int keylen, const void *data, int datalen, u_int32_t rand4fingerprint, u_int32_t *sum, u_int32_t *expect_fingerprint) {
    DBT key_dbt, val_dbt;
    int r;

    /* The caller's expected and actual sums must be in step before we start. */
    assert(*sum==*expect_fingerprint);

    r = pma_insert(pma,
		   fill_dbt(&key_dbt, key, keylen),
		   fill_dbt(&val_dbt, data, datalen),
		   NULL_ARGS, rand4fingerprint, sum);
    assert(r==BRT_OK);

    add_fingerprint_and_check(rand4fingerprint, *sum, expect_fingerprint,
			      key, keylen, data, datalen);
    pma_verify_fingerprint(pma, rand4fingerprint, *sum);
}
/* Test helper: delete KEY from PMA and check the fingerprint bookkeeping.
 *
 * DATA/DATALEN are not passed to pma_delete(); they are used only to work out
 * how much the expected fingerprint should drop.  On entry *sum and
 * *expect_fingerprint must agree.  pma_delete() must return BRT_OK.  The
 * pair's contribution is then removed from *expect_fingerprint (by adding it
 * with the negated multiplier), which must match *sum, and finally
 * pma_verify_fingerprint() is run on the pma. */
static void do_delete (PMA pma, const void *key, int keylen, const void *data, int datalen, u_int32_t rand4fingerprint, u_int32_t *sum, u_int32_t *expect_fingerprint) {
    DBT key_dbt;
    int r;

    /* The caller's expected and actual sums must be in step before we start. */
    assert(*sum==*expect_fingerprint);

    r = pma_delete(pma, fill_dbt(&key_dbt, key, keylen), 0, rand4fingerprint, sum);
    assert(r==BRT_OK);

    /* negative rand4 means subtract. */
    add_fingerprint_and_check(-rand4fingerprint, *sum, expect_fingerprint,
			      key, keylen, data, datalen);
    pma_verify_fingerprint(pma, rand4fingerprint, *sum);
}
static void test_pma_random_pick (void) {
PMA pma;
int r = pma_create(&pma, default_compare_fun, 0);
bytevec key,val;
ITEMLEN keylen,vallen;
DBT k,v;
DBT k;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
assert(r==0);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==DB_NOTFOUND);
r = pma_insert(pma, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), NULL_ARGS);
assert(r==BRT_OK);
do_insert(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0);
assert(keylen==6); assert(vallen==6);
assert(strcmp(key,"hello")==0);
assert(strcmp(val,"there")==0);
r = pma_delete(pma, fill_dbt(&k, "nothello", 9), 0);
r = pma_delete(pma, fill_dbt(&k, "nothello", 9), 0, rand4fingerprint, &sum);
assert(r==DB_NOTFOUND);
r = pma_delete(pma, fill_dbt(&k, "hello", 6), 0);
assert(r==BRT_OK);
assert(sum==expect_fingerprint); // didn't change because nothing was deleted.
do_delete(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==DB_NOTFOUND);
r = pma_insert(pma, fill_dbt(&k, "hello", 6), fill_dbt(&v, "there", 6), NULL_ARGS);
assert(r==BRT_OK);
do_insert(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0);
......@@ -287,26 +314,29 @@ static void test_pma_random_pick (void) {
assert(strcmp(key,"hello")==0);
assert(strcmp(val,"there")==0);
r = pma_insert(pma, fill_dbt(&k, "aaa", 4), fill_dbt(&v, "athere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aab", 4), fill_dbt(&v, "bthere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aac", 4), fill_dbt(&v, "cthere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aad", 4), fill_dbt(&v, "dthere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aae", 4), fill_dbt(&v, "ethere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aaf", 4), fill_dbt(&v, "fthere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "aag", 4), fill_dbt(&v, "gthere", 7), NULL_ARGS); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aaa", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aab", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aac", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aad", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aae", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "aag", 4), 0); assert(r==BRT_OK);
r = pma_delete(pma, fill_dbt(&k, "hello", 6), 0); assert(r==BRT_OK);
do_insert(pma, "aaa", 4, "athere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aab", 4, "bthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aac", 4, "cthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aad", 4, "dthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aae", 4, "ethere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aaf", 4, "fthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aag", 4, "gthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
do_delete(pma, "aaa", 4, "athere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "aab", 4, "bthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "aac", 4, "cthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "aad", 4, "dthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "aae", 4, "ethere", 7, rand4fingerprint, &sum, &expect_fingerprint);
/* don't delete aaf */
do_delete(pma, "aag", 4, "gthere", 7, rand4fingerprint, &sum, &expect_fingerprint);
do_delete(pma, "hello", 6, "there", 6, rand4fingerprint, &sum, &expect_fingerprint);
r = pma_random_pick(pma, &key, &keylen, &val, &vallen);
assert(r==0);
assert(keylen==4); assert(vallen==7);
assert(strcmp(key,"aaf")==0);
assert(strcmp(val,"fthere")==0);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
r=pma_free(&pma); assert(r==0);
assert(pma==0);
}
......@@ -315,12 +345,17 @@ static void test_find_insert (void) {
PMA pma;
int r;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
pma_create(&pma, default_compare_fun, 0);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
assert(r==DB_NOTFOUND);
r=pma_insert(pma, fill_dbt(&k, "aaa", 3), fill_dbt(&v, "aaadata", 7), NULL_ARGS);
assert(r==BRT_OK);
do_insert(pma, "aaa", 3, "aaadata", 7, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&v);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
......@@ -329,8 +364,7 @@ static void test_find_insert (void) {
assert(keycompare(v.data,v.size,"aaadata", 7)==0);
//toku_free(v.data); v.data=0;
r=pma_insert(pma, fill_dbt(&k, "bbb", 4), fill_dbt(&v, "bbbdata", 8), NULL_ARGS);
assert(r==BRT_OK);
do_insert(pma, "bbb", 4, "bbbdata", 8, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&v);
r=pma_lookup(pma, fill_dbt(&k, "aaa", 3), &v, 0);
......@@ -344,14 +378,16 @@ static void test_find_insert (void) {
assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL);
r=pma_insert(pma, fill_dbt(&k, "00000", 6), fill_dbt(&v, "d0", 3), NULL_ARGS);
assert(r==BRT_OK);
do_insert(pma, "00000", 6, "d0", 3, rand4fingerprint, &sum, &expect_fingerprint);
assert((unsigned long)pma->pairs[pma_index_limit(pma)]==0xdeadbeefL);
r=pma_free(&pma); assert(r==0); assert(pma==0);
pma_create(&pma, default_compare_fun, 0); assert(pma!=0);
rand4fingerprint = random();
sum = expect_fingerprint = 0;
{
int i;
for (i=0; i<100; i++) {
......@@ -359,9 +395,8 @@ static void test_find_insert (void) {
char dstring[10];
snprintf(string,10,"%05d",i);
snprintf(dstring,10,"d%d", i);
printf("Inserting %d: string=%s dstring=%s\n", i, string, dstring);
r=pma_insert(pma, fill_dbt(&k, string, strlen(string)+1), fill_dbt(&v, dstring, strlen(dstring)+1), NULL_ARGS);
assert(r==BRT_OK);
//printf("Inserting %d: string=%s dstring=%s (before sum=%08x) \n", i, string, dstring, sum);
do_insert(pma, string, strlen(string)+1, dstring, strlen(dstring)+1, rand4fingerprint, &sum, &expect_fingerprint);
}
}
r=pma_free(&pma); assert(r==0); assert(pma==0);
......@@ -386,14 +421,16 @@ static void test_pma_iterate_internal (PMA pma, int expected_k, int expected_v)
/* Insert two pairs into a fresh pma and, after each insertion, run
 * test_pma_iterate_internal() with the running totals of the keys and values
 * inserted so far (42 / -19, then 42+12 / -19-100).
 *
 * Each pair is inserted exactly once, through do_insert(), which also checks
 * the fingerprint after every insertion.  The result of pma_create() is
 * checked as well. */
static void test_pma_iterate (void) {
    PMA pma;
    int r;

    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    r=pma_create(&pma, default_compare_fun, 0); assert(r==0);
    do_insert(pma, "42", 3, "-19", 4, rand4fingerprint, &sum, &expect_fingerprint);
    test_pma_iterate_internal(pma, 42, -19);
    do_insert(pma, "12", 3, "-100", 5, rand4fingerprint, &sum, &expect_fingerprint);
    test_pma_iterate_internal(pma, 42+12, -19-100);
    r=pma_free(&pma); assert(r==0); assert(pma==0);
}
......@@ -403,12 +440,20 @@ static void test_pma_iterate2 (void) {
int r;
int sum=0;
int n_items=0;
DBT k,v;
u_int32_t rand4fingerprint0 = random();
u_int32_t sum0 = 0;
u_int32_t expect_fingerprint0 = 0;
u_int32_t rand4fingerprint1 = random();
u_int32_t sum1 = 0;
u_int32_t expect_fingerprint1 = 0;
r=pma_create(&pma0, default_compare_fun, 0); assert(r==0);
r=pma_create(&pma1, default_compare_fun, 0); assert(r==0);
pma_insert(pma0, fill_dbt(&k, "a", 2), fill_dbt(&v, "aval", 5), NULL_ARGS);
pma_insert(pma0, fill_dbt(&k, "b", 2), fill_dbt(&v, "bval", 5), NULL_ARGS);
pma_insert(pma1, fill_dbt(&k, "x", 2), fill_dbt(&v, "xval", 5), NULL_ARGS);
do_insert(pma0, "a", 2, "aval", 5, rand4fingerprint0, &sum0, &expect_fingerprint0);
do_insert(pma0, "b", 2, "bval", 5, rand4fingerprint0, &sum0, &expect_fingerprint0);
do_insert(pma1, "x", 2, "xval", 5, rand4fingerprint1, &sum1, &expect_fingerprint1);
PMA_ITERATE(pma0,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)),dl, (n_items++,sum+=kl+dl));
PMA_ITERATE(pma1,kv __attribute__((__unused__)),kl,dv __attribute__((__unused__)), dl, (n_items++,sum+=kl+dl));
assert(sum==21);
......@@ -483,11 +528,15 @@ void test_pma_cursor_3 (void) {
PMA_CURSOR c=0;
int r;
DBT key,val;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r=pma_create(&pma, default_compare_fun, 0); assert(r==0);
r=pma_insert(pma, fill_dbt(&k, "x", 2), fill_dbt(&v, "xx", 3), NULL_ARGS); assert(r==BRT_OK);
r=pma_insert(pma, fill_dbt(&k, "m", 2), fill_dbt(&v, "mm", 3), NULL_ARGS); assert(r==BRT_OK);
r=pma_insert(pma, fill_dbt(&k, "aa", 3), fill_dbt(&v,"a", 2), NULL_ARGS); assert(r==BRT_OK);
do_insert(pma, "x", 2, "xx", 3, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "m", 2, "mm", 3, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "aa", 3, "a", 2, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&key); key.flags=DB_DBT_REALLOC;
init_dbt(&val); val.flags=DB_DBT_REALLOC;
r=pma_cursor(pma, &c); assert(r==0); assert(c!=0);
......@@ -545,21 +594,20 @@ void test_pma_cursor_4 (void) {
PMA_CURSOR cursora, cursorb, cursorc;
int i;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_cursor_4\n");
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
for (i=1; i<=4; i += 1) {
DBT dbtk, dbtv;
char k[5]; int v;
sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pma, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma_n_entries(pma) == 4);
printf("a:"); print_pma(pma);
......@@ -586,16 +634,11 @@ void test_pma_cursor_4 (void) {
assert_cursor_val(cursorc, 4);
for (i=5; i<=8; i += 1) {
DBT dbtk, dbtv;
char k[5]; int v;
sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pma, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma_n_entries(pma) == 8);
printf("a:"); print_pma(pma);
......@@ -621,18 +664,19 @@ void test_pma_cursor_delete(int n) {
PMA pma;
int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
/* insert 1 -> 42 */
DBT key, val; int k, v;
int k, v;
int i;
for (i=0; i<n; i++) {
k = i; v = -i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
/* point the cursor to the first kv */
......@@ -660,11 +704,9 @@ void test_pma_cursor_delete(int n) {
toku_free(cursorkey.data);
toku_free(cursorval.data);
/* delete the first key */
/* delete the first key, which is (int)(0) with value (0) */
k = 0;
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
assert(error == 0);
do_delete(pma, &k, sizeof k, &k, sizeof k, rand4fingerprint, &sum, &expect_fingerprint);
/* cursor get should fail */
init_dbt(&cursorkey); cursorkey.flags = DB_DBT_MALLOC;
......@@ -729,12 +771,16 @@ void test_pma_compare_fun (int wrong_endian_p) {
char *right_endian_expected_keys[] = {"00", "01", "10", "11"};
char **expected_keys = wrong_endian_p ? wrong_endian_expected_keys : right_endian_expected_keys;
int i;
DBT k,v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, wrong_endian_p ? wrong_endian_compare_fun : default_compare_fun, 0); assert(r==0);
r = pma_insert(pma, fill_dbt(&k, "10", 3), fill_dbt(&v, "10v", 4), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "00", 3), fill_dbt(&v, "00v", 4), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "01", 3), fill_dbt(&v, "01v", 4), NULL_ARGS); assert(r==BRT_OK);
r = pma_insert(pma, fill_dbt(&k, "11", 3), fill_dbt(&v, "11v", 4), NULL_ARGS); assert(r==BRT_OK);
do_insert(pma, "10", 3, "10v", 4, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "00", 3, "00v", 4, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "01", 3, "01v", 4, rand4fingerprint, &sum, &expect_fingerprint);
do_insert(pma, "11", 3, "11v", 4, rand4fingerprint, &sum, &expect_fingerprint);
init_dbt(&key); key.flags=DB_DBT_REALLOC;
init_dbt(&val); val.flags=DB_DBT_REALLOC;
r=pma_cursor(pma, &c); assert(r==0); assert(c!=0);
......@@ -767,6 +813,15 @@ void test_pma_split_n(int n) {
int i;
int na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_n:%d\n", n);
error = pma_create(&pmaa, default_compare_fun, 0);
......@@ -778,22 +833,24 @@ void test_pma_split_n(int n) {
/* insert some kv pairs */
for (i=0; i<n; i++) {
DBT dbtk, dbtv;
char k[5]; int v;
sprintf(k, "%4.4d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
do_insert(pmaa, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
pma_verify(pmaa, null_db);
}
printf("a:"); print_pma(pmaa);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0);
error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0);
pma_verify(pmaa, null_db);
pma_verify(pmab, null_db);
pma_verify(pmac, null_db);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa);
......@@ -821,6 +878,15 @@ void test_pma_split_varkey(void) {
int i;
int n, na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_varkey\n");
error = pma_create(&pmaa, default_compare_fun, 0);
......@@ -832,22 +898,20 @@ void test_pma_split_varkey(void) {
/* insert some kv pairs */
for (i=0; keys[i]; i++) {
DBT dbtk, dbtv;
char v;
fill_dbt(&dbtk, keys[i], strlen(keys[i])+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
char v = i;
do_insert(pmaa, keys[i], strlen(keys[i])+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
n = i;
printf("a:"); print_pma(pmaa);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0);
error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0);
pma_verify(pmaa, null_db);
pma_verify(pmab, null_db);
pma_verify(pmac, null_db);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa);
......@@ -931,6 +995,16 @@ void test_pma_split_cursor(void) {
int i;
int na, nb, nc;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
u_int32_t brand = random();
u_int32_t bsum = 0;
u_int32_t crand = random();
u_int32_t csum = 0;
printf("test_pma_split_cursor\n");
error = pma_create(&pmaa, default_compare_fun, 0);
......@@ -942,16 +1016,12 @@ void test_pma_split_cursor(void) {
/* insert some kv pairs */
for (i=1; i<=16; i += 1) {
DBT dbtk, dbtv;
char k[11]; int v;
snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&dbtk, &k, strlen(k)+1);
v = i;
fill_dbt(&dbtv, &v, sizeof v);
error = pma_insert(pmaa, &dbtk, &dbtv, NULL_ARGS);
assert(error == BRT_OK);
do_insert(pmaa, k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma_n_entries(pmaa) == 16);
printf("a:"); print_pma(pmaa);
......@@ -979,9 +1049,12 @@ void test_pma_split_cursor(void) {
// print_cursor("cursorc", cursorc);
assert_cursor_val(cursorc, 16);
error = pma_split(pmaa, 0, pmab, 0, pmac, 0);
error = pma_split(pmaa, 0, pmab, 0, brand, &bsum, pmac, 0, crand, &csum);
assert(error == 0);
pma_verify_fingerprint(pmab, brand, bsum);
pma_verify_fingerprint(pmac, crand, csum);
printf("a:"); print_pma(pmaa);
na = pma_n_entries(pmaa);
assert(na == 0);
......@@ -1045,6 +1118,10 @@ void test_pma_bulk_insert_n(int n) {
int i;
DBT *keys, *vals;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_bulk_insert_n: %d\n", n);
error = pma_create(&pma, default_compare_fun, 0);
......@@ -1074,11 +1151,16 @@ void test_pma_bulk_insert_n(int n) {
assert(v);
*v = i;
fill_dbt(&vals[i], v, vlen);
expect_fingerprint += rand4fingerprint*toku_calccrc32_kvpair (k, klen, v, vlen);
}
/* bulk insert n kv pairs */
error = pma_bulk_insert(pma, keys, vals, n);
error = pma_bulk_insert(pma, keys, vals, n, rand4fingerprint, &sum);
assert(error == 0);
assert(sum==expect_fingerprint);
pma_verify(pma, null_db);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
/* verify */
if (0) print_pma(pma);
......@@ -1122,16 +1204,21 @@ void test_pma_insert_or_replace(void) {
int r;
DBT dbtk, dbtv;
int n_diff=-2;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, default_compare_fun, 0);
assert(r==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "aaa", 4), fill_dbt(&dbtv, "zzz", 4), &n_diff, NULL_ARGS);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "aaa", 4), fill_dbt(&dbtv, "zzz", 4), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==-1);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "aaa", 4, "zzz", 4);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "ww", 3), &n_diff, NULL_ARGS);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "ww", 3), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==-1);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "bbbb", 5, "ww", 3);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
......@@ -1139,8 +1226,11 @@ void test_pma_insert_or_replace(void) {
r = pma_lookup(pma, fill_dbt(&dbtk, "bbbb", 5), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==3); assert(memcmp(dbtv.data, "ww", 3)==0);
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "xxxx", 5), &n_diff, NULL_ARGS);
// replace bbbb
r = pma_insert_or_replace(pma, fill_dbt(&dbtk, "bbbb", 5), fill_dbt(&dbtv, "xxxx", 5), &n_diff, NULL_ARGS, rand4fingerprint, &sum);
assert(r==0); assert(n_diff==3);
expect_fingerprint -= rand4fingerprint*toku_calccrc32_kvpair("bbbb", 5, "ww", 3);
add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, "bbbb", 5, "xxxx", 5);
r = pma_lookup(pma, fill_dbt(&dbtk, "aaa", 4), init_dbt(&dbtv), 0);
assert(r==0); assert(dbtv.size==4); assert(memcmp(dbtv.data, "zzz", 4)==0);
......@@ -1160,6 +1250,10 @@ void test_pma_delete_shrink(int n) {
int r;
int i;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_delete_shrink:%d\n", n);
r = pma_create(&pma, default_compare_fun, n*(8 + 11 + sizeof (int)));
......@@ -1169,25 +1263,20 @@ void test_pma_delete_shrink(int n) {
for (i=0; i<n; i++) {
char k[11];
int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
v = i;
fill_dbt(&val, &v, sizeof v);
r = pma_insert(pma, &key, &val, NULL_ARGS);
assert(r == 0);
do_insert(pma, k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
/* delete */
for (i=0; i<n; i++) {
char k[11];
DBT key;
int v=i;
snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
r = pma_delete(pma, &key, 0);
assert(r == 0);
do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma->N == PMA_MIN_ARRAY_SIZE);
......@@ -1205,6 +1294,10 @@ void test_pma_delete_random(int n) {
int i;
int keys[n];
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
printf("test_pma_delete_random:%d\n", n);
r = pma_create(&pma, default_compare_fun, n * (8 + 11 + sizeof (int)));
......@@ -1218,25 +1311,20 @@ void test_pma_delete_random(int n) {
for (i=0; i<n; i++) {
char k[11];
int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", keys[i]);
fill_dbt(&key, k, strlen(k)+1);
v = keys[i];
fill_dbt(&val, &v, sizeof v);
r = pma_insert(pma, &key, &val, NULL_ARGS);
assert(r == 0);
do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
/* delete */
for (i=0; i<n; i++) {
char k[11];
DBT key;
int v = keys[i];
snprintf(k, sizeof k, "%.10d", keys[i]);
fill_dbt(&key, k, strlen(k)+1);
r = pma_delete(pma, &key, 0);
assert(r == 0);
do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma->N == PMA_MIN_ARRAY_SIZE);
......@@ -1282,6 +1370,10 @@ void test_pma_delete_cursor(int n) {
PMA pma;
int r;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
r = pma_create(&pma, default_compare_fun, 0);
assert(r == 0);
......@@ -1289,14 +1381,10 @@ void test_pma_delete_cursor(int n) {
for (i=0; i<n; i++) {
char k[11];
int v;
DBT key, val;
snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
v = i;
fill_dbt(&val, &v, sizeof v);
r = pma_insert(pma, &key, &val, NULL_ARGS);
assert(r == 0);
do_insert(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
PMA_CURSOR pmacursor;
......@@ -1311,12 +1399,10 @@ void test_pma_delete_cursor(int n) {
for (i=0; i<n; i++) {
char k[11];
DBT key;
int v=i;
snprintf(k, sizeof k, "%.10d", i);
fill_dbt(&key, k, strlen(k)+1);
r = pma_delete(pma, &key, 0);
assert(r == 0);
do_delete(pma, k, strlen(k)+1, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
if (i == n-1)
assert_cursor_nokey(pmacursor);
else
......@@ -1347,6 +1433,10 @@ void test_pma_delete_insert() {
PMA pma;
int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
......@@ -1359,19 +1449,14 @@ void test_pma_delete_insert() {
int k, v;
k = 1; v = 1;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_cursor_set_position_first(pmacursor);
assert(error == 0);
assert_cursor_equal(pmacursor, 1);
k = 1;
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
assert(error == 0);
k = 1; v = 1;
do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert_cursor_nokey(pmacursor);
k = 1;
......@@ -1381,10 +1466,7 @@ void test_pma_delete_insert() {
assert(error != 0);
k = 1; v = 2;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert_cursor_equal(pmacursor, 2);
error = pma_cursor_free(&pmacursor);
......@@ -1400,6 +1482,10 @@ void test_pma_double_delete() {
PMA pma;
int error;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
......@@ -1408,29 +1494,25 @@ void test_pma_double_delete() {
error = pma_cursor(pma, &pmacursor);
assert(error == 0);
DBT key, val;
DBT key;
int k, v;
k = 1; v = 1;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
error = pma_cursor_set_position_first(pmacursor);
assert(error == 0);
assert_cursor_equal(pmacursor, 1);
k = 1;
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
assert(error == 0);
k = 1; v = 1;
do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert_cursor_nokey(pmacursor);
k = 1;
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
error = pma_delete(pma, &key, 0, rand4fingerprint, &sum);
assert(error == DB_NOTFOUND);
assert(sum == expect_fingerprint);
error = pma_cursor_free(&pmacursor);
assert(error == 0);
......@@ -1445,20 +1527,20 @@ void test_pma_cursor_first_delete_last() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
DBT key, val;
int k, v;
int i;
for (i=1; i<=2; i++) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma_n_entries(pma) == 2);
......@@ -1471,9 +1553,8 @@ void test_pma_cursor_first_delete_last() {
assert(error == 0);
k = htonl(1);
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
assert(error == 0);
v = 1;
do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(pma_n_entries(pma) == 2);
error = pma_cursor_set_position_last(pmacursor);
......@@ -1493,20 +1574,20 @@ void test_pma_cursor_last_delete_first() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
DBT key, val;
int k, v;
int i;
for (i=1; i<=2; i++) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
assert(pma_n_entries(pma) == 2);
......@@ -1519,9 +1600,8 @@ void test_pma_cursor_last_delete_first() {
assert(error == 0);
k = htonl(2);
fill_dbt(&key, &k, sizeof k);
error = pma_delete(pma, &key, 0);
assert(error == 0);
v = 2;
do_delete(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
assert(pma_n_entries(pma) == 2);
error = pma_cursor_set_position_first(pmacursor);
......@@ -1551,6 +1631,9 @@ void test_pma_already_there() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
......@@ -1560,10 +1643,12 @@ void test_pma_already_there() {
k = 1; v = 1;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
error = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);
assert(error == 0);
error = pma_insert(pma, &key, &val, NULL_ARGS);
u_int32_t savesum = sum;
error = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);
assert(error == BRT_ALREADY_THERE);
assert(sum==savesum);
error = pma_free(&pma);
assert(error == 0);
......@@ -1581,15 +1666,16 @@ void test_pma_cursor_set_key() {
DBT key, val;
int k, v;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 100;
int i;
for (i=0; i<n; i += 10) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
PMA_CURSOR cursor;
......@@ -1630,6 +1716,10 @@ void test_pma_cursor_set_range() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
error = pma_create(&pma, default_compare_fun, 0);
assert(error == 0);
......@@ -1642,10 +1732,7 @@ void test_pma_cursor_set_range() {
for (i=smallest_key; i<=largest_key; i += 10) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
PMA_CURSOR cursor;
......@@ -1687,6 +1774,10 @@ void test_pma_cursor_delete_under() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 1000;
error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
......@@ -1711,10 +1802,7 @@ void test_pma_cursor_delete_under() {
for (i=0; i<n; i++) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
for (i=0;;i++) {
......@@ -1758,6 +1846,10 @@ void test_pma_cursor_set_both() {
int error;
PMA pma;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
const int n = 1000;
error = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
......@@ -1776,10 +1868,7 @@ void test_pma_cursor_set_both() {
for (i=0; i<n; i++) {
k = htonl(i);
v = i;
fill_dbt(&key, &k, sizeof k);
fill_dbt(&val, &v, sizeof v);
error = pma_insert(pma, &key, &val, NULL_ARGS);
assert(error == 0);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
}
/* verify key not in pma fails */
......@@ -1827,6 +1916,399 @@ void test_pma_cursor_set_both() {
assert(error == 0);
}
/*
 * Without any dup mode set on the pma, try to insert the same key n times
 * with the values 0 .. n-1.  Only the first attempt may succeed; every later
 * attempt must return an error and leave the running fingerprint unchanged.
 */
void test_nodup_key_insert(int n) {
    printf("test_nodup_key_insert:%d\n", n);

    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    PMA pma;
    int r = pma_create(&pma, default_compare_fun, n * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);

    /* attempt 0->0, 0->1, .. 0->n-1 */
    int attempt;
    for (attempt = 0; attempt < n; attempt++) {
        int k = htonl(0);
        int v = attempt;
        DBT key, val;
        fill_dbt(&key, &k, sizeof k);
        fill_dbt(&val, &v, sizeof v);

        r = pma_insert(pma, &key, &val, NULL_ARGS, rand4fingerprint, &sum);
        if (attempt != 0) {
            /* the key is already present: rejected, fingerprint untouched */
            assert(r != 0);
            assert(sum==expect_fingerprint);
            continue;
        }
        /* the very first insertion is the only one that goes through */
        assert(r == 0);
        add_fingerprint_and_check(rand4fingerprint, sum, &expect_fingerprint, &k, sizeof k, &v, sizeof v);
    }

    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * In DB_DUP mode, insert n pairs that all share key 2 (values 0 .. n-1),
 * bracketed by the pairs 1->1 and 3->3.  Then position a cursor on key 2 and
 * walk forward, checking that the values come back as 0, 1, .. n-1 and that
 * exactly n pairs with key 2 are seen.  The pma and the running fingerprint
 * are verified after every insertion.
 */
void test_dup_key_insert(int n) {
printf("test_dup_key_insert:%d\n", n);
PMA pma;
int r;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
/* size the pma for the n duplicates plus the two bracketing pairs */
r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
assert(r == 0);
pma_verify(pma, null_db);
r = pma_set_dup_mode(pma, DB_DUP);
assert(r == 0);
DBT key, val;
int k, v;
/* insert 1->1, 3->3 */
k = htonl(1); v = 1;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
k = htonl(3); v = 3;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
int i;
/* insert 2->0, 2->1, .. 2->n-1 */
for (i=0; i<n; i++) {
k = htonl(2);
v = i;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
}
/* cursor walk from key 2 should find values 0, 1, .. n-1 */
PMA_CURSOR cursor;
r = pma_cursor(pma, &cursor);
assert(r == 0);
k = htonl(2);
fill_dbt(&key, &k, sizeof k);
r = pma_cursor_set_key(cursor, &key, 0);
if (r != 0) {
/* key 2 can only be missing when no duplicates were inserted */
assert(n == 0);
} else {
/* i counts the pairs with key 2 seen so far */
i = 0;
while (1) {
/* DB_DBT_MALLOC: key.data and val.data are freed below with toku_free */
init_dbt(&key); key.flags = DB_DBT_MALLOC;
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = pma_cursor_get_current(cursor, &key, &val);
assert(r == 0);
int kk;
assert(key.size == sizeof kk);
memcpy(&kk, key.data, key.size);
if (k != kk) {
/* walked past the last pair with key 2 */
toku_free(key.data);
toku_free(val.data);
break;
}
int vv;
assert(val.size == sizeof vv);
memcpy(&vv, val.data, val.size);
/* the i-th pair seen must carry the i-th inserted value */
assert(vv == i);
toku_free(key.data);
toku_free(val.data);
i += 1;
r = pma_cursor_set_position_next(cursor);
if (r != 0)
break;
}
assert(i == n);
}
r = pma_cursor_free(&cursor);
assert(r == 0);
r = pma_free(&pma);
assert(r == 0);
}
/*
 * Insert n duplicates of key 2 (values 0 .. n-1) between the pairs 1->1 and
 * 3->3, delete key 2 with a single pma_delete call, and verify that
 *   - the fingerprint returns to the value it had before the duplicates
 *     were inserted, and
 *   - a cursor walk from the first position sees 1->1 followed by 3->3.
 * mode is passed to pma_set_dup_mode; when it includes DB_DUPSORT a dup
 * compare function is installed as well.
 */
void test_dup_key_delete(int n, int mode) {
printf("test_dup_key_delete:%d %x\n", n, mode);
PMA pma;
int r;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
/* size the pma for the n duplicates plus the two bracketing pairs */
r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
assert(r == 0);
pma_verify(pma, null_db);
r = pma_set_dup_mode(pma, mode);
assert(r == 0);
if (mode & DB_DUPSORT) {
r = pma_set_dup_compare(pma, default_compare_fun);
assert(r == 0);
}
DBT key, val;
int k, v;
/* insert 1->1, 3->3 */
k = htonl(1); v = 1;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
k = htonl(3); v = 3;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
/* remember the fingerprint of just {1->1, 3->3}; deleting key 2 must restore it */
u_int32_t sum_before_all_the_duplicates = sum;
int i;
/* insert 2->0, 2->1, .. 2->n-1 */
for (i=0; i<n; i++) {
k = htonl(2);
v = i;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
}
/* one delete call on key 2 */
k = htonl(2);
r = pma_delete(pma, fill_dbt(&key, &k, sizeof k), null_db, rand4fingerprint, &sum);
/* the delete may only fail when there was nothing to delete */
if (r != 0) assert(n == 0);
expect_fingerprint = sum_before_all_the_duplicates;
assert(sum == expect_fingerprint);
pma_verify(pma, null_db);
pma_verify_fingerprint(pma, rand4fingerprint, sum);
/* cursor walk should find keys 1, 3 */
PMA_CURSOR cursor;
r = pma_cursor(pma, &cursor);
assert(r == 0);
r = pma_cursor_set_position_first(cursor);
assert(r == 0);
int kk, vv;
/* first pair must be 1->1 */
k = htonl(1); v = 1;
init_dbt(&key); key.flags = DB_DBT_MALLOC;
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = pma_cursor_get_current(cursor, &key, &val);
assert(r == 0);
assert(key.size == sizeof kk);
memcpy(&kk, key.data, key.size);
assert(k == kk);
assert(val.size == sizeof vv);
memcpy(&vv, val.data, val.size);
assert(v == vv);
toku_free(key.data);
toku_free(val.data);
r = pma_cursor_set_position_next(cursor);
assert(r == 0);
/* next pair must be 3->3 */
k = htonl(3); v = 3;
init_dbt(&key); key.flags = DB_DBT_MALLOC;
init_dbt(&val); val.flags = DB_DBT_MALLOC;
r = pma_cursor_get_current(cursor, &key, &val);
assert(r == 0);
assert(key.size == sizeof kk);
memcpy(&kk, key.data, key.size);
assert(k == kk);
assert(val.size == sizeof vv);
memcpy(&vv, val.data, val.size);
assert(v == vv);
toku_free(key.data);
toku_free(val.data);
r = pma_cursor_free(&cursor);
assert(r == 0);
r = pma_free(&pma);
assert(r == 0);
}
/* qsort comparator: order two ints by comparing their bytes with memcmp. */
static int dupsort_cmpint(const void *a, const void *b) {
    return memcmp(a, b, sizeof (int));
}

/*
 * In DB_DUP+DB_DUPSORT mode, insert n random values under key 2 (bracketed by
 * the pairs 1->1 and 3->3), then position a cursor on key 2 and walk forward.
 * The values must come back in the order produced by sorting the inserted
 * values with dupsort_cmpint, and exactly n pairs with key 2 must be seen.
 * The pma and the running fingerprint are verified after every insertion.
 */
void test_dupsort_key_insert(int n) {
    printf("test_dupsort_key_insert:%d\n", n);

    PMA pma;
    int r;
    u_int32_t rand4fingerprint = random();
    u_int32_t sum = 0;
    u_int32_t expect_fingerprint = 0;

    /* size the pma for the n duplicates plus the two bracketing pairs */
    r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
    assert(r == 0);
    pma_verify(pma, null_db);

    r = pma_set_dup_mode(pma, DB_DUP+DB_DUPSORT);
    assert(r == 0);
    r = pma_set_dup_compare(pma, default_compare_fun);
    assert(r == 0);

    DBT key, val;
    int k, v;

    /* insert 1->1, 3->3 */
    k = htonl(1); v = 1;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);
    k = htonl(3); v = 3;
    do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
    pma_verify(pma, null_db);

    /* a variable length array must have a positive size, so keep one slot even when n <= 0 */
    int values[n > 0 ? n : 1];
    int i;

    /* insert 2->(random value), remembering every value that was inserted */
    for (i=0; i<n; i++) {
        k = htonl(2);
        values[i] = htonl(random());
        do_insert(pma, &k, sizeof k, &values[i], sizeof values[i], rand4fingerprint, &sum, &expect_fingerprint);
        pma_verify(pma, null_db);
    }

    /* cursor walk from key 2 should find the inserted values in sorted order */
    PMA_CURSOR cursor;
    r = pma_cursor(pma, &cursor);
    assert(r == 0);

    k = htonl(2);
    fill_dbt(&key, &k, sizeof k);
    r = pma_cursor_set_key(cursor, &key, 0);
    if (r != 0) {
        /* key 2 can only be missing when no duplicates were inserted */
        assert(n == 0);
    } else {
        /* build the expected order */
        qsort(values, n, sizeof (int), dupsort_cmpint);
        /* i counts the pairs with key 2 seen so far */
        i = 0;
        while (1) {
            /* DB_DBT_MALLOC: key.data and val.data are freed below with toku_free */
            init_dbt(&key); key.flags = DB_DBT_MALLOC;
            init_dbt(&val); val.flags = DB_DBT_MALLOC;
            r = pma_cursor_get_current(cursor, &key, &val);
            assert(r == 0);
            int kk;
            assert(key.size == sizeof kk);
            memcpy(&kk, key.data, key.size);
            if (k != kk) {
                /* walked past the last pair with key 2 */
                toku_free(key.data);
                toku_free(val.data);
                break;
            }
            int vv;
            assert(val.size == sizeof vv);
            memcpy(&vv, val.data, val.size);
            assert(vv == values[i]);
            toku_free(key.data);
            toku_free(val.data);
            i += 1;
            r = pma_cursor_set_position_next(cursor);
            if (r != 0)
                break;
        }
        assert(i == n);
    }

    r = pma_cursor_free(&cursor);
    assert(r == 0);
    r = pma_free(&pma);
    assert(r == 0);
}
/*
 * Insert n duplicates of key 2 (values htonl(0) .. htonl(n-1)) between the
 * pairs 1->1 and 3->3, then check that pma_lookup on key 2 succeeds and
 * yields the value 0, i.e. the value of the first duplicate inserted.
 * mode is passed to pma_set_dup_mode; when it includes DB_DUPSORT a dup
 * compare function is installed as well.
 */
void test_dup_key_lookup(int n, int mode) {
printf("test_dup_lookup:%d %d\n", n, mode);
PMA pma;
int r;
u_int32_t rand4fingerprint = random();
u_int32_t sum = 0;
u_int32_t expect_fingerprint = 0;
/* size the pma for the n duplicates plus the two bracketing pairs */
r = pma_create(&pma, default_compare_fun, (n + 2) * (8 + sizeof (int) + sizeof (int)));
assert(r == 0);
pma_verify(pma, null_db);
r = pma_set_dup_mode(pma, mode);
assert(r == 0);
if (mode & DB_DUPSORT) {
r = pma_set_dup_compare(pma, default_compare_fun);
assert(r == 0);
}
DBT key, val;
int k, v;
/* insert 1->1, 3->3 */
k = htonl(1); v = 1;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
k = htonl(3); v = 3;
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
int i;
/* insert 2->htonl(0), 2->htonl(1), .. 2->htonl(n-1) */
for (i=0; i<n; i++) {
k = htonl(2);
v = htonl(i);
do_insert(pma, &k, sizeof k, &v, sizeof v, rand4fingerprint, &sum, &expect_fingerprint);
pma_verify(pma, null_db);
}
/* lookup should find the first insert and smallest value */
k = htonl(2);
r = pma_lookup(pma, fill_dbt(&key, &k, sizeof k), fill_dbt(&val, &v, sizeof v), null_db);
assert(r == 0);
/* the key DBT still describes key 2 */
int kk;
assert(key.size == sizeof k);
memcpy(&kk, key.data, key.size);
assert((unsigned int) kk == htonl(2));
/* the value handed back must be 0 */
int vv;
assert(val.size == sizeof v);
memcpy(&vv, val.data, val.size);
assert(vv == 0);
r = pma_free(&pma);
assert(r == 0);
}
/*
 * Run every duplicate-key test.  memory_check_all_free() is called after
 * each individual test.
 */
void test_dup() {
    /* no dup mode set: a repeated key is rejected */
    test_nodup_key_insert(2);
    memory_check_all_free();

    /* DB_DUP: insertion */
    test_dup_key_insert(0);
    memory_check_all_free();
    test_dup_key_insert(2);
    memory_check_all_free();
    test_dup_key_insert(1000);
    memory_check_all_free();

    /* DB_DUP: deletion */
    test_dup_key_delete(0, DB_DUP);
    memory_check_all_free();
    test_dup_key_delete(1000, DB_DUP);
    memory_check_all_free();

    /* DB_DUP+DB_DUPSORT: insertion */
    test_dupsort_key_insert(2);
    memory_check_all_free();
    test_dupsort_key_insert(1000);
    memory_check_all_free();

    /* DB_DUP+DB_DUPSORT: deletion */
    test_dup_key_delete(0, DB_DUP+DB_DUPSORT);
    memory_check_all_free();
    test_dup_key_delete(1000, DB_DUP+DB_DUPSORT);
    memory_check_all_free();

    /* lookup in both dup modes */
    test_dup_key_lookup(32, DB_DUP);
    memory_check_all_free();
    test_dup_key_lookup(32, DB_DUP+DB_DUPSORT);
    memory_check_all_free();
}
void pma_tests (void) {
memory_check=1;
test_keycompare(); memory_check_all_free();
......@@ -1840,8 +2322,10 @@ void pma_tests (void) {
test_pma_find(); memory_check_all_free();
test_calculate_parameters(); memory_check_all_free();
test_count_region(); memory_check_all_free();
test_pma_random_pick(); memory_check_all_free();
test_pma_cursor(); memory_check_all_free();
test_pma_split(); memory_check_all_free();
test_pma_bulk_insert(); memory_check_all_free();
test_pma_insert_or_replace(); memory_check_all_free();
......@@ -1851,6 +2335,7 @@ void pma_tests (void) {
test_pma_cursor_set_range(); memory_check_all_free();
test_pma_cursor_delete_under(); memory_check_all_free();
test_pma_cursor_set_both(); memory_check_all_free();
test_dup();
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
......
......@@ -19,6 +19,54 @@
/* get KEY_VALUE_OVERHEAD */
#include "brt-internal.h"
/**************************** static functions forward declarations. *********************/
/*
* finish a deletion from the pma. called when there are no cursor references
* to the kv pair.
*/
static void __pma_delete_finish(PMA pma, int here);
/*
* resize the pma array to asksize. zero all array entries starting from startx.
*/
static int __pma_resize_array(PMA pma, int asksize, int startx);
/*
* extract pairs from the pma in the window delimited by lo and hi.
*/
static struct kv_pair_tag *__pma_extract_pairs(PMA pma, int count, int lo, int hi);
/*
* update the cursors in a cursor set given a set of tagged pairs.
*/
static void __pma_update_cursors(PMA pma, struct list *cursorset, struct kv_pair_tag *tpairs, int n);
/*
* update this pma's cursors given a set of tagged pairs.
*/
static void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n);
/*
* a deletion occurred at index "here" in the pma. rebalance the windows around "here". if
* necessary, shrink the pma.
*/
static void __pma_delete_at(PMA pma, int here);
/*
* if the pma entry at here is deleted and there are no more references to it
* then finish the deletion
*/
static void __pma_delete_resume(PMA pma, int here);
/*
* count the number of cursors that reference a pma pair
*/
static int __pma_count_cursor_refs(PMA pma, int here);
/**************************** end of static functions forward declarations. *********************/
#ifndef PMA_USE_MEMPOOL
#define PMA_USE_MEMPOOL 1
#endif
......@@ -179,6 +227,130 @@ void pma_show_stats (void) {
printf("%d finds, %d divides, %d scans\n", pma_count_finds, pma_count_divides, pma_count_scans);
}
/*
 * Binary search of pma->pairs[lo .. hi) for a pair whose key compares equal
 * to k under pma->compare_fun.
 *
 * Slots for which kv_pair_inuse() is false are skipped by probing forward
 * from the midpoint; when nothing in use is found up to hi, the search
 * continues below the midpoint.
 *
 * On a match, *found is set to 1 and the index of the matching pair is
 * returned.  Otherwise *found is set to 0 and the index at which the search
 * window became empty is returned.
 */
static int __pma_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi) {
            *found = 0;
            return lo;
        }

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* probe forward from the midpoint to the first slot that is in use */
        int probe = mi;
        while (probe < hi && !kv_pair_inuse(pma->pairs[probe]))
            probe++;
        if (probe >= hi) {
            /* nothing in use in [mi, hi): keep looking below the midpoint */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[probe]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp == 0) {
            *found = 1;
            return probe;
        }
        if (cmp > 0)
            lo = probe+1;
        else
            hi = probe;
    }
}
/*
 * Binary search of pma->pairs[lo .. hi) for the rightmost pair whose key
 * compares equal to k under pma->compare_fun.
 *
 * Slots for which kv_pair_inuse() is false are skipped by probing forward
 * from the midpoint.  Every time a match is seen it is remembered and the
 * search carries on to its right.
 *
 * If any match was seen, *found is set to 1 and the index of the last match
 * is returned.  Otherwise *found is set to 0 and the index at which the
 * search window became empty is returned.
 */
static int __pma_right_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    int matched = 0;
    int rightmost = 0;      /* meaningful only when matched != 0 */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* probe forward from the midpoint to the first slot that is in use */
        int probe = mi;
        while (probe < hi && !kv_pair_inuse(pma->pairs[probe]))
            probe++;
        if (probe >= hi) {
            /* nothing in use in [mi, hi): keep looking below the midpoint */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[probe]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp < 0) {
            hi = probe;
        } else {
            if (cmp == 0) {
                /* a match: remember it, then look for another one further right */
                matched = 1;
                rightmost = probe;
            }
            lo = probe+1;
        }
    }

    *found = matched;
    return matched ? rightmost : lo;
}
/*
 * Binary search of pma->pairs[lo .. hi) for the leftmost pair whose key
 * compares equal to k under pma->compare_fun.
 *
 * Slots for which kv_pair_inuse() is false are skipped by probing forward
 * from the midpoint.  Every time a match is seen it is remembered and the
 * search carries on to its left.
 *
 * If any match was seen, *found is set to 1 and the index of the last match
 * recorded (the leftmost one) is returned.  Otherwise *found is set to 0 and
 * the index at which the search window became empty is returned.
 */
static int __pma_left_search(PMA pma, DBT *k, DB *db, int lo, int hi, int *found) {
    int matched = 0;
    int leftmost = 0;       /* meaningful only when matched != 0 */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* probe forward from the midpoint to the first slot that is in use */
        int probe = mi;
        while (probe < hi && !kv_pair_inuse(pma->pairs[probe]))
            probe++;
        if (probe >= hi) {
            /* nothing in use in [mi, hi): keep looking below the midpoint */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[probe]);
        DBT k2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp > 0) {
            lo = probe+1;
        } else {
            if (cmp == 0) {
                /* a match: remember it, then look for another one further left */
                matched = 1;
                leftmost = probe;
            }
            hi = probe;
        }
    }

    *found = matched;
    return matched ? leftmost : lo;
}
/*
 * Binary search of pma->pairs[lo .. hi) for the rightmost pair that matches
 * both the key k and the value v.  Keys are compared with pma->compare_fun;
 * when the keys are equal the values are compared with pma->dup_compare_fun.
 *
 * Slots for which kv_pair_inuse() is false are skipped by probing forward
 * from the midpoint.  Every time a match is seen it is remembered and the
 * search carries on to its right.
 *
 * If any match was seen, *found is set to 1 and the index of the last match
 * is returned.  Otherwise *found is set to 0 and the index at which the
 * search window became empty is returned.
 */
static int __pma_dup_search(PMA pma, DBT *k, DBT *v, DB *db, int lo, int hi, int *found) {
    int matched = 0;
    int rightmost = 0;      /* meaningful only when matched != 0 */

    for (;;) {
        assert(0 <= lo && lo <= hi);
        if (lo >= hi)
            break;

        int mi = (lo + hi)/2;
        assert(lo <= mi && mi < hi);

        /* probe forward from the midpoint to the first slot that is in use */
        int probe = mi;
        while (probe < hi && !kv_pair_inuse(pma->pairs[probe]))
            probe++;
        if (probe >= hi) {
            /* nothing in use in [mi, hi): keep looking below the midpoint */
            hi = mi;
            continue;
        }

        struct kv_pair *kv = kv_pair_ptr(pma->pairs[probe]);
        DBT k2, v2;
        int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
        if (cmp == 0)
            cmp = pma->dup_compare_fun(db, v, fill_dbt(&v2, kv_pair_val(kv), kv_pair_vallen(kv)));

        if (cmp < 0) {
            hi = probe;
        } else {
            if (cmp == 0) {
                /* a match: remember it, then look for another one further right */
                matched = 1;
                rightmost = probe;
            }
            lo = probe+1;
        }
    }

    *found = matched;
    return matched ? rightmost : lo;
}
// Return the smallest index such that no lower index contains a larger key.
// This will be in the range 0 (inclusive) to pma_index_limit(pma) (inclusive).
// Thus the returned index may not be a valid index into the array if it is == pma_index_limit(pma)
......@@ -186,6 +358,7 @@ void pma_show_stats (void) {
// For example: if the array is full of small keys, that means we return pma_index_limit(pma), which is off the end of the array.
// For example: if the array is full of large keys, then we return 0.
int pmainternal_find (PMA pma, DBT *k, DB *db) {
#if 1
int lo=0, hi=pma_index_limit(pma);
/* lo and hi are the minimum and maximum values (inclusive) that we could possibly return. */
pma_count_finds++;
......@@ -233,6 +406,17 @@ int pmainternal_find (PMA pma, DBT *k, DB *db) {
}
#endif
return lo;
#else
int found, lo;
lo = __pma_search(pma, k, db, 0, pma->N, &found);
if (lo>0 && lo < pma_index_limit(pma) && pma->pairs[lo]) {
//printf("lo=%d\n", lo);
DBT k2;
assert(0 >= pma->compare_fun(db, k, fill_dbt(&k2, pma->pairs[lo]->key, pma->pairs[lo]->keylen)));
}
return lo;
#endif
}
//int min (int i, int j) { if (i<j) return i; else return j; }
......@@ -264,7 +448,7 @@ void print_pma (PMA pma) {
}
/* Smooth the data, and return the location of the null. */
int distribute_data (struct kv_pair *destpairs[], int dcount,
static int distribute_data (struct kv_pair *destpairs[], int dcount,
struct kv_pair_tag sourcepairs[], int scount, PMA pma) {
assert(scount<=dcount);
if (scount==0) {
......@@ -335,7 +519,7 @@ int pmainternal_smooth_region (struct kv_pair *pairs[], int n, int idx, int base
}
}
int lg (int n) {
int toku_lg (int n) {
int result=0;
int two_to_result = 1;
while (two_to_result<n) {
......@@ -348,7 +532,7 @@ int lg (int n) {
/* Calculate densitysteps and uplgN, given N. */
void pmainternal_calculate_parameters (PMA pma) {
int N = pma_index_limit(pma);
int lgN = lg(N);
int lgN = toku_lg(N);
int n_divisions=0;
//printf("N=%d lgN=%d\n", N, lgN);
while (N/2>=lgN) {
......@@ -371,10 +555,11 @@ int pmainternal_count_region (struct kv_pair *pairs[], int lo, int hi) {
return n;
}
int pma_create(PMA *pma, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxsize) {
int pma_create(PMA *pma, pma_compare_fun_t compare_fun, int maxsize) {
int error;
TAGMALLOC(PMA, result);
if (result==0) return -1;
result->dup_mode = 0;
result->n_pairs_present = 0;
result->pairs = 0;
list_init(&result->cursors);
......@@ -401,19 +586,18 @@ int pma_create(PMA *pma, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxs
}
/* find the smallest power of 2 >= n */
int __pma_array_size(PMA pma __attribute__((unused)), int asksize) {
static int __pma_array_size(PMA pma __attribute__((unused)), int asksize) {
int n = PMA_MIN_ARRAY_SIZE;
while (n < asksize)
n *= 2;
return n;
}
int __pma_resize_array(PMA pma, int asksize, int startz) {
static int __pma_resize_array(PMA pma, int asksize, int startz) {
int i;
int n;
n = __pma_array_size(pma, asksize);
// printf("pma_resize %d -> %d\n", pma->N, n);
pma->N = n;
if (pma->pairs == 0)
pma->pairs = toku_malloc((1 + pma->N) * sizeof (struct kv_pair *));
......@@ -430,6 +614,18 @@ int __pma_resize_array(PMA pma, int asksize, int startz) {
return 0;
}
/*
 * Set the duplicate-key mode of the pma.
 * dup_mode must be 0, DB_DUP, or DB_DUP+DB_DUPSORT; any other value trips the
 * assert.  The mode is stored in pma->dup_mode.  Always returns 0.
 */
int pma_set_dup_mode(PMA pma, int dup_mode) {
assert(dup_mode == 0 || dup_mode == DB_DUP || dup_mode == (DB_DUP+DB_DUPSORT));
pma->dup_mode = dup_mode;
return 0;
}
/*
 * Install the function stored in pma->dup_compare_fun.
 * The pma's dup mode must already include DB_DUPSORT (asserted), so
 * pma_set_dup_mode has to be called first.  Always returns 0.
 */
int pma_set_dup_compare(PMA pma, pma_compare_fun_t dup_compare_fun) {
assert(pma->dup_mode & DB_DUPSORT);
pma->dup_compare_fun = dup_compare_fun;
return 0;
}
int pma_cursor (PMA pma, PMA_CURSOR *cursp) {
PMA_CURSOR MALLOC(curs);
assert(curs!=0);
......@@ -530,7 +726,11 @@ int pma_cursor_get_current(PMA_CURSOR c, DBT *key, DBT *val) {
int pma_cursor_set_key(PMA_CURSOR c, DBT *key, DB *db) {
PMA pma = c->pma;
int here = pmainternal_find(pma, key, db);
int here, found;
if (pma->dup_mode & DB_DUP) {
here = __pma_left_search(pma, key, db, 0, pma->N, &found);
} else
here = pmainternal_find(pma, key, db);
assert(0<=here ); assert(here<=pma_index_limit(pma));
int r = DB_NOTFOUND;
if (here < pma->N) {
......@@ -567,7 +767,11 @@ int pma_cursor_set_both(PMA_CURSOR c, DBT *key, DBT *val, DB *db) {
int pma_cursor_set_range(PMA_CURSOR c, DBT *key, DB *db) {
PMA pma = c->pma;
int here = pmainternal_find(pma, key, db);
int here, found;
if (pma->dup_mode & DB_DUP)
here = __pma_left_search(pma, key, db, 0, pma->N, &found);
else
here = pmainternal_find(pma, key, db);
assert(0<=here ); assert(here<=pma_index_limit(pma));
/* find the first valid pair where key[here] >= key */
......@@ -649,21 +853,13 @@ int pmainternal_make_space_at (PMA pma, int idx) {
break;
if (lo==0 && hi==pma_index_limit(pma)) {
/* The array needs to be doubled in size. */
#if 0
int i;
#endif
assert(size==pma_index_limit(pma));
size*=2;
#if 0
pma->pairs = toku_realloc(pma->pairs, (1+size)*sizeof(struct kv_pair *));
for (i=hi; i<size; i++) pma->pairs[i]=0;
pma->pairs[size] = (void*)0xdeadbeefL;
pma->N=size;
pmainternal_calculate_parameters(pma);
#else
// printf("pma_make_space_realloc %d to %d hi %d\n", pma->N, size, hi);
__pma_resize_array(pma, size, hi);
#endif
hi=size;
//printf("doubled N\n");
break;
......@@ -681,12 +877,16 @@ int pmainternal_make_space_at (PMA pma, int idx) {
}
enum pma_errors pma_lookup (PMA pma, DBT *k, DBT *v, DB *db) {
int here, found;
if (pma->dup_mode & DB_DUP) {
here = __pma_left_search(pma, k, db, 0, pma->N, &found);
} else
here = pmainternal_find(pma, k, db);
assert(0<=here ); assert(here<=pma_index_limit(pma));
if (here==pma_index_limit(pma)) return DB_NOTFOUND;
DBT k2;
struct kv_pair *pair;
int l = pmainternal_find(pma, k, db);
assert(0<=l ); assert(l<=pma_index_limit(pma));
if (l==pma_index_limit(pma)) return DB_NOTFOUND;
pair = pma->pairs[l];
pair = pma->pairs[here];
if (kv_pair_valid(pair) && pma->compare_fun(db, k, fill_dbt(&k2, pair->key, pair->keylen))==0) {
return ybt_set_value(v, pair->key + pair->keylen, pair->vallen, &pma->sval);
} else {
......@@ -727,53 +927,117 @@ int pma_free (PMA *pmap) {
}
/* Copies keylen and datalen */
int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, diskoff diskoff) {
int idx = pmainternal_find(pma, k, db);
if (idx < pma_index_limit(pma) && pma->pairs[idx]) {
DBT k2;
struct kv_pair *kv = kv_pair_ptr(pma->pairs[idx]);
if (0==pma->compare_fun(db, k, fill_dbt(&k2, kv->key, kv->keylen))) {
if (kv_pair_deleted(pma->pairs[idx])) {
pma_mfree_kv_pair(pma, pma->pairs[idx]);
pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size);
assert(pma->pairs[idx]);
int r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]);
return r;
} else
return BRT_ALREADY_THERE; /* It is already here. Return an error. */
}
/* returns an error if the key is already present. */
int pma_insert (PMA pma, DBT *k, DBT *v, DB* db, TOKUTXN txn, DISKOFF diskoff, u_int32_t rand4fingerprint, u_int32_t *fingerprint) {
int found, idx;
if (pma->dup_mode & DB_DUPSORT) {
idx = __pma_dup_search(pma, k, v, db, 0, pma->N, &found);
if (found)
idx += 1;
} else if (pma->dup_mode & DB_DUP) {
idx = __pma_right_search(pma, k, db, 0, pma->N, &found);
if (found)
idx += 1;
} else {
idx = pmainternal_find(pma, k, db);
if (idx < pma_index_limit(pma) && pma->pairs[idx]) {
DBT k2;
struct kv_pair *kv = kv_pair_ptr(pma->pairs[idx]);
if (0==pma->compare_fun(db, k, fill_dbt(&k2, kv->key, kv->keylen))) {
if (kv_pair_deleted(pma->pairs[idx])) {
pma_mfree_kv_pair(pma, pma->pairs[idx]);
pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size);
assert(pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
int r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]);
return r;
} else
return BRT_ALREADY_THERE; /* It is already here. Return an error. */
}
}
}
if (kv_pair_inuse(pma->pairs[idx])) {
idx = pmainternal_make_space_at (pma, idx); /* returns the new idx. */
}
assert(0 <= idx && idx < pma->N);
assert(!kv_pair_inuse(pma->pairs[idx]));
pma->pairs[idx] = pma_malloc_kv_pair(pma, k->data, k->size, v->data, v->size);
assert(pma->pairs[idx]);
pma->n_pairs_present++;
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]);
}
int pma_delete (PMA pma, DBT *k, DB *db) {
int l;
/*
 * Starting at index here, advance to the first slot below n for which
 * kv_pair_inuse() is true and report whether its key compares equal to k.
 *
 * Returns the index reached, which is >= n when no slot in use was found.
 * *found is set to 1 only when a slot was reached and its key matches k;
 * otherwise it is set to 0.
 */
static int pma_next_key(PMA pma, DBT *k, DB *db, int here, int n, int *found) {
    assert(0 <= here);

    int idx;
    for (idx = here; idx < n; idx++) {
        if (kv_pair_inuse(pma->pairs[idx]))
            break;
    }
    if (idx >= n) {
        /* ran off the end without meeting a slot in use */
        *found = 0;
        return idx;
    }

    struct kv_pair *kv = kv_pair_ptr(pma->pairs[idx]);
    DBT k2;
    int cmp = pma->compare_fun(db, k, fill_dbt(&k2, kv_pair_key(kv), kv_pair_keylen(kv)));
    *found = (cmp == 0);
    return idx;
}
/*
 * Delete every pair whose key matches k (used when the pma allows duplicates).
 *
 * Starting from the leftmost matching pair, each matching pair that is still
 * valid has its contribution subtracted from *fingerprint (scaled by rand4sem)
 * and is marked deleted.  If no cursor references the pair it is also freed
 * and its slot cleared; otherwise it is left marked as deleted.  Afterwards
 * the region around the matching pairs is handed to __pma_delete_at.
 *
 * Returns BRT_OK when at least one valid pair was deleted, and DB_NOTFOUND
 * otherwise.  That includes the case where the key is present only as pairs
 * already marked deleted, which matches what pma_delete_nodup reports for a
 * pair that is no longer valid.
 */
static int pma_delete_dup (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
    /* find the left most matching key in the pma */
    int found, lefthere;
    lefthere = __pma_left_search(pma, k, db, 0, pma->N, &found);

    int ndeleted = 0;   /* number of valid pairs this call marked as deleted */
    int rightfound = found, righthere = lefthere;
    while (rightfound) {
        struct kv_pair *kv = pma->pairs[righthere];
        if (kv_pair_valid(kv)) {
            /* mark the pair as deleted */
            *fingerprint -= rand4sem*toku_calccrc32_kvpair (kv_pair_key_const(kv), kv_pair_keylen(kv), kv_pair_val_const(kv), kv_pair_vallen(kv));
            pma->pairs[righthere] = kv_pair_set_deleted(kv);
            ndeleted++;
            if (__pma_count_cursor_refs(pma, righthere) == 0) {
                /* nobody references the pair: release it right away */
                pma_mfree_kv_pair(pma, kv);
                pma->pairs[righthere] = 0;
                pma->n_pairs_present--;
            }
        }
        /* find the next matching key in the pma */
        righthere = pma_next_key(pma, k, db, righthere+1, pma->N, &rightfound);
    }

    if (found) {
        /* check the density of the region centered around the deleted pairs */
        __pma_delete_at(pma, (lefthere + righthere) / 2);
    }

    return ndeleted > 0 ? BRT_OK : DB_NOTFOUND;
}
l = pmainternal_find(pma, k, db);
struct kv_pair *kv = pma->pairs[l];
static int pma_delete_nodup (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
int idx = pmainternal_find(pma, k, db);
struct kv_pair *kv = pma->pairs[idx];
if (!kv_pair_valid(kv)) {
if (0) printf("%s:%d l=%d r=%d\n", __FILE__, __LINE__, l, DB_NOTFOUND);
return DB_NOTFOUND;
if (0) printf("%s:%d l=%d r=%d\n", __FILE__, __LINE__, idx, DB_NOTFOUND);
return DB_NOTFOUND;
}
pma->pairs[l] = kv_pair_set_deleted(kv);
if (__pma_count_cursor_refs(pma, l) == 0)
__pma_delete_finish(pma, l);
*fingerprint -= rand4sem*toku_calccrc32_kvpair (kv_pair_key_const(kv), kv_pair_keylen(kv), kv_pair_val_const(kv), kv_pair_vallen(kv));
pma->pairs[idx] = kv_pair_set_deleted(kv);
if (__pma_count_cursor_refs(pma, idx) == 0)
__pma_delete_finish(pma, idx);
return BRT_OK;
}
/*
 * Delete key k from the pma, updating *fingerprint.
 * Dispatches on the pma's duplicate mode: the DB_DUP bit selects
 * pma_delete_dup(), otherwise pma_delete_nodup() is used.  The return value
 * is whatever the selected helper returns.
 */
int pma_delete (PMA pma, DBT *k, DB *db, u_int32_t rand4sem, u_int32_t *fingerprint) {
    if (!(pma->dup_mode & DB_DUP)) {
        return pma_delete_nodup(pma, k, db, rand4sem, fingerprint);
    }
    return pma_delete_dup(pma, k, db, rand4sem, fingerprint);
}
/*
 * Complete a postponed deletion at slot `here`, if it is now possible.
 * Nothing happens unless `here` is non-negative, the slot is marked deleted,
 * and no cursor references the slot any more.
 */
void __pma_delete_resume(PMA pma, int here) {
    if (here < 0)
        return;
    if (!kv_pair_deleted(pma->pairs[here]))
        return;
    if (__pma_count_cursor_refs(pma, here) != 0)
        return;
    __pma_delete_finish(pma, here);
}
void __pma_delete_finish(PMA pma, int here) {
static void __pma_delete_finish(PMA pma, int here) {
struct kv_pair *kv = pma->pairs[here];
if (!kv_pair_inuse(kv))
return;
......@@ -783,7 +1047,7 @@ void __pma_delete_finish(PMA pma, int here) {
__pma_delete_at(pma, here);
}
void __pma_delete_at(PMA pma, int here) {
static void __pma_delete_at(PMA pma, int here) {
int size;
int count;
struct kv_pair_tag *newpairs;
......@@ -854,7 +1118,8 @@ void __pma_delete_at(PMA pma, int here) {
int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */
DB *db, TOKUTXN txn, diskoff diskoff) {
DB *db, TOKUTXN txn, DISKOFF diskoff,
u_int32_t rand4fingerprint, u_int32_t *fingerprint) {
//printf("%s:%d v->size=%d\n", __FILE__, __LINE__, v->size);
int idx = pmainternal_find(pma, k, db);
struct kv_pair *kv;
......@@ -866,6 +1131,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
if (0==pma->compare_fun(db, k, fill_dbt(&k2, kv->key, kv->keylen))) {
if (!kv_pair_deleted(pma->pairs[idx])) {
*replaced_v_size = kv->vallen;
*fingerprint -= rand4fingerprint*toku_calccrc32_kvpair(kv_pair_key_const(kv), kv_pair_keylen(kv), kv_pair_val_const(kv), kv_pair_vallen(kv));
r=tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, kv);
if (r!=0) return r;
}
......@@ -877,6 +1143,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
assert(pma->pairs[idx]);
}
r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 0, pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return r;
}
}
......@@ -891,6 +1158,7 @@ int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
*replaced_v_size = -1;
//printf("%s:%d txn=%p\n", __FILE__, __LINE__, txn);
r = tokulogger_log_phys_add_or_delete_in_leaf(db, txn, diskoff, 1, pma->pairs[idx]);
*fingerprint += rand4fingerprint*toku_calccrc32_kvpair(k->data, k->size, v->data, v->size);
return r;
}
......@@ -920,7 +1188,7 @@ int __pma_count_cursor_refs(PMA pma, int here) {
return refs;
}
void __pma_update_cursors_position(PMA pma, struct list *cursor_set, int oldposition, int newposition) {
static void __pma_update_cursors_position(PMA pma, struct list *cursor_set, int oldposition, int newposition) {
struct list *list, *nextlist;
struct pma_cursor *cursor;
......@@ -952,7 +1220,7 @@ void __pma_update_cursors(PMA pma, struct list *cursor_set, struct kv_pair_tag *
}
}
void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) {
static void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) {
if (list_empty(&pma->cursors))
return;
......@@ -967,7 +1235,7 @@ void __pma_update_my_cursors(PMA pma, struct kv_pair_tag *tpairs, int n) {
}
}
struct kv_pair_tag *__pma_extract_pairs(PMA pma, int npairs, int lo, int hi) {
static struct kv_pair_tag *__pma_extract_pairs(PMA pma, int npairs, int lo, int hi) {
struct kv_pair_tag *pairs;
int i;
int lastpair;
......@@ -1007,8 +1275,8 @@ static void __pma_relocate_kvpairs(PMA pma) {
#endif
int pma_split(PMA origpma, unsigned int *origpma_size,
PMA leftpma, unsigned int *leftpma_size,
PMA rightpma, unsigned int *rightpma_size) {
PMA leftpma, unsigned int *leftpma_size, u_int32_t leftrand4fp, u_int32_t *leftfingerprint,
PMA rightpma, unsigned int *rightpma_size, u_int32_t rightrand4fp, u_int32_t *rightfingerprint) {
int error;
int npairs;
struct kv_pair_tag *pairs;
......@@ -1057,6 +1325,23 @@ int pma_split(PMA origpma, unsigned int *origpma_size,
if (!list_empty(&origpma->cursors))
list_move(&cursors, &origpma->cursors);
{
u_int32_t sum = 0;
for (i=0; i<spliti; i++) {
sum+=toku_calccrc32_kvpair(kv_pair_key_const(pairs[i].pair), kv_pair_keylen(pairs[i].pair),
kv_pair_val_const(pairs[i].pair), kv_pair_vallen(pairs[i].pair));
}
*leftfingerprint += leftrand4fp * sum;
}
{
u_int32_t sum = 0;
for (i=spliti; i<npairs; i++) {
sum+=toku_calccrc32_kvpair(kv_pair_key_const(pairs[i].pair), kv_pair_keylen(pairs[i].pair),
kv_pair_val_const(pairs[i].pair), kv_pair_vallen(pairs[i].pair));
}
*rightfingerprint += rightrand4fp * sum;
}
/* put the first half of pairs into the left pma */
n = spliti;
error = __pma_resize_array(leftpma, n + n/4, 0);
......@@ -1119,7 +1404,7 @@ int pma_get_last(PMA pma, DBT *key, DBT *val) {
return 0;
}
void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) {
static void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) {
int i;
for (i=0; i<n; i++)
......@@ -1127,10 +1412,11 @@ void __pma_bulk_cleanup(struct pma *pma, struct kv_pair_tag *pairs, int n) {
pma_mfree_kv_pair(pma, pairs[i].pair);
}
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) {
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs, u_int32_t rand4fp, u_int32_t *sum) {
struct kv_pair_tag *newpairs;
int i;
int error;
u_int32_t delta=0;
if (n_newpairs == 0)
return 0;
......@@ -1146,6 +1432,7 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) {
}
for (i=0; i<n_newpairs; i++) {
delta += rand4fp*toku_calccrc32_kvpair (keys[i].data, keys[i].size, vals[i].data, vals[i].size);
#if PMA_USE_MEMPOOL
newpairs[i].pair = kv_pair_malloc_mempool(keys[i].data, keys[i].size,
vals[i].data, vals[i].size, &pma->kvspace);
......@@ -1169,6 +1456,68 @@ int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs) {
pma->n_pairs_present = n_newpairs;
toku_free(newpairs);
*sum += delta;
return 0;
}
/*
 * Verify the ordering of the in-use keys of a pma according to its
 * duplicate mode:
 *   dup_mode == 0      consecutive keys must compare strictly increasing;
 *   DB_DUP set         consecutive keys must compare non-decreasing;
 *   DB_DUPSORT set     additionally, for equal keys the values must compare
 *                      non-decreasing under dup_compare_fun.
 * When PMA_USE_MEMPOOL is enabled, every in-use pair must also lie inside
 * the pma's memory pool.  Any violation trips an assert.
 */
void pma_verify(PMA pma, DB *db) {
    int i = 0;
    struct kv_pair *kv;

    /* step to just past the first in-use slot, remembering its pair */
    while (i < pma->N) {
        kv = pma->pairs[i];
        i++;
        if (kv_pair_inuse(kv)) {
            kv = kv_pair_ptr(kv);
            break;
        }
    }

    /* compare each later in-use pair against the in-use pair before it */
    for (; i < pma->N; i++) {
        struct kv_pair *cur = pma->pairs[i];
        if (!kv_pair_inuse(cur))
            continue;
        cur = kv_pair_ptr(cur);

        DBT prev_dbt, cur_dbt;
        fill_dbt(&prev_dbt, kv_pair_key(kv), kv_pair_keylen(kv));
        fill_dbt(&cur_dbt, kv_pair_key(cur), kv_pair_keylen(cur));
        int r = pma->compare_fun(db, &prev_dbt, &cur_dbt);
        if (pma->dup_mode == 0) {
            assert(r < 0);
        } else if (pma->dup_mode & DB_DUP) {
            assert(r <= 0);
        }
        if (r == 0 && (pma->dup_mode & DB_DUPSORT)) {
            /* equal keys: the values themselves must be in order */
            fill_dbt(&prev_dbt, kv_pair_val(kv), kv_pair_vallen(kv));
            fill_dbt(&cur_dbt, kv_pair_val(cur), kv_pair_vallen(cur));
            r = pma->dup_compare_fun(db, &prev_dbt, &cur_dbt);
            assert(r <= 0);
        }
        kv = cur;
    }

#if PMA_USE_MEMPOOL
    /* every in-use pair must live inside the pma's memory pool */
    for (i = 0; i < pma->N; i++) {
        struct kv_pair *slot = pma->pairs[i];
        if (kv_pair_inuse(slot)) {
            slot = kv_pair_ptr(slot);
            assert(mempool_inrange(&pma->kvspace, slot, kv_pair_size(slot)));
        }
    }
#endif
}
/*
 * Recompute the pma's fingerprint and assert that it equals `fingerprint`.
 * The recomputed value starts at 0 and accumulates
 * rand4fingerprint * toku_calccrc32_kvpair(kv, kl, dv, dl) for each entry
 * that PMA_ITERATE visits.
 * NOTE(review): PMA_ITERATE is defined elsewhere; presumably it visits every
 * live pair and binds kv/kl/dv/dl to the key, key length, value and value
 * length -- confirm against its definition.
 */
void pma_verify_fingerprint (PMA pma, u_int32_t rand4fingerprint, u_int32_t fingerprint) {
u_int32_t actual_fingerprint=0;
/* the last macro argument is the statement run per entry */
PMA_ITERATE(pma, kv, kl, dv, dl,
actual_fingerprint+=rand4fingerprint*toku_calccrc32_kvpair(kv,kl,dv,dl)
);
assert(actual_fingerprint==fingerprint);
}
......@@ -10,11 +10,26 @@
/* An in-memory Packed Memory Array dictionary. */
/* There is a built-in-cursor. */
/* All functions return 0 on success. */
typedef struct pma *PMA;
typedef struct pma_cursor *PMA_CURSOR;
/* All functions return 0 on success. */
int pma_create(PMA *, int (*compare_fun)(DB*,const DBT*,const DBT*), int maxsize);
/* compare 2 DBT's
return a value < 0, = 0, > 0 if a < b, a == b, a > b respectively */
typedef int (*pma_compare_fun_t)(DB *, const DBT *a, const DBT *b);
int pma_create(PMA *, pma_compare_fun_t compare_fun, int maxsize);
/* set the duplicate mode
0 -> no duplications, DB_DUP, DB_DUPSORT */
int pma_set_dup_mode(PMA pma, int mode);
/* set the duplicate compare function */
int pma_set_dup_compare(PMA pma, pma_compare_fun_t dup_compare_fun);
/* verify the integrity of a pma */
void pma_verify(PMA pma, DB *db);
/* returns 0 if OK.
* You must have freed all the cursors, otherwise returns nonzero and does nothing. */
......@@ -28,15 +43,16 @@ int pma_n_entries (PMA);
/* Duplicates the key and keylen. */
//enum pma_errors pma_insert (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen);
// The DB pointer is there so that the comparison function can be called.
enum pma_errors pma_insert (PMA, DBT*, DBT*, DB*, TOKUTXN txn, diskoff);
enum pma_errors pma_insert (PMA, DBT*, DBT*, DB*, TOKUTXN txn, DISKOFF, u_int32_t /*random for fingerprint */, u_int32_t */*fingerprint*/);
/* This returns an error if the key is NOT present. */
int pma_replace (PMA, bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen);
/* This returns an error if the key is NOT present. */
int pma_delete (PMA, DBT *, DB*);
int pma_delete (PMA, DBT *, DB*, u_int32_t /*random for fingerprint*/, u_int32_t */*fingerprint*/);
int pma_insert_or_replace (PMA pma, DBT *k, DBT *v,
int *replaced_v_size, /* If it is a replacement, set to the size of the old value, otherwise set to -1. */
DB *db, TOKUTXN txn, diskoff);
DB *db, TOKUTXN txn, DISKOFF,
u_int32_t /*random for fingerprint*/, u_int32_t */*fingerprint*/);
/* Exposes internals of the PMA by returning a pointer to the guts.
......@@ -53,13 +69,14 @@ enum pma_errors pma_lookup (PMA, DBT*, DBT*, DB*);
* leftpma - the pma assigned keys <= pivot key
* rightpma - the pma assigned keys > pivot key
*/
int pma_split(PMA origpma, unsigned int *origpma_size,
PMA leftpma, unsigned int *leftpma_size,
PMA rightpma, unsigned int *rightpma_size);
int pma_split(PMA origpma, unsigned int *origpma_size,
PMA leftpma, unsigned int *leftpma_size, u_int32_t leftrand4sum, u_int32_t *leftfingerprint,
PMA rightpma, unsigned int *rightpma_size, u_int32_t rightrand4sum, u_int32_t *rightfingerprint);
/*
* Insert several key value pairs into an empty pma. The keys are
* assumed to be sorted.
* Insert several key value pairs into an empty pma.
* Doesn't delete any existing keys (even if they are duplicates)
* Requires: The keys are sorted
*
* pma - the pma that the key value pairs will be inserted into.
* must be empty with no cursors.
......@@ -67,7 +84,7 @@ int pma_split(PMA origpma, unsigned int *origpma_size,
* vals - an array of values
* n_newpairs - the number of key value pairs
*/
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs);
int pma_bulk_insert(PMA pma, DBT *keys, DBT *vals, int n_newpairs, u_int32_t rand4sem, u_int32_t *fingerprint);
/* Move the cursor to the beginning or the end or to a key */
int pma_cursor (PMA, PMA_CURSOR *);
......@@ -122,4 +139,6 @@ void pma_iterate (PMA, void(*)(bytevec,ITEMLEN,bytevec,ITEMLEN, void*), void*);
int keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);
void pma_verify_fingerprint (PMA pma, u_int32_t rand4fingerprint, u_int32_t fingerprint);
#endif
......@@ -31,7 +31,8 @@ void create_directory (void) {
assert(r==0);
r=env->set_cachesize(env, 0, 512*(1<<20), 0);
assert(r==0);
#if DB_VERSION_MAJOR >= 4 && DB_VERSION_MINOR >= 3
IF40((void)0,
({
unsigned int gbytes,bytes;
......@@ -40,7 +41,7 @@ void create_directory (void) {
assert(r==0);
printf("Using %.2fMiB Berkeley DB Cache Size\n", gbytes*1024 + ((double)bytes/(1<<20)));
}));
#endif
r= env->open(env, dir, DB_CREATE|DB_INIT_MPOOL,0777); // No logging.
assert(r==0);
......
......@@ -25,16 +25,26 @@ static unsigned int rbuf_int (struct rbuf *r) {
(c3<<0));
}
/*
 * Hand out a pointer to the next n_bytes of the read buffer (no copy is made)
 * and advance the read position past them.  Asserts afterwards that the read
 * position has not moved beyond r->size.
 * NOTE(review): the bound is checked after the addition; if ndone is an
 * unsigned 32-bit field, a very large n_bytes could wrap and pass the
 * assert -- confirm the field types in struct rbuf.
 */
static inline void rbuf_literal_bytes (struct rbuf *r, bytevec *bytes, unsigned int n_bytes) {
    *bytes = r->buf + r->ndone;
    r->ndone = r->ndone + n_bytes;
    assert(r->size >= r->ndone);
}
/*
 * Read a length-prefixed byte string: first an unsigned int length via
 * rbuf_int(), stored in *n_bytes, then that many bytes.
 * *bytes is set to point into the middle of the buffer (no copy is made).
 * The read position is advanced exactly once, by rbuf_literal_bytes(), which
 * also performs the bounds assert.
 */
static void rbuf_bytes (struct rbuf *r, bytevec *bytes, unsigned int *n_bytes)
{
    *n_bytes = rbuf_int(r);
    rbuf_literal_bytes(r, bytes, *n_bytes);
}
/*
 * Read a 64-bit unsigned value stored as two consecutive rbuf_int() words:
 * the first word read becomes the upper 32 bits, the second the lower 32.
 */
static unsigned long long rbuf_ulonglong (struct rbuf *r) {
    unsigned long long upper = rbuf_int(r);
    unsigned long long lower = rbuf_int(r);
    return (upper << 32) | lower;
}
static diskoff rbuf_diskoff (struct rbuf *r) {
static DISKOFF rbuf_diskoff (struct rbuf *r) {
unsigned i0 = rbuf_int(r);
unsigned i1 = rbuf_int(r);
return ((unsigned long long)(i0)<<32) | ((unsigned long long)(i1));
......
/* Readers/writers locks implementation
*
*****************************************
* Overview
*****************************************
*
* TokuDB employs readers/writers locks for the ephemeral locks (e.g.,
* on BRT nodes).  Why not just use the pthread_rwlock API?
*
* 1) we need multiprocess rwlocks (not just multithreaded)
*
* 2) pthread rwlocks are very slow since they entail a system call
* (about 2000ns on a 2GHz T2500.)
*
* Related: We expect the common case to be that the lock is
* granted
*
* 3) We are willing to employ machine-specific instructions (such
* as atomic exchange, and mfence, each of which runs in about
* 10ns.)
*
* 4) We want to guarantee nonstarvation (many rwlock
* implementations can starve the writers because another reader
* comes along before all the other readers have unlocked.)
*
*****************************************
* How it works
*****************************************
*
* We arrange that the rwlock object is in the address space of both
* threads or processes. For processes we use mmap().
*
* The rwlock struct comprises the following fields
*
* a long mutex field (which is accessed using xchgl() or other
* machine-specific instructions).  This is a spin lock.
*
* a read counter (how many readers currently have the lock?)
*
* a write boolean (does a writer have the lock?)
*
* a singly linked list of semaphores for waiting requesters. This
* list is sorted oldest requester first. Each list element
* contains a semaphore (which is provided by the requester) and a
* boolean indicating whether it is a reader or a writer.
*
* To lock a read rwlock:
*
* 1) Acquire the mutex.
*
* 2) If the linked list is not empty or the writer boolean is true
* then
*
* a) initialize your semaphore (to 0),
* b) add your list element to the end of the list (with rw="read")
* c) release the mutex
* d) wait on the semaphore
* e) when the semaphore is released, return success.
*
* 3) Otherwise increment the reader count, release the mutex, and
* return success.
*
* To lock the write rwlock is almost the same.
* 1) Acquire the mutex
* 2) If the list is not empty or the reader count is nonzero
* a) initialize semaphore
* b) add to end of list (with rw="write")
* c) release mutex
* d) wait on the semaphore
* e) return success when the semaphore releases
* 3) Otherwise set writer=TRUE, release mutex and return success.
*
* To unlock a read rwlock:
* 1) Acquire mutex
* 2) Decrement reader count
* 3) If the count is still positive or the list is empty then
* return success
* 4) Otherwise (count==zero and the list is nonempty):
* a) If the first element of the list is a reader:
* i) while the first element is a reader:
* x) pop the list
* y) increment the reader count
* z) increment the semaphore (releasing it for some waiter)
* ii) return success
* b) Else if the first element is a writer
* i) pop the list
* ii) set writer to TRUE
* iii) increment the semaphore
* iv) return success
*/
......@@ -6,6 +6,14 @@
#include <errno.h>
#include "memory.h"
//#define CRC_NO
#define CRC_INCR
//#define CRC_ATEND
#ifndef CRC_NO
#include "crc.h"
#endif
/* When serializing a value, write it into a buffer. */
/* This code requires that the buffer be big enough to hold whatever you put into it. */
/* This abstraction doesn't do a good job of hiding its internals.
......@@ -14,18 +22,27 @@ struct wbuf {
unsigned char *buf;
unsigned int size;
unsigned int ndone;
#ifdef CRC_INCR
u_int32_t crc32; // A 32-bit CRC of everything written so far.
#endif
};
static void wbuf_init (struct wbuf *w, void *buf, diskoff size) {
static void wbuf_init (struct wbuf *w, void *buf, DISKOFF size) {
w->buf=buf;
w->size=size;
w->ndone=0;
#ifdef CRC_INCR
w->crc32 = toku_crc32(0L, Z_NULL, 0);
#endif
}
/*
 * Append one byte (ch, converted to unsigned char) to the buffer.
 * Asserts that there is room for it.  When CRC_INCR is defined, the running
 * CRC is extended over the byte just written.
 */
static inline void wbuf_char (struct wbuf *w, int ch) {
    assert(w->ndone < w->size);
    unsigned char *slot = &w->buf[w->ndone];
    *slot = ch;
    w->ndone += 1;
#ifdef CRC_INCR
    w->crc32 = toku_crc32(w->crc32, slot, 1);
#endif
}
static void wbuf_int (struct wbuf *w, unsigned int i) {
......@@ -40,20 +57,31 @@ static void wbuf_int (struct wbuf *w, unsigned int i) {
w->buf[w->ndone+1] = i>>16;
w->buf[w->ndone+2] = i>>8;
w->buf[w->ndone+3] = i>>0;
#ifdef CRC_INCR
w->crc32 = toku_crc32(w->crc32, &w->buf[w->ndone], 4);
#endif
w->ndone += 4;
#endif
}
/*
 * Append exactly nbytes raw bytes from bytes_bv to the buffer, WITHOUT a
 * length prefix (the length-prefixed form is wbuf_bytes()).
 * Asserts that nbytes is non-negative and that the bytes fit in the buffer.
 * When CRC_INCR is defined, the running CRC is extended over the copied
 * bytes.
 */
static inline void wbuf_literal_bytes(struct wbuf *w, bytevec bytes_bv, int nbytes) {
    const unsigned char *bytes=bytes_bv;
    /* a negative count would turn into a huge size_t in memcpy */
    assert(nbytes >= 0);
    assert(w->ndone + nbytes <= w->size);
    memcpy(w->buf + w->ndone, bytes, nbytes);
#ifdef CRC_INCR
    /* checksum the bytes where they now live, before advancing ndone */
    w->crc32 = toku_crc32(w->crc32, &w->buf[w->ndone], nbytes);
#endif
    w->ndone += nbytes;
}
/*
 * Append a length-prefixed byte string: first nbytes itself is written with
 * wbuf_int(), then bytes_bv is handed to wbuf_literal_bytes() for the
 * payload.
 */
static void wbuf_bytes (struct wbuf *w, bytevec bytes_bv, int nbytes) {
    /* length prefix */
    wbuf_int(w, nbytes);
    /* payload */
    wbuf_literal_bytes(w, bytes_bv, nbytes);
}
static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) {
......@@ -61,7 +89,7 @@ static void wbuf_ulonglong (struct wbuf *w, unsigned long long ull) {
wbuf_int(w, ull&0xFFFFFFFF);
}
static void wbuf_diskoff (struct wbuf *w, diskoff off) {
static void wbuf_diskoff (struct wbuf *w, DISKOFF off) {
wbuf_ulonglong(w, off);
}
......@@ -69,8 +97,12 @@ static inline void wbuf_txnid (struct wbuf *w, TXNID tid) {
wbuf_ulonglong(w, tid);
}
/* Append an LSN to the buffer by writing its lsn member with wbuf_ulonglong(). */
static inline void wbuf_lsn (struct wbuf *w, LSN lsn) {
    wbuf_ulonglong(w, lsn.lsn);
}
/* Append a FILENUM to the buffer by writing its fileid member with wbuf_int(). */
static inline void wbuf_filenum (struct wbuf *w, FILENUM fileid) {
    wbuf_int(w, fileid.fileid);
}
#endif
......@@ -16,9 +16,11 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len) {
return dbt;
}
/*
 * Fill dbt via fill_dbt(dbt, k, len) and, when USE_DBT_APP_PRIVATE is
 * enabled, also store app_private in dbt->app_private.
 * When USE_DBT_APP_PRIVATE is disabled the app_private argument is not used,
 * hence the unused attribute on the parameter.
 * Returns dbt.
 */
DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private __attribute__((unused))) {
    fill_dbt(dbt, k, len);
#if USE_DBT_APP_PRIVATE
    dbt->app_private=app_private;
#endif
    return dbt;
}
......
......@@ -11,4 +11,22 @@ DBT *fill_dbt(DBT *dbt, bytevec k, ITEMLEN len);
DBT *fill_dbt_ap(DBT *dbt, bytevec k, ITEMLEN len, void *app_private);
int ybt_set_value (DBT *, bytevec val, ITEMLEN vallen, void **staticptrp);
#ifndef USE_DBT_APP_PRIVATE
#define USE_DBT_APP_PRIVATE 0
#endif
/*
 * Return dbt->app_private when USE_DBT_APP_PRIVATE is enabled; otherwise
 * return a null pointer without touching dbt.
 */
static inline void *dbt_get_app_private(DBT *dbt __attribute__((unused))) {
    void *result = 0;
#if USE_DBT_APP_PRIVATE
    result = dbt->app_private;
#endif
    return result;
}
/*
 * Store ap in dbt->app_private when USE_DBT_APP_PRIVATE is enabled;
 * otherwise do nothing (both parameters are then unused).
 */
static inline void dbt_set_app_private(DBT *dbt __attribute__((unused)), void *ap __attribute__((unused))) {
#if USE_DBT_APP_PRIVATE
    dbt->app_private = ap;
#else
    /* field not compiled in: nothing to store */
#endif
}
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment