Commit 8bfd0ff4 authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul

Break up brt-test some more. Addresses #475. Also make the fanout flexible. Fixes #126.

git-svn-id: file:///svn/tokudb@2593 c7de825b-a66e-492c-adef-691d508d4ae1
parent 23d5d500
...@@ -46,14 +46,15 @@ default: bins libs recover tdb_logprint ...@@ -46,14 +46,15 @@ default: bins libs recover tdb_logprint
# Put these one-per-line so that if we insert a new one the svn diff can understand it better. # Put these one-per-line so that if we insert a new one the svn diff can understand it better.
# Also keep them sorted. # Also keep them sorted.
REGRESSION_TESTS = \ REGRESSION_TESTS = \
ybt-test \ ybt-test \
pma-test \ pma-test \
brt-serialize-test \ brt-serialize-test \
cachetable-test \ cachetable-test \
cachetable-test2 \ cachetable-test2 \
fifo-test \ fifo-test \
test-brt-delete-both \ test-brt-delete-both \
brt-test \ brt-test \
brt-test3 \
brt-test4 \ brt-test4 \
brt-test-cursor \ brt-test-cursor \
test_oexcl \ test_oexcl \
...@@ -99,6 +100,7 @@ CHECKS = \ ...@@ -99,6 +100,7 @@ CHECKS = \
test-brt-delete-both \ test-brt-delete-both \
brt-test \ brt-test \
brt-test-cursor \ brt-test-cursor \
brt-test3 \
brt-test4 \ brt-test4 \
fifo-test \ fifo-test \
test_toku_malloc_plain_free \ test_toku_malloc_plain_free \
...@@ -106,7 +108,8 @@ CHECKS = \ ...@@ -106,7 +108,8 @@ CHECKS = \
list-test \ list-test \
# This line intentially kept commented so I can have a \ on the previous line # This line intentially kept commented so I can have a \ on the previous line
check: bins $(patsubst %,check_%,$(CHECKS)) check_benchmarktest_256 # Put check_benchmarktest_256 first because it is long-running (and therefore on the critical path, so get it started)
check: bins check_benchmarktest_256 $(patsubst %,check_%,$(CHECKS))
check_benchmarktest_256: benchmark-test check_benchmarktest_256: benchmark-test
$(DTOOL) ./benchmark-test $(VERBVERBOSE) --valsize 256 --verify 1 $(DTOOL) ./benchmark-test $(VERBVERBOSE) --valsize 256 --verify 1
...@@ -121,7 +124,7 @@ check_test-assert: test-assert ...@@ -121,7 +124,7 @@ check_test-assert: test-assert
@# one argument, "ok" should not error @# one argument, "ok" should not error
$(DTOOL) ./test-assert ok $(DTOOL) ./test-assert ok
check_%: % check_%: %
$(DTOOL) ./$< $(VERBVERBOSE) time $(DTOOL) ./$< $(VERBVERBOSE)
check-fanout: check-fanout:
let BRT_FANOUT=4; \ let BRT_FANOUT=4; \
...@@ -130,7 +133,7 @@ check-fanout: ...@@ -130,7 +133,7 @@ check-fanout:
let BRT_FANOUT=BRT_FANOUT+1; \ let BRT_FANOUT=BRT_FANOUT+1; \
done done
pma-test benchmark-test brt-test brt-test4 brt-test-cursor test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz pma-test benchmark-test brt-test brt-test3 brt-test4 brt-test-cursor test-brt-delete-both brt-serialize-test brtdump test-inc-split test-del-inorder: LDFLAGS+=-lz
# pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage # pma: PROF_FLAGS=-fprofile-arcs -ftest-coverage
BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h pma.h brt.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h BRT_INTERNAL_H_INCLUDES = brt-internal.h cachetable.h fifo.h pma.h brt.h brttypes.h yerror.h ybt.h log.h ../include/db.h kv-pair.h memory.h crc.h
...@@ -146,10 +149,10 @@ ybt.o: ybt.h brttypes.h ../include/db.h ...@@ -146,10 +149,10 @@ ybt.o: ybt.h brttypes.h ../include/db.h
ybt-test: ybt-test.o ybt.o memory.o toku_assert.o ybt-test: ybt-test.o ybt.o memory.o toku_assert.o
ybt-test.o: ybt.h ../include/db.h ybt-test.o: ybt.h ../include/db.h
cachetable.o: cachetable.h hashfun.h memory.h cachetable.o: cachetable.h hashfun.h memory.h
brt-test4 brt-test-cursor brt-test: ybt.o brt.o fifo.o pma.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o brt-test3 brt-test4 brt-test-cursor brt-test: ybt.o brt.o fifo.o pma.o memory.o brt-serialize.o cachetable.o ybt.o key.o primes.o toku_assert.o log.o mempool.o brt-verify.o fingerprint.o log_code.o roll.o
log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES) log.o: log_header.h log-internal.h log.h wbuf.h crc.h brttypes.h $(BRT_INTERNAL_H_INCLUDES)
logformat: logformat.o toku_assert.o logformat: logformat.o toku_assert.o
brt-test4.o brt-test-cursor.o brt-test.o brt.o: brt.h ../include/db.h fifo.h pma.h brttypes.h cachetable.h memory.h brt-test3.o brt-test4.o brt-test-cursor.o brt-test.o brt.o: brt.h ../include/db.h fifo.h pma.h brttypes.h cachetable.h memory.h
brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES) brt-serialize-test.o: $(BRT_INTERNAL_H_INCLUDES)
brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h brt.o: $(BRT_INTERNAL_H_INCLUDES) key.h log_header.h
fifo.o: fifo.h brttypes.h fifo.o: fifo.h brttypes.h
......
...@@ -64,14 +64,14 @@ struct brtnode { ...@@ -64,14 +64,14 @@ struct brtnode {
unsigned int totalchildkeylens; unsigned int totalchildkeylens;
unsigned int n_bytes_in_buffers; unsigned int n_bytes_in_buffers;
struct brtnode_nonleaf_childinfo childinfos[TREE_FANOUT+1]; /* One extra so we can grow */ struct brtnode_nonleaf_childinfo *childinfos; /* One extra so we can grow */
#define BNC_SUBTREE_FINGERPRINT(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint) #define BNC_SUBTREE_FINGERPRINT(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint)
#define BNC_DISKOFF(node,i) ((node)->u.n.childinfos[i].diskoff) #define BNC_DISKOFF(node,i) ((node)->u.n.childinfos[i].diskoff)
#define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer) #define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer)
#define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer) #define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer)
struct kv_pair *childkeys[TREE_FANOUT]; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1]. struct kv_pair **childkeys; /* Pivot keys. Child 0's keys are <= childkeys[0]. Child 1's keys are <= childkeys[1].
Note: It is possible that Child 1's keys are == to child 0's key's, so it is Note: It is possible that Child 1's keys are == to child 0's key's, so it is
not necessarily true that child 1's keys are > childkeys[0]. not necessarily true that child 1's keys are > childkeys[0].
However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */ However, in the absense of duplicate keys, child 1's keys *are* > childkeys[0]. */
...@@ -175,7 +175,7 @@ struct brt_cursor { ...@@ -175,7 +175,7 @@ struct brt_cursor {
DBT val; DBT val;
}; };
void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger); int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger);
int toku_unpin_brtnode (BRT brt, BRTNODE node) ; int toku_unpin_brtnode (BRT brt, BRTNODE node) ;
unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t); unsigned int toku_brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t);
......
...@@ -32,6 +32,8 @@ static void test_serialize(void) { ...@@ -32,6 +32,8 @@ static void test_serialize(void) {
sn.local_fingerprint = 0; sn.local_fingerprint = 0;
sn.u.n.n_children = 2; sn.u.n.n_children = 2;
hello_string = toku_strdup("hello"); hello_string = toku_strdup("hello");
MALLOC_N(2, sn.u.n.childinfos);
MALLOC_N(1, sn.u.n.childkeys);
sn.u.n.childkeys[0] = kv_pair_malloc(hello_string, 6, 0, 0); sn.u.n.childkeys[0] = kv_pair_malloc(hello_string, 6, 0, 0);
sn.u.n.totalchildkeylens = 6; sn.u.n.totalchildkeylens = 6;
BNC_DISKOFF(&sn, 0) = sn.nodesize*30; BNC_DISKOFF(&sn, 0) = sn.nodesize*30;
...@@ -45,11 +47,6 @@ static void test_serialize(void) { ...@@ -45,11 +47,6 @@ static void test_serialize(void) {
r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, (TXNID)234); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, (TXNID)234, "x", 2, "xval", 5); r = toku_fifo_enq(BNC_BUFFER(&sn,1), "x", 2, "xval", 5, BRT_NONE, (TXNID)234); assert(r==0); sn.local_fingerprint += randval*toku_calccrc32_cmd(BRT_NONE, (TXNID)234, "x", 2, "xval", 5);
BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5); BNC_NBYTESINBUF(&sn, 0) = 2*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5); BNC_NBYTESINBUF(&sn, 1) = 1*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
{
int i;
for (i=2; i<TREE_FANOUT+1; i++)
BNC_NBYTESINBUF(&sn, i)=0;
}
sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5); sn.u.n.n_bytes_in_buffers = 3*(BRT_CMD_OVERHEAD+KEY_VALUE_OVERHEAD+2+5);
toku_serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); assert(r==0); toku_serialize_brtnode_to(fd, sn.nodesize*20, sn.nodesize, &sn); assert(r==0);
......
...@@ -278,18 +278,11 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl ...@@ -278,18 +278,11 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl
//printf("height==%d\n", result->height); //printf("height==%d\n", result->height);
if (result->height>0) { if (result->height>0) {
result->u.n.totalchildkeylens=0; result->u.n.totalchildkeylens=0;
for (i=0; i<TREE_FANOUT; i++) {
result->u.n.childkeys[i]=0;
}
for (i=0; i<TREE_FANOUT+1; i++) {
BNC_SUBTREE_FINGERPRINT(result, i)=0;
BNC_DISKOFF(result,i)=0;
BNC_BUFFER(result,i)=0;
BNC_NBYTESINBUF(result,i)=0;
}
u_int32_t subtree_fingerprint = rbuf_int(&rc); u_int32_t subtree_fingerprint = rbuf_int(&rc);
u_int32_t check_subtree_fingerprint = 0; u_int32_t check_subtree_fingerprint = 0;
result->u.n.n_children = rbuf_int(&rc); result->u.n.n_children = rbuf_int(&rc);
MALLOC_N(result->u.n.n_children, result->u.n.childinfos);
MALLOC_N(result->u.n.n_children-1, result->u.n.childkeys);
//printf("n_children=%d\n", result->n_children); //printf("n_children=%d\n", result->n_children);
assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT); assert(result->u.n.n_children>=0 && result->u.n.n_children<=TREE_FANOUT);
for (i=0; i<result->u.n.n_children; i++) { for (i=0; i<result->u.n.n_children; i++) {
...@@ -315,11 +308,9 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl ...@@ -315,11 +308,9 @@ int toku_deserialize_brtnode_from (int fd, DISKOFF off, BRTNODE *brtnode, int fl
} }
for (i=0; i<result->u.n.n_children; i++) { for (i=0; i<result->u.n.n_children; i++) {
BNC_DISKOFF(result,i) = rbuf_diskoff(&rc); BNC_DISKOFF(result,i) = rbuf_diskoff(&rc);
BNC_NBYTESINBUF(result,i) = 0;
//printf("Child %d at %lld\n", i, result->children[i]); //printf("Child %d at %lld\n", i, result->children[i]);
} }
for (i=0; i<TREE_FANOUT+1; i++) {
BNC_NBYTESINBUF(result,i)=0;
}
result->u.n.n_bytes_in_buffers = 0; result->u.n.n_bytes_in_buffers = 0;
for (i=0; i<result->u.n.n_children; i++) { for (i=0; i<result->u.n.n_children; i++) {
r=toku_fifo_create(&BNC_BUFFER(result,i)); r=toku_fifo_create(&BNC_BUFFER(result,i));
...@@ -472,9 +463,6 @@ void toku_verify_counts (BRTNODE node) { ...@@ -472,9 +463,6 @@ void toku_verify_counts (BRTNODE node) {
sum += BNC_NBYTESINBUF(node,i); sum += BNC_NBYTESINBUF(node,i);
// We don't rally care of the later buffers have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized. // We don't rally care of the later buffers have garbage in them. Valgrind would do a better job noticing if we leave it uninitialized.
// But for now the code always initializes the later tables so they are 0. // But for now the code always initializes the later tables so they are 0.
for (; i<TREE_FANOUT+1; i++) {
assert(BNC_NBYTESINBUF(node,i)==0);
}
assert(sum==node->u.n.n_bytes_in_buffers); assert(sum==node->u.n.n_bytes_in_buffers);
} }
} }
......
...@@ -108,37 +108,6 @@ static void test2 (int memcheck) { ...@@ -108,37 +108,6 @@ static void test2 (int memcheck) {
if (verbose) printf("test2 ok\n"); if (verbose) printf("test2 ok\n");
} }
static void test3 (int nodesize, int count, int memcheck) {
BRT t;
int r;
struct timeval t0,t1;
int i;
CACHETABLE ct;
char fname[]="testbrt.brt";
toku_memory_check=memcheck;
toku_memory_check_all_free();
r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
gettimeofday(&t0, 0);
unlink(fname);
r = toku_open_brt(fname, 0, 1, &t, nodesize, ct, null_txn, toku_default_compare_fun, null_db);
assert(r==0);
for (i=0; i<count; i++) {
char key[100],val[100];
DBT k,v;
snprintf(key,100,"hello%d",i);
snprintf(val,100,"there%d",i);
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
}
r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free();
gettimeofday(&t1, 0);
{
double tdiff = (t1.tv_sec-t0.tv_sec)+1e-6*(t1.tv_usec-t0.tv_usec);
if (verbose) printf("serial insertions: blocksize=%d %d insertions in %.3f seconds, %.2f insertions/second\n", nodesize, count, tdiff, count/tdiff);
}
}
static void test5 (void) { static void test5 (void) {
int r; int r;
BRT t; BRT t;
...@@ -1689,17 +1658,6 @@ static void brt_blackbox_test (void) { ...@@ -1689,17 +1658,6 @@ static void brt_blackbox_test (void) {
test5(); test5();
if (verbose) printf("test_multiple_files\n"); if (verbose) printf("test_multiple_files\n");
test_multiple_files(); test_multiple_files();
if (verbose) printf("test3 slow\n");
toku_memory_check=0;
test3(2048, 1<<15, 1);
if (verbose) printf("test3 fast\n");
if (verbose) toku_pma_show_stats();
test3(1<<15, 1024, 1);
if (verbose) printf("test3 fast\n");
test3(1<<18, 1<<20, 0);
toku_memory_check = 1; toku_memory_check = 1;
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
#include "brt.h"
#include "key.h"
#include "pma.h"
#include "brt-internal.h"
#include "memory.h"
#include "toku_assert.h"
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include "test.h"
static const char fname[]= __FILE__ ".brt";
static TOKUTXN const null_txn = 0;
static DB * const null_db = 0;
static void test3 (int nodesize, int count, int memcheck) {
BRT t;
int r;
struct timeval t0,t1;
int i;
CACHETABLE ct;
toku_memory_check=memcheck;
toku_memory_check_all_free();
r = toku_brt_create_cachetable(&ct, 0, ZERO_LSN, NULL_LOGGER); assert(r==0);
gettimeofday(&t0, 0);
unlink(fname);
r = toku_open_brt(fname, 0, 1, &t, nodesize, ct, null_txn, toku_default_compare_fun, null_db);
assert(r==0);
for (i=0; i<count; i++) {
char key[100],val[100];
DBT k,v;
snprintf(key,100,"hello%d",i);
snprintf(val,100,"there%d",i);
toku_brt_insert(t, toku_fill_dbt(&k, key, 1+strlen(key)), toku_fill_dbt(&v, val, 1+strlen(val)), null_txn);
}
r = toku_close_brt(t); assert(r==0);
r = toku_cachetable_close(&ct); assert(r==0);
toku_memory_check_all_free();
gettimeofday(&t1, 0);
{
double tdiff = (t1.tv_sec-t0.tv_sec)+1e-6*(t1.tv_usec-t0.tv_usec);
if (verbose) printf("serial insertions: blocksize=%d %d insertions in %.3f seconds, %.2f insertions/second\n", nodesize, count, tdiff, count/tdiff);
}
}
static void brt_blackbox_test (void) {
if (verbose) printf("test3 slow\n");
toku_memory_check=0;
test3(2048, 1<<15, 1);
if (verbose) printf("test3 fast\n");
if (verbose) toku_pma_show_stats();
test3(1<<15, 1024, 1);
if (verbose) printf("test3 fast\n");
test3(1<<18, 1<<20, 0);
toku_memory_check = 1;
// test3(1<<19, 1<<20, 0);
// test3(1<<20, 1<<20, 0);
// test3(1<<20, 1<<21, 0);
// test3(1<<20, 1<<22, 0);
}
int main (int argc , const char *argv[]) {
default_parse_args(argc, argv);
brt_blackbox_test();
toku_malloc_cleanup();
if (verbose) printf("test ok\n");
return 0;
}
...@@ -49,13 +49,15 @@ void toku_brtnode_free (BRTNODE *nodep) { ...@@ -49,13 +49,15 @@ void toku_brtnode_free (BRTNODE *nodep) {
//printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]); //printf("%s:%d %p->mdict[0]=%p\n", __FILE__, __LINE__, node, node->mdicts[0]);
if (node->height>0) { if (node->height>0) {
for (i=0; i<node->u.n.n_children-1; i++) { for (i=0; i<node->u.n.n_children-1; i++) {
toku_free((void*)node->u.n.childkeys[i]); toku_free(node->u.n.childkeys[i]);
} }
for (i=0; i<node->u.n.n_children; i++) { for (i=0; i<node->u.n.n_children; i++) {
if (BNC_BUFFER(node,i)) { if (BNC_BUFFER(node,i)) {
toku_fifo_free(&BNC_BUFFER(node,i)); toku_fifo_free(&BNC_BUFFER(node,i));
} }
} }
toku_free(node->u.n.childkeys);
toku_free(node->u.n.childinfos);
} else { } else {
if (node->u.l.buffer) // The buffer may have been freed already, in some cases. if (node->u.l.buffer) // The buffer may have been freed already, in some cases.
toku_pma_free(&node->u.l.buffer); toku_pma_free(&node->u.l.buffer);
...@@ -246,7 +248,6 @@ int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) { ...@@ -246,7 +248,6 @@ int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER logger) {
} }
static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) { static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) {
int i;
n->tag = TYP_BRTNODE; n->tag = TYP_BRTNODE;
n->nodesize = t->h->nodesize; n->nodesize = t->h->nodesize;
n->flags = t->h->flags; n->flags = t->h->flags;
...@@ -261,18 +262,10 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) ...@@ -261,18 +262,10 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height)
assert(height>=0); assert(height>=0);
if (height>0) { if (height>0) {
n->u.n.n_children = 0; n->u.n.n_children = 0;
for (i=0; i<TREE_FANOUT; i++) {
// n->u.n.childkeys[i] = 0;
// n->u.n.childkeylens[i] = 0;
}
n->u.n.totalchildkeylens = 0; n->u.n.totalchildkeylens = 0;
for (i=0; i<TREE_FANOUT+1; i++) {
BNC_SUBTREE_FINGERPRINT(n, i) = 0;
// n->u.n.children[i] = 0;
// n->u.n.buffers[i] = 0;
BNC_NBYTESINBUF(n,i) = 0;
}
n->u.n.n_bytes_in_buffers = 0; n->u.n.n_bytes_in_buffers = 0;
n->u.n.childinfos=0;
n->u.n.childkeys=0;
} else { } else {
int r = toku_pma_create(&n->u.l.buffer, t->compare_fun, t->db, toku_cachefile_filenum(t->cf), n->nodesize); int r = toku_pma_create(&n->u.l.buffer, t->compare_fun, t->db, toku_cachefile_filenum(t->cf), n->nodesize);
assert(r==0); assert(r==0);
...@@ -285,7 +278,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height) ...@@ -285,7 +278,7 @@ static void initialize_brtnode (BRT t, BRTNODE n, DISKOFF nodename, int height)
} }
} }
void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger) { int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logger) {
TAGMALLOC(BRTNODE, n); TAGMALLOC(BRTNODE, n);
int r; int r;
DISKOFF name; DISKOFF name;
...@@ -305,6 +298,7 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER log ...@@ -305,6 +298,7 @@ void toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER log
r=toku_log_newbrtnode(logger, toku_cachefile_filenum(t->cf), n->thisnodename, height, n->nodesize, (t->flags&TOKU_DB_DUPSORT)!=0, n->rand4fingerprint); r=toku_log_newbrtnode(logger, toku_cachefile_filenum(t->cf), n->thisnodename, height, n->nodesize, (t->flags&TOKU_DB_DUPSORT)!=0, n->rand4fingerprint);
assert(r==0); assert(r==0);
toku_update_brtnode_loggerlsn(n, logger); toku_update_brtnode_loggerlsn(n, logger);
return 0;
} }
static int insert_to_buffer_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v, int type, TXNID xid) { static int insert_to_buffer_in_nonleaf (BRTNODE node, int childnum, DBT *k, DBT *v, int type, TXNID xid) {
...@@ -360,6 +354,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -360,6 +354,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
assert(node->u.n.n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ assert(node->u.n.n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */ assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
toku_create_new_brtnode(t, &B, node->height, logger); toku_create_new_brtnode(t, &B, node->height, logger);
MALLOC_N(n_children_in_b+1, B->u.n.childinfos);
MALLOC_N(n_children_in_b, B->u.n.childkeys);
B->u.n.n_children =n_children_in_b; B->u.n.n_children =n_children_in_b;
//printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B); //printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
...@@ -372,6 +368,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -372,6 +368,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
for (i=0; i<n_children_in_b; i++) { for (i=0; i<n_children_in_b; i++) {
int r = toku_fifo_create(&BNC_BUFFER(B,i)); int r = toku_fifo_create(&BNC_BUFFER(B,i));
if (r!=0) return r; if (r!=0) return r;
BNC_NBYTESINBUF(B,i)=0;
BNC_SUBTREE_FINGERPRINT(B,i)=0;
} }
for (i=n_children_in_a; i<old_n_children; i++) { for (i=n_children_in_a; i<old_n_children; i++) {
...@@ -453,7 +451,9 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -453,7 +451,9 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
splitk->data = (void*)(node->u.n.childkeys[n_children_in_a-1]); splitk->data = (void*)(node->u.n.childkeys[n_children_in_a-1]);
splitk->size = toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]); splitk->size = toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]);
node->u.n.totalchildkeylens -= toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]); node->u.n.totalchildkeylens -= toku_brt_pivot_key_len(t, node->u.n.childkeys[n_children_in_a-1]);
node->u.n.childkeys[n_children_in_a-1]=0;
REALLOC_N(n_children_in_a+1, node->u.n.childinfos);
REALLOC_N(n_children_in_a, node->u.n.childkeys);
verify_local_fingerprint_nonleaf(node); verify_local_fingerprint_nonleaf(node);
verify_local_fingerprint_nonleaf(B); verify_local_fingerprint_nonleaf(B);
...@@ -618,6 +618,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -618,6 +618,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
//verify_local_fingerprint_nonleaf(node); //verify_local_fingerprint_nonleaf(node);
REALLOC_N(node->u.n.n_children+2, node->u.n.childinfos);
REALLOC_N(node->u.n.n_children+1, node->u.n.childkeys);
// Slide the children over. // Slide the children over.
for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) { for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) {
node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1]; node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1];
...@@ -625,6 +627,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -625,6 +627,8 @@ static int handle_split_of_child (BRT t, BRTNODE node, int childnum,
r = toku_log_addchild(logger, toku_cachefile_filenum(t->cf), node->thisnodename, childnum+1, childb->thisnodename, 0); r = toku_log_addchild(logger, toku_cachefile_filenum(t->cf), node->thisnodename, childnum+1, childb->thisnodename, 0);
assert(BNC_DISKOFF(node, childnum)==childa->thisnodename); assert(BNC_DISKOFF(node, childnum)==childa->thisnodename);
BNC_DISKOFF(node, childnum+1) = childb->thisnodename; BNC_DISKOFF(node, childnum+1) = childb->thisnodename;
BNC_SUBTREE_FINGERPRINT(node, childnum)=0;
BNC_SUBTREE_FINGERPRINT(node, childnum+1)=0;
fixup_child_fingerprint(node, childnum, childa, t, logger); fixup_child_fingerprint(node, childnum, childa, t, logger);
fixup_child_fingerprint(node, childnum+1, childb, t, logger); fixup_child_fingerprint(node, childnum+1, childb, t, logger);
r=toku_fifo_create(&BNC_BUFFER(node,childnum)); assert(r==0); // ??? SHould handle this error case r=toku_fifo_create(&BNC_BUFFER(node,childnum)); assert(r==0); // ??? SHould handle this error case
...@@ -1625,6 +1629,8 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, ...@@ -1625,6 +1629,8 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
initialize_brtnode (brt, newroot, newroot_diskoff, new_height); initialize_brtnode (brt, newroot, newroot_diskoff, new_height);
//printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename); //printf("new_root %lld %d %lld %lld\n", newroot_diskoff, newroot->height, nodea->thisnodename, nodeb->thisnodename);
newroot->u.n.n_children=2; newroot->u.n.n_children=2;
MALLOC_N(3, newroot->u.n.childinfos);
MALLOC_N(2, newroot->u.n.childkeys);
//printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey); //printf("%s:%d Splitkey=%p %s\n", __FILE__, __LINE__, splitkey, splitkey);
newroot->u.n.childkeys[0] = splitk.data; newroot->u.n.childkeys[0] = splitk.data;
newroot->u.n.totalchildkeylens=splitk.size; newroot->u.n.totalchildkeylens=splitk.size;
...@@ -1632,6 +1638,10 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, ...@@ -1632,6 +1638,10 @@ static int brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk,
BNC_DISKOFF(newroot,1)=nodeb->thisnodename; BNC_DISKOFF(newroot,1)=nodeb->thisnodename;
r=toku_fifo_create(&BNC_BUFFER(newroot,0)); if (r!=0) return r; r=toku_fifo_create(&BNC_BUFFER(newroot,0)); if (r!=0) return r;
r=toku_fifo_create(&BNC_BUFFER(newroot,1)); if (r!=0) return r; r=toku_fifo_create(&BNC_BUFFER(newroot,1)); if (r!=0) return r;
BNC_NBYTESINBUF(newroot, 0)=0;
BNC_NBYTESINBUF(newroot, 1)=0;
BNC_SUBTREE_FINGERPRINT(newroot, 0)=0;
BNC_SUBTREE_FINGERPRINT(newroot, 1)=0;
toku_verify_counts(newroot); toku_verify_counts(newroot);
//verify_local_fingerprint_nonleaf(nodea); //verify_local_fingerprint_nonleaf(nodea);
//verify_local_fingerprint_nonleaf(nodeb); //verify_local_fingerprint_nonleaf(nodeb);
......
...@@ -33,6 +33,8 @@ extern long long n_items_malloced; ...@@ -33,6 +33,8 @@ extern long long n_items_malloced;
static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER); static int malloc_diskblock (DISKOFF *res, BRT brt, int size, TOKULOGGER);
static void verify_local_fingerprint_nonleaf (BRTNODE node); static void verify_local_fingerprint_nonleaf (BRTNODE node);
#ifdef FOO
/* Frees a node, including all the stuff in the hash table. */ /* Frees a node, including all the stuff in the hash table. */
void toku_brtnode_free (BRTNODE *nodep) { void toku_brtnode_free (BRTNODE *nodep) {
BRTNODE node=*nodep; BRTNODE node=*nodep;
...@@ -55,9 +57,11 @@ void toku_brtnode_free (BRTNODE *nodep) { ...@@ -55,9 +57,11 @@ void toku_brtnode_free (BRTNODE *nodep) {
*nodep=0; *nodep=0;
} }
#endif
static long brtnode_size(BRTNODE node) { static long brtnode_size(BRTNODE node) {
return toku_serialize_brtnode_size(node); return toku_serialize_brtnode_size(node);
} }
#ifdef FOO
static void toku_update_brtnode_loggerlsn(BRTNODE node, TOKULOGGER logger) { static void toku_update_brtnode_loggerlsn(BRTNODE node, TOKULOGGER logger) {
if (logger) { if (logger) {
...@@ -82,6 +86,8 @@ static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE ...@@ -82,6 +86,8 @@ static void fixup_child_fingerprint(BRTNODE node, int childnum_of_node, BRTNODE
toku_update_brtnode_loggerlsn(node, logger); toku_update_brtnode_loggerlsn(node, logger);
} }
#endif
// If you pass in data==0 then it only compares the key, not the data (even if is a DUPSORT database) // If you pass in data==0 then it only compares the key, not the data (even if is a DUPSORT database)
static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) { static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) {
int cmp; int cmp;
...@@ -97,6 +103,7 @@ static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) { ...@@ -97,6 +103,7 @@ static int brt_compare_pivot(BRT brt, DBT *key, DBT *data, bytevec ck) {
return cmp; return cmp;
} }
#ifdef FOO
void toku_brtnode_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn __attribute__((__unused__)) , BOOL rename_p __attribute__((__unused__))) { void toku_brtnode_flush_callback (CACHEFILE cachefile, DISKOFF nodename, void *brtnode_v, long size __attribute((unused)), BOOL write_me, BOOL keep_me, LSN modified_lsn __attribute__((__unused__)) , BOOL rename_p __attribute__((__unused__))) {
BRTNODE brtnode = brtnode_v; BRTNODE brtnode = brtnode_v;
...@@ -170,9 +177,11 @@ int toku_unpin_brt_header (BRT brt) { ...@@ -170,9 +177,11 @@ int toku_unpin_brt_header (BRT brt) {
brt->h=0; brt->h=0;
return r; return r;
} }
#endif
static int unpin_brtnode (BRT brt, BRTNODE node) { static int unpin_brtnode (BRT brt, BRTNODE node) {
return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node)); return toku_cachetable_unpin(brt->cf, node->thisnodename, node->dirty, brtnode_size(node));
} }
#ifdef FOO
typedef struct kvpair { typedef struct kvpair {
bytevec key; bytevec key;
...@@ -293,7 +302,7 @@ static int split_leaf_node (BRT t, TOKULOGGER logger, BRTNODE node, int *n_new_n ...@@ -293,7 +302,7 @@ static int split_leaf_node (BRT t, TOKULOGGER logger, BRTNODE node, int *n_new_n
while (toku_serialize_brtnode_size(node)>node->nodesize) { while (toku_serialize_brtnode_size(node)>node->nodesize) {
BRTNODE B; BRTNODE B;
DBT splitk; DBT splitk;
if ((r = create_new_brtnode(t, &B, 0, logger))) return r; if ((r = toku_create_new_brtnode(t, &B, 0, logger))) return r;
// Split so that B is at least 1/2 full // Split so that B is at least 1/2 full
// The stuff in B goes *before* node // The stuff in B goes *before* node
if ((r = toku_pma_split(logger, toku_cachefile_filenum(t->cf), if ((r = toku_pma_split(logger, toku_cachefile_filenum(t->cf),
...@@ -324,7 +333,7 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -324,7 +333,7 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
assert(node->height>0); assert(node->height>0);
assert(node->u.n.n_children>=2); // Otherwise, how do we split? We need at least two children to split. */ assert(node->u.n.n_children>=2); // Otherwise, how do we split? We need at least two children to split. */
assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */ assert(t->h->nodesize>=node->nodesize); /* otherwise we might be in trouble because the nodesize shrank. */
create_new_brtnode(t, &B, node->height, logger); toku_create_new_brtnode(t, &B, node->height, logger);
B->u.n.n_children =n_children_in_b; B->u.n.n_children =n_children_in_b;
//printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B); //printf("%s:%d %p (%lld) becomes %p and %p\n", __FILE__, __LINE__, node, node->thisnodename, A, B);
//printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename); //printf("%s:%d A is at %lld\n", __FILE__, __LINE__, A->thisnodename);
...@@ -432,6 +441,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node ...@@ -432,6 +441,8 @@ static int brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *node
return 0; return 0;
} }
#endif
static void find_heaviest_child (BRTNODE node, int *childnum) { static void find_heaviest_child (BRTNODE node, int *childnum) {
int max_child = 0; int max_child = 0;
int max_weight = BNC_NBYTESINBUF(node, 0); int max_weight = BNC_NBYTESINBUF(node, 0);
...@@ -465,8 +476,140 @@ static unsigned int brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) { ...@@ -465,8 +476,140 @@ static unsigned int brtnode_which_child (BRTNODE node , DBT *k, DBT *d, BRT t) {
} }
static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p); static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p);
static int maybe_fixup_fat_child(BRT t, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger); // If the node is too big then deal with it. Unpin the child (or children if it splits) NODE may be too big at the end
// If CHILD is too wide, split it, and create a new node with the new children. Unpin CHILD or the new children (even if something goes wrong).
// If it does split, unpin the new root node also.
static int maybe_split_root(BRT brt, BRTNODE child, CACHEKEY *rootp, TOKULOGGER logger);
// if CHILD is too wide, split it, and fix up NODE. Either way, unpin the child or resulting children (even if it fails do the unpin)
static int maybe_split_nonroot (BRT brt, BRTNODE node, int childnum, BRTNODE child, int *n_children_replacing_child, TOKULOGGER logger);
// Push stuff into a child weakly. (That is don't cause any I/O or cause the child to get too big.)
static int weak_push_to_child (BRT brt, BRTNODE node, int childnum, TOKULOGGER logger) {
void *child_v;
int r = toku_cachetable_maybe_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &child_v);
if (r!=0) return 0;
BRTNODE child = child_v;
DBT key,val;
BRT_CMD_S cmd;
while (0 == toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
r = brtnode_put(brt, child, &cmd, logger, WEAK);
if (r==EAGAIN) break;
if (r!=0) goto died;
r=toku_fifo_deq(BNC_BUFFER(node, childnum));
if (r!=0) goto died;
}
return unpin_brtnode(brt, child);
died:
unpin_brtnode(brt, child);
return r;
}
// If the buffers are too big, push stuff down. The subchild may need to be split, in which case our fanout may get too large.
// When are done, this node is has little enough stuff in its buffers (but the fanout may be too large), and all the descendant
// nodes are properly sized (the buffer sizes and fanouts are all small enough).
static int push_down_if_buffers_too_full(BRT brt, BRTNODE node, TOKULOGGER logger) {
if (node->height==0) return 0; // can't push down for leaf nodes
while (node->u.n.n_bytes_in_buffers > 0 && toku_serialize_brtnode_size(node)>node->nodesize) {
int childnum;
find_heaviest_child(node, &childnum);
void *child_v;
int r = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &child_v, NULL,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
if (r!=0) return r;
BRTNODE child=child_v;
if (0) { died: unpin_brtnode(brt, child); return r; }
BRT_CMD_S cmd;
DBT key,val;
while (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
r=toku_fifo_deq(BNC_BUFFER(node, childnum));
assert(r==0); // we just did a peek, so the buffer must be nonempty
r=brtnode_put(brt, child, &cmd, logger, WEAK);
if (r!=EAGAIN && r!=0) goto died;
if (r==EAGAIN) {
// Weak pushes ran out of steam. Now do a strong push if there is still something in the buffer.
if (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
r=brtnode_put(brt, child, &cmd, logger, STRONG);
if (r!=0) goto died;
r=toku_fifo_deq(BNC_BUFFER(node, childnum));
if (r!=0) goto died;
// Now it's possible that the child must be split. (Or maybe the child managed to flush stuff to our grandchildren)
int n_children_replacing_child;
r=maybe_split_nonroot(brt, node, childnum, child, &n_children_replacing_child, logger);
if (r!=0) return r; // don't go to died since that unpins
int i;
for (i=0; i<n_children_replacing_child; i++) {
r=weak_push_to_child(brt, node, childnum+i, logger);
if (r!=0) return r;
}
// we basically pushed as much as we could to that child
}
}
}
}
return 0;
}
static int nonleaf_node_is_too_wide (BRT, BRTNODE);
static int maybe_fixup_fat_child(BRT brt, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger) // If the node is too big then deal with it. Unpin the child (or children if it splits) NODE may be too big at the end
{
int r = push_down_if_buffers_too_full(brt, child, logger);
if (r!=0) return r;
// now the child may have too much fanout.
if (child->height>0) {
if (nonleaf_node_is_too_wide(brt, child)) {
int n_new_nodes; BRTNODE *new_nodes; DBT *splitks;
if ((r=split_nonleaf_node(brt, child, &n_new_nodes, &new_nodes, &splitks))) return r;
int i;
int old_n_children = node->u.n.n_children;
FIFO old_fifo = BNC_BUFFER(node, childnum);
node->u.n.childinfos = toku_realloc(node->u.n.childinfos, (old_n_children+n_new_nodes-1) * sizeof(struct brt_nonleaf_childinfo));
// slide the children over
for (i=old_n_children-1; i>childnum; i--)
node->u.n.childinfos[i+n_new_nodes-1] = node->u.n.childinfos[i];
// fill in the new children
for (; i<childnum+n_new_nodes-1; i++) {
node->u.n.childinfos[i] = (struct brtnode_nonleaf_childinfo) { .subtree_fingerprint = 0,
.diskoff = new_nodes[i-childnum]->thisnodename,
.n_bytes_in_buffer = 0 };
r=toku_fifo_create(&BNC_BUFFER(node, i));
}
// slide the keys over
node->u.n.childkeys = toku_realloc(node->u.n.childkeys, (old_n_children+n_new_nodes-2 ) * sizeof(node->u.n.childkeys[0]));
for (i=node->u.n.n_children; cnum>=childnum; cnum--) {
node->u.n.childkeys[cnum+n_new_nodes-1] = node->u.n.childkeys[cnum];
}
// fix up fingerprints
for (i=0; i<n_new_nodes; i++) {
fixup_child_fingerprint(node, childnum+i, new_nodes[i], brt, logger);
}
toku_free(new_nodes);
// now everything in the fifos must be put again
BRT_CMD_S cmd;
DBT key,val;
while (0=toku_fifo_peek_deq_cmdstruct(old_fifo, &cmd, &key, &val)) {
for (i=childnum; i<childnum+n_new_nodes-1; i++) {
int cmp = brt_compare_pivot(t, cmd->u.id.key, 0, node->u.n.childkeys[i]);
if (cmp<=0) {
r=toku_fifo_enq_cmdstruct(BNC_BUFFER(node, i), cmd);
if (r!=0) return r;
if (cmd->type!=DELETE || 0==(t->flags&TOKU_DB_DUPSORT)) goto filled; // we only need to put one in
}
}
r=toku_fifo_enq_cmdstruct(BNC_BUFFER(node, i), cmd);
if (r!=0) return r;
filled: /*nothing*/;
}
r=toku_fifo_free(&old_fifo);
if (r!=0) return r;
}
} else {
abort(); // if a leaf is too fat need to split it.
}
return 0;
}
// There are two kinds of puts: // There are two kinds of puts:
// A "weak" put that is guaranteed to trigger no I/O, and will not leaf the node overfull. // A "weak" put that is guaranteed to trigger no I/O, and will not leaf the node overfull.
...@@ -507,40 +650,6 @@ static int brt_leaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS ...@@ -507,40 +650,6 @@ static int brt_leaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS
return EINVAL; // if none of the cases match, then the command is messed up. return EINVAL; // if none of the cases match, then the command is messed up.
} }
static int brt_leaf_strong_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger) {
FILENUM filenum = toku_cachefile_filenum(t->cf);
switch (cmd->type) {
case BRT_INSERT: {
int r = toku_pma_strong_insert_or_replace(node->u.l.buffer,
cmd->u.id.key, cmd->u.id.val,
logger, cmd->xid,
filenum, node->thisnodename, node->rand4fingerprint, &node->local_fingerprint,
&node->log_lsn, &node->u.l.n_bytes_in_buffer);
assert(r==0);
node->dirty=1;
return 0;
}
case BRT_DELETE: {
int r = toku_pma_delete_fixupsize(node->u.l.buffer, cmd->u.id.key, (DBT*)0,
logger, cmd->xid, node->thisnodename,
node->rand4fingerprint, &node->local_fingerprint, &node->log_lsn, &node->u.l.n_bytes_in_buffer);
if (r==0) node->dirty=1;
if (r==DB_NOTFOUND) r=0;
return r;
}
case BRT_DELETE_BOTH: {
int r = toku_pma_delete_fixupsize(node->u.l.buffer, cmd->u.id.key, cmd->u.id.val,
logger, cmd->xid, node->thisnodename,
node->rand4fingerprint, &node->local_fingerprint, &node->log_lsn,&node->u.l.n_bytes_in_buffer);
if (r == 0) node->dirty = 1;
if (r == DB_NOTFOUND) r=0;
return r;
}
case BRT_NONE: return 0;
}
return EINVAL; // if none of the cases match, then the command is messed up.
}
// Put an command in a particular child's fifo. // Put an command in a particular child's fifo.
// If weak_p then do it without doing I/O or overfilling the child. // If weak_p then do it without doing I/O or overfilling the child.
// If the child is in main memory and we can do a weak put on the child, then push into the child. // If the child is in main memory and we can do a weak put on the child, then push into the child.
...@@ -561,7 +670,7 @@ static int brt_nonleaf_put_cmd_to_child (BRT t, BRTNODE node, int childnum, BRT_ ...@@ -561,7 +670,7 @@ static int brt_nonleaf_put_cmd_to_child (BRT t, BRTNODE node, int childnum, BRT_
r = unpin_brtnode(t, child); r = unpin_brtnode(t, child);
if (r!=0) return r; // node is still OK if (r!=0) return r; // node is still OK
} else if (r==0) { } else if (r==0) {
return maybe_fixup_fat_child(t, node, childnum, child, logger); // If the node is too big then deal with it. Unpin the child. NODE may be too big return maybe_fixup_fat_child(t, node, childnum, child, logger); // If the node is too big then deal with it. Unpin the child. NODE may be too big. I think the only way a node can get fat is if weak_p==STRONG.
} else { } else {
unpin_brtnode(t, child); unpin_brtnode(t, child);
return r; // node is still OK return r; // node is still OK
...@@ -650,11 +759,9 @@ static int brt_nonleaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, ...@@ -650,11 +759,9 @@ static int brt_nonleaf_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
return EINVAL; return EINVAL;
} }
// Put the command into the node. For leaf nodes, that means execute the command. // Put the command into the node.
// For internal nodes, just put it into the fifo, unless the appropriate child is in main memory and has a place to put the command without getting too big. // If weak_p is set then neither the node nor any descendants will get too big, and no I/O will occur.
// The node could end up overfull (but the children cannot get too big) // if !weak_p then I/O could occur and the node could end up with too much fanout. (But the children will all be properly sized)
// However, if you precalculate that the node is big enough, then the node will not get too big.
// (This implies that none of the children will overflow since we precalculate before calling this function on a child.)
static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p) { static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS weak_p) {
if (node->height==0) { if (node->height==0) {
return brt_leaf_put(t, node, cmd, logger, weak_p); return brt_leaf_put(t, node, cmd, logger, weak_p);
...@@ -662,6 +769,7 @@ static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS ...@@ -662,6 +769,7 @@ static int brtnode_put (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger, WS
return brt_nonleaf_put(t, node, cmd, logger, weak_p); return brt_nonleaf_put(t, node, cmd, logger, weak_p);
} }
} }
#ifdef FOO
static void verify_local_fingerprint_nonleaf (BRTNODE node) { static void verify_local_fingerprint_nonleaf (BRTNODE node) {
u_int32_t fp=0; u_int32_t fp=0;
...@@ -1083,55 +1191,12 @@ static int brt_init_new_root(BRT brt, int n_new_nodes, BRTNODE *new_nodes, DBT * ...@@ -1083,55 +1191,12 @@ static int brt_init_new_root(BRT brt, int n_new_nodes, BRTNODE *new_nodes, DBT *
return 0; return 0;
} }
static int nonleaf_node_is_too_wide (BRT, BRTNODE);
static int split_nonleaf_node(BRT, int *n_new_nodes, BRTNODE **new_nodes, DBT **splitks); static int split_nonleaf_node(BRT, int *n_new_nodes, BRTNODE **new_nodes, DBT **splitks);
static int leaf_node_is_too_full (BRT, BRTNODE); static int leaf_node_is_too_full (BRT, BRTNODE);
// If CHILD is too wide, split it, and create a new node with the new children. Unpin CHILD or the new children (even if something goes wrong).
// If it does split, unpin the new root node also.
static int maybe_split_root(BRT brt, BRTNODE child, CACHEKEY *rootp, TOKULOGGER logger);
// if CHILD is too wide, split it, and fix up NODE. Either way, unpin the child or resulting children (even if it fails do the unpin)
static int maybe_split_nonroot (BRT brt, BRTNODE node, int childnum, BRTNODE child, TOKULOGGER logger);
// push things down into node's children (and into their children and so forth) but don't make any descendant too big. // push things down into node's children (and into their children and so forth) but don't make any descendant too big.
static int push_down_without_overfilling (BRT brt, BRTNODE node, TOKULOGGER logger); static int push_down_without_overfilling (BRT brt, BRTNODE node, TOKULOGGER logger);
// If the buffers are too big, push stuff down. The subchild may need to be split, in which case our fanout may get too large.
// When are done, this node is has little enough stuff in its buffers (but the fanout may be too large), and all the descendant
// nodes are properly sized (the buffer sizes and fanouts are all small enough).
static int push_down_if_buffers_too_full(BRT brt, BRTNODE node, TOKULOGGER logger) {
if (node->height==0) return 0; // can't push down for leaf nodes
while (node->u.n.n_bytes_in_buffers > 0 && toku_serialize_brtnode_size(node)>node->nodesize) {
int childnum;
find_heaviest_child(node, &childnum);
void *child_v;
int r = toku_cachetable_get_and_pin(brt->cf, BNC_DISKOFF(node, childnum), &child_v, NULL,
toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt);
if (r!=0) return r;
BRTNODE child=child_v;
if (0) { died: unpin_brtnode(brt, child); return r; }
BRT_CMD_S cmd;
DBT key,val;
while (0==toku_fifo_peek_cmdstruct(BNC_BUFFER(node, childnum), &cmd, &key, &val)) {
r=toku_fifo_deq(BNC_BUFFER(node, childnum));
assert(r==0); // we just did a peek, so the buffer must be nonempty
r=brtnode_put_cmd_no_io(brt, child, &cmd, logger); if (r!=0) goto died;
if (toku_serialize_brtnode_size(child)>child->nodesize) {
// The child got too big, so do the fixup on the child
r = push_down_if_buffers_too_full(brt, child, logger); if (r!=0) goto died;
// After the split_nonroot call, the children are all unpinned...
r = maybe_split_nonroot(brt, node, childnum, child, logger);
if (r!=0) return r; // so on error just return r instead of going to died.
r =push_down_without_overfilling(brt, node, logger);
if (r!=0) return r;
// We hope that NODE is now not too full. One can imagine cases where it is too full, however, so we
// stop popping from this fifo, and go around the outer while loop to look at the node to see if it is too big again.
break;
}
}
}
return 0;
}
// Push data toward a child. If the child gets too big then the child will push down or split. // Push data toward a child. If the child gets too big then the child will push down or split.
// If a split happens, then return immediately so that we can check to see if NODE needs to be split // If a split happens, then return immediately so that we can check to see if NODE needs to be split
static int flush_toward_child (BRT brt, BRTNODE node, int childnum, TOKULOGGER logger); static int flush_toward_child (BRT brt, BRTNODE node, int childnum, TOKULOGGER logger);
...@@ -1143,7 +1208,7 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER ...@@ -1143,7 +1208,7 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER
maybe_reshape_internal_node: maybe_reshape_internal_node:
while (nonleaf_node_is_too_wide(brt, node)) { while (nonleaf_node_is_too_wide(brt, node)) {
int n_new_nodes; BRTNODE *new_nodes; DBT *splitks; int n_new_nodes; BRTNODE *new_nodes; DBT *splitks;
if ((r=split_nonleaf_node(brt, &n_new_nodes, &new_nodes, &splitks))) return r; if ((r=split_nonleaf_node(brt, node, &n_new_nodes, &new_nodes, &splitks))) return r;
if ((r=brt_init_new_root(brt, n_new_nodes, new_nodes, splitks, rootp, logger, &node))) return r; // unpins all the new nodes, which are all small enough if ((r=brt_init_new_root(brt, n_new_nodes, new_nodes, splitks, rootp, logger, &node))) return r; // unpins all the new nodes, which are all small enough
// now node is still possibly too wide, hence the loop // now node is still possibly too wide, hence the loop
} }
...@@ -1160,6 +1225,8 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER ...@@ -1160,6 +1225,8 @@ static int maybe_fixup_root (BRT brt, BRTNODE node, CACHEKEY *rootp, TOKULOGGER
return 0; return 0;
} }
#endif
static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) { static int brt_root_put_cmd(BRT brt, BRT_CMD cmd, TOKULOGGER logger) {
void *node_v; void *node_v;
BRTNODE node; BRTNODE node;
...@@ -1344,6 +1411,7 @@ int toku_brt_dbt_set_value(BRT brt, DBT *ybt, bytevec val, ITEMLEN vallen) { ...@@ -1344,6 +1411,7 @@ int toku_brt_dbt_set_value(BRT brt, DBT *ybt, bytevec val, ITEMLEN vallen) {
return r; return r;
} }
#ifdef FOO
/* search in a node's child */ /* search in a node's child */
static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, TOKULOGGER logger) { static int brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, DBT *newkey, DBT *newval, TOKULOGGER logger) {
int r, rr; int r, rr;
...@@ -1834,3 +1902,4 @@ int toku_brt_height_of_root(BRT brt, int *height) { ...@@ -1834,3 +1902,4 @@ int toku_brt_height_of_root(BRT brt, int *height) {
r = toku_unpin_brt_header(brt); assert(r==0); r = toku_unpin_brt_header(brt); assert(r==0);
return 0; return 0;
} }
#endif
...@@ -42,6 +42,8 @@ void *toku_realloc(void *, size_t size); ...@@ -42,6 +42,8 @@ void *toku_realloc(void *, size_t size);
*/ */
#define MALLOC_N(n,v) v = toku_malloc((n)*sizeof(*v)) #define MALLOC_N(n,v) v = toku_malloc((n)*sizeof(*v))
#define REALLOC_N(n,v) v = toku_realloc(v, (n)*sizeof(*v))
/* If you have a type such as /* If you have a type such as
* struct pma *PMA; * struct pma *PMA;
* and you define a corresponding int constant, such as * and you define a corresponding int constant, such as
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment