Commit 586eb63c authored by Bradley C. Kuszmaul, committed by Yoni Fogel

Merge {{{tokudb.1381}}} onto the main line.

Fixes #1381.  Addresses #1393 which was also fixed in this branch.

{{{
svn merge -r 8803:8831 https://svn.tokutek.com/tokudb/toku/tokudb.1381
}}}
and delete the 1381 branch.


git-svn-id: file:///svn/toku/tokudb@8832 c7de825b-a66e-492c-adef-691d508d4ae1
parent a36a55c4
......@@ -47,19 +47,19 @@ build.tdb: $(TARGET_TDB) $(SCANSCAN_TDB)
check: check-default check-xfast check-x check-no-rolltmp
check-default: $(TARGET_TDB)
$(VGRIND) ./$(TARGET_TDB) $(QUIET) $(SUMMARIZE_CMD)
$(VGRIND) ./$(TARGET_TDB) $(VERBVERBOSE) $(SUMMARIZE_CMD)
check-x: $(TARGET_TDB)
$(VGRIND) ./$(TARGET_TDB) $(QUIET) -x --xcount 1000 --periter 100000 --env x.dir 10 $(SUMMARIZE_CMD)
$(VGRIND) ./$(TARGET_TDB) $(VERBVERBOSE) -x --xcount 1000 --periter 100000 --env x.dir 10 $(SUMMARIZE_CMD)
# A fast transaction test that detects #455.
check-xfast: $(TARGET_TDB)
./$(TARGET_TDB) $(QUIET) --noserial -x --valsize 1000 --cachesize 8000000 --xcount 1000 --periter 20000 --env xfast.dir 1 $(SUMMARIZE_CMD)
./$(TARGET_TDB) $(VERBVERBOSE) --noserial -x --valsize 1000 --cachesize 8000000 --xcount 1000 --periter 20000 --env xfast.dir 1 $(SUMMARIZE_CMD)
# A relatively fast test that detects #853 (don't log changes to a dictionary created in the same txn)
check-no-rolltmp: $(TARGET_TDB)
./$(TARGET_TDB) --env no-rolltmp.dir --singlex --nolog --check_small_rolltmp $(SUMMARIZE_CMD)
./$(TARGET_TDB) $(VERBVERBOSE) --env no-rolltmp.dir --singlex --nolog --check_small_rolltmp $(SUMMARIZE_CMD)
clean:
rm -f $(TARGETS)
......
......@@ -173,8 +173,11 @@ struct brt {
void *skey,*sval; /* Used for DBT return values. */
OMT txns; // transactions that are using this OMT (note that the transaction checks the cf also)
u_int64_t txn_that_created; // which txn created it. Use 0 if no such txn.
// If a transaction created this BRT, which one?
// If a transaction locked the BRT when it was empty, which transaction? (Only the latest one matters)
// 0 if no such transaction
TXNID txnid_that_created_or_locked_when_empty;
};
/* serialization code */
......
......@@ -475,7 +475,7 @@ static int log_and_save_brtenq(TOKULOGGER logger, BRT t, BRTNODE node, int child
u_int32_t new_fingerprint = old_fingerprint + fdiff;
//printf("%s:%d node=%lld fingerprint old=%08x new=%08x diff=%08x xid=%lld\n", __FILE__, __LINE__, node->thisnodename, old_fingerprint, new_fingerprint, fdiff, (long long)xid);
*fingerprint = new_fingerprint;
if (t->txn_that_created != xid) {
if (t->txnid_that_created_or_locked_when_empty != xid) {
int r = toku_log_brtenq(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), node->thisnodename, childnum, xid, type, keybs, databs);
if (r!=0) return r;
}
......@@ -979,7 +979,7 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
u_int32_t delta = toku_calc_fingerprint_cmd(type, xid, key, keylen, data, datalen);
u_int32_t new_from_fingerprint = old_from_fingerprint - node->rand4fingerprint*delta;
if (r!=0) return r;
if (t->txn_that_created != xid) {
if (t->txnid_that_created_or_locked_when_empty != xid) {
r = toku_log_brtdeq(logger, &node->log_lsn, 0, fnum, node->thisnodename, n_children_in_a);
if (r!=0) return r;
}
......@@ -1521,7 +1521,7 @@ brt_leaf_apply_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
//printf(" got "); print_leafentry(stdout, new_le); printf("\n");
if (le && new_le) {
if (t->txn_that_created != cmd->xid) {
if (t->txnid_that_created_or_locked_when_empty != cmd->xid) {
if ((r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx))) goto return_r;
if ((r = toku_log_insertleafentry(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), node->thisnodename, idx, new_le))) goto return_r;
}
......@@ -1544,7 +1544,7 @@ brt_leaf_apply_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
if (le) {
// It's there, note that it's gone and remove it from the mempool
if (t->txn_that_created != cmd->xid) {
if (t->txnid_that_created_or_locked_when_empty != cmd->xid) {
if ((r = toku_log_deleteleafentry(logger, &node->log_lsn, 0, filenum, node->thisnodename, idx))) goto return_r;
}
......@@ -1559,7 +1559,7 @@ brt_leaf_apply_cmd_once (BRT t, BRTNODE node, BRT_CMD cmd, TOKULOGGER logger,
if (new_le) {
if ((r = toku_omt_insert_at(node->u.l.buffer, new_le, idx))) goto return_r;
if (t->txn_that_created != cmd->xid) {
if (t->txnid_that_created_or_locked_when_empty != cmd->xid) {
if ((r = toku_log_insertleafentry(logger, &node->log_lsn, 0, toku_cachefile_filenum(t->cf), node->thisnodename, idx, new_le))) goto return_r;
}
......@@ -2551,7 +2551,7 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn)
// Effect: Insert the key-val pair into brt.
{
int r;
if (txn && (brt->txn_that_created != toku_txn_get_txnid(txn))) {
if (txn && (brt->txnid_that_created_or_locked_when_empty != toku_txn_get_txnid(txn))) {
toku_cachefile_refup(brt->cf);
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
int need_data = (brt->flags&TOKU_DB_DUPSORT)!=0; // dupsorts don't need the data part
......@@ -2573,7 +2573,7 @@ int toku_brt_insert (BRT brt, DBT *key, DBT *val, TOKUTXN txn)
int toku_brt_delete(BRT brt, DBT *key, TOKUTXN txn) {
int r;
if (txn && (brt->txn_that_created != toku_txn_get_txnid(txn))) {
if (txn && (brt->txnid_that_created_or_locked_when_empty != toku_txn_get_txnid(txn))) {
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
toku_cachefile_refup(brt->cf);
r = toku_logger_save_rollback_cmddelete(txn, toku_txn_get_txnid(txn), toku_cachefile_filenum(brt->cf), keybs);
......@@ -2844,7 +2844,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char
}
t->database_name = malloced_name;
t->db = db;
t->txn_that_created = 0; // Uses 0 for no transaction.
t->txnid_that_created_or_locked_when_empty = 0; // Uses 0 for no transaction.
{
int fd = -1;
BOOL did_create = FALSE;
......@@ -2858,7 +2858,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char
mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO;
r = toku_logger_log_fcreate(txn, fname_in_env, mode);
if (r != 0) goto died_after_open;
t->txn_that_created = toku_txn_get_txnid(txn);
t->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn);
}
r = toku_logger_log_fopen(txn, fname_in_env, toku_cachefile_filenum(t->cf));
}
......@@ -4076,7 +4076,7 @@ int toku_brt_lookup (BRT brt, DBT *k, DBT *v) {
int toku_brt_delete_both(BRT brt, DBT *key, DBT *val, TOKUTXN txn) {
//{ unsigned i; printf("del %p keylen=%d key={", brt->db, key->size); for(i=0; i<key->size; i++) printf("%d,", ((char*)key->data)[i]); printf("} datalen=%d data={", val->size); for(i=0; i<val->size; i++) printf("%d,", ((char*)val->data)[i]); printf("}\n"); }
int r;
if (txn && (brt->txn_that_created != toku_txn_get_txnid(txn))) {
if (txn && (brt->txnid_that_created_or_locked_when_empty != toku_txn_get_txnid(txn))) {
BYTESTRING keybs = {key->size, toku_memdup_in_rollback(txn, key->data, key->size)};
BYTESTRING databs = {val->size, toku_memdup_in_rollback(txn, val->data, val->size)};
toku_cachefile_refup(brt->cf);
......@@ -4315,3 +4315,25 @@ void toku_brt_destroy(void) {
toku_brt_lock_destroy();
}
// Effect: Report whether BRT currently contains no entries.
//  Opens a cursor on BRT, tries to position it on the first entry, and closes the cursor again.
// Returns: 1 only if the cursor opened and closed without error and the
//  first-entry lookup reported DB_NOTFOUND.  Returns 0 in every other case;
//  in particular, any error is treated conservatively as "not empty".
static int
brt_is_empty (BRT brt, TOKULOGGER logger) {
    BRT_CURSOR c;
    if (toku_brt_cursor(brt, &c, 1) != 0) {
        return 0; // could not even open a cursor: claim "not empty"
    }
    // Remember the lookup result, but always close the cursor before deciding.
    int first_result = brt_cursor_first(c, NULL, NULL, logger);
    int close_result = toku_brt_cursor_close(c);
    return (close_result == 0 && first_result == DB_NOTFOUND) ? 1 : 0;
}
// Effect: Called when TXN has obtained a table lock on BRT.
//  If the BRT is empty at this moment, remember TXN's id in
//  brt->txnid_that_created_or_locked_when_empty, take a reference on the
//  BRT's cachefile, and save a tablelock_on_empty_table rollback entry for the file.
//  If the BRT is not empty (or emptiness could not be determined), do nothing.
// Returns: 0 when nothing was recorded, otherwise whatever saving the rollback entry returns.
int
toku_brt_note_table_lock (BRT brt, TOKUTXN txn)
{
    if (!brt_is_empty(brt, toku_txn_logger(txn))) {
        return 0;
    }
    brt->txnid_that_created_or_locked_when_empty = toku_txn_get_txnid(txn);
    // Take the reference before saving the rollback entry that names this file.
    toku_cachefile_refup(brt->cf);
    return toku_logger_save_rollback_tablelock_on_empty_table(txn, toku_cachefile_filenum(brt->cf));
}
......@@ -121,4 +121,8 @@ void toku_pwrite_lock_destroy(void);
void maybe_preallocate_in_file (int fd, u_int64_t size);
// Effect: If file size is less than SIZE, make it bigger by either doubling it or growing by 16MB whichever is less.
int toku_brt_note_table_lock (BRT brt, TOKUTXN txn);
// Effect: Record the fact that the BRT has a table lock (and thus no other txn will modify it until this txn completes).  As a result, we can limit the amount of information in the rollback data structure.
#endif
......@@ -64,6 +64,8 @@ const struct logtype rollbacks[] = {
NULLFIELD}},
{"rollinclude", 'r', FA{{"BYTESTRING", "fname", 0},
NULLFIELD}},
{"tablelock_on_empty_table", 'L', FA{{"FILENUM", "filenum", 0},
NULLFIELD}},
// {"fclose", 'c', FA{{"FILENUM", "filenum", 0},
// {"BYTESTRING", "fname", 0},
// NULLFIELD}},
......
......@@ -299,3 +299,28 @@ toku_rollback_rollinclude (BYTESTRING bs,
toku_free(fname);
return 0;
}
// Effect: Roll back a "table lock taken on an empty table" entry.
//  The table was empty when it was locked, and the transaction may have
//  modified it since, so rolling back means making the table empty again.
//  This is done by truncating the BRT that corresponds to FILENUM.
// Requires: The cachefile for FILENUM can be found, and the BRT is still in
//  txn->open_brts (both are asserted).
// Returns: The error from the truncate if it failed; otherwise the result of
//  closing the cachefile.  The cachefile is closed in either case.
int
toku_rollback_tablelock_on_empty_table (FILENUM filenum, TOKUTXN txn, YIELDF UU(yield), void* UU(yield_v))
{
    CACHEFILE cf;
    int r = toku_cachefile_of_filenum(txn->logger->ct, filenum, &cf);
    assert(r==0);

    OMTVALUE brtv=NULL;
    r = toku_omt_find_zero(txn->open_brts, find_brt_from_filenum, &filenum, &brtv, NULL, NULL);
    assert(r==0); // we cannot handle the case where the table is already closed... Is that an important case?  If it is important, we could do something about it by creating a "truncate" message that propagates down the tree, removing everything.

    BRT brt = brtv;
    int truncate_r = toku_brt_truncate(brt);

    // Always close the cachefile, even if the truncate failed.
    // NOTE(review): this close presumably balances the toku_cachefile_refup done in toku_brt_note_table_lock -- confirm.
    int close_r = toku_cachefile_close(&cf, toku_txn_logger(txn));

    // Do not let a successful close hide a failed truncate.
    return (truncate_r != 0) ? truncate_r : close_r;
}
// Effect: Commit a "table lock taken on an empty table" rollback entry.
//  Nothing has to be undone on commit, so this simply hands FILENUM to
//  do_nothing_with_filenum and returns its result.
int
toku_commit_tablelock_on_empty_table (FILENUM filenum, TOKUTXN txn, YIELDF UU(yield), void* UU(yield_v))
{
    int r = do_nothing_with_filenum(txn, filenum);
    return r;
}
......@@ -33,6 +33,7 @@ SRCS = $(sort $(wildcard *.c))
TDB_TESTS = $(patsubst %.c,%.tdb$(BINSUF),$(SRCS))
BDB_DONTRUN_TESTS = \
bug1381 \
bug627 \
test_abort1 \
keyrange \
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include "test.h"
/* Test for #1381: If we insert into a locked empty table, not much goes into the rollback data structure. */
#include <db.h>
#include <sys/stat.h>
#include <memory.h>
/* Run one round of the #1381 experiment.
 *
 * Creates a fresh environment containing an empty dictionary "main", reopens
 * it, and inserts a single row inside a transaction.  If DO_TABLE_LOCK is
 * nonzero, the transaction pre-acquires the table lock before the insert.
 *
 * The transaction's rolltmp_raw_count is sampled before and after the insert:
 *   - with the table lock, the insert must not change it;
 *   - without the table lock, the insert must increase it.
 * The difference (after - before) is stored in *RAW_COUNT.
 *
 * Any failing step aborts the test through CKERR/assert.
 */
void do_1381_maybe_lock (int do_table_lock, u_int64_t *raw_count) {
    int r;
    DB_TXN * const null_txn = 0;

    // Start from a clean environment directory.  Check both steps so that a
    // setup problem is reported here rather than as an obscure failure later.
    r = system("rm -rf " ENVDIR);
    CKERR(r);
    r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
    CKERR(r);

    // Create an empty file
    {
        DB_ENV *env;
        DB *db;

        const int envflags = DB_CREATE|DB_INIT_MPOOL|DB_INIT_TXN|DB_INIT_LOCK|DB_INIT_LOG |DB_THREAD |DB_PRIVATE | DB_RECOVER;

        r = db_env_create(&env, 0);                                           CKERR(r);
        r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO);        CKERR(r);

        r = db_create(&db, env, 0);                                           CKERR(r);
        r = db->open(db, null_txn, "main", 0, DB_BTREE, DB_CREATE, 0666);     CKERR(r);

        r = db->close(db, 0);                                                 CKERR(r);
        r = env->close(env, 0);                                               CKERR(r);
    }
    // Now open the empty file and insert
    {
        DB_ENV *env;
        DB *db;

        const int envflags = DB_INIT_MPOOL|DB_INIT_TXN|DB_INIT_LOCK|DB_INIT_LOG |DB_THREAD |DB_PRIVATE;

        r = db_env_create(&env, 0);                                           CKERR(r);
        r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO);        CKERR(r);

        r = db_create(&db, env, 0);                                           CKERR(r);
        r = db->open(db, null_txn, "main", 0, DB_BTREE, 0, 0666);             CKERR(r);

        DB_TXN *txn;
        r = env->txn_begin(env, 0, &txn, 0);                                  CKERR(r);

        if (do_table_lock) {
            r = db->pre_acquire_table_lock(db, txn);                          CKERR(r);
        }

        // Sample the transaction statistics around the single insert.
        struct txn_stat *s1, *s2;
        r = txn->txn_stat(txn, &s1);                                          CKERR(r);

        {
            // Sizes include the terminating NUL of each string literal.
            DBT key={.data="hi", .size=3};
            DBT val={.data="v",  .size=2};
            r = db->put(db, txn, &key, &val, 0);                              CKERR(r);
        }

        r = txn->txn_stat(txn, &s2);                                          CKERR(r);

        *raw_count = s2->rolltmp_raw_count - s1->rolltmp_raw_count;
        if (do_table_lock) {
            // Locked while empty: the insert must not add to the rollback data.
            assert(s1->rolltmp_raw_count == s2->rolltmp_raw_count);
        } else {
            // No table lock: the insert must add to the rollback data.
            assert(s1->rolltmp_raw_count <  s2->rolltmp_raw_count);
        }

        // The stat structures are released here with toku_free.
        toku_free(s1); toku_free(s2);

        r = txn->commit(txn, 0);                                              CKERR(r);

        r = db->close(db, 0);                                                 CKERR(r);
        r = env->close(env, 0);                                               CKERR(r);
    }
}
/* Run the #1381 experiment twice -- first without and then with the table
 * lock -- and check that the insert generated strictly less rollback data
 * when the table lock was taken. */
void
do_1381 (void) {
    u_int64_t raw_counts[2]; // [0]: no table lock, [1]: table lock pre-acquired
    do_1381_maybe_lock(0, &raw_counts[0]);
    do_1381_maybe_lock(1, &raw_counts[1]);
    assert(raw_counts[0] > raw_counts[1]); // the raw counts should be less for the tablelock case.
}
/* Test entry point: hand the command line to parse_args, run the #1381
 * check, and report success.  Failures abort inside do_1381 via assertions,
 * so reaching the end means the test passed. */
int
test_main (int argc, const char *argv[])
{
    const int success = 0;
    parse_args(argc, argv);
    do_1381();
    return success;
}
......@@ -70,7 +70,7 @@ doit (void) {
gettimeofday(&endt, 0);
long long ninserts = NINSERTS_PER * NFILES;
double diff = (endt.tv_sec - startt.tv_sec) + 1e-6*(endt.tv_usec-startt.tv_usec);
printf("%lld insertions in %9.6fs, %9.3f ins/s \n", ninserts, diff, ninserts/diff);
if (verbose) printf("%lld insertions in %9.6fs, %9.3f ins/s \n", ninserts, diff, ninserts/diff);
}
int
......
......@@ -3311,6 +3311,10 @@ static int toku_db_pre_acquire_table_lock(DB *db, DB_TXN *txn) {
r = toku_lt_acquire_range_write_lock(db->i->lt, db, id_anc,
toku_lt_neg_infinity, toku_lt_neg_infinity,
toku_lt_infinity, toku_lt_infinity);
if (r==0) {
r = toku_brt_note_table_lock(db->i->brt, txn->i->tokutxn); // tell the BRT layer that the table is locked (so that it can reduce the amount of rollback (rolltmp) data.
}
return r;
}
......
......@@ -352,13 +352,13 @@ $(NEWBRT): $(@D)*.[ch]
cd $(@D) && $(MAKE) $(@F)
endif
BIN_FROM_C_FLAGS =$(CFLAGS) $(CPPFLAGS) $(BINOUTPUT)$@ $(LDFLAGS)
BIN_FROM_C_FLAGS_NOLIB=$(CFLAGS) $(CPPFLAGS) $(BINOUTPUT)$@ $(LDFLAGS_NOLIB)
BIN_FROM_C_FLAGS =$(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(BINOUTPUT)$@
BIN_FROM_C_FLAGS_NOLIB=$(CFLAGS) $(CPPFLAGS) $(LDFLAGS_NOLIB) $(BINOUTPUT)$@
%$(BINSUF):%.c $(DEPEND_COMPILE) $(DEPEND_LINK)
$(CC) $< $(BIN_FROM_C_FLAGS)
BIN_FROM_O_FLAGS =$(CFLAGS) $(CPPFLAGS) $(BINOUTPUT)$@ $(LDFLAGS)
BIN_FROM_O_FLAGS_NOLIB=$(CFLAGS) $(CPPFLAGS) $(BINOUTPUT)$@ $(LDFLAGS_NOLIB)
BIN_FROM_O_FLAGS =$(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(BINOUTPUT)$@
BIN_FROM_O_FLAGS_NOLIB=$(CFLAGS) $(CPPFLAGS) $(LDFLAGS_NOLIB) $(BINOUTPUT)$@
%$(BINSUF):%.$(OEXT) $(DEPEND_COMPILE) $(DEPEND_LINK)
$(CC) $< $(BIN_FROM_O_FLAGS)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment