Commit 149ec9d8 authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul Committed by Yoni Fogel

Merge the multihreaded writer changes as

{{{
svn merge -r 5899:5987 https://svn.tokutek.com/tokudb/tokudb
}}}
and resolve the conflicts.


git-svn-id: file:///svn/tokudb.1131b@5988 c7de825b-a66e-492c-adef-691d508d4ae1
parent 9475c0c8
...@@ -18,7 +18,7 @@ TARGETS = $(patsubst %.cpp,%,$(SRCS)) ...@@ -18,7 +18,7 @@ TARGETS = $(patsubst %.cpp,%,$(SRCS))
# GCOV_FLAGS = -fprofile-arcs -ftest-coverage # GCOV_FLAGS = -fprofile-arcs -ftest-coverage
CPPFLAGS = -I../ -I../../include CPPFLAGS = -I../ -I../../include
CXXFLAGS = -Wall $(OPTFLAGS) -g $(GCOV_FLAGS) CXXFLAGS = -Wall $(OPTFLAGS) -g $(GCOV_FLAGS)
LDLIBS = ../../lib/libtokudb_cxx.a ../../lib/libtokudb.a -lz LDLIBS = ../../lib/libtokudb_cxx.a ../../lib/libtokudb.a -lz -lpthread
ifneq ($(OSX),) ifneq ($(OSX),)
VGRIND= VGRIND=
......
...@@ -12,7 +12,7 @@ CXXFLAGS = -Wall -Werror -g $(OPTFLAGS) $(GCOV_FLAGS) ...@@ -12,7 +12,7 @@ CXXFLAGS = -Wall -Werror -g $(OPTFLAGS) $(GCOV_FLAGS)
ifdef BDBDIR ifdef BDBDIR
BDB_CPPFLAGS = -I$(BDBDIR)/include BDB_CPPFLAGS = -I$(BDBDIR)/include
BDB_LDFLAGS = -L$(BDBDIR)/lib -ldb_cxx -lpthread -Wl,-rpath,$(BDBDIR)/lib BDB_LDFLAGS = -L$(BDBDIR)/lib -ldb_cxx -Wl,-rpath,$(BDBDIR)/lib -lpthread
else else
BDB_CPPFLAGS = BDB_CPPFLAGS =
BDB_LDFLAGS = -ldb_cxx -lpthread BDB_LDFLAGS = -ldb_cxx -lpthread
...@@ -47,7 +47,7 @@ clean: ...@@ -47,7 +47,7 @@ clean:
db-benchmark-test-tokudb: ../lib/libtokudb_cxx.a db-benchmark-test-tokudb: ../lib/libtokudb_cxx.a
db-benchmark-test-tokudb: db-benchmark-test.cpp db-benchmark-test-tokudb: db-benchmark-test.cpp
$(CXX) $(CXXFLAGS) -I../include -L../lib -Wl,-rpath,$(PWD)/../lib $< -o $@ -ltokudb -ltokudb_cxx -lz -DDIRSUF=tokudb $(CXX) $(CXXFLAGS) -I../include -L../lib -Wl,-rpath,$(PWD)/../lib $< -o $@ -ltokudb -ltokudb_cxx -DDIRSUF=tokudb -lz -lpthread
db-benchmark-test-bdb: db-benchmark-test.cpp db-benchmark-test-bdb: db-benchmark-test.cpp
$(CXX) $(CXXFLAGS) $(BDB_CPPFLAGS) $(BDB_LDFLAGS) $< -o $@ -DDIRSUF=bdb $(CXX) $(CXXFLAGS) $(BDB_CPPFLAGS) $(BDB_LDFLAGS) $< -o $@ -DDIRSUF=bdb
...@@ -23,13 +23,13 @@ CFLAGS = -Wall -Werror -g $(OPTFLAGS) $(GCOV_FLAGS) $(PROF_FLAGS) ...@@ -23,13 +23,13 @@ CFLAGS = -Wall -Werror -g $(OPTFLAGS) $(GCOV_FLAGS) $(PROF_FLAGS)
LDFLAGS += -lpthread LDFLAGS += -lpthread
ifdef BDBDIR ifdef BDBDIR
BDB_CPPFLAGS = -I$(BDBDIR)/include BDB_CPPFLAGS = -I$(BDBDIR)/include
BDB_LDFLAGS = -L$(BDBDIR)/lib -ldb -lpthread -Wl,-rpath,$(BDBDIR)/lib BDB_LDFLAGS = -L$(BDBDIR)/lib -ldb -Wl,-rpath,$(BDBDIR)/lib -lpthread
else else
BDB_CPPFLAGS = BDB_CPPFLAGS =
BDB_LDFLAGS = -ldb BDB_LDFLAGS = -ldb
endif endif
TDB_CPPFLAGS = -I../include TDB_CPPFLAGS = -I../include
TDB_LDFLAGS = -L../lib -ltokudb -lz -Wl,-rpath,$(PWD)/../lib TDB_LDFLAGS = -L../lib -ltokudb -Wl,-rpath,$(PWD)/../lib -lpthread -lz
TARGET_BDB = db-benchmark-test-bdb TARGET_BDB = db-benchmark-test-bdb
TARGET_TDB = db-benchmark-test-tokudb TARGET_TDB = db-benchmark-test-tokudb
......
...@@ -123,7 +123,7 @@ void setup (void) { ...@@ -123,7 +123,7 @@ void setup (void) {
r = db->open(db, tid, dbfilename, NULL, DB_BTREE, DB_CREATE, 0644); r = db->open(db, tid, dbfilename, NULL, DB_BTREE, DB_CREATE, 0644);
if (r!=0) fprintf(stderr, "errno=%d, %s\n", errno, strerror(errno)); if (r!=0) fprintf(stderr, "errno=%d, %s\n", errno, strerror(errno));
assert(r == 0); assert(r == 0);
if (do_transactions && !singlex) { if (do_transactions) {
if (singlex) do_prelock(db, tid); if (singlex) do_prelock(db, tid);
else { else {
r=tid->commit(tid, 0); r=tid->commit(tid, 0);
......
...@@ -36,7 +36,7 @@ FORMAT=-Wmissing-format-attribute ...@@ -36,7 +36,7 @@ FORMAT=-Wmissing-format-attribute
endif endif
CFLAGS = -Wall -Wextra -Wcast-align -Wbad-function-cast -Wmissing-noreturn $(FORMAT) $(OPTFLAGS) -g3 -ggdb3 $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS) $(SHADOW) $(VISIBILITY) CFLAGS = -Wall -Wextra -Wcast-align -Wbad-function-cast -Wmissing-noreturn $(FORMAT) $(OPTFLAGS) -g3 -ggdb3 $(GCOV_FLAGS) $(PROF_FLAGS) -Werror $(FPICFLAGS) $(SHADOW) $(VISIBILITY)
LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -lz LDFLAGS = $(OPTFLAGS) -g $(GCOV_FLAGS) $(PROF_FLAGS) -lz -lpthread
CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_XOPEN_SOURCE=500 CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_XOPEN_SOURCE=500
# Add -Wconversion # Add -Wconversion
...@@ -77,6 +77,7 @@ BRT_SOURCES = \ ...@@ -77,6 +77,7 @@ BRT_SOURCES = \
ybt \ ybt \
x1764 \ x1764 \
trace_mem \ trace_mem \
threadpool \
# keep this line so I can ha vea \ on the previous line # keep this line so I can ha vea \ on the previous line
OFILES = newbrt.o $(CYG_ADD_LIBZ) OFILES = newbrt.o $(CYG_ADD_LIBZ)
......
...@@ -183,8 +183,8 @@ struct brtenv { ...@@ -183,8 +183,8 @@ struct brtenv {
// SPINLOCK checkpointing; // SPINLOCK checkpointing;
}; };
extern cachetable_flush_func_t toku_brtnode_flush_callback, toku_brtheader_flush_callback; extern void toku_brtnode_flush_callback(), toku_brtheader_flush_callback();
extern cachetable_fetch_func_t toku_brtnode_fetch_callback, toku_brtheader_fetch_callback; extern int toku_brtnode_fetch_callback(), toku_brtheader_fetch_callback();
extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header); extern int toku_read_and_pin_brt_header (CACHEFILE cf, struct brt_header **header);
extern int toku_unpin_brt_header (BRT brt); extern int toku_unpin_brt_header (BRT brt);
extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash); extern CACHEKEY* toku_calculate_root_offset_pointer (BRT brt, u_int32_t *root_hash);
......
...@@ -3256,18 +3256,6 @@ static inline int brt_cursor_copyout(BRT_CURSOR cursor, DBT *key, DBT *val) { ...@@ -3256,18 +3256,6 @@ static inline int brt_cursor_copyout(BRT_CURSOR cursor, DBT *key, DBT *val) {
return r; return r;
} }
static inline int brt_cursor_copyout_with_dat(BRT_CURSOR cursor, DBT *key, DBT *val,
BRT pdb, DBT* dat, DBT* dat_source) {
int r = 0;
void** key_staticp = cursor->is_temporary_cursor ? &cursor->brt->skey : &cursor->skey;
void** val_staticp = cursor->is_temporary_cursor ? &cursor->brt->sval : &cursor->sval;
void** dat_staticp = &pdb->sval;
r = toku_dbt_set_three_values(key, (bytevec*)&cursor->key.data, cursor->key.size, key_staticp, FALSE,
val, (bytevec*)&cursor->val.data, cursor->val.size, val_staticp, FALSE,
dat, (bytevec*)&dat_source->data, dat_source->size, dat_staticp, FALSE);
return r;
}
int toku_brt_dbt_set(DBT* key, DBT* key_source) { int toku_brt_dbt_set(DBT* key, DBT* key_source) {
int r = toku_dbt_set_value(key, (bytevec*)&key_source->data, key_source->size, NULL, FALSE); int r = toku_dbt_set_value(key, (bytevec*)&key_source->data, key_source->size, NULL, FALSE);
return r; return r;
......
// A read lock is acquired by threads that get and pin an entry in the
// cachetable. A write lock is acquired by the writer thread when an entry
// is evicted from the cachetable and is being written storage.
// Properties:
// 1. multiple readers, no writers
// 2. one writer at a time
// 3. pending writers have priority over pending readers
// An external mutex must be locked when using these functions. An alternate
// design would bury a mutex into the rwlock itself. While this may
// increase parallelism at the expense of single thread performance, we
// are experimenting with a single higher level lock.
typedef struct ctpair_rwlock *CTPAIR_RWLOCK;
struct ctpair_rwlock {
int pinned; // the number of readers
int want_pin; // the number of blocked readers
pthread_cond_t wait_pin;
int writer; // the number of writers
int want_write; // the number of blocked writers
pthread_cond_t wait_write;
};
// initialize a read write lock
static void ctpair_rwlock_init(CTPAIR_RWLOCK rwlock) {
int r;
rwlock->pinned = rwlock->want_pin = 0;
r = pthread_cond_init(&rwlock->wait_pin, 0); assert(r == 0);
rwlock->writer = rwlock->want_write = 0;
r = pthread_cond_init(&rwlock->wait_write, 0); assert(r == 0);
}
// destroy a read write lock
static void ctpair_rwlock_destroy(CTPAIR_RWLOCK rwlock) {
int r;
assert(rwlock->pinned == 0 && rwlock->want_pin == 0);
assert(rwlock->writer == 0 && rwlock->want_write == 0);
r = pthread_cond_destroy(&rwlock->wait_pin); assert(r == 0);
r = pthread_cond_destroy(&rwlock->wait_write); assert(r == 0);
}
// obtain a read lock
// expects: mutex is locked
static inline void ctpair_read_lock(CTPAIR_RWLOCK rwlock, pthread_mutex_t *mutex) {
if (rwlock->writer || rwlock->want_write) {
rwlock->want_pin++;
while (rwlock->writer || rwlock->want_write) {
int r = pthread_cond_wait(&rwlock->wait_pin, mutex); assert(r == 0);
}
rwlock->want_pin--;
}
rwlock->pinned++;
}
// release a read lock
// expects: mutex is locked
static inline void ctpair_read_unlock(CTPAIR_RWLOCK rwlock) {
rwlock->pinned--;
if (rwlock->pinned == 0 && rwlock->want_write) {
int r = pthread_cond_signal(&rwlock->wait_write); assert(r == 0);
}
}
// obtain a write lock
// expects: mutex is locked
static inline void ctpair_write_lock(CTPAIR_RWLOCK rwlock, pthread_mutex_t *mutex) {
if (rwlock->pinned || rwlock->writer) {
rwlock->want_write++;
while (rwlock->pinned || rwlock->writer) {
int r = pthread_cond_wait(&rwlock->wait_write, mutex); assert(r == 0);
}
rwlock->want_write--;
}
rwlock->writer++;
}
// release a write lock
// expects: mutex is locked
static inline void ctpair_write_unlock(CTPAIR_RWLOCK rwlock) {
rwlock->writer--;
if (rwlock->writer == 0) {
if (rwlock->want_write) {
int r = pthread_cond_signal(&rwlock->wait_write); assert(r == 0);
} else if (rwlock->want_pin) {
int r = pthread_cond_broadcast(&rwlock->wait_pin); assert(r == 0);
}
}
}
// returns: the number of readers
static inline int ctpair_pinned(CTPAIR_RWLOCK rwlock) {
return rwlock->pinned;
}
// returns: the number of writers
static inline int ctpair_writers(CTPAIR_RWLOCK rwlock) {
return rwlock->writer;
}
// returns: the sum of the number of readers, pending readers, writers, and
// pending writers
static inline int ctpair_users(CTPAIR_RWLOCK rwlock) {
return rwlock->pinned + rwlock->want_pin + rwlock->writer + rwlock->want_write;
}
// When objects are evicted from the cachetable, they are written to storage by a
// thread in a thread pool. The pair's are placed onto a write queue that feeds
// the thread pool.
typedef struct writequeue *WRITEQUEUE;
struct writequeue {
PAIR head, tail; // head and tail of the linked list of pair's
pthread_cond_t wait_read; // wait for read
int want_read; // number of threads waiting to read
pthread_cond_t wait_write; // wait for write
int want_write; // number of threads waiting to write
int ninq; // number of pairs in the queue
char closed; // kicks waiting threads off of the write queue
};
// initialize a writequeue
// expects: the writequeue is not initialized
// effects: the writequeue is set to empty and the condition variable is initialized
static void writequeue_init(WRITEQUEUE wq) {
wq->head = wq->tail = 0;
int r;
r = pthread_cond_init(&wq->wait_read, 0); assert(r == 0);
wq->want_read = 0;
r = pthread_cond_init(&wq->wait_write, 0); assert(r == 0);
wq->want_write = 0;
wq->ninq = 0;
wq->closed = 0;
}
// destroy a writequeue
// expects: the writequeue must be initialized and empty
static void writequeue_destroy(WRITEQUEUE wq) {
assert(wq->head == 0 && wq->tail == 0);
int r;
r = pthread_cond_destroy(&wq->wait_read); assert(r == 0);
r = pthread_cond_destroy(&wq->wait_write); assert(r == 0);
}
// close the writequeue
// effects: signal any threads blocked in the writequeue
static void writequeue_set_closed(WRITEQUEUE wq) {
wq->closed = 1;
int r;
r = pthread_cond_broadcast(&wq->wait_read); assert(r == 0);
r = pthread_cond_broadcast(&wq->wait_write); assert(r == 0);
}
// determine whether or not the write queue is empty
// return: 1 if the write queue is empty, otherwise 0
static int writequeue_empty(WRITEQUEUE wq) {
return wq->head == 0;
}
// put a pair on the tail of the write queue
// effects: append the pair to the end of the write queue and signal
// any waiters
static void writequeue_enq(WRITEQUEUE wq, PAIR pair) {
pair->next_wq = 0;
if (wq->tail)
wq->tail->next_wq = pair;
else
wq->head = pair;
wq->tail = pair;
wq->ninq++;
if (wq->want_read) {
int r = pthread_cond_signal(&wq->wait_read); assert(r == 0);
}
}
// get a pair from the head of the write queue
// effects: wait until the writequeue is not empty, remove the first pair from the
// write queue and return it
// returns: 0 if success, otherwise an error
static int writequeue_deq(WRITEQUEUE wq, pthread_mutex_t *mutex, PAIR *pairptr) {
while (writequeue_empty(wq)) {
if (wq->closed)
return EINVAL;
wq->want_read++;
int r = pthread_cond_wait(&wq->wait_read, mutex); assert(r == 0);
wq->want_read--;
}
PAIR pair = wq->head;
wq->head = pair->next_wq;
if (wq->head == 0)
wq->tail = 0;
wq->ninq--;
pair->next_wq = 0;
*pairptr = pair;
return 0;
}
// wait for write
static void writequeue_wait_write(WRITEQUEUE wq, pthread_mutex_t *mutex) {
wq->want_write++;
int r = pthread_cond_wait(&wq->wait_write, mutex); assert(r == 0);
wq->want_write--;
}
// wakeup writers
static void writequeue_wakeup_write(WRITEQUEUE wq) {
if (wq->want_write) {
int r = pthread_cond_broadcast(&wq->wait_write); assert(r == 0);
}
}
/* -*- mode: C; c-basic-offset: 4 -*- */ /* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2007, 2008 Tokutek Inc. All rights reserved."
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <pthread.h>
#include "cachetable.h" #include "cachetable.h"
#include "hashfun.h" #include "hashfun.h"
#include "memory.h" #include "memory.h"
#include "toku_assert.h" #include "toku_assert.h"
#include "brt-internal.h" #include "brt-internal.h"
#include "log_header.h" #include "log_header.h"
#include "threadpool.h"
#include "cachetable-rwlock.h"
#include <malloc.h> #include <malloc.h>
#include <errno.h> // execute the cachetable callbacks using a writer thread 0->no 1->yes
#include <stdio.h> #define DO_WRITER_THREAD 0
#include <string.h> #if DO_WRITER_THREAD
#include <sys/stat.h> static void *cachetable_writer(void *);
#include <unistd.h> #endif
#include <pthread.h>
// use cachetable locks 0->no 1->yes
#define DO_CACHETABLE_LOCK 0 #define DO_CACHETABLE_LOCK 0
// unlock the cachetable while executing callbacks 0->no 1->yes
#define DO_CALLBACK_UNLOCK 0
// simulate long latency write operations with usleep. time in milliseconds.
#define DO_CALLBACK_USLEEP 0
#define DO_CALLBACK_BUSYWAIT 0
//#define TRACE_CACHETABLE //#define TRACE_CACHETABLE
#ifdef TRACE_CACHETABLE #ifdef TRACE_CACHETABLE
#define WHEN_TRACE_CT(x) x #define WHEN_TRACE_CT(x) x
...@@ -28,37 +45,51 @@ ...@@ -28,37 +45,51 @@
typedef struct ctpair *PAIR; typedef struct ctpair *PAIR;
struct ctpair { struct ctpair {
enum typ_tag tag; enum typ_tag tag;
long long pinned;
long size;
char dirty; char dirty;
char verify_flag; // Used in verify_cachetable()
char writing; // writing back
char write_me;
CACHEKEY key; CACHEKEY key;
void *value; void *value;
PAIR next,prev; // In LRU list. long size;
PAIR next,prev; // In LRU list.
PAIR hash_chain; PAIR hash_chain;
CACHEFILE cachefile; CACHEFILE cachefile;
CACHETABLE_FLUSH_FUNC_T flush_callback; CACHETABLE_FLUSH_CALLBACK flush_callback;
CACHETABLE_FETCH_FUNC_T fetch_callback; CACHETABLE_FETCH_CALLBACK fetch_callback;
void *extraargs; void *extraargs;
int verify_flag; /* Used in verify_cachetable() */ LSN modified_lsn; // What was the LSN when modified (undefined if not dirty)
LSN modified_lsn; // What was the LSN when modified (undefined if not dirty) LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch)
LSN written_lsn; // What was the LSN when written (we need to get this information when we fetch)
u_int32_t fullhash; u_int32_t fullhash;
PAIR next_wq; // the ctpair's are linked into a write queue when evicted
struct ctpair_rwlock rwlock; // reader writer lock used to grant an exclusive lock to the writeback thread
struct writequeue *cq; // writers sometimes return ctpair's using this queue
}; };
#include "cachetable-writequeue.h"
static inline void ctpair_destroy(PAIR p) {
ctpair_rwlock_destroy(&p->rwlock);
toku_free(p);
}
// The cachetable is as close to an ENV as we get. // The cachetable is as close to an ENV as we get.
struct cachetable { struct cachetable {
enum typ_tag tag; enum typ_tag tag;
u_int32_t n_in_table; u_int32_t n_in_table;
u_int32_t table_size; u_int32_t table_size;
PAIR *table; PAIR *table; // hash table
PAIR head,tail; // of LRU list. head is the most recently used. tail is least recently used. PAIR head,tail; // of LRU list. head is the most recently used. tail is least recently used.
CACHEFILE cachefiles; CACHEFILE cachefiles; // list of cachefiles that use this cachetable
long size_current, size_limit; long size_current; // the sum of the sizes of the pairs in the cachetable
long size_limit; // the limit to the sum of the pair sizes
long size_writing; // the sum of the sizes of the pairs being written
LSN lsn_of_checkpoint; // the most recent checkpoint in the log. LSN lsn_of_checkpoint; // the most recent checkpoint in the log.
TOKULOGGER logger; TOKULOGGER logger;
#if DO_CACHETABLE_LOCK pthread_mutex_t mutex; // course lock that protects the cachetable, the cachefiles, and the pair's
pthread_mutex_t mutex; struct writequeue wq; // write queue for the writer threads
#endif THREADPOOL threadpool; // pool of writer threads
}; };
// lock the cachetable mutex // lock the cachetable mutex
...@@ -77,6 +108,15 @@ static inline void cachetable_unlock(CACHETABLE ct __attribute__((unused))) { ...@@ -77,6 +108,15 @@ static inline void cachetable_unlock(CACHETABLE ct __attribute__((unused))) {
#endif #endif
} }
// wait for writes to complete if the size in the write queue is 1/2 of
// the cachetable
static inline void cachetable_wait_write(CACHETABLE ct) {
while (2*ct->size_writing > ct->size_current) {
writequeue_wait_write(&ct->wq, &ct->mutex);
}
}
struct fileid { struct fileid {
dev_t st_dev; /* device and inode are enough to uniquely identify a file in unix. */ dev_t st_dev; /* device and inode are enough to uniquely identify a file in unix. */
ino_t st_ino; ino_t st_ino;
...@@ -107,22 +147,33 @@ int toku_create_cachetable(CACHETABLE *result, long size_limit, LSN initial_lsn, ...@@ -107,22 +147,33 @@ int toku_create_cachetable(CACHETABLE *result, long size_limit, LSN initial_lsn,
} }
} }
TAGMALLOC(CACHETABLE, t); TAGMALLOC(CACHETABLE, t);
u_int32_t i; if (t == 0) return ENOMEM;
t->n_in_table = 0; t->n_in_table = 0;
t->table_size = 4; t->table_size = 4;
MALLOC_N(t->table_size, t->table); MALLOC_N(t->table_size, t->table);
assert(t->table); assert(t->table);
t->head = t->tail = 0; t->head = t->tail = 0;
u_int32_t i;
for (i=0; i<t->table_size; i++) { for (i=0; i<t->table_size; i++) {
t->table[i]=0; t->table[i]=0;
} }
t->cachefiles = 0; t->cachefiles = 0;
t->size_current = 0; t->size_current = 0;
t->size_limit = size_limit; t->size_limit = size_limit;
t->size_writing = 0;
t->lsn_of_checkpoint = initial_lsn; t->lsn_of_checkpoint = initial_lsn;
t->logger = logger; t->logger = logger;
#if DO_CACHTABLE_LOCK int r;
int r = pthread_mutex_init(&t->mutex, 0); assert(r == 0); writequeue_init(&t->wq);
r = pthread_mutex_init(&t->mutex, 0); assert(r == 0);
// set the max number of writeback threads to min(4,nprocs_online)
int nprocs = sysconf(_SC_NPROCESSORS_ONLN);
if (nprocs > 4) nprocs = 4;
r = threadpool_create(&t->threadpool, nprocs); assert(r == 0);
#if DO_WRITER_THREAD
threadpool_maybe_add(t->threadpool, cachetable_writer, t);
#endif #endif
*result = t; *result = t;
return 0; return 0;
...@@ -231,7 +282,7 @@ static CACHEFILE remove_cf_from_list (CACHEFILE cf, CACHEFILE list) { ...@@ -231,7 +282,7 @@ static CACHEFILE remove_cf_from_list (CACHEFILE cf, CACHEFILE list) {
} }
} }
static int cachefile_flush_and_remove (CACHEFILE cf); static int cachetable_flush_cachefile (CACHETABLE, CACHEFILE cf, BOOL do_remove);
// Increment the reference count // Increment the reference count
void toku_cachefile_refup (CACHEFILE cf) { void toku_cachefile_refup (CACHEFILE cf) {
...@@ -240,15 +291,21 @@ void toku_cachefile_refup (CACHEFILE cf) { ...@@ -240,15 +291,21 @@ void toku_cachefile_refup (CACHEFILE cf) {
int toku_cachefile_close (CACHEFILE *cfp, TOKULOGGER logger) { int toku_cachefile_close (CACHEFILE *cfp, TOKULOGGER logger) {
CACHEFILE cf = *cfp; CACHEFILE cf = *cfp;
CACHETABLE ct = cf->cachetable;
cachetable_lock(ct);
assert(cf->refcount>0); assert(cf->refcount>0);
cf->refcount--; cf->refcount--;
if (cf->refcount==0) { if (cf->refcount==0) {
int r; int r;
if ((r = cachefile_flush_and_remove(cf))) return r; if ((r = cachetable_flush_cachefile(ct, cf, TRUE))) {
cachetable_unlock(ct);
return r;
}
cf->cachetable->cachefiles = remove_cf_from_list(cf, cf->cachetable->cachefiles);
cachetable_unlock(ct);
r = close(cf->fd); r = close(cf->fd);
assert(r == 0); assert(r == 0);
cf->fd = -1; cf->fd = -1;
cf->cachetable->cachefiles = remove_cf_from_list(cf, cf->cachetable->cachefiles);
if (logger) { if (logger) {
assert(cf->fname); assert(cf->fname);
BYTESTRING bs = {.len=strlen(cf->fname), .data=cf->fname}; BYTESTRING bs = {.len=strlen(cf->fname), .data=cf->fname};
...@@ -260,28 +317,35 @@ int toku_cachefile_close (CACHEFILE *cfp, TOKULOGGER logger) { ...@@ -260,28 +317,35 @@ int toku_cachefile_close (CACHEFILE *cfp, TOKULOGGER logger) {
*cfp=0; *cfp=0;
return r; return r;
} else { } else {
cachetable_unlock(ct);
*cfp=0; *cfp=0;
return 0; return 0;
} }
} }
int toku_cachefile_flush (CACHEFILE cf) { int toku_cachefile_flush (CACHEFILE cf) {
return cachefile_flush_and_remove(cf); CACHETABLE ct = cf->cachetable;
cachetable_lock(ct);
int r = cachetable_flush_cachefile(ct, cf, TRUE);
cachetable_unlock(ct);
return r;
} }
int toku_cachetable_assert_all_unpinned (CACHETABLE t) { int toku_cachetable_assert_all_unpinned (CACHETABLE t) {
u_int32_t i; u_int32_t i;
int some_pinned=0; int some_pinned=0;
cachetable_lock(t);
for (i=0; i<t->table_size; i++) { for (i=0; i<t->table_size; i++) {
PAIR p; PAIR p;
for (p=t->table[i]; p; p=p->hash_chain) { for (p=t->table[i]; p; p=p->hash_chain) {
assert(p->pinned>=0); assert(ctpair_pinned(&p->rwlock)>=0);
if (p->pinned) { if (ctpair_pinned(&p->rwlock)) {
printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value); printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value);
some_pinned=1; some_pinned=1;
} }
} }
} }
cachetable_unlock(t);
return some_pinned; return some_pinned;
} }
...@@ -289,27 +353,21 @@ int toku_cachefile_count_pinned (CACHEFILE cf, int print_them) { ...@@ -289,27 +353,21 @@ int toku_cachefile_count_pinned (CACHEFILE cf, int print_them) {
u_int32_t i; u_int32_t i;
int n_pinned=0; int n_pinned=0;
CACHETABLE t = cf->cachetable; CACHETABLE t = cf->cachetable;
cachetable_lock(t);
for (i=0; i<t->table_size; i++) { for (i=0; i<t->table_size; i++) {
PAIR p; PAIR p;
for (p=t->table[i]; p; p=p->hash_chain) { for (p=t->table[i]; p; p=p->hash_chain) {
assert(p->pinned>=0); assert(ctpair_pinned(&p->rwlock)>=0);
if (p->pinned && p->cachefile==cf) { if (ctpair_pinned(&p->rwlock) && (cf==0 || p->cachefile==cf)) {
if (print_them) printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value); if (print_them) printf("%s:%d pinned: %lld (%p)\n", __FILE__, __LINE__, p->key, p->value);
n_pinned++; n_pinned++;
} }
} }
} }
cachetable_unlock(t);
return n_pinned; return n_pinned;
} }
#if 0
unsigned int ct_hash_longlong (unsigned long long l) {
unsigned int r = hash_key((unsigned char*)&l, 8);
printf("%lld --> %d --> %d\n", l, r, r%64);
return r;
}
#endif
// This hash function comes from Jenkins: http://burtleburtle.net/bob/c/lookup3.c // This hash function comes from Jenkins: http://burtleburtle.net/bob/c/lookup3.c
// The idea here is to mix the bits thoroughly so that we don't have to do modulo by a prime number. // The idea here is to mix the bits thoroughly so that we don't have to do modulo by a prime number.
// Instead we can use a bitmask on a table of size power of two. // Instead we can use a bitmask on a table of size power of two.
...@@ -318,14 +376,14 @@ static inline u_int32_t rot(u_int32_t x, u_int32_t k) { ...@@ -318,14 +376,14 @@ static inline u_int32_t rot(u_int32_t x, u_int32_t k) {
return (x<<k) | (x>>(32-k)); return (x<<k) | (x>>(32-k));
} }
static inline u_int32_t final (u_int32_t a, u_int32_t b, u_int32_t c) { static inline u_int32_t final (u_int32_t a, u_int32_t b, u_int32_t c) {
c ^= b; c -= rot(b,14); c ^= b; c -= rot(b,14);
a ^= c; a -= rot(c,11); a ^= c; a -= rot(c,11);
b ^= a; b -= rot(a,25); b ^= a; b -= rot(a,25);
c ^= b; c -= rot(b,16); c ^= b; c -= rot(b,16);
a ^= c; a -= rot(c,4); a ^= c; a -= rot(c,4);
b ^= a; b -= rot(a,14); b ^= a; b -= rot(a,14);
c ^= b; c -= rot(b,24); c ^= b; c -= rot(b,24);
return c; return c;
} }
u_int32_t toku_cachetable_hash (CACHEFILE cachefile, CACHEKEY key) u_int32_t toku_cachetable_hash (CACHEFILE cachefile, CACHEKEY key)
...@@ -419,23 +477,123 @@ static BOOL need_to_rename_p (CACHETABLE t, PAIR p) { ...@@ -419,23 +477,123 @@ static BOOL need_to_rename_p (CACHETABLE t, PAIR p) {
&& p->written_lsn.lsn < t->lsn_of_checkpoint.lsn); // strict && p->written_lsn.lsn < t->lsn_of_checkpoint.lsn); // strict
} }
static void flush_and_remove (CACHETABLE t, PAIR remove_me, int write_me) { // Remove a pair from the cachetable
lru_remove(t, remove_me); // Effects: the pair is removed from the LRU list and from the cachetable's hash table.
//printf("flush_callback(%lld,%p)\n", remove_me->key, remove_me->value); // The size of the objects in the cachetable is adjusted by the size of the pair being
WHEN_TRACE_CT(printf("%s:%d CT flush_callback(%lld, %p, dirty=%d, 0)\n", __FILE__, __LINE__, remove_me->key, remove_me->value, remove_me->dirty && write_me)); // removed.
//printf("%s:%d TAG=%x p=%p\n", __FILE__, __LINE__, remove_me->tag, remove_me);
//printf("%s:%d dirty=%d\n", __FILE__, __LINE__, remove_me->dirty); static void cachetable_remove_pair (CACHETABLE ct, PAIR p) {
remove_me->flush_callback(remove_me->cachefile, remove_me->key, remove_me->value, remove_me->size, remove_me->dirty && write_me, 0, lru_remove(ct, p);
t->lsn_of_checkpoint, need_to_rename_p(t, remove_me));
assert(t->n_in_table>0); assert(ct->n_in_table>0);
t->n_in_table--; ct->n_in_table--;
// Remove it from the hash chain. // Remove it from the hash chain.
{ {
unsigned int h = remove_me->fullhash&(t->table_size-1); unsigned int h = p->fullhash&(ct->table_size-1);
t->table[h] = remove_from_hash_chain (remove_me, t->table[h]); ct->table[h] = remove_from_hash_chain (p, ct->table[h]);
}
ct->size_current -= p->size; assert(ct->size_current >= 0);
}
// Maybe remove a pair from the cachetable and free it, depending on whether
// or not there are any threads interested in the pair. The flush callback
// is called with write_me and keep_me both false, and the pair is destroyed.
static void cachetable_maybe_remove_and_free_pair (CACHETABLE ct, PAIR p) {
if (ctpair_users(&p->rwlock) == 0) {
cachetable_remove_pair(ct, p);
#if DO_CALLBACK_UNLOCK
cachetable_unlock(ct);
#endif
p->flush_callback(p->cachefile, p->key, p->value, p->size, FALSE, FALSE,
ct->lsn_of_checkpoint, need_to_rename_p(ct, p));
ctpair_destroy(p);
#if DO_CALLBACK_UNLOCK
cachetable_lock(ct);
#endif
} }
t->size_current -= remove_me->size; }
toku_free(remove_me);
static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remove);
// Write a pair to storage
// Effects: an exclusive lock on the pair is obtained, the write callback is called,
// the pair dirty state is adjusted, and the write is completed. The write_me boolean
// is true when the pair is dirty and the pair is requested to be written. The keep_me
// boolean is true, so the pair is not yet evicted from the cachetable.
static void cachetable_write_pair(CACHETABLE ct, PAIR p) {
ctpair_write_lock(&p->rwlock, &ct->mutex);
#if DO_CALLBACK_UNLOCK
cachetable_unlock(ct);
#endif
// write callback
p->flush_callback(p->cachefile, p->key, p->value, p->size, p->dirty && p->write_me, TRUE,
ct->lsn_of_checkpoint, need_to_rename_p(ct, p));
#if DO_CALLBACK_USLEEP
usleep(DO_CALLBACK_USLEEP);
#endif
#if DO_CALLBACK_BUSYWAIT
struct timeval tstart;
gettimeofday(&tstart, 0);
long long ltstart = tstart.tv_sec * 1000000 + tstart.tv_usec;
while (1) {
struct timeval t;
gettimeofday(&t, 0);
long long lt = t.tv_sec * 1000000 + t.tv_usec;
if (lt - ltstart > DO_CALLBACK_BUSYWAIT)
break;
}
#endif
#if DO_CALLBACK_UNLOCK
cachetable_lock(ct);
#endif
// the pair is no longer dirty once written
if (p->dirty && p->write_me)
p->dirty = FALSE;
// stuff it into a completion queue for delayed completion if a completion queue exists
// otherwise complete the write now
if (p->cq)
writequeue_enq(p->cq, p);
else
cachetable_complete_write_pair(ct, p, TRUE);
}
// complete the write of a pair by reseting the writing flag, adjusting the write
// pending size, and maybe removing the pair from the cachetable if there are no
// references to it
static void cachetable_complete_write_pair (CACHETABLE ct, PAIR p, BOOL do_remove) {
p->cq = 0;
p->writing = 0;
// maybe wakeup any stalled writers when the pending writes fall below
// 1/8 of the size of the cachetable
ct->size_writing -= p->size;
assert(ct->size_writing >= 0);
if (8*ct->size_writing <= ct->size_current)
writequeue_wakeup_write(&ct->wq);
ctpair_write_unlock(&p->rwlock);
if (do_remove)
cachetable_maybe_remove_and_free_pair(ct, p);
}
// flush and remove a pair from the cachetable. the callbacks are run by a thread in
// a thread pool.
static void flush_and_remove (CACHETABLE ct, PAIR p, int write_me) {
p->writing = 1;
ct->size_writing += p->size; assert(ct->size_writing >= 0);
p->write_me = write_me;
#if DO_WRITER_THREAD
threadpool_maybe_add(ct->threadpool, cachetable_writer, ct);
writequeue_enq(&ct->wq, p);
#else
cachetable_write_pair(ct, p);
#endif
} }
static unsigned long toku_maxrss=0; static unsigned long toku_maxrss=0;
...@@ -458,11 +616,10 @@ static unsigned long check_maxrss (void) { ...@@ -458,11 +616,10 @@ static unsigned long check_maxrss (void) {
} }
static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) { static int maybe_flush_some (CACHETABLE t, long size) {
int r = 0; int r = 0;
again: again:
// if (t->n_in_table >= t->table_size) { if (size + t->size_current > t->size_limit + t->size_writing) {
if (size + t->size_current > t->size_limit) {
{ {
unsigned long rss __attribute__((__unused__)) = check_maxrss(); unsigned long rss __attribute__((__unused__)) = check_maxrss();
//printf("this-size=%.6fMB projected size = %.2fMB limit=%2.fMB rss=%2.fMB\n", size/(1024.0*1024.0), (size+t->size_current)/(1024.0*1024.0), t->size_limit/(1024.0*1024.0), rss/256.0); //printf("this-size=%.6fMB projected size = %.2fMB limit=%2.fMB rss=%2.fMB\n", size/(1024.0*1024.0), (size+t->size_current)/(1024.0*1024.0), t->size_limit/(1024.0*1024.0), rss/256.0);
...@@ -472,7 +629,7 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) { ...@@ -472,7 +629,7 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
/* Try to remove one. */ /* Try to remove one. */
PAIR remove_me; PAIR remove_me;
for (remove_me = t->tail; remove_me; remove_me = remove_me->prev) { for (remove_me = t->tail; remove_me; remove_me = remove_me->prev) {
if (!remove_me->pinned) { if (!ctpair_users(&remove_me->rwlock) && !remove_me->writing) {
flush_and_remove(t, remove_me, 1); flush_and_remove(t, remove_me, 1);
goto again; goto again;
} }
...@@ -489,16 +646,17 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) { ...@@ -489,16 +646,17 @@ static int maybe_flush_some (CACHETABLE t, long size __attribute__((unused))) {
} }
static int cachetable_insert_at(CACHEFILE cachefile, u_int32_t fullhash, CACHEKEY key, void *value, long size, static int cachetable_insert_at(CACHEFILE cachefile, u_int32_t fullhash, CACHEKEY key, void *value, long size,
cachetable_flush_func_t flush_callback, CACHETABLE_FLUSH_CALLBACK flush_callback,
cachetable_fetch_func_t fetch_callback, CACHETABLE_FETCH_CALLBACK fetch_callback,
void *extraargs, int dirty, void *extraargs, int dirty,
LSN written_lsn) { LSN written_lsn) {
TAGMALLOC(PAIR, p); TAGMALLOC(PAIR, p);
memset(p, 0, sizeof *p);
ctpair_rwlock_init(&p->rwlock);
p->fullhash = fullhash; p->fullhash = fullhash;
p->pinned = 1; p->dirty = dirty; //printf("%s:%d p=%p dirty=%d\n", __FILE__, __LINE__, p, p->dirty);
p->dirty = dirty;
p->size = size; p->size = size;
//printf("%s:%d p=%p dirty=%d\n", __FILE__, __LINE__, p, p->dirty); p->writing = 0;
p->key = key; p->key = key;
p->value = value; p->value = value;
p->next = p->prev = 0; p->next = p->prev = 0;
...@@ -510,6 +668,8 @@ static int cachetable_insert_at(CACHEFILE cachefile, u_int32_t fullhash, CACHEKE ...@@ -510,6 +668,8 @@ static int cachetable_insert_at(CACHEFILE cachefile, u_int32_t fullhash, CACHEKE
p->written_lsn = written_lsn; p->written_lsn = written_lsn;
p->fullhash = fullhash; p->fullhash = fullhash;
CACHETABLE ct = cachefile->cachetable; CACHETABLE ct = cachefile->cachetable;
ctpair_read_lock(&p->rwlock, &ct->mutex);
p->cq = 0;
lru_add_to_list(ct, p); lru_add_to_list(ct, p);
u_int32_t h = fullhash & (ct->table_size-1); u_int32_t h = fullhash & (ct->table_size-1);
p->hash_chain = ct->table[h]; p->hash_chain = ct->table[h];
...@@ -537,56 +697,58 @@ void note_hash_count (int count) { ...@@ -537,56 +697,58 @@ void note_hash_count (int count) {
} }
int toku_cachetable_put(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void*value, long size, int toku_cachetable_put(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void*value, long size,
cachetable_flush_func_t flush_callback, cachetable_fetch_func_t fetch_callback, void *extraargs) { CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs) {
WHEN_TRACE_CT(printf("%s:%d CT cachetable_put(%lld)=%p\n", __FILE__, __LINE__, key, value)); WHEN_TRACE_CT(printf("%s:%d CT cachetable_put(%lld)=%p\n", __FILE__, __LINE__, key, value));
int count=0;
CACHETABLE ct = cachefile->cachetable; CACHETABLE ct = cachefile->cachetable;
int count=0;
cachetable_lock(ct); cachetable_lock(ct);
cachetable_wait_write(ct);
{ {
PAIR p; PAIR p;
for (p=cachefile->cachetable->table[fullhash&(cachefile->cachetable->table_size-1)]; p; p=p->hash_chain) { for (p=ct->table[fullhash&(cachefile->cachetable->table_size-1)]; p; p=p->hash_chain) {
count++; count++;
if (p->key==key && p->cachefile==cachefile) { if (p->key==key && p->cachefile==cachefile) {
note_hash_count(count);
// Semantically, these two asserts are not strictly right. After all, when are two functions eq? // Semantically, these two asserts are not strictly right. After all, when are two functions eq?
// In practice, the functions better be the same. // In practice, the functions better be the same.
assert(p->flush_callback==flush_callback); assert(p->flush_callback==flush_callback);
assert(p->fetch_callback==fetch_callback); assert(p->fetch_callback==fetch_callback);
p->pinned++; /* Already present. But increment the pin count. */ ctpair_read_lock(&p->rwlock, &ct->mutex);
cachetable_unlock(ct); cachetable_unlock(ct);
note_hash_count(count);
return -1; /* Already present. */ return -1; /* Already present. */
} }
} }
} }
int r; int r;
note_hash_count(count); if ((r=maybe_flush_some(ct, size))) {
if ((r=maybe_flush_some(cachefile->cachetable, size))) {
cachetable_unlock(ct); cachetable_unlock(ct);
return r; return r;
} }
// flushing could change the table size, but wont' change the fullhash // flushing could change the table size, but wont' change the fullhash
r = cachetable_insert_at(cachefile, fullhash, key, value, size, flush_callback, fetch_callback, extraargs, 1, ZERO_LSN); r = cachetable_insert_at(cachefile, fullhash, key, value, size, flush_callback, fetch_callback, extraargs, 1, ZERO_LSN);
cachetable_unlock(ct); cachetable_unlock(ct);
note_hash_count(count);
return r; return r;
} }
int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep, int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep,
cachetable_flush_func_t flush_callback, cachetable_fetch_func_t fetch_callback, void *extraargs) { CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs) {
CACHETABLE t = cachefile->cachetable; CACHETABLE t = cachefile->cachetable;
cachetable_lock(t);
int tsize __attribute__((__unused__)) = t->table_size;
PAIR p; PAIR p;
int count=0; int count=0;
cachetable_lock(t);
cachetable_wait_write(t);
for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) { for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) {
count++; count++;
if (p->key==key && p->cachefile==cachefile) { if (p->key==key && p->cachefile==cachefile) {
note_hash_count(count);
*value = p->value; *value = p->value;
if (sizep) *sizep = p->size; if (sizep) *sizep = p->size;
p->pinned++; ctpair_read_lock(&p->rwlock, &t->mutex);
lru_touch(t,p); lru_touch(t,p);
cachetable_unlock(t); cachetable_unlock(t);
note_hash_count(count);
WHEN_TRACE_CT(printf("%s:%d cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value)); WHEN_TRACE_CT(printf("%s:%d cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value));
return 0; return 0;
} }
...@@ -607,15 +769,11 @@ int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t ful ...@@ -607,15 +769,11 @@ int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t ful
*value = toku_value; *value = toku_value;
if (sizep) if (sizep)
*sizep = size; *sizep = size;
// maybe_flush_some(t, size);
}
if ((r=maybe_flush_some(t, 0))) {
cachetable_unlock(t);
return r;
} }
r = maybe_flush_some(t, 0);
cachetable_unlock(t); cachetable_unlock(t);
WHEN_TRACE_CT(printf("%s:%d did fetch: cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value)); WHEN_TRACE_CT(printf("%s:%d did fetch: cachtable_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value));
return 0; return r;
} }
int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value) { int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value) {
...@@ -625,12 +783,12 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3 ...@@ -625,12 +783,12 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int3
cachetable_lock(t); cachetable_lock(t);
for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) { for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) {
count++; count++;
if (p->key==key && p->cachefile==cachefile) { if (p->key==key && p->cachefile==cachefile && !p->writing) {
note_hash_count(count);
*value = p->value; *value = p->value;
p->pinned++; ctpair_read_lock(&p->rwlock, &t->mutex);
lru_touch(t,p); lru_touch(t,p);
cachetable_unlock(t); cachetable_unlock(t);
note_hash_count(count);
//printf("%s:%d cachetable_maybe_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value); //printf("%s:%d cachetable_maybe_get_and_pin(%lld)--> %p\n", __FILE__, __LINE__, key, *value);
return 0; return 0;
} }
...@@ -652,23 +810,24 @@ int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, ...@@ -652,23 +810,24 @@ int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash,
for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) { for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) {
count++; count++;
if (p->key==key && p->cachefile==cachefile) { if (p->key==key && p->cachefile==cachefile) {
note_hash_count(count); assert(p->rwlock.pinned>0);
assert(p->pinned>0); ctpair_read_unlock(&p->rwlock);
p->pinned--;
p->dirty |= dirty; p->dirty |= dirty;
if (size != 0) { if (size != 0) {
t->size_current -= p->size; t->size_current -= p->size; if (p->writing) t->size_writing -= p->size;
p->size = size; p->size = size;
t->size_current += p->size; t->size_current += p->size; if (p->writing) t->size_writing += p->size;
} }
WHEN_TRACE_CT(printf("[count=%lld]\n", p->pinned)); WHEN_TRACE_CT(printf("[count=%lld]\n", p->pinned));
{ {
int r; int r;
if ((r=maybe_flush_some(t, 0))) { if ((r=maybe_flush_some(t, 0))) {
cachetable_unlock(t); return r; cachetable_unlock(t);
return r;
} }
} }
cachetable_unlock(t); cachetable_unlock(t);
note_hash_count(count);
return 0; return 0;
} }
} }
...@@ -680,38 +839,31 @@ int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, ...@@ -680,38 +839,31 @@ int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash,
// effect: Move an object from one key to another key. // effect: Move an object from one key to another key.
// requires: The object is pinned in the table // requires: The object is pinned in the table
int toku_cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey) { int toku_cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey) {
CACHETABLE t = cachefile->cachetable; CACHETABLE t = cachefile->cachetable;
PAIR *ptr_to_p,p; PAIR *ptr_to_p,p;
int count = 0; int count = 0;
u_int32_t fullhash = toku_cachetable_hash(cachefile, oldkey); u_int32_t fullhash = toku_cachetable_hash(cachefile, oldkey);
for (ptr_to_p = &t->table[fullhash&(t->table_size-1)], p = *ptr_to_p; cachetable_lock(t);
p; for (ptr_to_p = &t->table[fullhash&(t->table_size-1)], p = *ptr_to_p;
ptr_to_p = &p->hash_chain, p = *ptr_to_p) { p;
count++; ptr_to_p = &p->hash_chain, p = *ptr_to_p) {
if (p->key==oldkey && p->cachefile==cachefile) { count++;
note_hash_count(count); if (p->key==oldkey && p->cachefile==cachefile) {
*ptr_to_p = p->hash_chain; note_hash_count(count);
p->key = newkey; *ptr_to_p = p->hash_chain;
u_int32_t new_fullhash = toku_cachetable_hash(cachefile, newkey); p->key = newkey;
u_int32_t nh = new_fullhash&(t->table_size-1); u_int32_t new_fullhash = toku_cachetable_hash(cachefile, newkey);
p->fullhash = new_fullhash; u_int32_t nh = new_fullhash&(t->table_size-1);
p->hash_chain = t->table[nh]; p->fullhash = new_fullhash;
t->table[nh] = p; p->hash_chain = t->table[nh];
return 0; t->table[nh] = p;
} cachetable_unlock(t);
} return 0;
note_hash_count(count); }
return -1; }
} cachetable_unlock(t);
note_hash_count(count);
static int cachetable_flush (CACHETABLE t) { return -1;
u_int32_t i;
for (i=0; i<t->table_size; i++) {
PAIR p;
while ((p = t->table[i]))
flush_and_remove(t, p, 1); // Must be careful, since flush_and_remove kills the linked list.
}
return 0;
} }
void toku_cachefile_verify (CACHEFILE cf) { void toku_cachefile_verify (CACHEFILE cf) {
...@@ -719,6 +871,8 @@ void toku_cachefile_verify (CACHEFILE cf) { ...@@ -719,6 +871,8 @@ void toku_cachefile_verify (CACHEFILE cf) {
} }
void toku_cachetable_verify (CACHETABLE t) { void toku_cachetable_verify (CACHETABLE t) {
cachetable_lock(t);
// First clear all the verify flags by going through the hash chains // First clear all the verify flags by going through the hash chains
{ {
u_int32_t i; u_int32_t i;
...@@ -759,10 +913,11 @@ void toku_cachetable_verify (CACHETABLE t) { ...@@ -759,10 +913,11 @@ void toku_cachetable_verify (CACHETABLE t) {
} }
} }
} }
cachetable_unlock(t);
} }
static void assert_cachefile_is_flushed_and_removed (CACHEFILE cf) { static void assert_cachefile_is_flushed_and_removed (CACHETABLE t, CACHEFILE cf) {
CACHETABLE t = cf->cachetable;
u_int32_t i; u_int32_t i;
// Check it two ways // Check it two ways
// First way: Look through all the hash chains // First way: Look through all the hash chains
...@@ -781,27 +936,35 @@ static void assert_cachefile_is_flushed_and_removed (CACHEFILE cf) { ...@@ -781,27 +936,35 @@ static void assert_cachefile_is_flushed_and_removed (CACHEFILE cf) {
} }
} }
// write all dirty entries and maybe remove them
static int cachefile_flush_and_remove (CACHEFILE cf) { static int cachetable_flush_cachefile (CACHETABLE ct, CACHEFILE cf, BOOL do_remove) {
u_int32_t i; unsigned nfound = 0;
CACHETABLE t = cf->cachetable; struct writequeue cq;
for (i=0; i<t->table_size; i++) { writequeue_init(&cq);
unsigned i;
for (i=0; i < ct->table_size; i++) {
PAIR p; PAIR p;
again: for (p = ct->table[i]; p; p=p->hash_chain) {
p = t->table[i]; if (cf == 0 || p->cachefile==cf) {
while (p) { nfound++;
if (p->cachefile==cf) { p->cq = &cq;
flush_and_remove(t, p, 1); // Must be careful, since flush_and_remove kills the linked list. if (!p->writing)
goto again; flush_and_remove(ct, p, 1);
} else {
p=p->hash_chain;
} }
} }
} }
assert_cachefile_is_flushed_and_removed(cf); for (i=0; i<nfound; i++) {
PAIR p = 0;
int r = writequeue_deq(&cq, &ct->mutex, &p); assert(r == 0);
cachetable_complete_write_pair(ct, p, do_remove);
}
writequeue_destroy(&cq);
if (do_remove)
assert_cachefile_is_flushed_and_removed(ct, cf);
if ((4 * t->n_in_table < t->table_size) && (t->table_size>4)) if ((4 * ct->n_in_table < ct->table_size) && (ct->table_size>4))
cachetable_rehash(t, t->table_size/2); cachetable_rehash(ct, ct->table_size/2);
return 0; return 0;
} }
...@@ -809,27 +972,37 @@ static int cachefile_flush_and_remove (CACHEFILE cf) { ...@@ -809,27 +972,37 @@ static int cachefile_flush_and_remove (CACHEFILE cf) {
/* Require that it all be flushed. */ /* Require that it all be flushed. */
int toku_cachetable_close (CACHETABLE *tp) { int toku_cachetable_close (CACHETABLE *tp) {
CACHETABLE t=*tp; CACHETABLE t=*tp;
u_int32_t i;
int r; int r;
if ((r=cachetable_flush(t))) return r; cachetable_lock(t);
if ((r=cachetable_flush_cachefile(t, 0, TRUE))) {
cachetable_unlock(t);
return r;
}
u_int32_t i;
for (i=0; i<t->table_size; i++) { for (i=0; i<t->table_size; i++) {
if (t->table[i]) return -1; if (t->table[i]) return -1;
} }
#if DO_CACHETABLE_LOCK assert(t->size_writing == 0);
writequeue_set_closed(&t->wq);
cachetable_unlock(t);
threadpool_destroy(&t->threadpool);
writequeue_destroy(&t->wq);
r = pthread_mutex_destroy(&t->mutex); assert(r == 0); r = pthread_mutex_destroy(&t->mutex); assert(r == 0);
#endif
toku_free(t->table); toku_free(t->table);
toku_free(t); toku_free(t);
*tp = 0; *tp = 0;
return 0; return 0;
} }
#if 0
// this is broken. needs to wait for writebacks to complete
int toku_cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) { int toku_cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) {
/* Removing something already present is OK. */ /* Removing something already present is OK. */
CACHETABLE t = cachefile->cachetable; CACHETABLE t = cachefile->cachetable;
PAIR p; PAIR p;
int count = 0; int count = 0;
u_int32_t fullhash = toku_cachetable_hash(cachefile, key); u_int32_t fullhash = toku_cachetable_hash(cachefile, key);
cachetable_lock(t);
for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) { for (p=t->table[fullhash&(t->table_size-1)]; p; p=p->hash_chain) {
count++; count++;
if (p->key==key && p->cachefile==cachefile) { if (p->key==key && p->cachefile==cachefile) {
...@@ -840,9 +1013,11 @@ int toku_cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) { ...@@ -840,9 +1013,11 @@ int toku_cachetable_remove (CACHEFILE cachefile, CACHEKEY key, int write_me) {
} }
} }
done: done:
cachetable_unlock(t);
note_hash_count(count); note_hash_count(count);
return 0; return 0;
} }
#endif
#if 0 #if 0
static void flush_and_keep (PAIR flush_me) { static void flush_and_keep (PAIR flush_me) {
...@@ -890,23 +1065,107 @@ int cachefile_pread (CACHEFILE cf, void *buf, size_t count, off_t offset) { ...@@ -890,23 +1065,107 @@ int cachefile_pread (CACHEFILE cf, void *buf, size_t count, off_t offset) {
#endif #endif
/* debug functions */
int toku_cachetable_checkpoint (CACHETABLE ct) {
// Single threaded checkpoint.
// In future: for multithreaded checkpoint we should not proceed if the previous checkpoint has not finished.
// Requires: Everything is unpinned. (In the multithreaded version we have to wait for things to get unpinned and then
// grab them (or else the unpinner has to do something.)
// Algorithm: Write a checkpoint record to the log, noting the LSN of that record.
// Note the LSN of the previous checkpoint (stored in lsn_of_checkpoint)
// For every (unpinnned) dirty node in which the LSN is newer than the prev checkpoint LSN:
// flush the node (giving it a new nodeid, and fixing up the downpointer in the parent)
// Watch out since evicting the node modifies the hash table.
//?? This is a skeleton. It compiles, but doesn't do anything reasonable yet.
//?? log_the_checkpoint();
unsigned nfound = 0;
struct writequeue cq;
writequeue_init(&cq);
cachetable_lock(ct);
unsigned i;
for (i=0; i < ct->table_size; i++) {
PAIR p;
for (p = ct->table[i]; p; p=p->hash_chain) {
// p->dirty && p->modified_lsn.lsn>ct->lsn_of_checkpoint.lsn
if (1) {
nfound++;
p->cq = &cq;
if (!p->writing)
flush_and_remove(ct, p, 1);
}
}
}
for (i=0; i<nfound; i++) {
PAIR p = 0;
int r = writequeue_deq(&cq, &ct->mutex, &p); assert(r == 0);
cachetable_complete_write_pair(ct, p, FALSE);
}
cachetable_unlock(ct);
writequeue_destroy(&cq);
return 0;
}
TOKULOGGER toku_cachefile_logger (CACHEFILE cf) {
return cf->cachetable->logger;
}
FILENUM toku_cachefile_filenum (CACHEFILE cf) {
return cf->filenum;
}
u_int32_t toku_cachefile_fullhash_of_header (CACHEFILE cachefile) {
return cachefile->header_fullhash;
}
#if DO_WRITER_THREAD
// The writer thread waits for work in the write queue and writes the pair
static void *cachetable_writer(void *arg) {
// printf("%lu:%s:start %p\n", pthread_self(), __FUNCTION__, arg);
CACHETABLE ct = arg;
int r;
cachetable_lock(ct);
while (1) {
threadpool_set_thread_idle(ct->threadpool);
PAIR p = 0;
r = writequeue_deq(&ct->wq, &ct->mutex, &p);
if (r != 0)
break;
threadpool_set_thread_busy(ct->threadpool);
cachetable_write_pair(ct, p);
}
cachetable_unlock(ct);
// printf("%lu:%s:exit %p\n", pthread_self(), __FUNCTION__, arg);
return arg;
}
#endif
// debug functions
void toku_cachetable_print_state (CACHETABLE ct) { void toku_cachetable_print_state (CACHETABLE ct) {
u_int32_t i; u_int32_t i;
cachetable_lock(ct);
for (i=0; i<ct->table_size; i++) { for (i=0; i<ct->table_size; i++) {
PAIR p = ct->table[i]; PAIR p = ct->table[i];
if (p != 0) { if (p != 0) {
printf("t[%d]=", i); printf("t[%d]=", i);
for (p=ct->table[i]; p; p=p->hash_chain) { for (p=ct->table[i]; p; p=p->hash_chain) {
printf(" {%lld, %p, dirty=%d, pin=%lld, size=%ld}", p->key, p->cachefile, p->dirty, p->pinned, p->size); printf(" {%lld, %p, dirty=%d, pin=%d, size=%ld}", p->key, p->cachefile, p->dirty, p->rwlock.pinned, p->size);
} }
printf("\n"); printf("\n");
} }
} }
cachetable_unlock(ct);
} }
void toku_cachetable_get_state (CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr) { void toku_cachetable_get_state (CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr) {
cachetable_lock(ct);
if (num_entries_ptr) if (num_entries_ptr)
*num_entries_ptr = ct->n_in_table; *num_entries_ptr = ct->n_in_table;
if (hash_size_ptr) if (hash_size_ptr)
...@@ -915,84 +1174,32 @@ void toku_cachetable_get_state (CACHETABLE ct, int *num_entries_ptr, int *hash_s ...@@ -915,84 +1174,32 @@ void toku_cachetable_get_state (CACHETABLE ct, int *num_entries_ptr, int *hash_s
*size_current_ptr = ct->size_current; *size_current_ptr = ct->size_current;
if (size_limit_ptr) if (size_limit_ptr)
*size_limit_ptr = ct->size_limit; *size_limit_ptr = ct->size_limit;
cachetable_unlock(ct);
} }
int toku_cachetable_get_key_state (CACHETABLE ct, CACHEKEY key, CACHEFILE cf, void **value_ptr, int toku_cachetable_get_key_state (CACHETABLE ct, CACHEKEY key, CACHEFILE cf, void **value_ptr,
int *dirty_ptr, long long *pin_ptr, long *size_ptr) { int *dirty_ptr, long long *pin_ptr, long *size_ptr) {
PAIR p; PAIR p;
int count = 0; int count = 0;
int r = -1;
u_int32_t fullhash = toku_cachetable_hash(cf, key); u_int32_t fullhash = toku_cachetable_hash(cf, key);
cachetable_lock(ct);
for (p = ct->table[fullhash&(ct->table_size-1)]; p; p = p->hash_chain) { for (p = ct->table[fullhash&(ct->table_size-1)]; p; p = p->hash_chain) {
count++; count++;
if (p->key == key) { if (p->key == key && p->cachefile == cf) {
note_hash_count(count);
if (value_ptr) if (value_ptr)
*value_ptr = p->value; *value_ptr = p->value;
if (dirty_ptr) if (dirty_ptr)
*dirty_ptr = p->dirty; *dirty_ptr = p->dirty;
if (pin_ptr) if (pin_ptr)
*pin_ptr = p->pinned; *pin_ptr = p->rwlock.pinned;
if (size_ptr) if (size_ptr)
*size_ptr = p->size; *size_ptr = p->size;
return 0; r = 0;
break;
} }
} }
cachetable_unlock(ct);
note_hash_count(count); note_hash_count(count);
return 1; return r;
}
int toku_cachetable_checkpoint (CACHETABLE ct) {
// Single threaded checkpoint.
// In future: for multithreaded checkpoint we should not proceed if the previous checkpoint has not finished.
// Requires: Everything is unpinned. (In the multithreaded version we have to wait for things to get unpinned and then
// grab them (or else the unpinner has to do something.)
// Algorithm: Write a checkpoint record to the log, noting the LSN of that record.
// Note the LSN of the previous checkpoint (stored in lsn_of_checkpoint)
// For every (unpinnned) dirty node in which the LSN is newer than the prev checkpoint LSN:
// flush the node (giving it a new nodeid, and fixing up the downpointer in the parent)
// Watch out since evicting the node modifies the hash table.
//?? This is a skeleton. It compiles, but doesn't do anything reasonable yet.
//?? log_the_checkpoint();
int n_saved=0;
int n_in_table = ct->n_in_table;
struct save_something {
CACHEFILE cf;
DISKOFF key;
void *value;
long size;
LSN modified_lsn;
CACHETABLE_FLUSH_FUNC_T flush_callback;
} *MALLOC_N(n_in_table, info);
{
PAIR pair;
for (pair=ct->head; pair; pair=pair->next) {
assert(!pair->pinned);
if (pair->dirty && pair->modified_lsn.lsn>ct->lsn_of_checkpoint.lsn) {
//?? /save_something_about_the_pair(); // This read-only so it doesn't modify the table.
n_saved++;
}
}
}
{
int i;
for (i=0; i<n_saved; i++) {
info[i].flush_callback(info[i].cf, info[i].key, info[i].value, info[i].size, 1, 1, info[i].modified_lsn, 0);
}
}
toku_free(info);
return 0;
}
TOKULOGGER toku_cachefile_logger (CACHEFILE cf) {
return cf->cachetable->logger;
}
FILENUM toku_cachefile_filenum (CACHEFILE cf) {
return cf->filenum;
}
u_int32_t toku_cachefile_fullhash_of_header (CACHEFILE cachefile) {
return cachefile->header_fullhash;
} }
...@@ -6,10 +6,6 @@ ...@@ -6,10 +6,6 @@
#include <fcntl.h> #include <fcntl.h>
#include "brttypes.h" #include "brttypes.h"
/* Implement the cache table. */
typedef long long CACHEKEY;
/* Maintain a cache mapping from cachekeys to values (void*) /* Maintain a cache mapping from cachekeys to values (void*)
* Some of the keys can be pinned. Don't pin too many or for too long. * Some of the keys can be pinned. Don't pin too many or for too long.
* If the cachetable is too full, it will call the flush_callback() function with the key, the value, and the otherargs * If the cachetable is too full, it will call the flush_callback() function with the key, the value, and the otherargs
...@@ -23,50 +19,78 @@ typedef long long CACHEKEY; ...@@ -23,50 +19,78 @@ typedef long long CACHEKEY;
* table_size is the initial size of the cache table hash table (in number of entries) * table_size is the initial size of the cache table hash table (in number of entries)
* size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes) * size limit is the upper bound of the sum of size of the entries in the cache table (total number of bytes)
*/ */
typedef long long CACHEKEY;
// create a new cachetable
// returns: if success, 0 is returned and result points to the new cachetable
int toku_create_cachetable(CACHETABLE */*result*/, long size_limit, LSN initial_lsn, TOKULOGGER); int toku_create_cachetable(CACHETABLE */*result*/, long size_limit, LSN initial_lsn, TOKULOGGER);
// What is the cachefile that goes with a particular filenum?
// During a transaction, we cannot reuse a filenum.
int toku_cachefile_of_filenum (CACHETABLE t, FILENUM filenum, CACHEFILE *cf);
int toku_cachetable_checkpoint (CACHETABLE ct);
int toku_cachetable_close (CACHETABLE*); /* Flushes everything to disk, and destroys the cachetable. */
int toku_cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode); int toku_cachetable_openf (CACHEFILE *,CACHETABLE, const char */*fname*/, int flags, mode_t mode);
int toku_cachetable_openfd (CACHEFILE *,CACHETABLE, int /*fd*/, const char */*fname (used for logging)*/); int toku_cachetable_openfd (CACHEFILE *,CACHETABLE, int /*fd*/, const char */*fname (used for logging)*/);
typedef void (cachetable_flush_func_t)(CACHEFILE, CACHEKEY key, void*value, long size, BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p); // the flush callback (write, free)
typedef cachetable_flush_func_t *CACHETABLE_FLUSH_FUNC_T; typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, CACHEKEY key, void *value, long size, BOOL write_me, BOOL keep_me, LSN modified_lsn, BOOL rename_p);
/* If we are asked to fetch something, get it by calling this back. */ // the fetch callback
typedef int (cachetable_fetch_func_t)(CACHEFILE, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs, LSN *written_lsn); typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, CACHEKEY key, u_int32_t fullhash, void **value, long *sizep, void *extraargs, LSN *written_lsn);
typedef cachetable_fetch_func_t *CACHETABLE_FETCH_FUNC_T;
// Put a key and value pair into the cachetable
// effects: if the key,cachefile is not in the cachetable, then insert the pair and pin it.
// returns: 0 if success, otherwise an error
/* Error if already present. On success, pin the value. */
int toku_cachetable_put(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, int toku_cachetable_put(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
void* value, long size, void *value, long size,
cachetable_flush_func_t flush_callback, cachetable_fetch_func_t fetch_callback, void *extraargs); CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs);
int toku_cachetable_get_and_pin(CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, int toku_cachetable_get_and_pin(CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/,
void**/*value*/, long *sizep, void **/*value*/, long *sizep,
cachetable_flush_func_t flush_callback, cachetable_fetch_func_t fetch_callback, void *extraargs); CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs);
// If the the item is already in memory, then return 0 and store it in the void**.
// If the item is not in memory, then return nonzero.
/* If the the item is already in memory, then return 0 and store it in the void**.
* If the item is not in memory, then return nonzero. */
int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**); int toku_cachetable_maybe_get_and_pin (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**);
/* cachetable object state wrt external memory */ // cachetable object state wrt external memory
#define CACHETABLE_CLEAN 0 #define CACHETABLE_CLEAN 0
#define CACHETABLE_DIRTY 1 #define CACHETABLE_DIRTY 1
// Unpin by key
// effects: lookup a mapping using key,cachefile. if a pair is found, then OR the dirty bit into the pair
// and update the size of the pair. the read lock on the pair is released.
int toku_cachetable_unpin(CACHEFILE, CACHEKEY, u_int32_t fullhash, int dirty, long size); /* Note whether it is dirty when we unpin it. */ int toku_cachetable_unpin(CACHEFILE, CACHEKEY, u_int32_t fullhash, int dirty, long size); /* Note whether it is dirty when we unpin it. */
int toku_cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing something already present is OK. */ int toku_cachetable_remove (CACHEFILE, CACHEKEY, int /*write_me*/); /* Removing something already present is OK. */
int toku_cachetable_assert_all_unpinned (CACHETABLE); int toku_cachetable_assert_all_unpinned (CACHETABLE);
int toku_cachefile_count_pinned (CACHEFILE, int /*printthem*/ ); int toku_cachefile_count_pinned (CACHEFILE, int /*printthem*/ );
/* Rename whatever is at oldkey to be newkey. Requires that the object be pinned. */ /* Rename whatever is at oldkey to be newkey. Requires that the object be pinned. */
int toku_cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey); int toku_cachetable_rename (CACHEFILE cachefile, CACHEKEY oldkey, CACHEKEY newkey);
//int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */ //int cachetable_fsync_all (CACHETABLE); /* Flush everything to disk, but keep it in cache. */
int toku_cachetable_close (CACHETABLE*); /* Flushes everything to disk, and destroys the cachetable. */
int toku_cachefile_close (CACHEFILE*, TOKULOGGER); int toku_cachefile_close (CACHEFILE*, TOKULOGGER);
int toku_cachefile_flush (CACHEFILE); int toku_cachefile_flush (CACHEFILE);
// effect: flush everything owned by the cachefile. // effect: flush everything owned by the cachefile from the cachetable. all dirty
// blocks are written sto storage. all unpinned blocks are evicts from the cachetable.
// returns: 0 if success // returns: 0 if success
void toku_cachefile_refup (CACHEFILE cfp); void toku_cachefile_refup (CACHEFILE cfp);
...@@ -85,27 +109,25 @@ int toku_cachefile_set_fd (CACHEFILE cf, int fd, const char *fname); ...@@ -85,27 +109,25 @@ int toku_cachefile_set_fd (CACHEFILE cf, int fd, const char *fname);
// effect: bind the cachefile to a new fd and fname. the old fd is closed. // effect: bind the cachefile to a new fd and fname. the old fd is closed.
// returns: 0 if success // returns: 0 if success
// Useful for debugging
void toku_cachetable_print_state (CACHETABLE ct);
void toku_cachetable_get_state(CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr);
int toku_cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, CACHEFILE cf, void **value_ptr,
int *dirty_ptr, long long *pin_ptr, long *size_ptr);
void toku_cachefile_verify (CACHEFILE cf); // Verify the whole cachetable that the CF is in. Slow.
void toku_cachetable_verify (CACHETABLE t); // Slow...
TOKULOGGER toku_cachefile_logger (CACHEFILE); TOKULOGGER toku_cachefile_logger (CACHEFILE);
FILENUM toku_cachefile_filenum (CACHEFILE);
// What is the cachefile that goes with a particular filenum? FILENUM toku_cachefile_filenum (CACHEFILE);
// During a transaction, we cannot reuse a filenum.
int toku_cachefile_of_filenum (CACHETABLE t, FILENUM filenum, CACHEFILE *cf);
int toku_cachetable_checkpoint (CACHETABLE ct);
u_int32_t toku_cachetable_hash (CACHEFILE cachefile, CACHEKEY key); u_int32_t toku_cachetable_hash (CACHEFILE cachefile, CACHEKEY key);
// Effect: Return a 32-bit hash key. The hash key shall be suitable for using with bitmasking for a table of size power-of-two. // Effect: Return a 32-bit hash key. The hash key shall be suitable for using with bitmasking for a table of size power-of-two.
u_int32_t toku_cachefile_fullhash_of_header (CACHEFILE cachefile); u_int32_t toku_cachefile_fullhash_of_header (CACHEFILE cachefile);
// debug functions
void toku_cachetable_print_state (CACHETABLE ct);
void toku_cachetable_get_state(CACHETABLE ct, int *num_entries_ptr, int *hash_size_ptr, long *size_current_ptr, long *size_limit_ptr);
int toku_cachetable_get_key_state(CACHETABLE ct, CACHEKEY key, CACHEFILE cf,
void **value_ptr,
int *dirty_ptr,
long long *pin_ptr,
long *size_ptr);
void toku_cachefile_verify (CACHEFILE cf); // Verify the whole cachetable that the CF is in. Slow.
void toku_cachetable_verify (CACHETABLE t); // Slow...
#endif #endif
...@@ -61,6 +61,9 @@ REGRESSION_TESTS = \ ...@@ -61,6 +61,9 @@ REGRESSION_TESTS = \
brt-test3 \ brt-test3 \
brt-test4 \ brt-test4 \
brt-test5 \ brt-test5 \
cachetable-rwlock-test \
cachetable-writequeue-test \
threadpool-test \
cachetable-test \ cachetable-test \
cachetable-test2 \ cachetable-test2 \
cachetable-put-test \ cachetable-put-test \
...@@ -69,6 +72,7 @@ REGRESSION_TESTS = \ ...@@ -69,6 +72,7 @@ REGRESSION_TESTS = \
cachetable-fd-test \ cachetable-fd-test \
cachetable-flush-test \ cachetable-flush-test \
cachetable-count-pinned-test \ cachetable-count-pinned-test \
cachetable-debug-test \
fifo-test \ fifo-test \
list-test \ list-test \
keyrange \ keyrange \
......
#include <stdio.h>
#include <unistd.h>
#include <assert.h>
#include "test.h"
#include "cachetable.h"
void flush() {
}
int fetch() {
return 0;
}
void cachetable_debug_test(int n) {
const int test_limit = n;
int r;
CACHETABLE ct;
r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
char fname1[] = __FILE__ "test1.dat";
unlink(fname1);
CACHEFILE f1;
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, 0777); assert(r == 0);
int num_entries, hash_size; long size_current, size_limit;
toku_cachetable_get_state(ct, &num_entries, &hash_size, &size_current, &size_limit);
assert(num_entries == 0);
assert(size_current == 0);
assert(size_limit == n);
// printf("%d %d %ld %ld\n", num_entries, hash_size, size_current, size_limit);
int i;
for (i=1; i<=n; i++) {
const int item_size = 1;
u_int32_t hi;
hi = toku_cachetable_hash(f1, i);
r = toku_cachetable_put(f1, i, hi, (void *)(long)i, item_size, flush, fetch, 0);
assert(r == 0);
void *v; int dirty; long long pinned; long pair_size;
r = toku_cachetable_get_key_state(ct, i, f1, &v, &dirty, &pinned, &pair_size);
assert(r == 0);
assert(v == (void *)(long)i);
assert(dirty == CACHETABLE_DIRTY);
assert(pinned == 1);
assert(pair_size == item_size);
r = toku_cachetable_unpin(f1, i, hi, CACHETABLE_CLEAN, 1);
assert(r == 0);
toku_cachetable_get_state(ct, &num_entries, &hash_size, &size_current, &size_limit);
assert(num_entries == i);
assert(size_current == i);
assert(size_limit == n);
toku_cachetable_print_state(ct);
}
toku_cachetable_verify(ct);
extern void print_hash_histogram();
print_hash_histogram();
r = toku_cachefile_close(&f1, NULL_LOGGER); assert(r == 0 && f1 == 0);
r = toku_cachetable_close(&ct); assert(r == 0 && ct == 0);
}
int main(int argc, const char *argv[]) {
int i;
for (i=1; i<argc; i++) {
if (strcmp(argv[i], "-v") == 0) {
verbose++;
continue;
}
}
cachetable_debug_test(8);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
#include "cachetable-rwlock.h"
int verbose = 0;
// test create and destroy
void test_create_destroy() {
struct ctpair_rwlock the_rwlock, *rwlock = &the_rwlock;
ctpair_rwlock_init(rwlock);
ctpair_rwlock_destroy(rwlock);
}
// test read lock and unlock with no writers
void test_simple_read_lock(int n) {
struct ctpair_rwlock the_rwlock, *rwlock = &the_rwlock;
ctpair_rwlock_init(rwlock);
assert(ctpair_pinned(rwlock) == 0);
int i;
for (i=1; i<=n; i++) {
ctpair_read_lock(rwlock, 0);
assert(ctpair_pinned(rwlock) == i);
assert(ctpair_users(rwlock) == i);
}
for (i=n-1; i>=0; i--) {
ctpair_read_unlock(rwlock);
assert(ctpair_pinned(rwlock) == i);
assert(ctpair_users(rwlock) == i);
}
ctpair_rwlock_destroy(rwlock);
}
// test write lock and unlock with no readers
void test_simple_write_lock() {
struct ctpair_rwlock the_rwlock, *rwlock = &the_rwlock;
ctpair_rwlock_init(rwlock);
assert(ctpair_users(rwlock) == 0);
ctpair_write_lock(rwlock, 0);
assert(ctpair_writers(rwlock) == 1);
assert(ctpair_users(rwlock) == 1);
ctpair_write_unlock(rwlock);
assert(ctpair_users(rwlock) == 0);
ctpair_rwlock_destroy(rwlock);
}
struct rw_event {
int e;
struct ctpair_rwlock the_rwlock;
pthread_mutex_t mutex;
};
void rw_event_init(struct rw_event *rwe) {
rwe->e = 0;
ctpair_rwlock_init(&rwe->the_rwlock);
int r = pthread_mutex_init(&rwe->mutex, 0); assert(r == 0);
}
void rw_event_destroy(struct rw_event *rwe) {
ctpair_rwlock_destroy(&rwe->the_rwlock);
int r = pthread_mutex_destroy(&rwe->mutex); assert(r == 0);
}
void *test_writer_priority_thread(void *arg) {
struct rw_event *rwe = arg;
int r;
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_write_lock(&rwe->the_rwlock, &rwe->mutex);
rwe->e++; assert(rwe->e == 3);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
rwe->e++; assert(rwe->e == 4);
ctpair_write_unlock(&rwe->the_rwlock);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
return arg;
}
// test writer priority over new readers
void test_writer_priority() {
struct rw_event rw_event, *rwe = &rw_event;
int r;
rw_event_init(rwe);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_read_lock(&rwe->the_rwlock, &rwe->mutex);
sleep(1);
rwe->e++; assert(rwe->e == 1);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
pthread_t tid;
r = pthread_create(&tid, 0, test_writer_priority_thread, rwe);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
rwe->e++; assert(rwe->e == 2);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_read_unlock(&rwe->the_rwlock);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_read_lock(&rwe->the_rwlock, &rwe->mutex);
rwe->e++; assert(rwe->e == 5);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_read_unlock(&rwe->the_rwlock);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
void *ret;
r = pthread_join(tid, &ret); assert(r == 0);
rw_event_destroy(rwe);
}
// test single writer
void *test_single_writer_thread(void *arg) {
struct rw_event *rwe = arg;
int r;
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_write_lock(&rwe->the_rwlock, &rwe->mutex);
rwe->e++; assert(rwe->e == 3);
assert(ctpair_writers(&rwe->the_rwlock) == 1);
ctpair_write_unlock(&rwe->the_rwlock);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
return arg;
}
void test_single_writer() {
struct rw_event rw_event, *rwe = &rw_event;
int r;
rw_event_init(rwe);
assert(ctpair_writers(&rwe->the_rwlock) == 0);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
ctpair_write_lock(&rwe->the_rwlock, &rwe->mutex);
assert(ctpair_writers(&rwe->the_rwlock) == 1);
sleep(1);
rwe->e++; assert(rwe->e == 1);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
pthread_t tid;
r = pthread_create(&tid, 0, test_single_writer_thread, rwe);
sleep(1);
r = pthread_mutex_lock(&rwe->mutex); assert(r == 0);
rwe->e++; assert(rwe->e == 2);
assert(ctpair_writers(&rwe->the_rwlock) == 1);
assert(ctpair_users(&rwe->the_rwlock) == 2);
ctpair_write_unlock(&rwe->the_rwlock);
r = pthread_mutex_unlock(&rwe->mutex); assert(r == 0);
void *ret;
r = pthread_join(tid, &ret); assert(r == 0);
assert(ctpair_writers(&rwe->the_rwlock) == 0);
rw_event_destroy(rwe);
}
int main(int argc, char *argv[]) {
int i;
for (i=1; i<argc; i++) {
char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
verbose++;
continue;
}
}
test_create_destroy();
test_simple_read_lock(0);
test_simple_read_lock(42);
test_simple_write_lock();
test_writer_priority();
test_single_writer();
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <errno.h>
#include <string.h>
#include <pthread.h>
int verbose;
typedef struct ctpair *PAIR;
struct ctpair {
PAIR next_wq;
};
PAIR new_pair() {
PAIR p = (PAIR) malloc(sizeof *p); assert(p);
return p;
}
void destroy_pair(PAIR p) {
free(p);
}
#include "cachetable-writequeue.h"
// test simple create and destroy
void test_create_destroy() {
struct writequeue writequeue, *wq = &writequeue;
writequeue_init(wq);
assert(writequeue_empty(wq));
writequeue_destroy(wq);
}
// verify that the wq implements FIFO ordering
void test_simple_enq_deq(int n) {
struct writequeue writequeue, *wq = &writequeue;
int r;
pthread_mutex_t mutex;
r = pthread_mutex_init(&mutex, 0); assert(r == 0);
writequeue_init(wq);
assert(writequeue_empty(wq));
PAIR pairs[n];
int i;
for (i=0; i<n; i++) {
pairs[i] = new_pair();
writequeue_enq(wq, pairs[i]);
assert(!writequeue_empty(wq));
}
for (i=0; i<n; i++) {
PAIR p;
r = writequeue_deq(wq, &mutex, &p);
assert(r == 0 && p == pairs[i]);
destroy_pair(p);
}
assert(writequeue_empty(wq));
writequeue_destroy(wq);
r = pthread_mutex_destroy(&mutex); assert(r == 0);
}
// setting the wq closed should cause deq to return EINVAL
void test_set_closed() {
struct writequeue writequeue, *wq = &writequeue;
writequeue_init(wq);
writequeue_set_closed(wq);
int r = writequeue_deq(wq, 0, 0);
assert(r == EINVAL);
writequeue_destroy(wq);
}
// closing a wq with a blocked reader thread should cause the reader to get EINVAL
struct writequeue_with_mutex {
struct writequeue writequeue;
pthread_mutex_t mutex;
};
void writequeue_with_mutex_init(struct writequeue_with_mutex *wqm) {
writequeue_init(&wqm->writequeue);
int r = pthread_mutex_init(&wqm->mutex, 0); assert(r == 0);
}
void writequeue_with_mutex_destroy(struct writequeue_with_mutex *wqm) {
writequeue_destroy(&wqm->writequeue);
int r = pthread_mutex_destroy(&wqm->mutex); assert(r == 0);
}
void *test_set_closed_waiter(void *arg) {
struct writequeue_with_mutex *wqm = arg;
int r;
r = pthread_mutex_lock(&wqm->mutex); assert(r == 0);
PAIR p;
r = writequeue_deq(&wqm->writequeue, &wqm->mutex, &p);
assert(r == EINVAL);
r = pthread_mutex_unlock(&wqm->mutex); assert(r == 0);
return arg;
}
void test_set_closed_thread() {
struct writequeue_with_mutex writequeue_with_mutex, *wqm = &writequeue_with_mutex;
int r;
writequeue_with_mutex_init(wqm);
pthread_t tid;
r = pthread_create(&tid, 0, test_set_closed_waiter, wqm); assert(r == 0);
sleep(1);
writequeue_set_closed(&wqm->writequeue);
void *ret;
r = pthread_join(tid, &ret);
assert(r == 0 && ret == wqm);
writequeue_with_mutex_destroy(wqm);
}
// verify writer reader flow control
// the write (main) thread writes as fast as possible until the wq is full. then it
// waits.
// the read thread reads from the wq slowly using a random delay. it wakes up any
// writers when the wq size <= 1/2 of the wq limit
struct rwfc {
pthread_mutex_t mutex;
struct writequeue writequeue;
int current, limit;
};
void rwfc_init(struct rwfc *rwfc, int limit) {
int r;
r = pthread_mutex_init(&rwfc->mutex, 0); assert(r == 0);
writequeue_init(&rwfc->writequeue);
rwfc->current = 0; rwfc->limit = limit;
}
void rwfc_destroy(struct rwfc *rwfc) {
int r;
writequeue_destroy(&rwfc->writequeue);
r = pthread_mutex_destroy(&rwfc->mutex); assert(r == 0);
}
void *rwfc_reader(void *arg) {
struct rwfc *rwfc = arg;
int r;
while (1) {
PAIR ctpair;
r = pthread_mutex_lock(&rwfc->mutex); assert(r == 0);
r = writequeue_deq(&rwfc->writequeue, &rwfc->mutex, &ctpair);
if (r == EINVAL) {
r = pthread_mutex_unlock(&rwfc->mutex); assert(r == 0);
break;
}
if (2*rwfc->current-- > rwfc->limit && 2*rwfc->current <= rwfc->limit) {
writequeue_wakeup_write(&rwfc->writequeue);
}
r = pthread_mutex_unlock(&rwfc->mutex); assert(r == 0);
destroy_pair(ctpair);
usleep(random() % 100);
}
return arg;
}
void test_flow_control(int limit, int n) {
struct rwfc my_rwfc, *rwfc = &my_rwfc;
int r;
rwfc_init(rwfc, limit);
pthread_t tid;
r = pthread_create(&tid, 0, rwfc_reader, rwfc); assert(r == 0);
sleep(1); // this is here to block the reader on the first deq
int i;
for (i=0; i<n; i++) {
PAIR ctpair = new_pair();
r = pthread_mutex_lock(&rwfc->mutex); assert(r == 0);
writequeue_enq(&rwfc->writequeue, ctpair);
rwfc->current++;
while (rwfc->current >= rwfc->limit) {
// printf("%d - %d %d\n", i, rwfc->current, rwfc->limit);
writequeue_wait_write(&rwfc->writequeue, &rwfc->mutex);
}
r = pthread_mutex_unlock(&rwfc->mutex); assert(r == 0);
// usleep(random() % 1);
}
writequeue_set_closed(&rwfc->writequeue);
void *ret;
r = pthread_join(tid, &ret); assert(r == 0);
rwfc_destroy(rwfc);
}
int main(int argc, char *argv[]) {
int i;
for (i=1; i<argc; i++) {
char *arg = argv[i];
if (strcmp(arg, "-v") == 0) {
verbose++;
continue;
}
}
test_create_destroy();
test_simple_enq_deq(0);
test_simple_enq_deq(42);
test_set_closed();
test_set_closed_thread();
test_flow_control(8, 10000);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <assert.h>
#include <string.h>
#include <errno.h>
#include <malloc.h>
#include <pthread.h>
#include "threadpool.h"
int verbose = 0;
struct my_threadpool {
THREADPOOL threadpool;
pthread_mutex_t mutex;
pthread_cond_t wait;
int closed;
};
void my_threadpool_init(struct my_threadpool *my_threadpool, int max_threads) {
int r;
r = threadpool_create(&my_threadpool->threadpool, max_threads); assert(r == 0);
assert(my_threadpool != 0);
r = pthread_mutex_init(&my_threadpool->mutex, 0); assert(r == 0);
r = pthread_cond_init(&my_threadpool->wait, 0); assert(r == 0);
my_threadpool->closed = 0;
}
void my_threadpool_destroy(struct my_threadpool *my_threadpool) {
int r;
r = pthread_mutex_lock(&my_threadpool->mutex); assert(r == 0);
my_threadpool->closed = 1;
r = pthread_cond_broadcast(&my_threadpool->wait); assert(r == 0);
r = pthread_mutex_unlock(&my_threadpool->mutex); assert(r == 0);
if (verbose) printf("current %d\n", threadpool_get_current_threads(my_threadpool->threadpool));
threadpool_destroy(&my_threadpool->threadpool); assert(my_threadpool->threadpool == 0);
r = pthread_mutex_destroy(&my_threadpool->mutex); assert(r == 0);
r = pthread_cond_destroy(&my_threadpool->wait); assert(r == 0);
}
void *fbusy(void *arg) {
struct my_threadpool *my_threadpool = arg;
int r;
r = pthread_mutex_lock(&my_threadpool->mutex); assert(r == 0);
while (!my_threadpool->closed) {
r = pthread_cond_wait(&my_threadpool->wait, &my_threadpool->mutex); assert(r == 0);
}
r = pthread_mutex_unlock(&my_threadpool->mutex); assert(r == 0);
if (verbose) printf("%lu:%s:exit\n", pthread_self(), __FUNCTION__);
return arg;
}
void *fidle(void *arg) {
struct my_threadpool *my_threadpool = arg;
int r;
r = pthread_mutex_lock(&my_threadpool->mutex); assert(r == 0);
threadpool_set_thread_idle(my_threadpool->threadpool);
while (!my_threadpool->closed) {
r = pthread_cond_wait(&my_threadpool->wait, &my_threadpool->mutex); assert(r == 0);
}
r = pthread_mutex_unlock(&my_threadpool->mutex); assert(r == 0);
if (verbose) printf("%lu:%s:exit\n", pthread_self(), __FUNCTION__);
return arg;
}
#define DO_MALLOC_HOOK 1
#if DO_MALLOC_HOOK
static void *my_malloc_always_fails(size_t n, const __malloc_ptr_t p) {
n = n; p = p;
return 0;
}
#endif
int usage() {
printf("threadpool-test: [-v] [-malloc-fail] [N]\n");
printf("-malloc-fail simulate malloc failures\n");
printf("N max number of threads in the thread pool\n");
return 1;
}
int main(int argc, char *argv[]) {
int max_threads = 1;
int do_malloc_fail = 0;
int i;
for (i=1; i<argc; i++) {
char *arg = argv[i];
if (strcmp(arg, "-h") == 0 || strcmp(arg, "-help") == 0) {
return usage();
} else if (strcmp(arg, "-v") == 0) {
verbose++;
continue;
} else if (strcmp(arg, "-q") == 0) {
verbose = 0;
continue;
} else if (strcmp(arg, "-malloc-fail") == 0) {
do_malloc_fail = 1;
continue;
} else
max_threads = atoi(arg);
}
struct my_threadpool my_threadpool;
THREADPOOL threadpool;
// test threadpool busy causes no threads to be created
my_threadpool_init(&my_threadpool, max_threads);
threadpool = my_threadpool.threadpool;
if (verbose) printf("test threadpool_set_busy\n");
for (i=0; i<2*max_threads; i++) {
threadpool_maybe_add(threadpool, fbusy, &my_threadpool);
assert(threadpool_get_current_threads(threadpool) == 1);
}
assert(threadpool_get_current_threads(threadpool) == 1);
my_threadpool_destroy(&my_threadpool);
// test threadpool idle causes up to max_threads to be created
my_threadpool_init(&my_threadpool, max_threads);
threadpool = my_threadpool.threadpool;
if (verbose) printf("test threadpool_set_idle\n");
for (i=0; i<2*max_threads; i++) {
threadpool_maybe_add(threadpool, fidle, &my_threadpool);
sleep(1);
assert(threadpool_get_current_threads(threadpool) <= max_threads);
}
assert(threadpool_get_current_threads(threadpool) == max_threads);
my_threadpool_destroy(&my_threadpool);
#if DO_MALLOC_HOOK
if (do_malloc_fail) {
if (verbose) printf("test threadpool_create with malloc failure\n");
// test threadpool malloc fails causes ENOMEM
// glibc supports this. see malloc.h
threadpool = 0;
void *(*orig_malloc_hook) (size_t, const __malloc_ptr_t) = __malloc_hook;
__malloc_hook = my_malloc_always_fails;
int r;
r = threadpool_create(&threadpool, 0); assert(r == ENOMEM);
r = threadpool_create(&threadpool, 1); assert(r == ENOMEM);
__malloc_hook = orig_malloc_hook;
}
#endif
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <malloc.h>
#include <pthread.h>
#include <errno.h>
#include "threadpool.h"
// use gcc builtin fetch_and_add 0->no 1->yes
#define DO_ATOMIC_FETCH_AND_ADD 0
struct threadpool {
int max_threads;
int current_threads;
int busy_threads;
pthread_t pids[];
};
int threadpool_create(THREADPOOL *threadpoolptr, int max_threads) {
size_t size = sizeof (struct threadpool) + max_threads*sizeof (pthread_t);
struct threadpool *threadpool = malloc(size);
if (threadpool == 0)
return ENOMEM;
threadpool->max_threads = max_threads;
threadpool->current_threads = 0;
threadpool->busy_threads = 0;
int i;
for (i=0; i<max_threads; i++)
threadpool->pids[i] = 0;
*threadpoolptr = threadpool;
return 0;
}
void threadpool_destroy(THREADPOOL *threadpoolptr) {
struct threadpool *threadpool = *threadpoolptr;
int i;
for (i=0; i<threadpool->current_threads; i++) {
int r; void *ret;
r = pthread_join(threadpool->pids[i], &ret);
assert(r == 0);
}
*threadpoolptr = 0;
free(threadpool);
}
void threadpool_maybe_add(THREADPOOL threadpool, void *(*f)(void *), void *arg) {
if ((threadpool->current_threads == 0 || threadpool->busy_threads < threadpool->current_threads) && threadpool->current_threads < threadpool->max_threads) {
int r = pthread_create(&threadpool->pids[threadpool->current_threads], 0, f, arg);
if (r == 0) {
threadpool->current_threads++;
threadpool_set_thread_busy(threadpool);
}
}
}
void threadpool_set_thread_busy(THREADPOOL threadpool) {
#if DO_ATOMIC_FETCH_AND_ADD
(void) __sync_fetch_and_add(&threadpool->busy_threads, 1);
#else
threadpool->busy_threads++;
#endif
}
void threadpool_set_thread_idle(THREADPOOL threadpool) {
#if DO_ATOMIC_FETCH_AND_ADD
(void) __sync_fetch_and_add(&threadpool->busy_threads, -1);
#else
threadpool->busy_threads--;
#endif
}
int threadpool_get_current_threads(THREADPOOL threadpool) {
return threadpool->current_threads;
}
// A threadpool is a limited set of threads that can be used to apply a
// function to work contained in a work queue. The work queue is outside
// of the scope of the threadpool; the threadpool merely provides
// mechanisms to grow the number of threads in the threadpool on demand.
typedef struct threadpool *THREADPOOL;
// Create a new threadpool
// Effects: a new threadpool is allocated and initialized. the number of
// threads in the threadpool is limited to max_threads. initially, there
// are no threads in the pool.
// Returns: if there are no errors, the threadpool is set and zero is returned.
// Otherwise, an error number is returned.
int threadpool_create(THREADPOOL *threadpoolptr, int max_threads);
// Destroy a threadpool
// Effects: the calling thread joins with all of the threads in the threadpool.
// Effects: the threadpool memory is freed.
// Returns: the threadpool is set to null.
void threadpool_destroy(THREADPOOL *threadpoolptr);
// Maybe add a thread to the threadpool.
// Effects: the number of threads in the threadpool is expanded by 1 as long
// as the current number of threads in the threadpool is less than the max
// and there are no idle threads.
// Effects: if the thread is create, it calls the function f with argument arg
// Expects: external serialization on this function; only one thread may
// execute this function
void threadpool_maybe_add(THREADPOOL theadpool, void *(*f)(void *), void *arg);
// Set the current thread busy
// Effects: the threadpool keeps a count of the number of idle threads. It
// uses this count to control the creation of additional threads.
void threadpool_set_thread_busy(THREADPOOL);
// Set the current thread idle
void threadpool_set_thread_idle(THREADPOOL);
// get the current number of threads
int threadpool_get_current_threads(THREADPOOL);
CFLAGS = -Wall -W -Werror -g
pma: LDFLAGS=-lm
pma:
pma.o:
...@@ -21,8 +21,7 @@ CFLAGS += -Wbad-function-cast -Wcast-align -Waggregate-return ...@@ -21,8 +21,7 @@ CFLAGS += -Wbad-function-cast -Wcast-align -Waggregate-return
CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute
CPPFLAGS += -L../ -L../../range_tree CPPFLAGS += -L../ -L../../range_tree
CPPFLAGS += -I. -I../ -I../../range_tree -I../../../newbrt -I../../../include CPPFLAGS += -I. -I../ -I../../range_tree -I../../../newbrt -I../../../include
LDFLAGS += -lz LDFLAGS = -lpthread -lz
SRCS = $(wildcard *.c) SRCS = $(wildcard *.c)
......
...@@ -19,6 +19,7 @@ CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute ...@@ -19,6 +19,7 @@ CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute
CPPFLAGS = -I. -I../../include -I../../newbrt CPPFLAGS = -I. -I../../include -I../../newbrt
CPPFLAGS += -D_GNU_SOURCE -D_THREAD_SAFE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE CPPFLAGS += -D_GNU_SOURCE -D_THREAD_SAFE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE
CFLAGS += $(VISIBILITY) $(PROF_FLAGS) CFLAGS += $(VISIBILITY) $(PROF_FLAGS)
LDFLAGS += -lpthread
ifneq ($(OSX),) ifneq ($(OSX),)
CFLAGS+=-fno-common CFLAGS+=-fno-common
......
...@@ -17,6 +17,7 @@ CFLAGS = -W -Wall -Wextra -Werror $(OPTFLAGS) -g3 -ggdb3 $(GCOV_FLAGS) ...@@ -17,6 +17,7 @@ CFLAGS = -W -Wall -Wextra -Werror $(OPTFLAGS) -g3 -ggdb3 $(GCOV_FLAGS)
CFLAGS += -Wbad-function-cast -Wcast-align -Wconversion -Waggregate-return CFLAGS += -Wbad-function-cast -Wcast-align -Wconversion -Waggregate-return
CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute CFLAGS += -Wmissing-noreturn -Wmissing-format-attribute
CPPFLAGS += -I../ -I../../../newbrt -I../../../include CPPFLAGS += -I../ -I../../../newbrt -I../../../include
LDFLAGS = -lpthread
SRCS = $(wildcard *.c) SRCS = $(wildcard *.c)
...@@ -99,7 +100,7 @@ LDFLAGS = -lz ...@@ -99,7 +100,7 @@ LDFLAGS = -lz
%.lin: %.c $(HEADERS) $(LINEAR_BINS) %.lin: %.c $(HEADERS) $(LINEAR_BINS)
$(CC) -DDIR=\"dir.$<.lin\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(LINEAR_BINS) $(LDFLAGS) $(CC) -DDIR=\"dir.$<.lin\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(LINEAR_BINS) $(LDFLAGS)
%.tlog: %.c $(HEADERS) $(TLOG_BINS) %.tlog: %.c $(HEADERS) $(TLOG_BINS)
$(CC) -DDIR=\"dir.$<.log\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(TLOG_BINS) -DTOKU_RT_NOOVERLAPS $(LDFLAGS) $(CC) -DDIR=\"dir.$<.log\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(TLOG_BINS) -DTOKU_RT_NOOVERLAPS $(LDFLAGS)
%.log: %.c $(HEADERS) $(LOG_BINS) %.log: %.c $(HEADERS) $(LOG_BINS)
$(CC) -DDIR=\"dir.$<.log\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(LOG_BINS) $(LDFLAGS) $(CC) -DDIR=\"dir.$<.log\" $(CFLAGS) $(CPPFLAGS) $< -o $@ $(LOG_BINS) $(LDFLAGS)
......
...@@ -2593,20 +2593,27 @@ static int toku_db_delboth_noassociate(DB *db, DB_TXN *txn, DBT *key, DBT *val, ...@@ -2593,20 +2593,27 @@ static int toku_db_delboth_noassociate(DB *db, DB_TXN *txn, DBT *key, DBT *val,
u_int32_t lock_flags = get_prelocked_flags(flags); u_int32_t lock_flags = get_prelocked_flags(flags);
flags &= ~lock_flags; flags &= ~lock_flags;
u_int32_t suppress_missing = flags&DB_DELETE_ANY; u_int32_t delete_any = flags&DB_DELETE_ANY;
flags &= ~DB_DELETE_ANY; flags &= ~DB_DELETE_ANY;
if (flags!=0) return EINVAL; if (flags!=0) return EINVAL;
//DB_DELETE_ANY supresses the DB_NOTFOUND return value indicating that the key was not found prior to the delete //DB_DELETE_ANY supresses the DB_NOTFOUND return value indicating that the key was not found prior to the delete
//TODO: Speed up the DB_DELETE_ANY version by implementing it at the BRT layer. if (delete_any) {
if (db->i->lt && !(lock_flags&DB_PRELOCKED_WRITE)) {
DB_TXN* txn_anc = toku_txn_ancestor(txn);
if ((r = toku_txn_add_lt(txn_anc, db->i->lt))) goto any_cleanup;
TXNID id_anc = toku_txn_get_txnid(txn_anc->i->tokutxn);
r = toku_lt_acquire_write_lock(db->i->lt, db, id_anc, key, val);
if (r!=0) goto any_cleanup;
}
r = toku_brt_delete_both(db->i->brt, key, val, txn ? txn->i->tokutxn : NULL);
any_cleanup:
return r;
}
DBC *dbc; DBC *dbc;
if ((r = toku_db_cursor(db, txn, &dbc, 0, 0))) goto cursor_cleanup; if ((r = toku_db_cursor(db, txn, &dbc, 0, 0))) goto cursor_cleanup;
r = toku_c_get_noassociate(dbc, key, val, DB_GET_BOTH); if ((r = toku_c_get_noassociate(dbc, key, val, DB_GET_BOTH))) goto cursor_cleanup;
if (r!=0) {
if (suppress_missing && r==DB_NOTFOUND) r = 0;
goto cursor_cleanup;
}
r = toku_c_del_noassociate(dbc, lock_flags); r = toku_c_del_noassociate(dbc, lock_flags);
cursor_cleanup:; cursor_cleanup:;
int r2 = toku_c_close(dbc); int r2 = toku_c_close(dbc);
......
...@@ -30,7 +30,7 @@ LDFLAGS = -L../lib -ltokudb -lpthread $(TDB_LOADLIBES) -lz ...@@ -30,7 +30,7 @@ LDFLAGS = -L../lib -ltokudb -lpthread $(TDB_LOADLIBES) -lz
# vars to compile bins that handle tokudb using libtokudb.a # vars to compile bins that handle tokudb using libtokudb.a
STATIC_CPPFLAGS = -I../include STATIC_CPPFLAGS = -I../include
STATIC_LDFLAGS = ../lib/libtokudb.a -lz STATIC_LDFLAGS = ../lib/libtokudb.a -lz -lpthread
# vars to compile bins that handle bdb # vars to compile bins that handle bdb
BDB_CPPFLAGS = -I$(BDBDIR)/include BDB_CPPFLAGS = -I$(BDBDIR)/include
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment