Commit 3c2d3927 authored by Zardosht Kasheff, committed by Yoni Fogel

refs #5312, merge to main

git-svn-id: file:///svn/toku/tokudb@47022 c7de825b-a66e-492c-adef-691d508d4ae1
parent 69fcb426
@@ -8,6 +8,7 @@
 #ident "Copyright (c) 2007-2012 Tokutek Inc.  All rights reserved."
 #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
+#include "frwlock.h"
 #include "nonblocking_mutex.h"
 #include "kibbutz.h"
 #include "background_job_manager.h"
@@ -32,20 +33,20 @@
 // - pair_list->pending_lock_cheap
 // - cachefile_list->lock
 // - PAIR->mutex
-// - PAIR->value_nb_mutex
+// - PAIR->value_rwlock
 // - PAIR->disk_nb_mutex
 //
 // Here are rules for how the locks interact:
 // - To grab any of the pair_list's locks, or the cachefile_list's lock,
 //   the cachetable must be in existence
 // - To grab the PAIR mutex, we must know the PAIR will not disappear:
-//   - the PAIR must be pinned (value_nb_mutex or disk_nb_mutex is held)
+//   - the PAIR must be pinned (value_rwlock or disk_nb_mutex is held)
 //   - OR, the pair_list's list lock is held
 // - As a result, to get rid of a PAIR from the pair_list, we must hold
 //   both the pair_list's list_lock and the PAIR's mutex
-// - To grab PAIR->value_nb_mutex, we must hold the PAIR's mutex
+// - To grab PAIR->value_rwlock, we must hold the PAIR's mutex
 // - To grab PAIR->disk_nb_mutex, we must hold the PAIR's mutex
-//   and hold PAIR->value_nb_mutex
+//   and hold PAIR->value_rwlock
 //
 // Now let's talk about ordering. Here is an order from outer to inner (top locks must be grabbed first)
 // - pair_list->pending_lock_expensive
@@ -55,7 +56,7 @@
 // - pair_list->pending_lock_cheap <-- after grabbing this lock,
 //   NO other locks
 //   should be grabbed.
-// - when grabbing PAIR->value_nb_mutex or PAIR->disk_nb_mutex,
+// - when grabbing PAIR->value_rwlock or PAIR->disk_nb_mutex,
 //   if the acquisition will not block, then it does not matter if any other locks are held,
 //   BUT if the acquisition will block, then NO other locks may be held besides
 //   PAIR->mutex.
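A minimal sketch of the pin sequence these rules prescribe, mirroring the pair_lock / write_lock / pair_unlock pattern used throughout the patch; `pin_for_write` is a hypothetical helper name:

```cpp
// Hypothetical helper illustrating the documented order: PAIR->mutex is the
// outer lock, value_rwlock is acquired under it, and blocking is permitted
// only while nothing but PAIR->mutex is held.
static void pin_for_write(PAIR p) {
    pair_lock(p);                      // PAIR->mutex first
    p->value_rwlock.write_lock(true);  // may block; only PAIR->mutex is held
    pair_unlock(p);                    // the pin outlives the mutex
}
```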
@@ -139,7 +140,7 @@ struct ctpair {
     long cloned_value_size;  // size of cloned_value_data, used for accounting of size_current
     void* disk_data;         // data used to fetch/flush value_data to and from disk.
-    // access to these fields are protected by value_nb_mutex
+    // access to these fields are protected by value_rwlock
     void* value_data;        // data used by client threads, FTNODEs and ROLLBACK_LOG_NODEs
     PAIR_ATTR attr;
     enum cachetable_dirty dirty;
@@ -148,22 +149,22 @@ struct ctpair {
     uint32_t count;          // clock count
     // locks
-    struct nb_mutex value_nb_mutex;  // single writer, protects value_data
+    toku::frwlock value_rwlock;
     struct nb_mutex disk_nb_mutex;   // single writer, protects disk_data, is used for writing cloned nodes for checkpoint
     toku_mutex_t mutex;
     // Access to checkpoint_pending is protected by two mechanisms,
-    // the value_nb_mutex and the pair_list's pending locks (expensive and cheap).
+    // the value_rwlock and the pair_list's pending locks (expensive and cheap).
     // checkpoint_pending may be true or false.
     // Here are the rules for reading/modifying this bit.
     // - To transition this field from false to true during begin_checkpoint,
     //   we must be holding both of the pair_list's pending locks.
     // - To transition this field from true to false during end_checkpoint,
-    //   we must be holding the value_nb_mutex.
+    //   we must be holding the value_rwlock.
     // - For a non-checkpoint thread to read the value, we must hold both the
-    //   value_nb_mutex and one of the pair_list's pending locks
+    //   value_rwlock and one of the pair_list's pending locks
     // - For the checkpoint thread to read the value, we must
-    //   hold the value_nb_mutex
+    //   hold the value_rwlock
     //
     bool checkpoint_pending; // If this is on, then we have got to resolve checkpointing modifying it.
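To make the read rule concrete, a non-checkpoint thread that already holds the pin would read the bit roughly like this; a sketch of the `get_checkpoint_pending` pattern that appears later in this diff:

```cpp
// Sketch: reading checkpoint_pending from a non-checkpoint thread.
// The caller already holds p->value_rwlock (the pin); the cheap pending
// lock supplies the second half of the required protection.
static bool read_checkpoint_pending(CACHETABLE ct, PAIR p) {
    ct->list.read_pending_cheap_lock();
    bool pending = p->checkpoint_pending;
    ct->list.read_pending_cheap_unlock();
    return pending;
}
```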
...
@@ -87,7 +87,7 @@ static PAIR_ATTR const zero_attr = {
 static inline void ctpair_destroy(PAIR p) {
     toku_mutex_destroy(&p->mutex);
-    nb_mutex_destroy(&p->value_nb_mutex);
+    p->value_rwlock.deinit();
     nb_mutex_destroy(&p->disk_nb_mutex);
     toku_free(p);
 }
@@ -585,7 +585,7 @@ static void cachetable_maybe_remove_and_free_pair (
     )
 {
     // this ensures that a clone running in the background first completes
-    if (nb_mutex_users(&p->value_nb_mutex) == 0) {
+    if (p->value_rwlock.users() == 0) {
         // assumption is that if we are about to remove the pair
         // that no one has grabbed the disk_nb_mutex,
         // and that there is no cloned_value_data, because
@@ -601,7 +601,7 @@ static void cachetable_maybe_remove_and_free_pair (
     }
 }
-// assumes value_nb_mutex and disk_nb_mutex held on entry
+// assumes value_rwlock and disk_nb_mutex held on entry
 // responsibility of this function is to only write a locked PAIR to disk
 // and NOTHING else. We do not manipulate the state of the PAIR
 // of the cachetable here (with the exception of ct->size_current for clones)
@@ -767,7 +767,7 @@ void pair_init(PAIR p,
     p->checkpoint_pending = false;
     toku_mutex_init(&p->mutex, NULL);
-    nb_mutex_init(&p->value_nb_mutex);
+    p->value_rwlock.init(&p->mutex);
     nb_mutex_init(&p->disk_nb_mutex);
     p->size_evicting_estimate = 0; // <CER> Is zero the correct init value?
@@ -860,7 +860,7 @@ static int cachetable_put_internal(
         );
     invariant_notnull(p);
     pair_lock(p);
-    nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+    p->value_rwlock.write_lock(true);
     pair_unlock(p);
     //note_hash_count(count);
     invariant_notnull(put_callback);
@@ -892,7 +892,7 @@ clone_pair(evictor* ev, PAIR p) {
     // now we need to do the same actions we would do
     // if the PAIR had been written to disk
     //
-    // because we hold the value_nb_mutex,
+    // because we hold the value_rwlock,
     // it doesn't matter whether we clear
     // the pending bit before the clone
     // or after the clone
@@ -932,7 +932,7 @@ checkpoint_cloned_pair_on_writer_thread(CACHETABLE ct, PAIR p) {
 //
-// Given a PAIR p with the value_nb_mutex already held, do the following:
+// Given a PAIR p with the value_rwlock already held, do the following:
 //  - If the PAIR needs to be written out to disk for checkpoint:
 //    - If the PAIR is cloneable, clone the PAIR and place the work
 //      of writing the PAIR on a background thread.
@@ -959,7 +959,7 @@ write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p, bool checkpoint_pending)
     }
     else {
         // The pair is not cloneable, just write the pair to disk
-        // we already have p->value_nb_mutex and we just do the write in our own thread.
+        // we already have p->value_rwlock and we just do the write in our own thread.
         cachetable_write_locked_pair(&ct->ev, p, true); // keeps the PAIR's write lock
     }
 }
@@ -973,7 +973,7 @@ write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p, bool checkpoint_pending)
 static void
 write_pair_for_checkpoint_thread (evictor* ev, PAIR p)
 {
-    nb_mutex_lock(&p->value_nb_mutex, &p->mutex); // grab an exclusive lock on the pair
+    p->value_rwlock.write_lock(true); // grab an exclusive lock on the pair
     if (p->dirty && p->checkpoint_pending) {
         if (p->clone_callback) {
             nb_mutex_lock(&p->disk_nb_mutex, &p->mutex);
@@ -983,7 +983,7 @@ write_pair_for_checkpoint_thread (evictor* ev, PAIR p)
         }
         else {
             // The pair is not cloneable, just write the pair to disk
-            // we already have p->value_nb_mutex and we just do the write in our own thread.
+            // we already have p->value_rwlock and we just do the write in our own thread.
             // this will grab and release disk_nb_mutex
             pair_unlock(p);
             cachetable_write_locked_pair(ev, p, true); // keeps the PAIR's write lock
@@ -991,9 +991,9 @@ write_pair_for_checkpoint_thread (evictor* ev, PAIR p)
         }
         p->checkpoint_pending = false;
-        // now release value_nb_mutex, before we write the PAIR out
+        // now release value_rwlock, before we write the PAIR out
         // so that the PAIR is available to client threads
-        nb_mutex_unlock(&p->value_nb_mutex); // didn't call cachetable_evict_pair so we have to unlock it ourselves.
+        p->value_rwlock.write_unlock(); // didn't call cachetable_evict_pair so we have to unlock it ourselves.
         if (p->clone_callback) {
             // note that pending lock is not needed here because
             // we KNOW we are in the middle of a checkpoint
@@ -1020,7 +1020,7 @@ write_pair_for_checkpoint_thread (evictor* ev, PAIR p)
         // and the pending lock
         //
         p->checkpoint_pending = false;
-        nb_mutex_unlock(&p->value_nb_mutex);
+        p->value_rwlock.write_unlock();
     }
 }
@@ -1072,7 +1072,7 @@ static void get_pairs(
         assert(out_pairs[i] != NULL);
         // pair had better be locked, as we are assuming
         // to own the write lock
-        assert(nb_mutex_writers(&out_pairs[i]->value_nb_mutex));
+        assert(out_pairs[i]->value_rwlock.writers());
     }
 }
@@ -1186,7 +1186,7 @@ static uint64_t get_tnow(void) {
 // On exit, cachetable lock is still held, but PAIR lock
 // is released.
 //
-// No locks are held on entry (besides the nb_mutex of the PAIR)
+// No locks are held on entry (besides the rwlock write lock of the PAIR)
 //
 static void
 do_partial_fetch(
@@ -1214,7 +1214,7 @@ do_partial_fetch(
     pair_lock(p);
     nb_mutex_unlock(&p->disk_nb_mutex);
     if (!keep_pair_locked) {
-        nb_mutex_unlock(&p->value_nb_mutex);
+        p->value_rwlock.write_unlock();
     }
     pair_unlock(p);
 }
@@ -1235,7 +1235,7 @@ void toku_cachetable_pf_pinned_pair(
     p = ct->list.find_pair(cf, key, fullhash);
     assert(p != NULL);
     assert(p->value_data == value);
-    assert(nb_mutex_writers(&p->value_nb_mutex));
+    assert(p->value_rwlock.writers());
     ct->list.read_list_unlock();
     pair_lock(p);
@@ -1251,6 +1251,7 @@ void toku_cachetable_pf_pinned_pair(
 }
+// NOW A TEST ONLY FUNCTION!!!
 int toku_cachetable_get_and_pin (
     CACHEFILE cachefile,
     CACHEKEY key,
@@ -1265,6 +1266,7 @@
     void* read_extraargs // parameter for fetch_callback, pf_req_callback, and pf_callback
     )
 {
+    pair_lock_type lock_type = may_modify_value ? PL_WRITE_EXPENSIVE : PL_READ;
     // We have separate parameters of read_extraargs and write_extraargs because
     // the lifetime of the two parameters are different. write_extraargs may be used
     // long after this function call (e.g. after a flush to disk), whereas read_extraargs
@@ -1281,7 +1283,7 @@
         fetch_callback,
         pf_req_callback,
         pf_callback,
-        may_modify_value,
+        lock_type,
         read_extraargs,
         0, // number of dependent pairs that we may need to checkpoint
         NULL, // array of cachefiles of dependent pairs
@@ -1331,7 +1333,7 @@ static void cachetable_fetch_pair(
     pair_lock(p);
     nb_mutex_unlock(&p->disk_nb_mutex);
     if (!keep_pair_locked) {
-        nb_mutex_unlock(&p->value_nb_mutex);
+        p->value_rwlock.write_unlock();
     }
     pair_unlock(p);
 }
@@ -1392,6 +1394,15 @@ static void checkpoint_pair_and_dependent_pairs(
         );
 }
+static void unpin_pair(PAIR p, bool read_lock_grabbed) {
+    if (read_lock_grabbed) {
+        p->value_rwlock.read_unlock();
+    }
+    else {
+        p->value_rwlock.write_unlock();
+    }
+}
+
 // on input, the pair's mutex is held,
 // on output, the pair's mutex is not held.
@@ -1404,7 +1415,7 @@ static bool try_pin_pair(
     CACHETABLE ct,
     CACHEFILE cachefile,
     bool have_read_list_lock,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     uint32_t num_dependent_pairs,
     PAIR* dependent_pairs,
     enum cachetable_dirty* dependent_dirty,
@@ -1416,16 +1427,26 @@
 {
     bool dep_checkpoint_pending[num_dependent_pairs];
     bool try_again = true;
+    // we need to exit with the read_list_lock, if we don't already have
+    // it we definitely need to reacquire it
     bool reacquire_lock = !have_read_list_lock;
-    if (have_read_list_lock && nb_mutex_writers(&p->value_nb_mutex)) {
-        // drop the read_list_lock before doing an expensive lock
-        reacquire_lock = true;
-        ct->list.read_list_unlock();
+    bool expensive = (lock_type == PL_WRITE_EXPENSIVE);
+    if (lock_type != PL_READ) {
+        if (!p->value_rwlock.try_write_lock(expensive)) {
+            reacquire_lock = true;
+            if (have_read_list_lock) {
+                ct->list.read_list_unlock();
+            }
+            p->value_rwlock.write_lock(expensive);
+        }
+    }
+    else {
+        if (!p->value_rwlock.try_read_lock()) {
+            reacquire_lock = true;
+            if (have_read_list_lock) {
+                ct->list.read_list_unlock();
+            }
+            p->value_rwlock.read_lock();
+        }
     }
-    nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
     pair_touch(p);
     pair_unlock(p);
     // reacquire the read list lock here, we hold it for the rest of the function.
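Both branches of the new logic follow one idiom: try the lock while still holding the read list lock, and only if that fast path fails, drop the list lock before blocking. Distilled, with the names the patch itself uses:

```cpp
// Distilled form of the fast-path/slow-path split above (write case).
// Blocking on value_rwlock while still holding the read list lock would
// violate the ordering rules at the top of this file.
if (!p->value_rwlock.try_write_lock(expensive)) {
    reacquire_lock = true;            // remember to retake the list lock
    if (have_read_list_lock) {
        ct->list.read_list_unlock();  // never block with the list lock held
    }
    p->value_rwlock.write_lock(expensive);  // now it is safe to block
}
```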
@@ -1433,7 +1454,7 @@
         ct->list.read_list_lock();
     }
-    if (may_modify_value) {
+    if (lock_type != PL_READ) {
         ct->list.read_pending_cheap_lock();
         bool p_checkpoint_pending = p->checkpoint_pending;
         p->checkpoint_pending = false;
@@ -1460,9 +1481,11 @@
         try_again = false;
         goto exit;
     }
+
+    // at this point, a partial fetch is required
     if (ct->ev.should_client_thread_sleep() && !already_slept) {
         pair_lock(p);
-        nb_mutex_unlock(&p->value_nb_mutex);
+        unpin_pair(p, (lock_type == PL_READ));
         pair_unlock(p);
         try_again = true;
         goto exit;
@@ -1476,17 +1499,48 @@
     // if the variable is true, a partial fetch is required so we must grab the PAIR's write lock
     // and then call a callback to retrieve what we need
     //
-    if (partial_fetch_required) {
+    assert(partial_fetch_required);
     // As of Dr. No, only clean PAIRs may have pieces missing,
     // so we do a sanity check here.
     assert(!p->dirty);
     // This may be slow, better release and re-grab the
     // read list lock.
     ct->list.read_list_unlock();
+    if (lock_type == PL_READ) {
+        pair_lock(p);
+        p->value_rwlock.read_unlock();
+        p->value_rwlock.write_lock(true);
+        pair_unlock(p);
+    }
+    else if (lock_type == PL_WRITE_CHEAP) {
+        pair_lock(p);
+        p->value_rwlock.write_unlock();
+        p->value_rwlock.write_lock(true);
+        pair_unlock(p);
+    }
+    partial_fetch_required = pf_req_callback(p->value_data,read_extraargs);
+    if (partial_fetch_required) {
         do_partial_fetch(ct, cachefile, p, pf_callback, read_extraargs, true);
-        ct->list.read_list_lock();
     }
+    if (lock_type == PL_READ) {
+        //
+        // TODO: Zardosht, somehow ensure that a partial eviction cannot happen
+        // between these two calls
+        //
+        pair_lock(p);
+        p->value_rwlock.write_unlock();
+        p->value_rwlock.read_lock();
+        pair_unlock(p);
+    }
+    else if (lock_type == PL_WRITE_CHEAP) {
+        pair_lock(p);
+        p->value_rwlock.write_unlock();
+        p->value_rwlock.write_lock(false);
+        pair_unlock(p);
+    }
+    ct->list.read_list_lock();
     try_again = false;
 exit:
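The hunk above is the lock-promotion dance in full: whatever pin type the caller asked for, the partial fetch itself runs under an expensive write lock, and the pin is then demoted back to the requested type. Schematically, for a PL_READ pin (a sketch condensing the patch's code, not a literal excerpt):

```cpp
// Sketch of the promote/fetch/demote pattern for a PL_READ pin.
pair_lock(p);
p->value_rwlock.read_unlock();     // give up the shared pin...
p->value_rwlock.write_lock(true);  // ...and take an expensive write pin
pair_unlock(p);
// re-check: another thread may have completed the fetch in the window
// between read_unlock and write_lock, so pf_req_callback is asked again
if (pf_req_callback(p->value_data, read_extraargs)) {
    do_partial_fetch(ct, cachefile, p, pf_callback, read_extraargs, true);
}
pair_lock(p);
p->value_rwlock.write_unlock();    // demote back to the requested pin type
p->value_rwlock.read_lock();       // (the TODO above notes the race here)
pair_unlock(p);
```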
@@ -1503,7 +1557,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
     uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
     CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs
@@ -1554,7 +1608,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
             ct,
             cachefile,
             true,
-            may_modify_value,
+            lock_type,
             num_dependent_pairs,
             dependent_pairs,
             dependent_dirty,
@@ -1601,7 +1655,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
             ct,
             cachefile,
             false,
-            may_modify_value,
+            lock_type,
             num_dependent_pairs,
             dependent_pairs,
             dependent_dirty,
@@ -1636,10 +1690,10 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
         // Pin the pair.
         pair_lock(p);
-        nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        p->value_rwlock.write_lock(true);
         pair_unlock(p);
-        if (may_modify_value) {
+        if (lock_type != PL_READ) {
             ct->list.read_pending_cheap_lock();
             assert(!p->checkpoint_pending);
             for (uint32_t i = 0; i < num_dependent_pairs; i++) {
@@ -1651,10 +1705,9 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
         // We should release the lock before we perform
         // these expensive operations.
-        // TODO: <CER> Determine if we can move this above the may_modify_value block, but after the pin.
         ct->list.write_list_unlock();
-        if (may_modify_value) {
+        if (lock_type != PL_READ) {
             checkpoint_dependent_pairs(
                 ct,
                 num_dependent_pairs,
@@ -1672,6 +1725,22 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
         cachetable_miss++;
         cachetable_misstime += get_tnow() - t0;
+        if (lock_type == PL_READ) {
+            pair_lock(p);
+            p->value_rwlock.write_unlock();
+            p->value_rwlock.read_lock();
+            pair_unlock(p);
+        }
+        // because we grabbed an expensive lock for the fetch,
+        // we ought to downgrade it back to cheap if we have to
+        // once we are done with the fetch
+        else if (lock_type == PL_WRITE_CHEAP) {
+            pair_lock(p);
+            p->value_rwlock.write_unlock();
+            p->value_rwlock.write_lock(false);
+            pair_unlock(p);
+        }
+
         // We need to be holding the read list lock when we exit.
         // We grab it here because we released it earlier to
         // grab the write list lock because the checkpointing and
@@ -1695,7 +1764,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
     uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
     CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs
@@ -1716,7 +1785,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
         fetch_callback,
         pf_req_callback,
         pf_callback,
-        may_modify_value,
+        lock_type,
         read_extraargs,
         num_dependent_pairs,
         dependent_cfs,
@@ -1746,12 +1815,8 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, uint32
     if (p) {
         pair_lock(p);
         ct->list.read_list_unlock();
-        if (p->dirty &&
-            nb_mutex_users(&p->value_nb_mutex) == 0
-            )
-        {
-            // because nb_mutex_users is 0, this is fast
-            nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        if (p->value_rwlock.try_write_lock(true)) {
+            // we got the write lock fast, so continue
             ct->list.read_pending_cheap_lock();
             //
             // if pending a checkpoint, then we don't want to return
@@ -1759,8 +1824,8 @@ int toku_cachetable_maybe_get_and_pin (CACHEFILE cachefile, CACHEKEY key, uint32
             // handling the checkpointing, which we do not want to do,
             // because it is expensive
             //
-            if (p->checkpoint_pending) {
-                nb_mutex_unlock(&p->value_nb_mutex);
+            if (!p->dirty || p->checkpoint_pending) {
+                p->value_rwlock.write_unlock();
                 r = -1;
             }
             else {
@@ -1788,9 +1853,8 @@ int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key,
     if (p) {
         pair_lock(p);
         ct->list.read_list_unlock();
-        if (nb_mutex_users(&p->value_nb_mutex) == 0) {
-            // because nb_mutex_users is 0, this is fast
-            nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        if (p->value_rwlock.try_write_lock(true)) {
+            // got the write lock fast, so continue
             ct->list.read_pending_cheap_lock();
             //
             // if pending a checkpoint, then we don't want to return
@@ -1799,7 +1863,7 @@ int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key,
             // because it is expensive
             //
             if (p->checkpoint_pending) {
-                nb_mutex_unlock(&p->value_nb_mutex);
+                p->value_rwlock.write_unlock();
                 r = -1;
             }
             else {
@@ -1819,20 +1883,19 @@ int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key,
 //
 // internal function to unpin a PAIR.
 // As of Clayface, this may be called in two ways:
-//  - with have_ct_lock true and flush false
-//  - with have_ct_lock false and flush true
+//  - with flush false
+//  - with flush true
 // The first is for when this is run during run_unlockers in
 // toku_cachetable_get_and_pin_nonblocking, the second is during
 // normal operations. Only during normal operations do we want to possibly
-// induce evictions.
+// induce evictions or sleep.
 //
 static int
 cachetable_unpin_internal(
     CACHEFILE cachefile,
     PAIR p,
     enum cachetable_dirty dirty,
     PAIR_ATTR attr,
-    bool have_ct_lock,
     bool flush
     )
 {
@@ -1844,14 +1907,14 @@ cachetable_unpin_internal(
     PAIR_ATTR old_attr = p->attr;
     PAIR_ATTR new_attr = attr;
     pair_lock(p);
-    assert(nb_mutex_writers(&p->value_nb_mutex)>0);
     if (dirty) {
         p->dirty = CACHETABLE_DIRTY;
     }
     if (attr.is_valid) {
         p->attr = attr;
     }
-    nb_mutex_unlock(&p->value_nb_mutex);
+    bool read_lock_grabbed = p->value_rwlock.readers();
+    unpin_pair(p, read_lock_grabbed);
     pair_unlock(p);
     if (attr.is_valid) {
@@ -1862,7 +1925,7 @@ cachetable_unpin_internal(
     }
     // see comments above this function to understand this code
-    if (flush && added_data_to_cachetable && !have_ct_lock) {
+    if (flush && added_data_to_cachetable) {
         if (ct->ev.should_client_thread_sleep()) {
             ct->ev.wait_for_cache_pressure_to_subside();
         }
@@ -1874,12 +1937,10 @@ cachetable_unpin_internal(
 }
 int toku_cachetable_unpin(CACHEFILE cachefile, PAIR p, enum cachetable_dirty dirty, PAIR_ATTR attr) {
-    // By default we don't have the lock
-    return cachetable_unpin_internal(cachefile, p, dirty, attr, false, true);
+    return cachetable_unpin_internal(cachefile, p, dirty, attr, true);
 }
 int toku_cachetable_unpin_ct_prelocked_no_flush(CACHEFILE cachefile, PAIR p, enum cachetable_dirty dirty, PAIR_ATTR attr) {
-    // We hold the cachetable mutex.
-    return cachetable_unpin_internal(cachefile, p, dirty, attr, true, false);
+    return cachetable_unpin_internal(cachefile, p, dirty, attr, false);
 }
 static void
@@ -1892,37 +1953,82 @@ run_unlockers (UNLOCKERS unlockers) {
     }
 }
-// on entry, pair mutex is held
-// on exit, is not held
+//
+// This function tries to pin the pair without running the unlockers.
+// If it can pin the pair cheaply, it does so, and returns 0.
+// If the pin will be expensive, it runs unlockers,
+// pins the pair, then releases the pin,
+// and then returns TOKUDB_TRY_AGAIN
+//
+// on entry and exit, pair mutex is NOT held
 // on entry and exit, the list read lock is held
-static void
-pin_and_release_pair(
+static int
+maybe_pin_pair(
     PAIR p,
     CACHETABLE ct,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     UNLOCKERS unlockers
     )
 {
-    run_unlockers(unlockers); // The contract says the unlockers are run with the read or write list lock being held.
+    int retval = 0;
+    bool expensive = (lock_type == PL_WRITE_EXPENSIVE);
+    pair_lock(p);
+    //
+    // first try to acquire the necessary locks without releasing the read_list_lock
+    //
+    if (lock_type == PL_READ && p->value_rwlock.try_read_lock()) {
+        pair_unlock(p);
+        goto exit;
+    }
+    else if (p->value_rwlock.try_write_lock(expensive)){
+        pair_unlock(p);
+        goto exit;
+    }
     ct->list.read_list_unlock();
-    // Now wait for the I/O to occur.
-    nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
-    if (may_modify_value) {
+    // now that we have released the read_list_lock,
+    // we can pin the PAIR. In each case, we check to see
+    // if acquiring the pin is expensive. If so, we run the unlockers, set the
+    // retval to TOKUDB_TRY_AGAIN, pin AND release the PAIR.
+    // If not, then we pin the PAIR, keep retval at 0, and do not
+    // run the unlockers, as we intend to return the value to the user
+    if (lock_type == PL_READ) {
+        if (p->value_rwlock.read_lock_is_expensive()) {
+            run_unlockers(unlockers);
+            retval = TOKUDB_TRY_AGAIN;
+        }
+        p->value_rwlock.read_lock();
+    }
+    else if (lock_type == PL_WRITE_EXPENSIVE || lock_type == PL_WRITE_CHEAP){
+        if (p->value_rwlock.write_lock_is_expensive()) {
+            run_unlockers(unlockers);
+            retval = TOKUDB_TRY_AGAIN;
+        }
+        p->value_rwlock.write_lock(expensive);
+    }
+    else {
+        assert(false);
+    }
+    // If we are going to be returning TOKUDB_TRY_AGAIN, we might
+    // as well resolve the checkpointing given the chance. This step is
+    // not necessary for correctness, it is just an opportunistic optimization.
+    if (lock_type != PL_READ && retval == TOKUDB_TRY_AGAIN) {
         bool checkpoint_pending = get_checkpoint_pending(p, &ct->list);
         pair_unlock(p);
-        // We hold the read list lock throughout this call.
-        // This is O.K. because in production, this function
-        // should always put the write on a background thread.
         write_locked_pair_for_checkpoint(ct, p, checkpoint_pending);
         pair_lock(p);
     }
-    nb_mutex_unlock(&p->value_nb_mutex);
+    if (retval == TOKUDB_TRY_AGAIN) {
+        unpin_pair(p, (lock_type == PL_READ));
+    }
+    else {
+        // just a sanity check
+        assert(retval == 0);
+    }
     pair_unlock(p);
     ct->list.read_list_lock();
+exit:
+    return retval;
 }
 
 void toku_cachetable_begin_batched_pin(CACHEFILE cf)
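The caller's half of maybe_pin_pair's contract is visible in the nonblocking getter below: 0 means the pin is held and the caller proceeds, while TOKUDB_TRY_AGAIN means the pin was taken and immediately dropped, so the caller must retry the whole lookup. Roughly:

```cpp
// Sketch of how a caller consumes maybe_pin_pair's return value,
// following the retry pattern used later in this file.
int r = maybe_pin_pair(p, ct, lock_type, unlockers);
if (r == TOKUDB_TRY_AGAIN) {
    return TOKUDB_TRY_AGAIN;  // unlockers already ran; redo the lookup
}
assert_zero(r);               // pin is held; safe to use p->value_data
```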
@@ -1947,14 +2053,17 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     void *read_extraargs,
     UNLOCKERS unlockers
     )
 // See cachetable.h.
 {
     CACHETABLE ct = cf->cachetable;
+    assert(lock_type == PL_READ ||
+        lock_type == PL_WRITE_CHEAP ||
+        lock_type == PL_WRITE_EXPENSIVE
+        );
 try_again:
     PAIR p = ct->list.find_pair(cf, key, fullhash);
@@ -1987,13 +2096,16 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
             );
         assert(p);
         pair_lock(p);
-        nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        // grab expensive write lock, because we are about to do a fetch
+        // off disk
+        p->value_rwlock.write_lock(true);
         pair_unlock(p);
         run_unlockers(unlockers); // we hold the write list_lock.
         ct->list.write_list_unlock();
         // at this point, only the pair is pinned,
-        // and no pair mutex held
+        // and no pair mutex held, and
+        // no list lock is held
         uint64_t t0 = get_tnow();
         cachetable_fetch_pair(ct, cf, p, fetch_callback, read_extraargs, false);
         cachetable_miss++;
@@ -2013,27 +2125,13 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
         return TOKUDB_TRY_AGAIN;
     }
     else {
-        //
-        // In Doofenshmirtz, we keep the root to leaf path pinned
-        // as we perform a query on a dictionary at any given time.
-        // This implies that only ONE query client can ever be
-        // in get_and_pin_nonblocking for this dictionary.
-        // So, if there is a write lock grabbed
-        // on the PAIR that we want to lock, then some expensive operation
-        // MUST be happening (read from disk, write to disk, flush, etc...),
-        // and we should run the unlockers.
-        // Otherwise, if there is no write lock grabbed, we know there will
-        // be no stall, so we grab the lock and return to the user
-        //
-        pair_lock(p);
-        if (nb_mutex_writers(&p->value_nb_mutex)) {
-            // The pair's mutex is released in this function call:
-            pin_and_release_pair(p, ct, may_modify_value, unlockers);
+        int r = maybe_pin_pair(p, ct, lock_type, unlockers);
+        if (r == TOKUDB_TRY_AGAIN) {
             return TOKUDB_TRY_AGAIN;
         }
-        nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
-        pair_unlock(p);
-        if (may_modify_value) {
+        assert_zero(r);
+        if (lock_type != PL_READ) {
             bool checkpoint_pending = get_checkpoint_pending(p, &ct->list);
             bool is_checkpointing_fast = resolve_checkpointing_fast(
                 p,
@@ -2050,7 +2148,7 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
             write_locked_pair_for_checkpoint(ct, p, checkpoint_pending);
             if (!is_checkpointing_fast) {
                 pair_lock(p);
-                nb_mutex_unlock(&p->value_nb_mutex);
+                p->value_rwlock.write_unlock();
                 pair_unlock(p);
                 return TOKUDB_TRY_AGAIN;
@@ -2058,22 +2156,46 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
             }
         }
         // At this point, we have pinned the PAIR
-        // and resolved its checkpointing. The list lock is not held
-        // and the pair's mutex is not held. Before
+        // and resolved its checkpointing. The pair's
+        // mutex is not held. The read list lock IS held. Before
         // returning the PAIR to the user, we must
         // still check for partial fetch
         bool partial_fetch_required = pf_req_callback(p->value_data,read_extraargs);
         if (partial_fetch_required) {
-            run_unlockers(unlockers); // The contract says the unlockers are run with the ct lock being held.
+            // TODO(leif): the following comment is probably wrong now
+            // that we can unpin without the read list lock.
             // Since we have to do disk I/O we should temporarily
             // release the read list lock.
             ct->list.read_list_unlock();
+            // we can unpin without the read list lock
+            run_unlockers(unlockers);
+            // we are now getting an expensive write lock, because we
+            // are doing a partial fetch. So, if we previously have
+            // either a read lock or a cheap write lock, we need to
+            // release and reacquire the correct lock type
+            if (lock_type == PL_READ) {
+                pair_lock(p);
+                p->value_rwlock.read_unlock();
+                p->value_rwlock.write_lock(true);
+                pair_unlock(p);
+            }
+            else if (lock_type == PL_WRITE_CHEAP) {
+                pair_lock(p);
+                p->value_rwlock.write_unlock();
+                p->value_rwlock.write_lock(true);
+                pair_unlock(p);
+            }
             // Now wait for the I/O to occur.
-            do_partial_fetch(ct, cf, p, pf_callback, read_extraargs, false);
+            partial_fetch_required = pf_req_callback(p->value_data,read_extraargs);
+            if (partial_fetch_required) {
+                do_partial_fetch(ct, cf, p, pf_callback, read_extraargs, false);
+            }
+            else {
+                pair_lock(p);
+                p->value_rwlock.write_unlock();
+                pair_unlock(p);
+            }
             if (ct->ev.should_client_thread_sleep()) {
                 ct->ev.wait_for_cache_pressure_to_subside();
@@ -2090,7 +2212,7 @@ int toku_cachetable_get_and_pin_nonblocking_batched(
         }
         else {
             *value = p->value_data;
             return 0;
         }
     }
     // We should not get here. Above code should hit a return in all cases.
@@ -2107,7 +2229,7 @@ int toku_cachetable_get_and_pin_nonblocking (
     CACHETABLE_FETCH_CALLBACK fetch_callback,
     CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
     CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
-    bool may_modify_value,
+    pair_lock_type lock_type,
     void *read_extraargs,
     UNLOCKERS unlockers
     )
@@ -2125,7 +2247,7 @@ int toku_cachetable_get_and_pin_nonblocking (
         fetch_callback,
         pf_req_callback,
         pf_callback,
-        may_modify_value,
+        lock_type,
         read_extraargs,
         unlockers
         );
@@ -2219,7 +2341,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
             );
         assert(p);
         pair_lock(p);
-        nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        p->value_rwlock.write_lock(true);
         pair_unlock(p);
         ct->list.write_list_unlock();
@@ -2239,10 +2361,11 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
 found_pair:
     // at this point, p is found, pair's mutex is grabbed, and
     // no list lock is held
-    if (nb_mutex_users(&p->value_nb_mutex)==0) {
-        pair_touch(p);
+    // TODO(leif): should this also just go ahead and wait if all there
+    // are to wait for are readers?
+    if (p->value_rwlock.try_write_lock(true)) {
         // nobody else is using the node, so we should go ahead and prefetch
-        nb_mutex_lock(&p->value_nb_mutex, &p->mutex);
+        pair_touch(p);
         pair_unlock(p);
         bool partial_fetch_required = pf_req_callback(p->value_data, read_extraargs);
@@ -2260,11 +2383,12 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, uint32_t fullhash,
         }
         else {
             pair_lock(p);
-            nb_mutex_unlock(&p->value_nb_mutex);
+            p->value_rwlock.write_unlock();
             pair_unlock(p);
         }
     }
     else {
+        // Couldn't get the write lock cheaply
         pair_unlock(p);
     }
 exit:
@@ -2354,7 +2478,7 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {
     for (i=0; i < num_pairs; i++) {
         PAIR p = list[i];
         pair_lock(p);
-        assert(nb_mutex_users(&p->value_nb_mutex) == 0);
+        assert(p->value_rwlock.users() == 0);
         assert(nb_mutex_users(&p->disk_nb_mutex) == 0);
         assert(!p->cloned_value_data);
         if (p->dirty == CACHETABLE_DIRTY) {
@@ -2375,7 +2499,7 @@ static void cachetable_flush_cachefile(CACHETABLE ct, CACHEFILE cf) {
     for (i=0; i < num_pairs; i++) {
         PAIR p = list[i];
         pair_lock(p);
-        assert(nb_mutex_users(&p->value_nb_mutex) == 0);
+        assert(p->value_rwlock.users() == 0);
         assert(nb_mutex_users(&p->disk_nb_mutex) == 0);
         assert(!p->cloned_value_data);
         assert(p->dirty == CACHETABLE_CLEAN);
@@ -2454,7 +2578,7 @@ static PAIR test_get_pair(CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash,
 int toku_test_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, uint32_t fullhash, enum cachetable_dirty dirty, PAIR_ATTR attr) {
     // By default we don't have the lock
     PAIR p = test_get_pair(cachefile, key, fullhash, false);
-    return toku_cachetable_unpin(cachefile, p, dirty, attr);
+    return toku_cachetable_unpin(cachefile, p, dirty, attr); // assume read lock is not grabbed, and that it is a write lock
 }
 //test-only wrapper
@@ -2489,7 +2613,7 @@ int toku_cachetable_unpin_and_remove (
     CACHETABLE ct = cachefile->cachetable;
     p->dirty = CACHETABLE_CLEAN; // clear the dirty bit. We're just supposed to remove it.
-    assert(nb_mutex_writers(&p->value_nb_mutex));
+    assert(p->value_rwlock.writers());
     // grab disk_nb_mutex to ensure any background thread writing
     // out a cloned value completes
     pair_lock(p);
@@ -2541,7 +2665,7 @@ int toku_cachetable_unpin_and_remove (
     ct->list.read_pending_cheap_unlock();
     pair_lock(p);
-    nb_mutex_unlock(&p->value_nb_mutex);
+    p->value_rwlock.write_unlock();
     nb_mutex_unlock(&p->disk_nb_mutex);
     //
     // As of Dr. Noga, only these threads may be
@@ -2588,13 +2712,13 @@ int toku_cachetable_unpin_and_remove (
     //
     cachetable_remove_pair(&ct->list, &ct->ev, p);
     ct->list.write_list_unlock();
-    if (nb_mutex_blocked_writers(&p->value_nb_mutex)>0) {
-        nb_mutex_wait_for_users(
-            &p->value_nb_mutex,
-            &p->mutex
-            );
+    if (p->value_rwlock.users() > 0) {
+        // Need to wait for everyone else to leave
+        p->value_rwlock.write_lock(true);
+        assert(p->value_rwlock.users() == 1); // us
         assert(!p->checkpoint_pending);
         assert(p->attr.cache_pressure_size == 0);
+        p->value_rwlock.write_unlock();
     }
     // just a sanity check
     assert(nb_mutex_users(&p->disk_nb_mutex) == 0);
@@ -2726,8 +2850,7 @@ int toku_cachetable_assert_all_unpinned (CACHETABLE ct) {
         PAIR p;
         for (p=ct->list.m_table[i]; p; p=p->hash_chain) {
             pair_lock(p);
-            assert(nb_mutex_writers(&p->value_nb_mutex)>=0);
-            if (nb_mutex_writers(&p->value_nb_mutex)) {
+            if (p->value_rwlock.users()) {
                 //printf("%s:%d pinned: %" PRId64 " (%p)\n", __FILE__, __LINE__, p->key.b, p->value_data);
                 some_pinned=1;
             }
@@ -2750,8 +2873,7 @@ int toku_cachefile_count_pinned (CACHEFILE cf, int print_them) {
         for (PAIR p = ct->list.m_table[i]; p; p = p->hash_chain) {
             if (p->cachefile == cf) {
                 pair_lock(p);
-                assert(nb_mutex_writers(&p->value_nb_mutex) >= 0);
-                if (nb_mutex_writers(&p->value_nb_mutex)) {
+                if (p->value_rwlock.users()) {
                     if (print_them) {
                         printf("%s:%d pinned: %" PRId64 " (%p)\n",
                                __FILE__,
@@ -2779,7 +2901,7 @@ void toku_cachetable_print_state (CACHETABLE ct) {
             pair_lock(p);
             printf("t[%u]=", i);
             for (p=ct->list.m_table[i]; p; p=p->hash_chain) {
-                printf(" {%" PRId64 ", %p, dirty=%d, pin=%d, size=%ld}", p->key.b, p->cachefile, (int) p->dirty, nb_mutex_writers(&p->value_nb_mutex), p->attr.size);
+                printf(" {%" PRId64 ", %p, dirty=%d, pin=%d, size=%ld}", p->key.b, p->cachefile, (int) p->dirty, p->value_rwlock.users(), p->attr.size);
             }
             printf("\n");
             pair_unlock(p);
@@ -2806,7 +2928,7 @@ int toku_cachetable_get_key_state (CACHETABLE ct, CACHEKEY key, CACHEFILE cf, vo
         if (dirty_ptr)
             *dirty_ptr = p->dirty;
         if (pin_ptr)
-            *pin_ptr = nb_mutex_writers(&p->value_nb_mutex);
+            *pin_ptr = p->value_rwlock.users();
         if (size_ptr)
             *size_ptr = p->attr.size;
         r = 0;
@@ -3007,7 +3129,7 @@ int cleaner::run_cleaner(void) {
         // the cleaner thread from picking its PAIR (see comments in that function)
         do {
             pair_lock(m_pl->m_cleaner_head);
-            if (nb_mutex_users(&m_pl->m_cleaner_head->value_nb_mutex) > 0) {
+            if (m_pl->m_cleaner_head->value_rwlock.users() > 0) {
                 pair_unlock(m_pl->m_cleaner_head);
             }
             else {
@@ -3050,7 +3172,7 @@ int cleaner::run_cleaner(void) {
             pair_unlock(best_pair);
             continue;
         }
-        nb_mutex_lock(&best_pair->value_nb_mutex, &best_pair->mutex);
+        best_pair->value_rwlock.write_lock(true);
         pair_unlock(best_pair);
         // verify a key assumption.
         assert(cleaner_thread_rate_pair(best_pair) > 0);
@@ -3082,7 +3204,7 @@ int cleaner::run_cleaner(void) {
         // don't need to unlock it if the cleaner callback is called.
         if (!cleaner_callback_called) {
             pair_lock(best_pair);
-            nb_mutex_unlock(&best_pair->value_nb_mutex);
+            best_pair->value_rwlock.write_unlock();
             pair_unlock(best_pair);
         }
         // We need to make sure the cachefile sticks around so a close
@@ -3729,7 +3851,7 @@ bool evictor::run_eviction_on_pair(PAIR curr_in_clock) {
         goto exit;
     }
     pair_lock(curr_in_clock);
-    if (nb_mutex_users(&curr_in_clock->value_nb_mutex) ||
+    if (curr_in_clock->value_rwlock.users() ||
         nb_mutex_users(&curr_in_clock->disk_nb_mutex))
     {
         pair_unlock(curr_in_clock);
@@ -3744,7 +3866,7 @@ bool evictor::run_eviction_on_pair(PAIR curr_in_clock) {
         if (curr_in_clock->count > 0) {
             curr_in_clock->count--;
             // call the partial eviction callback
-            nb_mutex_lock(&curr_in_clock->value_nb_mutex, &curr_in_clock->mutex);
+            curr_in_clock->value_rwlock.write_lock(true);
             pair_unlock(curr_in_clock);
             void *value = curr_in_clock->value_data;
} }
else { else {
pair_lock(curr_in_clock); pair_lock(curr_in_clock);
nb_mutex_unlock(&curr_in_clock->value_nb_mutex); curr_in_clock->value_rwlock.write_unlock();
pair_unlock(curr_in_clock); pair_unlock(curr_in_clock);
bjm_remove_background_job(cf->bjm); bjm_remove_background_job(cf->bjm);
} }
@@ -3815,7 +3937,7 @@ void evictor::do_partial_eviction(PAIR p) {
     p->attr = new_attr;
     this->decrease_size_evicting(p->size_evicting_estimate);
     pair_lock(p);
-    nb_mutex_unlock(&p->value_nb_mutex);
+    p->value_rwlock.write_unlock();
     pair_unlock(p);
 }
...@@ -3833,8 +3955,8 @@ void evictor::try_evict_pair(PAIR p) { ...@@ -3833,8 +3955,8 @@ void evictor::try_evict_pair(PAIR p) {
// the only caller, run_eviction_on_pair, should call this function // the only caller, run_eviction_on_pair, should call this function
// only if no one else is trying to use it // only if no one else is trying to use it
assert(!nb_mutex_users(&p->value_nb_mutex)); assert(!p->value_rwlock.users());
nb_mutex_lock(&p->value_nb_mutex, &p->mutex); p->value_rwlock.write_lock(true);
// if the PAIR is dirty, the running eviction requires writing the // if the PAIR is dirty, the running eviction requires writing the
// PAIR out. if the disk_nb_mutex is grabbed, then running // PAIR out. if the disk_nb_mutex is grabbed, then running
// eviction requires waiting for the disk_nb_mutex to become available, // eviction requires waiting for the disk_nb_mutex to become available,
...@@ -3887,7 +4009,7 @@ void evictor::evict_pair(PAIR p, bool for_checkpoint) { ...@@ -3887,7 +4009,7 @@ void evictor::evict_pair(PAIR p, bool for_checkpoint) {
pair_unlock(p); pair_unlock(p);
m_pl->write_list_lock(); m_pl->write_list_lock();
pair_lock(p); pair_lock(p);
nb_mutex_unlock(&p->value_nb_mutex); p->value_rwlock.write_unlock();
nb_mutex_unlock(&p->disk_nb_mutex); nb_mutex_unlock(&p->disk_nb_mutex);
// at this point, we have the pair list's write list lock // at this point, we have the pair list's write list lock
// and we have the pair's mutex (p->mutex) held // and we have the pair's mutex (p->mutex) held
......
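A pattern worth noting across the cachetable.cc hunks above: every value_rwlock operation happens while the PAIR's mutex is held, and the mutex is dropped as soon as the lock state is updated. The new frwlock is built for this: a blocked acquirer sleeps inside toku_cond_wait, which releases the mutex while waiting. A minimal sketch of the pin/unpin pattern, reusing the patch's own pair_lock/pair_unlock helpers:

static void pin_pair_for_write(PAIR p, bool expensive) {
    pair_lock(p);                           // PAIR mutex guards the frwlock state
    p->value_rwlock.write_lock(expensive);  // may sleep; mutex released while waiting
    pair_unlock(p);                         // lock held, mutex no longer needed
}

static void unpin_pair_for_write(PAIR p) {
    pair_lock(p);
    p->value_rwlock.write_unlock();         // wakes the next writer or all queued readers
    pair_unlock(p);
}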
...@@ -209,6 +209,12 @@ CACHETABLE toku_cachefile_get_cachetable(CACHEFILE cf); ...@@ -209,6 +209,12 @@ CACHETABLE toku_cachefile_get_cachetable(CACHEFILE cf);
// Effect: Get the cachetable. // Effect: Get the cachetable.
typedef enum {
PL_READ = 0,
PL_WRITE_CHEAP,
PL_WRITE_EXPENSIVE
} pair_lock_type;
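The enum replaces the old bool may_modify_value/may_modify_node flags: PL_READ pins with a shared lock, and the two write variants additionally tell the lock whether the pin is expected to be held for a long time (for example across disk I/O). A sketch of the intended mapping onto frwlock, via a hypothetical lock_pair_value() helper that is not part of this patch; it assumes the PAIR's mutex is already held:

static void lock_pair_value(toku::frwlock *rw, pair_lock_type lock_type) {
    switch (lock_type) {
    case PL_READ:
        rw->read_lock();        // shared: many concurrent pins
        break;
    case PL_WRITE_CHEAP:
        rw->write_lock(false);  // exclusive, short critical section expected
        break;
    case PL_WRITE_EXPENSIVE:
        rw->write_lock(true);   // exclusive, waiters are told it is expensive
        break;
    }
}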
// put something into the cachetable and checkpoint dependent pairs // put something into the cachetable and checkpoint dependent pairs
// if the checkpointing is necessary // if the checkpointing is necessary
int toku_cachetable_put_with_dep_pairs( int toku_cachetable_put_with_dep_pairs(
...@@ -265,7 +271,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched ( ...@@ -265,7 +271,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs_batched (
CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
bool may_modify_value, pair_lock_type lock_type,
void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs
...@@ -286,7 +292,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( ...@@ -286,7 +292,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback, CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback,
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback,
bool may_modify_value, pair_lock_type lock_type,
void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback void* read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint uint32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs CACHEFILE* dependent_cfs, // array of cachefiles of dependent pairs
...@@ -355,7 +361,7 @@ int toku_cachetable_get_and_pin_nonblocking_batched ( ...@@ -355,7 +361,7 @@ int toku_cachetable_get_and_pin_nonblocking_batched (
CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)), CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)),
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)), CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)),
bool may_modify_value, pair_lock_type lock_type,
void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
UNLOCKERS unlockers UNLOCKERS unlockers
); );
...@@ -372,7 +378,7 @@ int toku_cachetable_get_and_pin_nonblocking ( ...@@ -372,7 +378,7 @@ int toku_cachetable_get_and_pin_nonblocking (
CACHETABLE_FETCH_CALLBACK fetch_callback, CACHETABLE_FETCH_CALLBACK fetch_callback,
CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)), CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK pf_req_callback __attribute__((unused)),
CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)), CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback __attribute__((unused)),
bool may_modify_value, pair_lock_type lock_type,
void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback void *read_extraargs, // parameter for fetch_callback, pf_req_callback, and pf_callback
UNLOCKERS unlockers UNLOCKERS unlockers
); );
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id: frwlock.h 45930 2012-07-19 19:18:35Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_assert.h>
namespace toku {
void frwlock::init(toku_mutex_t *const mutex) {
m_mutex = mutex;
m_num_readers = 0;
m_num_writers = 0;
m_num_want_write = 0;
m_num_want_read = 0;
m_num_signaled_readers = 0;
m_num_expensive_want_write = 0;
toku_cond_init(&m_wait_read, nullptr);
m_queue_item_read = { .cond = &m_wait_read, .next = nullptr };
m_wait_read_is_in_queue = false;
m_current_writer_expensive = false;
m_read_wait_expensive = false;
m_wait_head = nullptr;
m_wait_tail = nullptr;
}
void frwlock::deinit(void) {
toku_cond_destroy(&m_wait_read);
}
inline bool frwlock::queue_is_empty(void) const {
return m_wait_head == nullptr;
}
inline void frwlock::enq_item(queue_item *const item) {
invariant_null(item->next);
if (m_wait_tail != nullptr) {
m_wait_tail->next = item;
} else {
invariant_null(m_wait_head);
m_wait_head = item;
}
m_wait_tail = item;
}
inline toku_cond_t *frwlock::deq_item(void) {
invariant_notnull(m_wait_head);
invariant_notnull(m_wait_tail);
queue_item *item = m_wait_head;
m_wait_head = m_wait_head->next;
if (m_wait_tail == item) {
m_wait_tail = nullptr;
}
return item->cond;
}
// Prerequisite: Holds m_mutex.
inline void frwlock::write_lock(bool expensive) {
if (this->try_write_lock(expensive)) {
return;
}
toku_cond_t cond = TOKU_COND_INITIALIZER;
queue_item item = { .cond = &cond, .next = nullptr };
this->enq_item(&item);
// Wait for our turn.
++m_num_want_write;
if (expensive) {
++m_num_expensive_want_write;
}
toku_cond_wait(&cond, m_mutex);
toku_cond_destroy(&cond);
// Now it's our turn.
invariant(m_num_want_write > 0);
invariant_zero(m_num_readers);
invariant_zero(m_num_writers);
invariant_zero(m_num_signaled_readers);
// Not waiting anymore; grab the lock.
--m_num_want_write;
if (expensive) {
--m_num_expensive_want_write;
}
m_num_writers = 1;
m_current_writer_expensive = expensive;
}
inline bool frwlock::try_write_lock(bool expensive) {
if (m_num_readers > 0 || m_num_writers > 0 || m_num_signaled_readers > 0 || m_num_want_write > 0) {
return false;
}
// No one holds the lock. Grant the write lock.
invariant_zero(m_num_want_write);
invariant_zero(m_num_want_read);
m_num_writers = 1;
m_current_writer_expensive = expensive;
return true;
}
inline void frwlock::read_lock(void) {
if (m_num_writers > 0 || m_num_want_write > 0) {
if (!m_wait_read_is_in_queue) {
// Throw the read cond_t onto the queue.
invariant(m_num_signaled_readers == m_num_want_read);
m_queue_item_read.next = nullptr;
this->enq_item(&m_queue_item_read);
m_wait_read_is_in_queue = true;
invariant(!m_read_wait_expensive);
m_read_wait_expensive = (
m_current_writer_expensive ||
(m_num_expensive_want_write > 0)
);
}
// Wait for our turn.
++m_num_want_read;
toku_cond_wait(&m_wait_read, m_mutex);
// Now it's our turn.
invariant_zero(m_num_writers);
invariant(m_num_want_read > 0);
invariant(m_num_signaled_readers > 0);
// Not waiting anymore; grab the lock.
--m_num_want_read;
--m_num_signaled_readers;
}
++m_num_readers;
}
inline bool frwlock::try_read_lock(void) {
if (m_num_writers > 0 || m_num_want_write > 0) {
return false;
}
// No writer holds the lock.
// No writers are waiting.
// Grant the read lock.
++m_num_readers;
return true;
}
inline void frwlock::maybe_signal_next_writer(void) {
if (m_num_want_write > 0 && m_num_signaled_readers == 0 && m_num_readers == 0) {
toku_cond_t *cond = this->deq_item();
invariant(cond != &m_wait_read);
// Grant write lock to waiting writer.
invariant(m_num_want_write > 0);
toku_cond_signal(cond);
}
}
inline void frwlock::read_unlock(void) {
invariant(m_num_writers == 0);
invariant(m_num_readers > 0);
--m_num_readers;
this->maybe_signal_next_writer();
}
inline bool frwlock::read_lock_is_expensive(void) {
if (m_wait_read_is_in_queue) {
return m_read_wait_expensive;
}
else {
return m_current_writer_expensive || (m_num_expensive_want_write > 0);
}
}
inline void frwlock::maybe_signal_or_broadcast_next(void) {
invariant(m_num_signaled_readers == 0);
if (this->queue_is_empty()) {
invariant(m_num_want_write == 0);
invariant(m_num_want_read == 0);
return;
}
toku_cond_t *cond = this->deq_item();
if (cond == &m_wait_read) {
// Grant read locks to all waiting readers
invariant(m_wait_read_is_in_queue);
invariant(m_num_want_read > 0);
m_num_signaled_readers = m_num_want_read;
m_wait_read_is_in_queue = false;
m_read_wait_expensive = false;
toku_cond_broadcast(cond);
}
else {
// Grant write lock to waiting writer.
invariant(m_num_want_write > 0);
toku_cond_signal(cond);
}
}
inline void frwlock::write_unlock(void) {
invariant(m_num_writers == 1);
m_num_writers = 0;
m_current_writer_expensive = false;
this->maybe_signal_or_broadcast_next();
}
inline bool frwlock::write_lock_is_expensive(void) {
return (m_num_expensive_want_write > 0) || (m_current_writer_expensive);
}
inline uint32_t frwlock::users(void) const {
return m_num_readers + m_num_writers + m_num_want_read + m_num_want_write;
}
inline uint32_t frwlock::blocked_users(void) const {
return m_num_want_read + m_num_want_write;
}
inline uint32_t frwlock::writers(void) const {
return m_num_writers;
}
inline uint32_t frwlock::blocked_writers(void) const {
return m_num_want_write;
}
inline uint32_t frwlock::readers(void) const {
return m_num_readers;
}
inline uint32_t frwlock::blocked_readers(void) const {
return m_num_want_read;
}
} // namespace toku
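To summarize the implementation above: waiters are served in FIFO order; each writer queues its own condition variable and is woken with a signal, while all queued readers share the single m_wait_read item and are released together with a broadcast; and every method assumes the external mutex from init() is held. A minimal usage sketch under that assumption:

#include <toku_pthread.h>
#include "frwlock.h"

static toku_mutex_t example_mutex;
static toku::frwlock example_lock;

static void example(void) {
    toku_mutex_init(&example_mutex, nullptr);
    example_lock.init(&example_mutex);

    toku_mutex_lock(&example_mutex);
    example_lock.write_lock(true);   // "expensive": e.g. about to touch disk
    toku_mutex_unlock(&example_mutex);

    // ... long critical section, mutex not held ...

    toku_mutex_lock(&example_mutex);
    example_lock.write_unlock();     // wakes the next writer or all readers
    toku_mutex_unlock(&example_mutex);

    example_lock.deinit();
    toku_mutex_destroy(&example_mutex);
}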
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ifndef TOKU_FRWLOCK_H
#define TOKU_FRWLOCK_H
#ident "$Id: frwlock.h 45930 2012-07-19 19:18:35Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h>
#include <toku_pthread.h>
#include <stdbool.h>
#include <stdint.h>
// frwlock: a fair read/write lock protected by an external mutex.
// Every method assumes the mutex passed to init() is held. Writers may
// declare themselves "expensive" (long critical section, e.g. disk I/O)
// so prospective lockers can ask whether blocking here would be costly.
namespace toku {
class frwlock {
public:
void init(toku_mutex_t *const mutex);
void deinit(void);
inline void write_lock(bool expensive);
inline bool try_write_lock(bool expensive);
inline void write_unlock(void);
// returns true if acquiring a write lock will be expensive
inline bool write_lock_is_expensive(void);
inline void read_lock(void);
inline bool try_read_lock(void);
inline void read_unlock(void);
// returns true if acquiring a read lock will be expensive
inline bool read_lock_is_expensive(void);
inline uint32_t users(void) const;
inline uint32_t blocked_users(void) const;
inline uint32_t writers(void) const;
inline uint32_t blocked_writers(void) const;
inline uint32_t readers(void) const;
inline uint32_t blocked_readers(void) const;
private:
struct queue_item {
toku_cond_t *cond;
struct queue_item *next;
};
inline bool queue_is_empty(void) const;
inline void enq_item(queue_item *const item);
inline toku_cond_t *deq_item(void);
inline void maybe_signal_or_broadcast_next(void);
inline void maybe_signal_next_writer(void);
toku_mutex_t *m_mutex;
uint32_t m_num_readers;
uint32_t m_num_writers;
uint32_t m_num_want_write;
uint32_t m_num_want_read;
uint32_t m_num_signaled_readers;
// number of writers waiting that are expensive
// MUST be <= m_num_want_write
uint32_t m_num_expensive_want_write;
// bool that states if the current writer is expensive
// if there is no current writer, then it is false
bool m_current_writer_expensive;
// bool that states if waiting for the read lock is expensive
// if there are currently no waiting readers, this is false
bool m_read_wait_expensive;
toku_cond_t m_wait_read;
queue_item m_queue_item_read;
bool m_wait_read_is_in_queue;
queue_item *m_wait_head;
queue_item *m_wait_tail;
};
static_assert(std::is_pod<frwlock>::value, "not pod");
} // namespace toku
// include the implementation here
#include "frwlock.cc"
#endif
...@@ -61,7 +61,6 @@ cachetable_put_empty_node_with_dep_nodes( ...@@ -61,7 +61,6 @@ cachetable_put_empty_node_with_dep_nodes(
fullhash, fullhash,
toku_node_save_ct_pair); toku_node_save_ct_pair);
assert_zero(r); assert_zero(r);
*result = new_node; *result = new_node;
} }
...@@ -129,7 +128,7 @@ toku_pin_ftnode( ...@@ -129,7 +128,7 @@ toku_pin_ftnode(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS bounds, const PIVOT_BOUNDS bounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p, FTNODE *node_p,
bool* msgs_applied) bool* msgs_applied)
...@@ -143,7 +142,7 @@ toku_pin_ftnode( ...@@ -143,7 +142,7 @@ toku_pin_ftnode(
ancestors, ancestors,
bounds, bounds,
bfe, bfe,
may_modify_node, lock_type,
apply_ancestor_messages, apply_ancestor_messages,
false, false,
node_p, node_p,
...@@ -162,7 +161,7 @@ toku_pin_ftnode_batched( ...@@ -162,7 +161,7 @@ toku_pin_ftnode_batched(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS bounds, const PIVOT_BOUNDS bounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
bool end_batch_on_success, bool end_batch_on_success,
FTNODE *node_p, FTNODE *node_p,
...@@ -180,7 +179,7 @@ toku_pin_ftnode_batched( ...@@ -180,7 +179,7 @@ toku_pin_ftnode_batched(
toku_ftnode_fetch_callback, toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback, toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback, toku_ftnode_pf_callback,
may_modify_node, lock_type,
bfe, //read_extraargs bfe, //read_extraargs
unlockers); unlockers);
if (r==0) { if (r==0) {
...@@ -191,7 +190,7 @@ toku_pin_ftnode_batched( ...@@ -191,7 +190,7 @@ toku_pin_ftnode_batched(
if (apply_ancestor_messages && node->height == 0) { if (apply_ancestor_messages && node->height == 0) {
toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied); toku_apply_ancestors_messages_to_node(brt, node, ancestors, bounds, msgs_applied);
} }
if (may_modify_node && node->height > 0) { if ((lock_type != PL_READ) && node->height > 0) {
toku_move_ftnode_messages_to_stale(brt->ft, node); toku_move_ftnode_messages_to_stale(brt->ft, node);
} }
*node_p = node; *node_p = node;
...@@ -209,7 +208,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages( ...@@ -209,7 +208,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p, FTNODE *node_p,
...@@ -221,7 +220,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages( ...@@ -221,7 +220,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
blocknum, blocknum,
fullhash, fullhash,
bfe, bfe,
may_modify_node, lock_type,
num_dependent_nodes, num_dependent_nodes,
dependent_nodes, dependent_nodes,
node_p, node_p,
...@@ -236,13 +235,13 @@ toku_pin_ftnode_off_client_thread( ...@@ -236,13 +235,13 @@ toku_pin_ftnode_off_client_thread(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p) FTNODE *node_p)
{ {
toku_pin_ftnode_off_client_thread_and_maybe_move_messages( toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
h, blocknum, fullhash, bfe, may_modify_node, num_dependent_nodes, dependent_nodes, node_p, true); h, blocknum, fullhash, bfe, lock_type, num_dependent_nodes, dependent_nodes, node_p, true);
} }
void void
...@@ -251,7 +250,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages( ...@@ -251,7 +250,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p, FTNODE *node_p,
...@@ -279,7 +278,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages( ...@@ -279,7 +278,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
toku_ftnode_fetch_callback, toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback, toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback, toku_ftnode_pf_callback,
may_modify_node, lock_type,
bfe, bfe,
num_dependent_nodes, num_dependent_nodes,
dependent_cf, dependent_cf,
...@@ -289,7 +288,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages( ...@@ -289,7 +288,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
); );
assert(r==0); assert(r==0);
FTNODE node = (FTNODE) node_v; FTNODE node = (FTNODE) node_v;
if (may_modify_node && node->height > 0 && move_messages) { if ((lock_type != PL_READ) && node->height > 0 && move_messages) {
toku_move_ftnode_messages_to_stale(h, node); toku_move_ftnode_messages_to_stale(h, node);
} }
*node_p = node; *node_p = node;
...@@ -301,23 +300,23 @@ toku_pin_ftnode_off_client_thread_batched( ...@@ -301,23 +300,23 @@ toku_pin_ftnode_off_client_thread_batched(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p) FTNODE *node_p)
{ {
toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages( toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
h, blocknum, fullhash, bfe, may_modify_node, num_dependent_nodes, dependent_nodes, node_p, true); h, blocknum, fullhash, bfe, lock_type, num_dependent_nodes, dependent_nodes, node_p, true);
} }
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep, bool may_modify_node) { int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep) {
void *node_v; void *node_v;
int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, &node_v); int r = toku_cachetable_maybe_get_and_pin_clean(ft->cf, blocknum, fullhash, &node_v);
if (r != 0) { if (r != 0) {
goto cleanup; goto cleanup;
} }
CAST_FROM_VOIDP(*nodep, node_v); CAST_FROM_VOIDP(*nodep, node_v);
if (may_modify_node && (*nodep)->height > 0) { if ((*nodep)->height > 0) {
toku_move_ftnode_messages_to_stale(ft, *nodep); toku_move_ftnode_messages_to_stale(ft, *nodep);
} }
cleanup: cleanup:
......
...@@ -69,7 +69,7 @@ toku_pin_ftnode( ...@@ -69,7 +69,7 @@ toku_pin_ftnode(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS pbounds, const PIVOT_BOUNDS pbounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
FTNODE *node_p, FTNODE *node_p,
bool* msgs_applied bool* msgs_applied
...@@ -88,7 +88,7 @@ toku_pin_ftnode_batched( ...@@ -88,7 +88,7 @@ toku_pin_ftnode_batched(
ANCESTORS ancestors, ANCESTORS ancestors,
const PIVOT_BOUNDS pbounds, const PIVOT_BOUNDS pbounds,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this bool apply_ancestor_messages, // this bool is probably temporary, for #3972, once we know how range query estimates work, will revisit this
bool end_batch_on_success, bool end_batch_on_success,
FTNODE *node_p, FTNODE *node_p,
...@@ -108,7 +108,7 @@ toku_pin_ftnode_off_client_thread( ...@@ -108,7 +108,7 @@ toku_pin_ftnode_off_client_thread(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p FTNODE *node_p
...@@ -120,7 +120,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages( ...@@ -120,7 +120,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p, FTNODE *node_p,
...@@ -131,7 +131,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages( ...@@ -131,7 +131,7 @@ toku_pin_ftnode_off_client_thread_and_maybe_move_messages(
* This function may return a pinned ftnode to the caller, if pinning is cheap. * This function may return a pinned ftnode to the caller, if pinning is cheap.
* If the node is already locked, or is pending a checkpoint, the node is not pinned and -1 is returned. * If the node is already locked, or is pending a checkpoint, the node is not pinned and -1 is returned.
*/ */
int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep, bool may_modify_node); int toku_maybe_pin_ftnode_clean(FT ft, BLOCKNUM blocknum, uint32_t fullhash, FTNODE *nodep);
/** /**
* Batched version of toku_pin_ftnode_off_client_thread, see cachetable * Batched version of toku_pin_ftnode_off_client_thread, see cachetable
...@@ -143,7 +143,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages( ...@@ -143,7 +143,7 @@ toku_pin_ftnode_off_client_thread_batched_and_maybe_move_messages(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p, FTNODE *node_p,
...@@ -160,7 +160,7 @@ toku_pin_ftnode_off_client_thread_batched( ...@@ -160,7 +160,7 @@ toku_pin_ftnode_off_client_thread_batched(
BLOCKNUM blocknum, BLOCKNUM blocknum,
uint32_t fullhash, uint32_t fullhash,
FTNODE_FETCH_EXTRA bfe, FTNODE_FETCH_EXTRA bfe,
bool may_modify_node, pair_lock_type lock_type,
uint32_t num_dependent_nodes, uint32_t num_dependent_nodes,
FTNODE* dependent_nodes, FTNODE* dependent_nodes,
FTNODE *node_p FTNODE *node_p
......
...@@ -402,7 +402,7 @@ ct_maybe_merge_child(struct flusher_advice *fa, ...@@ -402,7 +402,7 @@ ct_maybe_merge_child(struct flusher_advice *fa,
toku_calculate_root_offset_pointer(h, &root, &fullhash); toku_calculate_root_offset_pointer(h, &root, &fullhash);
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, h); fill_bfe_for_full_read(&bfe, h);
toku_pin_ftnode_off_client_thread(h, root, fullhash, &bfe, true, 0, NULL, &root_node); toku_pin_ftnode_off_client_thread(h, root, fullhash, &bfe, PL_WRITE_EXPENSIVE, 0, NULL, &root_node);
toku_assert_entire_node_in_memory(root_node); toku_assert_entire_node_in_memory(root_node);
toku_ft_release_treelock(h); toku_ft_release_treelock(h);
...@@ -1342,7 +1342,7 @@ ft_merge_child( ...@@ -1342,7 +1342,7 @@ ft_merge_child(
uint32_t childfullhash = compute_child_fullhash(h->cf, node, childnuma); uint32_t childfullhash = compute_child_fullhash(h->cf, node, childnuma);
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, h); fill_bfe_for_full_read(&bfe, h);
toku_pin_ftnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, true, 1, &node, &childa); toku_pin_ftnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, PL_WRITE_EXPENSIVE, 1, &node, &childa);
} }
// for test // for test
call_flusher_thread_callback(flt_flush_before_pin_second_node_for_merge); call_flusher_thread_callback(flt_flush_before_pin_second_node_for_merge);
...@@ -1353,7 +1353,7 @@ ft_merge_child( ...@@ -1353,7 +1353,7 @@ ft_merge_child(
uint32_t childfullhash = compute_child_fullhash(h->cf, node, childnumb); uint32_t childfullhash = compute_child_fullhash(h->cf, node, childnumb);
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, h); fill_bfe_for_full_read(&bfe, h);
toku_pin_ftnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, true, 2, dep_nodes, &childb); toku_pin_ftnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, PL_WRITE_EXPENSIVE, 2, dep_nodes, &childb);
} }
if (toku_bnc_n_entries(BNC(node,childnuma))>0) { if (toku_bnc_n_entries(BNC(node,childnuma))>0) {
...@@ -1486,7 +1486,7 @@ flush_some_child( ...@@ -1486,7 +1486,7 @@ flush_some_child(
// Note that we don't read the entire node into memory yet. // Note that we don't read the entire node into memory yet.
// The idea is let's try to do the minimum work before releasing the parent lock // The idea is let's try to do the minimum work before releasing the parent lock
fill_bfe_for_min_read(&bfe, h); fill_bfe_for_min_read(&bfe, h);
toku_pin_ftnode_off_client_thread(h, targetchild, childfullhash, &bfe, true, 1, &parent, &child); toku_pin_ftnode_off_client_thread(h, targetchild, childfullhash, &bfe, PL_WRITE_EXPENSIVE, 1, &parent, &child);
// for test // for test
call_flusher_thread_callback(ft_flush_aflter_child_pin); call_flusher_thread_callback(ft_flush_aflter_child_pin);
...@@ -1785,7 +1785,7 @@ flush_node_on_background_thread(FT h, FTNODE parent) ...@@ -1785,7 +1785,7 @@ flush_node_on_background_thread(FT h, FTNODE parent)
// //
FTNODE child; FTNODE child;
uint32_t childfullhash = compute_child_fullhash(h->cf, parent, childnum); uint32_t childfullhash = compute_child_fullhash(h->cf, parent, childnum);
int r = toku_maybe_pin_ftnode_clean(h, BP_BLOCKNUM(parent, childnum), childfullhash, &child, true); int r = toku_maybe_pin_ftnode_clean(h, BP_BLOCKNUM(parent, childnum), childfullhash, &child);
if (r != 0) { if (r != 0) {
// In this case, we could not lock the child, so just place the parent on the background thread // In this case, we could not lock the child, so just place the parent on the background thread
// In the callback, we will use flush_some_child, which checks to // In the callback, we will use flush_some_child, which checks to
......
...@@ -277,7 +277,7 @@ toku_ft_hot_optimize(FT_HANDLE brt, ...@@ -277,7 +277,7 @@ toku_ft_hot_optimize(FT_HANDLE brt,
(BLOCKNUM) root_key, (BLOCKNUM) root_key,
fullhash, fullhash,
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&root); &root);
......
...@@ -265,7 +265,13 @@ struct ftnode { ...@@ -265,7 +265,13 @@ struct ftnode {
// macros for managing a node's clock // macros for managing a node's clock
// Should be managed by ft-ops.c, NOT by serialize/deserialize // Should be managed by ft-ops.c, NOT by serialize/deserialize
// //
#define BP_TOUCH_CLOCK(node, i) ((node)->bp[i].clock_count = 1)
//
// BP_TOUCH_CLOCK uses a compare and swap because multiple threads
// that have a read lock on an internal node may try to touch the clock
// simultaneously
//
#define BP_TOUCH_CLOCK(node, i) ((void) __sync_val_compare_and_swap(&(node)->bp[i].clock_count, 0, 1))
#define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0) #define BP_SWEEP_CLOCK(node, i) ((node)->bp[i].clock_count = 0)
#define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0) #define BP_SHOULD_EVICT(node, i) ((node)->bp[i].clock_count == 0)
// not crazy about having these two here, one is for the case where we create new // not crazy about having these two here, one is for the case where we create new
......
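The switch from a plain store to a compare-and-swap lets multiple readers of the same internal node touch a partition's clock concurrently without a data race. A standalone sketch of the pattern with the same GCC __sync builtin (the names below are illustrative, not from the patch):

#include <stdint.h>

static uint32_t clock_count;   // stand-in for bp[i].clock_count

// Many reader threads may call this at once: the CAS only ever
// transitions 0 -> 1, so concurrent touches cannot conflict.
static void touch_clock(void) {
    (void) __sync_val_compare_and_swap(&clock_count, 0, 1);
}

// The evictor owns the sweep, so a plain store suffices on its side.
static void sweep_clock(void) {
    clock_count = 0;
}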
...@@ -1443,9 +1443,9 @@ toku_ft_bn_apply_cmd_once ( ...@@ -1443,9 +1443,9 @@ toku_ft_bn_apply_cmd_once (
} }
} }
if (workdone) { // test programs may call with NULL if (workdone) { // test programs may call with NULL
*workdone += workdone_this_le; uint64_t new_workdone = __sync_add_and_fetch(workdone, workdone_this_le);
if (*workdone > STATUS_VALUE(FT_MAX_WORKDONE)) if (new_workdone > STATUS_VALUE(FT_MAX_WORKDONE))
STATUS_VALUE(FT_MAX_WORKDONE) = *workdone; STATUS_VALUE(FT_MAX_WORKDONE) = new_workdone;
} }
// if we created a new mempool buffer, free the old one // if we created a new mempool buffer, free the old one
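workdone can now be bumped from multiple threads, so the accumulation uses an atomic add; the running max is still a plain read-modify-write, which can lose a race but only skews a statistic. A standalone sketch of the same pattern (illustrative names):

#include <stdint.h>

static uint64_t workdone_total;   // updated atomically
static uint64_t workdone_max;     // best-effort statistic

static void account_work(uint64_t delta) {
    uint64_t new_total = __sync_add_and_fetch(&workdone_total, delta);
    if (new_total > workdone_max) {
        workdone_max = new_total;  // racy update, acceptable for a status counter
    }
}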
...@@ -2511,7 +2511,7 @@ toku_ft_root_put_cmd (FT ft, FT_MSG_S * cmd) ...@@ -2511,7 +2511,7 @@ toku_ft_root_put_cmd (FT ft, FT_MSG_S * cmd)
root_key, root_key,
fullhash, fullhash,
&bfe, &bfe,
true, // may_modify_node PL_WRITE_EXPENSIVE, // write lock: we may modify the node
0, 0,
NULL, NULL,
&node &node
...@@ -4354,7 +4354,8 @@ struct unlock_ftnode_extra { ...@@ -4354,7 +4354,8 @@ struct unlock_ftnode_extra {
// When this is called, the cachetable lock is held // When this is called, the cachetable lock is held
static void static void
unlock_ftnode_fun (void *v) { unlock_ftnode_fun (void *v) {
struct unlock_ftnode_extra *CAST_FROM_VOIDP(x, v); struct unlock_ftnode_extra *x = NULL;
CAST_FROM_VOIDP(x, v);
FT_HANDLE brt = x->ft_handle; FT_HANDLE brt = x->ft_handle;
FTNODE node = x->node; FTNODE node = x->node;
// CT lock is held // CT lock is held
...@@ -4392,11 +4393,12 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F ...@@ -4392,11 +4393,12 @@ ft_search_child(FT_HANDLE brt, FTNODE node, int childnum, ft_search_t *search, F
); );
bool msgs_applied = false; bool msgs_applied = false;
{ {
pair_lock_type lock_type = (node->height == 1) ? PL_WRITE_CHEAP : PL_READ;
int rr = toku_pin_ftnode_batched(brt, childblocknum, fullhash, int rr = toku_pin_ftnode_batched(brt, childblocknum, fullhash,
unlockers, unlockers,
&next_ancestors, bounds, &next_ancestors, bounds,
&bfe, &bfe,
(node->height == 1), // may_modify_node true iff child is leaf lock_type, // PL_WRITE_CHEAP iff child is a leaf, else PL_READ
true, true,
(node->height == 1), // end_batch_on_success true iff child is a leaf (node->height == 1), // end_batch_on_success true iff child is a leaf
&childnode, &childnode,
...@@ -4745,7 +4747,7 @@ toku_ft_search (FT_HANDLE brt, ft_search_t *search, FT_GET_CALLBACK_FUNCTION get ...@@ -4745,7 +4747,7 @@ toku_ft_search (FT_HANDLE brt, ft_search_t *search, FT_GET_CALLBACK_FUNCTION get
root_key, root_key,
fullhash, fullhash,
&bfe, &bfe,
false, // may_modify_node set to false, because root cannot change during search PL_READ, // read lock: the root cannot change during a search
0, 0,
NULL, NULL,
&node &node
...@@ -5258,7 +5260,7 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node, ...@@ -5258,7 +5260,7 @@ toku_ft_keyrange_internal (FT_HANDLE brt, FTNODE node,
&next_ancestors, &next_ancestors,
bounds, bounds,
bfe, bfe,
false, // may_modify_node is false, because node guaranteed to not change PL_READ, // read lock: the node is guaranteed not to change
false, false,
&childnode, &childnode,
&msgs_applied &msgs_applied
...@@ -5315,7 +5317,7 @@ toku_ft_keyrange (FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal_p, ...@@ -5315,7 +5317,7 @@ toku_ft_keyrange (FT_HANDLE brt, DBT *key, uint64_t *less_p, uint64_t *equal_p,
root_key, root_key,
fullhash, fullhash,
&bfe, &bfe,
false, // may_modify_node, cannot change root during keyrange PL_READ, // read lock: the root cannot change during keyrange
0, 0,
NULL, NULL,
&node &node
...@@ -5361,27 +5363,21 @@ static int ...@@ -5361,27 +5363,21 @@ static int
toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const DBT *lorange, const DBT *hirange) { toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const DBT *lorange, const DBT *hirange) {
int result=0; int result=0;
FTNODE node; FTNODE node;
void* node_v;
toku_get_node_for_verify(blocknum, brt, &node); toku_get_node_for_verify(blocknum, brt, &node);
result=toku_verify_ftnode(brt, ZERO_MSN, ZERO_MSN, node, -1, lorange, hirange, NULL, NULL, 0, 1, 0); result=toku_verify_ftnode(brt, ZERO_MSN, ZERO_MSN, node, -1, lorange, hirange, NULL, NULL, 0, 1, 0);
uint32_t fullhash = toku_cachetable_hash(brt->ft->cf, blocknum); uint32_t fullhash = toku_cachetable_hash(brt->ft->cf, blocknum);
struct ftnode_fetch_extra bfe; struct ftnode_fetch_extra bfe;
fill_bfe_for_full_read(&bfe, brt->ft); fill_bfe_for_full_read(&bfe, brt->ft);
int r = toku_cachetable_get_and_pin( toku_pin_ftnode_off_client_thread(
brt->ft->cf, brt->ft,
blocknum, blocknum,
fullhash, fullhash,
&node_v, &bfe,
PL_WRITE_EXPENSIVE,
0,
NULL, NULL,
get_write_callbacks_for_node(brt->ft), &node
toku_ftnode_fetch_callback,
toku_ftnode_pf_req_callback,
toku_ftnode_pf_callback,
true, // may_modify_value, just safe to set to true, I think it could theoretically be false
&bfe
); );
assert_zero(r);
CAST_FROM_VOIDP(node, node_v);
assert(node->fullhash==fullhash); assert(node->fullhash==fullhash);
fprintf(file, "%*sNode=%p\n", depth, "", node); fprintf(file, "%*sNode=%p\n", depth, "", node);
...@@ -5411,7 +5407,7 @@ toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const ...@@ -5411,7 +5407,7 @@ toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const
if (0) if (0)
for (int j=0; j<size; j++) { for (int j=0; j<size; j++) {
OMTVALUE v = 0; OMTVALUE v = 0;
r = toku_omt_fetch(BLB_BUFFER(node, i), j, &v); int r = toku_omt_fetch(BLB_BUFFER(node, i), j, &v);
assert_zero(r); assert_zero(r);
LEAFENTRY CAST_FROM_VOIDP(le, v); LEAFENTRY CAST_FROM_VOIDP(le, v);
fprintf(file, " [%d]=", j); fprintf(file, " [%d]=", j);
...@@ -5435,8 +5431,7 @@ toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const ...@@ -5435,8 +5431,7 @@ toku_dump_ftnode (FILE *file, FT_HANDLE brt, BLOCKNUM blocknum, int depth, const
} }
} }
} }
r = toku_cachetable_unpin(brt->ft->cf, node->ct_pair, CACHETABLE_CLEAN, make_ftnode_pair_attr(node)); toku_unpin_ftnode_off_client_thread(brt->ft, node);
assert_zero(r);
return result; return result;
} }
...@@ -5590,7 +5585,7 @@ static bool is_empty_fast_iter (FT_HANDLE brt, FTNODE node) { ...@@ -5590,7 +5585,7 @@ static bool is_empty_fast_iter (FT_HANDLE brt, FTNODE node) {
childblocknum, childblocknum,
fullhash, fullhash,
&bfe, &bfe,
false, // may_modify_node set to false, as nodes not modified PL_READ, // read lock: the nodes are not modified
0, 0,
NULL, NULL,
&childnode &childnode
...@@ -5631,7 +5626,7 @@ bool toku_ft_is_empty_fast (FT_HANDLE brt) ...@@ -5631,7 +5626,7 @@ bool toku_ft_is_empty_fast (FT_HANDLE brt)
root_key, root_key,
fullhash, fullhash,
&bfe, &bfe,
false, // may_modify_node set to false, node does not change PL_READ, // read lock: the node does not change
0, 0,
NULL, NULL,
&node &node
......
...@@ -169,7 +169,7 @@ toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t) ...@@ -169,7 +169,7 @@ toku_pin_node_with_min_bfe(FTNODE* node, BLOCKNUM b, FT_HANDLE t)
b, b,
toku_cachetable_hash(t->ft->cf, b), toku_cachetable_hash(t->ft->cf, b),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
node node
......
...@@ -234,7 +234,7 @@ toku_get_node_for_verify( ...@@ -234,7 +234,7 @@ toku_get_node_for_verify(
blocknum, blocknum,
fullhash, fullhash,
&bfe, &bfe,
true, // may_modify_node PL_WRITE_EXPENSIVE, // write lock: verify may modify the node
0, 0,
NULL, NULL,
nodep, nodep,
...@@ -446,15 +446,7 @@ toku_verify_ftnode (FT_HANDLE brt, ...@@ -446,15 +446,7 @@ toku_verify_ftnode (FT_HANDLE brt,
} }
} }
done: done:
{ toku_unpin_ftnode(brt->ft, node);
int r = toku_cachetable_unpin(
brt->ft->cf,
node->ct_pair,
CACHETABLE_CLEAN,
make_ftnode_pair_attr(node)
);
assert_zero(r); // this is a bad failure if it happens.
}
if (result == 0 && progress_callback) if (result == 0 && progress_callback)
result = progress_callback(progress_extra, 0.0); result = progress_callback(progress_extra, 0.0);
......
...@@ -8,7 +8,9 @@ ...@@ -8,7 +8,9 @@
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <toku_portability.h> #include <toku_portability.h>
#include <valgrind/drd.h>
#include <stdint.h> #include <stdint.h>
#include "memory.h"
#include "growable_array.h" #include "growable_array.h"
namespace toku { namespace toku {
...@@ -117,7 +119,10 @@ class subtree_templated<true> { ...@@ -117,7 +119,10 @@ class subtree_templated<true> {
} }
inline uint32_t get_index(void) const { inline uint32_t get_index(void) const {
return m_bitfield & MASK_INDEX; TOKU_DRD_IGNORE_VAR(m_bitfield);
const uint32_t bits = m_bitfield;
TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
return bits & MASK_INDEX;
} }
inline void set_index(uint32_t index) { inline void set_index(uint32_t index) {
...@@ -126,11 +131,23 @@ class subtree_templated<true> { ...@@ -126,11 +131,23 @@ class subtree_templated<true> {
} }
inline bool get_bit(void) const { inline bool get_bit(void) const {
return (m_bitfield & MASK_BIT) != 0; TOKU_DRD_IGNORE_VAR(m_bitfield);
const uint32_t bits = m_bitfield;
TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
return (bits & MASK_BIT) != 0;
} }
inline void enable_bit(void) { inline void enable_bit(void) {
// These bits may be set by a thread with a write lock on some
// leaf, and the index can be read by another thread with a (read
// or write) lock on another leaf. Also, the has_marks_below
// bit can be set by two threads simultaneously. Neither of these
// is a real race, so if we are using DRD we should tell it to
// ignore these bits just while we set this bit. If there were a
// race in setting the index, that would be a real race.
TOKU_DRD_IGNORE_VAR(m_bitfield);
m_bitfield |= MASK_BIT; m_bitfield |= MASK_BIT;
TOKU_DRD_STOP_IGNORING_VAR(m_bitfield);
} }
inline void disable_bit(void) { inline void disable_bit(void) {
......
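The DRD annotations above silence the race detector for accesses that are racy but benign, while copying the bitfield into a local guarantees each method works from one consistent read. A standalone sketch of the idiom; it assumes TOKU_DRD_IGNORE_VAR simply wraps valgrind's DRD_IGNORE_VAR (the real definitions live in the toku headers):

#include <valgrind/drd.h>
#include <stdint.h>

// Assumed wrappers; see the toku headers for the real definitions.
#define TOKU_DRD_IGNORE_VAR(v)        DRD_IGNORE_VAR(v)
#define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)

static uint32_t bitfield;

static uint32_t read_masked(uint32_t mask) {
    TOKU_DRD_IGNORE_VAR(bitfield);
    const uint32_t bits = bitfield;   // one read; operate on the snapshot
    TOKU_DRD_STOP_IGNORING_VAR(bitfield);
    return bits & mask;
}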
...@@ -106,8 +106,12 @@ static void rollback_log_create (TOKUTXN txn, BLOCKNUM previous, uint32_t previo ...@@ -106,8 +106,12 @@ static void rollback_log_create (TOKUTXN txn, BLOCKNUM previous, uint32_t previo
void toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log) { void toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log) {
int r; int r;
CACHEFILE cf = txn->logger->rollback_cachefile; CACHEFILE cf = txn->logger->rollback_cachefile;
r = toku_cachetable_unpin(cf, log->ct_pair, r = toku_cachetable_unpin(
(enum cachetable_dirty)log->dirty, rollback_memory_size(log)); cf,
log->ct_pair,
(enum cachetable_dirty)log->dirty,
rollback_memory_size(log)
);
assert(r == 0); assert(r == 0);
} }
...@@ -202,14 +206,15 @@ void toku_get_and_pin_rollback_log(TOKUTXN txn, BLOCKNUM blocknum, uint32_t hash ...@@ -202,14 +206,15 @@ void toku_get_and_pin_rollback_log(TOKUTXN txn, BLOCKNUM blocknum, uint32_t hash
void * value; void * value;
CACHEFILE cf = txn->logger->rollback_cachefile; CACHEFILE cf = txn->logger->rollback_cachefile;
FT CAST_FROM_VOIDP(h, toku_cachefile_get_userdata(cf)); FT CAST_FROM_VOIDP(h, toku_cachefile_get_userdata(cf));
int r = toku_cachetable_get_and_pin(cf, blocknum, hash, int r = toku_cachetable_get_and_pin_with_dep_pairs(cf, blocknum, hash,
&value, NULL, &value, NULL,
get_write_callbacks_for_rollback_log(h), get_write_callbacks_for_rollback_log(h),
toku_rollback_fetch_callback, toku_rollback_fetch_callback,
toku_rollback_pf_req_callback, toku_rollback_pf_req_callback,
toku_rollback_pf_callback, toku_rollback_pf_callback,
true, // may_modify_value PL_WRITE_EXPENSIVE, // lock_type
h h,
0, NULL, NULL, NULL, NULL
); );
assert(r == 0); assert(r == 0);
ROLLBACK_LOG_NODE CAST_FROM_VOIDP(pinned_log, value); ROLLBACK_LOG_NODE CAST_FROM_VOIDP(pinned_log, value);
......
...@@ -152,6 +152,14 @@ static inline int rwlock_writers(RWLOCK rwlock) { ...@@ -152,6 +152,14 @@ static inline int rwlock_writers(RWLOCK rwlock) {
return rwlock->writer; return rwlock->writer;
} }
static inline bool rwlock_write_will_block(RWLOCK rwlock) {
return (rwlock->writer > 0 || rwlock->reader > 0);
}
static inline bool rwlock_read_will_block(RWLOCK rwlock) {
return (rwlock->writer > 0 || rwlock->want_write > 0);
}
static inline void rwlock_wait_for_users( static inline void rwlock_wait_for_users(
RWLOCK rwlock, RWLOCK rwlock,
toku_mutex_t *mutex toku_mutex_t *mutex
......
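The new *_will_block predicates let a caller that already holds the guarding mutex choose between blocking and bailing out, which is how the nonblocking pin paths return TOKUDB_TRY_AGAIN. A hypothetical caller pattern (the helper below is illustrative, not from the patch; rwlock_read_lock is the existing acquire in the same header):

static int read_pin_nonblocking(RWLOCK rwlock, toku_mutex_t *mutex) {
    toku_mutex_lock(mutex);
    if (rwlock_read_will_block(rwlock)) {
        toku_mutex_unlock(mutex);
        return TOKUDB_TRY_AGAIN;      // caller releases its locks and retries
    }
    rwlock_read_lock(rwlock, mutex);  // cannot block given the check above
    toku_mutex_unlock(mutex);
    return 0;
}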
...@@ -72,7 +72,7 @@ run_test (void) { ...@@ -72,7 +72,7 @@ run_test (void) {
def_fetch, def_fetch,
def_pf_req_callback, def_pf_req_callback,
def_pf_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
&foo &foo
); );
......
...@@ -17,7 +17,7 @@ static void *pin_nonblocking(void *arg) { ...@@ -17,7 +17,7 @@ static void *pin_nonblocking(void *arg) {
&v1, &v1,
&s1, &s1,
def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
NULL NULL
); );
......
...@@ -17,7 +17,7 @@ static void *pin_nonblocking(void *arg) { ...@@ -17,7 +17,7 @@ static void *pin_nonblocking(void *arg) {
&v1, &v1,
&s1, &s1,
def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback, def_write_callback(NULL), def_fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
NULL NULL
); );
......
...@@ -56,12 +56,12 @@ cachetable_test (enum cachetable_dirty dirty, bool cloneable) { ...@@ -56,12 +56,12 @@ cachetable_test (enum cachetable_dirty dirty, bool cloneable) {
// test that a pin taken with PL_READ does not stall behind checkpoint // test that a pin taken with PL_READ does not stall behind checkpoint
CHECKPOINTER cp = toku_cachetable_get_checkpointer(ct); CHECKPOINTER cp = toku_cachetable_get_checkpointer(ct);
r = toku_cachetable_begin_checkpoint(cp, NULL); assert_zero(r); r = toku_cachetable_begin_checkpoint(cp, NULL); assert_zero(r);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, false, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_READ, NULL, NULL);
assert(r == 0); assert(r == 0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r == 0); assert(r == 0);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
if (dirty == CACHETABLE_DIRTY && !cloneable) { if (dirty == CACHETABLE_DIRTY && !cloneable) {
assert(r == TOKUDB_TRY_AGAIN); assert(r == TOKUDB_TRY_AGAIN);
} }
......
...@@ -102,7 +102,7 @@ static void cachetable_predef_fetch_maybegetandpin_test (void) { ...@@ -102,7 +102,7 @@ static void cachetable_predef_fetch_maybegetandpin_test (void) {
// now verify that the block we are trying to evict is gone // now verify that the block we are trying to evict is gone
wc = def_write_callback(NULL); wc = def_write_callback(NULL);
wc.flush_callback = flush; wc.flush_callback = flush;
r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r == TOKUDB_TRY_AGAIN); assert(r == TOKUDB_TRY_AGAIN);
r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL); r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL);
assert(r == 0 && v == 0 && size == 8); assert(r == 0 && v == 0 && size == 8);
......
...@@ -117,7 +117,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) { ...@@ -117,7 +117,7 @@ static void cachetable_prefetch_maybegetandpin_test (void) {
def_fetch, def_fetch,
def_pf_req_callback, def_pf_req_callback,
def_pf_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
NULL NULL
); );
......
...@@ -49,7 +49,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) { ...@@ -49,7 +49,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) {
if (test_type == pin_in_memory) { if (test_type == pin_in_memory) {
old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev); old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev);
if (nonblocking) { if (nonblocking) {
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert_zero(r); assert_zero(r);
} }
else { else {
...@@ -64,7 +64,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) { ...@@ -64,7 +64,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) {
else if (test_type == pin_fetch) { else if (test_type == pin_fetch) {
old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev); old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev);
if (nonblocking) { if (nonblocking) {
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(2), 2, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(2), 2, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r == TOKUDB_TRY_AGAIN); assert(r == TOKUDB_TRY_AGAIN);
new_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev); new_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev);
assert(new_num_ev_runs > old_num_ev_runs); assert(new_num_ev_runs > old_num_ev_runs);
...@@ -81,7 +81,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) { ...@@ -81,7 +81,7 @@ cachetable_test (enum pin_evictor_test_type test_type, bool nonblocking) {
else if (test_type == pin_partial_fetch) { else if (test_type == pin_partial_fetch) {
old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev); old_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev);
if (nonblocking) { if (nonblocking) {
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, pf_req_callback, pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r == TOKUDB_TRY_AGAIN); assert(r == TOKUDB_TRY_AGAIN);
new_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev); new_num_ev_runs = evictor_test_helpers::get_num_eviction_runs(&ct->ev);
assert(new_num_ev_runs > old_num_ev_runs); assert(new_num_ev_runs > old_num_ev_runs);
......
...@@ -149,7 +149,7 @@ static void *move_numbers(void *arg) { ...@@ -149,7 +149,7 @@ static void *move_numbers(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
0, //num_dependent_pairs 0, //num_dependent_pairs
NULL, NULL,
...@@ -171,7 +171,7 @@ static void *move_numbers(void *arg) { ...@@ -171,7 +171,7 @@ static void *move_numbers(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
1, //num_dependent_pairs 1, //num_dependent_pairs
&f1, &f1,
...@@ -205,7 +205,7 @@ static void *move_numbers(void *arg) { ...@@ -205,7 +205,7 @@ static void *move_numbers(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
1, //num_dependent_pairs 1, //num_dependent_pairs
&f1, &f1,
...@@ -243,7 +243,7 @@ static void *read_random_numbers(void *arg) { ...@@ -243,7 +243,7 @@ static void *read_random_numbers(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
false, PL_READ,
NULL, NULL,
NULL NULL
); );
......
...@@ -50,7 +50,7 @@ run_test (void) { ...@@ -50,7 +50,7 @@ run_test (void) {
def_fetch, def_fetch,
def_pf_req_callback, def_pf_req_callback,
def_pf_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
NULL NULL
); );
......
...@@ -121,7 +121,7 @@ static void cachetable_prefetch_maybegetandpin_test (bool do_partial_fetch) { ...@@ -121,7 +121,7 @@ static void cachetable_prefetch_maybegetandpin_test (bool do_partial_fetch) {
void *v = 0; void *v = 0;
long size = 0; long size = 0;
do_pf = false; do_pf = false;
r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN); assert(r==TOKUDB_TRY_AGAIN);
r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, true, NULL); r = toku_cachetable_get_and_pin(f1, key, fullhash, &v, &size, wc, fetch, pf_req_callback, pf_callback, true, NULL);
assert(r == 0 && v == 0 && size == 2); assert(r == 0 && v == 0 && size == 2);
......
...@@ -143,7 +143,7 @@ static void move_number_to_child( ...@@ -143,7 +143,7 @@ static void move_number_to_child(
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
1, //num_dependent_pairs 1, //num_dependent_pairs
&f1, &f1,
...@@ -190,7 +190,7 @@ static void *move_numbers(void *arg) { ...@@ -190,7 +190,7 @@ static void *move_numbers(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
0, //num_dependent_pairs 0, //num_dependent_pairs
NULL, NULL,
...@@ -256,7 +256,7 @@ static void merge_and_split_child( ...@@ -256,7 +256,7 @@ static void merge_and_split_child(
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
1, //num_dependent_pairs 1, //num_dependent_pairs
&f1, &f1,
...@@ -290,7 +290,7 @@ static void merge_and_split_child( ...@@ -290,7 +290,7 @@ static void merge_and_split_child(
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
2, //num_dependent_pairs 2, //num_dependent_pairs
cfs, cfs,
...@@ -368,7 +368,7 @@ static void *merge_and_split(void *arg) { ...@@ -368,7 +368,7 @@ static void *merge_and_split(void *arg) {
&v1, &v1,
&s1, &s1,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
NULL, NULL,
0, //num_dependent_pairs 0, //num_dependent_pairs
NULL, NULL,
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id: cachetable-simple-pin-nonblocking.cc 46977 2012-08-19 01:56:34Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#include "includes.h"
#include "test.h"
bool pf_called;
static bool true_pf_req_callback(void* UU(ftnode_pv), void* UU(read_extraargs)) {
return true;
}
static int true_pf_callback(void* UU(ftnode_pv), void* UU(dd), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) {
*sizep = make_pair_attr(9);
pf_called = true;
return 0;
}
static void kibbutz_work(void *fe_v)
{
CACHEFILE CAST_FROM_VOIDP(f1, fe_v);
sleep(2);
int r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
remove_background_job_from_cf(f1);
}
static void
unlock_dummy (void* UU(v)) {
}
static void reset_unlockers(UNLOCKERS unlockers) {
unlockers->locked = true;
}
static void
run_test (pair_lock_type lock_type) {
const int test_limit = 12;
struct unlockers unlockers = {true, unlock_dummy, NULL, NULL};
int r;
CACHETABLE ct;
r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
char fname1[] = __SRCFILE__ "test1.dat";
unlink(fname1);
CACHEFILE f1;
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
r = toku_cachetable_get_and_pin_with_dep_pairs(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, lock_type, NULL, 0, NULL, NULL, NULL, NULL);
cachefile_kibbutz_enq(f1, kibbutz_work, f1);
reset_unlockers(&unlockers);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, &unlockers);
if (lock_type == PL_WRITE_EXPENSIVE) {
assert(r == TOKUDB_TRY_AGAIN); assert(!unlockers.locked);
}
else {
assert(r == 0); assert(unlockers.locked);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0);
}
// now do the same test with a partial fetch required
pf_called = false;
r = toku_cachetable_get_and_pin_with_dep_pairs(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, true_pf_req_callback, true_pf_callback, lock_type, NULL, 0, NULL, NULL, NULL, NULL);
assert(pf_called);
cachefile_kibbutz_enq(f1, kibbutz_work, f1);
reset_unlockers(&unlockers);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, &unlockers);
if (lock_type == PL_WRITE_EXPENSIVE) {
assert(r == TOKUDB_TRY_AGAIN); assert(!unlockers.locked);
}
else {
assert(r == 0); assert(unlockers.locked);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0);
}
toku_cachetable_verify(ct);
r = toku_cachefile_close(&f1, 0, false, ZERO_LSN);
assert(r == 0);
r = toku_cachetable_close(&ct); lazy_assert_zero(r);
}
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
run_test(PL_READ);
run_test(PL_WRITE_CHEAP);
run_test(PL_WRITE_EXPENSIVE);
return 0;
}
...@@ -124,7 +124,7 @@ cachetable_test (bool write_first, bool write_second, bool start_checkpoint) { ...@@ -124,7 +124,7 @@ cachetable_test (bool write_first, bool write_second, bool start_checkpoint) {
&v3, &v3,
&s3, &s3,
wc, fetch, def_pf_req_callback, def_pf_callback, wc, fetch, def_pf_req_callback, def_pf_callback,
true, PL_WRITE_EXPENSIVE,
&val3, &val3,
2, //num_dependent_pairs 2, //num_dependent_pairs
dependent_cfs, dependent_cfs,
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id: cachetable-simple-pin-nonblocking.cc 46977 2012-08-19 01:56:34Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#include "includes.h"
#include "test.h"
static void
flush (CACHEFILE f __attribute__((__unused__)),
int UU(fd),
CACHEKEY k __attribute__((__unused__)),
void *v __attribute__((__unused__)),
void** UU(dd),
void *e __attribute__((__unused__)),
PAIR_ATTR s __attribute__((__unused__)),
PAIR_ATTR* new_size __attribute__((__unused__)),
bool w __attribute__((__unused__)),
bool keep __attribute__((__unused__)),
bool c __attribute__((__unused__)),
bool UU(is_clone)
) {
if (w) {
assert(c);
assert(keep);
}
}
static void kibbutz_work(void *fe_v)
{
CACHEFILE CAST_FROM_VOIDP(f1, fe_v);
sleep(2);
int r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
remove_background_job_from_cf(f1);
}
static void
unlock_dummy (void* UU(v)) {
}
static void reset_unlockers(UNLOCKERS unlockers) {
unlockers->locked = true;
}
static void
run_case_that_should_succeed(CACHEFILE f1, pair_lock_type first_lock, pair_lock_type second_lock) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
wc.flush_callback = flush;
struct unlockers unlockers = {true, unlock_dummy, NULL, NULL};
int r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, first_lock, NULL, NULL);
assert(r==0);
cachefile_kibbutz_enq(f1, kibbutz_work, f1);
reset_unlockers(&unlockers);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, second_lock, NULL, &unlockers);
assert(r==0); assert(unlockers.locked);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0);
}
static void
run_case_that_should_fail(CACHEFILE f1, pair_lock_type first_lock, pair_lock_type second_lock) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
wc.flush_callback = flush;
struct unlockers unlockers = {true, unlock_dummy, NULL, NULL};
int r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, first_lock, NULL, NULL);
assert(r==0);
cachefile_kibbutz_enq(f1, kibbutz_work, f1);
reset_unlockers(&unlockers);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, second_lock, NULL, &unlockers);
assert(r == TOKUDB_TRY_AGAIN); assert(!unlockers.locked);
}
static void
run_test (void) {
const int test_limit = 12;
int r;
CACHETABLE ct;
r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
char fname1[] = __SRCFILE__ "test1.dat";
unlink(fname1);
CACHEFILE f1;
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
wc.flush_callback = flush;
//
// test that if we are getting a PAIR for the first time, TOKUDB_TRY_AGAIN is returned
// because the PAIR was not in the cachetable.
//
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN);
run_case_that_should_succeed(f1, PL_READ, PL_WRITE_CHEAP);
run_case_that_should_succeed(f1, PL_READ, PL_WRITE_EXPENSIVE);
run_case_that_should_succeed(f1, PL_WRITE_CHEAP, PL_READ);
run_case_that_should_succeed(f1, PL_WRITE_CHEAP, PL_WRITE_CHEAP);
run_case_that_should_succeed(f1, PL_WRITE_CHEAP, PL_WRITE_EXPENSIVE);
run_case_that_should_fail(f1, PL_WRITE_EXPENSIVE, PL_READ);
run_case_that_should_fail(f1, PL_WRITE_EXPENSIVE, PL_WRITE_CHEAP);
run_case_that_should_fail(f1, PL_WRITE_EXPENSIVE, PL_WRITE_EXPENSIVE);
toku_cachetable_verify(ct);
r = toku_cachefile_close(&f1, 0, false, ZERO_LSN);
assert(r == 0);
r = toku_cachetable_close(&ct); lazy_assert_zero(r);
}
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
run_test();
return 0;
}
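// Editor's sketch (hypothetical, not part of this commit): once the PAIR is
// resident in the cachetable, the succeed/fail cases above reduce to one rule --
// the second nonblocking pin bails with TOKUDB_TRY_AGAIN only when the current
// holder pinned with PL_WRITE_EXPENSIVE, because only then would the wait be
// expensive. A standalone model of that rule, with invented names:
#include <cassert>
enum model_lock_type { MODEL_READ, MODEL_WRITE_CHEAP, MODEL_WRITE_EXPENSIVE };
// returns true iff a second nonblocking pin would return TOKUDB_TRY_AGAIN
static bool model_second_pin_blocks(model_lock_type held) {
return held == MODEL_WRITE_EXPENSIVE;
}
static void model_check_matrix(void) {
assert(!model_second_pin_blocks(MODEL_READ)); // the run_case_that_should_succeed cases
assert(!model_second_pin_blocks(MODEL_WRITE_CHEAP)); // the run_case_that_should_succeed cases
assert(model_second_pin_blocks(MODEL_WRITE_EXPENSIVE)); // the run_case_that_should_fail cases
}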
...@@ -76,15 +76,15 @@ run_test (void) { ...@@ -76,15 +76,15 @@ run_test (void) {
// test that if we are getting a PAIR for the first time that TOKUDB_TRY_AGAIN is returned // test that if we are getting a PAIR for the first time that TOKUDB_TRY_AGAIN is returned
// because the PAIR was not in the cachetable. // because the PAIR was not in the cachetable.
// //
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN); assert(r==TOKUDB_TRY_AGAIN);
// now it should succeed // now it should succeed
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==0); assert(r==0);
foo = false; foo = false;
cachefile_kibbutz_enq(f1, kibbutz_work, f1); cachefile_kibbutz_enq(f1, kibbutz_work, f1);
// because node is in use, should return TOKUDB_TRY_AGAIN // because node is in use, should return TOKUDB_TRY_AGAIN
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN); assert(r==TOKUDB_TRY_AGAIN);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL); r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL);
assert(foo); assert(foo);
...@@ -92,24 +92,24 @@ run_test (void) { ...@@ -92,24 +92,24 @@ run_test (void) {
// now make sure we get TOKUDB_TRY_AGAIN when a partial fetch is involved // now make sure we get TOKUDB_TRY_AGAIN when a partial fetch is involved
// first make sure value is there // first make sure value is there
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==0); assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0); r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8)); assert(r==0);
// now make sure that we get TOKUDB_TRY_AGAIN for the partial fetch // now make sure that we get TOKUDB_TRY_AGAIN for the partial fetch
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, true_def_pf_req_callback, true_def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, true_def_pf_req_callback, true_def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN); assert(r==TOKUDB_TRY_AGAIN);
// //
// now test that if there is a checkpoint pending, // now test that if there is a checkpoint pending,
// first pin and unpin with dirty // first pin and unpin with dirty
// //
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==0); assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); assert(r==0); r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_DIRTY, make_pair_attr(8)); assert(r==0);
// this should mark the PAIR as pending // this should mark the PAIR as pending
CHECKPOINTER cp = toku_cachetable_get_checkpointer(ct); CHECKPOINTER cp = toku_cachetable_get_checkpointer(ct);
r = toku_cachetable_begin_checkpoint(cp, NULL); assert(r == 0); r = toku_cachetable_begin_checkpoint(cp, NULL); assert(r == 0);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, true, NULL, NULL); r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, def_fetch, def_pf_req_callback, def_pf_callback, PL_WRITE_EXPENSIVE, NULL, NULL);
assert(r==TOKUDB_TRY_AGAIN); assert(r==TOKUDB_TRY_AGAIN);
r = toku_cachetable_end_checkpoint( r = toku_cachetable_end_checkpoint(
cp, cp,
......
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id: cachetable-simple-pin.cc 46797 2012-08-15 01:56:49Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#include "includes.h"
#include "test.h"
bool pf_called;
bool fetch_called;
CACHEFILE f1;
static int
sleep_fetch (CACHEFILE f __attribute__((__unused__)),
PAIR UU(p),
int UU(fd),
CACHEKEY k __attribute__((__unused__)),
uint32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
void **dd __attribute__((__unused__)),
PAIR_ATTR *sizep __attribute__((__unused__)),
int *dirtyp,
void *extraargs __attribute__((__unused__))
) {
sleep(2);
*dirtyp = 0;
*value = NULL;
*sizep = make_pair_attr(8);
fetch_called = true;
return 0;
}
static bool sleep_pf_req_callback(void* UU(ftnode_pv), void* UU(read_extraargs)) {
return true;
}
static int sleep_pf_callback(void* UU(ftnode_pv), void* UU(disk_data), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) {
sleep(2);
*sizep = make_pair_attr(8);
pf_called = true;
return 0;
}
static void *run_expensive_pf(void *arg) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
int r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, sleep_pf_req_callback, sleep_pf_callback, PL_READ, NULL, NULL);
assert(r == TOKUDB_TRY_AGAIN);
assert(pf_called);
return arg;
}
static void *run_expensive_fetch(void *arg) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
int r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, sleep_pf_req_callback, sleep_pf_callback, PL_READ, NULL, NULL);
assert(fetch_called);
assert(r == TOKUDB_TRY_AGAIN);
return arg;
}
static void
run_test (void) {
const int test_limit = 12;
int r;
void *ret;
CACHETABLE ct;
r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
char fname1[] = __SRCFILE__ "test1.dat";
unlink(fname1);
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
toku_pthread_t fetch_tid;
fetch_called = false;
r = toku_pthread_create(&fetch_tid, NULL, run_expensive_fetch, NULL);
sleep(1);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
assert(fetch_called);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_pthread_join(fetch_tid, &ret);
assert_zero(r);
// pin twice nonblocking with PL_READ, make sure we can get it both times
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, PL_READ, NULL, NULL);
assert_zero(r);
r = toku_cachetable_get_and_pin_nonblocking(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, PL_READ, NULL, NULL);
assert_zero(r);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
toku_pthread_t pf_tid;
pf_called = false;
r = toku_pthread_create(&pf_tid, NULL, run_expensive_pf, NULL);
sleep(1);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
assert(pf_called);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_pthread_join(pf_tid, &ret);
assert_zero(r);
toku_cachetable_verify(ct);
r = toku_cachefile_close(&f1, 0, false, ZERO_LSN); assert(r == 0);
r = toku_cachetable_close(&ct); lazy_assert_zero(r);
}
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
run_test();
return 0;
}
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id: cachetable-simple-pin.cc 46797 2012-08-15 01:56:49Z zardosht $"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#include "includes.h"
#include "test.h"
bool pf_called;
bool fetch_called;
CACHEFILE f1;
static int
sleep_fetch (CACHEFILE f __attribute__((__unused__)),
PAIR UU(p),
int UU(fd),
CACHEKEY k __attribute__((__unused__)),
uint32_t fullhash __attribute__((__unused__)),
void **value __attribute__((__unused__)),
void **dd __attribute__((__unused__)),
PAIR_ATTR *sizep __attribute__((__unused__)),
int *dirtyp,
void *extraargs __attribute__((__unused__))
) {
sleep(2);
*dirtyp = 0;
*value = NULL;
*sizep = make_pair_attr(8);
fetch_called = true;
return 0;
}
static bool sleep_pf_req_callback(void* UU(ftnode_pv), void* UU(read_extraargs)) {
return true;
}
static int sleep_pf_callback(void* UU(ftnode_pv), void* UU(disk_data), void* UU(read_extraargs), int UU(fd), PAIR_ATTR* sizep) {
sleep(2);
*sizep = make_pair_attr(8);
pf_called = true;
return 0;
}
static void *run_expensive_pf(void *arg) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
int r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, sleep_pf_req_callback, sleep_pf_callback, false, NULL);
assert_zero(r);
assert(pf_called);
return arg;
}
static void *run_expensive_fetch(void *arg) {
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
int r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, sleep_pf_req_callback, sleep_pf_callback, false, NULL);
assert_zero(r);
assert(fetch_called);
return arg;
}
static void
run_test (void) {
const int test_limit = 12;
int r;
void *ret;
CACHETABLE ct;
r = toku_create_cachetable(&ct, test_limit, ZERO_LSN, NULL_LOGGER); assert(r == 0);
char fname1[] = __SRCFILE__ "test1.dat";
unlink(fname1);
r = toku_cachetable_openf(&f1, ct, fname1, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO); assert(r == 0);
void* v1;
long s1;
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(NULL);
toku_pthread_t fetch_tid;
fetch_called = false;
r = toku_pthread_create(&fetch_tid, NULL, run_expensive_fetch, NULL);
sleep(1);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
assert(fetch_called);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_pthread_join(fetch_tid, &ret);
assert_zero(r);
// call with may_modify_value = false twice, make sure we can get it
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
toku_pthread_t pf_tid;
pf_called = false;
r = toku_pthread_create(&pf_tid, NULL, run_expensive_pf, NULL);
sleep(1);
r = toku_cachetable_get_and_pin(f1, make_blocknum(1), 1, &v1, &s1, wc, sleep_fetch, def_pf_req_callback, def_pf_callback, false, NULL);
assert_zero(r);
assert(pf_called);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_test_cachetable_unpin(f1, make_blocknum(1), 1, CACHETABLE_CLEAN, make_pair_attr(8));
assert(r==0);
r = toku_pthread_join(pf_tid, &ret);
assert_zero(r);
toku_cachetable_verify(ct);
r = toku_cachefile_close(&f1, 0, false, ZERO_LSN); assert(r == 0);
r = toku_cachetable_close(&ct); lazy_assert_zero(r);
}
int
test_main(int argc, const char *argv[]) {
default_parse_args(argc, argv);
run_test();
return 0;
}
...@@ -64,233 +64,14 @@ struct item { ...@@ -64,233 +64,14 @@ struct item {
}; };
static volatile int expect_n_flushes=0; static volatile int expect_n_flushes=0;
static CACHEKEY flushes[100];
static void expect_init(void) {
test_mutex_lock();
expect_n_flushes = 0;
test_mutex_unlock();
}
static void expect1(int64_t blocknum_n) {
test_mutex_lock();
expect_n_flushes=1;
flushes[0].b=blocknum_n;
//if (verbose) printf("%s:%d %lld\n", __FUNCTION__, 0, key.b);
test_mutex_unlock();
}
static void expectN(int64_t blocknum_n) {
test_mutex_lock();
//if (verbose) printf("%s:%d %lld\n", __FUNCTION__, expect_n_flushes, key);
flushes[expect_n_flushes++].b=blocknum_n;
test_mutex_unlock();
}
static CACHEFILE expect_f; static CACHEFILE expect_f;
static void flush (CACHEFILE f,
int UU(fd),
CACHEKEY key,
void*value,
void** UU(dd),
void *extra __attribute__((__unused__)),
PAIR_ATTR size __attribute__((__unused__)),
PAIR_ATTR* new_size __attribute__((__unused__)),
bool write_me __attribute__((__unused__)),
bool keep_me __attribute__((__unused__)),
bool for_checkpoint __attribute__((__unused__)),
bool UU(is_clone)
) {
struct item *CAST_FROM_VOIDP(it, value);
int i;
if (keep_me) return;
if (verbose) printf("Flushing %" PRId64 " (it=>key=%" PRId64 ")\n", key.b, it->key.b);
test_mutex_lock();
if (write_me) assert(expect_f==f);
assert(strcmp(it->something,"something")==0);
assert(it->key.b==key.b);
/* Verify that we expected the flush. */
for (i=0; i<expect_n_flushes; i++) {
if (key.b==flushes[i].b) {
flushes[i] = flushes[expect_n_flushes-1];
expect_n_flushes--;
goto found_flush;
}
}
fprintf(stderr, "%" PRId64 " was flushed, but I didn't expect it\n", key.b);
abort();
found_flush:
test_mutex_unlock();
toku_free(value);
}
static struct item *make_item (uint64_t key) {
struct item *MALLOC(it);
it->key.b=key;
it->something="something";
return it;
}
static CACHEKEY did_fetch={-1};
static int fetch (CACHEFILE f, PAIR UU(p), int UU(fd), CACHEKEY key, uint32_t fullhash __attribute__((__unused__)), void**value, void** UU(dd), PAIR_ATTR *sizep __attribute__((__unused__)), int *dirtyp, void*extraargs) {
if (verbose) printf("Fetch %" PRId64 "\n", key.b);
assert (expect_f==f);
assert((long)extraargs==23);
*value = make_item(key.b);
*sizep = make_pair_attr(test_object_size);
*dirtyp = 0;
did_fetch=key;
return 0;
}
static void maybe_flush(CACHETABLE t) { static void maybe_flush(CACHETABLE t) {
toku_cachetable_maybe_flush_some(t); toku_cachetable_maybe_flush_some(t);
} }
// verify that a sequence of cachetable operations causes a particular sequence of
// callbacks
static void test0 (void) {
void* t3=(void*)23;
CACHETABLE t;
CACHEFILE f;
int r;
char fname[] = __SRCFILE__ "test.dat";
r=toku_create_cachetable(&t, 5, ZERO_LSN, NULL_LOGGER);
assert(r==0);
unlink(fname);
r = toku_cachetable_openf(&f, t, fname, O_RDWR|O_CREAT, S_IRWXU|S_IRWXG|S_IRWXO);
assert(r==0);
TOKULOGGER logger = toku_cachefile_logger(f);
assert(logger == NULL_LOGGER);
expect_f = f;
expect_n_flushes=0;
uint32_t h1 = toku_cachetable_hash(f, make_blocknum(1));
uint32_t h2 = toku_cachetable_hash(f, make_blocknum(2));
uint32_t h3 = toku_cachetable_hash(f, make_blocknum(3));
uint32_t h4 = toku_cachetable_hash(f, make_blocknum(4));
uint32_t h5 = toku_cachetable_hash(f, make_blocknum(5));
uint32_t h6 = toku_cachetable_hash(f, make_blocknum(6));
uint32_t h7 = toku_cachetable_hash(f, make_blocknum(7));
CACHETABLE_WRITE_CALLBACK wc = def_write_callback(t3);
wc.flush_callback = flush;
r=toku_cachetable_put(f, make_blocknum(1), h1, make_item(1), make_pair_attr(test_object_size), wc, put_callback_nop); /* 1P */ /* this is the lru list. 1 is pinned. */
assert(r==0);
assert(expect_n_flushes==0);
expect_init();
r=toku_cachetable_put(f, make_blocknum(2), h2, make_item(2), make_pair_attr(test_object_size), wc, put_callback_nop);
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(2), h2, CACHETABLE_DIRTY, make_pair_attr(1)); /* 2U 1P */
assert(expect_n_flushes==0);
expect_init();
r=toku_cachetable_put(f, make_blocknum(3), h3, make_item(3), make_pair_attr(test_object_size), wc, put_callback_nop);
assert(r==0);
assert(expect_n_flushes==0); /* 3P 2U 1P */ /* 3 is most recently used (pinned), 2 is next (unpinned), 1 is least recent (pinned) */
expect_init();
r=toku_cachetable_put(f, make_blocknum(4), h4, make_item(4), make_pair_attr(test_object_size), wc, put_callback_nop);
assert(r==0);
assert(expect_n_flushes==0); /* 4P 3P 2U 1P */
expect_init();
r=toku_cachetable_put(f, make_blocknum(5), h5, make_item(5), make_pair_attr(test_object_size), wc, put_callback_nop);
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(5), h5, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(3), h3, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
assert(expect_n_flushes==0); /* 5U 4P 3U 2U 1P */
expect1(2); /* 2 is the oldest unpinned item. */
r=toku_cachetable_put(f, make_blocknum(6), h6, make_item(6), make_pair_attr(test_object_size), wc, put_callback_nop); /* 6P 5U 4P 3U 1P */
assert(r==0);
test_mutex_lock();
while (expect_n_flushes != 0) {
test_mutex_unlock(); toku_pthread_yield(); maybe_flush(t); test_mutex_lock();
}
assert(expect_n_flushes==0);
test_mutex_unlock();
expect1(3);
r=toku_cachetable_put(f, make_blocknum(7), h7, make_item(7), make_pair_attr(test_object_size), wc, put_callback_nop);
assert(r==0);
test_mutex_lock();
while (expect_n_flushes != 0) {
test_mutex_unlock(); toku_pthread_yield(); maybe_flush(t); test_mutex_lock();
}
assert(expect_n_flushes==0);
test_mutex_unlock();
r=toku_test_cachetable_unpin(f, make_blocknum(7), h7, CACHETABLE_DIRTY, make_pair_attr(test_object_size)); /* 7U 6P 5U 4P 1P */
assert(r==0);
{
void *item_v=0;
expect_init();
r=toku_cachetable_get_and_pin(f, make_blocknum(5), toku_cachetable_hash(f, make_blocknum(5)), &item_v, NULL, wc, fetch, def_pf_req_callback, def_pf_callback, true, t3); /* 5P 7U 6P 4P 1P */
assert(r==0);
assert(((struct item *)item_v)->key.b==5);
assert(strcmp(((struct item *)item_v)->something,"something")==0);
test_mutex_lock();
assert(expect_n_flushes==0);
test_mutex_unlock();
}
{
void *item_v=0;
r=toku_test_cachetable_unpin(f, make_blocknum(4), h4, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
expect1(4);
did_fetch=make_blocknum(-1);
CACHETABLE_WRITE_CALLBACK wc2 = def_write_callback(t3);
wc2.flush_callback = flush;
r=toku_cachetable_get_and_pin(f, make_blocknum(2), toku_cachetable_hash(f, make_blocknum(2)), &item_v, NULL, wc2, fetch, def_pf_req_callback, def_pf_callback, true, t3); /* 2p 5P 7U 6P 1P */
assert(r==0);
assert(did_fetch.b==2); /* Expect that 2 is fetched in. */
assert(((struct item *)item_v)->key.b==2);
assert(strcmp(((struct item *)item_v)->something,"something")==0);
test_mutex_lock();
while (expect_n_flushes != 0) {
test_mutex_unlock(); toku_pthread_yield(); maybe_flush(t); test_mutex_lock();
}
assert(expect_n_flushes==0);
test_mutex_unlock();
}
r=toku_test_cachetable_unpin(f, make_blocknum(2), h2, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(5), h5, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(6), h6, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
r=toku_test_cachetable_unpin(f, make_blocknum(1), h1, CACHETABLE_DIRTY, make_pair_attr(test_object_size));
assert(r==0);
r=toku_cachetable_assert_all_unpinned(t);
assert(r==0);
if (verbose) printf("Closing\n");
expect1(2);
expectN(5);
expectN(7);
expectN(6);
expectN(1);
r=toku_cachefile_close(&f, 0, false, ZERO_LSN);
assert(r==0);
r=toku_cachetable_close(&t);
assert(r==0);
assert(expect_n_flushes==0);
expect_f = 0;
}
static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)), static void flush_n (CACHEFILE f __attribute__((__unused__)), int UU(fd), CACHEKEY key __attribute__((__unused__)),
void *value, void *value,
...@@ -767,7 +548,6 @@ test_main (int argc, const char *argv[]) { ...@@ -767,7 +548,6 @@ test_main (int argc, const char *argv[]) {
if (do_malloc_fail) if (do_malloc_fail)
test_cachetable_create_no_memory(); // fails with valgrind test_cachetable_create_no_memory(); // fails with valgrind
for (i=0; i<1; i++) { for (i=0; i<1; i++) {
test0();
test_nested_pin(); test_nested_pin();
#if !TOKU_WINDOWS #if !TOKU_WINDOWS
test_multi_filehandles (); test_multi_filehandles ();
......
/* Fair readers/writer lock implemented using condition variables.
* This is maintained so that we can measure the performance of a relatively simple implementation (this one)
* compared to a fast one that uses compare-and-swap (the one in ../toku_rwlock.c).
* For now it's only for testing. (A usage sketch follows the implementation below.)
*/
#ident "$Id$"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
// Fair readers/writer locks. These are fair, meaning first-come first-served: no reader starvation, and no writer starvation. They are
// probably faster than the Linux readers/writer locks (pthread_rwlock_t).
struct toku_cv_fair_rwlock_waiter_state; // this structure is used internally.
typedef struct toku_cv_fair_rwlock_s {
toku_mutex_t mutex;
int state; // 0 means no locks, + is number of readers locked, -1 is a writer
struct toku_cv_fair_rwlock_waiter_state *waiters_head, *waiters_tail;
} toku_cv_fair_rwlock_t;
void toku_cv_fair_rwlock_init (toku_cv_fair_rwlock_t *rwlock);
void toku_cv_fair_rwlock_destroy (toku_cv_fair_rwlock_t *rwlock);
int toku_cv_fair_rwlock_rdlock (toku_cv_fair_rwlock_t *rwlock);
int toku_cv_fair_rwlock_wrlock (toku_cv_fair_rwlock_t *rwlock);
int toku_cv_fair_rwlock_unlock (toku_cv_fair_rwlock_t *rwlock);
struct toku_cv_fair_rwlock_waiter_state {
char is_read;
struct toku_cv_fair_rwlock_waiter_state *next;
toku_cond_t cond;
};
static __thread struct toku_cv_fair_rwlock_waiter_state waitstate = {0, NULL, {PTHREAD_COND_INITIALIZER} };
void toku_cv_fair_rwlock_init (toku_cv_fair_rwlock_t *rwlock) {
rwlock->state=0;
rwlock->waiters_head = NULL;
rwlock->waiters_tail = NULL;
toku_mutex_init(&rwlock->mutex, NULL);
}
void toku_cv_fair_rwlock_destroy (toku_cv_fair_rwlock_t *rwlock) {
toku_mutex_destroy(&rwlock->mutex);
}
int toku_cv_fair_rwlock_rdlock (toku_cv_fair_rwlock_t *rwlock) {
toku_mutex_lock(&rwlock->mutex);
if (rwlock->waiters_head!=NULL || rwlock->state<0) {
// Someone is ahead of me in the queue, or a writer holds the lock.
// We use per-thread state for the condition variable. A thread cannot regain control and try to reuse this waiter state for something else while it is still enqueued.
if (rwlock->waiters_tail) {
rwlock->waiters_tail->next = &waitstate;
} else {
rwlock->waiters_head = &waitstate;
}
rwlock->waiters_tail = &waitstate;
waitstate.next = NULL;
waitstate.is_read = 1;
do {
toku_cond_wait(&waitstate.cond, &rwlock->mutex);
} while (rwlock->waiters_head!=&waitstate || rwlock->state<0);
rwlock->state++;
rwlock->waiters_head=waitstate.next;
if (waitstate.next==NULL) rwlock->waiters_tail=NULL;
if (rwlock->waiters_head && rwlock->waiters_head->is_read) {
toku_cond_signal(&rwlock->waiters_head->cond);
}
} else {
// No one is waiting, and any holders are readers.
rwlock->state++;
}
toku_mutex_unlock(&rwlock->mutex);
return 0;
}
int toku_cv_fair_rwlock_wrlock (toku_cv_fair_rwlock_t *rwlock) {
toku_mutex_lock(&rwlock->mutex);
if (rwlock->waiters_head!=NULL || rwlock->state!=0) {
// Someone else is ahead of me in the queue, or someone holds the lock, so we must wait our turn.
if (rwlock->waiters_tail) {
rwlock->waiters_tail->next = &waitstate;
} else {
rwlock->waiters_head = &waitstate;
}
rwlock->waiters_tail = &waitstate;
waitstate.next = NULL;
waitstate.is_read = 0;
do {
toku_cond_wait(&waitstate.cond, &rwlock->mutex);
} while (rwlock->waiters_head!=&waitstate || rwlock->state!=0);
rwlock->waiters_head = waitstate.next;
if (waitstate.next==NULL) rwlock->waiters_tail=NULL;
}
rwlock->state = -1;
toku_mutex_unlock(&rwlock->mutex);
return 0;
}
int toku_cv_fair_rwlock_unlock (toku_cv_fair_rwlock_t *rwlock) {
toku_mutex_lock(&rwlock->mutex);
assert(rwlock->state!=0);
if (rwlock->state>0) {
rwlock->state--;
} else {
rwlock->state=0;
}
if (rwlock->state==0 && rwlock->waiters_head) {
toku_cond_signal(&rwlock->waiters_head->cond);
} else {
// printf(" No one to wake\n");
}
toku_mutex_unlock(&rwlock->mutex);
return 0;
}
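// Editor's usage sketch (not part of the original file): two readers and a
// writer contending on a toku_cv_fair_rwlock_t, using only the API declared
// above plus plain pthreads.
#include <pthread.h>
static toku_cv_fair_rwlock_t example_lock;
static int example_counter = 0;
static void *example_reader(void *arg) {
toku_cv_fair_rwlock_rdlock(&example_lock); // several readers may hold the lock at once
int observed = example_counter; // read the shared state under the lock
(void) observed;
toku_cv_fair_rwlock_unlock(&example_lock);
return arg;
}
static void *example_writer(void *arg) {
toku_cv_fair_rwlock_wrlock(&example_lock); // exclusive, queued FIFO with the readers
example_counter++;
toku_cv_fair_rwlock_unlock(&example_lock);
return arg;
}
static void example_usage(void) {
toku_cv_fair_rwlock_init(&example_lock);
pthread_t r1, w1, r2;
pthread_create(&r1, NULL, example_reader, NULL);
pthread_create(&w1, NULL, example_writer, NULL); // waits behind r1 if r1 holds the lock
pthread_create(&r2, NULL, example_reader, NULL); // fairness: r2 does not jump ahead of w1
pthread_join(r1, NULL);
pthread_join(w1, NULL);
pthread_join(r2, NULL);
toku_cv_fair_rwlock_destroy(&example_lock);
}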
...@@ -150,7 +150,7 @@ doit (bool after_child_pin) { ...@@ -150,7 +150,7 @@ doit (bool after_child_pin) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -169,7 +169,7 @@ doit (bool after_child_pin) { ...@@ -169,7 +169,7 @@ doit (bool after_child_pin) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -206,7 +206,7 @@ doit (bool after_child_pin) { ...@@ -206,7 +206,7 @@ doit (bool after_child_pin) {
node_root, node_root,
toku_cachetable_hash(c_ft->ft->cf, node_root), toku_cachetable_hash(c_ft->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -227,7 +227,7 @@ doit (bool after_child_pin) { ...@@ -227,7 +227,7 @@ doit (bool after_child_pin) {
node_leaf, node_leaf,
toku_cachetable_hash(c_ft->ft->cf, node_root), toku_cachetable_hash(c_ft->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -168,7 +168,7 @@ doit (int state) { ...@@ -168,7 +168,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -186,7 +186,7 @@ doit (int state) { ...@@ -186,7 +186,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -225,7 +225,7 @@ doit (int state) { ...@@ -225,7 +225,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(c_ft->ft->cf, node_root), toku_cachetable_hash(c_ft->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -255,7 +255,7 @@ doit (int state) { ...@@ -255,7 +255,7 @@ doit (int state) {
left_child, left_child,
toku_cachetable_hash(c_ft->ft->cf, left_child), toku_cachetable_hash(c_ft->ft->cf, left_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -271,7 +271,7 @@ doit (int state) { ...@@ -271,7 +271,7 @@ doit (int state) {
right_child, right_child,
toku_cachetable_hash(c_ft->ft->cf, right_child), toku_cachetable_hash(c_ft->ft->cf, right_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -288,7 +288,7 @@ doit (int state) { ...@@ -288,7 +288,7 @@ doit (int state) {
left_child, left_child,
toku_cachetable_hash(c_ft->ft->cf, left_child), toku_cachetable_hash(c_ft->ft->cf, left_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -188,7 +188,7 @@ doit (int state) { ...@@ -188,7 +188,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -206,7 +206,7 @@ doit (int state) { ...@@ -206,7 +206,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -245,7 +245,7 @@ doit (int state) { ...@@ -245,7 +245,7 @@ doit (int state) {
node_root, node_root,
toku_cachetable_hash(c_ft->ft->cf, node_root), toku_cachetable_hash(c_ft->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -266,7 +266,7 @@ doit (int state) { ...@@ -266,7 +266,7 @@ doit (int state) {
left_child, left_child,
toku_cachetable_hash(c_ft->ft->cf, left_child), toku_cachetable_hash(c_ft->ft->cf, left_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -282,7 +282,7 @@ doit (int state) { ...@@ -282,7 +282,7 @@ doit (int state) {
right_child, right_child,
toku_cachetable_hash(c_ft->ft->cf, right_child), toku_cachetable_hash(c_ft->ft->cf, right_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -164,7 +164,7 @@ doit (bool after_split) { ...@@ -164,7 +164,7 @@ doit (bool after_split) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -182,7 +182,7 @@ doit (bool after_split) { ...@@ -182,7 +182,7 @@ doit (bool after_split) {
node_root, node_root,
toku_cachetable_hash(t->ft->cf, node_root), toku_cachetable_hash(t->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -221,7 +221,7 @@ doit (bool after_split) { ...@@ -221,7 +221,7 @@ doit (bool after_split) {
node_root, node_root,
toku_cachetable_hash(c_ft->ft->cf, node_root), toku_cachetable_hash(c_ft->ft->cf, node_root),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -249,7 +249,7 @@ doit (bool after_split) { ...@@ -249,7 +249,7 @@ doit (bool after_split) {
left_child, left_child,
toku_cachetable_hash(c_ft->ft->cf, left_child), toku_cachetable_hash(c_ft->ft->cf, left_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -265,7 +265,7 @@ doit (bool after_split) { ...@@ -265,7 +265,7 @@ doit (bool after_split) {
right_child, right_child,
toku_cachetable_hash(c_ft->ft->cf, right_child), toku_cachetable_hash(c_ft->ft->cf, right_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -282,7 +282,7 @@ doit (bool after_split) { ...@@ -282,7 +282,7 @@ doit (bool after_split) {
left_child, left_child,
toku_cachetable_hash(c_ft->ft->cf, left_child), toku_cachetable_hash(c_ft->ft->cf, left_child),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -166,7 +166,7 @@ doit (void) { ...@@ -166,7 +166,7 @@ doit (void) {
node_leaf, node_leaf,
toku_cachetable_hash(brt->ft->cf, node_leaf), toku_cachetable_hash(brt->ft->cf, node_leaf),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -195,7 +195,7 @@ doit (void) { ...@@ -195,7 +195,7 @@ doit (void) {
node_leaf, node_leaf,
toku_cachetable_hash(brt->ft->cf, node_leaf), toku_cachetable_hash(brt->ft->cf, node_leaf),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -215,7 +215,7 @@ doit (void) { ...@@ -215,7 +215,7 @@ doit (void) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -239,7 +239,7 @@ doit (void) { ...@@ -239,7 +239,7 @@ doit (void) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -172,7 +172,7 @@ doit (bool keep_other_bn_in_memory) { ...@@ -172,7 +172,7 @@ doit (bool keep_other_bn_in_memory) {
node_leaf, node_leaf,
toku_cachetable_hash(brt->ft->cf, node_leaf), toku_cachetable_hash(brt->ft->cf, node_leaf),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -220,7 +220,7 @@ doit (bool keep_other_bn_in_memory) { ...@@ -220,7 +220,7 @@ doit (bool keep_other_bn_in_memory) {
node_leaf, node_leaf,
toku_cachetable_hash(brt->ft->cf, node_leaf), toku_cachetable_hash(brt->ft->cf, node_leaf),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -245,7 +245,7 @@ doit (bool keep_other_bn_in_memory) { ...@@ -245,7 +245,7 @@ doit (bool keep_other_bn_in_memory) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -269,7 +269,7 @@ doit (bool keep_other_bn_in_memory) { ...@@ -269,7 +269,7 @@ doit (bool keep_other_bn_in_memory) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
...@@ -158,7 +158,7 @@ doit (void) { ...@@ -158,7 +158,7 @@ doit (void) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
...@@ -181,7 +181,7 @@ doit (void) { ...@@ -181,7 +181,7 @@ doit (void) {
node_internal, node_internal,
toku_cachetable_hash(brt->ft->cf, node_internal), toku_cachetable_hash(brt->ft->cf, node_internal),
&bfe, &bfe,
true, PL_WRITE_EXPENSIVE,
0, 0,
NULL, NULL,
&node &node
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id: test-rwlock.cc 46971 2012-08-18 22:03:43Z zardosht $"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#include <toku_pthread.h>
#include <toku_portability.h>
#include <toku_time.h>
#include <toku_assert.h>
#include <toku_portability.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include "rwlock.h"
#include <sys/types.h>
#include "rwlock_condvar.h"
#include "toku_fair_rwlock.h"
#include "frwlock.h"
toku_mutex_t mutex;
toku::frwlock w;
static void grab_write_lock(bool expensive) {
toku_mutex_lock(&mutex);
w.write_lock(expensive);
toku_mutex_unlock(&mutex);
}
static void release_write_lock(void) {
toku_mutex_lock(&mutex);
w.write_unlock();
toku_mutex_unlock(&mutex);
}
static void grab_read_lock(void) {
toku_mutex_lock(&mutex);
w.read_lock();
toku_mutex_unlock(&mutex);
}
static void release_read_lock(void) {
toku_mutex_lock(&mutex);
w.read_unlock();
toku_mutex_unlock(&mutex);
}
static void *do_cheap_wait(void *arg) {
grab_write_lock(false);
release_write_lock();
return arg;
}
static void *do_expensive_wait(void *arg) {
grab_write_lock(true);
release_write_lock();
return arg;
}
static void *do_read_wait(void *arg) {
grab_read_lock();
release_read_lock();
return arg;
}
static void launch_cheap_waiter(void) {
toku_pthread_t tid;
int r = toku_pthread_create(&tid, NULL, do_cheap_wait, NULL);
assert_zero(r);
toku_pthread_detach(tid);
sleep(1);
}
static void launch_expensive_waiter(void) {
toku_pthread_t tid;
int r = toku_pthread_create(&tid, NULL, do_expensive_wait, NULL);
assert_zero(r);
toku_pthread_detach(tid);
sleep(1);
}
static void launch_reader(void) {
toku_pthread_t tid;
int r = toku_pthread_create(&tid, NULL, do_read_wait, NULL);
assert_zero(r);
toku_pthread_detach(tid);
sleep(1);
}
static void test_write_cheapness(void) {
toku_mutex_init(&mutex, NULL);
w.init(&mutex);
// single expensive write lock
grab_write_lock(true);
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_write_lock();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
// single cheap write lock
grab_write_lock(false);
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
release_write_lock();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
// multiple read locks
grab_read_lock();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
grab_read_lock();
grab_read_lock();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
release_read_lock();
release_read_lock();
release_read_lock();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
// expensive write lock and cheap writers waiting
grab_write_lock(true);
launch_cheap_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
launch_cheap_waiter();
launch_cheap_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_write_lock();
sleep(1);
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
// cheap write lock and expensive writer waiting
grab_write_lock(false);
launch_expensive_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_write_lock();
sleep(1);
// expensive write lock and expensive waiter
grab_write_lock(true);
launch_expensive_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_write_lock();
sleep(1);
// cheap write lock and cheap waiter
grab_write_lock(false);
launch_cheap_waiter();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
release_write_lock();
sleep(1);
// read lock held and cheap waiter
grab_read_lock();
launch_cheap_waiter();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
// add expensive waiter
launch_expensive_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_read_lock();
sleep(1);
// read lock held and expensive waiter
grab_read_lock();
launch_expensive_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
// add cheap waiter
launch_cheap_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_read_lock();
sleep(1);
// cheap write lock held and waiting read
grab_write_lock(false);
launch_reader();
assert(!w.write_lock_is_expensive());
assert(!w.read_lock_is_expensive());
launch_expensive_waiter();
assert(w.write_lock_is_expensive());
// tricky case here: we have a launched reader that should already be in the
// queue, so a new read lock would piggyback off that queued reader and
// therefore stays cheap (see the model after test_main below)
assert(!w.read_lock_is_expensive());
release_write_lock();
sleep(1);
// expensive write lock held and waiting read
grab_write_lock(true);
launch_reader();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
launch_cheap_waiter();
assert(w.write_lock_is_expensive());
assert(w.read_lock_is_expensive());
release_write_lock();
sleep(1);
w.deinit();
toku_mutex_destroy(&mutex);
}
int main (int UU(argc), const char* UU(argv[])) {
test_write_cheapness();
return 0;
}
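// Editor's model (hypothetical, not part of the original test): the assertions
// above follow one rule for the write side -- write_lock_is_expensive() is true
// iff an expensive writer currently holds the lock or an expensive writer is
// waiting. read_lock_is_expensive() usually agrees, except in the tricky case
// above: a reader already queued lets a new read lock piggyback and stay cheap.
#include <cassert>
#include <vector>
struct model_writer { bool expensive; };
// holder_expensive: an expensive writer currently holds the lock.
// queued: expensiveness of each writer waiting in the queue.
static bool model_write_lock_is_expensive(bool holder_expensive, const std::vector<model_writer> &queued) {
if (holder_expensive) return true;
for (const model_writer &mw : queued) {
if (mw.expensive) return true;
}
return false;
}
static void model_check(void) {
assert(model_write_lock_is_expensive(true, {})); // single expensive write lock
assert(!model_write_lock_is_expensive(false, {})); // single cheap write lock
assert(model_write_lock_is_expensive(false, {{true}})); // cheap lock held, expensive waiter
assert(!model_write_lock_is_expensive(false, {{false}})); // cheap lock held, cheap waiter
}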
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "$Id$"
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
// Here are some timing numbers:
// (Note: The not-quite-working version with cas can be found in r22519 of https://svn.tokutek.com/tokudb/toku/tokudb.2825/. It's about as fast as "Best cas".)
//
// On ramie (2.53GHz E5540)
// Best nop time= 1.074300ns
// Best cas time= 8.595600ns
// Best mutex time= 19.340201ns
// Best rwlock time= 34.024799ns
// Best newbrt rwlock time= 38.680500ns
// Best prelocked time= 2.148700ns
// Best fair rwlock time= 45.127600ns
// On laptop
// Best nop time= 2.876000ns
// Best cas time= 15.362500ns
// Best mutex time= 51.951498ns
// Best rwlock time= 97.721201ns
// Best newbrt rwlock time=110.456800ns
// Best prelocked time= 4.240100ns
// Best fair rwlock time=113.119102ns
//
// Analysis: If the mutex can be prelocked (as the cachetable does: it uses the same mutex for the cachetable and for the condition variable protecting the cachetable),
// then you can save quite a bit. What does the cachetable do?
// During pin (in the common case): it grabs the mutex, grabs a read lock, and releases the mutex.
// During unpin: it grabs the mutex, unlocks the rwlock in the pair, and releases the mutex.
// Both actions must acquire the cachetable mutex anyway, so it definitely saves time to run the rwlock operations under that same mutex (see the sketch after the includes below).
#include <toku_pthread.h>
#include <toku_portability.h>
#include <toku_time.h>
#include <toku_assert.h>
#include <toku_portability.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include "rwlock.h"
#include <sys/types.h>
#include "rwlock_condvar.h"
#include "toku_fair_rwlock.h"
#include "frwlock.h"
static int verbose=1;
static int timing_only=0;
static void parse_args (int argc, const char *argv[]) {
const char *progname = argv[0];
argc--; argv++;
while (argc>0) {
if (strcmp(argv[0], "-v")==0) {
verbose++;
} else if (strcmp(argv[0], "-q")==0) {
verbose--;
} else if (strcmp(argv[0], "--timing-only")==0) {
timing_only=1;
} else {
fprintf(stderr, "Usage: %s {-q}* {-v}* {--timing-only}\n", progname);
exit(1);
}
argc--; argv++;
}
}
static const int T=6;
static const int N=10000000;
static double best_nop_time=1e12;
static double best_fcall_time=1e12;
static double best_cas_time=1e12;
static double best_mutex_time=1e12;
static double best_rwlock_time=1e12;
static double best_newbrt_time=1e12;
static double best_prelocked_time=1e12;
static double best_cv_fair_rwlock_time=1e12; // fair from condition variables
static double best_fair_rwlock_time=1e12;
static double best_frwlock_time=1e12;
static double best_frwlock_prelocked_time=1e12;
static double mind(double a, double b) { if (a<b) return a; else return b; }
#if 0
// gcc 4.4.4 (fedora 12) doesn't introduce memory barriers on these writes, so I think that volatile is not enough for sequential consistency.
// Intel guarantees that writes are seen in the same order as they were performed on one processor. But if there were two processors, funny things could happen.
volatile int sc_a, sc_b;
void sequential_consistency (void) {
sc_a = 1;
sc_b = 0;
}
#endif
// Declaring val to be volatile produces essentially identical code to putting the asm volatile memory statements in.
// gcc is not introducing memory barriers to force sequential consistency on volatile memory writes.
// That's probably good enough for us, since we'll have a barrier instruction anywhere it matters.
volatile int val = 0;
static void time_nop (void) __attribute((__noinline__)); // don't want it inline, because it messes up timing.
static void time_nop (void) {
struct timeval start,end;
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
if (val!=0) abort();
val=1;
//__asm__ volatile ("" : : : "memory");
val=0;
//__asm__ volatile ("" : : : "memory");
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "nop = %.6fns/(lock+unlock)\n", diff);
best_nop_time=mind(best_nop_time,diff);
}
}
// This function is defined so we can measure the cost of a function call.
int fcall_nop (int i) __attribute__((__noinline__));
int fcall_nop (int i) {
return i;
}
void time_fcall (void) __attribute((__noinline__));
void time_fcall (void) {
struct timeval start,end;
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
fcall_nop(i);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "fcall = %.6fns/(lock+unlock)\n", diff);
best_fcall_time=mind(best_fcall_time,diff);
}
}
void time_cas (void) __attribute__((__noinline__));
void time_cas (void) {
volatile int64_t tval = 0;
struct timeval start,end;
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
{ int r = __sync_val_compare_and_swap(&tval, 0, 1); assert(r==0); }
{ int r = __sync_val_compare_and_swap(&tval, 1, 0); assert(r==1); }
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "cas = %.6fns/(lock+unlock)\n", diff);
best_cas_time=mind(best_cas_time,diff);
}
}
void time_pthread_mutex (void) __attribute__((__noinline__));
void time_pthread_mutex (void) {
pthread_mutex_t mutex;
{ int r = pthread_mutex_init(&mutex, NULL); assert(r==0); }
struct timeval start,end;
pthread_mutex_lock(&mutex);
pthread_mutex_unlock(&mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
pthread_mutex_lock(&mutex);
pthread_mutex_unlock(&mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "pthread_mutex = %.6fns/(lock+unlock)\n", diff);
best_mutex_time=mind(best_mutex_time,diff);
}
{ int r = pthread_mutex_destroy(&mutex); assert(r==0); }
}
void time_pthread_rwlock (void) __attribute__((__noinline__));
void time_pthread_rwlock (void) {
pthread_rwlock_t mutex;
{ int r = pthread_rwlock_init(&mutex, NULL); assert(r==0); }
struct timeval start,end;
pthread_rwlock_rdlock(&mutex);
pthread_rwlock_unlock(&mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
pthread_rwlock_rdlock(&mutex);
pthread_rwlock_unlock(&mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "pthread_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
best_rwlock_time=mind(best_rwlock_time,diff);
}
{ int r = pthread_rwlock_destroy(&mutex); assert(r==0); }
}
static void newbrt_rwlock_lock (RWLOCK rwlock, toku_mutex_t *mutex) {
toku_mutex_lock(mutex);
rwlock_read_lock(rwlock, mutex);
toku_mutex_unlock(mutex);
}
static void newbrt_rwlock_unlock (RWLOCK rwlock, toku_mutex_t *mutex) {
toku_mutex_lock(mutex);
rwlock_read_unlock(rwlock);
toku_mutex_unlock(mutex);
}
// Time the read lock that's in newbrt/rwlock.h
void time_newbrt_rwlock (void) __attribute((__noinline__));
void time_newbrt_rwlock (void) {
struct rwlock rwlock;
toku_mutex_t external_mutex;
toku_mutex_init(&external_mutex, NULL);
rwlock_init(&rwlock);
struct timeval start,end;
newbrt_rwlock_lock(&rwlock, &external_mutex);
newbrt_rwlock_unlock(&rwlock, &external_mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
newbrt_rwlock_lock(&rwlock, &external_mutex);
newbrt_rwlock_unlock(&rwlock, &external_mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "newbrt_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
best_newbrt_time=mind(best_newbrt_time,diff);
}
rwlock_destroy(&rwlock);
toku_mutex_destroy(&external_mutex);
}
// Time the read lock that's in newbrt/rwlock.h, assuming the mutex is already held.
void time_newbrt_prelocked_rwlock (void) __attribute__((__noinline__));
void time_newbrt_prelocked_rwlock (void) {
struct rwlock rwlock;
toku_mutex_t external_mutex;
toku_mutex_init(&external_mutex, NULL);
toku_mutex_lock(&external_mutex);
rwlock_init(&rwlock);
struct timeval start,end;
rwlock_read_lock(&rwlock, &external_mutex);
rwlock_read_unlock(&rwlock);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
rwlock_read_lock(&rwlock, &external_mutex);
rwlock_read_unlock(&rwlock);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "pre_newbrt_rwlock(r) = %.6fns/(lock+unlock)\n", diff);
best_prelocked_time=mind(best_prelocked_time,diff);
}
rwlock_destroy(&rwlock);
toku_mutex_unlock(&external_mutex);
toku_mutex_destroy(&external_mutex);
}
void time_toku_fair_rwlock (void) __attribute__((__noinline__));
void time_toku_fair_rwlock (void) {
toku_fair_rwlock_t mutex;
toku_fair_rwlock_init(&mutex);
struct timeval start,end;
toku_fair_rwlock_rdlock(&mutex);
toku_fair_rwlock_unlock(&mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
toku_fair_rwlock_rdlock(&mutex);
toku_fair_rwlock_unlock(&mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "pthread_fair(r) = %.6fns/(lock+unlock)\n", diff);
best_fair_rwlock_time=mind(best_fair_rwlock_time,diff);
}
toku_fair_rwlock_destroy(&mutex);
}
/* not static */
void time_toku_cv_fair_rwlock(void) __attribute__((__noinline__));
void time_toku_cv_fair_rwlock(void) {
toku_cv_fair_rwlock_t mutex;
toku_cv_fair_rwlock_init(&mutex);
struct timeval start,end;
toku_cv_fair_rwlock_rdlock(&mutex);
toku_cv_fair_rwlock_unlock(&mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
toku_cv_fair_rwlock_rdlock(&mutex);
toku_cv_fair_rwlock_unlock(&mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "pthread_cvfair(r) = %.6fns/(lock+unlock)\n", diff);
best_cv_fair_rwlock_time=mind(best_cv_fair_rwlock_time,diff);
}
toku_cv_fair_rwlock_destroy(&mutex);
}
void time_frwlock_prelocked(void) __attribute__((__noinline__));
void time_frwlock_prelocked(void) {
toku_mutex_t external_mutex;
toku_mutex_init(&external_mutex, NULL);
struct timeval start,end;
toku::frwlock x;
x.init(&external_mutex);
toku_mutex_lock(&external_mutex);
bool got_lock;
x.read_lock();
x.read_unlock();
got_lock = x.try_read_lock();
invariant(got_lock);
x.read_unlock();
x.write_lock(true);
x.write_unlock();
got_lock = x.try_write_lock(true);
invariant(got_lock);
x.write_unlock();
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
x.read_lock();
x.read_unlock();
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "frwlock_prelocked = %.6fns/(lock+unlock)\n", diff);
best_frwlock_prelocked_time=mind(best_frwlock_prelocked_time,diff);
}
x.deinit();
toku_mutex_unlock(&external_mutex);
toku_mutex_destroy(&external_mutex);
}
void time_frwlock(void) __attribute__((__noinline__));
void time_frwlock(void) {
toku_mutex_t external_mutex;
toku_mutex_init(&external_mutex, NULL);
struct timeval start,end;
toku::frwlock x;
x.init(&external_mutex);
toku_mutex_lock(&external_mutex);
x.read_lock();
x.read_unlock();
toku_mutex_unlock(&external_mutex);
for (int t=0; t<T; t++) {
gettimeofday(&start, NULL);
for (int i=0; i<N; i++) {
toku_mutex_lock(&external_mutex);
x.read_lock();
toku_mutex_unlock(&external_mutex);
toku_mutex_lock(&external_mutex);
x.read_unlock();
toku_mutex_unlock(&external_mutex);
}
gettimeofday(&end, NULL);
double diff = 1e9*toku_tdiff(&end, &start)/N;
if (verbose>1)
fprintf(stderr, "frwlock = %.6fns/(lock+unlock)\n", diff);
best_frwlock_time=mind(best_frwlock_time,diff);
}
x.deinit();
toku_mutex_destroy(&external_mutex);
}
// Parameters for the correctness test below: N threads, L iterations per thread, T microseconds of hold time per iteration.
// (From here on these #defines supersede the timing constants of the same names defined above.)
#define N 6
#define T 100000
#define L 5
#define N_LOG_ENTRIES (L*N*4)
static toku_fair_rwlock_t rwlock;
static struct log_s {
int threadid, loopid;
char action;
} actionlog[N_LOG_ENTRIES];
static int log_counter=0;
static void logit (int threadid, int loopid, char action) {
//printf("%d %d %c\n", threadid, loopid, action);
int my_log_counter = __sync_fetch_and_add(&log_counter, 1);
assert(my_log_counter<N_LOG_ENTRIES);
actionlog[my_log_counter].threadid = threadid;
actionlog[my_log_counter].loopid = loopid;
actionlog[my_log_counter].action = action;
}
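// Note on the technique: __sync_fetch_and_add reserves a unique slot in
// actionlog, so threads append concurrently without taking a lock, and each
// thread then owns its slot exclusively. The same reservation written with
// C++11 atomics would look like this (a sketch, assuming <atomic> is available
// on the target toolchain):
//     static std::atomic<int> counter;
//     int slot = counter.fetch_add(1, std::memory_order_relaxed);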
// The action should look like this:
// Threads 0-2 are reader threads; threads 3-5 are writer threads.
// Each thread repeatedly grabs its lock, sleeps for time T, and releases it,
// L times in a row.
// If the readers can starve the writers, then most of the writer entries will
// cluster at the end of the action log; if the writers can starve the readers,
// then most of the reader entries will.
// The expected schedule: the three reader threads (0-2) ask for the lock at
// about the same time and are all granted it, holding it concurrently. The
// writer threads wake up and ask for the lock while the readers still hold it,
// so a fair lock grants them the write lock one after another once the readers
// finish, rather than letting later readers jump the queue.
extern __thread int mytid;
static void grab_rdlock (int threadid, int iteration) {
logit(threadid, iteration, 't');
{ int r = toku_fair_rwlock_rdlock(&rwlock); assert(r==0); }
logit(threadid, iteration, 'R');
}
static void release_rdlock (int threadid, int iteration) {
logit(threadid, iteration, 'u');
{ int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0); }
}
static void grab_wrlock (int threadid, int iteration) {
logit(threadid, iteration, 'T');
{ int r = toku_fair_rwlock_wrlock(&rwlock); assert(r==0); }
logit(threadid, iteration, 'W');
}
static void release_wrlock (int threadid, int iteration) {
logit(threadid, iteration, 'U');
{ int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0);}
}
static void *start_thread (void *vv) {
int *vp=(int*)vv;
int v=*vp;
//printf("T%d=%ld\n", v, pthread_self());
switch(v) {
case 0:
case 1:
case 2:
for (int i=0; i<L; i++) {
grab_rdlock(v, i);
usleep(T);
release_rdlock(v, i);
}
break;
case 3:
case 4:
case 5:
for (int i=0; i<L; i++) {
grab_wrlock(v, i);
usleep(T);
release_wrlock(v, i);
}
}
return NULL;
}
static void *start_thread_random (void *vv) {
int *vp=(int*)vv;
int v=*vp;
for (int i=0; i<L; i++) {
if (random()%2==0) {
grab_rdlock(v, i);
for (int j=0; j<random()%20; j++) sched_yield();
release_rdlock(v, i);
for (int j=0; j<random()%20; j++) sched_yield();
} else {
grab_wrlock(v, i);
for (int j=0; j<random()%20; j++) sched_yield();
release_wrlock(v, i);
for (int j=0; j<random()%20; j++) sched_yield();
}
}
return NULL;
}
static void check_actionlog (int expected_writer_max_count,
int expected_reader_parallelism_min,
int expected_reader_parallelism_max)
// Effect:
//  Make sure that writers are exclusive.
//  Make sure that anyone who asks for a lock doesn't already hold one.
//  Make sure that anyone granted a lock actually asked for it.
//  Make sure that anyone who releases a lock actually holds it.
//  Make sure that readers don't starve writers, and writers don't starve readers. (Not sure how to code this up...)
{
int reader_max=0;
int writer_max=0;
int state=0;
char tstate[N];
for (int i=0; i<N; i++) tstate[i]=0;
for (int i=0; i<log_counter; i++) {
switch (actionlog[i].action) {
case 't': // fall through to 'T'
case 'T':
assert(tstate[actionlog[i].threadid]==0);
tstate[actionlog[i].threadid]=actionlog[i].action;
break;
case 'W':
assert(tstate[actionlog[i].threadid]=='T');
tstate[actionlog[i].threadid]=actionlog[i].action;
assert(state==0);
state=-1;
writer_max = 1;
break;
case 'U':
assert(tstate[actionlog[i].threadid]=='W');
tstate[actionlog[i].threadid]=0;
assert(state==-1);
state=0;
break;
case 'R':
assert(tstate[actionlog[i].threadid]=='t');
tstate[actionlog[i].threadid]=actionlog[i].action;
if (state<0) { printf("On step %d\n", i); }
assert(state>=0);
state++;
if (state>reader_max) reader_max=state;
break;
case 'u':
assert(tstate[actionlog[i].threadid]=='R');
tstate[actionlog[i].threadid]=0;
assert(state>=0);
state--;
break;
default:
abort();
}
}
assert(reader_max>=expected_reader_parallelism_min);
assert(reader_max<=expected_reader_parallelism_max);
assert(writer_max==expected_writer_max_count);
}
static void test_rwlock_internal (void *(*start_th)(void*), int max_wr, int min_rd, int max_rd) {
if (verbose>=2) printf("Running threads:\n");
log_counter=0;
pthread_t threads[N];
int v[N];
toku_fair_rwlock_init(&rwlock);
for (int i=0; i<N; i++) {
v[i]=i;
int r = pthread_create(&threads[i], NULL, start_th, &v[i]);
assert(r==0);
}
for (int i=0; i<N; i++) {
void *rv;
int r = pthread_join(threads[i], &rv);
assert(rv==NULL);
assert(r==0);
}
if (verbose>1) {
for (int i=0; i<log_counter; i++) {
printf("%d: %*s%c%d\n", i, actionlog[i].threadid*4, "", actionlog[i].action, actionlog[i].loopid);
}
}
check_actionlog(max_wr, min_rd, max_rd);
toku_fair_rwlock_destroy(&rwlock);
if (verbose>2) printf("OK\n");
}
static void test_rwlock (void) {
test_rwlock_internal(start_thread, 1, 2, 3);
for (int i=0; i<10; i++) {
test_rwlock_internal(start_thread_random, 1, 0, N);
}
}
int main (int argc, const char *argv[]) {
parse_args(argc, argv);
if (timing_only) {
if (1) { // to make it easy to only time the templated frwlock
time_nop();
time_fcall();
time_cas();
time_pthread_mutex();
time_pthread_rwlock();
time_newbrt_rwlock();
time_newbrt_prelocked_rwlock();
time_toku_cv_fair_rwlock();
time_toku_fair_rwlock();
}
time_frwlock();
time_frwlock_prelocked();
if (verbose>0) {
if (1) { // to make it easy to only time the templated frwlock
printf("// Best nop time=%10.6fns\n", best_nop_time);
printf("// Best fcall time=%10.6fns\n", best_fcall_time);
printf("// Best cas time=%10.6fns\n", best_cas_time);
printf("// Best mutex time=%10.6fns\n", best_mutex_time);
printf("// Best rwlock time=%10.6fns\n", best_rwlock_time);
printf("// Best newbrt rwlock time=%10.6fns\n", best_newbrt_time);
printf("// Best prelocked time=%10.6fns\n", best_prelocked_time);
printf("// Best fair cv rwlock time=%10.6fns\n", best_cv_fair_rwlock_time);
printf("// Best fair fast rwlock time=%10.6fns\n", best_fair_rwlock_time);
}
printf("// Best frwlock time=%10.6fns\n", best_frwlock_time);
printf("// Best frwlock_pre time=%10.6fns\n", best_frwlock_prelocked_time);
}
} else {
test_rwlock();
}
return 0;
}
...
@@ -77,7 +77,7 @@ doit (void) {
         node_internal,
         toku_cachetable_hash(t->ft->cf, node_internal),
         &bfe,
-        true,
+        PL_WRITE_EXPENSIVE,
         0,
         NULL,
         &node
...
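// For context: the pin call's bool write flag has been replaced by an explicit
// pin-mode argument. A hedged sketch of what such a mode enum looks like,
// inferred from the PL_WRITE_EXPENSIVE name used above (the members and values
// below are assumptions, not copied from the cachetable header):
enum example_pair_lock_type {
    EXAMPLE_PL_READ = 0,         // pin the PAIR for reading
    EXAMPLE_PL_WRITE_CHEAP,      // pin for writing; the write is expected to be quick
    EXAMPLE_PL_WRITE_EXPENSIVE   // pin for writing; the write may hold the pin a long time
};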
...
@@ -39,6 +39,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include "../../ft/rwlock.h"
+#include "../../ft/frwlock.h"
 #include "toku_fair_rwlock.h"
 #include <sys/types.h>
@@ -311,6 +312,10 @@ void time_toku_cv_fair_rwlock (void) {
 #define N_LOG_ENTRIES (L*N*4)
 static toku_fair_rwlock_t rwlock;
+static toku::frwlock frwlock;
+static toku_mutex_t fmutex;
+static bool use_frwlock_for_locking;
 static struct log_s {
     int threadid, loopid;
@@ -344,24 +349,44 @@ static void logit (int threadid, int loopid, char action) {
 static void grab_rdlock (int threadid, int iteration) {
     logit(threadid, iteration, 't');
-    { int r = toku_fair_rwlock_rdlock(&rwlock); assert(r==0); }
+    if (use_frwlock_for_locking) {
+        toku_mutex_lock(&fmutex);
+        frwlock.read_lock();
+        toku_mutex_unlock(&fmutex);
+    }
+    else { int r = toku_fair_rwlock_rdlock(&rwlock); assert(r==0); }
     logit(threadid, iteration, 'R');
 }
 static void release_rdlock (int threadid, int iteration) {
     logit(threadid, iteration, 'u');
-    { int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0); }
+    if (use_frwlock_for_locking) {
+        toku_mutex_lock(&fmutex);
+        frwlock.read_unlock();
+        toku_mutex_unlock(&fmutex);
+    }
+    else { int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0); }
 }
 static void grab_wrlock (int threadid, int iteration) {
     logit(threadid, iteration, 'T');
-    { int r = toku_fair_rwlock_wrlock(&rwlock); assert(r==0); }
+    if (use_frwlock_for_locking) {
+        toku_mutex_lock(&fmutex);
+        frwlock.write_lock(true);
+        toku_mutex_unlock(&fmutex);
+    }
+    else { int r = toku_fair_rwlock_wrlock(&rwlock); assert(r==0); }
     logit(threadid, iteration, 'W');
 }
 static void release_wrlock (int threadid, int iteration) {
     logit(threadid, iteration, 'U');
-    { int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0);}
+    if (use_frwlock_for_locking) {
+        toku_mutex_lock(&fmutex);
+        frwlock.write_unlock();
+        toku_mutex_unlock(&fmutex);
+    }
+    else { int r = toku_fair_rwlock_unlock(&rwlock); assert(r==0);}
 }
 static void *start_thread (void *vv) {
@@ -394,18 +419,23 @@ static void *start_thread (void *vv) {
 static void *start_thread_random (void *vv) {
     int *vp=(int*)vv;
     int v=*vp;
+    int wait;
     for (int i=0; i<L; i++) {
         if (random()%2==0) {
             grab_rdlock(v, i);
-            for (int j=0; j<random()%20; j++) sched_yield();
+            wait = random() % 20;
+            for (int j=0; j<wait; j++) sched_yield();
             release_rdlock(v, i);
-            for (int j=0; j<random()%20; j++) sched_yield();
+            wait = random() % 20;
+            for (int j=0; j<wait; j++) sched_yield();
         } else {
             grab_wrlock(v, i);
-            for (int j=0; j<random()%20; j++) sched_yield();
+            wait = random() % 20;
+            for (int j=0; j<wait; j++) sched_yield();
             release_wrlock(v, i);
-            for (int j=0; j<random()%20; j++) sched_yield();
+            wait = random() % 20;
+            for (int j=0; j<wait; j++) sched_yield();
         }
     }
     return NULL;
@@ -470,12 +500,19 @@ static void check_actionlog (int expected_writer_max_count,
 }
-static void test_rwlock_internal (void *(*start_th)(void*), int max_wr, int min_rd, int max_rd) {
+static void test_rwlock_internal (void *(*start_th)(void*), bool use_frwlock, int max_wr, int min_rd, int max_rd) {
     if (verbose>=2) printf("Running threads:\n");
     log_counter=0;
     pthread_t threads[N];
     int v[N];
+    use_frwlock_for_locking = use_frwlock;
+    if (use_frwlock_for_locking) {
+        fmutex = TOKU_MUTEX_INITIALIZER;
+        frwlock.init(&fmutex);
+    }
+    else {
         toku_fair_rwlock_init(&rwlock);
+    }
     for (int i=0; i<N; i++) {
         v[i]=i;
         int r = pthread_create(&threads[i], NULL, start_th, &v[i]);
@@ -493,14 +530,20 @@ static void test_rwlock_internal (void *(*start_th)(void*), int max_wr, int min_
         }
     }
     check_actionlog(max_wr, min_rd, max_rd);
-    toku_fair_rwlock_destroy(&rwlock);
+    if (use_frwlock_for_locking) {
+        frwlock.deinit();
+        toku_mutex_destroy(&fmutex);
+    }
+    else {
+        toku_fair_rwlock_destroy(&rwlock);
+    }
     if (verbose>2) printf("OK\n");
 }
-static void test_rwlock (void) {
-    test_rwlock_internal(start_thread, 1, 2, 3);
+static void test_rwlock (bool use_frwlock) {
+    test_rwlock_internal(start_thread, use_frwlock, 1, 2, 3);
     for (int i=0; i<10; i++) {
-        test_rwlock_internal(start_thread_random, 1, 0, N);
+        test_rwlock_internal(start_thread_random, use_frwlock, 1, 0, N);
     }
 }
 int main (int argc, const char *argv[]) {
@@ -527,7 +570,8 @@ int main (int argc, const char *argv[]) {
         printf("// Best fair fast rwlock time=%10.6fns\n", best_fair_rwlock_time);
         }
     } else {
-        test_rwlock();
+        test_rwlock(true);
+        test_rwlock(false);
     }
     return 0;
 }
...
...
@@ -36,8 +36,10 @@ typedef struct toku_mutex {
 #if defined(__APPLE__)
 static const toku_mutex_t ZERO_MUTEX_INITIALIZER = {{0}};
+static const toku_mutex_t TOKU_MUTEX_INITIALIZER = { .pmutex = PTHREAD_MUTEX_INITIALIZER };
 #else
 static const toku_mutex_t ZERO_MUTEX_INITIALIZER = {{{0}}};
+static const toku_mutex_t TOKU_MUTEX_INITIALIZER = { .pmutex = PTHREAD_MUTEX_INITIALIZER };
 #endif
 static inline void
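// A minimal usage sketch, mirroring how the stress test above consumes the new
// initializer: since TOKU_MUTEX_INITIALIZER is a const object rather than a
// macro, it is copy-assigned before first use instead of appearing in a static
// initializer (the names below are illustrative):
static toku_mutex_t example_mutex;
static inline void example_reset_and_use(void) {
    example_mutex = TOKU_MUTEX_INITIALIZER;  // same effect as a fresh default init
    toku_mutex_lock(&example_mutex);
    // ... critical section ...
    toku_mutex_unlock(&example_mutex);
}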
@@ -95,6 +97,8 @@ typedef struct toku_cond {
     pthread_cond_t pcond;
 } toku_cond_t;
+#define TOKU_COND_INITIALIZER {PTHREAD_COND_INITIALIZER}
 static inline void
 toku_cond_init(toku_cond_t *cond, const toku_pthread_condattr_t *attr) {
     int r = pthread_cond_init(&cond->pcond, attr);
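// TOKU_COND_INITIALIZER, by contrast, is a macro, so it can appear directly in
// a static initializer; a one-line sketch (the name is illustrative):
static toku_cond_t example_cond = TOKU_COND_INITIALIZER;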
@@ -205,6 +209,11 @@ toku_pthread_join(toku_pthread_t thread, void **value_ptr) {
     return pthread_join(thread, value_ptr);
 }
+static inline int
+toku_pthread_detach(toku_pthread_t thread) {
+    return pthread_detach(thread);
+}
 static inline int
 toku_pthread_key_create(toku_pthread_key_t *key, void (*destroyf)(void *)) {
     return pthread_key_create(key, destroyf);
...
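// A hedged usage sketch for the new wrapper: spawn a fire-and-forget worker
// and detach it so its resources are reclaimed at thread exit with no join
// (toku_pthread_create is assumed to wrap pthread_create the way the other
// helpers in this header wrap their pthread counterparts):
static inline void example_spawn_detached(void *(*f)(void *), void *extra) {
    toku_pthread_t tid;
    int r = toku_pthread_create(&tid, NULL, f, extra);
    if (r == 0) {
        r = toku_pthread_detach(tid);  // the thread now cleans up after itself
    }
    (void) r;  // error handling elided in this sketch
}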
...
@@ -924,6 +924,10 @@ static int UU() scan_op_no_check(DB_TXN *txn, ARG arg, void* operation_extra, vo
     return 0;
 }
+static int dbt_do_nothing (DBT const *UU(key), DBT const *UU(row), void *UU(context)) {
+    return 0;
+}
 static int UU() ptquery_and_maybe_check_op(DB* db, DB_TXN *txn, ARG arg, bool check) {
     int r;
     int rand_key = myrandom_r(arg->random_data);
@@ -933,7 +937,14 @@ static int UU() ptquery_and_maybe_check_op(DB* db, DB_TXN *txn, ARG arg, bool ch
     DBT key, val;
     dbt_init(&key, &rand_key, sizeof rand_key);
     dbt_init(&val, NULL, 0);
-    r = db->get(db, txn, &key, &val, 0);
+    r = db->getf_set(
+        db,
+        txn,
+        0,
+        &key,
+        dbt_do_nothing,
+        NULL
+        );
     if (check) assert(r != DB_NOTFOUND);
     r = 0;
     return r;
...
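// Why the switch: db->get must materialize the row into the val DBT, while
// db->getf_set invokes a callback on the found key/value pair, so passing the
// no-op callback above turns the point query into a copy-free existence probe.
// A hedged sketch of a callback that does extract something (the context type
// is illustrative, not part of this patch):
static int example_capture_row_size(DBT const *UU(key), DBT const *row, void *context) {
    *(uint32_t *) context = row->size;  // record only the value's length
    return 0;
}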
...
@@ -126,10 +126,14 @@ size_t toku_memory_footprint(void * p, size_t touched);
 # define HELGRIND_ANNOTATE_NEW_MEMORY(p, size) ANNOTATE_NEW_MEMORY(p, size)
 # define HELGRIND_VALGRIND_HG_ENABLE_CHECKING(p, size) VALGRIND_HG_ENABLE_CHECKING(p, size)
 # define HELGRIND_VALGRIND_HG_DISABLE_CHECKING(p, size) VALGRIND_HG_DISABLE_CHECKING(p, size)
+# define TOKU_DRD_IGNORE_VAR(v) DRD_IGNORE_VAR(v)
+# define TOKU_DRD_STOP_IGNORING_VAR(v) DRD_STOP_IGNORING_VAR(v)
 #else
 # define HELGRIND_ANNOTATE_NEW_MEMORY(p, size) ((void) 0)
 # define HELGRIND_VALGRIND_HG_ENABLE_CHECKING(p, size) ((void) 0)
 # define HELGRIND_VALGRIND_HG_DISABLE_CHECKING(p, size) ((void) 0)
+# define TOKU_DRD_IGNORE_VAR(v)
+# define TOKU_DRD_STOP_IGNORING_VAR(v)
 #endif
...
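// A hedged usage sketch: mark a deliberately unsynchronized statistics counter
// so DRD stops reporting races on it; outside valgrind builds both macros
// expand to nothing (the counter is illustrative):
static uint64_t example_approx_count;
static inline void example_bump(void) {
    TOKU_DRD_IGNORE_VAR(example_approx_count);       // races here are accepted
    example_approx_count++;                          // unsynchronized on purpose
    TOKU_DRD_STOP_IGNORING_VAR(example_approx_count);
}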
...
@@ -54,6 +54,7 @@ extern void (*do_assert_hook)(void); // Set this to a function you want called a
 #else
 #define assert(expr) ((expr) ? (void)0 : toku_do_assert_fail(#expr, __FUNCTION__, __FILE__, __LINE__, get_maybe_error_errno()))
 #define assert_zero(expr) ((expr) == 0 ? (void)0 : toku_do_assert_zero_fail((uintptr_t)(expr), #expr, __FUNCTION__, __FILE__, __LINE__, get_maybe_error_errno()))
+#define assert_null(expr) ((expr) == nullptr ? (void)0 : toku_do_assert_zero_fail((uintptr_t)(expr), #expr, __FUNCTION__, __FILE__, __LINE__, get_maybe_error_errno()))
 #endif
 #ifdef GCOV
@@ -67,7 +68,7 @@ extern void (*do_assert_hook)(void); // Set this to a function you want called a
 #define lazy_assert(a) assert(a) // indicates code is incomplete
 #define lazy_assert_zero(a) assert_zero(a) // indicates code is incomplete
 #define invariant(a) assert(a) // indicates a code invariant that must be true
-#define invariant_null(a) assert_zero(a) // indicates a code invariant that must be true
+#define invariant_null(a) assert_null(a) // indicates a code invariant that must be true
 #define invariant_notnull(a) assert(a) // indicates a code invariant that must be true
 #define invariant_zero(a) assert_zero(a) // indicates a code invariant that must be true
 #define resource_assert(a) assert(a) // indicates resource must be available, otherwise unrecoverable
...
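// A minimal usage sketch: invariant_null now compares the pointer against
// nullptr directly instead of coercing it through assert_zero's integer
// comparison (the function below is illustrative):
static inline void example_claim_slot(void **slot) {
    invariant_notnull(slot);   // the slot pointer itself must exist
    invariant_null(*slot);     // and must not already hold an object
    // ... store the freshly created object into *slot here ...
}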