Commit 90d72eba authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul Committed by Yoni Fogel

Merge partitioned counters onto main. Fixes #5267.

git-svn-id: file:///svn/toku/tokudb@46044 c7de825b-a66e-492c-adef-691d508d4ae1
parent daa30d26
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "log-internal.h" #include "log-internal.h"
#include "kibbutz.h" #include "kibbutz.h"
#include "background_job_manager.h" #include "background_job_manager.h"
#include "partitioned_counter.h"
/////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////
// Engine status // Engine status
...@@ -30,12 +31,14 @@ ...@@ -30,12 +31,14 @@
// These should be in the cachetable object, but we make them file-wide so that gdb can get them easily. // These should be in the cachetable object, but we make them file-wide so that gdb can get them easily.
// They were left here after engine status cleanup (#2949, rather than moved into the status struct) // They were left here after engine status cleanup (#2949, rather than moved into the status struct)
// so they are still easily available to the debugger and to save lots of typing. // so they are still easily available to the debugger and to save lots of typing.
static u_int64_t cachetable_miss;
static u_int64_t cachetable_misstime; // time spent waiting for disk read // if we had constructors and destructors, this would be cleaner. For now, we initialize with setup_cachetable_statistics().
static u_int64_t cachetable_puts; // how many times has a newly created node been put into the cachetable? static PARTITIONED_COUNTER cachetable_miss;
static u_int64_t cachetable_prefetches; // how many times has a block been prefetched into the cachetable? static PARTITIONED_COUNTER cachetable_misstime; // time spent waiting for disk read
static u_int64_t cachetable_evictions; static PARTITIONED_COUNTER cachetable_puts; // how many times has a newly created node been put into the cachetable?
static u_int64_t cleaner_executions; // number of times the cleaner thread's loop has executed static PARTITIONED_COUNTER cachetable_prefetches; // how many times has a block been prefetched into the cachetable?
static PARTITIONED_COUNTER cachetable_evictions;
static PARTITIONED_COUNTER cleaner_executions; // number of times the cleaner thread's loop has executed
static CACHETABLE_STATUS_S ct_status; static CACHETABLE_STATUS_S ct_status;
...@@ -183,10 +186,10 @@ toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS statp) { ...@@ -183,10 +186,10 @@ toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS statp) {
if (!ct_status.initialized) { if (!ct_status.initialized) {
status_init(); status_init();
} }
STATUS_VALUE(CT_MISS) = cachetable_miss; STATUS_VALUE(CT_MISS) = cachetable_miss.read();
STATUS_VALUE(CT_MISSTIME) = cachetable_misstime; STATUS_VALUE(CT_MISSTIME) = cachetable_misstime.read();
STATUS_VALUE(CT_PUTS) = cachetable_puts; STATUS_VALUE(CT_PUTS) = cachetable_puts.read();
STATUS_VALUE(CT_PREFETCHES) = cachetable_prefetches; STATUS_VALUE(CT_PREFETCHES) = cachetable_prefetches.read();
STATUS_VALUE(CT_SIZE_CURRENT) = ct->size_current; STATUS_VALUE(CT_SIZE_CURRENT) = ct->size_current;
STATUS_VALUE(CT_SIZE_LIMIT) = ct->size_limit; STATUS_VALUE(CT_SIZE_LIMIT) = ct->size_limit;
STATUS_VALUE(CT_SIZE_WRITING) = ct->size_evicting; STATUS_VALUE(CT_SIZE_WRITING) = ct->size_evicting;
...@@ -194,8 +197,8 @@ toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS statp) { ...@@ -194,8 +197,8 @@ toku_cachetable_get_status(CACHETABLE ct, CACHETABLE_STATUS statp) {
STATUS_VALUE(CT_SIZE_LEAF) = ct->size_leaf; STATUS_VALUE(CT_SIZE_LEAF) = ct->size_leaf;
STATUS_VALUE(CT_SIZE_ROLLBACK) = ct->size_rollback; STATUS_VALUE(CT_SIZE_ROLLBACK) = ct->size_rollback;
STATUS_VALUE(CT_SIZE_CACHEPRESSURE) = ct->size_cachepressure; STATUS_VALUE(CT_SIZE_CACHEPRESSURE) = ct->size_cachepressure;
STATUS_VALUE(CT_EVICTIONS) = cachetable_evictions; STATUS_VALUE(CT_EVICTIONS) = cachetable_evictions.read();
STATUS_VALUE(CT_CLEANER_EXECUTIONS) = cleaner_executions; STATUS_VALUE(CT_CLEANER_EXECUTIONS) = cleaner_executions.read();
STATUS_VALUE(CT_CLEANER_PERIOD) = toku_get_cleaner_period_unlocked(ct); STATUS_VALUE(CT_CLEANER_PERIOD) = toku_get_cleaner_period_unlocked(ct);
STATUS_VALUE(CT_CLEANER_ITERATIONS) = toku_get_cleaner_iterations_unlocked(ct); STATUS_VALUE(CT_CLEANER_ITERATIONS) = toku_get_cleaner_iterations_unlocked(ct);
*statp = ct_status; *statp = ct_status;
...@@ -857,7 +860,7 @@ static void cachetable_free_pair(CACHETABLE ct, PAIR p) { ...@@ -857,7 +860,7 @@ static void cachetable_free_pair(CACHETABLE ct, PAIR p) {
void *write_extraargs = p->write_extraargs; void *write_extraargs = p->write_extraargs;
PAIR_ATTR old_attr = p->attr; PAIR_ATTR old_attr = p->attr;
cachetable_evictions++; cachetable_evictions.increment(1);
cachetable_unlock(ct); cachetable_unlock(ct);
PAIR_ATTR new_attr = p->attr; PAIR_ATTR new_attr = p->attr;
// Note that flush_callback is called with write_me FALSE, so the only purpose of this // Note that flush_callback is called with write_me FALSE, so the only purpose of this
...@@ -1279,7 +1282,7 @@ static int cachetable_put_internal( ...@@ -1279,7 +1282,7 @@ static int cachetable_put_internal(
} }
} }
// flushing could change the table size, but wont' change the fullhash // flushing could change the table size, but wont' change the fullhash
cachetable_puts++; cachetable_puts.increment(1);
PAIR p = cachetable_insert_at( PAIR p = cachetable_insert_at(
ct, ct,
cachefile, cachefile,
...@@ -1916,8 +1919,8 @@ int toku_cachetable_get_and_pin_with_dep_pairs ( ...@@ -1916,8 +1919,8 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
// The pair being fetched will be marked as pending if a checkpoint happens during the // The pair being fetched will be marked as pending if a checkpoint happens during the
// fetch because begin_checkpoint will mark as pending any pair that is locked even if it is clean. // fetch because begin_checkpoint will mark as pending any pair that is locked even if it is clean.
cachetable_fetch_pair(ct, cachefile, p, fetch_callback, read_extraargs, TRUE); cachetable_fetch_pair(ct, cachefile, p, fetch_callback, read_extraargs, TRUE);
cachetable_miss++; cachetable_miss.increment(1);
cachetable_misstime += get_tnow() - t0; cachetable_misstime.increment(get_tnow() - t0);
goto got_value; goto got_value;
} }
got_value: got_value:
...@@ -2142,8 +2145,8 @@ int toku_cachetable_get_and_pin_nonblocking ( ...@@ -2142,8 +2145,8 @@ int toku_cachetable_get_and_pin_nonblocking (
run_unlockers(unlockers); // we hold the ct mutex. run_unlockers(unlockers); // we hold the ct mutex.
u_int64_t t0 = get_tnow(); u_int64_t t0 = get_tnow();
cachetable_fetch_pair(ct, cf, p, fetch_callback, read_extraargs, FALSE); cachetable_fetch_pair(ct, cf, p, fetch_callback, read_extraargs, FALSE);
cachetable_miss++; cachetable_miss.increment(1);
cachetable_misstime += get_tnow() - t0; cachetable_misstime.increment(get_tnow() - t0);
cachetable_unlock(ct); cachetable_unlock(ct);
return TOKUDB_TRY_AGAIN; return TOKUDB_TRY_AGAIN;
} }
...@@ -2216,7 +2219,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, ...@@ -2216,7 +2219,7 @@ int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
// if not found then create a pair in the READING state and fetch it // if not found then create a pair in the READING state and fetch it
if (p == 0) { if (p == 0) {
cachetable_prefetches++; cachetable_prefetches.increment(1);
r = bjm_add_background_job(cf->bjm); r = bjm_add_background_job(cf->bjm);
assert_zero(r); assert_zero(r);
p = cachetable_insert_at( p = cachetable_insert_at(
...@@ -3207,7 +3210,7 @@ toku_cleaner_thread (void *cachetable_v) ...@@ -3207,7 +3210,7 @@ toku_cleaner_thread (void *cachetable_v)
assert(ct); assert(ct);
u_int32_t num_iterations = toku_get_cleaner_iterations(ct); u_int32_t num_iterations = toku_get_cleaner_iterations(ct);
for (u_int32_t i = 0; i < num_iterations; ++i) { for (u_int32_t i = 0; i < num_iterations; ++i) {
cleaner_executions++; cleaner_executions.increment(1);
cachetable_lock(ct); cachetable_lock(ct);
PAIR best_pair = NULL; PAIR best_pair = NULL;
int n_seen = 0; int n_seen = 0;
...@@ -3310,10 +3313,6 @@ toku_cleaner_thread (void *cachetable_v) ...@@ -3310,10 +3313,6 @@ toku_cleaner_thread (void *cachetable_v)
void __attribute__((__constructor__)) toku_cachetable_helgrind_ignore(void); void __attribute__((__constructor__)) toku_cachetable_helgrind_ignore(void);
void void
toku_cachetable_helgrind_ignore(void) { toku_cachetable_helgrind_ignore(void) {
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_miss, sizeof cachetable_miss);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_misstime, sizeof cachetable_misstime);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_puts, sizeof cachetable_puts);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_prefetches, sizeof cachetable_prefetches);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_evictions, sizeof cachetable_evictions); HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cachetable_evictions, sizeof cachetable_evictions);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cleaner_executions, sizeof cleaner_executions); HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&cleaner_executions, sizeof cleaner_executions);
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&ct_status, sizeof ct_status); HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&ct_status, sizeof ct_status);
......
...@@ -3,134 +3,276 @@ ...@@ -3,134 +3,276 @@
#ident "$Id$" #ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved." #ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it." #ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <valgrind/helgrind.h>
#include "partitioned_counter.h" #include "partitioned_counter.h"
#include "memory.h" #include "memory.h"
#include <pthread.h>
#include <valgrind/helgrind.h>
struct local_counter { #include <sys/types.h>
unsigned long sum;
struct local_counter *prev, *next;
PARTITIONED_COUNTER owner;
};
struct partitioned_counter { //******************************************************************************
unsigned long sum_of_dead; // Representation: The representation of a partitioned counter
pthread_key_t key; // comprises a sum, called _sum_of_dead, a pthread_key called _key,
struct local_counter *first, *last; // and a linked list of thread-local parts.
}; // There is also a linked list, for each thread that has a
// thread-local part of any counter, of all the thread-local parts of
// all the counters.
// Abstraction function: The sum is represented by the sum of _sum and
// the sum's of the thread-local parts of the counter.
// Representation invariant: Every thread-local part is in the linked
// list of the thread-local parts of its counter, as well as in the
// linked list of the counters of a the thread.
//******************************************************************************
//******************************************************************************
// The mutex for the PARTITIONED_COUNTER
// We have a single mutex for all the counters because // We have a single mutex for all the counters because
// (a) the mutex is obtained infrequently, and // (a) the mutex is obtained infrequently, and
// (b) it helps us avoid race conditions when destroying the counters. // (b) it helps us avoid race conditions when destroying the counters.
static pthread_mutex_t partitioned_counter_mutex = PTHREAD_MUTEX_INITIALIZER; // The alternative that I couldn't make work is to have a mutex per counter.
// But the problem is that the counter can be destroyed before threads
// terminate, or maybe a thread terminates before the counter is destroyed.
// If the counter is destroyed first, then the mutex is no longer available.
//******************************************************************************
static pthread_mutex_t partitioned_counter_mutex = PTHREAD_MUTEX_INITIALIZER;
static void pc_lock (void) { static void pc_lock (void)
// Effect: Lock the mutex.
{
int r = pthread_mutex_lock(&partitioned_counter_mutex); int r = pthread_mutex_lock(&partitioned_counter_mutex);
assert(r==0); assert(r==0);
} }
static void pc_unlock (void) { static void pc_unlock (void)
// Effect: Unlock the mutex.
{
int r = pthread_mutex_unlock(&partitioned_counter_mutex); int r = pthread_mutex_unlock(&partitioned_counter_mutex);
assert(r==0); assert(r==0);
} }
static void local_destroy_counter (void *counterp) { //******************************************************************************
pc_lock(); // Key creation primivites.
struct local_counter *CAST_FROM_VOIDP(lc, counterp); //******************************************************************************
PARTITIONED_COUNTER owner = lc->owner; static void pk_create (pthread_key_t *key, void (*destructor)(void*)) {
// Save the sum int r = pthread_key_create(key, destructor);
owner->sum_of_dead += lc->sum; assert(r==0);
// Remove from linked list. }
if (lc->prev) {
lc->prev->next = lc->next; static void pk_delete (pthread_key_t key) {
int r = pthread_key_delete(key);
assert(r==0);
}
static void pk_setspecific (pthread_key_t key, const void *value) {
int r = pthread_setspecific(key, value);
assert(r==0);
}
//******************************************************************************
// Doubly-linked list primitives. In an ideal world these would be
// parameterized and put into a separate library. Perhaps STL even has them,
// but I (bradley) don't see how to make the STL linked lists avoid an extra
// malloc for the list cell. This linked list allows you to embed several
// linked lists into a single object.
//******************************************************************************
struct linked_list_element {
struct local_counter *container; // The container is to avoid the offset nonsense in the toku linked list.
// // The container points to the object in which we are embedded.
struct linked_list_element *next, *prev; // prev==NULL for the first element of the list. next==NULL for the last element.
};
static void ll_insert (struct linked_list_head *head, struct linked_list_element *item, struct local_counter *container)
// Effect: Add an item to a linked list.
// Implementation note: Push the item to the head of the list.
{
struct linked_list_element *old_first = head->first;
item->container = container;
item->next = old_first;
item->prev = NULL;
if (old_first) {
old_first->prev = item;
}
head->first = item;
}
static void ll_remove (struct linked_list_head *head, struct linked_list_element *item)
// Effect: Remove an item from a linked list.
// Requires: The item is in the list identified by head.
{
struct linked_list_element *old_prev = item->prev;
struct linked_list_element *old_next = item->next;
if (old_prev==NULL) {
head->first = old_next;
} else { } else {
owner->first = lc->next; old_prev->next = old_next;
} }
if (lc->next) { if (item->next==NULL) {
lc->next->prev = lc->prev; /* nothing */
} else { } else {
owner->last = lc->prev; old_next->prev = old_prev;
} }
}
// Free the local part of the counter and return.
toku_free(lc); static bool ll_pop (struct linked_list_head *head, struct linked_list_element **item)
{ // Effect: if head is an empty list return false.
int r = pthread_setspecific(owner->key, NULL); // Otherwise return true and set *item to the first item, and remove that item from the list.
assert(r==0); {
struct linked_list_element *first = head->first;
if (first) {
assert(first->prev==NULL);
head->first = first->next;
if (first->next) {
first->next->prev=NULL;
}
first->next=NULL;
*item = first;
return true;
} else {
return false;
}
}
//******************************************************************************
// The thread local part of a counter, comprising the thread-local sum a pointer
// to the partitioned_counter, a pointer to the thread_local list head, and two
// linked lists. One of the lists is all the thread-local parts that belong to
// the same counter, and the other is all the thread-local parts that belogn to
// the same thread.
//******************************************************************************
struct local_counter {
u_int64_t sum; // The thread-local sum.
PARTITIONED_COUNTER *owner_pc; // The partitioned counter that this is part of.
struct linked_list_head *thread_head; // The head of the list of all the counters for a thread.
struct linked_list_element ll_in_counter; // Linked list elements for the doubly-linked list of thread-local information for the same PARTITIONED_COUNTER.
struct linked_list_element ll_in_thread; // Linked list elements for the doubly-linked list of all the local parts of counters for the same thread.
};
// The head of the ll_in_thread linked list for each thread is stored in this thread-local variable.
__thread struct linked_list_head ll_thread_head = {NULL};
// I want this to be static, but I have to use hidden visibility instead because it's a friend function.
void destroy_thread_local_part_of_partitioned_counters (void *ignore_me) __attribute__((__visibility__("hidden")));
void destroy_thread_local_part_of_partitioned_counters (void *ignore_me __attribute__((__unused__)))
// Effect: This function is called whenever a thread terminates using the
// destructor of the thread_destructor_key (defined below). First grab the
// lock, then go through all the partitioned counters and removes the part that
// is local to this thread. We don't actually need the contents of the
// thread_destructor_key except to cause this function to run. The content of
// the key is a static string, so don't try to free it.
{
pc_lock();
struct linked_list_element *le;
// The ll_thread_head variable is still present even while running
// pthread_key destructors. I tried to use a different pthread_key, but it
// turned out that the other one was being destroyed before this one gets to
// run. So I'm using a __thread variable to get at the head of the linked
// list.
while (ll_pop(&ll_thread_head, &le)) {
struct local_counter *lc = le->container;
// We just removed lc from the list from the thread.
// Remove lc from the partitioned counter in which it resides.
PARTITIONED_COUNTER *owner = lc->owner_pc;
owner->_sum_of_dead += lc->sum;
ll_remove(&owner->_ll_counter_head, &lc->ll_in_counter);
toku_free(lc);
} }
pc_unlock(); pc_unlock();
} }
//******************************************************************************
// We employ a system-wide pthread_key simply to get a notification when a
// thread terminates. The key will simply contain a constant string (it's "dont
// care", but it doesn't matter what it is, as long as it's not NULL. We need
// a constructor function to set up the pthread_key. We used a constructor
// function intead of a C++ constructor because that's what we are used to,
// rather than because it's necessarily better. Whenever a thread tries to
// increment a partitioned_counter for the first time, it sets the
// pthread_setspecific for the thread_destructor_key. It's OK if the key gets
// setspecific multiple times, it's always the same value. When a thread (that
// has created a thread-local part of any partitioned counter) terminates, the
// destroy_thread_local_part_of_partitioned_counters will run. It may run
// before or after other pthread_key destructors, but the thread-local
// ll_thread_head variable is still present until the thread is completely done
// running.
//******************************************************************************
static pthread_key_t thread_destructor_key;
static void initiate_partitioned_counters (void) __attribute__((__constructor__));
static void initiate_partitioned_counters (void)
// Effect: This constructor function runs before any of the other code here, and
// sets up a pthread_key with a destructor.
{
pk_create(&thread_destructor_key, destroy_thread_local_part_of_partitioned_counters);
}
PARTITIONED_COUNTER create_partitioned_counter(void) PARTITIONED_COUNTER::PARTITIONED_COUNTER(void)
// Effect: Create a counter, initialized to zero. // Effect: Create a counter, initialized to zero.
{ {
PARTITIONED_COUNTER MALLOC(result); _sum_of_dead = 0;
result->sum_of_dead = 0; pk_create(&_key, NULL);
{ _ll_counter_head.first = NULL;
int r = pthread_key_create(&result->key, local_destroy_counter);
assert(r==0);
}
result->first = NULL;
result->last = NULL;
return result;
} }
void destroy_partitioned_counter (PARTITIONED_COUNTER pc) PARTITIONED_COUNTER::~PARTITIONED_COUNTER(void)
// Effect: Destroy the counter. No operations on that counter are permitted after this. // Effect: Destroy the counter. No operations on this counter are permitted after.
// Implementation note: Since we have a global lock, we can destroy all the key-specific versions as well. // Implementation note: Since we have a global lock, we can destroy all the key-specific versions as well.
{ {
pk_delete(_key);
pc_lock(); pc_lock();
while (pc->first) { struct linked_list_element *first;
struct local_counter *next = pc->first->next; while (ll_pop(&_ll_counter_head, &first)) {
assert(pc->first->owner==pc); // We just removed first from the counter list, now we must remove it from the thread head
toku_free(pc->first); struct local_counter *lc = first->container;
pc->first = next; ll_remove(lc->thread_head, &lc->ll_in_thread);
} toku_free(first->container);
{
int r = pthread_key_delete(pc->key);
assert(r==0);
} }
toku_free(pc);
pc_unlock(); pc_unlock();
} }
void increment_partitioned_counter (PARTITIONED_COUNTER pc, unsigned long amount) void PARTITIONED_COUNTER::increment(u_int64_t amount)
// Effect: Increment the counter by amount. // Effect: Increment the counter by amount.
// Requires: No overflows. This is a 64-bit unsigned counter. // Requires: No overflows. This is a 64-bit unsigned counter.
// Requires: You may not increment this after a destroy has occured.
{ {
struct local_counter *CAST_FROM_VOIDP(lc, pthread_getspecific(pc->key)); struct local_counter *CAST_FROM_VOIDP(lc, pthread_getspecific(_key));
if (lc==NULL) { if (lc==NULL) {
pc_lock(); XMALLOC(lc);
MALLOC(lc); lc->sum = 0;
lc->sum = 0;
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&lc->sum, sizeof(lc->sum)); // the counter increment is kind of racy. HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&lc->sum, sizeof(lc->sum)); // the counter increment is kind of racy.
lc->prev = pc->last; lc->owner_pc = this;
lc->next = NULL; lc->thread_head = &ll_thread_head;
lc->owner = pc; pk_setspecific(_key, lc);
if (pc->first==NULL) {
pc->first = lc; // Set things up so that this thread terminates, the thread-local parts of the counter will be destroyed and merged into their respective counters.
} else { pk_setspecific(thread_destructor_key, "dont care");
pc->last->next = lc;
} // Of all the code in this method, only the following two
pc->last = lc; // operations must be done with the lock held. The
if (pc->first==NULL) pc->first=lc; // setspecific operations don't matter because the pthread_key
int r = pthread_setspecific(pc->key, lc); // accessors and destructors are run by the same thread. The
assert(r==0); // other contents of *lc are similarly thread-local.
// It probably doesn't really matter since this increment is relatively infrequent.
pc_lock();
ll_insert(&_ll_counter_head, &lc->ll_in_counter, lc);
ll_insert(&ll_thread_head, &lc->ll_in_thread, lc); // We do have to hold the lock for this insert because the thread destructor may access
// // ll_thread_head through some ll_remove(lc->thread_head,...) operation in the
// // partitioned_counter destructor.
pc_unlock(); pc_unlock();
} }
lc->sum += amount; lc->sum += amount;
} }
unsigned long read_partitioned_counter (PARTITIONED_COUNTER pc) u_int64_t PARTITIONED_COUNTER::read(void)
// Effect: Return the current value of the counter. // Effect: Return the current value of the counter.
// Implementation note: Sum all the thread-local counts along with the sum_of_the_dead.
{ {
pc_lock(); pc_lock();
unsigned long sum = pc->sum_of_dead; u_int64_t sum = _sum_of_dead;
for (struct local_counter *lc = pc->first; lc; lc=lc->next) { for (struct linked_list_element *le = _ll_counter_head.first; le; le=le->next) {
sum += lc->sum; sum += le->container->sum;
} }
pc_unlock(); pc_unlock();
return sum; return sum;
......
...@@ -27,7 +27,15 @@ ...@@ -27,7 +27,15 @@
// destroy_partitioned_counter Destroy it. // destroy_partitioned_counter Destroy it.
// increment_partitioned_counter Increment it. This is the frequent operation. // increment_partitioned_counter Increment it. This is the frequent operation.
// read_partitioned_counter Get the current value. This is infrequent. // read_partitioned_counter Get the current value. This is infrequent.
// See partitioned_counter.cc for the abstraction function and representation invariant.
//
// Restrictions: You may not access a partitioned_counter during
// destructor operation. So don't put engine-status in a destructor
// or a destructor function.
//
#if 0
// The old C interface. This required a bunch of explicit ___attribute__((__destructor__)) functions to remember to destroy counters at the end.
typedef struct partitioned_counter *PARTITIONED_COUNTER; typedef struct partitioned_counter *PARTITIONED_COUNTER;
PARTITIONED_COUNTER create_partitioned_counter(void); PARTITIONED_COUNTER create_partitioned_counter(void);
// Effect: Create a counter, initialized to zero. // Effect: Create a counter, initialized to zero.
...@@ -35,11 +43,47 @@ PARTITIONED_COUNTER create_partitioned_counter(void); ...@@ -35,11 +43,47 @@ PARTITIONED_COUNTER create_partitioned_counter(void);
void destroy_partitioned_counter (PARTITIONED_COUNTER); void destroy_partitioned_counter (PARTITIONED_COUNTER);
// Effect: Destroy the counter. No operations on that counter are permitted after this. // Effect: Destroy the counter. No operations on that counter are permitted after this.
void increment_partitioned_counter (PARTITIONED_COUNTER, unsigned long amount); void increment_partitioned_counter (PARTITIONED_COUNTER, u_int64_t amount);
// Effect: Increment the counter by amount. // Effect: Increment the counter by amount.
// Requires: No overflows. This is a 64-bit unsigned counter. // Requires: No overflows. This is a 64-bit unsigned counter.
unsigned long read_partitioned_counter (PARTITIONED_COUNTER); u_int64_t read_partitioned_counter (PARTITIONED_COUNTER);
// Effect: Return the current value of the counter. // Effect: Return the current value of the counter.
#endif
#include <pthread.h>
#include "fttypes.h"
// Used inside the PARTITIONED_COUNTER.
struct linked_list_head {
struct linked_list_element *first;
};
class PARTITIONED_COUNTER {
public:
PARTITIONED_COUNTER(void);
// Effect: Construct a counter, initialized to zero.
~PARTITIONED_COUNTER(void);
// Effect: Destruct the counter.
void increment(u_int64_t amount);
// Effect: Increment the counter by amount. This is a 64-bit unsigned counter, and if you overflow it, you will get overflowed results (that is mod 2^64).
// Requires: Don't use this from a static constructor or destructor.
u_int64_t read(void);
// Effect: Read the sum.
// Requires: Don't use this from a static constructor or destructor.
private:
u_int64_t _sum_of_dead; // The sum of all thread-local counts from threads that have terminated.
pthread_key_t _key; // The pthread_key which gives us the hook to construct and destruct thread-local storage.
struct linked_list_head _ll_counter_head; // A linked list of all the thread-local information for this counter.
// This function is used to destroy the thread-local part of the state when a thread terminates.
// But it's not the destructor for the local part of the counter, it's a destructor on a "dummy" key just so that we get a notification when a thread ends.
friend void destroy_thread_local_part_of_partitioned_counters (void *);
};
#endif #endif
...@@ -16,11 +16,11 @@ ...@@ -16,11 +16,11 @@
* alf 16-core server (xeon E5-2665 2.4GHz) sandybridge * alf 16-core server (xeon E5-2665 2.4GHz) sandybridge
* *
* mork mindy bradley alf * mork mindy bradley alf
* 0.3ns 1.07ns 1.27ns 0.58ns to do a ++, but it's got a race in it. * 0.3ns 1.07ns 1.27ns 0.61ns to do a ++, but it's got a race in it.
* 28.0ns 20.47ns 18.75ns 39.38ns to do a sync_fetch_and_add(). * 28.0ns 20.47ns 18.75ns 34.15ns to do a sync_fetch_and_add().
* 0.4ns 0.29ns 0.71ns 0.19ns to do with a single version of a counter * 0.4ns 0.29ns 0.71ns 0.19ns to do with a single version of a counter
* 0.33ns 0.69ns 0.18ns pure thread-local variable (no way to add things up) * 0.33ns 0.69ns 0.18ns pure thread-local variable (no way to add things up)
* 0.76ns 1.50ns 0.35ns partitioned_counter.c (using link-time optimization, otherwise the function all overwhelms everything) * 0.76ns 2.40ns 0.54ns partitioned_counter.c (using gcc link-time optimization, otherwise the function call overwhelms everything)
* *
* *
* How it works. Each thread has a thread-local counter structure with an integer in it. To increment, we increment the thread-local structure. * How it works. Each thread has a thread-local counter structure with an integer in it. To increment, we increment the thread-local structure.
...@@ -43,75 +43,101 @@ ...@@ -43,75 +43,101 @@
#include "toku_assert.h" #include "toku_assert.h"
#include "partitioned_counter.h" #include "partitioned_counter.h"
#include "memory.h" #include "memory.h"
#include "test.h"
// The test code includes the fastest version I could figure out to make, implemented below.
struct counter_s { struct counter_s {
bool inited; bool inited;
int counter; volatile int counter;
struct counter_s *prev, *next; struct counter_s *prev, *next;
int myid; int myid;
}; };
static __thread struct counter_s counter = {false,0, NULL,NULL,0}; static __thread struct counter_s counter = {false,0, NULL,NULL,0};
static int finished_counter=0; // counter for all threads that are done. static int finished_counter=0; // counter for all threads that are done.
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static struct counter_s *head=NULL, *tail=NULL; // We use a single mutex for anything complex. We'd like to use a mutex per partitioned counter, but we must cope with the possibility of a race between
// a terminating pthread (which calls destroy_counter()), and a call to the counter destructor. So we use a global mutex.
static pthread_mutex_t pc_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct counter_s *head=NULL;
static pthread_key_t counter_key; static pthread_key_t counter_key;
static void destroy_counter (void *counterp) { static void pc_lock (void)
// Effect: Lock the pc mutex.
{
int r = pthread_mutex_lock(&pc_mutex);
assert(r==0);
}
static void pc_unlock (void)
// Effect: Unlock the pc mutex.
{
int r = pthread_mutex_unlock(&pc_mutex);
assert(r==0);
}
static void destroy_counter (void *counterp)
// Effect: This is the function passed to pthread_key_create that is to run whenever a thread terminates.
// The thread-local part of the counter must be copied into the shared state, and the thread-local part of the counter must be
// removed from the linked list of all thread-local parts.
{
assert((struct counter_s*)counterp==&counter); assert((struct counter_s*)counterp==&counter);
{ int r = pthread_mutex_lock(&mutex); assert(r==0); } pc_lock();
if (counter.prev==NULL) { if (counter.prev==NULL) {
assert(head==&counter); assert(head==&counter);
head = counter.next; head = counter.next;
} else { } else {
counter.prev->next = counter.next; counter.prev->next = counter.next;
} }
if (counter.next==NULL) { if (counter.next!=NULL) {
assert(tail==&counter);
tail = counter.prev;
} else {
counter.next->prev = counter.prev; counter.next->prev = counter.prev;
} }
finished_counter += counter.counter; finished_counter += counter.counter;
HELGRIND_VALGRIND_HG_ENABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // stop ignoring races HELGRIND_VALGRIND_HG_ENABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // stop ignoring races
//printf("finished counter now %d\n", finished_counter); //printf("finished counter now %d\n", finished_counter);
{ int r = pthread_mutex_unlock(&mutex); assert(r==0); } pc_unlock();
} }
static int idcounter=0; static int idcounter=0;
static inline void increment (void) { static inline void increment (void)
{
if (!counter.inited) { if (!counter.inited) {
{ int r = pthread_mutex_lock(&mutex); assert(r==0); } pc_lock();
{ int r = pthread_setspecific(counter_key, &counter); assert(r==0); } struct counter_s *cp = &counter;
counter.prev = tail; { int r = pthread_setspecific(counter_key, cp); assert(r==0); }
counter.next = NULL; cp->prev = NULL;
if (head==NULL) { cp->next = head;
head = &counter; if (head!=NULL) {
tail = &counter; head->prev = cp;
} else {
tail->next = &counter;
tail = &counter;
} }
counter.counter = 0; head = cp;
counter.inited = true; #ifdef __INTEL_COMPILER
counter.myid = idcounter++; __memory_barrier(); // for some reason I don't understand, ICC needs a memory barrier here. -Bradley
#endif
cp->counter = 0;
cp->inited = true;
cp->myid = idcounter++;
HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // the counter increment is kind of racy. HELGRIND_VALGRIND_HG_DISABLE_CHECKING(&counter.counter, sizeof(counter.counter)); // the counter increment is kind of racy.
{ int r = pthread_mutex_unlock(&mutex); assert(r==0); } pc_unlock();
} }
counter.counter++; counter.counter++;
} }
static int getvals (void) { static int getvals (void) {
{ int r = pthread_mutex_lock(&mutex); assert(r==0); } pc_lock();
int sum=finished_counter; int sum=finished_counter;
for (struct counter_s *p=head; p; p=p->next) { for (struct counter_s *p=head; p; p=p->next) {
sum+=p->counter; sum+=p->counter;
} }
{ int r = pthread_mutex_unlock(&mutex); assert(r==0); } pc_unlock();
return sum; return sum;
} }
/**********************************************************************************/
/* And now for some actual test code. */
/**********************************************************************************/
static const int N=10000000; static const int N=10000000;
static const int T=20; static const int T=20;
...@@ -120,7 +146,7 @@ static const int T=20; ...@@ -120,7 +146,7 @@ static const int T=20;
PARTITIONED_COUNTER pc; PARTITIONED_COUNTER pc;
static void *pc_doit (void *v) { static void *pc_doit (void *v) {
for (int i=0; i<N; i++) { for (int i=0; i<N; i++) {
increment_partitioned_counter(pc, 1); pc.increment(1);
} }
//printf("val=%ld\n", read_partitioned_counter(pc)); //printf("val=%ld\n", read_partitioned_counter(pc));
return v; return v;
...@@ -212,47 +238,42 @@ static void parse_args (int argc, const char *argv[]) { ...@@ -212,47 +238,42 @@ static void parse_args (int argc, const char *argv[]) {
static void do_timeit (void) { static void do_timeit (void) {
{ int r = pthread_key_create(&counter_key, destroy_counter); assert(r==0); } { int r = pthread_key_create(&counter_key, destroy_counter); assert(r==0); }
pc = create_partitioned_counter();
printf("%d threads\n%d increments per thread\n", T, N); printf("%d threads\n%d increments per thread\n", T, N);
timeit("++", old_doit_nonatomic); timeit("++", old_doit_nonatomic);
timeit("atomic++", old_doit); timeit("atomic++", old_doit);
timeit("fast", new_doit); timeit("fast", new_doit);
timeit("puretl", tl_doit); timeit("puretl", tl_doit);
timeit("pc", pc_doit); timeit("pc", pc_doit);
destroy_partitioned_counter(pc);
} }
struct test_arguments { struct test_arguments {
PARTITIONED_COUNTER pc; PARTITIONED_COUNTER pc;
unsigned long limit; u_int64_t limit;
unsigned long total_increment_per_writer; u_int64_t total_increment_per_writer;
volatile unsigned long unfinished_count; volatile u_int64_t unfinished_count;
}; };
static void *reader_test_fun (void *ta_v) { static void *reader_test_fun (void *ta_v) {
struct test_arguments *ta = (struct test_arguments *)ta_v; struct test_arguments *ta = (struct test_arguments *)ta_v;
unsigned long lastval = 0; u_int64_t lastval = 0;
printf("reader starting\n");
while (ta->unfinished_count>0) { while (ta->unfinished_count>0) {
unsigned long thisval = read_partitioned_counter(ta->pc); u_int64_t thisval = ta->pc.read();
assert(lastval <= thisval); assert(lastval <= thisval);
assert(thisval <= ta->limit); assert(thisval <= ta->limit);
lastval = thisval; lastval = thisval;
if (verboseness_cmdarg && (0==(thisval & (thisval-1)))) printf("Thisval=%ld\n", thisval); if (verboseness_cmdarg && (0==(thisval & (thisval-1)))) printf("Thisval=%ld\n", thisval);
} }
unsigned long thisval = read_partitioned_counter(ta->pc); u_int64_t thisval = ta->pc.read();
assert(thisval==ta->limit); assert(thisval==ta->limit);
return ta_v; return ta_v;
} }
static void *writer_test_fun (void *ta_v) { static void *writer_test_fun (void *ta_v) {
struct test_arguments *ta = (struct test_arguments *)ta_v; struct test_arguments *ta = (struct test_arguments *)ta_v;
printf("writer starting\n"); for (u_int64_t i=0; i<ta->total_increment_per_writer; i++) {
for (unsigned long i=0; i<ta->total_increment_per_writer; i++) {
if (i%1000 == 0) sched_yield(); if (i%1000 == 0) sched_yield();
increment_partitioned_counter(ta->pc, 1); ta->pc.increment(1);
} }
printf("writer done\n");
__sync_fetch_and_sub(&ta->unfinished_count, 1); __sync_fetch_and_sub(&ta->unfinished_count, 1);
return ta_v; return ta_v;
} }
...@@ -260,45 +281,67 @@ static void *writer_test_fun (void *ta_v) { ...@@ -260,45 +281,67 @@ static void *writer_test_fun (void *ta_v) {
static void do_testit (void) { static void do_testit (void) {
const int NGROUPS = 2; const int NGROUPS = 2;
PARTITIONED_COUNTER pcs[NGROUPS]; u_int64_t limits[NGROUPS];
unsigned long limits[NGROUPS]; limits [0] = 200000;
limits [0] = 2000000; limits [1] = 100000;
limits [1] = 1000000; u_int64_t n_writers[NGROUPS];
unsigned long n_writers[NGROUPS]; n_writers[0] = 2;
n_writers[0] = 20; n_writers[1] = 4;
n_writers[1] = 40;
struct test_arguments tas[NGROUPS]; struct test_arguments tas[NGROUPS];
pthread_t reader_threads[NGROUPS]; pthread_t reader_threads[NGROUPS];
pthread_t *writer_threads[NGROUPS]; pthread_t *writer_threads[NGROUPS];
for (int i=0; i<NGROUPS; i++) { for (int i=0; i<NGROUPS; i++) {
pcs[i] = create_partitioned_counter();
tas[i].pc = pcs[i];
tas[i].limit = limits[i]; tas[i].limit = limits[i];
tas[i].unfinished_count = n_writers[i]; tas[i].unfinished_count = n_writers[i];
tas[i].total_increment_per_writer = limits[i]/n_writers[i]; tas[i].total_increment_per_writer = limits[i]/n_writers[i];
assert(tas[i].total_increment_per_writer * n_writers[i] == limits[i]); assert(tas[i].total_increment_per_writer * n_writers[i] == limits[i]);
pt_create(&reader_threads[i], reader_test_fun, &tas[i]); pt_create(&reader_threads[i], reader_test_fun, &tas[i]);
MALLOC_N(n_writers[i], writer_threads[i]); MALLOC_N(n_writers[i], writer_threads[i]);
for (unsigned long j=0; j<n_writers[i] ; j++) { for (u_int64_t j=0; j<n_writers[i] ; j++) {
pt_create(&writer_threads[i][j], writer_test_fun, &tas[i]); pt_create(&writer_threads[i][j], writer_test_fun, &tas[i]);
} }
} }
for (int i=0; i<NGROUPS; i++) { for (int i=0; i<NGROUPS; i++) {
pt_join(reader_threads[i], &tas[i]); pt_join(reader_threads[i], &tas[i]);
for (unsigned long j=0; j<n_writers[i] ; j++) { for (u_int64_t j=0; j<n_writers[i] ; j++) {
pt_join(writer_threads[i][j], &tas[i]); pt_join(writer_threads[i][j], &tas[i]);
} }
toku_free(writer_threads[i]); toku_free(writer_threads[i]);
destroy_partitioned_counter(pcs[i]);
} }
} }
int main (int argc, const char *argv[]) { volatile int spinwait=0;
static void* test2_fun (void* mypc_v) {
PARTITIONED_COUNTER *mypc = (PARTITIONED_COUNTER*)mypc_v;
mypc->increment(3);
spinwait=1;
while (spinwait==1);
// mypc no longer points at a valid data structure.
return NULL;
}
static void do_testit2 (void)
// This test checks to see what happens if a thread is still live when we destruct a counter.
// A thread increments the counter, then lets us know through a spin wait, then waits until we destroy the counter.
{
pthread_t t;
{
PARTITIONED_COUNTER mypc;
pt_create(&t, test2_fun, &mypc);
while(spinwait==0); // wait until he incremented the counter.
assert(mypc.read()==3);
} // leave scope, so the counter goes away.
spinwait=2; // tell the other guy to finish up.
pt_join(t, NULL);
}
int test_main (int argc, const char *argv[]) {
parse_args(argc, argv); parse_args(argc, argv);
if (time_cmdarg) { if (time_cmdarg) {
do_timeit(); do_timeit();
} else { } else {
do_testit(); do_testit();
do_testit2();
} }
return 0; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment