Commit 579083a8 authored by Leif Walsh's avatar Leif Walsh Committed by Yoni Fogel

refs #5671 implement join_timeout with a portable 'crash and dump core' function

git-svn-id: file:///svn/toku/tokudb@50308 c7de825b-a66e-492c-adef-691d508d4ae1
parent 2325fbc6
/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
#ident "$Id$"
#ident "Copyright (c) 2007-2012 Tokutek Inc. All rights reserved."
#ifndef PORTABILITY_TOKU_CRASH_H
#define PORTABILITY_TOKU_CRASH_H
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
//Simulate as hard a crash as possible.
//Choices:
// raise(SIGABRT)
// kill -SIGKILL $pid
// divide by 0
// null dereference
// abort()
// assert(false) (from <assert.h>)
// assert(false) (from <toku_assert.h>)
//
//Linux:
// abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable
//Windows:
// None of them cause file buffers to be flushed/written to disk, however
// abort(), assert(false) <assert.h>, null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable
//
//kill -SIGKILL $pid is annoying (and so far untested)
//
//raise(SIGABRT) has the downside that it could be caught by a signal handler.
//The code below therefore starts with raise(SIGKILL) (TerminateProcess on Windows), followed by divide by 0, followed by null dereference, followed by abort(), just in case an earlier step does not terminate the process.
// Terminate the process as abruptly as possible, without returning.
//
// The first attempt is TerminateProcess() on Windows and raise(SIGKILL)
// elsewhere.  If that somehow does not end the process, the function falls
// back to an integer divide by zero, then a null-pointer write, then abort().
//
// The operands of the deliberate faults are volatile: dividing by a known
// zero and dereferencing a known null pointer are undefined behavior, and
// without volatile the compiler is allowed to drop those statements instead
// of emitting the instructions that trap.
static void __attribute__((unused, noreturn))
toku_hard_crash_on_purpose(void) {
#if TOKU_WINDOWS
    TerminateProcess(GetCurrentProcess(), 137);
#else
    raise(SIGKILL); //Does not flush buffers on linux; cannot be caught.
#endif
    {
        // volatile forces a real load of the divisor and a real division.
        volatile int zero = 0;
        int infinity = 1/zero;
        fprintf(stderr, "Force use of %d\n", infinity);
        fflush(stderr); //Make certain the string is calculated.
    }
    {
        // volatile pointer to volatile int: neither the pointer value nor the
        // access through it may be assumed away by the optimizer.
        volatile int * volatile intothevoid = NULL;
        (*intothevoid)++;
        fprintf(stderr, "Force use of *(%p) = %d\n", (void *)intothevoid, *intothevoid);
        fflush(stderr);
    }
    abort();
    // abort() is specified not to return; these lines only document that fact.
    fprintf(stderr, "This line should never be printed\n");
    fflush(stderr);
}
// Similar to toku_hard_crash_on_purpose, but the goal isn't to crash hard, the primary goal is to get a corefile, the secondary goal is to terminate in any way possible.
// We don't really care if buffers get flushed etc, in fact they may as well flush since there may be useful output in stdout or stderr.
//
// By default, the following signals generate cores:
// Linux, from signal(7):
// SIGQUIT 3 Core
// SIGILL 4 Core
// SIGABRT 6 Core
// SIGFPE 8 Core
// SIGSEGV 11 Core
//
// Darwin and FreeBSD, from signal(3):
// 3 SIGQUIT create core image
// 4 SIGILL create core image
// 5 SIGTRAP create core image
// 6 SIGABRT create core image
// 7 SIGEMT create core image
// 8 SIGFPE create core image
// 10 SIGBUS create core image
// 11 SIGSEGV create core image
// 12 SIGSYS create core image
//
// We'll raise these in some sequence (common ones first), then call abort(), then try emulating the things that would cause these signals to be raised (divide by zero, null dereference), then raise(SIGKILL), and finally loop forever like abort does.
// Terminate the process, preferably leaving a core file, without returning.
//
// Sequence:
//   1. raise() each signal whose default action dumps core (the extra
//      BSD/Darwin ones only on those platforms);
//   2. abort();
//   3. deliberately divide by zero and write through a null pointer;
//   4. raise(SIGKILL);
//   5. spin forever so the function never returns.
//
// abort() is specified not to return, so steps 3-5 are a last resort that is
// only reached if abort() has been replaced by something that does return.
// The fault operands are volatile so the compiler cannot treat the undefined
// behavior as license to delete the faulting statements.
static void __attribute__((unused, noreturn))
toku_crash_and_dump_core_on_purpose(void) {
    raise(SIGQUIT);
    raise(SIGILL);
    raise(SIGABRT);
    raise(SIGFPE);
    raise(SIGSEGV);
#if defined(__FreeBSD__) || defined(__APPLE__)
    raise(SIGTRAP);
    raise(SIGEMT);
    raise(SIGBUS);
    raise(SIGSYS);
#endif
    abort();
    {
        // volatile forces a real load of the divisor and a real division.
        volatile int zero = 0;
        int infinity = 1/zero;
        fprintf(stderr, "Force use of %d\n", infinity);
        fflush(stderr); //Make certain the string is calculated.
    }
    {
        // volatile pointer to volatile int: the null write must be emitted.
        volatile int * volatile intothevoid = NULL;
        (*intothevoid)++;
        fprintf(stderr, "Force use of *(%p) = %d\n", (void *)intothevoid, *intothevoid);
        fflush(stderr);
    }
    raise(SIGKILL);
    {
        // don't return.  The volatile store gives the loop an observable side
        // effect; an empty infinite loop may legally be removed by a C++
        // compiler.
        volatile int spin = 0;
        for (;;) {
            spin = 1;
        }
    }
}
#endif // PORTABILITY_TOKU_CRASH_H
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <limits.h> #include <limits.h>
#include <errno.h> #include <errno.h>
#include <toku_htonl.h> #include <toku_htonl.h>
#include <portability/toku_crash.h>
#include "toku_assert.h" #include "toku_assert.h"
#include <signal.h> #include <signal.h>
#include <time.h> #include <time.h>
...@@ -272,50 +273,6 @@ void print_time_now(void) { ...@@ -272,50 +273,6 @@ void print_time_now(void) {
printf("%s", timestr); printf("%s", timestr);
} }
//Simulate as hard a crash as possible.
//Choices:
// raise(SIGABRT)
// kill -SIGKILL $pid
// divide by 0
// null dereference
// abort()
// assert(false) (from <assert.h>)
// assert(false) (from <toku_assert.h>)
//
//Linux:
// abort() and both assert(false) cause FILE buffers to be flushed and written to disk: Unacceptable
//Windows:
// None of them cause file buffers to be flushed/written to disk, however
// abort(), assert(false) <assert.h>, null dereference, and divide by 0 cause popups requiring user intervention during tests: Unacceptable
//
//kill -SIGKILL $pid is annoying (and so far untested)
//
//raise(SIGABRT) has the downside that perhaps it could be caught?
//I'm choosing raise(SIGABRT), followed by divide by 0, followed by null dereference, followed by all the others just in case one gets caught.
// Terminate the process as abruptly as possible.
//
// The first attempt is TerminateProcess() on Windows and raise(SIGKILL)
// elsewhere.  If that somehow does not end the process, the function falls
// back to an integer divide by zero, then a null-pointer write, then abort().
//
// The operands of the deliberate faults are volatile: dividing by a known
// zero and dereferencing a known null pointer are undefined behavior, and
// without volatile the compiler is allowed to drop those statements instead
// of emitting the instructions that trap.
static void UU()
toku_hard_crash_on_purpose(void) {
#if TOKU_WINDOWS
    TerminateProcess(GetCurrentProcess(), 137);
#else
    raise(SIGKILL); //Does not flush buffers on linux; cannot be caught.
#endif
    {
        // volatile forces a real load of the divisor and a real division.
        volatile int zero = 0;
        int infinity = 1/zero;
        fprintf(stderr, "Force use of %d\n", infinity);
        fflush(stderr); //Make certain the string is calculated.
    }
    {
        // volatile pointer to volatile int: neither the pointer value nor the
        // access through it may be assumed away by the optimizer.
        volatile int * volatile intothevoid = NULL;
        (*intothevoid)++;
        fprintf(stderr, "Force use of *(%p) = %d\n", (void *)intothevoid, *intothevoid);
        fflush(stderr);
    }
    abort();
    // abort() is specified not to return; these lines only document that fact.
    fprintf(stderr, "This line should never be printed\n");
    fflush(stderr);
}
static void UU() static void UU()
multiply_locks_for_n_dbs(DB_ENV *env, int num_dbs) { multiply_locks_for_n_dbs(DB_ENV *env, int num_dbs) {
#ifdef USE_TDB #ifdef USE_TDB
......
...@@ -13,7 +13,6 @@ ...@@ -13,7 +13,6 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <signal.h>
#include <locale.h> #include <locale.h>
#include <unistd.h> #include <unistd.h>
#include <sys/stat.h> #include <sys/stat.h>
...@@ -1432,9 +1431,44 @@ static void *test_time(void *arg) { ...@@ -1432,9 +1431,44 @@ static void *test_time(void *arg) {
return arg; return arg;
} }
static void crashing_alarm_handler(int sig) { struct sleep_and_crash_extra {
assert(sig == SIGALRM); toku_mutex_t mutex;
toku_hard_crash_on_purpose(); toku_cond_t cond;
int seconds;
bool is_setup;
bool threads_have_joined;
};
// Thread body for the join-timeout watchdog.
//
// `extra` points to a sleep_and_crash_extra.  The thread takes e->mutex,
// computes an absolute deadline e->seconds from now, sets e->is_setup, and
// then waits on e->cond until either e->threads_have_joined becomes true or
// the deadline passes.  If the deadline passes without the flag being set it
// calls toku_crash_and_dump_core_on_purpose(); otherwise it unlocks the mutex
// and returns nullptr.
static void *sleep_and_crash(void *extra) {
    sleep_and_crash_extra *e = static_cast<sleep_and_crash_extra *>(extra);
    toku_mutex_lock(&e->mutex);
    struct timeval tv;
    toku_timespec_t ts;
    gettimeofday(&tv, nullptr);
    ts.tv_sec = tv.tv_sec + e->seconds;
    // Keep the sub-second part of "now"; zeroing it would make the deadline
    // up to one second early.
    ts.tv_nsec = tv.tv_usec * 1000;
    e->is_setup = true;
    if (verbose) {
        printf("Waiting %d seconds for other threads to join.\n", e->seconds);
        fflush(stdout);
    }
    // Condition waits can wake up spuriously, so re-check the predicate and
    // keep waiting against the same absolute deadline until it is set or the
    // wait reports something other than success.
    int r = 0;
    while (!e->threads_have_joined && r == 0) {
        r = toku_cond_timedwait(&e->cond, &e->mutex, &ts);
    }
    toku_mutex_assert_locked(&e->mutex);
    // Decide on the predicate, not on the return code: the flag may already
    // be set by the time a timed-out wait has reacquired the mutex.
    if (e->threads_have_joined) {
        if (verbose) {
            printf("Other threads joined on time, exiting cleanly.\n");
        }
    } else {
        assert(r == ETIMEDOUT);
        if (verbose) {
            printf("Some thread didn't join on time, crashing.\n");
            fflush(stdout);
        }
        toku_crash_and_dump_core_on_purpose();
    }
    toku_mutex_unlock(&e->mutex);
    return nullptr;
}
static int run_workers( static int run_workers(
...@@ -1484,20 +1518,48 @@ static int run_workers( ...@@ -1484,20 +1518,48 @@ static int run_workers(
void *ret; void *ret;
r = toku_pthread_join(time_tid, &ret); assert_zero(r); r = toku_pthread_join(time_tid, &ret); assert_zero(r);
if (verbose) printf("%lu joined\n", (unsigned long) time_tid); if (verbose) printf("%lu joined\n", (unsigned long) time_tid);
sighandler_t old_alarm = signal(SIGALRM, crashing_alarm_handler);
assert(old_alarm != SIG_ERR); {
// Set an alarm that will kill us if it takes too long to join all the // Set an alarm that will kill us if it takes too long to join all the
// threads (i.e. there is some runaway thread). // threads (i.e. there is some runaway thread).
unsigned int remaining = alarm(cli_args->join_timeout); struct sleep_and_crash_extra sac_extra;
assert_zero(remaining); ZERO_STRUCT(sac_extra);
toku_mutex_init(&sac_extra.mutex, nullptr);
toku_cond_init(&sac_extra.cond, nullptr);
sac_extra.seconds = cli_args->join_timeout;
sac_extra.is_setup = false;
sac_extra.threads_have_joined = false;
toku_mutex_lock(&sac_extra.mutex);
toku_pthread_t sac_thread;
r = toku_pthread_create(&sac_thread, nullptr, sleep_and_crash, &sac_extra);
assert_zero(r);
// Wait for sleep_and_crash thread to get set up, spinning is ok, this should be quick.
while (!sac_extra.is_setup) {
toku_mutex_unlock(&sac_extra.mutex);
r = toku_pthread_yield();
assert_zero(r);
toku_mutex_lock(&sac_extra.mutex);
}
toku_mutex_unlock(&sac_extra.mutex);
// Timeout thread has started, join everyone
for (int i = 0; i < num_threads; ++i) { for (int i = 0; i < num_threads; ++i) {
r = toku_pthread_join(tids[i], &ret); assert_zero(r); r = toku_pthread_join(tids[i], &ret); assert_zero(r);
if (verbose) if (verbose)
printf("%lu joined\n", (unsigned long) tids[i]); printf("%lu joined\n", (unsigned long) tids[i]);
} }
// All threads joined, deschedule the alarm.
remaining = alarm(0); // Signal timeout thread not to crash.
assert(remaining > 0); toku_mutex_lock(&sac_extra.mutex);
sac_extra.threads_have_joined = true;
toku_cond_signal(&sac_extra.cond);
toku_mutex_unlock(&sac_extra.mutex);
r = toku_pthread_join(sac_thread, nullptr);
assert_zero(r);
toku_cond_destroy(&sac_extra.cond);
toku_mutex_destroy(&sac_extra.mutex);
}
if (cli_args->print_performance) { if (cli_args->print_performance) {
uint64_t *counters[num_threads]; uint64_t *counters[num_threads];
...@@ -2244,7 +2306,6 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct ...@@ -2244,7 +2306,6 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
INT32_ARG_NONNEG("--num_elements", num_elements, ""), INT32_ARG_NONNEG("--num_elements", num_elements, ""),
INT32_ARG_NONNEG("--num_DBs", num_DBs, ""), INT32_ARG_NONNEG("--num_DBs", num_DBs, ""),
INT32_ARG_NONNEG("--num_seconds", num_seconds, "s"), INT32_ARG_NONNEG("--num_seconds", num_seconds, "s"),
INT32_ARG_NONNEG("--join_timeout", join_timeout, "s"),
INT32_ARG_NONNEG("--node_size", env_args.node_size, " bytes"), INT32_ARG_NONNEG("--node_size", env_args.node_size, " bytes"),
INT32_ARG_NONNEG("--basement_node_size", env_args.basement_node_size, " bytes"), INT32_ARG_NONNEG("--basement_node_size", env_args.basement_node_size, " bytes"),
INT32_ARG_NONNEG("--rollback_node_size", env_args.rollback_node_size, " bytes"), INT32_ARG_NONNEG("--rollback_node_size", env_args.rollback_node_size, " bytes"),
...@@ -2259,6 +2320,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct ...@@ -2259,6 +2320,7 @@ static inline void parse_stress_test_args (int argc, char *const argv[], struct
UINT32_ARG("--txn_size", txn_size, " rows"), UINT32_ARG("--txn_size", txn_size, " rows"),
UINT32_ARG("--num_bucket_mutexes", env_args.num_bucket_mutexes, " mutexes"), UINT32_ARG("--num_bucket_mutexes", env_args.num_bucket_mutexes, " mutexes"),
INT32_ARG_R("--join_timeout", join_timeout, "s", 1, INT32_MAX),
INT32_ARG_R("--performance_period", performance_period, "s", 1, INT32_MAX), INT32_ARG_R("--performance_period", performance_period, "s", 1, INT32_MAX),
// TODO: John thinks the cachetable size should be in megabytes // TODO: John thinks the cachetable size should be in megabytes
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment