Commit b23b43a0 authored by Rich Prohaska

#50 count long tail events like long fsyncs, checkpoints, and lock tree waits

parent 146344e9
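
The same long-tail accounting pattern is applied in each subsystem touched here: time the event, add its duration to a running total, and, when the duration reaches a threshold (1,000,000 microseconds, i.e. one second, throughout this commit), also bump a separate "long" count and time so rare slow events stay visible even when averages look healthy. A minimal sketch of the pattern in isolation, with illustrative names rather than the engine's actual symbols:

```cpp
#include <stdint.h>

// Illustrative long-tail event accounting (names are not the engine's).
struct event_stats {
    uint64_t threshold_usec;                // e.g. 1000000 == 1 second
    uint64_t count, time_usec;              // all events
    uint64_t long_count, long_time_usec;    // only events >= threshold
};

static void record_event(struct event_stats *s, uint64_t duration_usec) {
    s->count += 1;
    s->time_usec += duration_usec;
    if (duration_usec >= s->threshold_usec) {
        s->long_count += 1;
        s->long_time_usec += duration_usec;
    }
}
```
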
@@ -163,6 +163,11 @@ status_init(void) {
STATUS_INIT(CP_WAITERS_MAX, nullptr, UINT64, "waiters max", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_CLIENT_WAIT_ON_MO, nullptr, UINT64, "non-checkpoint client wait on mo lock", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_CLIENT_WAIT_ON_CS, nullptr, UINT64, "non-checkpoint client wait on cs lock", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_BEGIN_TIME, nullptr, UINT64, "checkpoint begin time", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_LONG_BEGIN_COUNT, nullptr, UINT64, "long checkpoint begin count", TOKU_ENGINE_STATUS);
STATUS_INIT(CP_LONG_BEGIN_TIME, nullptr, UINT64, "long checkpoint begin time", TOKU_ENGINE_STATUS);
cp_status.initialized = true;
}
#undef STATUS_INIT
@@ -188,12 +193,11 @@ static toku_pthread_rwlock_t low_priority_multi_operation_lock;
static bool initialized = false; // sanity check
static volatile bool locked_mo = false; // true when the multi_operation write lock is held (by checkpoint)
static volatile bool locked_cs = false; // true when the checkpoint_safe write lock is held (by checkpoint)
static volatile uint64_t toku_checkpoint_long_threshold = 1000000;
// Note following static functions are called from checkpoint internal logic only,
// and use the "writer" calls for locking and unlocking.
static void
multi_operation_lock_init(void) {
pthread_rwlockattr_t attr;
@@ -335,7 +339,9 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
SET_CHECKPOINT_FOOTPRINT(30);
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN) = time(NULL);
uint64_t t_checkpoint_begin_start = toku_current_time_microsec();
toku_cachetable_begin_checkpoint(cp, logger);
uint64_t t_checkpoint_begin_end = toku_current_time_microsec();
toku_ft_open_close_unlock();
multi_operation_checkpoint_unlock();
@@ -357,6 +363,12 @@ toku_checkpoint(CHECKPOINTER cp, TOKULOGGER logger,
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_END) = time(NULL);
STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN_COMPLETE) = STATUS_VALUE(CP_TIME_LAST_CHECKPOINT_BEGIN);
STATUS_VALUE(CP_CHECKPOINT_COUNT)++;
uint64_t duration = t_checkpoint_begin_end - t_checkpoint_begin_start;
STATUS_VALUE(CP_BEGIN_TIME) += duration;
if (duration >= toku_checkpoint_long_threshold) {
STATUS_VALUE(CP_LONG_BEGIN_TIME) += duration;
STATUS_VALUE(CP_LONG_BEGIN_COUNT) += 1;
}
STATUS_VALUE(CP_FOOTPRINT) = 0;
checkpoint_safe_checkpoint_unlock();
@@ -185,7 +185,10 @@ typedef enum {
CP_WAITERS_MAX, // max threads ever simultaneously waiting for the checkpoint_safe lock to perform a checkpoint
CP_CLIENT_WAIT_ON_MO, // how many times a client thread waited to take the multi_operation lock, not for checkpoint
CP_CLIENT_WAIT_ON_CS, // how many times a client thread waited for the checkpoint_safe lock, not for checkpoint
CP_STATUS_NUM_ROWS // number of rows in this status array
CP_BEGIN_TIME,
CP_LONG_BEGIN_TIME,
CP_LONG_BEGIN_COUNT,
CP_STATUS_NUM_ROWS // number of rows in this status array. must be last.
} cp_status_entry;
typedef struct {
@@ -245,6 +245,7 @@ void lock_request::calculate_cond_wakeup_time(struct timespec *ts) {
// sleep on the lock request until it becomes resolved or the wait time has elapsed.
int lock_request::wait(void) {
uint64_t t_start = toku_current_time_microsec();
toku_mutex_lock(&m_info->mutex);
while (m_state == state::PENDING) {
struct timespec ts;
@@ -259,6 +260,14 @@ int lock_request::wait(void) {
complete(DB_LOCK_NOTGRANTED);
}
}
uint64_t t_end = toku_current_time_microsec();
uint64_t duration = t_end - t_start;
m_info->wait_count += 1;
m_info->wait_time += duration;
if (duration >= 1000000) {
m_info->long_wait_count += 1;
m_info->long_wait_time += duration;
}
toku_mutex_unlock(&m_info->mutex);
invariant(m_state == state::COMPLETE);
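
For the lock tree, the accounting lives inside lock_request::wait(): the wait duration is measured around the wait loop and folded into per-locktree counters while m_info->mutex is still held, the same mutex manager::get_status() takes before reading m_lock_request_info. A hedged sketch of just that bookkeeping; the struct and helper below are illustrative stand-ins, not the real locktree types:

```cpp
#include <stdint.h>
#include <pthread.h>

// Simplified stand-in for the per-locktree wait counters added here.
// The mutex must be initialized (pthread_mutex_init) before use.
struct lt_wait_info {
    pthread_mutex_t mutex;
    uint64_t wait_count, wait_time;            // all waits, microseconds
    uint64_t long_wait_count, long_wait_time;  // waits of >= 1 second
};

// Record one completed wait of duration_usec microseconds. In the diff the
// mutex is already held at this point in wait(), so the lock/unlock here
// only exists to keep the sketch self-contained.
static void lt_record_wait(struct lt_wait_info *info, uint64_t duration_usec) {
    pthread_mutex_lock(&info->mutex);
    info->wait_count += 1;
    info->wait_time += duration_usec;
    if (duration_usec >= 1000000) {   // 1 second threshold, as in the diff
        info->long_wait_count += 1;
        info->long_wait_time += duration_usec;
    }
    pthread_mutex_unlock(&info->mutex);
}
```
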
@@ -115,7 +115,11 @@ enum {
LTM_STO_NUM_ELIGIBLE,
LTM_STO_END_EARLY_COUNT,
LTM_STO_END_EARLY_TIME,
LTM_STATUS_NUM_ROWS
LTM_WAIT_COUNT,
LTM_WAIT_TIME,
LTM_LONG_WAIT_COUNT,
LTM_LONG_WAIT_TIME,
LTM_STATUS_NUM_ROWS // must be last
};
typedef struct {
@@ -187,6 +191,8 @@ class locktree {
omt<lock_request *> pending_lock_requests;
toku_mutex_t mutex;
bool should_retry_lock_requests;
uint64_t wait_count, wait_time;
uint64_t long_wait_count, long_wait_time;
};
// Private info struct for storing pending lock request state.
@@ -368,7 +368,7 @@ bool locktree::manager::memory_tracker::out_of_locks(void) const {
void locktree::manager::status_init(void) {
STATUS_INIT(LTM_SIZE_CURRENT, LOCKTREE_MEMORY_SIZE, UINT64, "memory size", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_SIZE_LIMIT, LOCKTREE_MEMORY_SIZE_LIMIT, UINT64, "memory size limit", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, UINT64, "number of times lock escalation ran", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_COUNT, LOCKTREE_ESCALATION_NUM, UINT64, "number of times lock escalation ran", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_TIME, LOCKTREE_ESCALATION_SECONDS, TOKUTIME, "time spent running escalation (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_ESCALATION_LATEST_RESULT, LOCKTREE_LATEST_POST_ESCALATION_MEMORY_SIZE, UINT64, "latest post-escalation memory size", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_NUM_LOCKTREES, LOCKTREE_OPEN_CURRENT, UINT64, "number of locktrees open now", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
@@ -376,6 +376,12 @@ void locktree::manager::status_init(void) {
STATUS_INIT(LTM_STO_NUM_ELIGIBLE, LOCKTREE_STO_ELIGIBLE_NUM, UINT64, "number of locktrees eligible for the STO", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_STO_END_EARLY_COUNT, LOCKTREE_STO_ENDED_NUM, UINT64, "number of times a locktree ended the STO early", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_STO_END_EARLY_TIME, LOCKTREE_STO_ENDED_SECONDS, TOKUTIME, "time spent ending the STO early (seconds)", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
STATUS_INIT(LTM_WAIT_COUNT, LOCKTREE_WAIT_COUNT, UINT64, "number of wait locks", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_WAIT_TIME, LOCKTREE_WAIT_TIME, UINT64, "time waiting for locks", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_LONG_WAIT_COUNT, LOCKTREE_LONG_WAIT_COUNT, UINT64, "number of long wait locks", TOKU_ENGINE_STATUS);
STATUS_INIT(LTM_LONG_WAIT_TIME, LOCKTREE_LONG_WAIT_TIME, UINT64, "long time waiting for locks", TOKU_ENGINE_STATUS);
status.initialized = true;
}
@@ -399,6 +405,9 @@ void locktree::manager::get_status(LTM_STATUS statp) {
uint64_t sto_num_eligible = 0;
uint64_t sto_end_early_count = 0;
tokutime_t sto_end_early_time = 0;
uint64_t lock_wait_count = 0, lock_wait_time = 0;
uint64_t long_lock_wait_count = 0, long_lock_wait_time = 0;
size_t num_locktrees = m_locktree_map.size();
for (size_t i = 0; i < num_locktrees; i++) {
@@ -407,7 +416,11 @@ void locktree::manager::get_status(LTM_STATUS statp) {
invariant_zero(r);
toku_mutex_lock(&lt->m_lock_request_info.mutex);
lock_requests_pending += lt->get_lock_request_info()->pending_lock_requests.size();
lock_requests_pending += lt->m_lock_request_info.pending_lock_requests.size();
lock_wait_count += lt->m_lock_request_info.wait_count;
lock_wait_time += lt->m_lock_request_info.wait_time;
long_lock_wait_count += lt->m_lock_request_info.long_wait_count;
long_lock_wait_time += lt->m_lock_request_info.long_wait_time;
toku_mutex_unlock(&lt->m_lock_request_info.mutex);
sto_num_eligible += lt->sto_txnid_is_valid_unsafe() ? 1 : 0;
@@ -422,6 +435,10 @@ void locktree::manager::get_status(LTM_STATUS statp) {
STATUS_VALUE(LTM_STO_NUM_ELIGIBLE) = sto_num_eligible;
STATUS_VALUE(LTM_STO_END_EARLY_COUNT) = sto_end_early_count;
STATUS_VALUE(LTM_STO_END_EARLY_TIME) = sto_end_early_time;
STATUS_VALUE(LTM_WAIT_COUNT) = lock_wait_count;
STATUS_VALUE(LTM_WAIT_TIME) = lock_wait_time;
STATUS_VALUE(LTM_LONG_WAIT_COUNT) = long_lock_wait_count;
STATUS_VALUE(LTM_LONG_WAIT_TIME) = long_lock_wait_time;
*statp = status;
}
#undef STATUS_VALUE
@@ -439,23 +439,22 @@ void toku_os_recursive_delete(const char *path) {
assert_zero(r);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fsync logic:
// t_fsync exists for testing purposes only
static int (*t_fsync)(int) = 0;
static uint64_t toku_fsync_count;
static uint64_t toku_fsync_time;
static uint64_t sched_fsync_count;
static uint64_t sched_fsync_time;
static uint64_t toku_long_fsync_threshold = 1000000;
static uint64_t toku_long_fsync_count;
static uint64_t toku_long_fsync_time;
void toku_set_func_fsync(int (*fsync_function)(int)) {
t_fsync = fsync_function;
}
// keep trying if fsync fails because of EINTR
static void file_fsync_internal (int fd, uint64_t *duration_p) {
static void file_fsync_internal (int fd) {
uint64_t tstart = toku_current_time_microsec();
int r = -1;
while (r != 0) {
@@ -471,13 +470,14 @@ static void file_fsync_internal (int fd, uint64_t *duration_p) {
toku_sync_fetch_and_add(&toku_fsync_count, 1);
uint64_t duration = toku_current_time_microsec() - tstart;
toku_sync_fetch_and_add(&toku_fsync_time, duration);
if (duration_p) {
*duration_p = duration;
if (duration >= toku_long_fsync_threshold) {
toku_sync_fetch_and_add(&toku_long_fsync_count, 1);
toku_sync_fetch_and_add(&toku_long_fsync_time, duration);
}
}
void toku_file_fsync_without_accounting(int fd) {
file_fsync_internal(fd, NULL);
file_fsync_internal(fd);
}
void toku_fsync_dirfd_without_accounting(DIR *dir) {
@@ -502,22 +502,16 @@ int toku_fsync_dir_by_name_without_accounting(const char *dir_name) {
// include fsync in scheduling accounting
void toku_file_fsync(int fd) {
uint64_t duration;
file_fsync_internal (fd, &duration);
toku_sync_fetch_and_add(&sched_fsync_count, 1);
toku_sync_fetch_and_add(&sched_fsync_time, duration);
file_fsync_internal (fd);
}
// for real accounting
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time) {
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time, uint64_t *long_fsync_threshold, uint64_t *long_fsync_count, uint64_t *long_fsync_time) {
*fsync_count = toku_fsync_count;
*fsync_time = toku_fsync_time;
}
// for scheduling algorithm only
void toku_get_fsync_sched(uint64_t *fsync_count, uint64_t *fsync_time) {
*fsync_count = sched_fsync_count;
*fsync_time = sched_fsync_time;
*long_fsync_threshold = toku_long_fsync_threshold;
*long_fsync_count = toku_long_fsync_count;
*long_fsync_time = toku_long_fsync_time;
}
int toku_fsync_directory(const char *fname) {
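
On the fsync side, file_fsync_internal() no longer reports a per-call duration back to its caller (the separate scheduler accounting and toku_get_fsync_sched() are removed); it times each call itself, retries only on EINTR, and updates process-wide counters with atomic adds, bumping the long-fsync counters when the call takes at least toku_long_fsync_threshold microseconds. A rough self-contained sketch of that flow, using std::atomic in place of toku_sync_fetch_and_add and clock_gettime() as the microsecond clock (both substitutions are assumptions for illustration):

```cpp
#include <atomic>
#include <cerrno>
#include <cstdint>
#include <ctime>
#include <unistd.h>

static std::atomic<uint64_t> fsync_count{0}, fsync_time{0};
static std::atomic<uint64_t> long_fsync_count{0}, long_fsync_time{0};
static const uint64_t long_fsync_threshold = 1000000;  // 1 second, in usec

static uint64_t now_usec(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000 + (uint64_t)ts.tv_nsec / 1000;
}

// Time one fsync, retrying on EINTR, then account for it globally; calls
// that reach the threshold are also counted as "long" fsyncs.
static void fsync_with_accounting(int fd) {
    uint64_t tstart = now_usec();
    int r = -1;
    while (r != 0) {
        r = fsync(fd);
        if (r != 0 && errno != EINTR)
            break;  // only EINTR is worth retrying
    }
    uint64_t duration = now_usec() - tstart;
    fsync_count.fetch_add(1);
    fsync_time.fetch_add(duration);
    if (duration >= long_fsync_threshold) {
        long_fsync_count.fetch_add(1);
        long_fsync_time.fetch_add(duration);
    }
}
```
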
@@ -1745,9 +1745,11 @@ typedef enum {
FS_ENOSPC_REDZONE_CTR, // number of operations rejected by enospc prevention (red zone)
FS_ENOSPC_MOST_RECENT, // most recent time that file system was completely full
FS_ENOSPC_COUNT, // total number of times ENOSPC was returned from an attempt to write
FS_FSYNC_TIME ,
FS_FSYNC_TIME,
FS_FSYNC_COUNT,
FS_STATUS_NUM_ROWS
FS_LONG_FSYNC_TIME,
FS_LONG_FSYNC_COUNT,
FS_STATUS_NUM_ROWS, // must be last
} fs_status_entry;
typedef struct {
@@ -1766,8 +1768,10 @@ fs_status_init(void) {
FS_STATUS_INIT(FS_ENOSPC_REDZONE_CTR, nullptr, UINT64, "number of operations rejected by enospc prevention (red zone)", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_ENOSPC_MOST_RECENT, nullptr, UNIXTIME, "most recent disk full", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_ENOSPC_COUNT, nullptr, UINT64, "number of write operations that returned ENOSPC", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_FSYNC_TIME, FILESYSTEM_FSYNC_SECONDS, UINT64, "fsync time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_FSYNC_TIME, FILESYSTEM_FSYNC_TIME, UINT64, "fsync time", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_FSYNC_COUNT, FILESYSTEM_FSYNC_NUM, UINT64, "fsync count", TOKU_ENGINE_STATUS|TOKU_GLOBAL_STATUS);
FS_STATUS_INIT(FS_LONG_FSYNC_TIME, FILESYSTEM_LONG_FSYNC_TIME, UINT64, "long fsync time", TOKU_ENGINE_STATUS);
FS_STATUS_INIT(FS_LONG_FSYNC_COUNT, FILESYSTEM_LONG_FSYNC_NUM, UINT64, "long fsync count", TOKU_ENGINE_STATUS);
fsstat.initialized = true;
}
#undef FS_STATUS_INIT
@@ -1792,10 +1796,12 @@ fs_get_status(DB_ENV * env, fs_redzone_state * redzone_state) {
FS_STATUS_VALUE(FS_ENOSPC_MOST_RECENT) = enospc_most_recent_timestamp;
FS_STATUS_VALUE(FS_ENOSPC_COUNT) = enospc_total;
uint64_t fsync_count, fsync_time;
toku_get_fsync_times(&fsync_count, &fsync_time);
uint64_t fsync_count, fsync_time, long_fsync_threshold, long_fsync_count, long_fsync_time;
toku_get_fsync_times(&fsync_count, &fsync_time, &long_fsync_threshold, &long_fsync_count, &long_fsync_time);
FS_STATUS_VALUE(FS_FSYNC_COUNT) = fsync_count;
FS_STATUS_VALUE(FS_FSYNC_TIME) = fsync_time;
FS_STATUS_VALUE(FS_LONG_FSYNC_COUNT) = long_fsync_count;
FS_STATUS_VALUE(FS_LONG_FSYNC_TIME) = long_fsync_time;
}
#undef FS_STATUS_VALUE
@@ -397,10 +397,7 @@ void toku_file_fsync(int fd);
int toku_fsync_directory(const char *fname);
// get the number of fsync calls and the fsync times (total)
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time);
// get the number of fsync calls and the fsync times for use by scheduler (subset of total)
void toku_get_fsync_sched(uint64_t *fsync_count, uint64_t *fsync_time);
void toku_get_fsync_times(uint64_t *fsync_count, uint64_t *fsync_time, uint64_t *long_fsync_threshold, uint64_t *long_fsync_count, uint64_t *long_fsync_time);
void toku_set_func_fsync (int (*fsync_function)(int));
void toku_set_func_pwrite (ssize_t (*)(int, const void *, size_t, toku_off_t));