Commit 6e6d1e86 authored by inaam's avatar inaam

branches/zip rb://133

This patch introduces a heuristics-based flushing rate for dirty pages to
avoid IO bursts at checkpoints.

1) log_capacity / log_generation_rate (redo bytes generated per second)
gives us the number of seconds in which ALL dirty pages need to be
flushed. Based on this rough assumption we can say that
n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate

2) We use weighted averages (hard coded to 20 seconds) of
log_generation_rate to avoid resonance.

3) From the desired_flush_rate we subtract the number of pages that have
been flushed due to LRU flushing. That gives us pages that we should
flush as part of flush_list cleanup. And that is the number (capped by
maximum io_capacity) that we try to flush from the master thread.

Knobs:
======

innodb_adaptive_flushing: boolean, global, dynamic, default TRUE.
Since this heuristic is very experimental and has the potential to
dramatically change the IO pattern I think it is a good idea to leave a
knob to turn it off.

Approved by: Heikki
parent b6cb94ed
......@@ -44,6 +44,39 @@ Created 11/11/1995 Heikki Tuuri
#include "os0file.h"
#include "trx0sys.h"
/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing depends not only on how many
dirty pages we have in the buffer pool but is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */
/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20
/** Per-interval samples (redo generated and pages flushed during one
interval), one slot per interval. Used as a circular buffer.
Not protected by any mutex. Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
/** Cursor to buf_flush_stat_arr[]: index of the slot that the next
call to buf_flush_stat_update() overwrites. Advanced in a round-robin
fashion, wrapping at BUF_FLUSH_STAT_N_INTERVAL. */
static ulint buf_flush_stat_arr_ind;
/** Counter values (LSN and LRU-flushed page count) at the start of
the current interval. Reset by buf_flush_stat_update(). A zero redo
field means that no baseline has been recorded yet. */
static buf_flush_stat_t buf_flush_stat_cur;
/** Running sum of the per-interval samples currently stored in
buf_flush_stat_arr[]. Updated by buf_flush_stat_update().
Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;
/** Cumulative number of pages flushed by batches other than
flush_list batches; buf_flush_batch() adds to it for BUF_FLUSH_LRU
batches. */
static ulint buf_lru_flush_page_count = 0;
/* @} */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
......@@ -1132,6 +1165,13 @@ buf_flush_batch(
srv_buf_pool_flushed += page_count;
/* We keep track of all flushes happening as part of LRU
flush. When estimating the desired rate at which flush_list
should be flushed we factor in this value. */
if (flush_type == BUF_FLUSH_LRU) {
buf_lru_flush_page_count += page_count;
}
return(page_count);
}
......@@ -1227,6 +1267,116 @@ buf_flush_free_margin(void)
}
}
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening.

Each call closes the current interval: the redo generated and the
pages flushed by LRU batches since the previous call are stored in
the next slot of buf_flush_stat_arr[], and buf_flush_stat_sum is
adjusted so that it always equals the sum of the slots. The very first
call only records the baselines and stores no sample. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;
	ulint			lru_flushed_now;

	/* Read each counter exactly once. buf_lru_flush_page_count is
	updated by flushing threads without any mutex; if it were read
	once for the sample and again for the new baseline, pages
	flushed between the two reads would never be counted in any
	interval. */
	lsn = log_get_lsn();
	lru_flushed_now = buf_lru_flush_page_count;

	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just record the baselines and
		return. The LRU page count baseline must be recorded
		too, otherwise the first sample would include every
		page flushed since startup instead of the pages
		flushed during one interval. */
		buf_flush_stat_cur.redo = lsn;
		buf_flush_stat_cur.n_flushed = lru_flushed_now;
		return;
	}

	/* The slot holding the oldest sample, which is replaced now. */
	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = lru_flushed_now - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry.
	The arithmetic is unsigned, so an intermediate wrap-around
	still yields the correct sum. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry: the values just sampled are the
	baselines of the next interval. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = lru_flushed_now;
}
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.

The estimate is
	n_dirty * redo_avg / log_capacity - lru_flush_avg
and 0 is returned when LRU flushing alone already meets or exceeds
the required rate.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
	ib_uint64_t	redo_avg;
	ulint		lru_flush_avg;
	ulint		n_dirty;
	ib_uint64_t	n_flush_req;
	ib_uint64_t	lsn = log_get_lsn();
	ib_uint64_t	log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get total number of dirty pages. It is OK to access
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

	/* redo_avg below is average at which redo is generated in
	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
	interval. It is kept in 64 bits so that it is not truncated
	where ulint is 32 bits wide. */
	redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL
		+ (lsn - buf_flush_stat_cur.redo);

	/* An overflow can happen possibly if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
	unlikely scenario. Even when this happens it means that our
	flush rate will be off the mark. It won't affect correctness
	of any subsystem. */

	/* lru_flush_avg below is rate at which pages are flushed as
	part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
	number of pages flushed in the current interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
		/ BUF_FLUSH_STAT_N_INTERVAL
		+ (buf_lru_flush_page_count
		   - buf_flush_stat_cur.n_flushed);

	/* The product must be formed in 64 bits: where ulint is 32
	bits wide, n_dirty * redo_avg exceeds 2^32 under ordinary
	load (for example 100000 dirty pages and 1 MB of redo per
	second) and would silently wrap around. */
	n_flush_req = ((ib_uint64_t) n_dirty * redo_avg) / log_capacity;

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the
	LRU list. Compare before subtracting instead of relying on
	the conversion of a wrapped unsigned difference to a signed
	type. */
	if (n_flush_req <= lru_flush_avg) {

		return(0);
	}

	/* As long as redo_avg <= log_capacity the request is at most
	n_dirty and therefore fits in ulint. */
	return((ulint) (n_flush_req - lru_flush_avg));
}
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
......
......@@ -9687,6 +9687,11 @@ static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
"Percentage of dirty pages allowed in bufferpool.",
NULL, NULL, 75, 0, 99, 0);
/* Boolean system variable "adaptive_flushing", backed by
srv_adaptive_flushing and defaulting to TRUE. It is the knob that
turns the flush rate heuristic on and off. */
static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
PLUGIN_VAR_NOCMDARG,
"Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
NULL, NULL, TRUE);
static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
PLUGIN_VAR_RQCMDARG,
"Desired maximum length of the purge queue (0 = no limit)",
......@@ -9886,6 +9891,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_files_in_group),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
MYSQL_SYSVAR(adaptive_flushing),
MYSQL_SYSVAR(max_purge_lag),
MYSQL_SYSVAR(mirrored_log_groups),
MYSQL_SYSVAR(open_files),
......
......@@ -127,6 +127,44 @@ buf_flush_ready_for_replace(
/*========================*/
buf_page_t* bpage); /*!< in: buffer control block, must be
buf_page_in_file(bpage) and in the LRU list */
/** @brief Statistics for selecting flush rate based on redo log
generation speed.
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing depends not only on how many
dirty pages we have in the buffer pool but is also a function of
how much redo the workload is generating and at what rate. */
struct buf_flush_stat_struct
{
ib_uint64_t redo; /**< amount of redo generated. */
ulint n_flushed; /**< number of pages flushed. */
};
/** Statistics for selecting flush rate of dirty pages. */
typedef struct buf_flush_stat_struct buf_flush_stat_t;
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
Flush rate heuristic depends on (a) rate of redo log generation and
(b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void);
/*=======================*/
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second; 0 when no
flushing from the flush list is needed */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void);
/*==================================*/
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
......
......@@ -169,6 +169,14 @@ UNIV_INLINE
ib_uint64_t
log_get_lsn(void);
/*=============*/
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
@return log group capacity (the value of
log_sys->log_group_capacity) */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void);
/*==================*/
/******************************************************//**
Initializes the log. */
UNIV_INTERN
......
......@@ -385,6 +385,18 @@ log_get_lsn(void)
return(lsn);
}
/****************************************************************
Reads the capacity of the log group from the log system object.
No latching is done: the value is treated as a constant, so it can be
read without holding log_sys->mutex.
@return log group capacity */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void)
/*==================*/
{
	const ib_uint64_t	capacity = log_sys->log_group_capacity;

	return(capacity);
}
/***********************************************************************//**
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
......
......@@ -139,6 +139,8 @@ extern ulint srv_n_log_files;
extern ulint srv_log_file_size;
extern ulint srv_log_buffer_size;
extern ulong srv_flush_log_at_trx_commit;
/* Nonzero when the flush rate heuristic (adaptive flushing of dirty
pages) is enabled. */
extern char srv_adaptive_flushing;
/* The sort order table of the MySQL latin1_swedish_ci character set
collation */
......
......@@ -183,6 +183,10 @@ UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
/* Try to flush dirty pages so as to avoid IO bursts at
the checkpoints. Enabled (TRUE) by default; srv_master_thread() uses
the flush rate heuristic only while this is nonzero. */
UNIV_INTERN char srv_adaptive_flushing = TRUE;
/* The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte* srv_latin1_ordering;
......@@ -2175,13 +2179,16 @@ srv_error_monitor_thread(
}
/* Update the statistics collected for deciding LRU
eviction policy. */
eviction policy. */
buf_LRU_stat_update();
/* Update the statistics collected for flush rate policy. */
buf_flush_stat_update();
/* In case mutex_exit is not a memory barrier, it is
theoretically possible some threads are left waiting though
the semaphore is already released. Wake up those threads: */
sync_arr_wake_threads_if_sema_free();
if (sync_array_print_long_waits()) {
......@@ -2423,6 +2430,22 @@ srv_master_thread(
iteration of this loop. */
skip_sleep = TRUE;
} else if (srv_adaptive_flushing) {
/* Try to keep the rate of flushing of dirty
pages such that redo log generation does not
produce bursts of IO at checkpoint time. */
ulint n_flush = buf_flush_get_desired_flush_rate();
if (n_flush) {
n_flush = ut_min(PCT_IO(100), n_flush);
n_pages_flushed =
buf_flush_batch(
BUF_FLUSH_LIST,
n_flush,
IB_ULONGLONG_MAX);
skip_sleep = TRUE;
}
}
if (srv_activity_count == old_activity_count) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment