Commit 6e6d1e86 authored by inaam

branches/zip rb://133

This patch introduces a heuristics-based flushing rate for dirty pages
to avoid IO bursts at checkpoint.

1) log_capacity divided by the log generated per second gives us the
number of seconds in which ALL dirty pages need to be flushed. Based on
this rough assumption we can say that
n_dirty_pages / (log_capacity / log_generation_rate) = desired_flush_rate

2) We use a weighted average (hard-coded to a 20-second window) of the
log generation rate to avoid resonance.

3) From the desired_flush_rate we subtract the number of pages that have
been flushed due to LRU flushing. That gives us the number of pages that
we should flush as part of flush_list cleanup, and that is the number
(capped by the maximum io_capacity) that we try to flush from the master
thread. A worked example follows below.
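A worked example with made-up numbers (illustration only): if
log_capacity = 100 MB, the averaged log generation rate = 5 MB/s and
n_dirty_pages = 10000, the log would fill in 100 / 5 = 20 seconds, so

    desired_flush_rate = 10000 / 20 = 500 pages/s

If LRU flushing already accounts for 100 pages/s, the master thread
aims to flush 500 - 100 = 400 pages/s from the flush_list.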

Knobs:
======

innodb_adaptive_flushing: boolean, global, dynamic, default TRUE.
Since this heuristic is very experimental and has the potential to
dramatically change the IO pattern, I think it is a good idea to leave a
knob to turn it off.
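Since the variable is dynamic, the heuristic can also be toggled at
runtime, e.g. with SET GLOBAL innodb_adaptive_flushing = OFF, in
addition to being set as a startup option.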

Approved by: Heikki
parent b6cb94ed
@@ -44,6 +44,39 @@ Created 11/11/1995 Heikki Tuuri
#include "os0file.h"
#include "trx0sys.h"
/**********************************************************************
These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but is also a function of
how much redo the workload is generating and at what rate. */
/* @{ */
/** Number of intervals for which we keep the history of these stats.
Each interval is 1 second, defined by the rate at which
srv_error_monitor_thread() calls buf_flush_stat_update(). */
#define BUF_FLUSH_STAT_N_INTERVAL 20
/** Sampled values of buf_flush_stat_cur.
Not protected by any mutex. Updated by buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
static ulint buf_flush_stat_arr_ind;
/** Values at start of the current interval. Reset by
buf_flush_stat_update(). */
static buf_flush_stat_t buf_flush_stat_cur;
/** Running sum of past values of buf_flush_stat_cur.
Updated by buf_flush_stat_update(). Not protected by any mutex. */
static buf_flush_stat_t buf_flush_stat_sum;
/** Number of pages flushed through non-flush_list flushes. */
static ulint buf_lru_flush_page_count = 0;
/* @} */
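(Aside, not part of the patch: a minimal standalone C sketch of the
sliding-window arithmetic used by these stats. A fixed-size ring of
per-interval samples plus a running sum makes the average over the
last BUF_FLUSH_STAT_N_INTERVAL seconds available in O(1). All numbers
below are made up.)

#include <stdio.h>

#define N_INTERVAL	20	/* mirrors BUF_FLUSH_STAT_N_INTERVAL */

int
main(void)
{
	unsigned long long	arr[N_INTERVAL] = {0};	/* per-interval samples */
	unsigned long long	sum = 0;		/* running sum of arr[] */
	unsigned		ind = 0;		/* round-robin cursor */
	/* made-up per-second redo volumes, in bytes */
	unsigned long long	redo[] = {1000, 3000, 2000, 4000, 1000};
	unsigned		i;

	for (i = 0; i < sizeof(redo) / sizeof(redo[0]); i++) {
		/* add the current value and subtract the obsolete
		entry, as buf_flush_stat_update() does */
		sum += redo[i] - arr[ind];
		arr[ind] = redo[i];
		ind = (ind + 1) % N_INTERVAL;

		printf("window average: %llu bytes/s\n",
		       sum / N_INTERVAL);
	}

	return(0);
}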
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
@@ -1132,6 +1165,13 @@ flush_next:
	srv_buf_pool_flushed += page_count;
	/* We keep track of all flushes happening as part of LRU
	flush. When estimating the desired rate at which flush_list
	should be flushed, we factor in this value. */
	if (flush_type == BUF_FLUSH_LRU) {
		buf_lru_flush_page_count += page_count;
	}
	return(page_count);
}
@@ -1227,6 +1267,116 @@ buf_flush_free_margin(void)
	}
}
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval.
The flush rate heuristic depends on (a) the rate of redo log generation
and (b) the rate at which LRU flush is happening. */
UNIV_INTERN
void
buf_flush_stat_update(void)
/*=======================*/
{
	buf_flush_stat_t*	item;
	ib_uint64_t		lsn_diff;
	ib_uint64_t		lsn;
	ulint			n_flushed;

	lsn = log_get_lsn();
	if (buf_flush_stat_cur.redo == 0) {
		/* First time around. Just update the current LSN
		and return. */
		buf_flush_stat_cur.redo = lsn;
		return;
	}

	item = &buf_flush_stat_arr[buf_flush_stat_arr_ind];

	/* values for this interval */
	lsn_diff = lsn - buf_flush_stat_cur.redo;
	n_flushed = buf_lru_flush_page_count
		    - buf_flush_stat_cur.n_flushed;

	/* add the current value and subtract the obsolete entry. */
	buf_flush_stat_sum.redo += lsn_diff - item->redo;
	buf_flush_stat_sum.n_flushed += n_flushed - item->n_flushed;

	/* put current entry in the array. */
	item->redo = lsn_diff;
	item->n_flushed = n_flushed;

	/* update the index */
	buf_flush_stat_arr_ind++;
	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;

	/* reset the current entry. */
	buf_flush_stat_cur.redo = lsn;
	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
}
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void)
/*==================================*/
{
	ulint		redo_avg;
	ulint		lru_flush_avg;
	ulint		n_dirty;
	ulint		n_flush_req;
	lint		rate;
	ib_uint64_t	lsn = log_get_lsn();
	ib_uint64_t	log_capacity = log_get_capacity();

	/* log_capacity should never be zero after the initialization
	of the log subsystem. */
	ut_ad(log_capacity != 0);

	/* Get the total number of dirty pages. It is OK to access the
	flush_list without holding any mutex as we are using this
	only for heuristics. */
	n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);

	/* An overflow can happen if we generate more than 2^32 bytes
	of redo in this interval, i.e. 4G of redo in 1 second. We can
	safely consider this as infinity because if we ever come close
	to 4G we'll start a synchronous flush of dirty pages. */
	/* redo_avg below is the average rate at which redo is
	generated over the past BUF_FLUSH_STAT_N_INTERVAL intervals
	plus the redo generated in the current interval. */
	redo_avg = buf_flush_stat_sum.redo / BUF_FLUSH_STAT_N_INTERVAL
		   + (lsn - buf_flush_stat_cur.redo);

	/* An overflow can possibly happen if we flush more than 2^32
	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very, very
	unlikely scenario. Even when this happens it means that our
	flush rate will be off the mark. It won't affect the
	correctness of any subsystem. */
	/* lru_flush_avg below is the rate at which pages are flushed
	as part of LRU flush over the past BUF_FLUSH_STAT_N_INTERVAL
	intervals plus the number of pages flushed in the current
	interval. */
	lru_flush_avg = buf_flush_stat_sum.n_flushed
			/ BUF_FLUSH_STAT_N_INTERVAL
			+ (buf_lru_flush_page_count
			   - buf_flush_stat_cur.n_flushed);

	n_flush_req = (n_dirty * redo_avg) / log_capacity;

	/* The number of pages that we want to flush from the flush
	list is the difference between the required rate and the
	number of pages that we are historically flushing from the
	LRU list. */
	rate = n_flush_req - lru_flush_avg;

	return(rate > 0 ? (ulint) rate : 0);
}
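(Aside, not part of the patch: a worked example of the computation
above with made-up numbers. Suppose n_dirty = 10000 pages, redo_avg
corresponds to 5 MB/s and log_capacity = 100 MB. Then n_flush_req =
10000 * 5 / 100 = 500 pages/s. If LRU flushing has historically been
covering 100 pages/s, i.e. lru_flush_avg = 100, the function returns
500 - 100 = 400 pages/s for the master thread to flush from the
flush_list.)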
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
...
@@ -9687,6 +9687,11 @@ static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
  "Percentage of dirty pages allowed in bufferpool.",
  NULL, NULL, 75, 0, 99, 0);
static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
  PLUGIN_VAR_NOCMDARG,
  "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
  NULL, NULL, TRUE);
static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
  PLUGIN_VAR_RQCMDARG,
  "Desired maximum length of the purge queue (0 = no limit)",
@@ -9886,6 +9891,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
  MYSQL_SYSVAR(log_files_in_group),
  MYSQL_SYSVAR(log_group_home_dir),
  MYSQL_SYSVAR(max_dirty_pages_pct),
  MYSQL_SYSVAR(adaptive_flushing),
  MYSQL_SYSVAR(max_purge_lag),
  MYSQL_SYSVAR(mirrored_log_groups),
  MYSQL_SYSVAR(open_files),
...
@@ -127,6 +127,44 @@ buf_flush_ready_for_replace(
/*========================*/
	buf_page_t*	bpage);	/*!< in: buffer control block, must be
				buf_page_in_file(bpage) and in the LRU list */
/** @brief Statistics for selecting the flush rate based on redo log
generation speed.

These statistics are generated for heuristics used in estimating the
rate at which we should flush the dirty blocks to avoid bursty IO
activity. Note that the rate of flushing not only depends on how many
dirty pages we have in the buffer pool but is also a function of
how much redo the workload is generating and at what rate. */
struct buf_flush_stat_struct
{
	ib_uint64_t	redo;		/**< amount of redo generated. */
	ulint		n_flushed;	/**< number of pages flushed. */
};

/** Statistics for selecting the flush rate of dirty pages. */
typedef struct buf_flush_stat_struct buf_flush_stat_t;
/*********************************************************************
Update the historical stats that we are collecting for flush rate
heuristics at the end of each interval. */
UNIV_INTERN
void
buf_flush_stat_update(void);
/*=======================*/
/*********************************************************************
Determines the fraction of dirty pages that need to be flushed based
on the speed at which we generate redo log. Note that if redo log
is generated at a significant rate without a corresponding increase
in the number of dirty pages (for example, an in-memory workload)
it can cause IO bursts of flushing. This function implements heuristics
to avoid this burstiness.
@return number of dirty pages to be flushed / second */
UNIV_INTERN
ulint
buf_flush_get_desired_flush_rate(void);
/*==================================*/
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/******************************************************************//**
Validates the flush list.
...
@@ -169,6 +169,14 @@ UNIV_INLINE
ib_uint64_t
log_get_lsn(void);
/*=============*/
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
@return log group capacity */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void);
/*==================*/
/******************************************************//**
Initializes the log. */
UNIV_INTERN
...
@@ -385,6 +385,18 @@ log_get_lsn(void)
	return(lsn);
}
/****************************************************************
Gets the log group capacity. It is OK to read the value without
holding log_sys->mutex because it is constant.
@return log group capacity */
UNIV_INLINE
ib_uint64_t
log_get_capacity(void)
/*==================*/
{
	return(log_sys->log_group_capacity);
}
/***********************************************************************//**
Checks if there is need for a log buffer flush or a new checkpoint, and does
this if yes. Any database operation should call this when it has modified
...
@@ -139,6 +139,8 @@ extern ulint srv_n_log_files;
extern ulint srv_log_file_size;
extern ulint srv_log_buffer_size;
extern ulong srv_flush_log_at_trx_commit;
extern char srv_adaptive_flushing;
/* The sort order table of the MySQL latin1_swedish_ci character set
collation */
...
@@ -183,6 +183,10 @@ UNIV_INTERN ulint srv_log_file_size = ULINT_MAX;
UNIV_INTERN ulint srv_log_buffer_size = ULINT_MAX;
UNIV_INTERN ulong srv_flush_log_at_trx_commit = 1;
/* Try to flush dirty pages so as to avoid IO bursts at
the checkpoints. */
UNIV_INTERN char srv_adaptive_flushing = TRUE;
/* The sort order table of the MySQL latin1_swedish_ci character set
collation */
UNIV_INTERN const byte* srv_latin1_ordering;
@@ -2178,6 +2182,9 @@ loop:
	eviction policy. */
	buf_LRU_stat_update();
	/* Update the statistics collected for flush rate policy. */
	buf_flush_stat_update();
	/* In case mutex_exit is not a memory barrier, it is
	theoretically possible some threads are left waiting though
	the semaphore is already released. Wake up those threads: */
@@ -2423,6 +2430,22 @@ loop:
			iteration of this loop. */
			skip_sleep = TRUE;
		} else if (srv_adaptive_flushing) {

			/* Try to keep the rate of flushing of dirty
			pages such that redo log generation does not
			produce bursts of IO at checkpoint time. */
			ulint	n_flush = buf_flush_get_desired_flush_rate();

			if (n_flush) {
				n_flush = ut_min(PCT_IO(100), n_flush);
				n_pages_flushed = buf_flush_batch(
					BUF_FLUSH_LIST,
					n_flush,
					IB_ULONGLONG_MAX);
				skip_sleep = TRUE;
			}
		}
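(Note: PCT_IO(p), defined elsewhere in the plugin, evaluates to p per
cent of innodb_io_capacity, so PCT_IO(100) caps the adaptive batch at
one second's worth of the configured IO capacity. Passing
IB_ULONGLONG_MAX as the last argument means the batch is not limited
to pages below any particular LSN.)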

		if (srv_activity_count == old_activity_count) {
...