Commit 4c0cd953 authored by Marko Mäkelä

MDEV-28766: SET GLOBAL innodb_log_file_buffering

In commit c4c88307 (MDEV-28111) we disabled
the file system cache on the InnoDB write-ahead log file (ib_logfile0)
by default on Linux.

It turns out that especially with innodb_flush_log_at_trx_commit=2,
writing to the log via the file system cache typically improves throughput,
especially on slow storage or at a small number of concurrent transactions.
For other values of innodb_flush_log_at_trx_commit, direct writes were
observed to be mostly but not always faster. Whether it pays off to
disable the file system cache on the log may depend on the type of storage,
the workload, and the operating system kernel version.

On Linux and Microsoft Windows, we will introduce the settable Boolean
global variable innodb_log_file_buffering that indicates whether the
file system cache on the redo log file is enabled. The default value is
innodb_log_file_buffering=OFF. If the server is started up with
innodb_flush_log_at_trx_commit=2, the value will be changed to
innodb_log_file_buffering=ON.

When a persistent memory interface is being used for the log,
the value cannot be changed from innodb_log_file_buffering=OFF.
On Linux, when the physical block size cannot be determined
to be a power of 2 between 64 and 4096 bytes, the file system cache
cannot be disabled, and innodb_log_file_buffering=ON cannot be changed.

Server log messages will indicate whether the file system cache is
enabled for the redo log:

[Note] InnoDB: Buffered log writes (block size=512 bytes)
[Note] InnoDB: File system buffers for log disabled (block size=512 bytes)

After this change, the startup parameter innodb_flush_method will no
longer control whether O_DIRECT will be set on the redo log on Linux.

On other operating systems that support O_DIRECT, no interface has been
implemented for controlling the file system cache for the redo log.
The innodb_flush_method values O_DIRECT, O_DIRECT_NO_FSYNC, O_DSYNC
will enable O_DIRECT for data files, not the log.

Tested by: Matthias Leich, Axel Schwenke
parent 813986a6
......@@ -5,6 +5,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
VARIABLE_NAME INNODB_ADAPTIVE_FLUSHING
......@@ -1020,7 +1021,7 @@ SESSION_VALUE NULL
DEFAULT_VALUE
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
VARIABLE_COMMENT Path to InnoDB log files.
VARIABLE_COMMENT Path to ib_logfile0
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
......
......@@ -12,5 +12,6 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
......@@ -4066,6 +4066,14 @@ static int innodb_init_params()
}
#endif
#if defined __linux__ || defined _WIN32
if (srv_flush_log_at_trx_commit == 2) {
/* Do not disable the file system cache if
innodb_flush_log_at_trx_commit=2. */
log_sys.log_buffered = true;
}
#endif
if (srv_read_only_mode) {
ib::info() << "Started in read only mode";
srv_use_doublewrite_buf = FALSE;
......@@ -18442,6 +18450,16 @@ buffer_pool_load_abort(
}
}
#if defined __linux__ || defined _WIN32
/** Update callback for SET GLOBAL innodb_log_file_buffering.
LOCK_global_system_variables (presumably held by the caller on entry)
is released while log_sys.set_buffered() runs and is reacquired
before returning.
@param save   pointer to the requested value, stored as my_bool */
static void innodb_log_file_buffering_update(THD *thd, st_mysql_sys_var*,
                                             void *, const void *save)
{
  mysql_mutex_unlock(&LOCK_global_system_variables);

  const bool buffered= *static_cast<const my_bool*>(save);
  log_sys.set_buffered(buffered);

  mysql_mutex_lock(&LOCK_global_system_variables);
}
#endif
/** Update innodb_status_output or innodb_status_output_locks,
which control InnoDB "status monitor" output to the error log.
@param[out] var current value
......@@ -18858,7 +18876,7 @@ static MYSQL_SYSVAR_ENUM(flush_method, srv_file_flush_method,
static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Path to InnoDB log files.", NULL, NULL, NULL);
"Path to ib_logfile0", NULL, NULL, NULL);
static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
PLUGIN_VAR_RQCMDARG,
......@@ -19250,6 +19268,13 @@ static MYSQL_SYSVAR_SIZE_T(log_buffer_size, log_sys.buf_size,
"Redo log buffer size in bytes.",
NULL, NULL, 16U << 20, 2U << 20, SIZE_T_MAX, 4096);
#if defined __linux__ || defined _WIN32
/* innodb_log_file_buffering: Boolean system variable stored directly in
log_sys.log_buffered, with FALSE as the compiled-in default. No check
callback is supplied (nullptr); run-time changes are applied through
innodb_log_file_buffering_update(). */
static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered,
PLUGIN_VAR_OPCMDARG,
"Whether the file system cache for ib_logfile0 is enabled",
nullptr, innodb_log_file_buffering_update, FALSE);
#endif
static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Redo log size in bytes.",
......@@ -19692,6 +19717,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(deadlock_report),
MYSQL_SYSVAR(page_size),
MYSQL_SYSVAR(log_buffer_size),
#if defined __linux__ || defined _WIN32
MYSQL_SYSVAR(log_file_buffering),
#endif
MYSQL_SYSVAR(log_file_size),
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(max_dirty_pages_pct),
......
......@@ -249,6 +249,16 @@ typedef srw_lock log_rwlock_t;
uint32_t format;
/** Log file */
log_file_t log;
#if defined __linux__ || defined _WIN32
/** whether file system caching is enabled for the log;
declared as my_bool because it is the storage of the
innodb_log_file_buffering system variable */
my_bool log_buffered;
# ifdef _WIN32
/** whether file system caching may be disabled
(a compile-time constant on Windows) */
static constexpr bool log_maybe_unbuffered= true;
# else
/** whether file system caching may be disabled
(a run-time property on non-Windows builds) */
bool log_maybe_unbuffered;
# endif
#endif
/** Fields involved in checkpoints @{ */
lsn_t log_capacity; /*!< capacity of the log; if
......@@ -289,10 +299,17 @@ typedef srw_lock log_rwlock_t;
bool is_opened() const noexcept { return log.is_opened(); }
static constexpr bool resize_in_progress() { return false; }
/** Rename a log file after resizing.
@return whether an error occurred */
static bool rename_resized() noexcept;
#if defined __linux__ || defined _WIN32
/** Try to enable or disable file system caching (update log_buffered) */
void set_buffered(bool buffered);
#endif
void attach(log_file_t file, os_offset_t size);
void close_file();
......
......@@ -209,6 +209,8 @@ void log_t::attach(log_file_t file, os_offset_t size)
#if defined __linux__ || defined _WIN32
set_block_size(CPU_LEVEL1_DCACHE_LINESIZE);
#endif
log_maybe_unbuffered= true;
log_buffered= false;
return;
}
}
......@@ -220,19 +222,12 @@ void log_t::attach(log_file_t file, os_offset_t size)
#endif
#if defined __linux__ || defined _WIN32
if (!block_size)
set_block_size(512);
# ifdef __linux__
else if (srv_file_flush_method != SRV_O_DSYNC &&
srv_file_flush_method != SRV_O_DIRECT &&
srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC)
sql_print_information("InnoDB: Buffered log writes (block size=%u bytes)",
sql_print_information("InnoDB: %s (block size=%u bytes)",
log_buffered
? "Buffered log writes"
: "File system buffers for log disabled",
block_size);
#endif
else
sql_print_information("InnoDB: File system buffers for log"
" disabled (block size=%u bytes)", block_size);
#endif
#ifdef HAVE_PMEM
checkpoint_buf= static_cast<byte*>(aligned_malloc(block_size, block_size));
......@@ -327,6 +322,62 @@ void log_t::close_file()
ib::fatal() << "closing ib_logfile0 failed: " << err;
}
#if defined __linux__ || defined _WIN32
/** Acquire every latch that protects the log.
Unless the log is in persistent memory, flush_lock is taken first and
write_lock second, each retried until it reports ACQUIRED. Finally
log_sys.latch is write-locked. */
static void log_resize_acquire()
{
  if (!log_sys.is_pmem())
  {
    /* Retry until we are the owner of flush_lock. */
    for (;;)
    {
      if (flush_lock.acquire(log_sys.get_lsn() + 1, nullptr) ==
          group_commit_lock::ACQUIRED)
        break;
    }

    /* Retry until we are the owner of write_lock. */
    for (;;)
    {
      if (write_lock.acquire(log_sys.get_lsn() + 1, nullptr) ==
          group_commit_lock::ACQUIRED)
        break;
    }
  }

  log_sys.latch.wr_lock(SRW_LOCK_CALL);
}
/** Release the latches that protect the log. */
void log_resize_release()
{
log_sys.latch.wr_unlock();
if (!log_sys.is_pmem())
{
lsn_t lsn1= write_lock.release(write_lock.value());
lsn_t lsn2= flush_lock.release(flush_lock.value());
if (lsn1 || lsn2)
log_write_up_to(std::max(lsn1, lsn2), true, nullptr);
}
}
/** Try to enable or disable file system caching (update log_buffered).
The request is ignored when caching cannot be disabled at all
(!log_maybe_unbuffered), when the log is in persistent memory, or when
high_level_read_only is set. Otherwise, under the log latches, the log
file is closed and reopened if it is open and the requested mode
differs from the current one.
@param buffered  whether the file system cache should be used */
void log_t::set_buffered(bool buffered)
{
  if (!log_maybe_unbuffered || is_pmem() || high_level_read_only)
    return;

  log_resize_acquire();

  const bool must_reopen= !resize_in_progress() && is_opened() &&
    bool(log_buffered) != buffered;

  if (must_reopen)
  {
    os_file_close_func(log.m_file);
    log.m_file= OS_FILE_CLOSED;

    const std::string log_path{get_log_file_path()};
    /* Store the requested mode before reopening the file. */
    log_buffered= buffered;

    bool opened;
    log.m_file= os_file_create_func(log_path.c_str(), OS_FILE_OPEN,
                                    OS_FILE_NORMAL, OS_LOG_FILE,
                                    false, &opened);
    ut_a(log.m_file != OS_FILE_CLOSED);

    /* Report the value of log_buffered as it is after the reopen. */
    const char *const mode= log_buffered
      ? "Buffered log writes"
      : "File system buffers for log disabled";
    sql_print_information("InnoDB: %s (block size=%u bytes)",
                          mode, block_size);
  }

  log_resize_release();
}
#endif
/** Write an aligned buffer to ib_logfile0.
@param buf buffer to be written
@param len length of data to be written
......
......@@ -1055,6 +1055,7 @@ os_file_create_simple_func(
we open the same file in the same mode, see man page of open(2). */
if (!srv_read_only_mode && *success) {
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
os_file_set_nocache(file, name, mode_str);
......@@ -1240,13 +1241,13 @@ os_file_create_func(
#if (defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)) || defined O_DIRECT
if (type == OS_DATA_FILE) {
# ifdef __linux__
use_o_direct:
# endif
switch (srv_file_flush_method) {
case SRV_O_DSYNC:
case SRV_O_DIRECT:
case SRV_O_DIRECT_NO_FSYNC:
# ifdef __linux__
use_o_direct:
# endif
os_file_set_nocache(file, name, mode_str);
break;
default:
......@@ -1263,9 +1264,6 @@ os_file_create_func(
goto skip_o_direct;
}
MSAN_STAT_WORKAROUND(&st);
if (st.st_size & 4095) {
goto skip_o_direct;
}
if (snprintf(b, sizeof b,
"/sys/dev/block/%u:%u/queue/physical_block_size",
major(st.st_dev), minor(st.st_dev))
......@@ -1298,11 +1296,16 @@ os_file_create_func(
if (s > 4096 || s < 64 || !ut_is_2pow(s)) {
goto skip_o_direct;
}
log_sys.log_maybe_unbuffered= true;
log_sys.set_block_size(uint32_t(s));
if (!log_sys.log_buffered && !(st.st_size & (s - 1))) {
goto use_o_direct;
}
} else {
skip_o_direct:
log_sys.set_block_size(0);
log_sys.log_maybe_unbuffered= false;
log_sys.log_buffered= true;
log_sys.set_block_size(512);
}
}
# endif
......@@ -2057,7 +2060,7 @@ os_file_create_directory(
}
/** Get disk sector size for a file. */
size_t get_sector_size(HANDLE file)
static size_t get_sector_size(HANDLE file)
{
FILE_STORAGE_INFO fsi;
ULONG s= 4096;
......@@ -2065,10 +2068,8 @@ size_t get_sector_size(HANDLE file)
{
s= fsi.PhysicalBytesPerSectorForPerformance;
if (s > 4096 || s < 64 || !ut_is_2pow(s))
{
return 4096;
}
}
return s;
}
......@@ -2165,8 +2166,9 @@ os_file_create_func(
? FILE_FLAG_OVERLAPPED : 0;
if (type == OS_LOG_FILE) {
if(srv_flush_log_at_trx_commit != 2 && !log_sys.is_opened())
if (!log_sys.is_opened() && !log_sys.log_buffered) {
attributes|= FILE_FLAG_NO_BUFFERING;
}
if (srv_file_flush_method == SRV_O_DSYNC)
attributes|= FILE_FLAG_WRITE_THROUGH;
}
......@@ -2197,21 +2199,22 @@ os_file_create_func(
name, access, share_mode, my_win_file_secattr(),
create_flag, attributes, NULL);
if (file != INVALID_HANDLE_VALUE && type == OS_LOG_FILE
&& (attributes & FILE_FLAG_NO_BUFFERING)) {
uint32 s= (uint32_t) get_sector_size(file);
log_sys.set_block_size(uint32_t(s));
/* FIXME! remove it when backup is fixed, so that it
does not produce redo with irregular sizes.*/
*success = file != INVALID_HANDLE_VALUE;
if (*success && type == OS_LOG_FILE) {
uint32_t s = uint32_t(get_sector_size(file));
log_sys.set_block_size(s);
if (attributes & FILE_FLAG_NO_BUFFERING) {
if (os_file_get_size(file) % s) {
attributes &= ~FILE_FLAG_NO_BUFFERING;
create_flag = OPEN_ALWAYS;
CloseHandle(file);
continue;
}
log_sys.log_buffered = false;
}
}
*success = (file != INVALID_HANDLE_VALUE);
if (*success) {
break;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment