Commit 4c3ad244 authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-27416 InnoDB hang in buf_flush_wait_flushed(), on log checkpoint

InnoDB could sometimes hang when triggering a log checkpoint. This is
due to commit 7b1252c0 (MDEV-24278),
which introduced an untimed wait to buf_flush_page_cleaner().

The hang was noticed by occasional failures of IMPORT TABLESPACE tests,
such as innodb.innodb-wl5522, which would (unnecessarily) invoke
log_make_checkpoint() from row_import_cleanup().

The reason of the hang was that buf_flush_page_cleaner() would enter
untimed sleep despite buf_flush_sync_lsn being set. The exact failure
scenario is unclear, because buf_flush_sync_lsn should actually be
protected by buf_pool.flush_list_mutex. We prevent the hang by
invoking buf_pool.page_cleaner_set_idle(false) whenever we are
setting buf_flush_sync_lsn and signaling buf_pool.do_flush_list.

The bulk of these changes was originally developed as a preparation
for MDEV-26827, to invoke buf_flush_list() from fewer threads,
and tested on 10.6 by Matthias Leich.

This fix was tested by running 100 repetitions of 100 concurrent instances
of the test innodb.innodb-wl5522 on a RelWithDebInfo build, using ext4fs
and innodb_flush_method=O_DIRECT on a SATA SSD with 4096-byte block size.
During the test, the call to log_make_checkpoint() in row_import_cleanup()
was present.

buf_flush_list(): Make static.

buf_flush_wait(): Wait for buf_pool.get_oldest_modification()
to reach a target, by work done in the buf_flush_page_cleaner.
If buf_flush_sync_lsn is going to be set, we will invoke
buf_pool.page_cleaner_set_idle(false).

buf_flush_ahead(): If buf_flush_sync_lsn or buf_flush_async_lsn
is going to be set and the page cleaner woken up, we will invoke
buf_pool.page_cleaner_set_idle(false).

buf_flush_wait_flushed(): Invoke buf_flush_wait().

buf_flush_sync(): Invoke recv_sys.apply() at the start in case
crash recovery is active. Invoke buf_flush_wait().

buf_flush_sync_batch(): A lower-level variant of buf_flush_sync()
that is only called by recv_sys_t::apply().

buf_flush_sync_for_checkpoint(): Do not trigger log apply
or checkpoint during recovery.

buf_dblwr_t::create(): Only initiate a buffer pool flush, not
a checkpoint.

row_import_cleanup(): Do not unnecessarily invoke log_make_checkpoint().
Invoking buf_flush_list_space() before starting to generate redo log
for the imported tablespace should suffice.

srv_prepare_to_delete_redo_log_file():
Set recv_sys.recovery_on in order to prevent
buf_flush_sync_for_checkpoint() from initiating a checkpoint
while the log is inaccessible. Remove a wait loop that is already
part of buf_flush_sync().
Do not invoke fil_names_clear() if the log is being upgraded,
because the FILE_MODIFY record is specific to the latest format.

create_log_file(): Clear recv_sys.recovery_on only after calling
log_make_checkpoint(), to prevent buf_flush_page_cleaner from
invoking a checkpoint.

innodb_shutdown(): Simplify the logic in mariadb-backup --prepare.

os_aio_wait_until_no_pending_writes(): Update the function comment.
Apart from row_quiesce_table_start() during FLUSH TABLES...FOR EXPORT,
this is being called by buf_flush_list_space(), which is invoked
by ALTER TABLE...IMPORT TABLESPACE as well as some encryption operations.
parent eab89f14
......@@ -3300,10 +3300,7 @@ buf_page_get_low(
fix_block->fix();
mysql_mutex_unlock(&buf_pool.mutex);
buf_flush_list();
buf_flush_wait_batch_end_acquiring_mutex(false);
while (buf_flush_list_space(space));
os_aio_wait_until_no_pending_writes();
buf_flush_sync();
if (fix_block->page.buf_fix_count() == 1
&& !fix_block->page.oldest_modification()) {
......
......@@ -219,8 +219,7 @@ bool buf_dblwr_t::create()
trx_sys_block->frame, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N);
mtr.commit();
/* Flush the modified pages to disk and make a checkpoint */
log_make_checkpoint();
buf_flush_wait_flushed(mtr.commit_lsn());
/* Remove doublewrite pages from LRU */
buf_pool_invalidate();
......
......@@ -1538,7 +1538,7 @@ void buf_flush_wait_batch_end(bool lru)
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
ulint buf_flush_list(ulint max_n, lsn_t lsn)
static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX)
{
ut_ad(lsn);
......@@ -1833,6 +1833,30 @@ ATTRIBUTE_COLD void log_make_checkpoint()
while (!log_checkpoint());
}
/** Wait for all dirty pages up to an LSN to be written out.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
static void buf_flush_wait(lsn_t lsn)
{
ut_ad(lsn <= log_sys.get_lsn());
while (buf_pool.get_oldest_modification(lsn) < lsn)
{
if (buf_flush_sync_lsn < lsn)
{
buf_flush_sync_lsn= lsn;
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
}
my_cond_wait(&buf_pool.done_flush_list,
&buf_pool.flush_list_mutex.m_mutex);
}
/* Wait for the checkpoint. */
while (buf_flush_sync_lsn)
my_cond_wait(&buf_pool.done_flush_list,
&buf_pool.flush_list_mutex.m_mutex);
}
/** Wait until all persistent pages are flushed up to a limit.
@param sync_lsn buf_pool.get_oldest_modification(LSN_MAX) to wait for */
ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
......@@ -1849,9 +1873,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
if (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn)
{
MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
#if 1 /* FIXME: remove this, and guarantee that the page cleaner serves us */
if (UNIV_UNLIKELY(!buf_page_cleaner_is_active)
ut_d(|| innodb_page_cleaner_disabled_debug))
if (UNIV_UNLIKELY(!buf_page_cleaner_is_active))
{
do
{
......@@ -1864,35 +1888,21 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
MONITOR_FLUSH_SYNC_COUNT,
MONITOR_FLUSH_SYNC_PAGES, n_pages);
}
MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
}
while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
goto try_checkpoint;
}
else
#endif
if (buf_flush_sync_lsn < sync_lsn)
{
buf_flush_sync_lsn= sync_lsn;
pthread_cond_signal(&buf_pool.do_flush_list);
}
do
{
tpool::tpool_wait_begin();
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
my_cond_wait(&buf_pool.done_flush_list,
&buf_pool.flush_list_mutex.m_mutex);
thd_wait_end(nullptr);
tpool::tpool_wait_begin();
buf_flush_wait(sync_lsn);
tpool::tpool_wait_end();
MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
thd_wait_end(nullptr);
}
while (buf_pool.get_oldest_modification(sync_lsn) < sync_lsn);
}
try_checkpoint:
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (UNIV_UNLIKELY(log_sys.last_checkpoint_lsn < sync_lsn))
......@@ -1925,8 +1935,11 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
{
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (limit < lsn)
{
limit= lsn;
pthread_cond_signal(&buf_pool.do_flush_list);
buf_pool.page_cleaner_set_idle(false);
pthread_cond_signal(&buf_pool.do_flush_list);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
}
}
......@@ -1960,10 +1973,6 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
MONITOR_FLUSH_SYNC_PAGES, n_flushed);
}
/* Attempt to perform a log checkpoint upon completing each batch. */
if (recv_recovery_is_on())
recv_sys.apply(true);
switch (srv_file_flush_method) {
case SRV_NOSYNC:
case SRV_O_DIRECT_NO_FSYNC:
......@@ -1980,7 +1989,8 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
mysql_mutex_unlock(&log_sys.flush_order_mutex);
const lsn_t checkpoint_lsn= measure ? measure : newest_lsn;
if (checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
if (!recv_recovery_is_on() &&
checkpoint_lsn > log_sys.last_checkpoint_lsn + SIZE_OF_FILE_CHECKPOINT)
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
log_checkpoint_low(checkpoint_lsn, newest_lsn);
......@@ -2004,7 +2014,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
else if (measure >= buf_flush_async_lsn)
buf_flush_async_lsn= 0;
/* wake up buf_flush_wait_flushed() */
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
lsn= std::max(lsn, target);
......@@ -2264,7 +2274,7 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
if (UNIV_UNLIKELY(lsn_limit != 0))
{
buf_flush_sync_lsn= 0;
/* wake up buf_flush_wait_flushed() */
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
}
unemployed:
......@@ -2334,7 +2344,7 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
if (UNIV_UNLIKELY(lsn_limit != 0))
{
n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
/* wake up buf_flush_wait_flushed() */
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
goto try_checkpoint;
}
......@@ -2440,6 +2450,7 @@ ATTRIBUTE_COLD void buf_flush_page_cleaner_init()
os_thread_create(buf_flush_page_cleaner);
}
#if defined(HAVE_SYSTEMD) && !defined(EMBEDDED_LIBRARY)
/** @return the number of dirty pages in the buffer pool */
static ulint buf_flush_list_length()
{
......@@ -2448,6 +2459,7 @@ static ulint buf_flush_list_length()
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
return len;
}
#endif
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool()
......@@ -2458,13 +2470,15 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"Waiting to flush the buffer pool");
while (buf_pool.n_flush_list() || buf_flush_list_length())
mysql_mutex_lock(&buf_pool.flush_list_mutex);
while (buf_pool.get_oldest_modification(0))
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_flush_list(srv_max_io_capacity);
timespec abstime;
if (buf_pool.n_flush_list())
{
timespec abstime;
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"Waiting to flush " ULINTPF " pages",
buf_flush_list_length());
......@@ -2475,24 +2489,48 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
&abstime);
mysql_mutex_unlock(&buf_pool.mutex);
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(!buf_pool.any_io_pending());
}
/** Synchronously flush dirty blocks during recv_sys_t::apply().
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync_batch(lsn_t lsn)
{
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
tpool::tpool_wait_begin();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_flush_wait(lsn);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
tpool::tpool_wait_end();
thd_wait_end(nullptr);
}
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync()
{
ut_ad(!sync_check_iterate(dict_sync_check()));
if (recv_recovery_is_on())
recv_sys.apply(true);
thd_wait_begin(nullptr, THD_WAIT_DISKIO);
tpool::tpool_wait_begin();
mysql_mutex_lock(&buf_pool.flush_list_mutex);
for (;;)
{
const ulint n_flushed= buf_flush_list(srv_max_io_capacity);
buf_flush_wait_batch_end_acquiring_mutex(false);
if (!n_flushed && !buf_flush_list_length())
return;
const lsn_t lsn= log_sys.get_lsn();
buf_flush_wait(lsn);
if (lsn == log_sys.get_lsn())
break;
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
tpool::tpool_wait_end();
thd_wait_end(nullptr);
}
#ifdef UNIV_DEBUG
......
......@@ -85,13 +85,6 @@ buf_flush_init_for_writing(
void* page_zip_,
bool use_full_checksum);
/** Write out dirty blocks from buf_pool.flush_list.
@param max_n wished maximum mumber of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX);
/** Try to flush dirty pages that belong to a given tablespace.
@param space tablespace
@param n_flushed number of pages written
......@@ -144,6 +137,10 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool();
void buf_flush_validate();
#endif /* UNIV_DEBUG */
/** Synchronously flush dirty blocks during recv_sys_t::apply().
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync_batch(lsn_t lsn);
/** Synchronously flush dirty blocks.
NOTE: The calling thread is not allowed to hold any buffer page latches! */
void buf_flush_sync();
......
......@@ -1111,8 +1111,7 @@ void os_aio_free();
@retval DB_IO_ERROR on I/O error */
dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n);
/** Wait until there are no pending asynchronous writes.
Only used on FLUSH TABLES...FOR EXPORT. */
/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes();
......
......@@ -511,7 +511,7 @@ do { \
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Performance schema stage event for monitoring ALTER TABLE progress
everything after flush log_make_checkpoint(). */
in ha_innobase::commit_inplace_alter_table(). */
extern PSI_stage_info srv_stage_alter_table_end;
/** Performance schema stage event for monitoring ALTER TABLE progress
......
......@@ -2752,7 +2752,7 @@ void recv_sys_t::apply(bool last_batch)
/* Instead of flushing, last_batch could sort the buf_pool.flush_list
in ascending order of buf_page_t::oldest_modification. */
buf_flush_sync();
buf_flush_sync_batch(recovered_lsn);
if (!last_batch)
{
......
......@@ -3821,8 +3821,7 @@ static void os_aio_wait_until_no_pending_writes_low()
tpool::tpool_wait_end();
}
/** Wait until there are no pending asynchronous writes.
Only used on FLUSH TABLES...FOR EXPORT. */
/** Wait until there are no pending asynchronous writes. */
void os_aio_wait_until_no_pending_writes()
{
os_aio_wait_until_no_pending_writes_low();
......
......@@ -2211,8 +2211,6 @@ row_import_cleanup(
DBUG_EXECUTE_IF("ib_import_before_checkpoint_crash", DBUG_SUICIDE(););
log_make_checkpoint();
return(err);
}
......
......@@ -557,7 +557,7 @@ char srv_buffer_pool_load_at_startup = TRUE;
#ifdef HAVE_PSI_STAGE_INTERFACE
/** Performance schema stage event for monitoring ALTER TABLE progress
everything after flush log_make_checkpoint(). */
in ha_innobase::commit_inplace_alter_table(). */
PSI_stage_info srv_stage_alter_table_end
= {0, "alter table (end)", PSI_FLAG_STAGE_PROGRESS};
......
......@@ -329,6 +329,13 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn,
log_sys.log.write_header_durable(lsn);
ut_ad(srv_startup_is_before_trx_rollback_phase);
if (create_new_db) {
srv_startup_is_before_trx_rollback_phase = false;
}
/* Enable checkpoints in buf_flush_page_cleaner(). */
recv_sys.recovery_on = false;
mysql_mutex_unlock(&log_sys.mutex);
log_make_checkpoint();
......@@ -895,91 +902,74 @@ buffer pools. Flush the redo log buffer to the redo log file.
@return lsn upto which data pages have been flushed. */
static lsn_t srv_prepare_to_delete_redo_log_file(bool old_exists)
{
DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
DBUG_ENTER("srv_prepare_to_delete_redo_log_file");
lsn_t flushed_lsn;
ulint count = 0;
/* Disable checkpoints in the page cleaner. */
ut_ad(!recv_sys.recovery_on);
recv_sys.recovery_on= true;
if (log_sys.log.subformat != 2) {
srv_log_file_size = 0;
}
/* Clean the buffer pool. */
buf_flush_sync();
for (;;) {
/* Clean the buffer pool. */
buf_flush_sync();
if (log_sys.log.subformat != 2)
srv_log_file_size= 0;
DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
DBUG_EXECUTE_IF("innodb_log_abort_1", DBUG_RETURN(0););
DBUG_PRINT("ib_log", ("After innodb_log_abort_1"));
mysql_mutex_lock(&log_sys.mutex);
fil_names_clear(log_sys.get_lsn(), false);
flushed_lsn = log_sys.get_lsn();
{
ib::info info;
if (srv_log_file_size == 0
|| (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED)
!= log_t::FORMAT_10_5) {
info << "Upgrading redo log: ";
} else if (!old_exists
|| srv_log_file_size
!= srv_log_file_size_requested) {
if (srv_encrypt_log
== (my_bool)log_sys.is_encrypted()) {
info << (srv_encrypt_log
? "Resizing encrypted"
: "Resizing");
} else if (srv_encrypt_log) {
info << "Encrypting and resizing";
} else {
info << "Removing encryption"
" and resizing";
}
info << " redo log from " << srv_log_file_size
<< " to ";
} else if (srv_encrypt_log) {
info << "Encrypting redo log: ";
} else {
info << "Removing redo log encryption: ";
}
info << srv_log_file_size_requested
<< " bytes; LSN=" << flushed_lsn;
}
mysql_mutex_lock(&log_sys.mutex);
const bool latest_format= (log_sys.log.format & ~log_t::FORMAT_ENCRYPTED) ==
log_t::FORMAT_10_5;
lsn_t flushed_lsn= log_sys.get_lsn();
mysql_mutex_unlock(&log_sys.mutex);
if (latest_format)
{
fil_names_clear(flushed_lsn, false);
flushed_lsn= log_sys.get_lsn();
}
if (flushed_lsn != log_sys.get_flushed_lsn()) {
log_write_up_to(flushed_lsn, false);
log_sys.log.flush();
}
{
const char *msg;
if (!latest_format || srv_log_file_size == 0)
{
msg= "Upgrading redo log: ";
same_size:
ib::info() << msg << srv_log_file_size_requested << " bytes; LSN="
<< flushed_lsn;
}
else if (old_exists && srv_log_file_size == srv_log_file_size_requested)
{
msg= srv_encrypt_log
? "Encrypting redo log: " : "Removing redo log encryption: ";
goto same_size;
}
else
{
if (srv_encrypt_log == (my_bool)log_sys.is_encrypted())
msg= srv_encrypt_log ? "Resizing encrypted" : "Resizing";
else
msg= srv_encrypt_log
? "Encrypting and resizing"
: "Removing encryption and resizing";
ib::info() << msg << " redo log from " << srv_log_file_size << " to "
<< srv_log_file_size_requested
<< " bytes; LSN=" << flushed_lsn;
}
}
ut_ad(flushed_lsn == log_sys.get_lsn());
/* Check if the buffer pools are clean. If not
retry till it is clean. */
if (ulint pending_io = buf_pool.io_pending()) {
count++;
/* Print a message every 60 seconds if we
are waiting to clean the buffer pools */
if (srv_print_verbose_log && count > 600) {
ib::info() << "Waiting for "
<< pending_io << " buffer "
<< "page I/Os to complete";
count = 0;
}
mysql_mutex_unlock(&log_sys.mutex);
os_thread_sleep(100000);
continue;
}
if (flushed_lsn != log_sys.get_flushed_lsn())
{
log_write_up_to(flushed_lsn, false);
log_sys.log.flush();
}
break;
}
ut_ad(flushed_lsn == log_sys.get_lsn());
ut_ad(!buf_pool.any_io_pending());
DBUG_RETURN(flushed_lsn);
DBUG_RETURN(flushed_lsn);
}
/** Tries to locate LOG_FILE_NAME and check it's size, etc
......@@ -1259,7 +1249,7 @@ dberr_t srv_start(bool create_new_db)
ut_ad(buf_page_cleaner_is_active);
}
srv_startup_is_before_trx_rollback_phase = !create_new_db;
srv_startup_is_before_trx_rollback_phase = true;
/* Check if undo tablespaces and redo log files exist before creating
a new system tablespace */
......@@ -1308,7 +1298,6 @@ dberr_t srv_start(bool create_new_db)
if (create_new_db) {
flushed_lsn = log_sys.get_lsn();
log_sys.set_flushed_lsn(flushed_lsn);
buf_flush_sync();
err = create_log_file(true, flushed_lsn, logfile0);
......@@ -1371,6 +1360,9 @@ dberr_t srv_start(bool create_new_db)
if (!log_set_capacity(srv_log_file_size_requested)) {
return(srv_init_abort(DB_ERROR));
}
/* Enable checkpoints in the page cleaner. */
recv_sys.recovery_on = false;
}
file_checked:
......@@ -2024,11 +2016,8 @@ void innodb_shutdown()
break;
case SRV_OPERATION_RESTORE:
case SRV_OPERATION_RESTORE_EXPORT:
srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
if (!buf_page_cleaner_is_active) {
break;
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
while (buf_page_cleaner_is_active) {
pthread_cond_signal(&buf_pool.do_flush_list);
my_cond_wait(&buf_pool.done_flush_list,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment