Commit 8af53897 authored by Marko Mäkelä

MDEV-25801: buf_flush_dirty_pages() is very slow

In commit 7cffb5f6 (MDEV-23399)
the implementation of buf_flush_dirty_pages() was replaced with
a slow one, which would perform excessive scans of the
buf_pool.flush_list and make little progress.

buf_flush_list(), buf_flush_LRU(): Split from buf_flush_lists().
Vladislav Vaintroub noticed that we will not need to invoke
log_flush_task.wait() for the LRU eviction flushing.

buf_flush_list_space(): Replaces buf_flush_dirty_pages().
This is like buf_flush_list(), but operating on a single
tablespace at a time. Writes at most innodb_io_capacity
pages. Returns whether some of the tablespace might remain
in the buffer pool.
parent 762bcb81
......@@ -1729,10 +1729,10 @@ inline bool buf_pool_t::withdraw_blocks()
/* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
ulint n_flushed = buf_flush_lists(
ulint n_flushed = buf_flush_LRU(
std::max<ulint>(withdraw_target
- UT_LIST_GET_LEN(withdraw),
srv_LRU_scan_depth), 0);
srv_LRU_scan_depth));
buf_flush_wait_batch_end_acquiring_mutex(true);
if (n_flushed) {
......@@ -3321,7 +3321,7 @@ buf_page_get_low(
fix_block->fix();
mysql_mutex_unlock(&buf_pool.mutex);
buf_flush_lists(ULINT_UNDEFINED, LSN_MAX);
buf_flush_list();
buf_flush_wait_batch_end_acquiring_mutex(false);
if (fix_block->page.buf_fix_count() == 1
......
......@@ -258,33 +258,6 @@ void buf_flush_remove_pages(ulint id)
mysql_mutex_unlock(&buf_pool.mutex);
}
/** Count the dirty pages of a tablespace in buf_pool.flush_list and,
if there are any, start a flush batch on the whole buf_pool.flush_list.
Note that the batch itself is not restricted to the given tablespace.
@param id tablespace identifier
@return the number of dirty pages that were found for this tablespace */
ulint buf_flush_dirty_pages(ulint id)
{
  ut_ad(!sync_check_iterate(dict_sync_check()));

  ulint n_dirty= 0;

  /* The whole list is traversed while holding buf_pool.flush_list_mutex. */
  mysql_mutex_lock(&buf_pool.flush_list_mutex);

  buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.flush_list);
  while (bpage)
  {
    ut_d(const auto s= bpage->state());
    ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
          s == BUF_BLOCK_REMOVE_HASH);
    ut_ad(bpage->oldest_modification());

    if (id == bpage->id().space())
      ++n_dirty;

    bpage= UT_LIST_GET_NEXT(list, bpage);
  }

  mysql_mutex_unlock(&buf_pool.flush_list_mutex);

  /* Only initiate a flush batch when at least one matching page was seen. */
  if (n_dirty != 0)
    buf_flush_lists(srv_max_io_capacity, LSN_MAX);

  return n_dirty;
}
/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage have already been
......@@ -1431,11 +1404,6 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
mysql_mutex_lock(&buf_pool.flush_list_mutex);
ulint len= UT_LIST_GET_LEN(buf_pool.flush_list);
/* In order not to degenerate this scan to O(n*n) we attempt to
preserve pointer of previous block in the flush list. To do so we
declare it a hazard pointer. Any thread working on the flush list
must check the hazard pointer and if it is removing the same block
then it must reset it. */
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list);
bpage && len && count < max_n;
bpage= buf_pool.flush_hp.get(), ++scanned, len--)
......@@ -1444,16 +1412,28 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
if (oldest_modification >= lsn)
break;
ut_ad(oldest_modification);
ut_ad(bpage->in_file());
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
if (!bpage->ready_for_flush())
{
bpage= prev;
continue;
}
/* In order not to degenerate this scan to O(n*n) we attempt to
preserve the pointer position. Any thread that would remove 'prev'
from buf_pool.flush_list must adjust the hazard pointer.
Note: A concurrent execution of buf_flush_list_space() may
terminate this scan prematurely. The buf_pool.n_flush_list()
should prevent multiple threads from executing
buf_do_flush_list_batch() concurrently,
but buf_flush_list_space() is ignoring that. */
buf_pool.flush_hp.set(prev);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(bpage->in_file());
const bool flushed= bpage->ready_for_flush();
if (flushed)
{
const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space();
if (!space || space->id != space_id)
......@@ -1481,7 +1461,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
mysql_mutex_unlock(&buf_pool.mutex);
count+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
false, count, max_n);
reacquire_mutex:
reacquire_mutex:
mysql_mutex_lock(&buf_pool.mutex);
}
else if (buf_flush_page(bpage, false, space))
......@@ -1489,10 +1469,9 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
++count;
goto reacquire_mutex;
}
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
ut_ad(flushed || buf_pool.flush_hp.is_hp(prev));
bpage= buf_pool.flush_hp.get();
}
buf_pool.flush_hp.set(nullptr);
......@@ -1537,51 +1516,189 @@ void buf_flush_wait_batch_end(bool lru)
/** Write out dirty blocks from buf_pool.flush_list.
@param max_n wished maximum number of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target (0=LRU flush)
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a batch of the same type (lsn==0 or lsn!=0) is already running */
ulint buf_flush_lists(ulint max_n, lsn_t lsn)
@retval 0 if a buf_pool.flush_list batch is already running */
ulint buf_flush_list(ulint max_n, lsn_t lsn)
{
auto &n_flush= lsn ? buf_pool.n_flush_list : buf_pool.n_flush_LRU;
ut_ad(lsn);
if (n_flush)
if (buf_pool.n_flush_list)
return 0;
auto cond= lsn ? &buf_pool.done_flush_list : &buf_pool.done_flush_LRU;
mysql_mutex_lock(&buf_pool.mutex);
const bool running= n_flush != 0;
const bool running= buf_pool.n_flush_list != 0;
/* FIXME: we are performing a dirty read of buf_pool.flush_list.count
while not holding buf_pool.flush_list_mutex */
if (running || (lsn && !UT_LIST_GET_LEN(buf_pool.flush_list)))
if (running || !UT_LIST_GET_LEN(buf_pool.flush_list))
{
if (!running)
pthread_cond_broadcast(cond);
pthread_cond_broadcast(&buf_pool.done_flush_list);
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
n_flush++;
ulint n_flushed= lsn
? buf_do_flush_list_batch(max_n, lsn)
: buf_do_LRU_batch(max_n);
buf_pool.n_flush_list++;
const auto n_flushing= --n_flush;
ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
const auto n_flushing= --buf_pool.n_flush_list;
buf_pool.try_LRU_scan= true;
mysql_mutex_unlock(&buf_pool.mutex);
if (!n_flushing)
pthread_cond_broadcast(cond);
pthread_cond_broadcast(&buf_pool.done_flush_list);
buf_dblwr.flush_buffered_writes();
DBUG_PRINT("ib_buf", ("%s completed, " ULINTPF " pages",
lsn ? "flush_list" : "LRU flush", n_flushed));
DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
return n_flushed;
}
/** Try to flush dirty pages that belong to a given tablespace.
The scan walks buf_pool.flush_list from its last element towards the
first and stops once srv_io_capacity pages have been written by this call,
so a caller that needs the whole tablespace flushed must call this
repeatedly until it returns false.
@param space tablespace
@param n_flushed if not nullptr, incremented once for each page written
@return whether any pages might not have been flushed */
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
{
const auto space_id= space->id;
ut_ad(space_id <= SRV_SPACE_ID_UPPER_BOUND);
bool may_have_skipped= false;
/* Remaining number of page writes allowed in this invocation. */
ulint max_n_flush= srv_io_capacity;
/* Lock order: buf_pool.mutex first, then buf_pool.flush_list_mutex. */
mysql_mutex_lock(&buf_pool.mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
/* If the tablespace cannot be acquired, its pages are discarded
(see was_freed below) instead of being written. */
bool acquired= space->acquire();
buf_flush_freed_pages(space);
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
{
ut_d(const auto s= bpage->state());
ut_ad(s == BUF_BLOCK_ZIP_PAGE || s == BUF_BLOCK_FILE_PAGE ||
s == BUF_BLOCK_REMOVE_HASH);
ut_ad(bpage->oldest_modification());
ut_ad(bpage->in_file());
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
/* A page of another tablespace is ignored; the empty statement is
intentional and control falls through to "bpage= prev" below. */
if (bpage->id().space() != space_id);
else if (!bpage->ready_for_flush())
may_have_skipped= true;
else
{
/* In order not to degenerate this scan to O(n*n) we attempt to
preserve the pointer position. Any thread that would remove 'prev'
from buf_pool.flush_list must adjust the hazard pointer.
Note: Multiple executions of buf_flush_list_space() may be
interleaved, and also buf_do_flush_list_batch() may be running
concurrently. This may terminate our iteration prematurely,
leading us to return may_have_skipped=true. */
buf_pool.flush_hp.set(prev);
/* From here on only buf_pool.mutex is held, until
buf_pool.flush_list_mutex is re-acquired further below. */
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!acquired)
{
was_freed:
buf_flush_discard_page(bpage);
}
else
{
if (space->is_stopping())
{
/* The tablespace is being stopped: drop our reference and discard
this and all further matching pages rather than writing them. */
space->release();
acquired= false;
goto was_freed;
}
if (!buf_flush_page(bpage, false, space))
{
/* buf_pool.mutex is not re-acquired on this path, so buf_flush_page()
presumably leaves it held when it returns false - TODO confirm. */
may_have_skipped= true;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
goto next_after_skip;
}
if (n_flushed)
++*n_flushed;
if (!--max_n_flush)
{
/* The write quota is used up. Re-acquire both mutexes (the code after
the loop releases both) and report that pages may remain. */
mysql_mutex_lock(&buf_pool.mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
may_have_skipped= true;
break;
}
/* buf_pool.mutex is re-acquired here, so a successful buf_flush_page()
presumably released it - TODO confirm. */
mysql_mutex_lock(&buf_pool.mutex);
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
/* If the hazard pointer no longer equals 'prev', it was changed while
buf_pool.flush_list_mutex was released, and some pages may be missed. */
if (!buf_pool.flush_hp.is_hp(prev))
may_have_skipped= true;
next_after_skip:
/* Continue from the hazard pointer rather than from 'prev'. */
bpage= buf_pool.flush_hp.get();
continue;
}
bpage= prev;
}
/* Note: this loop may have been executed concurrently with
buf_do_flush_list_batch() as well as other threads executing
buf_flush_list_space(). We should always return true from
buf_flush_list_space() if that should be the case; in
buf_do_flush_list_batch() we will simply perform less work. */
buf_pool.flush_hp.set(nullptr);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_pool.try_LRU_scan= true;
mysql_mutex_unlock(&buf_pool.mutex);
if (acquired)
space->release();
/* For a tablespace that is being imported, wait for the pending writes;
otherwise flush the writes buffered in buf_dblwr. */
if (space->purpose == FIL_TYPE_IMPORT)
os_aio_wait_until_no_pending_writes();
else
buf_dblwr.flush_buffered_writes();
return may_have_skipped;
}
/** Write out dirty blocks from buf_pool.LRU.
@param max_n wished maximum number of blocks flushed
@return the number of processed pages
@retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n)
{
  /* Early exit without taking buf_pool.mutex; the same condition is
  checked again below while holding the mutex. */
  if (buf_pool.n_flush_LRU)
    return 0;

  log_buffer_flush_to_disk(true);

  mysql_mutex_lock(&buf_pool.mutex);

  const bool already_running= buf_pool.n_flush_LRU != 0;
  if (already_running)
  {
    mysql_mutex_unlock(&buf_pool.mutex);
    return 0;
  }

  /* Register this batch, run it, and unregister it again, all while
  buf_pool.mutex is held by this function. */
  ++buf_pool.n_flush_LRU;
  ulint n_flushed= buf_do_LRU_batch(max_n);
  const bool no_batch_left= --buf_pool.n_flush_LRU == 0;

  buf_pool.try_LRU_scan= true;

  mysql_mutex_unlock(&buf_pool.mutex);

  /* Wake up the waiters on done_flush_LRU once no LRU batch remains. */
  if (no_batch_left)
    pthread_cond_broadcast(&buf_pool.done_flush_LRU);

  buf_dblwr.flush_buffered_writes();

  DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed));
  return n_flushed;
}
/** Initiate a log checkpoint, discarding the start of the log.
@param oldest_lsn the checkpoint LSN
......@@ -1715,7 +1832,7 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
do
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ulint n_pages= buf_flush_lists(srv_max_io_capacity, sync_lsn);
ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
buf_flush_wait_batch_end_acquiring_mutex(false);
if (n_pages)
{
......@@ -1810,7 +1927,7 @@ ATTRIBUTE_COLD static void buf_flush_sync_for_checkpoint(lsn_t lsn)
{
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (ulint n_flushed= buf_flush_lists(srv_max_io_capacity, lsn))
if (ulint n_flushed= buf_flush_list(srv_max_io_capacity, lsn))
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
MONITOR_FLUSH_SYNC_COUNT,
......@@ -2173,14 +2290,14 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
if (UNIV_UNLIKELY(lsn_limit != 0))
{
n_flushed= buf_flush_lists(srv_max_io_capacity, lsn_limit);
n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit);
/* wake up buf_flush_wait_flushed() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
goto try_checkpoint;
}
else if (idle_flush || !srv_adaptive_flushing)
{
n_flushed= buf_flush_lists(srv_io_capacity, LSN_MAX);
n_flushed= buf_flush_list(srv_io_capacity);
try_checkpoint:
if (n_flushed)
{
......@@ -2207,7 +2324,7 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*)
{
page_cleaner.flush_pass++;
const ulint tm= ut_time_ms();
last_pages= n_flushed= buf_flush_lists(n, LSN_MAX);
last_pages= n_flushed= buf_flush_list(n);
page_cleaner.flush_time+= ut_time_ms() - tm;
if (n_flushed)
......@@ -2299,7 +2416,7 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
while (buf_pool.n_flush_list || buf_flush_list_length())
{
buf_flush_lists(srv_max_io_capacity, LSN_MAX);
buf_flush_list(srv_max_io_capacity);
timespec abstime;
if (buf_pool.n_flush_list)
......@@ -2327,7 +2444,7 @@ void buf_flush_sync()
for (;;)
{
const ulint n_flushed= buf_flush_lists(srv_max_io_capacity, LSN_MAX);
const ulint n_flushed= buf_flush_list(srv_max_io_capacity);
buf_flush_wait_batch_end_acquiring_mutex(false);
if (!n_flushed && !buf_flush_list_length())
return;
......
......@@ -487,7 +487,7 @@ buf_block_t* buf_LRU_get_free_block(bool have_mutex)
involved (particularly in case of ROW_FORMAT=COMPRESSED pages). We
can do that in a separate patch sometime in future. */
if (!buf_flush_lists(innodb_lru_flush_size, 0)) {
if (!buf_flush_LRU(innodb_lru_flush_size)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures;
}
......
......@@ -1098,7 +1098,7 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space)
mtr.commit();
/* 4 - sync tablespace before publishing crypt data */
while (buf_flush_dirty_pages(space->id));
while (buf_flush_list_space(space));
/* 5 - publish crypt data */
mutex_enter(&fil_crypt_threads_mutex);
......@@ -2036,14 +2036,7 @@ fil_crypt_flush_space(
if (end_lsn > 0 && !space->is_stopping()) {
ulint sum_pages = 0;
const ulonglong start = my_interval_timer();
do {
ulint n_dirty= buf_flush_dirty_pages(state->space->id);
if (!n_dirty) {
break;
}
sum_pages += n_dirty;
} while (!space->is_stopping());
while (buf_flush_list_space(space, &sum_pages));
if (sum_pages) {
const ulonglong end = my_interval_timer();
......
......@@ -1742,7 +1742,7 @@ void fil_close_tablespace(ulint id)
can no longer read more pages of this tablespace to buf_pool.
Thus we can clean the tablespace out of buf_pool
completely and permanently. */
while (buf_flush_dirty_pages(id));
while (buf_flush_list_space(space));
ut_ad(space->is_stopping());
/* If the free is successful, the X lock will be released before
......
/*****************************************************************************
Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2014, 2020, MariaDB Corporation.
Copyright (c) 2014, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -54,12 +54,6 @@ the list as they age towards the tail of the LRU.
@param id tablespace identifier */
void buf_flush_remove_pages(ulint id);
/** Try to flush all the dirty pages that belong to a given tablespace.
@param id tablespace identifier
@return number dirty pages that there were for this tablespace */
ulint buf_flush_dirty_pages(ulint id)
MY_ATTRIBUTE((warn_unused_result));
/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage has already been
......@@ -93,10 +87,23 @@ buf_flush_init_for_writing(
/** Write out dirty blocks from buf_pool.flush_list.
@param max_n wished maximum number of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target (0=LRU flush)
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX);
/** Try to flush dirty pages that belong to a given tablespace.
@param space tablespace
@param n_flushed number of pages written
@return whether any pages might not have been flushed */
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result));
/** Write out dirty blocks from buf_pool.LRU.
@param max_n wished maximum number of blocks flushed
@return the number of processed pages
@retval 0 if a batch of the same type (lsn==0 or lsn!=0) is already running */
ulint buf_flush_lists(ulint max_n, lsn_t lsn);
@retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n);
/** Wait until a flush batch ends.
@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
......
......@@ -4232,7 +4232,7 @@ row_import_for_mysql(
/* Ensure that all pages dirtied during the IMPORT make it to disk.
The only dirty pages generated should be from the pessimistic purge
of delete marked records that couldn't be purged in Phase I. */
while (buf_flush_dirty_pages(prebuilt->table->space_id));
while (buf_flush_list_space(prebuilt->table->space));
for (ulint count = 0; prebuilt->table->space->referenced(); count++) {
/* Issue a warning every 10.24 seconds, starting after
......
/*****************************************************************************
Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -536,7 +536,7 @@ row_quiesce_table_start(
}
}
while (buf_flush_dirty_pages(table->space_id)) {
while (buf_flush_list_space(table->space)) {
if (trx_is_interrupted(trx)) {
goto aborted;
}
......
......@@ -596,7 +596,7 @@ static void trx_purge_truncate_history()
return;
}
const fil_space_t& space = *purge_sys.truncate.current;
fil_space_t& space = *purge_sys.truncate.current;
/* Undo tablespace always are a single file. */
ut_a(UT_LIST_GET_LEN(space.chain) == 1);
fil_node_t* file = UT_LIST_GET_FIRST(space.chain);
......@@ -672,7 +672,7 @@ static void trx_purge_truncate_history()
mini-transaction commit and the server was killed, then
discarding the to-be-trimmed pages without flushing would
break crash recovery. So, we cannot avoid the write. */
while (buf_flush_dirty_pages(space.id));
while (buf_flush_list_space(&space));
log_free_check();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment