Commit a3d0d5fc authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-26055: Improve adaptive flushing

This is a 10.5 backport from 10.6
commit 9593cccf.

Adaptive flushing is enabled by setting innodb_max_dirty_pages_pct_lwm>0
(not default) and innodb_adaptive_flushing=ON (default).
There is also the parameter innodb_adaptive_flushing_lwm
(default: 10 per cent of the log capacity). It should enable some
adaptive flushing even when innodb_max_dirty_pages_pct_lwm=0.
That is not being changed here.

This idea was first presented by Inaam Rana several years ago,
and I discussed it with Jean-François Gagné at FOSDEM 2023.

buf_flush_page_cleaner(): When we are not near the log capacity limit
(neither buf_flush_async_lsn nor buf_flush_sync_lsn are set),
also try to move clean blocks from the buf_pool.LRU list to buf_pool.free
or initiate writes (but not the eviction) of dirty blocks, until
the remaining I/O capacity has been consumed.

buf_flush_LRU_list_batch(): Add the parameter bool evict, to specify
whether dirty least recently used pages (from buf_pool.LRU) should
be evicted immediately after they have been written out. Callers outside
buf_flush_page_cleaner() will pass evict=true, to retain the existing
behaviour.

buf_do_LRU_batch(): Add the parameter bool evict.
Return counts of evicted and flushed pages.

buf_flush_LRU(): Add the parameter bool evict.
Assume that the caller holds buf_pool.mutex and
will invoke buf_dblwr.flush_buffered_writes() afterwards.

buf_flush_list_holding_mutex(): A low-level variant of buf_flush_list()
whose caller must hold buf_pool.mutex and invoke
buf_dblwr.flush_buffered_writes() afterwards.

buf_flush_wait_batch_end_acquiring_mutex(): Remove. It is enough to have
buf_flush_wait_batch_end().

page_cleaner_flush_pages_recommendation(): Avoid some floating-point
arithmetics.

buf_flush_page(), buf_flush_check_neighbor(), buf_flush_check_neighbors(),
buf_flush_try_neighbors(): Rename the parameter "bool lru" to "bool evict".

buf_free_from_unzip_LRU_list_batch(): Remove the parameter.
Only actual page writes will contribute towards the limit.

buf_LRU_free_page(): Evict freed pages of temporary tables.

buf_pool.done_free: Broadcast whenever a block is freed
(and buf_pool.try_LRU_scan is set).

buf_pool_t::io_buf_t::reserve(): Retry indefinitely.
During the test encryption.innochecksum we easily run out of
these buffers for PAGE_COMPRESSED or ENCRYPTED pages.

Tested by Matthias Leich and Axel Schwenke
parent 8b509a5d
......@@ -410,9 +410,11 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
return (true);
}
buf_tmp_buffer_t* slot;
if (node.space->purpose == FIL_TYPE_TEMPORARY
&& innodb_encrypt_temporary_tables) {
buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve();
slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate();
......@@ -431,7 +433,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
tablespace and page contains used key_version. This is true
also for pages first compressed and then encrypted. */
buf_tmp_buffer_t* slot;
uint key_version = buf_page_get_key_version(dst_frame, flags);
if (page_compressed && !key_version) {
......@@ -444,7 +445,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
}
slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate();
decompress_with_slot:
......@@ -472,7 +472,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
}
slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate();
ut_d(fil_page_type_validate(node.space, dst_frame));
......@@ -1494,6 +1493,41 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
return(true); /* free_list was enough */
}
void buf_pool_t::io_buf_t::create(ulint n_slots)
{
this->n_slots= n_slots;
slots= static_cast<buf_tmp_buffer_t*>
(ut_malloc_nokey(n_slots * sizeof *slots));
memset((void*) slots, 0, n_slots * sizeof *slots);
}
void buf_pool_t::io_buf_t::close()
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
{
aligned_free(s->crypt_buf);
aligned_free(s->comp_buf);
}
ut_free(slots);
slots= nullptr;
n_slots= 0;
}
buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
{
for (;;)
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
os_aio_wait_until_no_pending_writes();
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
os_aio_wait_until_no_pending_reads();
}
}
/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3).
......@@ -1564,15 +1598,18 @@ inline bool buf_pool_t::withdraw_blocks()
block = next_block;
}
mysql_mutex_unlock(&mutex);
/* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
ulint n_flushed = buf_flush_LRU(
std::max<ulint>(withdraw_target
- UT_LIST_GET_LEN(withdraw),
srv_LRU_scan_depth));
buf_flush_wait_batch_end_acquiring_mutex(true);
srv_LRU_scan_depth),
true);
mysql_mutex_unlock(&buf_pool.mutex);
buf_dblwr.flush_buffered_writes();
mysql_mutex_lock(&buf_pool.mutex);
buf_flush_wait_batch_end(true);
if (n_flushed) {
MONITOR_INC_VALUE_CUMULATIVE(
......@@ -1586,12 +1623,12 @@ inline bool buf_pool_t::withdraw_blocks()
/* relocate blocks/buddies in withdrawn area */
ulint count2 = 0;
mysql_mutex_lock(&mutex);
buf_page_t* bpage;
bpage = UT_LIST_GET_FIRST(LRU);
while (bpage != NULL) {
buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
if (bpage->zip.data != NULL
buf_pool_mutex_exit_forbid();
for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage;
bpage; bpage = next_bpage) {
ut_ad(bpage->in_file());
next_bpage = UT_LIST_GET_NEXT(LRU, bpage);
if (UNIV_LIKELY_NULL(bpage->zip.data)
&& will_be_withdrawn(bpage->zip.data)
&& bpage->can_relocate()) {
buf_pool_mutex_exit_forbid();
......@@ -2667,11 +2704,6 @@ buf_page_get_low(
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_SX_LATCH)
|| (rw_latch == RW_NO_LATCH));
ut_ad(!allow_ibuf_merge
|| mode == BUF_GET
|| mode == BUF_GET_POSSIBLY_FREED
|| mode == BUF_GET_IF_IN_POOL
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH);
if (err) {
*err = DB_SUCCESS;
......@@ -2685,15 +2717,15 @@ buf_page_get_low(
replace any old pages, which were not evicted during DISCARD.
Skip the assertion on space_page_size. */
break;
case BUF_PEEK_IF_IN_POOL:
default:
ut_ad(!allow_ibuf_merge);
ut_ad(mode == BUF_PEEK_IF_IN_POOL);
break;
case BUF_GET_POSSIBLY_FREED:
case BUF_GET_IF_IN_POOL:
/* The caller may pass a dummy page size,
because it does not really matter. */
break;
default:
ut_error;
case BUF_GET_POSSIBLY_FREED:
break;
case BUF_GET_NO_LATCH:
ut_ad(rw_latch == RW_NO_LATCH);
/* fall through */
......
This diff is collapsed.
......@@ -413,6 +413,7 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex)
&& recv_sys.apply_log_recs) {
goto flush_lru;
});
get_mutex:
mysql_mutex_lock(&buf_pool.mutex);
got_mutex:
buf_LRU_check_size_of_non_data_objects();
......@@ -501,15 +502,18 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex)
#ifndef DBUG_OFF
flush_lru:
#endif
if (!buf_flush_LRU(innodb_lru_flush_size)) {
mysql_mutex_lock(&buf_pool.mutex);
if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures;
}
n_iterations++;
mysql_mutex_lock(&buf_pool.mutex);
buf_pool.stat.LRU_waits++;
goto got_mutex;
mysql_mutex_unlock(&buf_pool.mutex);
buf_dblwr.flush_buffered_writes();
goto get_mutex;
}
/** Move the LRU_old pointer so that the length of the old blocks list
......@@ -815,52 +819,60 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
const ulint fold = id.fold();
page_hash_latch* hash_lock = buf_pool.page_hash.lock_get(fold);
hash_lock->write_lock();
lsn_t oldest_modification = bpage->oldest_modification_acquire();
const lsn_t oldest_modification = bpage->oldest_modification_acquire();
if (UNIV_UNLIKELY(!bpage->can_relocate())) {
/* Do not free buffer fixed and I/O-fixed blocks. */
goto func_exit;
}
if (oldest_modification == 1) {
switch (oldest_modification) {
case 2:
ut_ad(id.space() == SRV_TMP_SPACE_ID);
ut_ad(!bpage->zip.data);
if (bpage->status != buf_page_t::FREED) {
goto func_exit;
}
bpage->clear_oldest_modification();
break;
case 1:
mysql_mutex_lock(&buf_pool.flush_list_mutex);
oldest_modification = bpage->oldest_modification();
if (oldest_modification) {
ut_ad(oldest_modification == 1);
if (const lsn_t om = bpage->oldest_modification()) {
ut_ad(om == 1);
buf_pool.delete_from_flush_list(bpage);
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(!bpage->oldest_modification());
oldest_modification = 0;
/* fall through */
case 0:
if (zip || !bpage->zip.data
|| bpage->state() != BUF_BLOCK_FILE_PAGE) {
break;
}
if (zip || !bpage->zip.data) {
relocate_compressed:
b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
ut_a(b);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage);
b->set_state(BUF_BLOCK_ZIP_PAGE);
break;
default:
if (zip || !bpage->zip.data
|| bpage->state() != BUF_BLOCK_FILE_PAGE) {
/* This would completely free the block. */
/* Do not completely free dirty blocks. */
if (oldest_modification) {
goto func_exit;
}
} else if (oldest_modification
&& bpage->state() != BUF_BLOCK_FILE_PAGE) {
func_exit:
hash_lock->write_unlock();
return(false);
} else if (bpage->state() == BUF_BLOCK_FILE_PAGE) {
b = buf_page_alloc_descriptor();
ut_a(b);
mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage);
b->set_state(BUF_BLOCK_ZIP_PAGE);
}
goto relocate_compressed;
}
mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(bpage->in_file());
ut_ad(bpage->in_LRU_list);
DBUG_PRINT("ib_buf", ("free page %u:%u",
id.space(), id.page_no()));
DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
ut_ad(bpage->can_relocate());
......
......@@ -1786,16 +1786,13 @@ class buf_pool_t
MY_ALIGNED(CPU_LEVEL1_DCACHE_LINESIZE) mysql_mutex_t mutex;
/** Number of pending LRU flush; protected by mutex. */
ulint n_flush_LRU_;
/** broadcast when n_flush_LRU reaches 0; protected by mutex */
/** broadcast when n_flush_LRU_ reaches 0; protected by mutex */
pthread_cond_t done_flush_LRU;
/** Number of pending flush_list flush; protected by mutex */
ulint n_flush_list_;
/** broadcast when n_flush_list reaches 0; protected by mutex */
/** broadcast when n_flush_list_ reaches 0; protected by mutex */
pthread_cond_t done_flush_list;
TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
/** @name General fields */
/* @{ */
ulint curr_pool_size; /*!< Current pool size in bytes */
......@@ -1949,7 +1946,7 @@ class buf_pool_t
last_activity_count= activity_count;
}
// n_flush_LRU() + n_flush_list()
// n_flush_LRU_ + n_flush_list_
// is approximately COUNT(io_fix()==BUF_IO_WRITE) in flush_list
unsigned freed_page_clock;/*!< a sequence number used
......@@ -1979,7 +1976,8 @@ class buf_pool_t
UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free
block list */
/** signaled each time when the free list grows; protected by mutex */
/** signaled each time when the free list grows and
broadcast each time try_LRU_scan is set; protected by mutex */
pthread_cond_t done_free;
UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
......@@ -2045,9 +2043,9 @@ class buf_pool_t
return any_pending;
}
/** @return total amount of pending I/O */
ulint io_pending() const
TPOOL_SUPPRESS_TSAN ulint io_pending() const
{
return n_pend_reads + n_flush_LRU() + n_flush_list();
return n_pend_reads + n_flush_LRU_ + n_flush_list_;
}
private:
......@@ -2080,34 +2078,12 @@ class buf_pool_t
/** array of slots */
buf_tmp_buffer_t *slots;
void create(ulint n_slots)
{
this->n_slots= n_slots;
slots= static_cast<buf_tmp_buffer_t*>
(ut_malloc_nokey(n_slots * sizeof *slots));
memset((void*) slots, 0, n_slots * sizeof *slots);
}
void create(ulint n_slots);
void close()
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
{
aligned_free(s->crypt_buf);
aligned_free(s->comp_buf);
}
ut_free(slots);
slots= nullptr;
n_slots= 0;
}
void close();
/** Reserve a buffer */
buf_tmp_buffer_t *reserve()
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
return nullptr;
}
buf_tmp_buffer_t *reserve();
} io_buf;
/** whether resize() is in the critical path */
......@@ -2218,7 +2194,10 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn)
/** Clear oldest_modification after removing from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification()
{
#ifdef SAFE_MUTEX
if (oldest_modification() != 2)
mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
#endif /* SAFE_MUTEX */
ut_d(const auto state= state_);
ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE ||
state == BUF_BLOCK_REMOVE_HASH);
......@@ -2337,7 +2316,7 @@ MEMORY: is not in free list, LRU list, or flush list, nor page
hash table
FILE_PAGE: space and offset are defined, is in page hash table
if io_fix == BUF_IO_WRITE,
buf_pool.n_flush_LRU() || buf_pool.n_flush_list()
buf_pool.n_flush_LRU_ || buf_pool.n_flush_list_
(1) if buf_fix_count == 0, then
is in LRU list, not in free list
......
......@@ -85,11 +85,15 @@ buf_flush_init_for_writing(
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result));
/** Write out dirty blocks from buf_pool.LRU.
/** Write out dirty blocks from buf_pool.LRU,
and move clean blocks to buf_pool.free.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed
@return the number of processed pages
@param evict whether to evict pages after flushing
@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n);
ulint buf_flush_LRU(ulint max_n, bool evict);
/** Wait until a flush batch ends.
@param lru true=buf_pool.LRU; false=buf_pool.flush_list */
......@@ -119,9 +123,6 @@ buf_flush_note_modification(
/** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
/** Wait for pending flushes to complete. */
void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
/** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool();
......
......@@ -843,6 +843,7 @@ constexpr const char* const auto_event_names[] =
"buf0buf",
"buf0dblwr",
"buf0dump",
"buf0lru",
"dict0dict",
"dict0mem",
"dict0stats",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment