Commit 9593cccf authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-26055: Improve adaptive flushing

Adaptive flushing is enabled by setting innodb_max_dirty_pages_pct_lwm>0
(not default) and innodb_adaptive_flushing=ON (default).
There is also the parameter innodb_adaptive_flushing_lwm
(default: 10 per cent of the log capacity). It should enable some
adaptive flushing even when innodb_max_dirty_pages_pct_lwm=0.
That is not being changed here.

This idea was first presented by Inaam Rana several years ago,
and I discussed it with Jean-François Gagné at FOSDEM 2023.

buf_flush_page_cleaner(): When we are not near the log capacity limit
(neither buf_flush_async_lsn nor buf_flush_sync_lsn are set),
also try to move clean blocks from the buf_pool.LRU list to buf_pool.free
or initiate writes (but not the eviction) of dirty blocks, until
the remaining I/O capacity has been consumed.

buf_flush_LRU_list_batch(): Add the parameter bool evict, to specify
whether dirty least recently used pages (from buf_pool.LRU) should
be evicted immediately after they have been written out. Callers outside
buf_flush_page_cleaner() will pass evict=true, to retain the existing
behaviour.

buf_do_LRU_batch(): Add the parameter bool evict.
Return counts of evicted and flushed pages.

buf_flush_LRU(): Add the parameter bool evict.
Assume that the caller holds buf_pool.mutex and
will invoke buf_dblwr.flush_buffered_writes() afterwards.

buf_flush_list_holding_mutex(): A low-level variant of buf_flush_list()
whose caller must hold buf_pool.mutex and invoke
buf_dblwr.flush_buffered_writes() afterwards.

buf_flush_wait_batch_end_acquiring_mutex(): Remove. It is enough to have
buf_flush_wait_batch_end().

page_cleaner_flush_pages_recommendation(): Avoid some floating-point
arithmetic.

buf_flush_page(), buf_flush_check_neighbor(), buf_flush_check_neighbors(),
buf_flush_try_neighbors(): Rename the parameter "bool lru" to "bool evict".

buf_free_from_unzip_LRU_list_batch(): Remove the parameter max.
Only actual page writes will contribute towards the limit.

buf_LRU_free_page(): Evict freed pages of temporary tables.

buf_pool.done_free: Broadcast whenever a block is freed
(and buf_pool.try_LRU_scan is set).

buf_pool_t::io_buf_t::reserve(): Retry indefinitely.
During the test encryption.innochecksum we easily run out of
these buffers for PAGE_COMPRESSED or ENCRYPTED pages.

Tested by Matthias Leich and Axel Schwenke
parent 4105017a
...@@ -408,7 +408,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, ...@@ -408,7 +408,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
if (id.space() == SRV_TMP_SPACE_ID if (id.space() == SRV_TMP_SPACE_ID
&& innodb_encrypt_temporary_tables) { && innodb_encrypt_temporary_tables) {
slot = buf_pool.io_buf_reserve(); slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate(); slot->allocate();
bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame); bool ok = buf_tmp_page_decrypt(slot->crypt_buf, dst_frame);
slot->release(); slot->release();
...@@ -431,7 +430,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, ...@@ -431,7 +430,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
} }
slot = buf_pool.io_buf_reserve(); slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate(); slot->allocate();
decompress_with_slot: decompress_with_slot:
...@@ -455,7 +453,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage, ...@@ -455,7 +453,6 @@ static bool buf_page_decrypt_after_read(buf_page_t *bpage,
} }
slot = buf_pool.io_buf_reserve(); slot = buf_pool.io_buf_reserve();
ut_a(slot);
slot->allocate(); slot->allocate();
/* decrypt using crypt_buf to dst_frame */ /* decrypt using crypt_buf to dst_frame */
...@@ -1293,6 +1290,41 @@ inline bool buf_pool_t::realloc(buf_block_t *block) ...@@ -1293,6 +1290,41 @@ inline bool buf_pool_t::realloc(buf_block_t *block)
return(true); /* free_list was enough */ return(true); /* free_list was enough */
} }
/** Allocate the array of temporary I/O buffer slots
(used for encryption and page compression).
@param n_slots number of slots to allocate */
void buf_pool_t::io_buf_t::create(ulint n_slots)
{
this->n_slots= n_slots;
slots= static_cast<buf_tmp_buffer_t*>
(ut_malloc_nokey(n_slots * sizeof *slots));
/* Zero-fill so that each slot's crypt_buf and comp_buf start out
as nullptr and its reservation flag starts out as released. */
memset((void*) slots, 0, n_slots * sizeof *slots);
}
/** Release the temporary I/O buffer slots:
free each slot's aligned buffers, then the slot array itself. */
void buf_pool_t::io_buf_t::close()
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
{
aligned_free(s->crypt_buf);
aligned_free(s->comp_buf);
}
ut_free(slots);
/* Reset to the empty state so a later create() starts fresh. */
slots= nullptr;
n_slots= 0;
}
/** Reserve a temporary I/O buffer slot, retrying indefinitely.
If all slots are busy, wait for pending asynchronous writes (and, on a
second failed scan, pending reads) to complete, since completing I/O
releases the slots it holds.
@return an acquired slot (never returns nullptr) */
buf_tmp_buffer_t *buf_pool_t::io_buf_t::reserve()
{
for (;;)
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
/* No slot was free; let outstanding writes finish and rescan. */
os_aio_wait_until_no_pending_writes();
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
/* Still none free; wait for reads as well before retrying. */
os_aio_wait_until_no_pending_reads();
}
}
/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status /** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the to the specified string. The format and the following parameters are the
same as the ones used for printf(3). same as the ones used for printf(3).
...@@ -1359,21 +1391,23 @@ inline bool buf_pool_t::withdraw_blocks() ...@@ -1359,21 +1391,23 @@ inline bool buf_pool_t::withdraw_blocks()
block = next_block; block = next_block;
} }
mysql_mutex_unlock(&mutex);
/* reserve free_list length */ /* reserve free_list length */
if (UT_LIST_GET_LEN(withdraw) < withdraw_target) { if (UT_LIST_GET_LEN(withdraw) < withdraw_target) {
buf_flush_LRU( buf_flush_LRU(
std::max<ulint>(withdraw_target std::max<ulint>(withdraw_target
- UT_LIST_GET_LEN(withdraw), - UT_LIST_GET_LEN(withdraw),
srv_LRU_scan_depth)); srv_LRU_scan_depth),
buf_flush_wait_batch_end_acquiring_mutex(true); true);
mysql_mutex_unlock(&buf_pool.mutex);
buf_dblwr.flush_buffered_writes();
mysql_mutex_lock(&buf_pool.mutex);
buf_flush_wait_batch_end(true);
} }
/* relocate blocks/buddies in withdrawn area */ /* relocate blocks/buddies in withdrawn area */
ulint count2 = 0; ulint count2 = 0;
mysql_mutex_lock(&mutex);
buf_pool_mutex_exit_forbid(); buf_pool_mutex_exit_forbid();
for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage; for (buf_page_t* bpage = UT_LIST_GET_FIRST(LRU), *next_bpage;
bpage; bpage = next_bpage) { bpage; bpage = next_bpage) {
...@@ -2380,11 +2414,6 @@ buf_page_get_low( ...@@ -2380,11 +2414,6 @@ buf_page_get_low(
|| (rw_latch == RW_X_LATCH) || (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_SX_LATCH) || (rw_latch == RW_SX_LATCH)
|| (rw_latch == RW_NO_LATCH)); || (rw_latch == RW_NO_LATCH));
ut_ad(!allow_ibuf_merge
|| mode == BUF_GET
|| mode == BUF_GET_POSSIBLY_FREED
|| mode == BUF_GET_IF_IN_POOL
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH);
if (err) { if (err) {
*err = DB_SUCCESS; *err = DB_SUCCESS;
...@@ -2392,15 +2421,15 @@ buf_page_get_low( ...@@ -2392,15 +2421,15 @@ buf_page_get_low(
#ifdef UNIV_DEBUG #ifdef UNIV_DEBUG
switch (mode) { switch (mode) {
case BUF_PEEK_IF_IN_POOL: default:
ut_ad(!allow_ibuf_merge);
ut_ad(mode == BUF_PEEK_IF_IN_POOL);
break;
case BUF_GET_POSSIBLY_FREED:
case BUF_GET_IF_IN_POOL: case BUF_GET_IF_IN_POOL:
/* The caller may pass a dummy page size, /* The caller may pass a dummy page size,
because it does not really matter. */ because it does not really matter. */
break; break;
default:
MY_ASSERT_UNREACHABLE();
case BUF_GET_POSSIBLY_FREED:
break;
case BUF_GET: case BUF_GET:
case BUF_GET_IF_IN_POOL_OR_WATCH: case BUF_GET_IF_IN_POOL_OR_WATCH:
ut_ad(!mtr->is_freeing_tree()); ut_ad(!mtr->is_freeing_tree());
...@@ -2552,6 +2581,7 @@ buf_page_get_low( ...@@ -2552,6 +2581,7 @@ buf_page_get_low(
mysql_mutex_lock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.mutex);
block->unfix(); block->unfix();
free_unfixed_block:
if (!buf_LRU_free_page(&block->page, true)) { if (!buf_LRU_free_page(&block->page, true)) {
ut_ad(0); ut_ad(0);
} }
...@@ -2667,20 +2697,19 @@ buf_page_get_low( ...@@ -2667,20 +2697,19 @@ buf_page_get_low(
/* Decompress the page while not holding /* Decompress the page while not holding
buf_pool.mutex. */ buf_pool.mutex. */
auto ok = buf_zip_decompress(block, false); const auto ok = buf_zip_decompress(block, false);
block->page.read_unfix(state);
state = block->page.state();
block->page.lock.x_unlock();
--buf_pool.n_pend_unzip; --buf_pool.n_pend_unzip;
if (!ok) { if (!ok) {
/* FIXME: Evict the corrupted
ROW_FORMAT=COMPRESSED page! */
if (err) { if (err) {
*err = DB_PAGE_CORRUPTED; *err = DB_PAGE_CORRUPTED;
} }
return nullptr; mysql_mutex_lock(&buf_pool.mutex);
}
state = block->page.read_unfix(state);
block->page.lock.x_unlock();
if (!ok) {
goto free_unfixed_block;
} }
} }
...@@ -2886,9 +2915,11 @@ buf_page_get_gen( ...@@ -2886,9 +2915,11 @@ buf_page_get_gen(
dberr_t* err, dberr_t* err,
bool allow_ibuf_merge) bool allow_ibuf_merge)
{ {
if (buf_block_t *block= recv_sys.recover(page_id)) buf_block_t *block= recv_sys.recover(page_id);
{ if (UNIV_LIKELY(!block))
if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1))) return buf_page_get_low(page_id, zip_size, rw_latch,
guess, mode, mtr, err, allow_ibuf_merge);
else if (UNIV_UNLIKELY(block == reinterpret_cast<buf_block_t*>(-1)))
{ {
corrupted: corrupted:
if (err) if (err)
...@@ -2909,7 +2940,10 @@ buf_page_get_gen( ...@@ -2909,7 +2940,10 @@ buf_page_get_gen(
{ {
got_freed_page: got_freed_page:
ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL); ut_ad(mode == BUF_GET_POSSIBLY_FREED || mode == BUF_PEEK_IF_IN_POOL);
mysql_mutex_lock(&buf_pool.mutex);
block->page.unfix(); block->page.unfix();
buf_LRU_free_page(&block->page, true);
mysql_mutex_unlock(&buf_pool.mutex);
goto corrupted; goto corrupted;
} }
else if (must_merge && else if (must_merge &&
...@@ -2948,10 +2982,6 @@ buf_page_get_gen( ...@@ -2948,10 +2982,6 @@ buf_page_get_gen(
} }
mtr->page_lock(block, rw_latch); mtr->page_lock(block, rw_latch);
return block; return block;
}
return buf_page_get_low(page_id, zip_size, rw_latch,
guess, mode, mtr, err, allow_ibuf_merge);
} }
/********************************************************************//** /********************************************************************//**
......
...@@ -380,13 +380,12 @@ void buf_page_write_complete(const IORequest &request) ...@@ -380,13 +380,12 @@ void buf_page_write_complete(const IORequest &request)
if (request.is_LRU()) if (request.is_LRU())
{ {
buf_LRU_free_page(bpage, true); buf_LRU_free_page(bpage, true);
buf_pool.try_LRU_scan= true;
pthread_cond_signal(&buf_pool.done_free);
ut_ad(buf_pool.n_flush_LRU_); ut_ad(buf_pool.n_flush_LRU_);
if (!--buf_pool.n_flush_LRU_) if (!--buf_pool.n_flush_LRU_)
{
pthread_cond_broadcast(&buf_pool.done_flush_LRU); pthread_cond_broadcast(&buf_pool.done_flush_LRU);
pthread_cond_signal(&buf_pool.done_free);
}
} }
else else
{ {
...@@ -763,17 +762,17 @@ inline void buf_pool_t::release_freed_page(buf_page_t *bpage) ...@@ -763,17 +762,17 @@ inline void buf_pool_t::release_freed_page(buf_page_t *bpage)
} }
/** Write a flushable page to a file. buf_pool.mutex must be held. /** Write a flushable page to a file. buf_pool.mutex must be held.
@param lru true=buf_pool.LRU; false=buf_pool.flush_list @param evict whether to evict the page on write completion
@param space tablespace @param space tablespace
@return whether the page was flushed and buf_pool.mutex was released */ @return whether the page was flushed and buf_pool.mutex was released */
inline bool buf_page_t::flush(bool lru, fil_space_t *space) inline bool buf_page_t::flush(bool evict, fil_space_t *space)
{ {
ut_ad(in_file()); ut_ad(in_file());
ut_ad(in_LRU_list); ut_ad(in_LRU_list);
ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == ut_ad((space->purpose == FIL_TYPE_TEMPORARY) ==
(space == fil_system.temp_space)); (space == fil_system.temp_space));
ut_ad(space->referenced()); ut_ad(space->referenced());
ut_ad(lru || space != fil_system.temp_space); ut_ad(evict || space != fil_system.temp_space);
if (!lock.u_lock_try(true)) if (!lock.u_lock_try(true))
return false; return false;
...@@ -801,7 +800,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ...@@ -801,7 +800,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space)
Thus, it cannot be relocated or removed. */ Thus, it cannot be relocated or removed. */
DBUG_PRINT("ib_buf", ("%s %u page %u:%u", DBUG_PRINT("ib_buf", ("%s %u page %u:%u",
lru ? "LRU" : "flush_list", evict ? "LRU" : "flush_list",
id().space(), id().page_no())); id().space(), id().page_no()));
ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
ut_ad(f >= UNFIXED); ut_ad(f >= UNFIXED);
...@@ -809,7 +808,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ...@@ -809,7 +808,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space)
ut_ad(space == fil_system.temp_space ut_ad(space == fil_system.temp_space
? oldest_modification() == 2 ? oldest_modification() == 2
: oldest_modification() > 2); : oldest_modification() > 2);
if (lru) if (evict)
{ {
ut_ad(buf_pool.n_flush_LRU_ < ULINT_UNDEFINED); ut_ad(buf_pool.n_flush_LRU_ < ULINT_UNDEFINED);
buf_pool.n_flush_LRU_++; buf_pool.n_flush_LRU_++;
...@@ -831,7 +830,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ...@@ -831,7 +830,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space)
#if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32
size_t orig_size; size_t orig_size;
#endif #endif
IORequest::Type type= lru ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC; IORequest::Type type= evict ? IORequest::WRITE_LRU : IORequest::WRITE_ASYNC;
buf_tmp_buffer_t *slot= nullptr; buf_tmp_buffer_t *slot= nullptr;
if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */ if (UNIV_UNLIKELY(!frame)) /* ROW_FORMAT=COMPRESSED */
...@@ -875,7 +874,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ...@@ -875,7 +874,7 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space)
{ {
switch (space->chain.start->punch_hole) { switch (space->chain.start->punch_hole) {
case 1: case 1:
type= lru ? IORequest::PUNCH_LRU : IORequest::PUNCH; type= evict ? IORequest::PUNCH_LRU : IORequest::PUNCH;
break; break;
case 2: case 2:
size= orig_size; size= orig_size;
...@@ -912,9 +911,10 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space) ...@@ -912,9 +911,10 @@ inline bool buf_page_t::flush(bool lru, fil_space_t *space)
/** Check whether a page can be flushed from the buf_pool. /** Check whether a page can be flushed from the buf_pool.
@param id page identifier @param id page identifier
@param fold id.fold() @param fold id.fold()
@param lru true=buf_pool.LRU; false=buf_pool.flush_list @param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return whether the page can be flushed */ @return whether the page can be flushed */
static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) static bool buf_flush_check_neighbor(const page_id_t id, ulint fold,
bool evict)
{ {
mysql_mutex_assert_owner(&buf_pool.mutex); mysql_mutex_assert_owner(&buf_pool.mutex);
ut_ad(fold == id.fold()); ut_ad(fold == id.fold());
...@@ -926,9 +926,9 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) ...@@ -926,9 +926,9 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
if (!bpage || buf_pool.watch_is_sentinel(*bpage)) if (!bpage || buf_pool.watch_is_sentinel(*bpage))
return false; return false;
/* We avoid flushing 'non-old' blocks in an LRU flush, because the /* We avoid flushing 'non-old' blocks in an eviction flush, because the
flushed blocks are soon freed */ flushed blocks are soon freed */
if (lru && !bpage->is_old()) if (evict && !bpage->is_old())
return false; return false;
return bpage->oldest_modification() > 1 && bpage->ready_for_flush(); return bpage->oldest_modification() > 1 && bpage->ready_for_flush();
...@@ -938,11 +938,11 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru) ...@@ -938,11 +938,11 @@ static bool buf_flush_check_neighbor(const page_id_t id, ulint fold, bool lru)
@param space tablespace @param space tablespace
@param id page identifier of a dirty page @param id page identifier of a dirty page
@param contiguous whether to consider contiguous areas of pages @param contiguous whether to consider contiguous areas of pages
@param lru true=buf_pool.LRU; false=buf_pool.flush_list @param evict true=buf_pool.LRU; false=buf_pool.flush_list
@return last page number that can be flushed */ @return last page number that can be flushed */
static page_id_t buf_flush_check_neighbors(const fil_space_t &space, static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
page_id_t &id, bool contiguous, page_id_t &id, bool contiguous,
bool lru) bool evict)
{ {
ut_ad(id.page_no() < space.size + ut_ad(id.page_no() < space.size +
(space.physical_size() == 2048 ? 1 (space.physical_size() == 2048 ? 1
...@@ -975,7 +975,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, ...@@ -975,7 +975,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
for (page_id_t i= id - 1;; --i) for (page_id_t i= id - 1;; --i)
{ {
fold--; fold--;
if (!buf_flush_check_neighbor(i, fold, lru)) if (!buf_flush_check_neighbor(i, fold, evict))
{ {
low= i + 1; low= i + 1;
break; break;
...@@ -991,7 +991,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space, ...@@ -991,7 +991,7 @@ static page_id_t buf_flush_check_neighbors(const fil_space_t &space,
while (++i < high) while (++i < high)
{ {
++fold; ++fold;
if (!buf_flush_check_neighbor(i, fold, lru)) if (!buf_flush_check_neighbor(i, fold, evict))
break; break;
} }
...@@ -1059,20 +1059,20 @@ and also write zeroes or punch the hole for the freed ranges of pages. ...@@ -1059,20 +1059,20 @@ and also write zeroes or punch the hole for the freed ranges of pages.
@param space tablespace @param space tablespace
@param page_id page identifier @param page_id page identifier
@param contiguous whether to consider contiguous areas of pages @param contiguous whether to consider contiguous areas of pages
@param lru true=buf_pool.LRU; false=buf_pool.flush_list @param evict true=buf_pool.LRU; false=buf_pool.flush_list
@param n_flushed number of pages flushed so far in this batch @param n_flushed number of pages flushed so far in this batch
@param n_to_flush maximum number of pages we are allowed to flush @param n_to_flush maximum number of pages we are allowed to flush
@return number of pages flushed */ @return number of pages flushed */
static ulint buf_flush_try_neighbors(fil_space_t *space, static ulint buf_flush_try_neighbors(fil_space_t *space,
const page_id_t page_id, const page_id_t page_id,
bool contiguous, bool lru, bool contiguous, bool evict,
ulint n_flushed, ulint n_to_flush) ulint n_flushed, ulint n_to_flush)
{ {
ut_ad(space->id == page_id.space()); ut_ad(space->id == page_id.space());
ulint count= 0; ulint count= 0;
page_id_t id= page_id; page_id_t id= page_id;
page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, lru); page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
ut_ad(page_id >= id); ut_ad(page_id >= id);
ut_ad(page_id < high); ut_ad(page_id < high);
...@@ -1096,13 +1096,13 @@ static ulint buf_flush_try_neighbors(fil_space_t *space, ...@@ -1096,13 +1096,13 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain)) if (buf_page_t *bpage= buf_pool.page_hash.get(id, chain))
{ {
ut_ad(bpage->in_file()); ut_ad(bpage->in_file());
/* We avoid flushing 'non-old' blocks in an LRU flush, /* We avoid flushing 'non-old' blocks in an eviction flush,
because the flushed blocks are soon freed */ because the flushed blocks are soon freed */
if (!lru || id == page_id || bpage->is_old()) if (!evict || id == page_id || bpage->is_old())
{ {
if (!buf_pool.watch_is_sentinel(*bpage) && if (!buf_pool.watch_is_sentinel(*bpage) &&
bpage->oldest_modification() > 1 && bpage->ready_for_flush() && bpage->oldest_modification() > 1 && bpage->ready_for_flush() &&
bpage->flush(lru, space)) bpage->flush(evict, space))
{ {
++count; ++count;
continue; continue;
...@@ -1128,12 +1128,8 @@ This utility moves the uncompressed frames of pages to the free list. ...@@ -1128,12 +1128,8 @@ This utility moves the uncompressed frames of pages to the free list.
Note that this function does not actually flush any data to disk. It Note that this function does not actually flush any data to disk. It
just detaches the uncompressed frames from the compressed pages at the just detaches the uncompressed frames from the compressed pages at the
tail of the unzip_LRU and puts those freed frames in the free list. tail of the unzip_LRU and puts those freed frames in the free list.
Note that it is a best effort attempt and it is not guaranteed that
after a call to this function there will be 'max' blocks in the free
list.
@param[in] max desired number of blocks in the free_list
@return number of blocks moved to the free list. */ @return number of blocks moved to the free list. */
static ulint buf_free_from_unzip_LRU_list_batch(ulint max) static ulint buf_free_from_unzip_LRU_list_batch()
{ {
ulint scanned = 0; ulint scanned = 0;
ulint count = 0; ulint count = 0;
...@@ -1143,7 +1139,6 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max) ...@@ -1143,7 +1139,6 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max)
buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU);
while (block while (block
&& count < max
&& UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth && UT_LIST_GET_LEN(buf_pool.free) < srv_LRU_scan_depth
&& UT_LIST_GET_LEN(buf_pool.unzip_LRU) && UT_LIST_GET_LEN(buf_pool.unzip_LRU)
> UT_LIST_GET_LEN(buf_pool.LRU) / 10) { > UT_LIST_GET_LEN(buf_pool.LRU) / 10) {
...@@ -1214,10 +1209,13 @@ static void buf_flush_discard_page(buf_page_t *bpage) ...@@ -1214,10 +1209,13 @@ static void buf_flush_discard_page(buf_page_t *bpage)
buf_LRU_free_page(bpage, true); buf_LRU_free_page(bpage, true);
} }
/** Flush dirty blocks from the end of the LRU list. /** Flush dirty blocks from the end buf_pool.LRU,
@param max maximum number of blocks to make available in buf_pool.free and move clean blocks to buf_pool.free.
@param max maximum number of blocks to flush
@param evict whether dirty pages are to be evicted after flushing them
@param n counts of flushed and evicted pages */ @param n counts of flushed and evicted pages */
static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) static void buf_flush_LRU_list_batch(ulint max, bool evict,
flush_counters_t *n)
{ {
ulint scanned= 0; ulint scanned= 0;
ulint free_limit= srv_LRU_scan_depth; ulint free_limit= srv_LRU_scan_depth;
...@@ -1229,6 +1227,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1229,6 +1227,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN const auto neighbors= UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN
? 0 : srv_flush_neighbors; ? 0 : srv_flush_neighbors;
fil_space_t *space= nullptr; fil_space_t *space= nullptr;
bool do_evict= evict;
uint32_t last_space_id= FIL_NULL; uint32_t last_space_id= FIL_NULL;
static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency"); static_assert(FIL_NULL > SRV_TMP_SPACE_ID, "consistency");
static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency"); static_assert(FIL_NULL > SRV_SPACE_ID_UPPER_BOUND, "consistency");
...@@ -1236,8 +1235,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1236,8 +1235,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU);
bpage && bpage &&
((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && ((UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN &&
UT_LIST_GET_LEN(buf_pool.free) < free_limit && UT_LIST_GET_LEN(buf_pool.free) < free_limit) ||
n->flushed + n->evicted < max) ||
recv_recovery_is_on()); ++scanned) recv_recovery_is_on()); ++scanned)
{ {
buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage);
...@@ -1257,8 +1255,8 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1257,8 +1255,8 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
} }
else if (state < buf_page_t::READ_FIX) else if (state < buf_page_t::READ_FIX)
{ {
/* Block is ready for flush. Dispatch an IO request. The IO /* Block is ready for flush. Dispatch an IO request.
helper thread will put it on free list in IO completion routine. */ If evict=true, the page will be evicted by buf_page_write_complete(). */
const page_id_t page_id(bpage->id()); const page_id_t page_id(bpage->id());
const uint32_t space_id= page_id.space(); const uint32_t space_id= page_id.space();
if (!space || space->id != space_id) if (!space || space->id != space_id)
...@@ -1271,6 +1269,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1271,6 +1269,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
space->release(); space->release();
auto p= buf_flush_space(space_id); auto p= buf_flush_space(space_id);
space= p.first; space= p.first;
/* For the temporary tablespace, LRU flushing will always
evict pages upon completing the write. */
do_evict= evict || space == fil_system.temp_space;
last_space_id= space_id; last_space_id= space_id;
mysql_mutex_lock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.mutex);
if (p.second) if (p.second)
...@@ -1292,11 +1293,13 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1292,11 +1293,13 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
{ {
mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_unlock(&buf_pool.mutex);
n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1, n->flushed+= buf_flush_try_neighbors(space, page_id, neighbors == 1,
true, n->flushed, max); do_evict, n->flushed, max);
reacquire_mutex: reacquire_mutex:
mysql_mutex_lock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.mutex);
} }
else if (bpage->flush(true, space)) else if (n->flushed >= max && !recv_recovery_is_on())
break;
else if (bpage->flush(do_evict, space))
{ {
++n->flushed; ++n->flushed;
goto reacquire_mutex; goto reacquire_mutex;
...@@ -1324,26 +1327,20 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) ...@@ -1324,26 +1327,20 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n)
/** Flush and move pages from LRU or unzip_LRU list to the free list. /** Flush and move pages from LRU or unzip_LRU list to the free list.
Whether LRU or unzip_LRU is used depends on the state of the system. Whether LRU or unzip_LRU is used depends on the state of the system.
@param max maximum number of blocks to make available in buf_pool.free @param max maximum number of blocks to flush
@return number of flushed pages */ @param evict whether dirty pages are to be evicted after flushing them
static ulint buf_do_LRU_batch(ulint max) @param n counts of flushed and evicted pages */
static void buf_do_LRU_batch(ulint max, bool evict, flush_counters_t *n)
{ {
const ulint n_unzip_LRU_evicted= buf_LRU_evict_from_unzip_LRU() if (buf_LRU_evict_from_unzip_LRU())
? buf_free_from_unzip_LRU_list_batch(max) buf_free_from_unzip_LRU_list_batch();
: 0; n->evicted= 0;
flush_counters_t n; n->flushed= 0;
n.flushed= 0; buf_flush_LRU_list_batch(max, evict, n);
n.evicted= n_unzip_LRU_evicted;
buf_flush_LRU_list_batch(max, &n);
mysql_mutex_assert_owner(&buf_pool.mutex);
if (const ulint evicted= n.evicted - n_unzip_LRU_evicted)
buf_lru_freed_page_count+= evicted;
if (n.flushed) mysql_mutex_assert_owner(&buf_pool.mutex);
buf_lru_flush_page_count+= n.flushed; buf_lru_freed_page_count+= n->evicted;
buf_lru_flush_page_count+= n->flushed;
return n.flushed;
} }
/** This utility flushes dirty blocks from the end of the flush_list. /** This utility flushes dirty blocks from the end of the flush_list.
...@@ -1398,7 +1395,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn) ...@@ -1398,7 +1395,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
from buf_pool.flush_list must adjust the hazard pointer. from buf_pool.flush_list must adjust the hazard pointer.
Note: A concurrent execution of buf_flush_list_space() may Note: A concurrent execution of buf_flush_list_space() may
terminate this scan prematurely. The buf_pool.n_flush_list() terminate this scan prematurely. The buf_pool.n_flush_list_
should prevent multiple threads from executing should prevent multiple threads from executing
buf_do_flush_list_batch() concurrently, buf_do_flush_list_batch() concurrently,
but buf_flush_list_space() is ignoring that. */ but buf_flush_list_space() is ignoring that. */
...@@ -1505,46 +1502,53 @@ void buf_flush_wait_batch_end(bool lru) ...@@ -1505,46 +1502,53 @@ void buf_flush_wait_batch_end(bool lru)
} }
/** Write out dirty blocks from buf_pool.flush_list. /** Write out dirty blocks from buf_pool.flush_list.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed @param max_n wished maximum mumber of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target @param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages @return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */ @retval 0 if a buf_pool.flush_list batch is already running */
static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED, lsn_t lsn= LSN_MAX) static ulint buf_flush_list_holding_mutex(ulint max_n= ULINT_UNDEFINED,
lsn_t lsn= LSN_MAX)
{ {
ut_ad(lsn); ut_ad(lsn);
mysql_mutex_assert_owner(&buf_pool.mutex);
if (buf_pool.n_flush_list()) if (buf_pool.n_flush_list_)
return 0; return 0;
mysql_mutex_lock(&buf_pool.mutex);
const bool running= buf_pool.n_flush_list_ != 0;
/* FIXME: we are performing a dirty read of buf_pool.flush_list.count /* FIXME: we are performing a dirty read of buf_pool.flush_list.count
while not holding buf_pool.flush_list_mutex */ while not holding buf_pool.flush_list_mutex */
if (running || !UT_LIST_GET_LEN(buf_pool.flush_list)) if (!UT_LIST_GET_LEN(buf_pool.flush_list))
{ {
if (!running)
pthread_cond_broadcast(&buf_pool.done_flush_list); pthread_cond_broadcast(&buf_pool.done_flush_list);
mysql_mutex_unlock(&buf_pool.mutex);
return 0; return 0;
} }
buf_pool.n_flush_list_++; buf_pool.n_flush_list_++;
const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn); const ulint n_flushed= buf_do_flush_list_batch(max_n, lsn);
const ulint n_flushing= --buf_pool.n_flush_list_; if (!--buf_pool.n_flush_list_)
buf_pool.try_LRU_scan= true;
mysql_mutex_unlock(&buf_pool.mutex);
if (!n_flushing)
pthread_cond_broadcast(&buf_pool.done_flush_list); pthread_cond_broadcast(&buf_pool.done_flush_list);
buf_dblwr.flush_buffered_writes();
DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed)); DBUG_PRINT("ib_buf", ("flush_list completed, " ULINTPF " pages", n_flushed));
return n_flushed; return n_flushed;
} }
/** Write out dirty blocks from buf_pool.flush_list.
Convenience wrapper around buf_flush_list_holding_mutex() that acquires
and releases buf_pool.mutex and flushes any buffered doublewrite pages.
@param max_n wished maximum number of blocks flushed
@param lsn buf_pool.get_oldest_modification(LSN_MAX) target
@return the number of processed pages
@retval 0 if a buf_pool.flush_list batch is already running */
static ulint buf_flush_list(ulint max_n= ULINT_UNDEFINED,
lsn_t lsn= LSN_MAX)
{
mysql_mutex_lock(&buf_pool.mutex);
ulint n= buf_flush_list_holding_mutex(max_n, lsn);
mysql_mutex_unlock(&buf_pool.mutex);
/* The doublewrite buffer must be flushed after releasing the mutex. */
buf_dblwr.flush_buffered_writes();
return n;
}
/** Try to flush all the dirty pages that belong to a given tablespace. /** Try to flush all the dirty pages that belong to a given tablespace.
@param space tablespace @param space tablespace
@param n_flushed number of pages written @param n_flushed number of pages written
...@@ -1642,7 +1646,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) ...@@ -1642,7 +1646,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_pool.try_LRU_scan= true; buf_pool.try_LRU_scan= true;
pthread_cond_broadcast(&buf_pool.done_free);
mysql_mutex_unlock(&buf_pool.mutex); mysql_mutex_unlock(&buf_pool.mutex);
if (acquired) if (acquired)
...@@ -1656,43 +1660,41 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed) ...@@ -1656,43 +1660,41 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
return may_have_skipped; return may_have_skipped;
} }
/** Write out dirty blocks from buf_pool.LRU. /** Write out dirty blocks from buf_pool.LRU,
and move clean blocks to buf_pool.free.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed @param max_n wished maximum mumber of blocks flushed
@return the number of processed pages @param evict whether to evict pages after flushing
@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */ @retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n) ulint buf_flush_LRU(ulint max_n, bool evict)
{ {
if (buf_pool.n_flush_LRU()) mysql_mutex_assert_owner(&buf_pool.mutex);
return 0;
log_buffer_flush_to_disk();
mysql_mutex_lock(&buf_pool.mutex); if (evict)
if (buf_pool.n_flush_LRU_)
{ {
mysql_mutex_unlock(&buf_pool.mutex); if (buf_pool.n_flush_LRU_)
return 0; return 0;
buf_pool.n_flush_LRU_= 1;
} }
buf_pool.n_flush_LRU_++;
ulint n_flushed= buf_do_LRU_batch(max_n); flush_counters_t n;
buf_do_LRU_batch(max_n, evict, &n);
const ulint n_flushing= --buf_pool.n_flush_LRU_;
buf_pool.try_LRU_scan= true;
mysql_mutex_unlock(&buf_pool.mutex);
if (!n_flushing) if (n.evicted)
{ {
pthread_cond_broadcast(&buf_pool.done_flush_LRU); buf_pool.try_LRU_scan= true;
pthread_cond_signal(&buf_pool.done_free); pthread_cond_signal(&buf_pool.done_free);
} }
buf_dblwr.flush_buffered_writes(); if (!evict)
return n.flushed;
DBUG_PRINT("ib_buf", ("LRU flush completed, " ULINTPF " pages", n_flushed)); if (!--buf_pool.n_flush_LRU_)
return n_flushed; pthread_cond_broadcast(&buf_pool.done_flush_LRU);
return n.evicted + n.flushed;
} }
/** Initiate a log checkpoint, discarding the start of the log. /** Initiate a log checkpoint, discarding the start of the log.
...@@ -1854,7 +1856,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn) ...@@ -1854,7 +1856,9 @@ ATTRIBUTE_COLD void buf_flush_wait_flushed(lsn_t sync_lsn)
{ {
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn); ulint n_pages= buf_flush_list(srv_max_io_capacity, sync_lsn);
buf_flush_wait_batch_end_acquiring_mutex(false); mysql_mutex_lock(&buf_pool.mutex);
buf_flush_wait_batch_end(false);
mysql_mutex_unlock(&buf_pool.mutex);
if (n_pages) if (n_pages)
{ {
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE, MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_SYNC_TOTAL_PAGE,
...@@ -1920,17 +1924,6 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) ...@@ -1920,17 +1924,6 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious)
} }
} }
/** Wait for pending flushes to complete. */
void buf_flush_wait_batch_end_acquiring_mutex(bool lru)
{
if (lru ? buf_pool.n_flush_LRU() : buf_pool.n_flush_list())
{
mysql_mutex_lock(&buf_pool.mutex);
buf_flush_wait_batch_end(lru);
mysql_mutex_unlock(&buf_pool.mutex);
}
}
/** Conduct checkpoint-related flushing for innodb_flush_sync=ON, /** Conduct checkpoint-related flushing for innodb_flush_sync=ON,
and try to initiate checkpoints until the target is met. and try to initiate checkpoints until the target is met.
@param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */ @param lsn minimum value of buf_pool.get_oldest_modification(LSN_MAX) */
...@@ -2042,8 +2035,9 @@ af_get_pct_for_lsn( ...@@ -2042,8 +2035,9 @@ af_get_pct_for_lsn(
/ 7.5)); / 7.5));
} }
/** This function is called approximately once every second by the /** This function is called approximately once every second by
page_cleaner thread if innodb_adaptive_flushing=ON. buf_flush_page_cleaner() if innodb_max_dirty_pages_pct_lwm>0
and innodb_adaptive_flushing=ON.
Based on various factors it decides if there is a need to do flushing. Based on various factors it decides if there is a need to do flushing.
@return number of pages recommended to be flushed @return number of pages recommended to be flushed
@param last_pages_in number of pages flushed in previous batch @param last_pages_in number of pages flushed in previous batch
...@@ -2081,52 +2075,43 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, ...@@ -2081,52 +2075,43 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks); n_pages= std::min<ulint>(srv_io_capacity, dirty_blocks);
} }
func_exit:
page_cleaner.flush_pass++;
return n_pages; return n_pages;
} }
sum_pages += last_pages_in; sum_pages += last_pages_in;
double time_elapsed = difftime(curr_time, prev_time); const ulint time_elapsed = std::max<ulint>(curr_time - prev_time, 1);
/* We update our variables every srv_flushing_avg_loops /* We update our variables every innodb_flushing_avg_loops
iterations to smooth out transition in workload. */ iterations to smooth out transition in workload. */
if (++n_iterations >= srv_flushing_avg_loops if (++n_iterations >= srv_flushing_avg_loops
|| time_elapsed >= static_cast<double>(srv_flushing_avg_loops)) { || time_elapsed >= srv_flushing_avg_loops) {
if (time_elapsed < 1) {
time_elapsed = 1;
}
avg_page_rate = static_cast<ulint>( avg_page_rate = (sum_pages / time_elapsed + avg_page_rate) / 2;
((static_cast<double>(sum_pages)
/ time_elapsed)
+ static_cast<double>(avg_page_rate)) / 2);
/* How much LSN we have generated since last call. */ /* How much LSN we have generated since last call. */
lsn_rate = static_cast<lsn_t>( lsn_rate = (cur_lsn - prev_lsn) / time_elapsed;
static_cast<double>(cur_lsn - prev_lsn)
/ time_elapsed);
lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
ulint flush_tm = page_cleaner.flush_time; if (page_cleaner.flush_pass) {
ulint flush_pass = page_cleaner.flush_pass; page_cleaner.flush_time /= page_cleaner.flush_pass;
page_cleaner.flush_time = 0;
page_cleaner.flush_pass = 0;
if (flush_pass) {
flush_tm /= flush_pass;
} }
MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME, flush_tm);
MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, flush_pass);
prev_lsn = cur_lsn; prev_lsn = cur_lsn;
prev_time = curr_time; prev_time = curr_time;
n_iterations = 0; MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME,
page_cleaner.flush_time);
MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
page_cleaner.flush_pass);
page_cleaner.flush_time = 0;
page_cleaner.flush_pass = 0;
n_iterations = 0;
sum_pages = 0; sum_pages = 0;
} }
...@@ -2176,7 +2161,7 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in, ...@@ -2176,7 +2161,7 @@ static ulint page_cleaner_flush_pages_recommendation(ulint last_pages_in,
MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
return(n_pages); goto func_exit;
} }
/** page_cleaner thread tasked with flushing dirty pages from the buffer /** page_cleaner thread tasked with flushing dirty pages from the buffer
...@@ -2205,7 +2190,7 @@ static void buf_flush_page_cleaner() ...@@ -2205,7 +2190,7 @@ static void buf_flush_page_cleaner()
if (UNIV_UNLIKELY(lsn_limit != 0)) if (UNIV_UNLIKELY(lsn_limit != 0))
{ {
furious_flush: furious_flush:
if (UNIV_LIKELY(srv_flush_sync)) if (UNIV_LIKELY(srv_flush_sync))
{ {
buf_flush_sync_for_checkpoint(lsn_limit); buf_flush_sync_for_checkpoint(lsn_limit);
...@@ -2223,7 +2208,8 @@ static void buf_flush_page_cleaner() ...@@ -2223,7 +2208,8 @@ static void buf_flush_page_cleaner()
if (buf_pool.page_cleaner_idle() && if (buf_pool.page_cleaner_idle() &&
(!UT_LIST_GET_LEN(buf_pool.flush_list) || (!UT_LIST_GET_LEN(buf_pool.flush_list) ||
srv_max_dirty_pages_pct_lwm == 0.0)) srv_max_dirty_pages_pct_lwm == 0.0))
my_cond_wait(&buf_pool.do_flush_list, &buf_pool.flush_list_mutex.m_mutex); my_cond_wait(&buf_pool.do_flush_list,
&buf_pool.flush_list_mutex.m_mutex);
else else
my_cond_timedwait(&buf_pool.do_flush_list, my_cond_timedwait(&buf_pool.do_flush_list,
&buf_pool.flush_list_mutex.m_mutex, &abstime); &buf_pool.flush_list_mutex.m_mutex, &abstime);
...@@ -2251,19 +2237,25 @@ static void buf_flush_page_cleaner() ...@@ -2251,19 +2237,25 @@ static void buf_flush_page_cleaner()
/* wake up buf_flush_wait() */ /* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list); pthread_cond_broadcast(&buf_pool.done_flush_list);
} }
unemployed: unemployed:
buf_flush_async_lsn= 0; buf_flush_async_lsn= 0;
set_idle:
buf_pool.page_cleaner_set_idle(true); buf_pool.page_cleaner_set_idle(true);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
end_of_batch:
buf_dblwr.flush_buffered_writes();
do
{
DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;); DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", continue;);
DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;); DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", continue;);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!recv_recovery_is_on() && if (!recv_recovery_is_on() &&
!srv_startup_is_before_trx_rollback_phase && !srv_startup_is_before_trx_rollback_phase &&
srv_operation == SRV_OPERATION_NORMAL) srv_operation == SRV_OPERATION_NORMAL)
log_checkpoint(); log_checkpoint();
}
while (false);
mysql_mutex_lock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex);
continue; continue;
...@@ -2314,56 +2306,40 @@ static void buf_flush_page_cleaner() ...@@ -2314,56 +2306,40 @@ static void buf_flush_page_cleaner()
if (!lsn_limit) if (!lsn_limit)
lsn_limit= soft_lsn_limit; lsn_limit= soft_lsn_limit;
ulint n_flushed; ulint n_flushed= 0, n;
if (UNIV_UNLIKELY(lsn_limit != 0)) if (UNIV_UNLIKELY(lsn_limit != 0))
{ {
n_flushed= buf_flush_list(srv_max_io_capacity, lsn_limit); n= srv_max_io_capacity;
/* wake up buf_flush_wait() */ goto background_flush;
pthread_cond_broadcast(&buf_pool.done_flush_list);
goto try_checkpoint;
} }
else if (idle_flush || !srv_adaptive_flushing) else if (idle_flush || !srv_adaptive_flushing)
{ {
n_flushed= buf_flush_list(srv_io_capacity); n= srv_io_capacity;
try_checkpoint: lsn_limit= LSN_MAX;
if (n_flushed) background_flush:
{ mysql_mutex_lock(&buf_pool.mutex);
n_flushed= buf_flush_list_holding_mutex(n, lsn_limit);
/* wake up buf_flush_wait() */
pthread_cond_broadcast(&buf_pool.done_flush_list);
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
MONITOR_FLUSH_BACKGROUND_COUNT, MONITOR_FLUSH_BACKGROUND_COUNT,
MONITOR_FLUSH_BACKGROUND_PAGES, MONITOR_FLUSH_BACKGROUND_PAGES,
n_flushed); n_flushed);
do_checkpoint:
/* The periodic log_checkpoint() call here makes it harder to
reproduce bugs in crash recovery or mariabackup --prepare, or
in code that writes the redo log records. Omitting the call
here should not affect correctness, because log_free_check()
should still be invoking checkpoints when needed. */
DBUG_EXECUTE_IF("ib_log_checkpoint_avoid", goto next;);
DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto next;);
if (!recv_recovery_is_on() && srv_operation == SRV_OPERATION_NORMAL)
log_checkpoint();
} }
} else if ((n= page_cleaner_flush_pages_recommendation(last_pages,
else if (ulint n= page_cleaner_flush_pages_recommendation(last_pages,
oldest_lsn, oldest_lsn,
dirty_blocks, dirty_blocks,
dirty_pct)) dirty_pct)) != 0)
{ {
page_cleaner.flush_pass++;
const ulint tm= ut_time_ms(); const ulint tm= ut_time_ms();
last_pages= n_flushed= buf_flush_list(n); mysql_mutex_lock(&buf_pool.mutex);
last_pages= n_flushed= buf_flush_list_holding_mutex(n);
page_cleaner.flush_time+= ut_time_ms() - tm; page_cleaner.flush_time+= ut_time_ms() - tm;
if (n_flushed)
{
MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
MONITOR_FLUSH_ADAPTIVE_COUNT, MONITOR_FLUSH_ADAPTIVE_COUNT,
MONITOR_FLUSH_ADAPTIVE_PAGES, MONITOR_FLUSH_ADAPTIVE_PAGES,
n_flushed); n_flushed);
goto do_checkpoint;
}
} }
else if (buf_flush_async_lsn <= oldest_lsn) else if (buf_flush_async_lsn <= oldest_lsn)
{ {
...@@ -2371,24 +2347,29 @@ static void buf_flush_page_cleaner() ...@@ -2371,24 +2347,29 @@ static void buf_flush_page_cleaner()
goto unemployed; goto unemployed;
} }
#ifndef DBUG_OFF n= buf_flush_LRU(n >= n_flushed ? n - n_flushed : 0, false);
next: mysql_mutex_unlock(&buf_pool.mutex);
#endif /* !DBUG_OFF */ last_pages+= n;
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (!idle_flush)
goto end_of_batch;
/* when idle flushing kicks in page_cleaner is marked active. /* when idle flushing kicks in page_cleaner is marked active.
reset it back to idle since the it was made active as part of reset it back to idle since the it was made active as part of
idle flushing stage. */ idle flushing stage. */
if (idle_flush) mysql_mutex_lock(&buf_pool.flush_list_mutex);
buf_pool.page_cleaner_set_idle(true); goto set_idle;
} }
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (srv_fast_shutdown != 2) if (srv_fast_shutdown != 2)
{ {
buf_flush_wait_batch_end_acquiring_mutex(true); buf_dblwr.flush_buffered_writes();
buf_flush_wait_batch_end_acquiring_mutex(false); mysql_mutex_lock(&buf_pool.mutex);
buf_flush_wait_batch_end(true);
buf_flush_wait_batch_end(false);
mysql_mutex_unlock(&buf_pool.mutex);
} }
mysql_mutex_lock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex);
...@@ -2444,20 +2425,23 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool() ...@@ -2444,20 +2425,23 @@ ATTRIBUTE_COLD void buf_flush_buffer_pool()
while (buf_pool.get_oldest_modification(0)) while (buf_pool.get_oldest_modification(0))
{ {
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
buf_flush_list(srv_max_io_capacity); mysql_mutex_lock(&buf_pool.mutex);
if (buf_pool.n_flush_list()) buf_flush_list_holding_mutex(srv_max_io_capacity);
if (buf_pool.n_flush_list_)
{ {
mysql_mutex_unlock(&buf_pool.mutex);
timespec abstime; timespec abstime;
service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL, service_manager_extend_timeout(INNODB_EXTEND_TIMEOUT_INTERVAL,
"Waiting to flush " ULINTPF " pages", "Waiting to flush " ULINTPF " pages",
buf_flush_list_length()); buf_flush_list_length());
set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2); set_timespec(abstime, INNODB_EXTEND_TIMEOUT_INTERVAL / 2);
buf_dblwr.flush_buffered_writes();
mysql_mutex_lock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.mutex);
while (buf_pool.n_flush_list_) while (buf_pool.n_flush_list_)
my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex, my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.mutex.m_mutex,
&abstime); &abstime);
mysql_mutex_unlock(&buf_pool.mutex);
} }
mysql_mutex_unlock(&buf_pool.mutex);
mysql_mutex_lock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex);
} }
......
...@@ -402,6 +402,7 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex) ...@@ -402,6 +402,7 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex)
&& recv_sys.apply_log_recs) { && recv_sys.apply_log_recs) {
goto flush_lru; goto flush_lru;
}); });
get_mutex:
mysql_mutex_lock(&buf_pool.mutex); mysql_mutex_lock(&buf_pool.mutex);
got_mutex: got_mutex:
buf_LRU_check_size_of_non_data_objects(); buf_LRU_check_size_of_non_data_objects();
...@@ -490,15 +491,18 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex) ...@@ -490,15 +491,18 @@ buf_block_t *buf_LRU_get_free_block(bool have_mutex)
#ifndef DBUG_OFF #ifndef DBUG_OFF
flush_lru: flush_lru:
#endif #endif
if (!buf_flush_LRU(innodb_lru_flush_size)) { mysql_mutex_lock(&buf_pool.mutex);
if (!buf_flush_LRU(innodb_lru_flush_size, true)) {
MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
++flush_failures; ++flush_failures;
} }
n_iterations++; n_iterations++;
mysql_mutex_lock(&buf_pool.mutex);
buf_pool.stat.LRU_waits++; buf_pool.stat.LRU_waits++;
goto got_mutex; mysql_mutex_unlock(&buf_pool.mutex);
buf_dblwr.flush_buffered_writes();
goto get_mutex;
} }
/** Move the LRU_old pointer so that the length of the old blocks list /** Move the LRU_old pointer so that the length of the old blocks list
...@@ -807,50 +811,57 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ...@@ -807,50 +811,57 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip)
/* We cannot use transactional_lock_guard here, /* We cannot use transactional_lock_guard here,
because buf_buddy_relocate() in buf_buddy_free() could get stuck. */ because buf_buddy_relocate() in buf_buddy_free() could get stuck. */
hash_lock.lock(); hash_lock.lock();
lsn_t oldest_modification = bpage->oldest_modification_acquire(); const lsn_t oldest_modification = bpage->oldest_modification_acquire();
if (UNIV_UNLIKELY(!bpage->can_relocate())) { if (UNIV_UNLIKELY(!bpage->can_relocate())) {
/* Do not free buffer fixed and I/O-fixed blocks. */ /* Do not free buffer fixed and I/O-fixed blocks. */
goto func_exit; goto func_exit;
} }
if (oldest_modification == 1) { switch (oldest_modification) {
case 2:
ut_ad(id.space() == SRV_TMP_SPACE_ID);
ut_ad(!bpage->zip.data);
if (!bpage->is_freed()) {
goto func_exit;
}
bpage->clear_oldest_modification();
break;
case 1:
mysql_mutex_lock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex);
oldest_modification = bpage->oldest_modification(); if (const lsn_t om = bpage->oldest_modification()) {
if (oldest_modification) { ut_ad(om == 1);
ut_ad(oldest_modification == 1);
buf_pool.delete_from_flush_list(bpage); buf_pool.delete_from_flush_list(bpage);
} }
mysql_mutex_unlock(&buf_pool.flush_list_mutex); mysql_mutex_unlock(&buf_pool.flush_list_mutex);
ut_ad(!bpage->oldest_modification()); ut_ad(!bpage->oldest_modification());
oldest_modification = 0; /* fall through */
} case 0:
if (zip || !bpage->zip.data || !bpage->frame) {
if (zip || !bpage->zip.data) { break;
/* This would completely free the block. */
/* Do not completely free dirty blocks. */
if (oldest_modification) {
goto func_exit;
} }
} else if (oldest_modification && !bpage->frame) { relocate_compressed:
func_exit:
hash_lock.unlock();
return(false);
} else if (bpage->frame) {
b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b)); b = static_cast<buf_page_t*>(ut_zalloc_nokey(sizeof *b));
ut_a(b); ut_a(b);
mysql_mutex_lock(&buf_pool.flush_list_mutex); mysql_mutex_lock(&buf_pool.flush_list_mutex);
new (b) buf_page_t(*bpage); new (b) buf_page_t(*bpage);
b->frame = nullptr; b->frame = nullptr;
b->set_state(buf_page_t::UNFIXED + 1); b->set_state(buf_page_t::UNFIXED + 1);
break;
default:
if (zip || !bpage->zip.data || !bpage->frame) {
/* This would completely free the block. */
/* Do not completely free dirty blocks. */
func_exit:
hash_lock.unlock();
return(false);
}
goto relocate_compressed;
} }
mysql_mutex_assert_owner(&buf_pool.mutex); mysql_mutex_assert_owner(&buf_pool.mutex);
DBUG_PRINT("ib_buf", ("free page %u:%u", DBUG_PRINT("ib_buf", ("free page %u:%u", id.space(), id.page_no()));
id.space(), id.page_no()));
ut_ad(bpage->can_relocate()); ut_ad(bpage->can_relocate());
......
...@@ -724,13 +724,14 @@ class buf_page_t ...@@ -724,13 +724,14 @@ class buf_page_t
ut_ad(s < REINIT); ut_ad(s < REINIT);
} }
void read_unfix(uint32_t s) uint32_t read_unfix(uint32_t s)
{ {
ut_ad(lock.is_write_locked()); ut_ad(lock.is_write_locked());
ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1); ut_ad(s == UNFIXED + 1 || s == IBUF_EXIST + 1 || s == REINIT + 1);
ut_d(auto old_state=) zip.fix.fetch_add(s - READ_FIX); uint32_t old_state= zip.fix.fetch_add(s - READ_FIX);
ut_ad(old_state >= READ_FIX); ut_ad(old_state >= READ_FIX);
ut_ad(old_state < WRITE_FIX); ut_ad(old_state < WRITE_FIX);
return old_state + (s - READ_FIX);
} }
void set_freed(uint32_t prev_state, uint32_t count= 0) void set_freed(uint32_t prev_state, uint32_t count= 0)
...@@ -782,10 +783,10 @@ class buf_page_t ...@@ -782,10 +783,10 @@ class buf_page_t
inline void write_complete(bool temporary); inline void write_complete(bool temporary);
/** Write a flushable page to a file. buf_pool.mutex must be held. /** Write a flushable page to a file. buf_pool.mutex must be held.
@param lru true=buf_pool.LRU; false=buf_pool.flush_list @param evict whether to evict the page on write completion
@param space tablespace @param space tablespace
@return whether the page was flushed and buf_pool.mutex was released */ @return whether the page was flushed and buf_pool.mutex was released */
inline bool flush(bool lru, fil_space_t *space); inline bool flush(bool evict, fil_space_t *space);
/** Notify that a page in a temporary tablespace has been modified. */ /** Notify that a page in a temporary tablespace has been modified. */
void set_temp_modified() void set_temp_modified()
...@@ -1546,9 +1547,6 @@ class buf_pool_t ...@@ -1546,9 +1547,6 @@ class buf_pool_t
/** broadcast when n_flush_list reaches 0; protected by mutex */ /** broadcast when n_flush_list reaches 0; protected by mutex */
pthread_cond_t done_flush_list; pthread_cond_t done_flush_list;
TPOOL_SUPPRESS_TSAN ulint n_flush_LRU() const { return n_flush_LRU_; }
TPOOL_SUPPRESS_TSAN ulint n_flush_list() const { return n_flush_list_; }
/** @name General fields */ /** @name General fields */
/* @{ */ /* @{ */
ulint curr_pool_size; /*!< Current pool size in bytes */ ulint curr_pool_size; /*!< Current pool size in bytes */
...@@ -1755,7 +1753,7 @@ class buf_pool_t ...@@ -1755,7 +1753,7 @@ class buf_pool_t
last_activity_count= activity_count; last_activity_count= activity_count;
} }
// n_flush_LRU() + n_flush_list() // n_flush_LRU_ + n_flush_list_
// is approximately COUNT(is_write_fixed()) in flush_list // is approximately COUNT(is_write_fixed()) in flush_list
unsigned freed_page_clock;/*!< a sequence number used unsigned freed_page_clock;/*!< a sequence number used
...@@ -1785,7 +1783,8 @@ class buf_pool_t ...@@ -1785,7 +1783,8 @@ class buf_pool_t
UT_LIST_BASE_NODE_T(buf_page_t) free; UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free /*!< base node of the free
block list */ block list */
/** signaled each time when the free list grows; protected by mutex */ /** signaled each time when the free list grows and
broadcast each time try_LRU_scan is set; protected by mutex */
pthread_cond_t done_free; pthread_cond_t done_free;
UT_LIST_BASE_NODE_T(buf_page_t) withdraw; UT_LIST_BASE_NODE_T(buf_page_t) withdraw;
...@@ -1851,9 +1850,9 @@ class buf_pool_t ...@@ -1851,9 +1850,9 @@ class buf_pool_t
return any_pending; return any_pending;
} }
/** @return total amount of pending I/O */ /** @return total amount of pending I/O */
ulint io_pending() const TPOOL_SUPPRESS_TSAN ulint io_pending() const
{ {
return n_pend_reads + n_flush_LRU() + n_flush_list(); return n_pend_reads + n_flush_LRU_ + n_flush_list_;
} }
private: private:
...@@ -1886,34 +1885,12 @@ class buf_pool_t ...@@ -1886,34 +1885,12 @@ class buf_pool_t
/** array of slots */ /** array of slots */
buf_tmp_buffer_t *slots; buf_tmp_buffer_t *slots;
void create(ulint n_slots) void create(ulint n_slots);
{
this->n_slots= n_slots;
slots= static_cast<buf_tmp_buffer_t*>
(ut_malloc_nokey(n_slots * sizeof *slots));
memset((void*) slots, 0, n_slots * sizeof *slots);
}
void close() void close();
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
{
aligned_free(s->crypt_buf);
aligned_free(s->comp_buf);
}
ut_free(slots);
slots= nullptr;
n_slots= 0;
}
/** Reserve a buffer */ /** Reserve a buffer */
buf_tmp_buffer_t *reserve() buf_tmp_buffer_t *reserve();
{
for (buf_tmp_buffer_t *s= slots, *e= slots + n_slots; s != e; s++)
if (s->acquire())
return s;
return nullptr;
}
} io_buf; } io_buf;
/** whether resize() is in the critical path */ /** whether resize() is in the critical path */
...@@ -2002,7 +1979,10 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn) ...@@ -2002,7 +1979,10 @@ inline void buf_page_t::set_oldest_modification(lsn_t lsn)
/** Clear oldest_modification after removing from buf_pool.flush_list */ /** Clear oldest_modification after removing from buf_pool.flush_list */
inline void buf_page_t::clear_oldest_modification() inline void buf_page_t::clear_oldest_modification()
{ {
#ifdef SAFE_MUTEX
if (oldest_modification() != 2)
mysql_mutex_assert_owner(&buf_pool.flush_list_mutex); mysql_mutex_assert_owner(&buf_pool.flush_list_mutex);
#endif /* SAFE_MUTEX */
ut_d(const auto s= state()); ut_d(const auto s= state());
ut_ad(s >= REMOVE_HASH); ut_ad(s >= REMOVE_HASH);
ut_ad(oldest_modification()); ut_ad(oldest_modification());
......
...@@ -86,11 +86,15 @@ buf_flush_init_for_writing( ...@@ -86,11 +86,15 @@ buf_flush_init_for_writing(
bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr) bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed= nullptr)
MY_ATTRIBUTE((warn_unused_result)); MY_ATTRIBUTE((warn_unused_result));
/** Write out dirty blocks from buf_pool.LRU. /** Write out dirty blocks from buf_pool.LRU,
and move clean blocks to buf_pool.free.
The caller must invoke buf_dblwr.flush_buffered_writes()
after releasing buf_pool.mutex.
@param max_n wished maximum mumber of blocks flushed @param max_n wished maximum mumber of blocks flushed
@return the number of processed pages @param evict whether to evict pages after flushing
@return evict ? number of processed pages : number of pages written
@retval 0 if a buf_pool.LRU batch is already running */ @retval 0 if a buf_pool.LRU batch is already running */
ulint buf_flush_LRU(ulint max_n); ulint buf_flush_LRU(ulint max_n, bool evict);
/** Wait until a flush batch ends. /** Wait until a flush batch ends.
@param lru true=buf_pool.LRU; false=buf_pool.flush_list */ @param lru true=buf_pool.LRU; false=buf_pool.flush_list */
...@@ -131,9 +135,6 @@ inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end) ...@@ -131,9 +135,6 @@ inline void buf_flush_note_modification(buf_block_t *b, lsn_t start, lsn_t end)
/** Initialize page_cleaner. */ /** Initialize page_cleaner. */
ATTRIBUTE_COLD void buf_flush_page_cleaner_init(); ATTRIBUTE_COLD void buf_flush_page_cleaner_init();
/** Wait for pending flushes to complete. */
void buf_flush_wait_batch_end_acquiring_mutex(bool lru);
/** Flush the buffer pool on shutdown. */ /** Flush the buffer pool on shutdown. */
ATTRIBUTE_COLD void buf_flush_buffer_pool(); ATTRIBUTE_COLD void buf_flush_buffer_pool();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment