Commit 3613fb2a authored by Marko Mäkelä

MDEV-33112 innodb_undo_log_truncate=ON is blocking page write

When innodb_undo_log_truncate=ON causes an InnoDB undo tablespace
to be truncated, we must guarantee that the undo tablespace will
be rebuilt atomically: After mtr_t::commit_shrink() has durably
written the mini-transaction that rebuilds the undo tablespace,
we must not write any old pages to the tablespace.

To guarantee this, in trx_purge_truncate_history() we used to
traverse the entire buf_pool.flush_list in order to acquire
exclusive latches on all pages for the undo tablespace that
reside in the buffer pool, so that those pages cannot be written
and will be evicted during mtr_t::commit_shrink(). However, this
traversal may interfere with the page writing activity of
buf_flush_page_cleaner(). It would be better to lazily discard
the old pages of the truncated undo tablespace.

fil_space_t::is_being_truncated, fil_space_t::clear_stopping(): Remove.

fil_space_t::create_lsn: A new field, identifying the LSN of the
latest rebuild of a tablespace.

buf_page_t::flush(), buf_flush_try_neighbors(): Evict pages whose
FIL_PAGE_LSN is below fil_space_t::create_lsn.

mtr_t::commit_shrink(): Update fil_space_t::create_lsn and
fil_space_t::size right before the log is durably written and the
tablespace file is truncated.

fsp_page_create(), trx_purge_truncate_history(): Simplify the logic.

Reviewed by: Thirunarayanan Balathandayuthapani, Vladislav Lesin
Performance tested by: Axel Schwenke
Correctness tested by: Matthias Leich
parent 593278f9
......@@ -792,16 +792,20 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
ut_ad(space->referenced());
const auto s= state();
ut_a(s >= FREED);
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
ut_ad(lsn
? lsn >= oldest_modification() || oldest_modification() == 2
: space->purpose != FIL_TYPE_TABLESPACE);
if (s < UNFIXED)
{
ut_a(s >= FREED);
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN + (zip.data ? zip.data : frame)));
ut_ad(lsn >= oldest_modification());
freed:
if (lsn > log_sys.get_flushed_lsn())
{
mysql_mutex_unlock(&buf_pool.mutex);
......@@ -813,6 +817,12 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
return false;
}
if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
{
ut_ad(space->purpose == FIL_TYPE_TABLESPACE);
goto freed;
}
ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED);
ut_ad(f >= UNFIXED);
ut_ad(f < READ_FIX);
......@@ -907,16 +917,9 @@ bool buf_page_t::flush(bool evict, fil_space_t *space)
if ((s & LRU_MASK) == REINIT || !space->use_doublewrite())
{
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE))
{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>(FIL_PAGE_LSN +
(write_frame ? write_frame
: frame)));
ut_ad(lsn >= oldest_modification());
if (lsn > log_sys.get_flushed_lsn())
log_write_up_to(lsn, true);
}
if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE) &&
lsn > log_sys.get_flushed_lsn())
log_write_up_to(lsn, true);
space->io(IORequest{type, this, slot}, physical_offset(), size,
write_frame, this);
}
......@@ -1096,11 +1099,25 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
bool contiguous, bool evict,
ulint n_flushed, ulint n_to_flush)
{
mysql_mutex_unlock(&buf_pool.mutex);
ut_ad(space->id == page_id.space());
ut_ad(bpage->id() == page_id);
{
const lsn_t lsn=
mach_read_from_8(my_assume_aligned<8>
(FIL_PAGE_LSN +
(bpage->zip.data ? bpage->zip.data : bpage->frame)));
ut_ad(lsn >= bpage->oldest_modification());
if (UNIV_UNLIKELY(lsn < space->get_create_lsn()))
{
ut_a(!bpage->flush(evict, space));
mysql_mutex_unlock(&buf_pool.mutex);
return 0;
}
}
mysql_mutex_unlock(&buf_pool.mutex);
ulint count= 0;
page_id_t id= page_id;
page_id_t high= buf_flush_check_neighbors(*space, id, contiguous, evict);
......
......@@ -558,7 +558,7 @@ fil_space_extend_must_retry(
ut_ad(UT_LIST_GET_LAST(space->chain) == node);
ut_ad(size >= FIL_IBD_FILE_INITIAL_SIZE);
ut_ad(node->space == space);
ut_ad(space->referenced() || space->is_being_truncated);
ut_ad(space->referenced());
*success = space->size >= size;
......@@ -647,8 +647,7 @@ fil_space_extend_must_retry(
default:
ut_ad(space->purpose == FIL_TYPE_TABLESPACE
|| space->purpose == FIL_TYPE_IMPORT);
if (space->purpose == FIL_TYPE_TABLESPACE
&& !space->is_being_truncated) {
if (space->purpose == FIL_TYPE_TABLESPACE) {
goto do_flush;
}
break;
......@@ -733,12 +732,10 @@ bool fil_space_extend(fil_space_t *space, uint32_t size)
bool success= false;
const bool acquired= space->acquire();
mysql_mutex_lock(&fil_system.mutex);
if (acquired || space->is_being_truncated)
{
if (acquired)
while (fil_space_extend_must_retry(space, UT_LIST_GET_LAST(space->chain),
size, &success))
mysql_mutex_lock(&fil_system.mutex);
}
mysql_mutex_unlock(&fil_system.mutex);
if (acquired)
space->release();
......@@ -3058,11 +3055,9 @@ fil_space_validate_for_mtr_commit(
ut_ad(!is_predefined_tablespace(space->id));
/* We are serving mtr_commit(). While there is an active
mini-transaction, we should have !space->stop_new_ops. This is
mini-transaction, we should have !space->is_stopping(). This is
guaranteed by meta-data locks or transactional locks. */
ut_ad(!space->is_stopping()
|| space->is_being_truncated /* fil_truncate_prepare() */
|| space->referenced());
ut_ad(!space->is_stopping() || space->referenced());
}
#endif /* UNIV_DEBUG */
......
......@@ -1034,77 +1034,11 @@ static
buf_block_t*
fsp_page_create(fil_space_t *space, page_no_t offset, mtr_t *mtr)
{
buf_block_t *block;
if (UNIV_UNLIKELY(space->is_being_truncated))
{
const page_id_t page_id{space->id, offset};
uint32_t state;
block= mtr->get_already_latched(page_id, MTR_MEMO_PAGE_X_FIX);
if (block)
goto have_latch;
else
{
buf_pool_t::hash_chain &chain=
buf_pool.page_hash.cell_get(page_id.fold());
mysql_mutex_lock(&buf_pool.mutex);
block= reinterpret_cast<buf_block_t*>
(buf_pool.page_hash.get(page_id, chain));
if (!block)
{
mysql_mutex_unlock(&buf_pool.mutex);
goto create;
}
}
if (!mtr->have_x_latch(*block))
{
const bool got{block->page.lock.x_lock_try()};
mysql_mutex_unlock(&buf_pool.mutex);
if (!got)
{
block->page.lock.x_lock();
const page_id_t id{block->page.id()};
if (UNIV_UNLIKELY(id != page_id))
{
ut_ad(id.is_corrupted());
block->page.lock.x_unlock();
goto create;
}
}
state= block->page.fix() + 1;
mtr->memo_push(block, MTR_MEMO_PAGE_X_FIX);
}
else
{
mysql_mutex_unlock(&buf_pool.mutex);
have_latch:
state= block->page.state();
}
ut_ad(state > buf_page_t::FREED);
ut_ad(state < buf_page_t::READ_FIX);
ut_ad((state & buf_page_t::LRU_MASK) != buf_page_t::IBUF_EXIST);
ut_ad(block->page.lock.x_lock_count() == 1);
ut_ad(block->page.frame);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!block->index);
#endif
block->page.set_reinit(state < buf_page_t::UNFIXED
? buf_page_t::FREED
: (state & buf_page_t::LRU_MASK));
}
else
{
create:
buf_block_t *free_block= buf_LRU_get_free_block(false);
block= buf_page_create(space, static_cast<uint32_t>(offset),
space->zip_size(), mtr, free_block);
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
}
buf_block_t *free_block= buf_LRU_get_free_block(false);
buf_block_t *block= buf_page_create(space, static_cast<uint32_t>(offset),
space->zip_size(), mtr, free_block);
if (UNIV_UNLIKELY(block != free_block))
buf_pool.free_block(free_block);
fsp_init_file_page(space, block, mtr);
return block;
}
......@@ -1799,7 +1733,6 @@ fseg_create(fil_space_t *space, ulint byte_offset, mtr_t *mtr, dberr_t *err,
ut_d(const auto x = block->page.lock.x_lock_count());
ut_ad(x || block->page.lock.not_recursive());
ut_ad(x == 1 || space->is_being_truncated);
ut_ad(x <= 2);
ut_ad(!fil_page_get_type(block->page.frame));
mtr->write<1>(*block, FIL_PAGE_TYPE + 1 + block->page.frame,
......
......@@ -362,8 +362,6 @@ struct fil_space_t final
Protected by log_sys.mutex.
If and only if this is nonzero, the
tablespace will be in named_spaces. */
/** whether undo tablespace truncation is in progress */
bool is_being_truncated;
fil_type_t purpose;/*!< purpose */
UT_LIST_BASE_NODE_T(fil_node_t) chain;
/*!< base node for the file chain */
......@@ -442,6 +440,8 @@ struct fil_space_t final
/** LSN of freeing last page; protected by freed_range_mutex */
lsn_t last_freed_lsn;
/** LSN of undo tablespace creation or 0; protected by latch */
lsn_t create_lsn;
public:
/** @return whether doublewrite buffering is needed */
inline bool use_doublewrite() const;
......@@ -449,6 +449,12 @@ struct fil_space_t final
/** @return whether a page has been freed */
inline bool is_freed(uint32_t page);
/** Set create_lsn. */
inline void set_create_lsn(lsn_t lsn);
/** @return the latest tablespace rebuild LSN, or 0 */
lsn_t get_create_lsn() const { return create_lsn; }
/** Apply freed_ranges to the file.
@param writable whether the file is writable
@return number of pages written or hole-punched */
......@@ -526,9 +532,6 @@ struct fil_space_t final
/** Note that operations on the tablespace must stop. */
inline void set_stopping();
/** Note that operations on the tablespace can resume after truncation */
inline void clear_stopping();
/** Drop the tablespace and wait for any pending operations to cease
@param id tablespace identifier
@param detached_handle pointer to file to be closed later, or nullptr
......@@ -1625,14 +1628,6 @@ inline void fil_space_t::set_stopping()
#endif
}
/** Clear the STOPPING_WRITES flag in n_pending, so that operations on the
tablespace can resume after truncation.
The caller must own fil_system.mutex (asserted below). In debug builds it
is additionally asserted that, of the bits covered by STOPPING, only
STOPPING_WRITES was set before the update. */
inline void fil_space_t::clear_stopping()
{
mysql_mutex_assert_owner(&fil_system.mutex);
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
/* The subtraction is performed in all builds; only the capture of the
previous value (n) is debug-only. */
ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
ut_ad((n & STOPPING) == STOPPING_WRITES);
}
/** Flush pending writes from the file system cache to the file. */
template<bool have_reference> inline void fil_space_t::flush()
{
......
......@@ -89,8 +89,9 @@ struct mtr_t {
{ auto s= m_memo.size(); rollback_to_savepoint(s - 1, s); }
/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
@param space tablespace that is being shrunk
@param size new size in pages */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space, uint32_t size);
/** Commit a mini-transaction that is deleting or renaming a file.
@param space tablespace that is being renamed or deleted
......
......@@ -271,7 +271,7 @@ extern my_bool srv_undo_log_truncate;
extern my_bool srv_prefix_index_cluster_optimization;
/** Default size of UNDO tablespace (10MiB for innodb_page_size=16k) */
constexpr ulint SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
constexpr uint32_t SRV_UNDO_TABLESPACE_SIZE_IN_PAGES= (10U << 20) /
UNIV_PAGE_SIZE_DEF;
extern char* srv_log_group_home_dir;
......
......@@ -258,9 +258,21 @@ void mtr_t::rollback_to_savepoint(ulint begin, ulint end)
m_memo.erase(m_memo.begin() + begin, m_memo.begin() + end);
}
/** Set create_lsn, the LSN of the latest rebuild of this tablespace.
Debug assertions check that the caller holds the tablespace latch in
write mode (checked unless SUX_LOCK_GENERIC is defined) and owns
log_sys.mutex.
@param lsn  new value of create_lsn */
inline void fil_space_t::set_create_lsn(lsn_t lsn)
{
#ifndef SUX_LOCK_GENERIC
ut_ad(latch.is_write_locked());
#endif
/* Concurrent log_checkpoint_low() must be impossible. */
mysql_mutex_assert_owner(&log_sys.mutex);
create_lsn= lsn;
}
/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
void mtr_t::commit_shrink(fil_space_t &space)
@param space tablespace that is being shrunk
@param size new size in pages */
void mtr_t::commit_shrink(fil_space_t &space, uint32_t size)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
......@@ -278,16 +290,23 @@ void mtr_t::commit_shrink(fil_space_t &space)
const lsn_t start_lsn= do_write().first;
ut_d(m_log.erase());
fil_node_t *file= UT_LIST_GET_LAST(space.chain);
mysql_mutex_lock(&log_sys.flush_order_mutex);
mysql_mutex_lock(&fil_system.mutex);
ut_ad(file->is_open());
space.size= file->size= size;
space.set_create_lsn(m_commit_lsn);
mysql_mutex_unlock(&fil_system.mutex);
space.clear_freed_ranges();
/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();
os_file_truncate(space.chain.start->name, space.chain.start->handle,
os_offset_t{space.size} << srv_page_size_shift, true);
space.clear_freed_ranges();
os_offset_t{size} << srv_page_size_shift, true);
const page_id_t high{space.id, space.size};
const page_id_t high{space.id, size};
for (mtr_memo_slot_t &slot : m_memo)
{
......@@ -331,13 +350,6 @@ void mtr_t::commit_shrink(fil_space_t &space)
mysql_mutex_unlock(&log_sys.flush_order_mutex);
mysql_mutex_lock(&fil_system.mutex);
ut_ad(space.is_being_truncated);
ut_ad(space.is_stopping_writes());
space.clear_stopping();
space.is_being_truncated= false;
mysql_mutex_unlock(&fil_system.mutex);
release();
release_resources();
srv_stats.log_write_requests.inc();
......
......@@ -41,6 +41,7 @@ Created 3/26/1996 Heikki Tuuri
#include "dict0load.h"
#include <mysql/service_thd_mdl.h>
#include <mysql/service_wsrep.h>
#include "log.h"
/** Maximum allowable purge history length. <=0 means 'infinite'. */
ulong srv_max_purge_lag = 0;
......@@ -669,16 +670,9 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
rseg.latch.rd_unlock();
}
ib::info() << "Truncating " << file->name;
sql_print_information("InnoDB: Truncating %s", file->name);
trx_purge_cleanse_purge_queue(space);
log_free_check();
mtr_t mtr;
mtr.start();
mtr.x_lock_space(&space);
const auto space_id= space.id;
/* Lock all modified pages of the tablespace.
During truncation, we do not want any writes to the file.
......@@ -688,86 +682,12 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
discarding the to-be-trimmed pages without flushing would
break crash recovery. */
rescan:
if (UNIV_UNLIKELY(srv_shutdown_state != SRV_SHUTDOWN_NONE) &&
srv_fast_shutdown)
{
fast_shutdown:
mtr.commit();
return;
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); bpage; )
{
ut_ad(bpage->oldest_modification());
ut_ad(bpage->in_file());
buf_page_t *prev= UT_LIST_GET_PREV(list, bpage);
if (bpage->oldest_modification() > 2 && bpage->id().space() == space_id)
{
ut_ad(bpage->frame);
bpage->fix();
{
/* Try to acquire an exclusive latch while the cache line is
fresh after fix(). */
const bool got_lock{bpage->lock.x_lock_try()};
buf_pool.flush_hp.set(prev);
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (!got_lock)
bpage->lock.x_lock();
}
#ifdef BTR_CUR_HASH_ADAPT
/* There is no AHI on undo tablespaces. */
ut_ad(!reinterpret_cast<buf_block_t*>(bpage)->index);
#endif
ut_ad(!bpage->is_io_fixed());
ut_ad(bpage->id().space() == space_id);
if (bpage->oldest_modification() > 2 &&
!mtr.have_x_latch(*reinterpret_cast<buf_block_t*>(bpage)))
mtr.memo_push(reinterpret_cast<buf_block_t*>(bpage),
MTR_MEMO_PAGE_X_FIX);
else
{
bpage->unfix();
bpage->lock.x_unlock();
}
mysql_mutex_lock(&buf_pool.flush_list_mutex);
if (prev != buf_pool.flush_hp.get())
{
/* The functions buf_pool_t::release_freed_page() or
buf_do_flush_list_batch() may be right now holding
buf_pool.mutex and waiting to acquire
buf_pool.flush_list_mutex. Ensure that they can proceed,
to avoid extreme waits. */
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
mysql_mutex_lock(&buf_pool.mutex);
mysql_mutex_unlock(&buf_pool.mutex);
goto rescan;
}
}
bpage= prev;
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
if (UNIV_UNLIKELY(srv_shutdown_state != SRV_SHUTDOWN_NONE) &&
srv_fast_shutdown)
goto fast_shutdown;
/* Re-initialize tablespace, in a single mini-transaction. */
const ulint size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
/* Adjust the tablespace metadata. */
mysql_mutex_lock(&fil_system.mutex);
space.set_stopping();
space.is_being_truncated= true;
if (space.crypt_data)
{
space.reacquire();
......@@ -778,26 +698,20 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
else
mysql_mutex_unlock(&fil_system.mutex);
for (auto i= 6000; space.referenced();
std::this_thread::sleep_for(std::chrono::milliseconds(10)))
{
if (!--i)
{
mtr.commit();
ib::error() << "Failed to freeze UNDO tablespace " << file->name;
return;
}
}
/* Re-initialize tablespace, in a single mini-transaction. */
const uint32_t size= SRV_UNDO_TABLESPACE_SIZE_IN_PAGES;
log_free_check();
mtr_t mtr;
mtr.start();
mtr.x_lock_space(&space);
/* Associate the undo tablespace with mtr.
During mtr::commit_shrink(), InnoDB can use the undo
tablespace object to clear all freed ranges */
mtr.set_named_space(&space);
mtr.trim_pages(page_id_t(space.id, size));
ut_a(fsp_header_init(&space, size, &mtr) == DB_SUCCESS);
mysql_mutex_lock(&fil_system.mutex);
space.size= file->size= size;
mysql_mutex_unlock(&fil_system.mutex);
for (auto &rseg : trx_sys.rseg_array)
{
......@@ -823,7 +737,7 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
rseg.reinit(rblock->page.id().page_no());
}
mtr.commit_shrink(space);
mtr.commit_shrink(space, size);
/* No mutex; this is only updated by the purge coordinator. */
export_vars.innodb_undo_truncations++;
......@@ -840,11 +754,12 @@ TRANSACTIONAL_TARGET void trx_purge_truncate_history()
purge_sys.next_stored= false;
}
DBUG_EXECUTE_IF("ib_undo_trunc", ib::info() << "ib_undo_trunc";
DBUG_EXECUTE_IF("ib_undo_trunc",
sql_print_information("InnoDB: ib_undo_trunc");
log_buffer_flush_to_disk();
DBUG_SUICIDE(););
ib::info() << "Truncated " << file->name;
sql_print_information("InnoDB: Truncated %s", file->name);
purge_sys.truncate.last= purge_sys.truncate.current;
ut_ad(&space == purge_sys.truncate.current);
purge_sys.truncate.current= nullptr;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment