Commit b102872a authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-31767 InnoDB tables are being flagged as corrupted on an I/O bound server

The main problem is that ever since
commit aaef2e1d removed the
function buf_wait_for_read(), it is not safe to invoke
buf_page_get_low() with RW_NO_LATCH, that is, only buffer-fixing
the page. If a page read (or decryption or decompression) is in
progress, there would be a race condition when executing consistency
checks, and a page would wrongly be flagged as corrupted.

Furthermore, if the page is actually corrupted and the initial
access to it was with RW_NO_LATCH (only buffer-fixing), the
page read handler would likely end up in an infinite loop in
buf_pool_t::corrupted_evict(). It is not safe to invoke
mtr_t::upgrade_buffer_fix() on a block on which a page latch
was not initially acquired in buf_page_get_low().

btr_block_reget(): Remove the constant parameter rw_latch=RW_X_LATCH.

btr_block_get(): Assert that RW_NO_LATCH is not being used,
and change the parameter type of rw_latch.

btr_pcur_move_to_next_page(), innobase_table_is_empty(): Adjust for the
parameter type change of btr_block_get().

btr_root_block_get(): If mode==RW_NO_LATCH, do not check the integrity of
the page, because it is not safe to do so.

btr_page_alloc_low(), btr_page_free(): If the root page latch is not
previously held by the mini-transaction, invoke btr_root_block_get()
again with the proper latching mode.

btr_latch_prev(): Helper function to safely acquire a latch on a
preceding sibling page while holding a latch on a B-tree page.
To avoid deadlocks, we must not wait for the latch while holding
a latch on the current page, because another thread may be waiting
for our page latch when moving to the next page from our preceding
sibling page. If s_lock_try() or x_lock_try() on the preceding page fails,
we must release the current page latch, and wait for the latch on the
preceding page as well as the current page, in that order.
Page splits or merges will be prevented by the parent page latch
that we are holding.

btr_cur_t::search_leaf(): Make use of btr_latch_prev().

btr_cur_t::open_leaf(): Make use of btr_latch_prev(). Do not invoke
mtr_t::upgrade_buffer_fix() (when latch_mode == BTR_MODIFY_TREE),
because we will already have acquired all page latches upfront.

btr_cur_t::pessimistic_search_leaf(): Do acquire an exclusive index latch
before accessing the page. Make use of btr_latch_prev().
parent 9bb5b253
......@@ -218,9 +218,10 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
@param[out] err error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, ulint mode, bool merge,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err)
{
ut_ad(mode != RW_NO_LATCH);
dberr_t local_err;
if (!err)
err= &local_err;
......@@ -281,11 +282,13 @@ btr_root_block_get(
if (UNIV_LIKELY(block != nullptr))
{
if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
btr_page_get_index_id(block->page.frame) != index->id ||
!fil_page_index_page_check(block->page.frame) ||
index->is_spatial() !=
(fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
if (UNIV_UNLIKELY(mode == RW_NO_LATCH));
else if (!!page_is_comp(block->page.frame) !=
index->table->not_redundant() ||
btr_page_get_index_id(block->page.frame) != index->id ||
!fil_page_index_page_check(block->page.frame) ||
index->is_spatial() !=
(fil_page_get_type(block->page.frame) == FIL_PAGE_RTREE))
{
*err= DB_PAGE_CORRUPTED;
block= nullptr;
......@@ -515,18 +518,16 @@ static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
mini-transaction. */
static buf_block_t *
btr_block_reget(mtr_t *mtr, const dict_index_t &index,
const page_id_t id, rw_lock_type_t rw_latch,
dberr_t *err)
const page_id_t id, dberr_t *err)
{
if (buf_block_t *block=
mtr->get_already_latched(id, mtr_memo_type_t(rw_latch)))
if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX))
{
*err= DB_SUCCESS;
return block;
}
ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
return btr_block_get(index, id.page_no(), rw_latch, true, mtr, err);
return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
}
/**************************************************************//**
......@@ -585,21 +586,15 @@ btr_page_alloc_low(
if (UNIV_UNLIKELY(!root))
return root;
if (mtr->have_u_or_x_latch(*root))
{
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!root->index || !root->index->freed());
#endif
mtr->rollback_to_savepoint(savepoint);
}
else
{
mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
root->page.lock.u_lock();
const bool have_latch= mtr->have_u_or_x_latch(*root);
#ifdef BTR_CUR_HASH_ADAPT
btr_search_drop_page_hash_index(root, true);
ut_ad(!have_latch || !root->index || !root->index->freed());
#endif
}
mtr->rollback_to_savepoint(savepoint);
if (!have_latch &&
UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err))))
return root;
fseg_header_t *seg_header= root->page.frame +
(level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF);
......@@ -696,26 +691,18 @@ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
const auto savepoint= mtr->get_savepoint();
if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err))
{
if (mtr->have_u_or_x_latch(*root))
{
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!root->index || !root->index->freed());
#endif
mtr->rollback_to_savepoint(savepoint);
}
else
{
mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
root->page.lock.u_lock();
const bool have_latch= mtr->have_u_or_x_latch(*root);
#ifdef BTR_CUR_HASH_ADAPT
btr_search_drop_page_hash_index(root, true);
ut_ad(!have_latch || !root->index || !root->index->freed());
#endif
}
err= fseg_free_page(&root->page.frame[blob ||
page_is_leaf(block->page.frame)
? PAGE_HEADER + PAGE_BTR_SEG_LEAF
: PAGE_HEADER + PAGE_BTR_SEG_TOP],
space, page, mtr, space_latched);
mtr->rollback_to_savepoint(savepoint);
if (have_latch ||
(root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err)))
err= fseg_free_page(&root->page.frame[blob ||
page_is_leaf(block->page.frame)
? PAGE_HEADER + PAGE_BTR_SEG_LEAF
: PAGE_HEADER + PAGE_BTR_SEG_TOP],
space, page, mtr, space_latched);
}
if (err == DB_SUCCESS)
buf_page_free(space, page, mtr);
......@@ -4293,7 +4280,7 @@ btr_discard_page(
if (left_page_no != FIL_NULL) {
merge_page_id.set_page_no(left_page_no);
merge_block = btr_block_reget(mtr, *index, merge_page_id,
RW_X_LATCH, &err);
&err);
if (UNIV_UNLIKELY(!merge_block)) {
return err;
}
......@@ -4319,7 +4306,7 @@ btr_discard_page(
} else if (right_page_no != FIL_NULL) {
merge_page_id.set_page_no(right_page_no);
merge_block = btr_block_reget(mtr, *index, merge_page_id,
RW_X_LATCH, &err);
&err);
if (UNIV_UNLIKELY(!merge_block)) {
return err;
}
......
This diff is collapsed.
......@@ -540,7 +540,8 @@ btr_pcur_move_to_next_page(
dberr_t err;
buf_block_t* next_block = btr_block_get(
*cursor->index(), next_page_no, cursor->latch_mode & ~12,
*cursor->index(), next_page_no,
rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)),
page_is_leaf(page), mtr, &err);
if (UNIV_UNLIKELY(!next_block)) {
......
......@@ -2156,8 +2156,7 @@ static bool innobase_table_is_empty(const dict_table_t *table,
}
next_page= false;
block= btr_block_get(*clust_index, next_page_no, BTR_SEARCH_LEAF, false,
&mtr);
block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, false, &mtr);
if (!block)
goto non_empty;
page_cur_set_before_first(block, cur);
......
......@@ -91,7 +91,7 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
@param[out] err error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, ulint mode, bool merge,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err= nullptr);
/**************************************************************//**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment