Commit b102872a authored by Marko Mäkelä

MDEV-31767 InnoDB tables are being flagged as corrupted on an I/O bound server

The main problem is that ever since
commit aaef2e1d removed the
function buf_wait_for_read(), it has not been safe to invoke
buf_page_get_low() with RW_NO_LATCH, that is, only buffer-fixing
the page. If a page read (or decryption or decompression) is in
progress, there would be a race condition when executing consistency
checks, and a page would wrongly be flagged as corrupted.

Furthermore, if the page is actually corrupted and the initial
access to it was with RW_NO_LATCH (only buffer-fixing), the
page read handler would likely end up in an infinite loop in
buf_pool_t::corrupted_evict(). It is not safe to invoke
mtr_t::upgrade_buffer_fix() on a block on which a page latch
was not initially acquired in buf_page_get_low().

btr_block_reget(): Remove the constant parameter rw_latch=RW_X_LATCH.

btr_block_get(): Assert that RW_NO_LATCH is not being used,
and change the parameter type of rw_latch.

btr_pcur_move_to_next_page(), innobase_table_is_empty(): Adjust for the
parameter type change of btr_block_get().

btr_root_block_get(): If mode==RW_NO_LATCH, do not check the integrity of
the page, because it is not safe to do so.

btr_page_alloc_low(), btr_page_free(): If the root page latch is not
previously held by the mini-transaction, invoke btr_root_block_get()
again with the proper latching mode.

btr_latch_prev(): Helper function to safely acquire a latch on a
preceding sibling page while holding a latch on a B-tree page.
To avoid deadlocks, we must not wait for the latch while holding
a latch on the current page, because another thread may be waiting
for our page latch when moving to the next page from our preceding
sibling page. If s_lock_try() or x_lock_try() on the preceding page fails,
we must release the current page latch, and wait for the latch on the
preceding page as well as the current page, in that order.
Page splits or merges will be prevented by the parent page latch
that we are holding.

btr_cur_t::search_leaf(): Make use of btr_latch_prev().

btr_cur_t::open_leaf(): Make use of btr_latch_prev(). Do not invoke
mtr_t::upgrade_buffer_fix() (when latch_mode == BTR_MODIFY_TREE),
because we will already have acquired all page latches upfront.

btr_cur_t::pessimistic_search_leaf(): Do acquire an exclusive index latch
before accessing the page. Make use of btr_latch_prev().
parent 9bb5b253
......@@ -218,9 +218,10 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index)
@param[out] err error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, ulint mode, bool merge,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err)
{
ut_ad(mode != RW_NO_LATCH);
dberr_t local_err;
if (!err)
err= &local_err;
......@@ -281,7 +282,9 @@ btr_root_block_get(
if (UNIV_LIKELY(block != nullptr))
{
if (!!page_is_comp(block->page.frame) != index->table->not_redundant() ||
if (UNIV_UNLIKELY(mode == RW_NO_LATCH));
else if (!!page_is_comp(block->page.frame) !=
index->table->not_redundant() ||
btr_page_get_index_id(block->page.frame) != index->id ||
!fil_page_index_page_check(block->page.frame) ||
index->is_spatial() !=
......@@ -515,18 +518,16 @@ static buf_block_t *btr_get_latched_root(const dict_index_t &index, mtr_t *mtr)
mini-transaction. */
static buf_block_t *
btr_block_reget(mtr_t *mtr, const dict_index_t &index,
const page_id_t id, rw_lock_type_t rw_latch,
dberr_t *err)
const page_id_t id, dberr_t *err)
{
if (buf_block_t *block=
mtr->get_already_latched(id, mtr_memo_type_t(rw_latch)))
if (buf_block_t *block= mtr->get_already_latched(id, MTR_MEMO_PAGE_X_FIX))
{
*err= DB_SUCCESS;
return block;
}
ut_ad(mtr->memo_contains_flagged(&index.lock, MTR_MEMO_X_LOCK));
return btr_block_get(index, id.page_no(), rw_latch, true, mtr, err);
return btr_block_get(index, id.page_no(), RW_X_LATCH, true, mtr, err);
}
/**************************************************************//**
......@@ -585,21 +586,15 @@ btr_page_alloc_low(
if (UNIV_UNLIKELY(!root))
return root;
if (mtr->have_u_or_x_latch(*root))
{
const bool have_latch= mtr->have_u_or_x_latch(*root);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!root->index || !root->index->freed());
ut_ad(!have_latch || !root->index || !root->index->freed());
#endif
mtr->rollback_to_savepoint(savepoint);
}
else
{
mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
root->page.lock.u_lock();
#ifdef BTR_CUR_HASH_ADAPT
btr_search_drop_page_hash_index(root, true);
#endif
}
if (!have_latch &&
UNIV_UNLIKELY(!(root= btr_root_block_get(index, RW_SX_LATCH, mtr, err))))
return root;
fseg_header_t *seg_header= root->page.frame +
(level ? PAGE_HEADER + PAGE_BTR_SEG_TOP : PAGE_HEADER + PAGE_BTR_SEG_LEAF);
......@@ -696,21 +691,13 @@ dberr_t btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr,
const auto savepoint= mtr->get_savepoint();
if (buf_block_t *root= btr_root_block_get(index, RW_NO_LATCH, mtr, &err))
{
if (mtr->have_u_or_x_latch(*root))
{
const bool have_latch= mtr->have_u_or_x_latch(*root);
#ifdef BTR_CUR_HASH_ADAPT
ut_ad(!root->index || !root->index->freed());
ut_ad(!have_latch || !root->index || !root->index->freed());
#endif
mtr->rollback_to_savepoint(savepoint);
}
else
{
mtr->lock_register(savepoint, MTR_MEMO_PAGE_SX_FIX);
root->page.lock.u_lock();
#ifdef BTR_CUR_HASH_ADAPT
btr_search_drop_page_hash_index(root, true);
#endif
}
if (have_latch ||
(root= btr_root_block_get(index, RW_SX_LATCH, mtr, &err)))
err= fseg_free_page(&root->page.frame[blob ||
page_is_leaf(block->page.frame)
? PAGE_HEADER + PAGE_BTR_SEG_LEAF
......@@ -4293,7 +4280,7 @@ btr_discard_page(
if (left_page_no != FIL_NULL) {
merge_page_id.set_page_no(left_page_no);
merge_block = btr_block_reget(mtr, *index, merge_page_id,
RW_X_LATCH, &err);
&err);
if (UNIV_UNLIKELY(!merge_block)) {
return err;
}
......@@ -4319,7 +4306,7 @@ btr_discard_page(
} else if (right_page_no != FIL_NULL) {
merge_page_id.set_page_no(right_page_no);
merge_block = btr_block_reget(mtr, *index, merge_page_id,
RW_X_LATCH, &err);
&err);
if (UNIV_UNLIKELY(!merge_block)) {
return err;
}
......
......@@ -938,6 +938,76 @@ static inline page_cur_mode_t btr_cur_nonleaf_mode(page_cur_mode_t mode)
return PAGE_CUR_LE;
}
static MY_ATTRIBUTE((nonnull))
/** Acquire a latch on the preceding sibling page without violating the
latching order (latches must normally be acquired left to right).
We first attempt a non-blocking latch acquisition; if that fails, we
release the latch on the current block and then wait for the latches in
the correct order: previous page first, then the current page again.
Page splits or merges during that window are prevented by the parent
page latch that the caller is holding.
@param block    index page, latched by the caller with rw_latch
@param page_id  page identifier with valid space identifier
@param zip_size ROW_FORMAT=COMPRESSED page size, or 0
@param rw_latch the latch held on block (RW_S_LATCH or RW_X_LATCH)
@param mtr      mini-transaction
@param err      error code
@retval 0 if an error occurred
@retval 1 if the page could be latched in the wrong order
@retval -1 if the latch on block was temporarily released */
int btr_latch_prev(buf_block_t *block, page_id_t page_id, ulint zip_size,
                   rw_lock_type_t rw_latch, mtr_t *mtr, dberr_t *err)
{
  ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH);
  ut_ad(page_id.space() == block->page.id().space());

  /* block must be the most recently latched page in mtr, so that the
  memo entries we register below land at the expected savepoints. */
  const auto prev_savepoint= mtr->get_savepoint();
  ut_ad(block == mtr->at_savepoint(prev_savepoint - 1));

  page_id.set_page_no(btr_page_get_prev(block->page.frame));

  /* Buffer-fix the previous page first (RW_NO_LATCH: no page latch yet),
  so that we can attempt a non-blocking latch on it below. */
  buf_block_t *prev= buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, nullptr,
                                      BUF_GET, mtr, err, false);
  if (UNIV_UNLIKELY(!prev))
    return 0;

  int ret= 1;

  /* Try to latch the previous page without waiting. Waiting here while
  holding the latch on block could deadlock against a thread that is
  moving right from the previous page and waiting for our latch. */
  if (UNIV_UNLIKELY(rw_latch == RW_S_LATCH))
  {
    if (UNIV_LIKELY(prev->page.lock.s_lock_try()))
    {
      mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_S_FIX);
      goto prev_latched;
    }
    block->page.lock.s_unlock();
  }
  else
  {
    if (UNIV_LIKELY(prev->page.lock.x_lock_try()))
    {
      mtr->lock_register(prev_savepoint, MTR_MEMO_PAGE_X_FIX);
      goto prev_latched;
    }
    block->page.lock.x_unlock();
  }

  /* The non-blocking attempt failed; we released our latch on block
  above and now acquire both latches in left-to-right order. */
  ret= -1;
  /* Downgrade our memo entry for block to a plain buffer-fix, so that
  the rollback below does not release block itself. */
  mtr->lock_register(prev_savepoint - 1, MTR_MEMO_BUF_FIX);
  mtr->rollback_to_savepoint(prev_savepoint);
  /* Wait for the latch on the previous page (passing prev as the guess;
  this re-registers it in the memo at prev_savepoint). */
  prev= buf_page_get_gen(page_id, zip_size, rw_latch, prev,
                         BUF_GET, mtr, err, false);
  if (UNIV_UNLIKELY(!prev))
    return 0;
  /* Re-acquire the latch on block, upgrading the buffer-fix we kept. */
  mtr->upgrade_buffer_fix(prev_savepoint - 1, rw_latch);

prev_latched:
  /* Consistency check: a valid left sibling must have the same page
  type, index identifier and ROW_FORMAT compactness as block; a mismatch
  indicates a corrupted FIL_PAGE_PREV pointer or sibling page. */
  if (memcmp_aligned<2>(FIL_PAGE_TYPE + prev->page.frame,
                        FIL_PAGE_TYPE + block->page.frame, 2) ||
      memcmp_aligned<2>(PAGE_HEADER + PAGE_INDEX_ID + prev->page.frame,
                        PAGE_HEADER + PAGE_INDEX_ID + block->page.frame, 8) ||
      page_is_comp(prev->page.frame) != page_is_comp(block->page.frame))
  {
    /* Debug builds fail loudly here; release builds report corruption.
    FIXME: remove this debug assertion once the fix has been validated. */
    ut_ad("corrupted" == 0); // FIXME: remove this
    *err= DB_CORRUPTION;
    ret= 0;
  }

  return ret;
}
dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
btr_latch_mode latch_mode, mtr_t *mtr)
{
......@@ -1192,11 +1262,12 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
page_cur.block= block;
ut_ad(block == mtr->at_savepoint(block_savepoint));
ut_ad(rw_latch != RW_NO_LATCH);
#ifdef UNIV_ZIP_DEBUG
if (rw_latch == RW_NO_LATCH);
else if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
if (const page_zip_des_t *page_zip= buf_block_get_page_zip(block))
ut_a(page_zip_validate(page_zip, block->page.frame, index()));
#endif /* UNIV_ZIP_DEBUG */
const uint32_t page_level= btr_page_get_level(block->page.frame);
if (height == ULINT_UNDEFINED)
......@@ -1240,7 +1311,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
goto reached_index_root_and_leaf;
goto reached_root_and_leaf;
case RW_NO_LATCH:
ut_ad(mtr->memo_contains_flagged(&index()->lock, MTR_MEMO_X_LOCK));
ut_ad(0);
}
goto reached_leaf;
}
......@@ -1257,14 +1328,8 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
if (tree_height <= height + 2)
/* Retain the root page latch. */
break;
goto release_parent_page;
/* fall through */
default:
if (rw_latch == RW_NO_LATCH)
{
ut_ad(!height);
break;
}
release_parent_page:
ut_ad(block_savepoint > savepoint);
mtr->rollback_to_savepoint(block_savepoint - 1, block_savepoint);
block_savepoint--;
......@@ -1301,29 +1366,20 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
static_assert(BTR_MODIFY_PREV & BTR_MODIFY_LEAF, "");
static_assert(BTR_SEARCH_PREV & BTR_SEARCH_LEAF, "");
ut_ad(!latch_by_caller);
ut_ad(rw_latch ==
rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH)));
if (rw_latch == RW_NO_LATCH)
{
/* latch also siblings from left to right */
rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
if (page_has_prev(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_prev(block->page.frame),
rw_latch, false, mtr, &err))
!btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
goto func_exit;
mtr->upgrade_buffer_fix(block_savepoint, rw_latch);
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
rw_latch, false, mtr, &err))
goto func_exit;
}
goto release_tree;
case BTR_SEARCH_LEAF:
case BTR_MODIFY_LEAF:
if (rw_latch == RW_NO_LATCH)
{
ut_ad(index()->is_ibuf());
mtr->upgrade_buffer_fix(block_savepoint, rw_lock_type_t(latch_mode));
}
if (!latch_by_caller)
{
release_tree:
......@@ -1337,13 +1393,11 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
break;
default:
ut_ad(latch_mode == BTR_MODIFY_TREE);
ut_ad(rw_latch == RW_NO_LATCH);
ut_ad(rw_latch == RW_X_LATCH);
/* x-latch also siblings from left to right */
if (page_has_prev(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_prev(block->page.frame),
RW_X_LATCH, false, mtr, &err))
!btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err))
goto func_exit;
mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
RW_X_LATCH, false, mtr, &err))
......@@ -1491,25 +1545,15 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
page_rec_is_first(page_cur.rec, block->page.frame))
{
ut_ad(block_savepoint + 1 == mtr->get_savepoint());
/* Latch the previous page if the node pointer is the leftmost
of the current page. */
buf_block_t *left= btr_block_get(*index(),
btr_page_get_prev(block->page.frame),
RW_NO_LATCH, false, mtr, &err);
if (UNIV_UNLIKELY(!left))
int ret= btr_latch_prev(block, page_id, zip_size, rw_latch, mtr, &err);
if (!ret)
goto func_exit;
ut_ad(block_savepoint + 2 == mtr->get_savepoint());
if (UNIV_LIKELY(left->page.lock.s_lock_try()))
mtr->lock_register(block_savepoint + 1, MTR_MEMO_PAGE_S_FIX);
else
if (ret < 0)
{
if (rw_latch == RW_S_LATCH)
block->page.lock.s_unlock();
else
block->page.lock.x_unlock();
mtr->upgrade_buffer_fix(block_savepoint + 1, RW_S_LATCH);
mtr->lock_register(block_savepoint, MTR_MEMO_BUF_FIX);
mtr->upgrade_buffer_fix(block_savepoint, RW_S_LATCH);
/* While our latch on the level-2 page prevents splits or
merges of this level-1 block, other threads may have
modified it due to splitting or merging some level-0 (leaf)
......@@ -1524,13 +1568,12 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
offsets));
}
}
goto leaf_with_no_latch;
rw_latch= rw_lock_type_t(latch_mode & (RW_X_LATCH | RW_S_LATCH));
break;
case BTR_MODIFY_LEAF:
case BTR_SEARCH_LEAF:
if (index()->is_ibuf())
goto leaf_with_no_latch;
rw_latch= rw_lock_type_t(latch_mode);
if (btr_op != BTR_NO_OP &&
if (btr_op != BTR_NO_OP && !index()->is_ibuf() &&
ibuf_should_try(index(), btr_op != BTR_INSERT_OP))
/* Try to buffer the operation if the leaf page
is not in the buffer pool. */
......@@ -1550,10 +1593,9 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode,
mtr->rollback_to_savepoint(block_savepoint);
goto need_opposite_intention;
}
/* fall through */
break;
default:
leaf_with_no_latch:
rw_latch= RW_NO_LATCH;
ut_ad(rw_latch == RW_X_LATCH);
}
}
......@@ -1579,7 +1621,7 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
ut_ad(!index()->is_ibuf() || ibuf_inside(mtr));
rec_offs offsets_[REC_OFFS_NORMAL_SIZE];
rec_offs* offsets = offsets_;
rec_offs* offsets= offsets_;
rec_offs_init(offsets_);
ut_ad(flag == BTR_CUR_BINARY);
......@@ -1653,9 +1695,8 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
/* Go to the child node */
page_id.set_page_no(btr_node_ptr_get_child_page_no(page_cur.rec, offsets));
const auto block_savepoint= mtr->get_savepoint();
block=
buf_page_get_gen(page_id, block->zip_size(), RW_NO_LATCH, nullptr, BUF_GET,
buf_page_get_gen(page_id, block->zip_size(), RW_X_LATCH, nullptr, BUF_GET,
mtr, &err, !--height && !index()->is_clust());
if (!block)
......@@ -1674,15 +1715,15 @@ dberr_t btr_cur_t::pessimistic_search_leaf(const dtuple_t *tuple,
if (height != btr_page_get_level(block->page.frame))
goto corrupted;
if (page_has_prev(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_prev(block->page.frame),
RW_X_LATCH, false, mtr, &err))
goto func_exit;
mtr->upgrade_buffer_fix(block_savepoint, RW_X_LATCH);
#ifdef UNIV_ZIP_DEBUG
const page_zip_des_t *page_zip= buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, block->page.frame, index()));
#endif /* UNIV_ZIP_DEBUG */
if (page_has_prev(block->page.frame) &&
!btr_latch_prev(block, page_id, block->zip_size(),
RW_X_LATCH, mtr, &err))
goto func_exit;
if (page_has_next(block->page.frame) &&
!btr_block_get(*index(), btr_page_get_next(block->page.frame),
RW_X_LATCH, false, mtr, &err))
......@@ -1895,13 +1936,10 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
ut_ad(n_blocks < BTR_MAX_LEVELS);
ut_ad(savepoint + n_blocks == mtr->get_savepoint());
const rw_lock_type_t rw_latch= height && latch_mode != BTR_MODIFY_TREE
? upper_rw_latch
: RW_NO_LATCH;
buf_block_t* block=
btr_block_get(*index, page, rw_latch, !height && !index->is_clust(), mtr,
&err);
btr_block_get(*index, page,
height ? upper_rw_latch : root_leaf_rw_latch,
!height, mtr, &err);
ut_ad(!block == (err != DB_SUCCESS));
if (!block)
......@@ -1943,13 +1981,11 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
if (latch_mode == BTR_MODIFY_TREE)
{
ut_ad(rw_latch == RW_NO_LATCH);
/* x-latch also siblings from left to right */
if (page_has_prev(block->page.frame) &&
!btr_block_get(*index, btr_page_get_prev(block->page.frame),
RW_X_LATCH, false, mtr, &err))
!btr_latch_prev(block, block->page.id(), zip_size, RW_X_LATCH,
mtr, &err))
break;
mtr->upgrade_buffer_fix(leaf_savepoint - 1, RW_X_LATCH);
if (page_has_next(block->page.frame) &&
!btr_block_get(*index, btr_page_get_next(block->page.frame),
RW_X_LATCH, false, mtr, &err))
......@@ -1964,10 +2000,6 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
}
else
{
if (rw_latch == RW_NO_LATCH)
mtr->upgrade_buffer_fix(leaf_savepoint - 1,
rw_lock_type_t(latch_mode &
(RW_X_LATCH | RW_S_LATCH)));
if (latch_mode != BTR_CONT_MODIFY_TREE)
{
ut_ad(latch_mode == BTR_MODIFY_LEAF ||
......@@ -2037,21 +2069,6 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index,
n_blocks= 1;
}
}
if (!height)
{
if (page == index->page)
mtr->upgrade_buffer_fix(savepoint, RW_X_LATCH);
else
{
/* The U-latch protects BTR_SEG_HEAP, BTR_SEG_TOP. */
mtr->upgrade_buffer_fix(savepoint, RW_SX_LATCH);
/* Upgrade buffer-fix to exclusive latches on all remaining pages. */
for (ulint i= 1; i <= n_blocks; i++)
mtr->upgrade_buffer_fix(savepoint + i, RW_X_LATCH);
}
}
}
/* Go to the child node */
......
......@@ -540,7 +540,8 @@ btr_pcur_move_to_next_page(
dberr_t err;
buf_block_t* next_block = btr_block_get(
*cursor->index(), next_page_no, cursor->latch_mode & ~12,
*cursor->index(), next_page_no,
rw_lock_type_t(cursor->latch_mode & (RW_X_LATCH | RW_S_LATCH)),
page_is_leaf(page), mtr, &err);
if (UNIV_UNLIKELY(!next_block)) {
......
......@@ -2156,8 +2156,7 @@ static bool innobase_table_is_empty(const dict_table_t *table,
}
next_page= false;
block= btr_block_get(*clust_index, next_page_no, BTR_SEARCH_LEAF, false,
&mtr);
block= btr_block_get(*clust_index, next_page_no, RW_S_LATCH, false, &mtr);
if (!block)
goto non_empty;
page_cur_set_before_first(block, cur);
......
......@@ -91,7 +91,7 @@ ATTRIBUTE_COLD void btr_decryption_failed(const dict_index_t &index);
@param[out] err error code
@return block */
buf_block_t *btr_block_get(const dict_index_t &index,
uint32_t page, ulint mode, bool merge,
uint32_t page, rw_lock_type_t mode, bool merge,
mtr_t *mtr, dberr_t *err= nullptr);
/**************************************************************//**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment