Commit 165564d3 authored by Marko Mäkelä

MDEV-30009 InnoDB shutdown hangs when the change buffer is corrupted

The InnoDB change buffer (ibuf.index, stored in the system tablespace)
and the change buffer bitmaps in persistent tablespaces could get out
of sync with each other: According to the bitmap, no changes exist for
a page, while there actually exist buffered entries in ibuf.index.

InnoDB performs lazy deletion of buffered changes. When a secondary
index leaf page is freed (possibly as part of DROP INDEX), any
buffered changes are not deleted right away. Instead, they are deleted
on a subsequent call to buf_page_create_low().

One scenario where InnoDB failed to delete buffered changes is
as follows:
1. Some changes were buffered for a secondary index leaf page.
2. The index page had been freed.
3. ibuf_read_merge_pages() invoked ibuf_merge_or_delete_for_page(),
which noticed that the page had been freed, and reset the change buffer
bits, but did not delete the records from ibuf.index.
4. The index page was reallocated for something else.
5. The index page was removed from the buffer pool.
6. Some changes were buffered for the newly created page.
7. Finally, the buffered changes from both 1. and 6. were merged.
8. The index is corrupted.

An alternative outcome is:
4. Shutdown with innodb_fast_shutdown=0 gets into an infinite loop.

An alternative scenario is:
3. ibuf_set_bitmap_for_bulk_load() reset the IBUF_BITMAP_BUFFERED bit
but did not delete the ibuf.index records for that page number.

The shutdown hang was already fixed once in
commit d7a24017; that fix was refactored for
10.5 in commit 77e8a311 and then
disabled in commit 310dff5d
because it introduced corruption by itself.

We will fix this as follows:

ibuf_delete_recs(): Delete all ibuf.index entries for the specified page.

ibuf_merge_or_delete_for_page(): When the change buffer bitmap bits
were set and the page had been freed, and the page does not belong
to ibuf.index itself, invoke ibuf_delete_recs(). This prevents the
corruption from occurring when a DML operation is allocating a
previously freed page for which changes had been buffered.

ibuf_set_bitmap_for_bulk_load(): When the change buffer bitmap bits
were set, invoke ibuf_delete_recs(). This prevents the corruption
from occurring when CREATE INDEX is reusing a previously freed page.

ibuf_read_merge_pages(): On slow shutdown, remove the orphan records
by invoking ibuf_delete_recs(). This fixes the hang when the change
buffer had become corrupted. We also remove the dops[] accounting,
because nothing can monitor it during shutdown. We invoke
ibuf_delete_recs() if:
(a) buf_page_get_gen() failed to load the page or merge changes
(b) the page is not a valid index leaf page
(c) the page number is out of tablespace bounds

srv_shutdown(): Invoke ibuf_max_size_update(0) to ensure that
the race condition that motivated us to disable the code in
ibuf_read_merge_pages() in commit 310dff5d
is no longer possible. That is, during slow shutdown, both the
rollback of transactions and the purge of history will return
early from ibuf_insert_low().

ibuf_merge_space(), ibuf_delete_for_discarded_space(): Cleanup:
Do not allocate a memory heap.

This was implemented by Thirunarayanan Balathandayuthapani
and tested with innodb_change_buffering_debug=1 by Matthias Leich.
parent 9d388192
......@@ -2274,16 +2274,74 @@ static MY_ATTRIBUTE((warn_unused_result, nonnull))
bool ibuf_delete_rec(const page_id_t page_id, btr_pcur_t* pcur,
const dtuple_t* search_tuple, mtr_t* mtr);
/** Delete the change buffer records for the given page id.
Scans ibuf.index starting from the search key built for page_id and
deletes records for as long as their space id and page number both
match page_id. Returns without doing anything if ibuf.index is not set
or if srv_read_only_mode holds.
@param page_id page identifier */
static void ibuf_delete_recs(const page_id_t page_id)
{
if (!ibuf.index || srv_read_only_mode)
return;
/* Build the search tuple in local storage instead of a memory heap.
Fields 0, 1 and 2 are filled in below with the 4-byte space id, one
byte taken from field_ref_zero, and the 4-byte page number. */
dfield_t dfield[IBUF_REC_FIELD_METADATA];
dtuple_t tuple {0,IBUF_REC_FIELD_METADATA,IBUF_REC_FIELD_METADATA,
dfield,0,nullptr
#ifdef UNIV_DEBUG
,DATA_TUPLE_MAGIC_N
#endif
};
byte space_id[4], page_no[4];
mach_write_to_4(space_id, page_id.space());
mach_write_to_4(page_no, page_id.page_no());
dfield_set_data(&dfield[0], space_id, 4);
dfield_set_data(&dfield[1], field_ref_zero, 1);
dfield_set_data(&dfield[2], page_no, 4);
dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
mtr_t mtr;
/* Every pass through this label starts a new mini-transaction and
opens a new cursor on ibuf.index with PAGE_CUR_GE, BTR_MODIFY_LEAF. */
loop:
btr_pcur_t pcur;
ibuf_mtr_start(&mtr);
/* If the cursor cannot be opened, give up and clean up. */
if (btr_pcur_open(ibuf.index, &tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
&pcur, &mtr) != DB_SUCCESS)
goto func_exit;
/* The cursor is not on a user record: there is nothing to delete. */
if (!btr_pcur_is_on_user_rec(&pcur))
{
ut_ad(btr_pcur_is_after_last_on_page(&pcur));
goto func_exit;
}
for (;;)
{
ut_ad(btr_pcur_is_on_user_rec(&pcur));
const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
/* Stop at the first record that belongs to a different page. */
if (ibuf_rec_get_space(&mtr, ibuf_rec) != page_id.space()
|| ibuf_rec_get_page_no(&mtr, ibuf_rec) != page_id.page_no())
break;
/* Delete the record from ibuf */
if (ibuf_delete_rec(page_id, &pcur, &tuple, &mtr))
{
/* Deletion was pessimistic and mtr was committed:
we start from the beginning again */
ut_ad(mtr.has_committed());
goto loop;
}
/* The cursor ran off the end of the current page: commit, close the
cursor and search again from the start of the key range. */
if (btr_pcur_is_after_last_on_page(&pcur))
{
ibuf_mtr_commit(&mtr);
btr_pcur_close(&pcur);
goto loop;
}
}
/* Common exit: commit the open mini-transaction and close the cursor. */
func_exit:
ibuf_mtr_commit(&mtr);
btr_pcur_close(&pcur);
}
/** Merge the change buffer to some pages. */
static void ibuf_read_merge_pages(const uint32_t* space_ids,
const uint32_t* page_nos, ulint n_stored)
{
#ifndef DBUG_OFF
mem_heap_t* heap = mem_heap_create(512);
ulint dops[IBUF_OP_COUNT];
memset(dops, 0, sizeof(dops));
#endif
for (ulint i = 0; i < n_stored; i++) {
const ulint space_id = space_ids[i];
fil_space_t* s = fil_space_t::get(space_id);
......@@ -2306,24 +2364,36 @@ static void ibuf_read_merge_pages(const uint32_t* space_ids,
if (UNIV_LIKELY(page_nos[i] < size)) {
mtr.start();
dberr_t err;
buf_block_t *block =
buf_page_get_gen(page_id_t(space_id, page_nos[i]),
zip_size, RW_X_LATCH, nullptr,
BUF_GET_POSSIBLY_FREED,
__FILE__, __LINE__, &mtr, &err, true);
bool remove = !block
|| fil_page_get_type(block->frame)
!= FIL_PAGE_INDEX
|| !page_is_leaf(block->frame);
mtr.commit();
if (err == DB_TABLESPACE_DELETED) {
goto tablespace_deleted;
}
if (!remove) {
continue;
}
#ifndef DBUG_OFF
DBUG_EXECUTE_IF("ibuf_merge_corruption", goto work_around;);
}
if (srv_shutdown_state == SRV_SHUTDOWN_NONE
|| srv_fast_shutdown) {
continue;
}
/* The following code works around a hang when the
change buffer is corrupted, likely due to the race
condition in crash recovery that was fixed in
MDEV-24449. But, it also introduces corruption by
itself in the following scenario:
change buffer is corrupted, likely due to the
failure of ibuf_merge_or_delete_for_page() to
invoke ibuf_delete_recs() if (!bitmap_bits).
It also introduced corruption by itself in the
following scenario:
(1) We merged buffered changes in buf_page_get_gen()
(2) We committed the mini-transaction
......@@ -2332,64 +2402,16 @@ static void ibuf_read_merge_pages(const uint32_t* space_ids,
(5) Other threads buffer changes for that page.
(6) We will wrongly discard those newly buffered changes below.
This code will be available in debug builds, so that
users may try to fix a shutdown hang that occurs due
to a corrupted change buffer. */
To prevent this scenario, we will only invoke this code
on shutdown. A call to ibuf_max_size_update(0) will cause
ibuf_insert_low() to refuse to insert anything into the
change buffer. */
work_around:
/* Prevent an infinite loop, by removing entries from
the change buffer also in the case the bitmap bits were
the change buffer in the case the bitmap bits were
wrongly clear even though buffered changes exist. */
const dtuple_t* tuple = ibuf_search_tuple_build(
space_id, page_nos[i], heap);
loop:
btr_pcur_t pcur;
ibuf_mtr_start(&mtr);
btr_pcur_open(ibuf.index, tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
&pcur, &mtr);
if (!btr_pcur_is_on_user_rec(&pcur)) {
ut_ad(btr_pcur_is_after_last_on_page(&pcur));
goto done;
ibuf_delete_recs(page_id_t(space_ids[i], page_nos[i]));
}
for (;;) {
ut_ad(btr_pcur_is_on_user_rec(&pcur));
const rec_t* ibuf_rec = btr_pcur_get_rec(&pcur);
if (ibuf_rec_get_space(&mtr, ibuf_rec) != space_id
|| ibuf_rec_get_page_no(&mtr, ibuf_rec)
!= page_nos[i]) {
break;
}
dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
/* Delete the record from ibuf */
if (ibuf_delete_rec(page_id_t(space_id, page_nos[i]),
&pcur, tuple, &mtr)) {
/* Deletion was pessimistic and mtr
was committed: we start from the
beginning again */
ut_ad(mtr.has_committed());
goto loop;
}
if (btr_pcur_is_after_last_on_page(&pcur)) {
ibuf_mtr_commit(&mtr);
btr_pcur_close(&pcur);
goto loop;
}
}
done:
ibuf_mtr_commit(&mtr);
btr_pcur_close(&pcur);
mem_heap_empty(heap);
#endif
}
#ifndef DBUG_OFF
ibuf_add_ops(ibuf.n_discarded_ops, dops);
mem_heap_free(heap);
#endif
}
/** Contract the change buffer by reading pages to the buffer pool.
......@@ -2455,8 +2477,23 @@ ibuf_merge_space(
{
mtr_t mtr;
btr_pcur_t pcur;
mem_heap_t* heap = mem_heap_create(512);
dtuple_t* tuple = ibuf_search_tuple_build(space, 0, heap);
dfield_t dfield[IBUF_REC_FIELD_METADATA];
dtuple_t tuple {0, IBUF_REC_FIELD_METADATA,
IBUF_REC_FIELD_METADATA,dfield,0,nullptr
#ifdef UNIV_DEBUG
, DATA_TUPLE_MAGIC_N
#endif
};
byte space_id[4];
mach_write_to_4(space_id, space);
dfield_set_data(&dfield[0], space_id, 4);
dfield_set_data(&dfield[1], field_ref_zero, 1);
dfield_set_data(&dfield[2], field_ref_zero, 4);
dtuple_set_types_binary(&tuple, IBUF_REC_FIELD_METADATA);
ulint n_pages = 0;
ut_ad(space < SRV_SPACE_ID_UPPER_BOUND);
......@@ -2467,11 +2504,9 @@ ibuf_merge_space(
/* Position the cursor on the first matching record. */
btr_pcur_open(
ibuf.index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
ibuf.index, &tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
&mtr);
mem_heap_free(heap);
ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf.index));
ulint sum_sizes = 0;
......@@ -4194,6 +4229,11 @@ void ibuf_merge_or_delete_for_page(buf_block_t *block, const page_id_t page_id,
ibuf_reset_bitmap(block, page_id, zip_size, &mtr);
ibuf_mtr_commit(&mtr);
bitmap_bits = 0;
if (!block
|| btr_page_get_index_id(block->frame)
!= DICT_IBUF_ID_MIN + IBUF_SPACE_ID) {
ibuf_delete_recs(page_id);
}
}
if (!bitmap_bits) {
......@@ -4440,22 +4480,31 @@ in DISCARD TABLESPACE, IMPORT TABLESPACE, or read-ahead.
@param[in] space missing or to-be-discarded tablespace */
void ibuf_delete_for_discarded_space(ulint space)
{
mem_heap_t* heap;
btr_pcur_t pcur;
dtuple_t* search_tuple;
const rec_t* ibuf_rec;
mtr_t mtr;
/* Counts for discarded operations. */
ulint dops[IBUF_OP_COUNT];
heap = mem_heap_create(512);
dfield_t dfield[IBUF_REC_FIELD_METADATA];
dtuple_t search_tuple {0,IBUF_REC_FIELD_METADATA,
IBUF_REC_FIELD_METADATA,dfield,0
,nullptr
#ifdef UNIV_DEBUG
,DATA_TUPLE_MAGIC_N
#endif /* UNIV_DEBUG */
};
byte space_id[4];
mach_write_to_4(space_id, space);
dfield_set_data(&dfield[0], space_id, 4);
dfield_set_data(&dfield[1], field_ref_zero, 1);
dfield_set_data(&dfield[2], field_ref_zero, 4);
dtuple_set_types_binary(&search_tuple, IBUF_REC_FIELD_METADATA);
/* Use page number 0 to build the search tuple so that we get the
cursor positioned at the first entry for this space id */
search_tuple = ibuf_search_tuple_build(space, 0, heap);
memset(dops, 0, sizeof(dops));
loop:
log_free_check();
......@@ -4464,7 +4513,7 @@ void ibuf_delete_for_discarded_space(ulint space)
/* Position pcur in the insert buffer at the first entry for the
space */
btr_pcur_open_on_user_rec(
ibuf.index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
ibuf.index, &search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
&pcur, &mtr);
if (!btr_pcur_is_on_user_rec(&pcur)) {
......@@ -4489,7 +4538,7 @@ void ibuf_delete_for_discarded_space(ulint space)
/* Delete the record from ibuf */
if (ibuf_delete_rec(page_id_t(space, page_no),
&pcur, search_tuple, &mtr)) {
&pcur, &search_tuple, &mtr)) {
/* Deletion was pessimistic and mtr was committed:
we start from the beginning again */
......@@ -4510,8 +4559,6 @@ void ibuf_delete_for_discarded_space(ulint space)
btr_pcur_close(&pcur);
ibuf_add_ops(ibuf.n_discarded_ops, dops);
mem_heap_free(heap);
}
/******************************************************************//**
......@@ -4682,23 +4729,21 @@ dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
void ibuf_set_bitmap_for_bulk_load(buf_block_t *block, mtr_t *mtr, bool reset)
{
ulint free_val;
ut_a(page_is_leaf(block->frame));
const page_id_t id{block->page.id()};
const auto zip_size= block->zip_size();
ut_a(page_is_leaf(buf_block_get_frame(block)));
free_val = ibuf_index_page_calc_free(block);
buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id(),
block->zip_size(),
mtr);
if (buf_block_t *bitmap_page= ibuf_bitmap_get_map_page(id, zip_size, mtr))
{
if (ibuf_bitmap_page_get_bits(bitmap_page->frame, id, zip_size,
IBUF_BITMAP_BUFFERED, mtr))
ibuf_delete_recs(id);
free_val = reset ? 0 : ibuf_index_page_calc_free(block);
ulint free_val= reset ? 0 : ibuf_index_page_calc_free(block);
/* FIXME: update the bitmap byte only once! */
ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>(
bitmap_page, block->page.id(), block->physical_size(),
free_val, mtr);
ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>(
bitmap_page, block->page.id(), block->physical_size(),
false, mtr);
ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>
(bitmap_page, id, block->physical_size(), free_val, mtr);
ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>
(bitmap_page, id, block->physical_size(), false, mtr);
}
}
......@@ -1675,6 +1675,10 @@ void srv_shutdown(bool ibuf_merge)
if (ibuf_merge) {
srv_main_thread_op_info = "doing insert buffer merge";
/* Disallow the use of change buffer to
avoid a race condition with
ibuf_read_merge_pages() */
ibuf_max_size_update(0);
log_free_check();
n_read = ibuf_contract();
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment