Commit 1cb218c3 authored by Marko Mäkelä

MDEV-26450: Corruption due to innodb_undo_log_truncate

At least since commit 055a3334
(MDEV-13564) the undo log truncation in InnoDB did not work correctly.

The main issue is that during the execution of
trx_purge_truncate_history() some pages of the newly truncated
undo tablespace could be discarded.

fsp_try_extend_data_file(): Apply the peculiar rounding of
fil_space_t::size_in_header only to the system tablespace,
whose size can be expressed in megabytes in a configuration parameter.
Other files may freely grow by a number of pages.

fseg_alloc_free_page_low(): Do allow the extension of undo tablespaces,
and mention the file name in the error message.

mtr_t::commit_shrink(): Implement crash-safe shrinking of a tablespace
file. First, durably write the log, then shrink the file, and finally
release the page latches of the rebuilt tablespace. Refactored from
trx_purge_truncate_history().

log_write_and_flush_prepare(), log_write_and_flush(): New functions
to durably write log during mtr_t::commit_shrink().
parent 21d19ed4
......@@ -10,28 +10,12 @@ SET @trunc_start=
WHERE variable_name = 'innodb_undo_truncations');
create table t1(keyc int primary key, c char(100)) engine = innodb;
create table t2(keyc int primary key, c char(100)) engine = innodb;
CREATE PROCEDURE populate_t1()
BEGIN
DECLARE i INT DEFAULT 1;
while (i <= 20000) DO
insert into t1 values (i, 'a');
SET i = i + 1;
END WHILE;
END |
CREATE PROCEDURE populate_t2()
BEGIN
DECLARE i INT DEFAULT 1;
while (i <= 20000) DO
insert into t2 values (i, 'a');
SET i = i + 1;
END WHILE;
END |
connect con1,localhost,root,,;
begin;
call populate_t1();
insert into t1 select seq,'a' from seq_1_to_20000;
connect con2,localhost,root,,;
begin;
call populate_t2();
insert into t2 select seq,'a' from seq_1_to_20000;
connection con1;
update t1 set c = 'mysql';
connection con2;
......@@ -53,8 +37,6 @@ commit;
disconnect con2;
connection default;
drop table t1, t2;
drop PROCEDURE populate_t1;
drop PROCEDURE populate_t2;
InnoDB 0 transactions not purged
SET GLOBAL innodb_undo_logs = @save_undo_logs;
SET GLOBAL innodb_purge_rseg_truncate_frequency = @save_frequency;
......
......@@ -5,6 +5,7 @@
# --source include/innodb_page_size.inc
--source include/innodb_page_size_small.inc
--source include/have_undo_tablespaces.inc
--source include/have_sequence.inc
call mtr.add_suppression("InnoDB: The transaction log size is too large");
......@@ -27,37 +28,14 @@ WHERE variable_name = 'innodb_undo_truncations');
create table t1(keyc int primary key, c char(100)) engine = innodb;
create table t2(keyc int primary key, c char(100)) engine = innodb;
#
delimiter |;
CREATE PROCEDURE populate_t1()
BEGIN
DECLARE i INT DEFAULT 1;
while (i <= 20000) DO
insert into t1 values (i, 'a');
SET i = i + 1;
END WHILE;
END |
delimiter ;|
#
delimiter |;
CREATE PROCEDURE populate_t2()
BEGIN
DECLARE i INT DEFAULT 1;
while (i <= 20000) DO
insert into t2 values (i, 'a');
SET i = i + 1;
END WHILE;
END |
delimiter ;|
#
#
let DATADIR = `select @@datadir`;
connect (con1,localhost,root,,);
begin;
send call populate_t1();
send insert into t1 select seq,'a' from seq_1_to_20000;
connect (con2,localhost,root,,);
begin;
send call populate_t2();
send insert into t2 select seq,'a' from seq_1_to_20000;
connection con1; reap; send update t1 set c = 'mysql';
connection con2; reap; send update t2 set c = 'mysql';
......@@ -67,25 +45,12 @@ connection con1; reap; send delete from t1;
connection con2; reap; delete from t2;
connection con1; reap;
let CHECKFILE = $MYSQL_TMP_DIR/check.txt;
perl;
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size1)
= stat("$ENV{DATADIR}/undo001");
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size2)
= stat("$ENV{DATADIR}/undo002");
open(OUT, ">$ENV{CHECKFILE}") || die;
print OUT "let \$size1='$size1,$size2';\n";
close(OUT);
EOF
SET GLOBAL innodb_undo_log_truncate = 1;
commit; disconnect con1;
connection con2; commit; disconnect con2;
connection default;
drop table t1, t2;
drop PROCEDURE populate_t1;
drop PROCEDURE populate_t2;
--source include/wait_all_purged.inc
......@@ -100,29 +65,6 @@ if (`select @@innodb_page_size IN (4096,8192,16384)`)
source include/wait_condition.inc;
}
--source $CHECKFILE
perl;
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size1)
= stat("$ENV{DATADIR}/undo001");
($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size2)
= stat("$ENV{DATADIR}/undo002");
open(OUT, ">$ENV{CHECKFILE}") || die;
print OUT "let \$size2='$size1,$size2';\n";
close(OUT);
EOF
--source $CHECKFILE
--remove_file $CHECKFILE
if ($size1 == $size2)
{
# This fails for innodb_page_size=64k, occasionally also for 32k.
if (`select @@innodb_page_size IN (4096,8192,16384)`)
{
echo Truncation did not happen: $size1;
}
}
SET GLOBAL innodb_undo_logs = @save_undo_logs;
SET GLOBAL innodb_purge_rseg_truncate_frequency = @save_frequency;
SET GLOBAL innodb_undo_log_truncate = @save_truncate;
......@@ -956,11 +956,13 @@ fsp_try_extend_data_file(fil_space_t* space, fsp_header_t* header, mtr_t* mtr)
return(0);
}
/* We ignore any fragments of a full megabyte when storing the size
to the space header */
/* For the system tablespace, we ignore any fragments of a
full megabyte when storing the size to the space header */
space->size_in_header = ut_2pow_round(
space->size, (1024 * 1024) / page_size.physical());
space->size_in_header = space->id
? space->size
: ut_2pow_round(space->size,
(1024 * 1024) / page_size.physical());
mlog_write_ulint(
header + FSP_SIZE, space->size_in_header, MLOG_4BYTES, mtr);
......@@ -1392,7 +1394,7 @@ fsp_alloc_free_page(
/* It must be that we are extending a single-table tablespace
whose size is still < 64 pages */
ut_a(!is_system_tablespace(space_id));
ut_a(!is_predefined_tablespace(space_id));
if (page_no >= FSP_EXTENT_SIZE) {
ib::error() << "Trying to extend a single-table"
" tablespace " << space->name << " , by single"
......@@ -2514,14 +2516,14 @@ fseg_alloc_free_page_low(
return(NULL);
}
if (space->size <= ret_page && !is_system_tablespace(space_id)) {
if (space->size <= ret_page && !is_predefined_tablespace(space_id)) {
/* It must be that we are extending a single-table
tablespace whose size is still < 64 pages */
if (ret_page >= FSP_EXTENT_SIZE) {
ib::error() << "Error (2): trying to extend"
" a single-table tablespace " << space_id
<< " by single page(s) though the"
ib::error() << "Trying to extend '"
<< space->chain.start->name
<< "' by single page(s) though the"
<< " space size " << space->size
<< ". Page no " << ret_page << ".";
ut_ad(!has_done_reservation);
......
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2009, Google Inc.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
......@@ -189,9 +189,15 @@ log_write_up_to(
/** write to the log file up to the last log entry.
@param[in] sync whether we want the written log
also to be flushed to disk. */
void
log_buffer_flush_to_disk(
bool sync = true);
void log_buffer_flush_to_disk(bool sync= true);
/** Prepare to invoke log_write_and_flush(), before acquiring log_sys.mutex. */
#define log_write_and_flush_prepare() log_write_mutex_enter()
/** Durably write the log up to log_sys.lsn and release log_sys.mutex. */
ATTRIBUTE_COLD void log_write_and_flush();
/****************************************************************//**
This functions writes the log buffer to the log file and if 'flush'
is set it forces a flush of the log file as well. This is meant to be
......
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2013, 2020, MariaDB Corporation.
Copyright (c) 2013, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -161,6 +161,10 @@ struct mtr_t {
/** Commit the mini-transaction. */
void commit();
/** Commit a mini-transaction that is shrinking a tablespace.
@param space tablespace that is being shrunk */
ATTRIBUTE_COLD void commit_shrink(fil_space_t &space);
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker.
......
/*****************************************************************************
Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -53,8 +53,8 @@ mtr_t::memo_push(void* object, mtr_memo_type_t type)
/* If this mtr has x-fixed a clean page then we set
the made_dirty flag. This tells us if we need to
grab log_flush_order_mutex at mtr_commit so that we
can insert the dirtied page to the flush list. */
grab log_sys.flush_order_mutex at mtr_t::commit() so that we
can insert the dirtied page into the flush list. */
if ((type == MTR_MEMO_PAGE_X_FIX || type == MTR_MEMO_PAGE_SX_FIX)
&& !m_made_dirty) {
......
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Google Inc.
Copyright (c) 2014, 2020, MariaDB Corporation.
Copyright (c) 2014, 2021, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
......@@ -1247,14 +1247,105 @@ log_write_up_to(
/** write to the log file up to the last log entry.
@param[in] sync whether we want the written log
also to be flushed to disk. */
void
log_buffer_flush_to_disk(
bool sync)
void log_buffer_flush_to_disk(bool sync)
{
ut_ad(!srv_read_only_mode);
log_write_up_to(log_get_lsn(), sync);
}
/** Durably write the log and release log_sys.mutex.

Writes the log buffer contents between log_sys->buf_next_to_write and
log_sys->buf_free to the log file, flushes the log file (unless
srv_file_flush_method is SRV_O_DSYNC), and finally notifies
innobase_mysql_log_notify() of the written and flushed LSN.

On return this function has called both log_write_mutex_exit() and
log_mutex_exit(); it never acquires either of them itself.
NOTE(review): the caller is therefore expected to hold both on entry
(see log_write_and_flush_prepare() and mtr_t::commit_shrink()) - confirm.

Must not be invoked in read-only mode, while log writes are disabled,
or during recovery (debug assertions below). */
ATTRIBUTE_COLD void log_write_and_flush()
{
ut_ad(!srv_read_only_mode);
ut_ad(!recv_no_log_write);
ut_ad(!recv_recovery_is_on());
/* The following code is adapted from log_write_up_to(). */
DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF,
log_sys->write_lsn, log_sys->lsn));
/* Register this flush as pending and remember the LSN that it will cover;
the counter is decremented and flush_event is set again further below. */
log_sys->n_pending_flushes++;
log_sys->current_flush_lsn= log_sys->lsn;
os_event_reset(log_sys->flush_event);
/* There must be something in the buffer to write. */
ut_ad(log_sys->buf_free != log_sys->buf_next_to_write);
/* [start_offset, end_offset) is the unwritten part of the log buffer;
[area_start, area_end) is the same range extended to whole
OS_FILE_LOG_BLOCK_SIZE blocks. */
ulint start_offset= log_sys->buf_next_to_write;
ulint end_offset= log_sys->buf_free;
ulint area_start= ut_2pow_round(start_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
ulint area_end= ut_calc_align(end_offset, ulint(OS_FILE_LOG_BLOCK_SIZE));
ulong write_ahead_size= srv_log_write_ahead_size;
/* Set the flush bit in the first block and the checkpoint number in the
last block of the area to be written. */
log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
log_block_set_checkpoint_no(log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
log_sys->next_checkpoint_no);
/* Remember the target LSN and the buffer pointer before
log_buffer_switch() is invoked; the write below uses these saved values. */
lsn_t write_lsn= log_sys->lsn;
byte *write_buf= log_sys->buf;
ut_ad(area_end - area_start > 0);
log_buffer_switch();
log_group_set_fields(&log_sys->log, log_sys->write_lsn);
/* Erase the end of the last log block. */
memset(write_buf + end_offset, 0,
~end_offset & (OS_FILE_LOG_BLOCK_SIZE - 1));
/* Calculate pad_size if needed. */
ulint pad_size= 0;
if (write_ahead_size > OS_FILE_LOG_BLOCK_SIZE)
{
/* NOTE: this end_offset is a file offset (lsn_t) and shadows the
buffer offset of the same name declared above. */
lsn_t end_offset=
log_group_calc_lsn_offset(ut_uint64_align_up(write_lsn,
OS_FILE_LOG_BLOCK_SIZE),
&log_sys->log);
ulint end_offset_in_unit= (ulint) (end_offset % write_ahead_size);
if (end_offset_in_unit && (area_end - area_start) > end_offset_in_unit)
{
/* The first block of the write-ahead unit was initialized after
the previous write, so the rest of the unit must be written once,
padded with zeros (bounded by the space left in the log buffer). */
pad_size= std::min(ulint(write_ahead_size) - end_offset_in_unit,
log_sys->buf_size - area_end);
memset(write_buf + area_end, 0, pad_size);
}
}
/* Encrypt the blocks in place if the log is encrypted. */
if (log_sys->is_encrypted())
log_crypt(write_buf + area_start, log_sys->write_lsn,
area_end - area_start);
/* Do the write to the log files */
log_group_write_buf(&log_sys->log, write_buf + area_start,
area_end - area_start + pad_size,
#ifdef UNIV_DEBUG
pad_size,
#endif /* UNIV_DEBUG */
ut_uint64_align_down(log_sys->write_lsn,
OS_FILE_LOG_BLOCK_SIZE),
start_offset - area_start);
srv_stats.log_padded.add(pad_size);
/* The log has now been written (not yet flushed) up to write_lsn. */
log_sys->write_lsn= write_lsn;
log_write_mutex_exit();
/* Code adapted from log_write_flush_to_disk_low() */
ut_a(log_sys->n_pending_flushes == 1); /* No other threads here */
/* With SRV_O_DSYNC no explicit flush call is issued here. */
if (srv_file_flush_method != SRV_O_DSYNC)
fil_flush(SRV_LOG_SPACE_FIRST_ID);
log_sys->flushed_to_disk_lsn= log_sys->current_flush_lsn;
log_sys->n_pending_flushes--;
os_event_set(log_sys->flush_event);
/* Copy the LSN values before releasing log_sys.mutex, so that the
notification below does not read log_sys after the mutex is released. */
lsn_t wrote_lsn= log_sys->write_lsn, flush_lsn= log_sys->flushed_to_disk_lsn;
log_mutex_exit();
innobase_mysql_log_notify(wrote_lsn, flush_lsn);
}
/****************************************************************//**
This functions writes the log buffer to the log file and if 'flush'
is set it forces a flush of the log file as well. This is meant to be
......
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -465,6 +465,90 @@ mtr_t::commit()
release_resources();
}
#ifdef UNIV_DEBUG
/** Check that all pages belong to a shrunk tablespace. */
struct Shrink
{
const fil_space_t &space;
Shrink(const fil_space_t &space) : space(space) {}
bool operator()(const mtr_memo_slot_t *slot) const
{
if (!slot->object)
return true;
switch (slot->type) {
default:
ut_ad("invalid type" == 0);
return false;
case MTR_MEMO_MODIFY:
break;
case MTR_MEMO_SPACE_X_LOCK:
ut_ad(&space == slot->object);
return true;
case MTR_MEMO_PAGE_X_FIX:
case MTR_MEMO_PAGE_SX_FIX:
const buf_page_t &bpage= static_cast<buf_block_t*>(slot->object)->page;
const page_id_t &id= bpage.id;
if (id.space() == 0 && id.page_no() == TRX_SYS_PAGE_NO)
{
ut_ad(srv_is_undo_tablespace(space.id));
break;
}
ut_ad(id.space() == space.id);
ut_ad(id.page_no() < space.size);
ut_ad(bpage.state == BUF_BLOCK_FILE_PAGE);
ut_ad(!bpage.oldest_modification);
break;
}
return true;
}
};
#endif
/** Commit a mini-transaction that is shrinking a tablespace.

The steps are performed in a fixed order so that the shrinking is
crash-safe: first the redo log of this mini-transaction is durably
written, then the data file is truncated to space.size pages, and only
after that the modified blocks are processed by ReleaseBlocks, the
truncation flags of the tablespace are cleared, and the latches are
released.

The mini-transaction must be active, in MTR_LOG_ALL mode, and must have
made modifications; the tablespace must consist of a single file and have
is_being_truncated and stop_new_ops set (debug assertions below).
@param space tablespace that is being shrunk */
void mtr_t::commit_shrink(fil_space_t &space)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
ut_ad(!high_level_read_only);
ut_ad(m_modifications);
ut_ad(m_made_dirty);
ut_ad(!recv_recovery_is_on());
ut_ad(m_log_mode == MTR_LOG_ALL);
ut_ad(UT_LIST_GET_LEN(space.chain) == 1);
/* This expands to log_write_mutex_enter(); per its declaration it must
be invoked before log_sys.mutex is acquired.
NOTE(review): presumably prepare_write() below acquires log_sys.mutex,
which log_write_and_flush() then releases - confirm. */
log_write_and_flush_prepare();
const lsn_t start_lsn= finish_write(prepare_write());
log_flush_order_mutex_enter();
/* Durably write the reduced FSP_SIZE before truncating the data file. */
log_write_and_flush();
/* Trim the single data file to the new size in bytes.
NOTE(review): the result of os_file_truncate() is not checked here. */
os_file_truncate(space.chain.start->name, space.chain.start->handle,
os_offset_t(space.size) << srv_page_size_shift, true);
/* Debug builds: check that all memo slots belong to the shrunk tablespace. */
ut_d(m_memo.for_each_block_in_reverse(CIterate<Shrink>(space)));
/* Process the modified blocks while holding the flush order mutex. */
m_memo.for_each_block_in_reverse(CIterate<const ReleaseBlocks>
(ReleaseBlocks(start_lsn, m_commit_lsn,
m_flush_observer)));
log_flush_order_mutex_exit();
/* The truncation is complete: clear the flags that were set on the
tablespace for the duration of the truncation. */
mutex_enter(&fil_system->mutex);
ut_ad(space.is_being_truncated);
ut_ad(space.stop_new_ops);
space.stop_new_ops= false;
space.is_being_truncated= false;
mutex_exit(&fil_system->mutex);
/* Finally, release the latches registered in the memo. */
m_memo.for_each_block_in_reverse(CIterate<ReleaseLatches>());
srv_stats.log_write_requests.inc();
release_resources();
}
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
MLOG_FILE_NAME records and a MLOG_CHECKPOINT marker.
......
/*****************************************************************************
Copyright (c) 1996, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2017, 2020, MariaDB Corporation.
Copyright (c) 2017, 2021, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -1062,26 +1062,11 @@ trx_purge_initiate_truncate(
rseg->last_del_marks = FALSE;
}
mtr.commit();
/* Write-ahead the redo log record. */
log_write_up_to(mtr.commit_lsn(), true);
mtr.commit_shrink(*space);
/* Trim the file size. */
os_file_truncate(file->name, file->handle,
os_offset_t(size) << srv_page_size_shift, true);
/* This is only executed by the srv_purge_coordinator_thread. */
/* No mutex; this is only updated by the purge coordinator. */
export_vars.innodb_undo_truncations++;
/* TODO: PUNCH_HOLE the garbage (with write-ahead logging) */
mutex_enter(&fil_system->mutex);
ut_ad(space->stop_new_ops);
ut_ad(space->is_being_truncated);
space->stop_new_ops = false;
space->is_being_truncated = false;
mutex_exit(&fil_system->mutex);
if (purge_sys->rseg != NULL
&& purge_sys->rseg->last_page_no == FIL_NULL) {
/* If purge_sys->rseg is pointing to rseg that was recently
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment