Commit 1407f999 authored by Brandon Nesterenko's avatar Brandon Nesterenko

MDEV-31177: SHOW SLAVE STATUS Last_SQL_Errno Race Condition on Errored Slave Restart

The SQL thread and a user connection executing SHOW SLAVE STATUS
have a race condition on Last_SQL_Errno, such that a slave which
previously errored and stopped, on its next start, SHOW SLAVE STATUS
can show that the SQL Thread is running while the previous error is
also showing.

The fix is to move when the last error is cleared when the SQL
thread starts to occur before setting the status of
Slave_SQL_Running.

Thanks to Kristian Nielson for his work diagnosing the problem!

Reviewed By:
============
Andrei Elkin <andrei.elkin@mariadb.com>
Kristian Nielson <knielsen@knielsen-hq.org>
parent 1831f8e4
include/master-slave.inc
[connection master]
connection master;
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @save_slave_trans_retries= @@global.slave_transaction_retries;
set @@global.innodb_lock_wait_timeout= 1;
set @@global.slave_transaction_retries= 0;
connection master;
update t1 set b=b+10 where a=1;
include/save_master_gtid.inc
connection slave1;
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
a b
1 1
connection slave;
include/start_slave.inc
include/wait_for_slave_sql_error.inc [errno=1205]
connection slave1;
ROLLBACK;
connection slave;
set @save_dbug = @@global.debug_dbug;
set @@global.debug_dbug= "+d,delay_sql_thread_after_release_run_lock";
include/start_slave.inc
set debug_sync= "now wait_for sql_thread_run_lock_released";
# Validating that the SQL thread is running..
# ..success
# Validating that Last_SQL_Errno is cleared..
# ..success
set debug_sync= "now signal sql_thread_continue";
set @@global.debug_dbug= @saved_dbug;
set debug_sync= "RESET";
# Cleanup
connection master;
drop table t1;
connection slave;
include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
set @@global.slave_transaction_retries= @save_slave_trans_retries;
include/start_slave.inc
include/rpl_end.inc
# End of rpl_sql_thd_start_errno_cleared.test
#
# Ensure that when the slave restarts, the last error code displayed by
# SHOW SLAVE STATUS is cleared before Slave_SQL_Running is set.
#
# To ensure that, this test uses the debug_sync mechanism to pause an errored
# and restarting slave's SQL thread after it has set its running state to YES,
# and then ensures that Last_SQL_Errno is 0. The slave error is a forced innodb
# row lock timeout.
#
#
# References
# MDEV-31177: SHOW SLAVE STATUS Last_SQL_Errno Race Condition on Errored
# Slave Restart
#
source include/have_binlog_format_row.inc;
source include/have_innodb.inc;
source include/have_debug.inc;
source include/have_debug_sync.inc;
source include/master-slave.inc;
--connection master
create table t1 (a int primary key, b int) engine=innodb;
insert t1 values (1,1);
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
set @save_slave_trans_retries= @@global.slave_transaction_retries;
set @@global.innodb_lock_wait_timeout= 1;
set @@global.slave_transaction_retries= 0;
--connection master
update t1 set b=b+10 where a=1;
--source include/save_master_gtid.inc
--connection slave1
BEGIN;
--eval SELECT * FROM t1 WHERE a=1 FOR UPDATE
--connection slave
--source include/start_slave.inc
--let $slave_sql_errno= 1205
--source include/wait_for_slave_sql_error.inc
--connection slave1
ROLLBACK;
--connection slave
set @save_dbug = @@global.debug_dbug;
set @@global.debug_dbug= "+d,delay_sql_thread_after_release_run_lock";
--source include/start_slave.inc
set debug_sync= "now wait_for sql_thread_run_lock_released";
--let $sql_running = query_get_value("SHOW SLAVE STATUS", Slave_SQL_Running, 1)
--echo # Validating that the SQL thread is running..
if (`SELECT strcmp("$sql_running", "YES") != 0`)
{
--echo # ..failed
--echo # Slave_SQL_Running: $sql_running
--die Slave SQL thread is not running
}
--echo # ..success
--let $last_error = query_get_value("SHOW SLAVE STATUS", Last_SQL_Errno, 1)
--echo # Validating that Last_SQL_Errno is cleared..
if ($last_error)
{
--echo # ..failed
--echo # Last_SQL_Errno: $last_error
--die SHOW SLAVE STATUS shows the error from the last session on startup
}
--echo # ..success
set debug_sync= "now signal sql_thread_continue";
set @@global.debug_dbug= @saved_dbug;
set debug_sync= "RESET";
--echo # Cleanup
--connection master
drop table t1;
--connection slave
--source include/stop_slave.inc
set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
set @@global.slave_transaction_retries= @save_slave_trans_retries;
--source include/start_slave.inc
--source include/rpl_end.inc
--echo # End of rpl_sql_thd_start_errno_cleared.test
......@@ -5236,6 +5236,19 @@ pthread_handler_t handle_slave_sql(void *arg)
DBUG_ASSERT(rli->inited);
DBUG_ASSERT(rli->mi == mi);
/*
Reset errors for a clean start (otherwise, if the master is idle, the SQL
thread may execute no Query_log_event, so the error will remain even
though there's no problem anymore). Do not reset the master timestamp
(imagine the slave has caught everything, the STOP SLAVE and START SLAVE:
as we are not sure that we are going to receive a query, we want to
remember the last master timestamp (to say how many seconds behind we are
now.
But the master timestamp is reset by RESET SLAVE & CHANGE MASTER.
*/
rli->clear_error();
mysql_mutex_lock(&rli->run_lock);
DBUG_ASSERT(!rli->slave_running);
errmsg= 0;
......@@ -5310,17 +5323,16 @@ pthread_handler_t handle_slave_sql(void *arg)
mysql_mutex_unlock(&rli->run_lock);
mysql_cond_broadcast(&rli->start_cond);
/*
Reset errors for a clean start (otherwise, if the master is idle, the SQL
thread may execute no Query_log_event, so the error will remain even
though there's no problem anymore). Do not reset the master timestamp
(imagine the slave has caught everything, the STOP SLAVE and START SLAVE:
as we are not sure that we are going to receive a query, we want to
remember the last master timestamp (to say how many seconds behind we are
now.
But the master timestamp is reset by RESET SLAVE & CHANGE MASTER.
*/
rli->clear_error();
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("delay_sql_thread_after_release_run_lock", {
const char act[]= "now "
"signal sql_thread_run_lock_released "
"wait_for sql_thread_continue";
DBUG_ASSERT(debug_sync_service);
DBUG_ASSERT(!debug_sync_set_action(current_thd, STRING_WITH_LEN(act)));
};);
#endif
rli->parallel.reset();
//tell the I/O thread to take relay_log_space_limit into account from now on
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment