Commit 68938d2b authored by Brandon Nesterenko's avatar Brandon Nesterenko

MDEV-33500 (part 2): rpl.rpl_parallel_sbm can still fail

The failing test case validates Seconds_Behind_Master for a delayed
slave, while STOP SLAVE is executed during a delay. The test fixes
initially added to the test (commit b04c8575) added a table lock
to ensure a transaction could not finish before validating the
Seconds_Behind_Master field after SLAVE START, but did not address a
possibility that the transaction could finish before running the
STOP SLAVE command, which invalidates the validations for the rest
of the test case. Specifically, this would result in 1) a timeout in
“Waiting for table metadata lock” on the replica, which expects the
transaction to retry after slave restart and hit a lock conflict on
the locked tables (added in b04c8575), and 2) that
Seconds_Behind_Master should have increased, but did not.

The failure can be reproduced by synchronizing the slave to the master
before the MDEV-32265 echo statement (i.e. before the SLAVE STOP).

This patch fixes the test by adding a mechanism to use DEBUG_SYNC to
synchronize a MASTER_DELAY, rather than continually increase the
duration of the delay each time the test fails on buildbot. This is
to ensure that on slow machines, a delay does not pass before the
test gets a chance to validate results. Additionally, it decreases
overall test time because the test can continue immediately after
validation, thereby bypassing the remainder of a full delay for each
transaction.
parent a1adabdd
# ==== Purpose ====
#
# If using DEBUG_SYNC to coordinate a slave's SQL DELAY via the DEBUG_DBUG
# identifier "sql_delay_by_debug_sync", this helper file will help synchronize
# a slave with the master for statements which don't need to be delayed. This
# can be helpful, for example, for setup/cleanup statements, if they must be
# run in the same lifetime as the statements used for the test.
#
# The actual synchronization will take place based on the input parameter
# slave_sync_method, which can be "gtid", "file_coord", or "none"; and will use
# the helper files sync_with_master_gtid.inc or sync_with_master.inc (or none
# at all), respectively.
#
#
# ==== Requirements ====
#
# --source include/have_debug.inc
# --source include/have_debug_sync.inc
# set @@GLOBAL.debug_dbug= "+d,sql_delay_by_debug_sync";
#
#
# ==== Usage ====
#
# --let $slave_sync_method= gtid|file_coord|none
# [--let $num_event_groups= NUMBER]
# --source include/sync_with_master_sql_delay_debug_sync.inc
#
#
# Parameters:
# $slave_sync_method
# Value can be gtid, file_coord, or none; and will synchronize the slave
# with the master via this method (i.e. using sync_with_master_gtid.inc
# or sync_with_master.inc, respectively), after synchronizing the SQL
# delay
#
# $num_event_groups
# Number of event groups to synchronize the SQL delay for. If unset, will
# be default to 1.
#
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
--source include/begin_include_file.inc
if (!$slave_sync_method)
{
--die Parameter slave_sync_method must be set
}
if (`select "$slave_sync_method" not like "gtid" and "$slave_sync_method" not like "file_coord" and "$slave_sync_method" not like "none"`)
{
--die Parameter slave_sync_method must have value "gtid", "file_coord" or "none"
}
if (`select "$slave_sync_method" not like "none" and strcmp("$master_pos", "") = 0`)
{
--die sync_with_master.inc or sync_with_master_gtid.inc was not called to populate variable master_pos
}
if (!$num_event_groups)
{
--let $num_event_groups= 1
}
while ($num_event_groups)
{
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
--dec $num_event_groups
}
if (`select "$slave_sync_method" LIKE "gtid"`)
{
--source include/sync_with_master_gtid.inc
}
if (`select "$slave_sync_method" LIKE "file_coord"`)
{
--source include/sync_with_master.inc
}
--let $include_filename= sync_with_master_sql_delay_debug_sync.inc
--source include/end_include_file.inc
...@@ -5,14 +5,20 @@ include/master-slave.inc ...@@ -5,14 +5,20 @@ include/master-slave.inc
# #
connection slave; connection slave;
include/stop_slave.inc include/stop_slave.inc
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
change master to master_delay=3, master_use_gtid=Slave_Pos; set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
change master to master_delay=1, master_use_gtid=Slave_Pos;
include/start_slave.inc include/start_slave.inc
connection master; connection master;
create table t1 (a int); create table t1 (a int);
create table t2 (a int); create table t2 (a int);
include/sync_slave_sql_with_master.inc include/save_master_gtid.inc
connection slave;
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# #
# Pt 1) Ensure SBM is updated immediately upon arrival of the next event # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
connection master; connection master;
...@@ -21,12 +27,23 @@ insert into t1 values (0); ...@@ -21,12 +27,23 @@ insert into t1 values (0);
include/save_master_gtid.inc include/save_master_gtid.inc
connection slave; connection slave;
# Waiting for transaction to arrive on slave and begin SQL Delay.. # Waiting for transaction to arrive on slave and begin SQL Delay..
set debug_sync= "now WAIT_FOR at_sql_delay";
# Validating SBM is updated on event arrival.. # Validating SBM is updated on event arrival..
# ..done # ..done
# MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently # MDEV-32265. At time of STOP SLAVE, if the SQL Thread is currently
# delaying a transaction; then when the reciprocal START SLAVE occurs, # delaying a transaction; then when the reciprocal START SLAVE occurs,
# if the event is still to be delayed, SBM should resume accordingly # if the event is still to be delayed, SBM should resume accordingly
include/stop_slave.inc connection server_2;
# Ensure the kill from STOP SLAVE will be received before continuing the
# SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
STOP SLAVE;
connection slave;
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
connection server_2;
include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
# Lock t1 on slave to ensure the event can't finish (and thereby update # Lock t1 on slave to ensure the event can't finish (and thereby update
# Seconds_Behind_Master) so slow running servers don't accidentally # Seconds_Behind_Master) so slow running servers don't accidentally
# catch up to the master before checking SBM. # catch up to the master before checking SBM.
...@@ -34,6 +51,10 @@ connection server_2; ...@@ -34,6 +51,10 @@ connection server_2;
LOCK TABLES t1 WRITE; LOCK TABLES t1 WRITE;
include/start_slave.inc include/start_slave.inc
connection slave; connection slave;
# SQL delay has no impact for the rest of the test case, so ignore it
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Waiting for replica to get blocked by the table lock # Waiting for replica to get blocked by the table lock
# Sleeping 1s to increment SBM # Sleeping 1s to increment SBM
# Ensuring Seconds_Behind_Master increases after sleeping.. # Ensuring Seconds_Behind_Master increases after sleeping..
...@@ -54,6 +75,13 @@ insert into t1 values (2); ...@@ -54,6 +75,13 @@ insert into t1 values (2);
include/save_master_pos.inc include/save_master_pos.inc
connection slave; connection slave;
# Wait for first transaction to complete SQL delay and begin execution.. # Wait for first transaction to complete SQL delay and begin execution..
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Wait for second transaction to complete SQL delay..
include/sync_with_master_sql_delay_debug_sync.inc
set debug_sync= "now WAIT_FOR at_sql_delay";
set debug_sync= "now SIGNAL continue_sql_thread";
# Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle.. # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
# ..and that SBM wasn't calculated using prior committed transactions # ..and that SBM wasn't calculated using prior committed transactions
# ..done # ..done
...@@ -63,6 +91,8 @@ include/wait_for_slave_param.inc [Relay_Master_Log_File] ...@@ -63,6 +91,8 @@ include/wait_for_slave_param.inc [Relay_Master_Log_File]
include/wait_for_slave_param.inc [Exec_Master_Log_Pos] include/wait_for_slave_param.inc [Exec_Master_Log_Pos]
# Cleanup # Cleanup
include/stop_slave.inc include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
CHANGE MASTER TO master_delay=0; CHANGE MASTER TO master_delay=0;
include/start_slave.inc include/start_slave.inc
# #
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
# #
--source include/have_log_bin.inc --source include/have_log_bin.inc
--source include/have_debug.inc --source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/master-slave.inc --source include/master-slave.inc
--echo # --echo #
...@@ -16,14 +17,23 @@ ...@@ -16,14 +17,23 @@
# Seconds_Behind_Master is based on the timestamp of the new transaction, # Seconds_Behind_Master is based on the timestamp of the new transaction,
# rather than the last committed transaction. # rather than the last committed transaction.
# #
# Note that the test doesn't actually use the value of MASTER_DELAY, but
# rather uses debug_sync to coordinate the end of the delay. This is to ensure
# that on slow machines, a delay doesn't pass before the test gets a chance to
# validate results. Additionally, it lets us continue testing after validation
# so we don't have to wait out a full delay unnecessarily. The debug_sync point
# is enabled via sql_delay_by_debug_sync, which will delay transactions based
# only on GTID events, so only one synchronization is needed per transaction.
#
--connection slave --connection slave
--source include/stop_slave.inc --source include/stop_slave.inc
--let $save_dbug= `SELECT @@GLOBAL.debug_dbug` --let $save_dbug= `SELECT @@GLOBAL.debug_dbug`
--let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode` --let $save_parallel_mode= `SELECT @@GLOBAL.slave_parallel_mode`
set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master";
set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
--let $master_delay= 3 set @@GLOBAL.debug_dbug= "d,negate_clock_diff_with_master,sql_delay_by_debug_sync";
--let $master_delay= 1
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos --eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
--source include/start_slave.inc --source include/start_slave.inc
...@@ -31,7 +41,11 @@ set @@GLOBAL.slave_parallel_mode= CONSERVATIVE; ...@@ -31,7 +41,11 @@ set @@GLOBAL.slave_parallel_mode= CONSERVATIVE;
--let insert_ctr= 0 --let insert_ctr= 0
create table t1 (a int); create table t1 (a int);
create table t2 (a int); create table t2 (a int);
--source include/sync_slave_sql_with_master.inc --source include/save_master_gtid.inc
--connection slave
--let $slave_sync_method= gtid
--let $num_event_groups= 2
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # --echo #
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event --echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
...@@ -48,8 +62,7 @@ sleep 2; ...@@ -48,8 +62,7 @@ sleep 2;
--connection slave --connection slave
--echo # Waiting for transaction to arrive on slave and begin SQL Delay.. --echo # Waiting for transaction to arrive on slave and begin SQL Delay..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting until MASTER_DELAY seconds after master executed event'; set debug_sync= "now WAIT_FOR at_sql_delay";
--source include/wait_condition.inc
--echo # Validating SBM is updated on event arrival.. --echo # Validating SBM is updated on event arrival..
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1) --let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
...@@ -66,7 +79,20 @@ if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`) ...@@ -66,7 +79,20 @@ if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
--echo # delaying a transaction; then when the reciprocal START SLAVE occurs, --echo # delaying a transaction; then when the reciprocal START SLAVE occurs,
--echo # if the event is still to be delayed, SBM should resume accordingly --echo # if the event is still to be delayed, SBM should resume accordingly
--source include/stop_slave.inc --connection server_2
--echo # Ensure the kill from STOP SLAVE will be received before continuing the
--echo # SQL thread
set debug_sync="after_thd_awake_kill SIGNAL slave_notified_of_kill";
--send STOP SLAVE
--connection slave
set debug_sync= "now WAIT_FOR slave_notified_of_kill";
set debug_sync= "now SIGNAL continue_sql_thread";
--connection server_2
--reap
--source include/wait_for_slave_to_stop.inc
set debug_sync="RESET";
--echo # Lock t1 on slave to ensure the event can't finish (and thereby update --echo # Lock t1 on slave to ensure the event can't finish (and thereby update
--echo # Seconds_Behind_Master) so slow running servers don't accidentally --echo # Seconds_Behind_Master) so slow running servers don't accidentally
...@@ -77,6 +103,11 @@ LOCK TABLES t1 WRITE; ...@@ -77,6 +103,11 @@ LOCK TABLES t1 WRITE;
--source include/start_slave.inc --source include/start_slave.inc
--connection slave --connection slave
--echo # SQL delay has no impact for the rest of the test case, so ignore it
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Waiting for replica to get blocked by the table lock --echo # Waiting for replica to get blocked by the table lock
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock'; --let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock';
--source include/wait_condition.inc --source include/wait_condition.inc
...@@ -120,9 +151,14 @@ sleep 3; ...@@ -120,9 +151,14 @@ sleep 3;
--connection slave --connection slave
--echo # Wait for first transaction to complete SQL delay and begin execution.. --echo # Wait for first transaction to complete SQL delay and begin execution..
--let $slave_sync_method= none
--source include/sync_with_master_sql_delay_debug_sync.inc
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker'; --let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
--source include/wait_condition.inc --source include/wait_condition.inc
--echo # Wait for second transaction to complete SQL delay..
--source include/sync_with_master_sql_delay_debug_sync.inc
--echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle.. --echo # Validate SBM calculation doesn't use the second transaction because worker threads shouldn't have gone idle..
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1) --let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()` --let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
...@@ -147,6 +183,8 @@ UNLOCK TABLES; ...@@ -147,6 +183,8 @@ UNLOCK TABLES;
--echo # Cleanup --echo # Cleanup
--source include/stop_slave.inc --source include/stop_slave.inc
set debug_sync= "RESET";
set @@GLOBAL.debug_dbug= "-d,sql_delay_by_debug_sync";
--eval CHANGE MASTER TO master_delay=0 --eval CHANGE MASTER TO master_delay=0
--source include/start_slave.inc --source include/start_slave.inc
......
...@@ -1088,6 +1088,8 @@ terminate_slave_thread(THD *thd, ...@@ -1088,6 +1088,8 @@ terminate_slave_thread(THD *thd,
mysql_mutex_unlock(&thd->LOCK_thd_kill); mysql_mutex_unlock(&thd->LOCK_thd_kill);
mysql_mutex_unlock(&thd->LOCK_thd_data); mysql_mutex_unlock(&thd->LOCK_thd_data);
DEBUG_SYNC(current_thd, "after_thd_awake_kill");
/* /*
There is a small chance that slave thread might miss the first There is a small chance that slave thread might miss the first
alarm. To protect againts it, resend the signal until it reacts alarm. To protect againts it, resend the signal until it reacts
...@@ -3794,13 +3796,26 @@ sql_delay_event(Log_event *ev, THD *thd, rpl_group_info *rgi) ...@@ -3794,13 +3796,26 @@ sql_delay_event(Log_event *ev, THD *thd, rpl_group_info *rgi)
sql_delay, (long)ev->when, sql_delay, (long)ev->when,
rli->mi->clock_diff_with_master, rli->mi->clock_diff_with_master,
(long)now, (ulonglong)sql_delay_end, (long)nap_time)); (long)now, (ulonglong)sql_delay_end, (long)nap_time));
/* if using debug_sync for sql_delay, only delay once per event group */
if (sql_delay_end > now) if (DBUG_EVALUATE_IF("sql_delay_by_debug_sync", type == GTID_EVENT,
sql_delay_end > now))
{ {
DBUG_PRINT("info", ("delaying replication event %lu secs", DBUG_PRINT("info", ("delaying replication event %lu secs",
nap_time)); nap_time));
rli->start_sql_delay(sql_delay_end); rli->start_sql_delay(sql_delay_end);
mysql_mutex_unlock(&rli->data_lock); mysql_mutex_unlock(&rli->data_lock);
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("sql_delay_by_debug_sync", {
DBUG_ASSERT(!debug_sync_set_action(
thd, STRING_WITH_LEN(
"now SIGNAL at_sql_delay WAIT_FOR continue_sql_thread")));
// Skip the actual sleep if using DEBUG_SYNC to coordinate SQL_DELAY
DBUG_RETURN(0);
};);
#endif
DBUG_RETURN(slave_sleep(thd, nap_time, sql_slave_killed, rgi)); DBUG_RETURN(slave_sleep(thd, nap_time, sql_slave_killed, rgi));
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment