Commit 03d1346e authored by Brandon Nesterenko's avatar Brandon Nesterenko

MDEV-29369: rpl.rpl_semi_sync_shutdown_await_ack fails regularly with Result content mismatch

This test was prone to failures for a few reasons, summarized below:

 1) MDEV-32168 introduced “only_running_threads=1” to
slave_stop.inc, which allowed the stop logic to bypass an
attempting-to-reconnect IO thread. That is, the IO thread could
realize the master shutdown in `read_event()`, and thereby call into
`try_to_reconnect()`. This would leave the IO thread up when the
test expected it to be stopped. Fixed by explicitly stopping the
IO thread and allowing an error state, as the above case would
lead to errno 2003.

 2) On slow systems (or those running profiling tools, e.g. MSAN),
the waiting-for-ack transaction can complete before the system
processes the `SHUTDOWN WAIT FOR ALL SLAVES`. There was shutdown
preparation logic in-between the transaction and shutdown itself,
which contributes to this problem. This patch also moves this
preparation logic before the transaction, so there is less to do
in-between the calls.

 3) Changed work-around for MDEV-28141 to use debug_sync instead
of sleep delay, as it was still possible to hit the bug on very
slow systems.

 4) Masked MTR variable reset with disable/enable query log

Reviewed By:
============
Kristian Nielsen <knielsen@knielsen-hq.org>
parent ee895583
......@@ -3,7 +3,6 @@
# replicas before shutting down.
#
# Parameters:
# server_1_dbug (string) Debug setting for primary (server 1)
# server_2_dbug (string) Debug setting to simulate delay or error on
# the first replica (server 2)
# server_3_dbug (string) Debug setting to simulate delay or error on
......@@ -32,8 +31,8 @@ while (`SELECT $i <= $slave_last`)
--connection server_$i
--sync_with_master
set global rpl_semi_sync_slave_enabled = 1;
source include/stop_slave.inc;
set global rpl_semi_sync_slave_enabled = 1;
source include/start_slave.inc;
show status like 'Rpl_semi_sync_slave_status';
......@@ -56,8 +55,6 @@ show status like 'Rpl_semi_sync_master_status';
show status like 'Rpl_semi_sync_master_clients';
--echo #-- Prepare servers to simulate delay or error
--connection server_1
--eval SET @@GLOBAL.debug_dbug= $server_1_dbug
--connection server_2
--eval SET @@GLOBAL.debug_dbug= $server_2_dbug
--connection server_3
......@@ -66,6 +63,14 @@ show status like 'Rpl_semi_sync_master_clients';
--echo #--
--echo #-- Test begins
--connection server_1_con2
--echo #-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait
EOF
--connection server_1
--echo #-- Begin semi-sync transaction
--send INSERT INTO t1 VALUES (1)
......@@ -75,14 +80,6 @@ show status like 'Rpl_semi_sync_master_clients';
let $status_var= Rpl_semi_sync_master_wait_sessions;
let $status_var_value= 1;
source include/wait_for_status_var.inc;
show status like 'Rpl_semi_sync_master_wait_sessions';
--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
wait
EOF
--echo #-- Give enough time after timeout/ack received to query yes_tx/no_tx
SET @@GLOBAL.debug_dbug= "+d,delay_shutdown_phase_2_after_semisync_wait";
--echo #-- Begin master shutdown
--send SHUTDOWN WAIT FOR ALL SLAVES
......@@ -111,19 +108,72 @@ show status like 'Rpl_semi_sync_master_no_tx';
--echo #
--echo #-- Re-synchronize slaves with master and disable semi-sync
if (`SELECT ($server_2_expect_row_count + $server_3_expect_row_count) < 2`)
{
--echo #-- FIXME: workaround for MDEV-28141, preventing errored replicas from
--echo # killing their semi-sync connections
# I.e. we can't create a new kill connection to the primary if we know that the
# primary is shutting down for risk of Packets out of order error. So we wait
# to hit a debug_sync point before the creation of the new kill_connection, and
# don't progress until the primary has been shutdown, so no new connection can
# be formed.
# Note this is only needed in the error case (using corrupt_queue_event), as
# the running io_thread will otherwise automatically detect that the primary
# has shutdown before progressing to the cleanup of the io thread.
}
if (!$server_2_expect_row_count)
{
--connection server_2
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
--echo # Wait for debug_sync signal to have been received before issuing RESET
let $wait_condition= select count(*)=0 from information_schema.processlist where state like "debug sync point%";
source include/wait_condition.inc;
set debug_sync= "reset";
}
if (!$server_3_expect_row_count)
{
--connection server_3
set debug_sync= "now wait_for at_semisync_kill_connection";
set debug_sync= "now signal continue_semisync_kill_connection";
--echo # Wait for debug_sync signal to have been received before issuing RESET
let $wait_condition= select count(*)=0 from information_schema.processlist where state like "debug sync point%";
source include/wait_condition.inc;
set debug_sync= "reset";
}
--echo #-- Stop slaves
--connection server_2
--eval SET @@GLOBAL.debug_dbug= "$sav_server_2_dbug"
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0
--let $rpl_only_running_threads= 1
source include/stop_slave.inc;
# If server_2_expect_row_count is 0, we are simulating an error on the replica
# and the IO thread will end with errno 1595.
# Otherwise, we still expect error, because the master has shutdown at this
# point, and the IO thread may or may not have realized the shutdown, and
# started to try to automatically reconnect. This may result in the IO thread
# giving a 2003 error if the slave tries to reconnect to a shutdown master.
# Additionally disable warnings because the slave may have stopped in err
# automatically, and we don't want a sporadic "Slave is already stopped"
# warning.
--disable_warnings
--let $rpl_allow_error= 1
--source include/stop_slave_io.inc
--enable_warnings
--let $rpl_allow_error=
--source include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_2;
--connection server_3
--eval SET @@GLOBAL.debug_dbug= "$sav_server_3_dbug"
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled= 0
--let $rpl_only_running_threads= 1
source include/stop_slave.inc;
# Expect error for IO thread, see above comment for stopping server_2
--disable_warnings
--let $rpl_allow_error= 1
--source include/stop_slave_io.inc
--enable_warnings
--let $rpl_allow_error=
--source include/stop_slave_sql.inc
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
SET @@GLOBAL.rpl_semi_sync_slave_enabled= @sav_enabled_server_3;
--echo #-- Bring the master back up
--connection server_1_con2
......@@ -142,13 +192,6 @@ EOF
--enable_reconnect
--source include/wait_until_connected_again.inc
--eval SET @@GLOBAL.debug_dbug= "$sav_master_dbug"
let $status_var= Rpl_semi_sync_master_clients;
let $status_var_value= 0;
source include/wait_for_status_var.inc;
--eval SET @@GLOBAL.rpl_semi_sync_master_enabled = 0
show status like 'Rpl_semi_sync_master_status';
TRUNCATE TABLE t1;
--save_master_pos
......
......@@ -69,9 +69,8 @@ call mtr.add_suppression("Timeout waiting");
call mtr.add_suppression("did not exit");
call mtr.add_suppression("Got an error reading communication packets");
--let $sav_master_timeout= `SELECT @@global.rpl_semi_sync_master_timeout`
--let $sav_master_timeout= `SELECT @@GLOBAL.rpl_semi_sync_master_timeout`
--let $sav_enabled_master= `SELECT @@GLOBAL.rpl_semi_sync_master_enabled`
--let $sav_master_dbug= `SELECT @@GLOBAL.debug_dbug`
--echo # Suppress slave errors related to the simulated error
--connection server_2
......@@ -79,16 +78,16 @@ call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
--let $sav_enabled_server_2=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled`
--let $sav_server_2_dbug= `SELECT @@GLOBAL.debug_dbug`
set @sav_enabled_server_2= @@GLOBAL.rpl_semi_sync_slave_enabled;
set @sav_server_2_dbug= @@GLOBAL.debug_dbug;
--connection server_3
call mtr.add_suppression("reply failed");
call mtr.add_suppression("Replication event checksum verification");
call mtr.add_suppression("Relay log write failure");
call mtr.add_suppression("Failed to kill the active semi-sync connection");
--let $sav_enabled_server_3=`SELECT @@GLOBAL.rpl_semi_sync_slave_enabled`
--let $sav_server_3_dbug= `SELECT @@GLOBAL.debug_dbug`
set @sav_enabled_server_3= @@GLOBAL.rpl_semi_sync_slave_enabled;
set @sav_server_3_dbug= @@GLOBAL.debug_dbug;
--connection server_1
CREATE TABLE t1 (a int);
......@@ -116,7 +115,6 @@ while (`SELECT $i <= $slave_last`)
--echo # allowed timeout, the primary should delay killing the suspended thread
--echo # until an ACK is received (Rpl_semi_sync_master_yes_tx should be 1).
--echo #
--let server_1_dbug= ""
--let server_2_dbug= "+d,simulate_delay_semisync_slave_reply"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
......@@ -129,9 +127,8 @@ while (`SELECT $i <= $slave_last`)
--echo # the primary should delay killing the suspended thread until the
--echo # timeout is reached (Rpl_semi_sync_master_no_tx should be 1).
--echo #
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
--let server_2_dbug= "+d,corrupt_queue_event"
--let server_3_dbug= "+d,corrupt_queue_event"
--let server_2_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141"
--let server_3_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141"
--let semisync_timeout= 500
--let server_2_expect_row_count= 0
--let server_3_expect_row_count= 0
......@@ -143,8 +140,7 @@ while (`SELECT $i <= $slave_last`)
--echo # primary should delay killing the suspended thread until it receives an
--echo # ACK from the delayed slave (Rpl_semi_sync_master_yes_tx should be 1).
--echo #
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
--let server_2_dbug= "+d,corrupt_queue_event"
--let server_2_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
--let server_2_expect_row_count= 0
......@@ -163,15 +159,7 @@ while (`SELECT $i <= $slave_last`)
--echo # slave does not send a `QUIT` in this case (Rpl_semi_sync_master_yes_tx
--echo # should be 1 because server_3 will send the ACK within a valid timeout).
--echo #
# mysqld_delay_kill_threads_phase1 ensures that server_2 will have enough time
# to start a new connection that has the intent to kill the active semi-sync
# connection
--let server_1_dbug= "+d,mysqld_delay_kill_threads_phase_1"
# slave_delay_killing_semisync_connection ensures that the primary has force
# killed its current connection before it is able to issue `KILL`
--let server_2_dbug= "+d,corrupt_queue_event,slave_delay_killing_semisync_connection"
--let server_2_dbug= "+d,corrupt_queue_event,delay_semisync_kill_connection_for_mdev_28141"
--let server_3_dbug= "+d,simulate_delay_semisync_slave_reply"
--let semisync_timeout= 1600
--let server_2_expect_row_count= 0
......@@ -184,32 +172,24 @@ while (`SELECT $i <= $slave_last`)
--connection server_2
source include/stop_slave.inc;
SET @@GLOBAL.rpl_semi_sync_slave_enabled = @sav_enabled_server_2;
SET @@GLOBAL.debug_dbug= @sav_server_2_dbug;
source include/start_slave.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_2
--eval SET @@GLOBAL.debug_dbug= "$sav_server_2_dbug"
--enable_query_log
--connection server_3
source include/stop_slave.inc;
SET @@GLOBAL.rpl_semi_sync_slave_enabled = @sav_enabled_server_3;
SET @@GLOBAL.debug_dbug= @sav_server_3_dbug;
source include/start_slave.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_slave_enabled = $sav_enabled_server_3
--eval SET @@GLOBAL.debug_dbug= "$sav_server_3_dbug"
--enable_query_log
--connection server_1
let $status_var= Rpl_semi_sync_master_clients;
let $status_var_value= 0;
source include/wait_for_status_var.inc;
--disable_query_log
--eval SET @@GLOBAL.rpl_semi_sync_master_timeout= $sav_master_timeout
--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $sav_enabled_master
--eval SET @@GLOBAL.debug_dbug= "$sav_master_dbug"
--eval SET @@GLOBAL.rpl_semi_sync_master_timeout= $sav_master_timeout;
--eval SET @@GLOBAL.rpl_semi_sync_master_enabled= $sav_enabled_master;
--enable_query_log
drop table t1;
......
......@@ -1742,7 +1742,6 @@ static void close_connections(void)
This will give the threads some time to gracefully abort their
statements and inform their clients that the server is about to die.
*/
DBUG_EXECUTE_IF("mysqld_delay_kill_threads_phase_1", my_sleep(200000););
int n_threads_awaiting_ack= 0;
server_threads.iterate(kill_thread_phase_1, &n_threads_awaiting_ack);
......
......@@ -17,6 +17,7 @@
#include <my_global.h>
#include "semisync_slave.h"
#include "debug_sync.h"
Repl_semi_sync_slave repl_semisync_slave;
......@@ -115,7 +116,21 @@ int Repl_semi_sync_slave::slave_start(Master_info *mi)
int Repl_semi_sync_slave::slave_stop(Master_info *mi)
{
if (get_slave_enabled())
{
#ifdef ENABLED_DEBUG_SYNC
/*
TODO: Remove after MDEV-28141
*/
DBUG_EXECUTE_IF("delay_semisync_kill_connection_for_mdev_28141", {
const char act[]= "now "
"signal at_semisync_kill_connection "
"wait_for continue_semisync_kill_connection";
DBUG_ASSERT(debug_sync_service);
DBUG_ASSERT(!debug_sync_set_action(mi->io_thd, STRING_WITH_LEN(act)));
};);
#endif
kill_connection(mi->mysql);
}
if (rpl_semi_sync_slave_status)
rpl_semi_sync_slave_status= 0;
......@@ -151,8 +166,6 @@ void Repl_semi_sync_slave::kill_connection(MYSQL *mysql)
goto failed_graceful_kill;
}
DBUG_EXECUTE_IF("slave_delay_killing_semisync_connection", my_sleep(400000););
kill_buffer_length= my_snprintf(kill_buffer, 30, "KILL %lu",
mysql->thread_id);
if (mysql_real_query(kill_mysql, kill_buffer, (ulong)kill_buffer_length))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment