Commit d4309d48 authored by Kristian Nielsen

MDEV-31448: Killing a replica thread awaiting its GCO can hang/crash a parallel replica

Various test cases for the bugs around MDEV-31448.
Test cases due to Brandon Nesterenko, thanks!
Reviewed-by: Andrei Elkin <andrei.elkin@mariadb.com>
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
parent 5d61442c
# MDEV-31448, CONSERVATIVE parallel mode.
# Kill the worker that applies T3 (second group commit) while it is still
# waiting for the first group commit to start committing, then let the
# blocked T1 continue and check that the slave SQL thread stops cleanly.
#
# Expects connections master, slave and slave1 to exist already.

# Initialize test data: one row in t1, an empty t2, both replicated.
--connection master
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc
--connection slave
call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Remember the parallel replication settings; they are put back before the
# final start_slave.inc below.
set @save.slave_parallel_threads= @@global.slave_parallel_threads;
set @save.slave_parallel_mode= @@global.slave_parallel_mode;
set @@global.slave_parallel_threads= 3;
set @@global.slave_parallel_mode= CONSERVATIVE;
# Open a transaction on the slave that changes the very row T1 will look
# for, and leave it uncommitted so that T1 gets stuck when it is applied.
--connection slave1
BEGIN;
update t1 set a=2 where a=1;
# Generate three transactions on the master with forced commit ids:
# T1 and T2 share commit id 10000, T3 gets commit id 10001.
--connection master
SET @old_dbug= @@SESSION.debug_dbug;
SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
# GCO 1
SET @commit_id= 10000;
# T1
update t1 set a=2 where a=1;
# T2
insert into t2 values (1);
# GCO 2
SET @commit_id= 10001;
# T3
insert into t1 values (3);
--connection slave
--source include/start_slave.inc
# Wait until exactly one worker is in each of the three expected states:
# find_row (T1, stuck behind slave1), waiting for the prior transaction to
# commit (T2), and waiting for the prior transaction to start commit (T3).
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
# Pick the thread id with the same filter as the wait condition above, so
# only a Slave_worker thread can ever be selected for the kill.
--let $t3_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker'`
# evalp logs the statement with the variable name unexpanded, keeping the
# recorded result independent of the actual thread id.
--evalp kill $t3_tid
# Release T1 by committing slave1's transaction.
--connection slave1
commit;
--connection slave
# NOTE(review): this sets the wait timeout to 1032; the value looks like an
# error number, so $slave_sql_errno may have been intended - confirm.
--let $slave_timeout=1032
--source include/wait_for_slave_sql_to_stop.inc
# Undo slave1's local change so the row again matches what T1 (where a=1)
# expects when replication is restarted.
update t1 set a=1 where a=2;
set @@global.slave_parallel_threads = @save.slave_parallel_threads;
set @@global.slave_parallel_mode = @save.slave_parallel_mode;
--source include/start_slave.inc
--echo #
--echo # Cleanup
--connection master
DROP TABLE t1, t2;
# Switch the commit id injection off again so it does not leak into
# whatever the including test runs next on this connection. The query log
# is disabled around it to keep the recorded result file unchanged.
--disable_query_log
SET @@SESSION.debug_dbug= @old_dbug;
--enable_query_log
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--echo # MDEV-31448 OOO finish event group by killed worker
# The test demonstrates how a killed worker accesses the gco lists
# in finish_event_group() out-of-order to fire
# DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
# in the buggy version.
#
# Input: $killed_trx_commits
#   false - T2 is binlogged with its own commit id (10001) and with
#           skip_parallel_replication=1
#   true  - T2 is a plain insert that shares commit id 10000 with T1
#
# Expects connections master, slave and slave1 to exist already.
--echo # Initialize test data
--connection master
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc
--connection slave
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Can.t find record");
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Remember the parallel replication settings; they are put back before the
# final start_slave.inc below.
set @save.slave_parallel_threads= @@global.slave_parallel_threads;
set @save.slave_parallel_mode= @@global.slave_parallel_mode;
set @@global.slave_parallel_threads= 3;
set @@global.slave_parallel_mode= OPTIMISTIC;
# Open a transaction on the slave that changes the very row T1 will look
# for, and leave it uncommitted so that T1 gets stuck when it is applied.
--connection slave1
begin;
update t1 set a=2 where a=1;
--connection master
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
# GCO 1
set @commit_id= 10000;
# T1
update t1 set a=2 where a=1;
# T2 comes in one of two flavours, selected by $killed_trx_commits.
if (!$killed_trx_commits)
{
set @commit_id= 10001;
# T2
set statement skip_parallel_replication=1 for insert into t2 values (1);
}
if ($killed_trx_commits)
{
insert into t2 values (1);
}
# GCO 2
# T3
drop table t2;
--connection slave
--source include/start_slave.inc
--echo # wait for T1
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--echo # wait for T2
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
# Capture T2's thread id now; the kill is issued only after T3 has been
# seen waiting as well.
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker'`
--echo # wait for T3
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
# evalp logs the statement with the variable name unexpanded, keeping the
# recorded result independent of the actual thread id.
--evalp kill $t2_tid
# Do not release T1 before the slave has recorded error 1927 in Last_Errno
# (presumably ER_CONNECTION_KILLED, matching the suppression added above).
--let $slave_param=Last_Errno
--let $slave_param_value=1927
--source include/wait_for_slave_param.inc
# Release T1 by committing slave1's transaction.
--connection slave1
commit;
--connection slave
# NOTE(review): this sets the wait timeout to 1032; the value looks like an
# error number, so $slave_sql_errno may have been intended - confirm.
--let $slave_timeout=1032
--source include/wait_for_slave_sql_to_stop.inc
# Undo slave1's local change so the row again matches what T1 (where a=1)
# expects when replication is restarted.
update t1 set a=1 where a=2;
set @@global.slave_parallel_threads = @save.slave_parallel_threads;
set @@global.slave_parallel_mode = @save.slave_parallel_mode;
--source include/start_slave.inc
--echo #
--echo # Cleanup
--connection master
drop table t1;
# Switch the commit id injection off again so it does not leak into
# whatever the including test runs next on this connection. The query log
# is disabled around it to keep the recorded result file unchanged.
--disable_query_log
set @@session.debug_dbug= @old_dbug;
--enable_query_log
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
include/master-slave.inc
[connection master]
# MDEV-31448 OOO finish event group by killed worker
# Initialize test data
connection master;
call mtr.add_suppression("Slave: Connection was killed");
call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @@global.slave_parallel_threads= 4;
set @@global.slave_parallel_mode= OPTIMISTIC;
set @@global.innodb_lock_wait_timeout= 30;
set @@global.slave_transaction_retries= 0;
connection slave1;
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
a
1
connection master;
SET @old_dbug= @@SESSION.debug_dbug;
SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10000;
update t1 set a=2 where a=1;
set statement skip_parallel_replication=1 for insert into t2 values (1);
drop table t2;
connection slave;
include/start_slave.inc
# wait for T1
# wait for T2
# wait for T3
kill T2_TID;
connection slave1;
ROLLBACK;
connection master;
DROP TABLE t1;
include/save_master_gtid.inc
connection slave;
#
# Cleanup
include/stop_slave.inc
set @@global.slave_parallel_threads= 0;
set @@global.slave_parallel_mode= conservative;
set @@global.innodb_lock_wait_timeout= 50;
set @@global.slave_transaction_retries= 10;
include/start_slave.inc
include/sync_with_master_gtid.inc
include/rpl_end.inc
include/master-slave.inc
[connection master]
connection master;
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @save.slave_parallel_threads= @@global.slave_parallel_threads;
set @save.slave_parallel_mode= @@global.slave_parallel_mode;
set @@global.slave_parallel_threads= 3;
set @@global.slave_parallel_mode= CONSERVATIVE;
connection slave1;
BEGIN;
update t1 set a=2 where a=1;
connection master;
SET @old_dbug= @@SESSION.debug_dbug;
SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10000;
update t1 set a=2 where a=1;
insert into t2 values (1);
SET @commit_id= 10001;
insert into t1 values (3);
connection slave;
include/start_slave.inc
kill $t3_tid;
connection slave1;
commit;
connection slave;
include/wait_for_slave_sql_to_stop.inc
update t1 set a=1 where a=2;
set @@global.slave_parallel_threads = @save.slave_parallel_threads;
set @@global.slave_parallel_mode = @save.slave_parallel_mode;
include/start_slave.inc
#
# Cleanup
connection master;
DROP TABLE t1, t2;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
# MDEV-31448 OOO finish event group by killed worker
# Initialize test data
connection master;
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Can.t find record");
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @save.slave_parallel_threads= @@global.slave_parallel_threads;
set @save.slave_parallel_mode= @@global.slave_parallel_mode;
set @@global.slave_parallel_threads= 3;
set @@global.slave_parallel_mode= OPTIMISTIC;
connection slave1;
begin;
update t1 set a=2 where a=1;
connection master;
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
set @commit_id= 10000;
update t1 set a=2 where a=1;
insert into t2 values (1);
drop table t2;
connection slave;
include/start_slave.inc
# wait for T1
# wait for T2
# wait for T3
kill $t2_tid;
include/wait_for_slave_param.inc [Last_Errno]
connection slave1;
commit;
connection slave;
include/wait_for_slave_sql_to_stop.inc
update t1 set a=1 where a=2;
set @@global.slave_parallel_threads = @save.slave_parallel_threads;
set @@global.slave_parallel_mode = @save.slave_parallel_mode;
include/start_slave.inc
#
# Cleanup
connection master;
drop table t1;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
# MDEV-31448 OOO finish event group by killed worker
# Initialize test data
connection master;
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
include/save_master_gtid.inc
connection slave;
call mtr.add_suppression("Connection was killed");
call mtr.add_suppression("Can.t find record");
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @save.slave_parallel_threads= @@global.slave_parallel_threads;
set @save.slave_parallel_mode= @@global.slave_parallel_mode;
set @@global.slave_parallel_threads= 3;
set @@global.slave_parallel_mode= OPTIMISTIC;
connection slave1;
begin;
update t1 set a=2 where a=1;
connection master;
set @old_dbug= @@session.debug_dbug;
set @@session.debug_dbug="+d,binlog_force_commit_id";
set @commit_id= 10000;
update t1 set a=2 where a=1;
set @commit_id= 10001;
set statement skip_parallel_replication=1 for insert into t2 values (1);
drop table t2;
connection slave;
include/start_slave.inc
# wait for T1
# wait for T2
# wait for T3
kill $t2_tid;
include/wait_for_slave_param.inc [Last_Errno]
connection slave1;
commit;
connection slave;
include/wait_for_slave_sql_to_stop.inc
update t1 set a=1 where a=2;
set @@global.slave_parallel_threads = @save.slave_parallel_threads;
set @@global.slave_parallel_mode = @save.slave_parallel_mode;
include/start_slave.inc
#
# Cleanup
connection master;
drop table t1;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/rpl_end.inc
--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_binlog_format_row.inc
--echo # MDEV-31448 OOO finish event group by killed worker
# The test demonstrates how a killed worker accesses the gco lists
# in finish_event_group() out-of-order to fire
# DBUG_ASSERT(!tmp_gco->next_gco || tmp_gco->last_sub_id > sub_id);
# in the buggy version.
--echo # Initialize test data
--connection master
call mtr.add_suppression("Slave: Connection was killed");
call mtr.add_suppression("Slave: Commit failed due to failure of an earlier commit on which this one depends");
create table t1 (a int) engine=innodb;
create table t2 (a int) engine=innodb;
insert into t1 values (1);
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Keep the current global settings in mysqltest variables; the cleanup
# section substitutes them back with eval.
--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
--let $save_innodb_lock_wait_timeout= `SELECT @@global.innodb_lock_wait_timeout`
--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries`
set @@global.slave_parallel_threads= 4;
set @@global.slave_parallel_mode= OPTIMISTIC;
set @@global.innodb_lock_wait_timeout= 30;
set @@global.slave_transaction_retries= 0;
# Lock the row T1 is going to update, so that T1 gets stuck on the slave
# until this transaction is rolled back further down.
--connection slave1
BEGIN;
SELECT * FROM t1 WHERE a=1 FOR UPDATE;
--connection master
SET @old_dbug= @@SESSION.debug_dbug;
SET @@SESSION.debug_dbug="+d,binlog_force_commit_id";
# GCO 1
SET @commit_id= 10000;
# T1
update t1 set a=2 where a=1;
# T2
set statement skip_parallel_replication=1 for insert into t2 values (1);
# GCO 2
# T3
drop table t2;
--connection slave
--source include/start_slave.inc
--echo # wait for T1
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Update_rows_log_event::find_row(-1)' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
--echo # wait for T2
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
# Capture T2's thread id now; the kill is issued only after T3 has been
# seen waiting as well.
--let $t2_tid= `SELECT ID FROM INFORMATION_SCHEMA.PROCESSLIST WHERE STATE LIKE 'Waiting for prior transaction to commit%' and command LIKE 'Slave_worker'`
--echo # wait for T3
--let $wait_condition= SELECT count(*)=1 FROM information_schema.processlist WHERE state LIKE 'Waiting for prior transaction to start commit%' and command LIKE 'Slave_worker';
--source include/wait_condition.inc
# Mask the real thread id so the recorded result stays stable.
--replace_result $t2_tid T2_TID
--eval kill $t2_tid
# NOTE(review): fixed one second pause, presumably to let the kill take
# effect before T1 is released; this is timing based rather than a wait
# on an observable condition.
--sleep 1
--connection slave1
# Release the blocked T1
ROLLBACK;
--connection master
DROP TABLE t1;
# Switch the commit id injection off again now that the last master
# statement has been binlogged. The query log is disabled around it to
# keep the recorded result file unchanged.
--disable_query_log
SET @@SESSION.debug_dbug= @old_dbug;
--enable_query_log
--source include/save_master_gtid.inc
--connection slave
--echo #
--echo # Cleanup
--source include/stop_slave.inc
eval set @@global.slave_parallel_threads= $save_slave_parallel_threads;
eval set @@global.slave_parallel_mode= $save_slave_parallel_mode;
eval set @@global.innodb_lock_wait_timeout= $save_innodb_lock_wait_timeout;
eval set @@global.slave_transaction_retries= $save_transaction_retries;
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
--source include/rpl_end.inc
# Driver for the MDEV-31448 kill scenarios: sets up master/slave
# replication, then runs the CONSERVATIVE include once and the OPTIMISTIC
# include twice, once for each value of $killed_trx_commits.
source include/master-slave.inc;
source include/have_innodb.inc;
source include/have_debug.inc;
source include/have_binlog_format_row.inc;

# CONSERVATIVE mode scenario.
source include/mdev-31448_conservative.inc;

# OPTIMISTIC mode scenario, first with $killed_trx_commits set ...
let $killed_trx_commits= 1;
source include/mdev-31448_optimistic.inc;

# ... then with $killed_trx_commits cleared.
let $killed_trx_commits= 0;
source include/mdev-31448_optimistic.inc;

source include/rpl_end.inc;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment