Commit 4b4db4a8 authored by Kristian Nielsen

MDEV-34042: Deadlock kill of XA PREPARE can break replication /...

MDEV-34042: Deadlock kill of XA PREPARE can break replication / rpl.rpl_parallel_multi_domain_xa sporadic failure

Refinement of the original patch.

Move the code to reset the kill up into the parent class
Xid_apply_log_event, to also fix the similar issue for XA COMMIT.

Increase the number of slave retries in the test case
rpl.rpl_parallel_multi_domain_xa to fix some sporadic failures. The test
generates massive amounts of conflicting transactions in multiple
independent domains, which can cause multiple rollback+retry for a
transaction as it conflicts with transactions in other domains one-by-one.
Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
parent 2a2019e1
...@@ -6,6 +6,8 @@ connection master; ...@@ -6,6 +6,8 @@ connection master;
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
connection slave; connection slave;
include/stop_slave.inc include/stop_slave.inc
SET @old_transaction_retries = @@GLOBAL.slave_transaction_retries;
SET @@global.slave_transaction_retries = 1000;
SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads; SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads;
SET @old_slave_domain_parallel_threads = @@GLOBAL.slave_domain_parallel_threads; SET @old_slave_domain_parallel_threads = @@GLOBAL.slave_domain_parallel_threads;
SET @@global.slave_parallel_threads = 5; SET @@global.slave_parallel_threads = 5;
...@@ -45,6 +47,7 @@ include/stop_slave.inc ...@@ -45,6 +47,7 @@ include/stop_slave.inc
SET @@global.slave_parallel_mode = @old_parallel_mode; SET @@global.slave_parallel_mode = @old_parallel_mode;
SET @@global.slave_parallel_threads = @old_parallel_threads; SET @@global.slave_parallel_threads = @old_parallel_threads;
SET @@global.slave_domain_parallel_threads = @old_slave_domain_parallel_threads; SET @@global.slave_domain_parallel_threads = @old_slave_domain_parallel_threads;
SET @@global.slave_transaction_retries = @old_transaction_retries;
include/start_slave.inc include/start_slave.inc
connection master; connection master;
DROP TABLE t1; DROP TABLE t1;
......
...@@ -21,6 +21,12 @@ ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB; ...@@ -21,6 +21,12 @@ ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
--connection slave --connection slave
--sync_with_master --sync_with_master
--source include/stop_slave.inc --source include/stop_slave.inc
# This test runs a huge number of transactions independently in parallel that
# all conflict on a single row. This requires a large number of retries, as a
# transaction can repeatedly conflict/deadlock with a large number of other
# transactions (in a different domain) one by one.
SET @old_transaction_retries = @@GLOBAL.slave_transaction_retries;
SET @@global.slave_transaction_retries = 1000;
SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads; SET @old_parallel_threads = @@GLOBAL.slave_parallel_threads;
SET @old_slave_domain_parallel_threads = @@GLOBAL.slave_domain_parallel_threads; SET @old_slave_domain_parallel_threads = @@GLOBAL.slave_domain_parallel_threads;
SET @@global.slave_parallel_threads = 5; SET @@global.slave_parallel_threads = 5;
...@@ -160,6 +166,7 @@ SET @@global.slave_parallel_mode = 'optimistic'; ...@@ -160,6 +166,7 @@ SET @@global.slave_parallel_mode = 'optimistic';
SET @@global.slave_parallel_mode = @old_parallel_mode; SET @@global.slave_parallel_mode = @old_parallel_mode;
SET @@global.slave_parallel_threads = @old_parallel_threads; SET @@global.slave_parallel_threads = @old_parallel_threads;
SET @@global.slave_domain_parallel_threads = @old_slave_domain_parallel_threads; SET @@global.slave_domain_parallel_threads = @old_slave_domain_parallel_threads;
SET @@global.slave_transaction_retries = @old_transaction_retries;
--source include/start_slave.inc --source include/start_slave.inc
--connection master --connection master
......
...@@ -4066,6 +4066,9 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi) ...@@ -4066,6 +4066,9 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi)
thd->wsrep_affected_rows= 0; thd->wsrep_affected_rows= 0;
#endif #endif
#ifndef DBUG_OFF
bool record_gtid_delayed_for_xa= false;
#endif
if (rgi->gtid_pending) if (rgi->gtid_pending)
{ {
sub_id= rgi->gtid_sub_id; sub_id= rgi->gtid_sub_id;
...@@ -4084,6 +4087,10 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi) ...@@ -4084,6 +4087,10 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi)
return 1; return 1;
}); });
} }
#ifndef DBUG_OFF
else
record_gtid_delayed_for_xa= true;
#endif
} }
general_log_print(thd, COM_QUERY, get_query()); general_log_print(thd, COM_QUERY, get_query());
...@@ -4093,6 +4100,22 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi) ...@@ -4093,6 +4100,22 @@ int Xid_apply_log_event::do_apply_event(rpl_group_info *rgi)
{ {
DBUG_ASSERT(!thd->transaction->xid_state.is_explicit_XA()); DBUG_ASSERT(!thd->transaction->xid_state.is_explicit_XA());
DBUG_ASSERT(record_gtid_delayed_for_xa);
if (thd->rgi_slave->is_parallel_exec)
{
/*
With XA, since the transaction is prepared/committed without updating
the GTID pos (MDEV-32020...), we need here to clear any pending
deadlock kill.
Otherwise if the kill happened after the prepare/commit completed, it
might end up killing the subsequent GTID position update, causing the
slave to fail with error.
*/
wait_for_pending_deadlock_kill(thd, thd->rgi_slave);
thd->reset_killed();
}
if ((err= do_record_gtid(thd, rgi, false, &hton, true))) if ((err= do_record_gtid(thd, rgi, false, &hton, true)))
return err; return err;
} }
...@@ -4209,19 +4232,6 @@ int XA_prepare_log_event::do_commit() ...@@ -4209,19 +4232,6 @@ int XA_prepare_log_event::do_commit()
else else
res= trans_xa_commit(thd); res= trans_xa_commit(thd);
if (thd->rgi_slave->is_parallel_exec)
{
/*
Since the transaction is prepared/committed without updating the GTID pos
(MDEV-32020...), we need here to clear any pending deadlock kill.
Otherwise if the kill happened after the prepare/commit completed, it
might end up killing the subsequent GTID position update, causing the
slave to fail with error.
*/
wait_for_pending_deadlock_kill(thd, thd->rgi_slave);
thd->reset_killed();
}
return res; return res;
} }
#endif // HAVE_REPLICATION #endif // HAVE_REPLICATION
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment