MDEV-31949: rpl_xa_prepare_gtid_fail deterministic paths

happy_xac is where the XA COMMIT completes before noticing the error signalled by the prior XAP. sad_xac is where the XA COMMIT notices the error signalled by the prior XAP and rolls back, leaving a dangling XAP.

MDEV-31949: rpl_xa_prepare_gtid_fail deterministic paths
happy_xac is where the XA COMMIT completes before noticing the error signalled by the prior XAP. sad_xac is where the XA COMMIT notices the error signalled by the prior XAP and rolls back, leaving a dangling XAP.
acb9c9e9 · Brandon Nesterenko · bd37485a · acb9c9e9 · acb9c9e9 · acb9c9e9
Commit acb9c9e9 authored Oct 18, 2023 by Brandon Nesterenko
3 changed files
--- a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail_happy_xac.test
+++ b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail_happy_xac.test
+#
+#   When handling the replication of an XA PREPARE, the commit phase is
+# bifurcated. First, the prepare is handled by the relevant storage engines.
+# Then second, the GTID slave state is updated as a separate autocommit
+# transaction. If the second stage fails, i.e. we are unable to update the
+# GTID slave state, then the slave should immediately quit in error, without
+# retry.
+#
+#   This test validates the above behavior by forcing a lock-wait timeout on
+# the GTID slave state table during the second part of XA PREPARE's commit, to
+# ensure that the appropriate error is reported and the transaction was never
+# retried.
+#
+#
+# References
+#   MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating Slave
+#               GTID State
+#
+source include/master-slave.inc;
+source include/have_binlog_format_row.inc;
+source include/have_debug.inc;
+source include/have_debug_sync.inc;
+source include/have_innodb.inc;
+
+--connection slave
+set statement sql_log_bin=0 for call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+--source include/stop_slave.inc
+
+set @save_par_thds= @@global.slave_parallel_threads;
+set @save_strict_mode= @@global.gtid_strict_mode;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
+
+change master to master_use_gtid=slave_pos;
+set @@global.slave_parallel_threads= 4;
+set @@global.slave_parallel_mode= optimistic;
+set @@global.gtid_strict_mode=ON;
+
+set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb;
+--source include/start_slave.inc
+
+--connection master
+create table t1 (a int primary key, b int) engine=innodb;
+insert t1 values (1,1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+set @@global.innodb_lock_wait_timeout= 1;
+
+--let $retried_tx_initial= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
+
+--connection master
+--let $gtid_domain_id=`SELECT @@GLOBAL.gtid_domain_id`
+--let $gtid_server_id=`SELECT @@GLOBAL.server_id`
+# seq_no 99 (XA PREPARE) will pause at debug point hold_xap_finalization
+--let $xap_seq_no=99
+--eval set @@session.gtid_seq_no=$xap_seq_no
+xa start '1';
+update t1 set b=b+10 where a=1;
+xa end '1';
+xa prepare '1';
+xa commit '1';
+--let $new_gtid= `SELECT @@global.gtid_binlog_pos`
+--source include/save_master_gtid.inc
+
+--connection slave1
+BEGIN;
+--eval SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=$xap_seq_no FOR UPDATE
+
+--connection slave
+set @old_dbug= @@GLOBAL.debug_dbug;
+set @@GLOBAL.debug_dbug= "+d,hold_xap_finalization";
+--source include/start_slave.inc
+
+set debug_sync='now wait_for xap_finalizing';
+
+--echo # Waiting for XAC to binlog
+--let $wait_condition= SELECT @@global.gtid_binlog_pos='0-1-100';
+--source include/wait_condition.inc
+
+set debug_sync='now signal xap_continue';
+
+--let $slave_sql_errno= 1942
+--source include/wait_for_slave_sql_error.inc
+
+# TODO: Remove after fixing MDEV-21777
+# Stop the IO thread too, so the existing relay logs are force purged on slave
+# restart, so as not to re-execute the already-prepared transaction
+--source include/stop_slave_io.inc
+
+--let $retried_tx_test= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
+if ($retried_tx_initial != $retried_tx_test)
+{
+    --echo Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
+    --die Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
+}
+
+--connection slave1
+ROLLBACK;
+
+set @@GLOBAL.debug_dbug= @old_dbug;
+set debug_sync= 'reset';
+
+# XA COMMIT succeeds, this is empty
+XA RECOVER;
+
+--let $xac_failed= query_get_value(XA RECOVER, data, 1)
+if ($xac_failed != "No such row")
+{
+    die XAC should have suceeded;
+}
+
+
+--echo # Cleanup
+
+--connection master
+drop table t1;
+
+--connection slave
+--echo # TODO: Remove after fixing MDEV-21777
+--eval set @@global.gtid_slave_pos= "$new_gtid"
+set @@global.slave_parallel_threads= @save_par_thds;
+set @@global.gtid_strict_mode= @save_strict_mode;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
+--source include/start_slave.inc
+
+--source include/rpl_end.inc
+--echo # End of rpl_xa_prepare_gtid_fail.test
--- a/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail_sad_xac.test
+++ b/mysql-test/suite/rpl/t/rpl_xa_prepare_gtid_fail_sad_xac.test
+#
+#   When handling the replication of an XA PREPARE, the commit phase is
+# bifurcated. First, the prepare is handled by the relevant storage engines.
+# Then second, the GTID slave state is updated as a separate autocommit
+# transaction. If the second stage fails, i.e. we are unable to update the
+# GTID slave state, then the slave should immediately quit in error, without
+# retry.
+#
+#   This test validates the above behavior by forcing a lock-wait timeout on
+# the GTID slave state table during the second part of XA PREPARE's commit, to
+# ensure that the appropriate error is reported and the transaction was never
+# retried.
+#
+#
+# References
+#   MDEV-31038: Parallel Replication Breaks if XA PREPARE Fails Updating Slave
+#               GTID State
+#
+source include/master-slave.inc;
+source include/have_binlog_format_row.inc;
+source include/have_debug.inc;
+source include/have_debug_sync.inc;
+source include/have_innodb.inc;
+
+--connection slave
+set statement sql_log_bin=0 for call mtr.add_suppression("Commit failed due to failure of an earlier commit on which this one depends");
+--source include/stop_slave.inc
+
+set @save_par_thds= @@global.slave_parallel_threads;
+set @save_strict_mode= @@global.gtid_strict_mode;
+set @save_innodb_lock_wait_timeout= @@global.innodb_lock_wait_timeout;
+
+change master to master_use_gtid=slave_pos;
+set @@global.slave_parallel_threads= 4;
+set @@global.slave_parallel_mode= optimistic;
+set @@global.gtid_strict_mode=ON;
+
+set statement sql_log_bin=0 for alter table mysql.gtid_slave_pos engine=innodb;
+--source include/start_slave.inc
+
+--connection master
+create table t1 (a int primary key, b int) engine=innodb;
+insert t1 values (1,1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+set @@global.innodb_lock_wait_timeout= 1;
+
+--let $retried_tx_initial= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
+
+--connection master
+--let $gtid_domain_id=`SELECT @@GLOBAL.gtid_domain_id`
+--let $gtid_server_id=`SELECT @@GLOBAL.server_id`
+# seq_no 100 (XA COMMIT) will hit debug_sync hold_worker_on_schedule
+--let $xap_seq_no=99
+--eval set @@session.gtid_seq_no=$xap_seq_no
+xa start '1';
+update t1 set b=b+10 where a=1;
+xa end '1';
+xa prepare '1';
+xa commit '1';
+--let $new_gtid= `SELECT @@global.gtid_binlog_pos`
+--source include/save_master_gtid.inc
+
+--connection slave1
+BEGIN;
+--eval SELECT * FROM mysql.gtid_slave_pos WHERE seq_no=$xap_seq_no FOR UPDATE
+
+--connection slave
+set @old_dbug= @@GLOBAL.debug_dbug;
+set @@GLOBAL.debug_dbug= "+d,hold_worker_on_schedule";
+--source include/start_slave.inc
+
+# Note it's possible that this won't be signalled if XAP fails before another worker
+# thread begins processing the XAC GTID
+set debug_sync='now wait_for reached_pause timeout 1';
+
+# Give time to ensure XAP gtid_slave_pos update fails and signals its waiter
+# to stop due to failure of prior commit
+sleep 3;
+set DEBUG_SYNC='now signal continue_worker';
+
+--let $slave_sql_errno= 1942
+--source include/wait_for_slave_sql_error.inc
+
+# TODO: Remove after fixing MDEV-21777
+# Stop the IO thread too, so the existing relay logs are force purged on slave
+# restart, so as not to re-execute the already-prepared transaction
+--source include/stop_slave_io.inc
+
+--let $retried_tx_test= query_get_value(SHOW ALL SLAVES STATUS, Retried_transactions, 1)
+if ($retried_tx_initial != $retried_tx_test)
+{
+    --echo Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
+    --die Transaction was retried when a failed XA PREPARE slave GTID update should lead to immediate slave stop without retry
+}
+
+--connection slave1
+ROLLBACK;
+
+set @@GLOBAL.debug_dbug= @old_dbug;
+set debug_sync= 'reset';
+
+# XA COMMIT failed, and its XAP should still be dangling
+XA RECOVER;
+
+--let $xac_failed= query_get_value(XA RECOVER, data, 1)
+if ($xac_failed == "No such row")
+{
+    die XAC should have failed;
+}
+
+# So commit it
+set statement gtid_domain_id=0, server_id=1, gtid_seq_no=100 for xa commit '1';
+
+--echo # Cleanup
+
+--connection master
+drop table t1;
+
+--connection slave
+--echo # TODO: Remove after fixing MDEV-21777
+--eval set @@global.gtid_slave_pos= "$new_gtid"
+set @@global.slave_parallel_threads= @save_par_thds;
+set @@global.gtid_strict_mode= @save_strict_mode;
+set @@global.innodb_lock_wait_timeout= @save_innodb_lock_wait_timeout;
+--source include/start_slave.inc
+
+--source include/rpl_end.inc
+--echo # End of rpl_xa_prepare_gtid_fail.test
--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -155,6 +155,11 @@ finish_event_group(rpl_parallel_thread *rpt, uint64 sub_id,
  wait_for_commit *wfc= &rgi->commit_orderer;
  int err;

+  DBUG_EXECUTE_IF("hold_xap_finalization", {
+      if (rgi->current_gtid.seq_no == 99) {
+        debug_sync_set_action(thd, STRING_WITH_LEN("now SIGNAL xap_finalizing WAIT_FOR xap_continue"));
+      }});
+
  thd->get_stmt_da()->set_overwrite_status(true);

  if (unlikely(rgi->worker_error))