Commit 33de71c2 authored by sjaakola's avatar sjaakola Committed by Jan Lindström

MDEV-22632 wsrep XID checkpointing can happen out of order for certification failure

When a transaction fails in the certification phase, it has consumed one GTID, but as
the transaction must roll back, it will not go for commit ordering, and because of this
the wsrep XID checkpointing can also happen out of order.
This PR makes the thread that has failed certification wait for its
commit order turn before checkpointing the wsrep XID in the InnoDB rollback segment.

There is a specific test for wsrep XID checkpointing ordering in mtr test:
mysql-wsrep-bugs-607, which is added in this PR.

Test galera_slave_replay also depends on this fix, as the second test phase
may also assert for bad wsrep XID checkpointing order.
galera_slave_replay.test also had other problems, which caused the test to
fail immediately; these are now fixed in this PR as well.
parent b4abe7c9
......@@ -4,9 +4,7 @@ connection node_2;
connection node_1;
connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3;
connection node_3;
SET GLOBAL wsrep_on=OFF;
RESET MASTER;
SET GLOBAL wsrep_on=ON;
connection node_2a;
START SLAVE;
connection node_3;
......@@ -74,8 +72,10 @@ UPDATE t1 SET f2 = 'd' WHERE f1 = 3;
connection node_2a;
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync';
SET GLOBAL debug_dbug = "d,sync.wsrep_apply_cb";
connection node_3;
connection node_1;
UPDATE test.t1 SET f2 = 'e' WHERE f1 = 3;
connection node_2a;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
connection node_3;
COMMIT;
connection node_2a;
......@@ -96,6 +96,4 @@ RESET SLAVE;
DROP TABLE t1;
connection node_3;
DROP TABLE t1;
SET GLOBAL wsrep_on=OFF;
RESET MASTER;
SET GLOBAL wsrep_on=ON;
connection node_2;
connection node_1;
#
# test the order of wsrep XID storage after certifiation failure
#
connection node_1;
set session wsrep_sync_wait=0;
create table t1 (i int primary key, j int);
insert into t1 values (4, 0);
connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2;
connection node_2b;
set session wsrep_sync_wait=0;
SET GLOBAL debug_dbug = "d,sync.wsrep_apply_cb";
connection node_1;
UPDATE test.t1 set j=1 where i=4;
connection node_2b;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
connection node_2;
set session wsrep_sync_wait=0;
set session wsrep_retry_autocommit=0;
UPDATE test.t1 SET j=2 WHERE i=4;
connection node_2b;
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = "";
SET DEBUG_SYNC = "RESET";
connection node_2;
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
select * from t1;
i j
4 1
DROP TABLE t1;
......@@ -21,9 +21,7 @@
#
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3
--connection node_3
SET GLOBAL wsrep_on=OFF;
RESET MASTER;
SET GLOBAL wsrep_on=ON;
--connection node_2a
#
......@@ -163,18 +161,25 @@ UPDATE t1 SET f2 = 'd' WHERE f1 = 3;
# block applier
SET GLOBAL debug_dbug = "d,sync.wsrep_apply_cb";
# Inject a conflicting update from node 3
--connection node_3
# Inject a conflicting update from node 1
--connection node_1
UPDATE test.t1 SET f2 = 'e' WHERE f1 = 3;
--connection node_2a
# wait until applier has reached the sync point
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
--let $expected_cert_failures = `SELECT VARIABLE_VALUE+1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'`
# send the update from master
--connection node_3
--error 0
COMMIT;
--connection node_2a
--let $wait_condition = SELECT VARIABLE_VALUE = $expected_cert_failures FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'
--source include/wait_condition.inc
# release the applier
# release the applier from node 1
SET GLOBAL debug_dbug = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
......@@ -198,6 +203,4 @@ DROP TABLE t1;
--connection node_3
DROP TABLE t1;
SET GLOBAL wsrep_on=OFF;
RESET MASTER;
SET GLOBAL wsrep_on=ON;
--source include/galera_cluster.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--source include/galera_have_debug_sync.inc
--source include/have_log_bin.inc
#
# Test case to stress the order of wsrep XID checkpointing.
#
# In the buggy version, a transaction which failed certification can
# rush to record its wsrep XID checkpoint before the offending applier,
# causing an assert in the InnoDB sys header update routine.
#
# Outline:
#   1. node 1 creates t1 with a single row (4, 0)
#   2. the applier on node 2 is parked at sync point sync.wsrep_apply_cb
#      while it applies an UPDATE of that row coming from node 1
#   3. node 2 issues a conflicting local UPDATE of the same row, which is
#      expected to fail certification
#   4. the applier is released, and the local UPDATE is expected to end
#      with ER_LOCK_DEADLOCK
#
--echo #
--echo # test the order of wsrep XID storage after certifiation failure
--echo #
--connection node_1
set session wsrep_sync_wait=0;
create table t1 (i int primary key, j int);
insert into t1 values (4, 0);
# second connection to node 2, used to control the applier sync point
--connect node_2b, 127.0.0.1, root, , test, $NODE_MYPORT_2
--connection node_2b
set session wsrep_sync_wait=0;
# wait for the last insert to be replicated from node 1
--let $wait_condition = SELECT COUNT(*) = 1 FROM test.t1;
--source include/wait_condition.inc
# block the applier before applying
SET GLOBAL debug_dbug = "d,sync.wsrep_apply_cb";
# send an update from node 1; its applier on node 2 will pause in the sync point
--connection node_1
UPDATE test.t1 set j=1 where i=4;
--connection node_2b
# wait until the applier has reached the sync point
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
# read the number of cert failures so far, and expect one more to happen
--let $expected_cert_failures = `SELECT VARIABLE_VALUE+1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'`
# Inject a conflicting update in node 2, it should fail in certification
--connection node_2
set session wsrep_sync_wait=0;
# NOTE(review): presumably disables autocommit retry so that the failure
# surfaces to the client as ER_LOCK_DEADLOCK below - confirm
set session wsrep_retry_autocommit=0;
# sent asynchronously; the result is collected with --reap further down
--send UPDATE test.t1 SET j=2 WHERE i=4
--connection node_2b
# wait until the update has hit certification failure
--let $wait_condition = SELECT VARIABLE_VALUE = $expected_cert_failures FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_local_cert_failures'
--source include/wait_condition.inc
# release the applier and clear the debug sync state
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = "";
SET DEBUG_SYNC = "RESET";
# collect the result of the conflicting update; it must have failed
--connection node_2
--error ER_LOCK_DEADLOCK
--reap
# only the update from node 1 should be visible in the table
select * from t1;
DROP TABLE t1;
......@@ -303,9 +303,21 @@ wsrep::gtid Wsrep_server_service::get_position(wsrep::client_service&)
return wsrep_get_SE_checkpoint();
}
// Stores `gtid` as the wsrep storage engine checkpoint, after first waiting
// for the GTID with the immediately preceding seqno, so that the checkpoint
// is not written ahead of earlier committers.
// NOTE(review): the two consecutive signature lines below look like the
// removed and the added line of a diff hunk shown without +/- markers; only
// the second one names the client service parameter `c` - confirm.
void Wsrep_server_service::set_position(wsrep::client_service&,
void Wsrep_server_service::set_position(wsrep::client_service& c WSREP_UNUSED,
const wsrep::gtid& gtid)
{
// `cs` is referenced only by the assertion below; it is marked WSREP_UNUSED,
// presumably to silence unused-variable warnings when DBUG_ASSERT compiles
// to nothing - confirm.
Wsrep_client_service& cs WSREP_UNUSED (static_cast<Wsrep_client_service&>(c));
// The calling transaction is expected to be in the aborted state here.
DBUG_ASSERT(cs.m_client_state.transaction().state()
== wsrep::transaction::s_aborted);
// Wait until all prior committers have finished.
// The wait target is the same GTID id with seqno - 1.
wsrep::gtid wait_for(gtid.id(),
wsrep::seqno(gtid.seqno().get() - 1));
// The timeout argument is the largest int value; its unit is not visible
// here. NOTE(review): presumably meant as an effectively unbounded wait.
if (auto err = Wsrep_server_state::instance().provider()
.wait_for_gtid(wait_for, std::numeric_limits<int>::max()))
{
// A failed wait is only logged; the checkpoint is still written below.
WSREP_WARN("Wait for gtid returned error %d while waiting for "
"prior transactions to commit before setting position", err);
}
wsrep_set_SE_checkpoint(gtid);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment