Commit ef7fc586 authored by Teemu Ollakka, committed by Julius Goryavsky

MDEV-32282: Galera node remains paused after interleaving FTWRLs

After two concurrent FTWRL/UNLOCK TABLES, the node stays in paused state
and the following CREATE TABLE fails with

  ER_UNKNOWN_COM_ERROR (1047): Aborting TOI: Replication paused on
  node for FTWRL/BACKUP STAGE.

The cause is the use of global `wsrep_locked_seqno` to determine
if the node should be resumed on UNLOCK TABLES. In some executions
the `wsrep_locked_seqno` is cleared by the first UNLOCK TABLES
after the second FTWRL gets past `make_global_read_lock_block_commit()`.

As a fix, use `thd->wsrep_desynced_backup_stage` to determine
if the thread should resume the node on UNLOCK TABLES.

Add MTR test galera.galera_ftwrl_concurrent to reproduce the
race. The test also contains cases for BACKUP STAGE, which
uses a similar mechanism for desyncing and pausing the node.
Signed-off-by: Julius Goryavsky <julius.goryavsky@mariadb.com>
parent c9f87b88
connection node_2;
connection node_1;
connect node_1_ctrl, 127.0.0.1, root, , test, $NODE_MYPORT_1;
SET SESSION wsrep_sync_wait=0;
#
# Case 1: FTWRL
#
connection node_1;
SET SESSION wsrep_sync_wait=0;
FLUSH TABLES WITH READ LOCK;
SHOW STATUS LIKE 'wsrep_local_state_comment';
Variable_name Value
wsrep_local_state_comment Donor/Desynced
SET SESSION debug_sync = "wsrep_unlock_global_read_lock_after_resume_and_resync SIGNAL unlock_ready WAIT_FOR unlock_continue";
UNLOCK TABLES;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR unlock_ready";
connect node_1_a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1_a;
SET SESSION debug_sync = "wsrep_global_read_lock_block_commit_after_pause SIGNAL lock_ready WAIT_FOR lock_continue";
FLUSH TABLES WITH READ LOCK;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR lock_ready";
SET debug_sync = "now SIGNAL unlock_continue";
connection node_1;
connection node_1_ctrl;
SET debug_sync = "now SIGNAL lock_continue";
connection node_1_a;
UNLOCK TABLES;
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
connection node_1_ctrl;
SET debug_sync = "RESET";
#
# Case 2: BACKUP STAGE
#
connection node_1;
SET SESSION wsrep_sync_wait=0;
BACKUP STAGE START;
BACKUP STAGE BLOCK_DDL;
SHOW STATUS LIKE 'wsrep_local_state_comment';
Variable_name Value
wsrep_local_state_comment Donor/Desynced
SET SESSION debug_sync = "wsrep_backup_stage_after_resume_and_resync SIGNAL resume_and_resync_ready WAIT_FOR resume_and_resync_continue";
BACKUP STAGE END;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR resume_and_resync_ready";
connection node_1_a;
BACKUP STAGE START;
SET SESSION debug_sync = "wsrep_backup_stage_after_desync_and_pause SIGNAL desync_and_pause_ready WAIT_FOR desync_and_pause_continue";
BACKUP STAGE BLOCK_DDL;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR desync_and_pause_ready";
SET debug_sync = "now SIGNAL resume_and_resync_continue";
connection node_1;
connection node_1_ctrl;
SET debug_sync = "now SIGNAL desync_and_pause_continue";
connection node_1_a;
BACKUP STAGE END;
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
connection node_1_ctrl;
SET debug_sync = "RESET";
#
# Case 3: FTWRL first, BACKUP STAGE second
#
connection node_1;
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_sync_wait=0;
FLUSH TABLES WITH READ LOCK;
SHOW STATUS LIKE 'wsrep_local_state_comment';
Variable_name Value
wsrep_local_state_comment Donor/Desynced
SET SESSION debug_sync = "wsrep_unlock_global_read_lock_after_resume_and_resync SIGNAL unlock_ready WAIT_FOR unlock_continue";
UNLOCK TABLES;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR unlock_ready";
connection node_1_a;
BACKUP STAGE START;
SET SESSION debug_sync = "wsrep_backup_stage_after_desync_and_pause SIGNAL desync_and_pause_ready WAIT_FOR desync_and_pause_continue";
BACKUP STAGE BLOCK_DDL;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR desync_and_pause_ready";
SET debug_sync = "now SIGNAL unlock_continue";
connection node_1;
connection node_1_ctrl;
SET debug_sync = "now SIGNAL desync_and_pause_continue";
connection node_1_a;
BACKUP STAGE END;
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
connection node_1_ctrl;
SET debug_sync = "RESET";
#
# Case 4: BACKUP STAGE first, then FTWRL
#
connection node_1;
SET SESSION wsrep_sync_wait=0;
BACKUP STAGE START;
BACKUP STAGE BLOCK_DDL;
SHOW STATUS LIKE 'wsrep_local_state_comment';
Variable_name Value
wsrep_local_state_comment Donor/Desynced
SET SESSION debug_sync = "wsrep_backup_stage_after_resume_and_resync SIGNAL resume_and_resync_ready WAIT_FOR resume_and_resync_continue";
BACKUP STAGE END;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR resume_and_resync_ready";
connection node_1_a;
SET SESSION debug_sync = "wsrep_global_read_lock_block_commit_after_pause SIGNAL lock_ready WAIT_FOR lock_continue";
FLUSH TABLES WITH READ LOCK;
connection node_1_ctrl;
SET debug_sync = "now WAIT_FOR lock_ready";
SET debug_sync = "now SIGNAL resume_and_resync_continue";
connection node_1;
connection node_1_ctrl;
SET debug_sync = "now SIGNAL lock_continue";
connection node_1_a;
UNLOCK TABLES;
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
connection node_1_ctrl;
SET debug_sync = "RESET";
#
# MDEV-32282
#
# A node remains in paused state after two interleaving FTWRLs,
# and the following CREATE TABLE fails with
#
# ER_UNKNOWN_COM_ERROR (1047): Aborting TOI: Replication paused on
# node for FTWRL/BACKUP STAGE.
#
# node_1                                   node_1_a
# ----------------------------------------------------------------------
# FTWRL
# UNLOCK TABLES wait after resume_and_resync()
#                                          FTWRL wait after desync_and_pause()
# continue
#                                          continue
#                                          UNLOCK TABLES
#                                          CREATE TABLE fails
#
# Required include files (judging by their names: Galera cluster setup,
# InnoDB availability, debug_sync availability -- confirm against the
# include files themselves).
--source include/galera_cluster.inc
--source include/have_innodb.inc
--source include/have_debug_sync.inc
# Connection to control sync points: a separate session on node_1 that is
# only used to WAIT_FOR and SIGNAL debug_sync signals, so that the two
# worker connections (node_1 and node_1_a) can be interleaved in a fixed order.
--connect node_1_ctrl, 127.0.0.1, root, , test, $NODE_MYPORT_1
SET SESSION wsrep_sync_wait=0;
--echo #
--echo # Case 1: FTWRL
--echo #
# Interleave two FTWRL/UNLOCK TABLES pairs issued from two connections to
# node_1 (node_1 and node_1_a), using debug_sync to force the ordering.
--connection node_1
SET SESSION wsrep_sync_wait=0;
FLUSH TABLES WITH READ LOCK;
# The recorded result expects Donor/Desynced here.
SHOW STATUS LIKE 'wsrep_local_state_comment';
# Make UNLOCK TABLES stop at the sync point placed after resume_and_resync()
# until node_1_ctrl signals unlock_continue.
SET SESSION debug_sync = "wsrep_unlock_global_read_lock_after_resume_and_resync SIGNAL unlock_ready WAIT_FOR unlock_continue";
--send UNLOCK TABLES
--connection node_1_ctrl
# Block until the first UNLOCK TABLES has reached its sync point.
SET debug_sync = "now WAIT_FOR unlock_ready";
--connect node_1_a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1_a
# Make the second FTWRL stop at the sync point placed after the pause
# until node_1_ctrl signals lock_continue.
SET SESSION debug_sync = "wsrep_global_read_lock_block_commit_after_pause SIGNAL lock_ready WAIT_FOR lock_continue";
--send FLUSH TABLES WITH READ LOCK
--connection node_1_ctrl
# Once the second FTWRL is parked, let the first UNLOCK TABLES run to the end.
SET debug_sync = "now WAIT_FOR lock_ready";
SET debug_sync = "now SIGNAL unlock_continue";
--connection node_1
--reap
--connection node_1_ctrl
# Now release the second FTWRL.
SET debug_sync = "now SIGNAL lock_continue";
--connection node_1_a
--reap
UNLOCK TABLES;
# Wait until the node reports Synced before issuing DDL.
--let $wait_condition = SELECT VARIABLE_VALUE = "Synced" FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = "wsrep_local_state_comment"
--source include/wait_condition.inc
# Regression check for MDEV-32282: this CREATE TABLE is expected to succeed
# (no --error is declared); a node left paused would abort the TOI.
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
--connection node_1_ctrl
# Clear debug_sync state before the next case.
SET debug_sync = "RESET";
--echo #
--echo # Case 2: BACKUP STAGE
--echo #
# Although BACKUP STAGE was not involved in MDEV-32282, add a testcase
# as it uses a similar mechanism to pause and desync the node.
#
# Same interleaving as Case 1, with BACKUP STAGE BLOCK_DDL / BACKUP STAGE END
# taking the place of FTWRL / UNLOCK TABLES on both connections.
--connection node_1
SET SESSION wsrep_sync_wait=0;
BACKUP STAGE START;
BACKUP STAGE BLOCK_DDL;
# The recorded result expects Donor/Desynced here.
SHOW STATUS LIKE 'wsrep_local_state_comment';
# Make BACKUP STAGE END stop at the sync point placed after
# resume_and_resync() until resume_and_resync_continue is signalled.
SET SESSION debug_sync = "wsrep_backup_stage_after_resume_and_resync SIGNAL resume_and_resync_ready WAIT_FOR resume_and_resync_continue";
--send BACKUP STAGE END
--connection node_1_ctrl
# Block until the first BACKUP STAGE END has reached its sync point.
SET debug_sync = "now WAIT_FOR resume_and_resync_ready";
--connection node_1_a
BACKUP STAGE START;
# Make the second BLOCK_DDL stop at the sync point placed after
# desync_and_pause() until desync_and_pause_continue is signalled.
SET SESSION debug_sync = "wsrep_backup_stage_after_desync_and_pause SIGNAL desync_and_pause_ready WAIT_FOR desync_and_pause_continue";
--send BACKUP STAGE BLOCK_DDL
--connection node_1_ctrl
# Once the second BLOCK_DDL is parked, let the first BACKUP STAGE END finish.
SET debug_sync = "now WAIT_FOR desync_and_pause_ready";
SET debug_sync = "now SIGNAL resume_and_resync_continue";
--connection node_1
--reap
--connection node_1_ctrl
# Now release the second BLOCK_DDL.
SET debug_sync = "now SIGNAL desync_and_pause_continue";
--connection node_1_a
--reap
BACKUP STAGE END;
# Wait until the node reports Synced before issuing DDL.
--let $wait_condition = SELECT VARIABLE_VALUE = "Synced" FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = "wsrep_local_state_comment"
--source include/wait_condition.inc
# DDL is expected to succeed (no --error is declared).
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
--connection node_1_ctrl
# Clear debug_sync state before the next case.
SET debug_sync = "RESET";
--echo #
--echo # Case 3: FTWRL first, BACKUP STAGE second
--echo #
# Mixed interleaving: node_1 runs FTWRL / UNLOCK TABLES while node_1_a runs
# BACKUP STAGE BLOCK_DDL / BACKUP STAGE END.
--connection node_1
# NOTE(review): the next statement appears twice. The recorded result file
# contains both lines, so dropping one requires re-recording the result.
SET SESSION wsrep_sync_wait=0;
SET SESSION wsrep_sync_wait=0;
FLUSH TABLES WITH READ LOCK;
# The recorded result expects Donor/Desynced here.
SHOW STATUS LIKE 'wsrep_local_state_comment';
# Make UNLOCK TABLES stop at the sync point placed after resume_and_resync()
# until unlock_continue is signalled.
SET SESSION debug_sync = "wsrep_unlock_global_read_lock_after_resume_and_resync SIGNAL unlock_ready WAIT_FOR unlock_continue";
--send UNLOCK TABLES
--connection node_1_ctrl
# Block until UNLOCK TABLES has reached its sync point.
SET debug_sync = "now WAIT_FOR unlock_ready";
--connection node_1_a
BACKUP STAGE START;
# Make BLOCK_DDL stop at the sync point placed after desync_and_pause()
# until desync_and_pause_continue is signalled.
SET SESSION debug_sync = "wsrep_backup_stage_after_desync_and_pause SIGNAL desync_and_pause_ready WAIT_FOR desync_and_pause_continue";
--send BACKUP STAGE BLOCK_DDL
--connection node_1_ctrl
# Once BLOCK_DDL is parked, let UNLOCK TABLES finish.
SET debug_sync = "now WAIT_FOR desync_and_pause_ready";
SET debug_sync = "now SIGNAL unlock_continue";
--connection node_1
--reap
--connection node_1_ctrl
# Now release BLOCK_DDL.
SET debug_sync = "now SIGNAL desync_and_pause_continue";
--connection node_1_a
--reap
BACKUP STAGE END;
# Wait until the node reports Synced before issuing DDL.
--let $wait_condition = SELECT VARIABLE_VALUE = "Synced" FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = "wsrep_local_state_comment"
--source include/wait_condition.inc
# DDL is expected to succeed (no --error is declared).
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
--connection node_1_ctrl
# Clear debug_sync state before the next case.
SET debug_sync = "RESET";
--echo #
--echo # Case 4: BACKUP STAGE first, then FTWRL
--echo #
# Mixed interleaving, reversed: node_1 runs BACKUP STAGE BLOCK_DDL /
# BACKUP STAGE END while node_1_a runs FTWRL / UNLOCK TABLES.
--connection node_1
SET SESSION wsrep_sync_wait=0;
BACKUP STAGE START;
BACKUP STAGE BLOCK_DDL;
# The recorded result expects Donor/Desynced here.
SHOW STATUS LIKE 'wsrep_local_state_comment';
# Make BACKUP STAGE END stop at the sync point placed after
# resume_and_resync() until resume_and_resync_continue is signalled.
SET SESSION debug_sync = "wsrep_backup_stage_after_resume_and_resync SIGNAL resume_and_resync_ready WAIT_FOR resume_and_resync_continue";
--send BACKUP STAGE END
--connection node_1_ctrl
# Block until BACKUP STAGE END has reached its sync point.
SET debug_sync = "now WAIT_FOR resume_and_resync_ready";
--connection node_1_a
# Make FTWRL stop at the sync point placed after the pause until
# lock_continue is signalled.
SET SESSION debug_sync = "wsrep_global_read_lock_block_commit_after_pause SIGNAL lock_ready WAIT_FOR lock_continue";
--send FLUSH TABLES WITH READ LOCK
--connection node_1_ctrl
# Once FTWRL is parked, let BACKUP STAGE END finish.
SET debug_sync = "now WAIT_FOR lock_ready";
SET debug_sync = "now SIGNAL resume_and_resync_continue";
--connection node_1
--reap
--connection node_1_ctrl
# Now release FTWRL.
SET debug_sync = "now SIGNAL lock_continue";
--connection node_1_a
--reap
UNLOCK TABLES;
# Wait until the node reports Synced before issuing DDL.
--let $wait_condition = SELECT VARIABLE_VALUE = "Synced" FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = "wsrep_local_state_comment"
--source include/wait_condition.inc
# DDL is expected to succeed (no --error is declared).
CREATE TABLE t1 (f1 INT PRIMARY KEY) ENGINE=InnoDB;
DROP TABLE t1;
--connection node_1_ctrl
# Clear debug_sync state at the end of the test.
SET debug_sync = "RESET";
......@@ -268,6 +268,7 @@ static bool backup_block_ddl(THD *thd)
if (server_state.desync_and_pause().is_undefined()) {
DBUG_RETURN(1);
}
DEBUG_SYNC(thd, "wsrep_backup_stage_after_desync_and_pause");
thd->wsrep_desynced_backup_stage= true;
}
#endif /* WITH_WSREP */
......@@ -341,6 +342,7 @@ bool backup_end(THD *thd)
Wsrep_server_state &server_state= Wsrep_server_state::instance();
server_state.resume_and_resync();
thd->wsrep_desynced_backup_stage= false;
DEBUG_SYNC(thd, "wsrep_backup_stage_after_resume_and_resync");
}
#endif /* WITH_WSREP */
}
......
......@@ -1106,7 +1106,7 @@ void Global_read_lock::unlock_global_read_lock(THD *thd)
#ifdef WITH_WSREP
if (m_state == GRL_ACQUIRED_AND_BLOCKS_COMMIT &&
wsrep_locked_seqno != WSREP_SEQNO_UNDEFINED)
thd->wsrep_desynced_backup_stage)
{
Wsrep_server_state& server_state= Wsrep_server_state::instance();
if (server_state.state() == Wsrep_server_state::s_donor ||
......@@ -1120,8 +1120,10 @@ void Global_read_lock::unlock_global_read_lock(THD *thd)
server_state.state() == Wsrep_server_state::s_synced)
{
server_state.resume_and_resync();
DEBUG_SYNC(thd, "wsrep_unlock_global_read_lock_after_resume_and_resync");
wsrep_locked_seqno= WSREP_SEQNO_UNDEFINED;
}
thd->wsrep_desynced_backup_stage= false;
}
#endif /* WITH_WSREP */
......@@ -1178,11 +1180,13 @@ bool Global_read_lock::make_global_read_lock_block_commit(THD *thd)
server_state.state() != Wsrep_server_state::s_synced))
{
paused_seqno= server_state.pause();
thd->wsrep_desynced_backup_stage= true;
}
else if (WSREP_NNULL(thd) &&
server_state.state() == Wsrep_server_state::s_synced)
{
paused_seqno= server_state.desync_and_pause();
thd->wsrep_desynced_backup_stage= true;
}
else
{
......@@ -1193,6 +1197,7 @@ bool Global_read_lock::make_global_read_lock_block_commit(THD *thd)
{
wsrep_locked_seqno= paused_seqno.get();
}
DEBUG_SYNC(thd, "wsrep_global_read_lock_block_commit_after_pause");
#endif /* WITH_WSREP */
DBUG_RETURN(FALSE);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment