Commit 4111a530 authored by seppo's avatar seppo Committed by Jan Lindström

MDEV-21096 async slave crash with gtid_log_pos table access (#1413)

The original crash happened when async replication IO thread was updating mysql.gtid_slave_pos table. Operations on this table should remain node local, but it appears that protection (THD::wsrep_ignore_table flag) to prevent wsrep replication for this table mas missing for innodb write_row() and update_row().
It was somewhat difficult to reproduce the issue, because mtr seems to create the affected table mysql.gtid_log_pos as of Aria engine type, and Aria engine operations will not be replicated anyhow. It looks, though, that in release installation, mysql.gtid_slave_pos table is of InnoDB engine.
It was possible to trigger somewhat related problem by running test galera.galera_as_slave_gtid with configuration: gtid_pos_auto_engines=InnoDB. However, this test mode, causes earlier crash when replication background thread creates aditional table: mysql.gtid_slave_pos_InnoDB, and this table create triggered wsrep TOI replication, which also failed for assertion. Actually, async replication IO and background threads should not replicate anything to cluster.

This pull request contains new test galera.galera_as_slave_gtid_auto_engine, which basically just runs galera.galera_as_slave_gtid with configuration of gtid_pos_auto_engines=InnoDB.
Test galera.galera_as_slave_gtid is also modified for better code reuse.
Actual fix for MDEV-21096 is in storage/innobase/handler/ha_innodb.cc, where THD::wsrep_ignore_table flag is now honored before wsrep key population.
There is additional fix in sql/service_wsrep.cc where async replication IO and background threads are marked as non-local. This fences these threads out of wsrep replication altogether. Note that this change, actually makes the use of THD::wsrep_ignore-table redundant. We may want to refactor THD::wsrep_ignore_table out in the future, if there is no other use case for it in sight.
parent f9528821
connection node_2;
connection node_1;
connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3;
connection node_2;
START SLAVE;
connection node_3;
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES(1);
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
LENGTH(@@global.gtid_binlog_state) > 1
1
connection node_2;
gtid_binlog_state_equal
1
connection node_1;
SELECT COUNT(*) = 1 FROM t1;
COUNT(*) = 1
1
gtid_binlog_state_equal
1
connection node_3;
DROP TABLE t1;
connection node_1;
connection node_2;
STOP SLAVE;
RESET SLAVE ALL;
#cleanup
connection node_1;
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
connection node_2;
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
connection node_3;
reset master;
connection node_2;
DROP TABLE mysql.gtid_slave_pos_InnoDB;
CALL mtr.add_suppression("The automatically created table");
#
# Test Galera as a slave to a MariaDB master using GTIDs
#
# suite/galera/galera_2nodes_as_slave.cnf describes the setup of the nodes
# suite/galera/t/galera_as_slave_gtid.cnf has the GTID options
#
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
#
--source include/have_innodb.inc
--source include/galera_cluster.inc
# As node #3 is not a Galera node, and galera_cluster.inc does not open connetion to it
# we open the node_3 connection here
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3
--connection node_2
--disable_query_log
--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3;
--enable_query_log
START SLAVE;
--connection node_3
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES(1);
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
--let $gtid_binlog_state_node1 = `SELECT @@global.gtid_binlog_state;`
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*) = 1 FROM t1;
--source include/wait_condition.inc
--disable_query_log
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
--enable_query_log
--connection node_1
SELECT COUNT(*) = 1 FROM t1;
--disable_query_log
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
--enable_query_log
--connection node_3
DROP TABLE t1;
#
# Unfortunately without the sleep below the following statement fails with "query returned no rows", which
# is difficult to understand given that it is an aggregate query. A "query execution was interrupted"
# warning is also reported by MTR, which is also weird.
#
--sleep 1
--connection node_1
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
STOP SLAVE;
RESET SLAVE ALL;
--echo #cleanup
--connection node_1
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
--connection node_2
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
--connection node_3
reset master;
......@@ -7,80 +7,4 @@
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
#
--source include/have_innodb.inc
--source include/galera_cluster.inc
# As node #3 is not a Galera node, and galera_cluster.inc does not open connetion to it
# we open the node_3 connection here
--connect node_3, 127.0.0.1, root, , test, $NODE_MYPORT_3
--connection node_2
--disable_query_log
--eval CHANGE MASTER TO MASTER_HOST='127.0.0.1', MASTER_USER='root', MASTER_PORT=$NODE_MYPORT_3;
--enable_query_log
START SLAVE;
--connection node_3
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY) ENGINE=InnoDB;
INSERT INTO t1 VALUES(1);
SELECT LENGTH(@@global.gtid_binlog_state) > 1;
--let $gtid_binlog_state_node1 = `SELECT @@global.gtid_binlog_state;`
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
--let $wait_condition = SELECT COUNT(*) = 1 FROM t1;
--source include/wait_condition.inc
--disable_query_log
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
--enable_query_log
--connection node_1
SELECT COUNT(*) = 1 FROM t1;
--disable_query_log
--eval SELECT '$gtid_binlog_state_node1' = @@global.gtid_binlog_state AS gtid_binlog_state_equal;
#--eval SELECT GTID_SUBSET('$gtid_executed_node1', @@global.gtid_executed) AS gtid_executed_equal;
--enable_query_log
--connection node_3
DROP TABLE t1;
#
# Unfortunately without the sleep below the following statement fails with "query returned no rows", which
# is difficult to understand given that it is an aggregate query. A "query execution was interrupted"
# warning is also reported by MTR, which is also weird.
#
--sleep 1
--connection node_1
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
--connection node_2
--let $wait_condition = SELECT COUNT(*) = 0 FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_NAME = 't1';
--source include/wait_condition.inc
STOP SLAVE;
RESET SLAVE ALL;
--echo #cleanup
--connection node_1
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
--connection node_2
set global wsrep_on=OFF;
reset master;
set global wsrep_on=ON;
--connection node_3
reset master;
--source galera_as_slave_gtid.inc
!include ../galera_2nodes_as_slave.cnf
[mysqld]
log-bin=mysqld-bin
log-slave-updates
binlog-format=ROW
gtid_pos_auto_engines=InnoDB
\ No newline at end of file
#
# Test Galera as a slave to a MariaDB master using GTIDs
#
# suite/galera/galera_2nodes_as_slave.cnf describes the setup of the nodes
# suite/galera/t/galera_as_slave_gtid.cnf has the GTID options
#
# In addition to performing DDL and DML, we check that the gtid of the master is preserved inside the cluster
#
--source galera_as_slave_gtid.inc
--connection node_2
DROP TABLE mysql.gtid_slave_pos_InnoDB;
CALL mtr.add_suppression("The automatically created table");
\ No newline at end of file
......@@ -112,7 +112,17 @@ extern "C" my_bool wsrep_get_debug()
extern "C" my_bool wsrep_thd_is_local(const THD *thd)
{
return thd->wsrep_cs().mode() == wsrep::client_state::m_local;
/*
async replication IO and background threads have nothing to replicate in the cluster,
marking them as non-local here to prevent write set population and replication
async replication SQL thread, applies client transactions from mariadb master
and will be replicated into cluster
*/
return (
thd->system_thread != SYSTEM_THREAD_SLAVE_BACKGROUND &&
thd->system_thread != SYSTEM_THREAD_SLAVE_IO &&
thd->wsrep_cs().mode() == wsrep::client_state::m_local);
}
extern "C" my_bool wsrep_thd_is_applying(const THD *thd)
......
......@@ -8199,6 +8199,7 @@ ha_innobase::write_row(
if (!error_result
&& wsrep_on(m_user_thd)
&& wsrep_thd_is_local(m_user_thd)
&& !wsrep_thd_ignore_table(m_user_thd)
&& !wsrep_consistency_check(m_user_thd)
&& (thd_sql_command(m_user_thd) != SQLCOM_CREATE_TABLE)
&& (thd_sql_command(m_user_thd) != SQLCOM_LOAD ||
......@@ -8909,7 +8910,8 @@ ha_innobase::update_row(
#ifdef WITH_WSREP
if (error == DB_SUCCESS
&& wsrep_on(m_user_thd)
&& wsrep_thd_is_local(m_user_thd)) {
&& wsrep_thd_is_local(m_user_thd)
&& !wsrep_thd_ignore_table(m_user_thd)) {
DBUG_PRINT("wsrep", ("update row key"));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment