Commit b89de2b2 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-8354: out-of-order error with --gtid-ignore-duplicates and row-based replication

The --gtid-ignore-duplicates option was not working correctly with row-based
replication. When a row event was completed, but before committing, there
was a small window where another multi-source SQL thread could wrongly try
to re-execute the same transaction, without properly ignoring the duplicate
GTID. This would lead to a duplicate key error, an out-of-order GTID error,
or similar.

Thanks to Matt Neth for reporting this and giving an easy way to reproduce
the issue.
parent 93c039dd
......@@ -3,21 +3,25 @@
[mysqld.1]
log-slave-updates
loose-innodb
binlog-format=mixed
[mysqld.2]
log-slave-updates
loose-innodb
binlog-format=mixed
[mysqld.3]
log-bin=server3-bin
log-slave-updates
loose-innodb
binlog-format=mixed
[mysqld.4]
server-id=4
log-bin=server4-bin
log-slave-updates
loose-innodb
binlog-format=mixed
[ENV]
SERVER_MYPORT_4= @mysqld.4.port
......
......@@ -242,6 +242,145 @@ a
24
25
26
*** MDEV-8354: out-of-order error with --gtid-ignore-duplicates and row-based replication ***
SET default_master_connection = "b2a";
STOP SLAVE;
include/wait_for_slave_to_stop.inc
SET default_master_connection = "c2a";
STOP SLAVE;
include/wait_for_slave_to_stop.inc
SET default_master_connection = "c2b";
STOP SLAVE;
include/wait_for_slave_to_stop.inc
SET default_master_connection = "b2c";
STOP SLAVE;
include/wait_for_slave_to_stop.inc
SET @old_slave_mode=@@GLOBAL.slave_exec_mode;
SET GLOBAL slave_exec_mode=IDEMPOTENT;
SET @old_strict=@@GLOBAL.gtid_strict_mode;
SET GLOBAL gtid_strict_mode=1;
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,inject_sleep_gtid_100_x_x";
SET @old_domain=@@SESSION.gtid_domain_id;
SET @old_format=@@SESSION.binlog_format;
SET SESSION gtid_domain_id=100;
SET SESSION binlog_format='row';
INSERT INTO t1 VALUES (30);
INSERT INTO t1 VALUES (31);
INSERT INTO t1 VALUES (32);
INSERT INTO t1 VALUES (33);
INSERT INTO t1 VALUES (34);
INSERT INTO t1 VALUES (35);
INSERT INTO t1 VALUES (36);
INSERT INTO t1 VALUES (37);
INSERT INTO t1 VALUES (38);
INSERT INTO t1 VALUES (39);
INSERT INTO t1 VALUES (40);
INSERT INTO t1 VALUES (41);
INSERT INTO t1 VALUES (42);
INSERT INTO t1 VALUES (43);
INSERT INTO t1 VALUES (44);
INSERT INTO t1 VALUES (45);
INSERT INTO t1 VALUES (46);
INSERT INTO t1 VALUES (47);
INSERT INTO t1 VALUES (48);
INSERT INTO t1 VALUES (49);
SET SESSION gtid_domain_id=@old_domain;
SET SESSION binlog_format=@old_format;
include/save_master_gtid.inc
include/sync_with_master_gtid.inc
INSERT INTO t1 VALUES (50);
include/save_master_gtid.inc
SET default_master_connection = "b2c";
START SLAVE;
include/wait_for_slave_to_start.inc
SELECT MASTER_GTID_WAIT("GTID", 30);
MASTER_GTID_WAIT("GTID", 30)
0
SET default_master_connection = "b2a";
START SLAVE;
include/wait_for_slave_to_start.inc
SET default_master_connection = "c2a";
START SLAVE;
include/wait_for_slave_to_start.inc
include/sync_with_master_gtid.inc
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
a
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
SET default_master_connection = "c2b";
START SLAVE;
include/wait_for_slave_to_start.inc
include/sync_with_master_gtid.inc
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
a
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug=@old_dbug;
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
a
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
SET GLOBAL slave_exec_mode=@old_slave_mode;
SET GLOBAL gtid_strict_mode=@old_strict;
SET GLOBAL gtid_domain_id=0;
STOP ALL SLAVES;
Warnings:
......
--source include/not_embedded.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--echo *** Test all-to-all replication with --gtid-ignore-duplicates ***
......@@ -258,6 +260,112 @@ SELECT * FROM t1 WHERE a >= 20 ORDER BY a;
SELECT * FROM t1 WHERE a >= 20 ORDER BY a;
--echo *** MDEV-8354: out-of-order error with --gtid-ignore-duplicates and row-based replication ***
# Have only A->C and A->B replication running initially.
--connection server_1
SET default_master_connection = "b2a";
STOP SLAVE;
--source include/wait_for_slave_to_stop.inc
SET default_master_connection = "c2a";
STOP SLAVE;
--source include/wait_for_slave_to_stop.inc
--connection server_2
SET default_master_connection = "c2b";
STOP SLAVE;
--source include/wait_for_slave_to_stop.inc
--connection server_3
SET default_master_connection = "b2c";
STOP SLAVE;
--source include/wait_for_slave_to_stop.inc
SET @old_slave_mode=@@GLOBAL.slave_exec_mode;
SET GLOBAL slave_exec_mode=IDEMPOTENT;
SET @old_strict=@@GLOBAL.gtid_strict_mode;
SET GLOBAL gtid_strict_mode=1;
SET @old_dbug=@@GLOBAL.debug_dbug;
# This will inject a small sleep that helps trigger the race. I did not manage
# to create a non-sleeping version with debug_sync for this; the problem is
# that once the bug is fixed, the race becomes impossible, so even with
# debug_sync, at best we can check that the debug_sync times out, which is
# just another way of adding a sleep.
#
# The bug was a race at this point where another multi-source connection
# could incorrectly re-apply the same GTID, in case of row-based replication.
SET GLOBAL debug_dbug="+d,inject_sleep_gtid_100_x_x";
--connection server_1
SET @old_domain=@@SESSION.gtid_domain_id;
SET @old_format=@@SESSION.binlog_format;
SET SESSION gtid_domain_id=100;
SET SESSION binlog_format='row';
INSERT INTO t1 VALUES (30);
INSERT INTO t1 VALUES (31);
INSERT INTO t1 VALUES (32);
INSERT INTO t1 VALUES (33);
INSERT INTO t1 VALUES (34);
INSERT INTO t1 VALUES (35);
INSERT INTO t1 VALUES (36);
INSERT INTO t1 VALUES (37);
INSERT INTO t1 VALUES (38);
INSERT INTO t1 VALUES (39);
INSERT INTO t1 VALUES (40);
INSERT INTO t1 VALUES (41);
INSERT INTO t1 VALUES (42);
INSERT INTO t1 VALUES (43);
INSERT INTO t1 VALUES (44);
INSERT INTO t1 VALUES (45);
INSERT INTO t1 VALUES (46);
INSERT INTO t1 VALUES (47);
INSERT INTO t1 VALUES (48);
INSERT INTO t1 VALUES (49);
SET SESSION gtid_domain_id=@old_domain;
SET SESSION binlog_format=@old_format;
--source include/save_master_gtid.inc
--connection server_2
--source include/sync_with_master_gtid.inc
INSERT INTO t1 VALUES (50);
--let $gtid=`SELECT @@last_gtid`
--source include/save_master_gtid.inc
--connection server_3
SET default_master_connection = "b2c";
START SLAVE;
--source include/wait_for_slave_to_start.inc
--replace_result $gtid GTID
eval SELECT MASTER_GTID_WAIT("$gtid", 30);
# The bug occurred here: the slave would get an out-of-order binlog error
# due to trying to re-apply the 100-x-x transaction.
# Restart stopped multi-source connections, and sync up.
--connection server_1
SET default_master_connection = "b2a";
START SLAVE;
--source include/wait_for_slave_to_start.inc
SET default_master_connection = "c2a";
START SLAVE;
--source include/wait_for_slave_to_start.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
--connection server_2
SET default_master_connection = "c2b";
START SLAVE;
--source include/wait_for_slave_to_start.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
--connection server_3
--source include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug=@old_dbug;
SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
SET GLOBAL slave_exec_mode=@old_slave_mode;
SET GLOBAL gtid_strict_mode=@old_strict;
# Clean up.
--connection server_1
SET GLOBAL gtid_domain_id=0;
......
......@@ -1788,6 +1788,13 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
rli->clear_flag(Relay_log_info::IN_STMT);
rli->clear_flag(Relay_log_info::IN_TRANSACTION);
}
/*
Ensure we always release the domain for others to process, when using
--gtid-ignore-duplicates.
*/
if (gtid_ignore_duplicate_state != GTID_DUPLICATE_NULL)
rpl_global_gtid_slave_state.release_domain_owner(this);
}
/*
......@@ -1796,13 +1803,6 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
thd->variables.option_bits&= ~OPTION_NO_FOREIGN_KEY_CHECKS;
thd->variables.option_bits&= ~OPTION_RELAXED_UNIQUE_CHECKS;
/*
Ensure we always release the domain for others to process, when using
--gtid-ignore-duplicates.
*/
if (gtid_ignore_duplicate_state != GTID_DUPLICATE_NULL)
rpl_global_gtid_slave_state.release_domain_owner(this);
/*
Reset state related to long_find_row notes in the error log:
- timestamp
......@@ -1811,6 +1811,11 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
reset_row_stmt_start_timestamp();
unset_long_find_row_note_printed();
DBUG_EXECUTE_IF("inject_sleep_gtid_100_x_x", {
if (current_gtid.domain_id == 100)
my_sleep(50000);
};);
DBUG_VOID_RETURN;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment