MDEV-6903: gtid_slave_pos is incorrect after master crash

When a master server restarts, it logs a special restart format description event in its binlog. When the slave sees this event, it knows it needs to roll back any active partial transaction, in case the master crashed previously in the middle of writing such a transaction to its binlog. However, there was a bug where this rollback did not reset rgi->gtid_pending. This caused the @@gtid_slave_pos to be updated incorrectly with the GTID of the partial transaction that was rolled back. Fix this by always clearing rgi->gtid_pending in cleanup_context(), hopefully preventing similar bugs from turning up in other special cases where a transaction is rolled back during replication. Thanks to Pavel Ivanov for tracking down the issue and providing a test case.

MDEV-6903: gtid_slave_pos is incorrect after master crash
When a master server restarts, it logs a special restart format description event in its binlog. When the slave sees this event, it knows it needs to roll back any active partial transaction, in case the master crashed previously in the middle of writing such a transaction to its binlog. However, there was a bug where this rollback did not reset rgi->gtid_pending. This caused the @@gtid_slave_pos to be updated incorrectly with the GTID of the partial transaction that was rolled back. Fix this by always clearing rgi->gtid_pending in cleanup_context(), hopefully preventing similar bugs from turning up in other special cases where a transaction is rolled back during replication. Thanks to Pavel Ivanov for tracking down the issue and providing a test case.
b7968590 · Kristian Nielsen · f3bdf9d7 · b7968590 · b7968590 · b7968590
Commit b7968590 authored Nov 25, 2014 by Kristian Nielsen
3 changed files
--- a/mysql-test/suite/rpl/r/rpl_gtid_crash.result
+++ b/mysql-test/suite/rpl/r/rpl_gtid_crash.result
@@ -133,9 +133,17 @@ SELECT @@GLOBAL.server_id;
 3
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
+gtid_check
+Binlog pos ok
 # Wait 30 seconds for SQL thread to catch up with IO thread
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
 # Repeat this with additional transactions on the master
 SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
 BEGIN;
@@ -175,11 +183,21 @@ SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
 13
 14
+gtid_check
+Binlog pos ok
+gtid_check
+Current pos ok
 # Wait 30 seconds for SQL thread to catch up with IO thread
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 a
 13
 14
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
 # Repeat this with additional transactions on the master
 SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
 BEGIN;
@@ -205,5 +223,48 @@ a
 14
 23
 24
+# Repeat this with slave restart
+SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
+BEGIN;
+INSERT INTO t1 VALUES (25);
+COMMIT;
+ERROR HY000: Error writing file 'master-bin' (errno: 28 "No space left on device")
+SET GLOBAL debug_dbug="+d,crash_dispatch_command_before";
+COMMIT;
+Got one of the listed errors
+# Wait 30 seconds for IO thread to connect and SQL thread to catch up
+# with IO thread.
+include/stop_slave.inc
+gtid_check
+Binlog pos ok
+gtid_check
+Current pos ok
+INSERT INTO t1 VALUES (26);
+INSERT INTO t1 VALUES (27);
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+a
+13
+14
+23
+24
+26
+27
+include/save_master_gtid.inc
+gtid_check
+Binlog pos ok
+gtid_check
+Slave pos ok
+gtid_check
+Current pos ok
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+a
+13
+14
+23
+24
+26
+27
 DROP TABLE t1;
 include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_gtid_crash.test
+++ b/mysql-test/suite/rpl/t/rpl_gtid_crash.test
@@ -269,6 +269,7 @@ SET GLOBAL debug_dbug="+d,crash_before_writing_xid";

 --connection server_1
 INSERT INTO t1 VALUES (9), (10);
+--let $saved_gtid=`SELECT @@last_gtid`
 --save_master_pos

 --connection server_2
@@ -333,6 +334,9 @@ EOF

 SELECT @@GLOBAL.server_id;
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log

 --echo # Wait 30 seconds for SQL thread to catch up with IO thread
 --connection server_2
@@ -357,6 +361,11 @@ if ($read_log_pos != $exec_log_pos)
 }

 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log

 --echo # Repeat this with additional transactions on the master

@@ -387,6 +396,7 @@ EOF
 SELECT @@GLOBAL.server_id;
 INSERT INTO t1 VALUES (13);
 INSERT INTO t1 VALUES (14);
+--let $saved_gtid=`SELECT @@last_gtid`
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 --source include/save_master_gtid.inc

@@ -420,6 +430,10 @@ EOF

 SELECT @@GLOBAL.server_id;
 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log

 --echo # Wait 30 seconds for SQL thread to catch up with IO thread
 --connection server_2
@@ -444,6 +458,11 @@ if ($read_log_pos != $exec_log_pos)
 }

 SELECT * from t1 WHERE a > 10 ORDER BY a;
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log

 --echo # Repeat this with additional transactions on the master

@@ -472,6 +491,7 @@ EOF

 INSERT INTO t1 VALUES (23);
 INSERT INTO t1 VALUES (24);
+--let $saved_gtid=`SELECT @@last_gtid`
 SELECT * from t1 WHERE a > 10 ORDER BY a;
 --source include/save_master_gtid.inc

@@ -479,6 +499,86 @@ SELECT * from t1 WHERE a > 10 ORDER BY a;
 --source include/sync_with_master_gtid.inc
 SELECT * from t1 WHERE a > 10 ORDER BY a;

+--echo # Repeat this with slave restart
+
+--connection server_1
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait
+EOF
+
+SET GLOBAL debug_dbug="+d,inject_error_writing_xid";
+BEGIN;
+INSERT INTO t1 VALUES (25);
+--error ER_ERROR_ON_WRITE
+COMMIT;
+SET GLOBAL debug_dbug="+d,crash_dispatch_command_before";
+--error 2006,2013
+COMMIT;
+
+--source include/wait_until_disconnected.inc
+
+--append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart
+EOF
+
+--connection server_1
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+
+--connection server_2
+--echo # Wait 30 seconds for IO thread to connect and SQL thread to catch up
+--echo # with IO thread.
+--let $wait_timeout= 300
+while ($wait_timeout != 0)
+{
+  --let $connected=`SELECT COUNT(*) > 0 FROM information_schema.processlist WHERE State = 'Waiting for master to send event'`
+  if ($connected)
+  {
+    --let $read_log_pos= query_get_value('SHOW SLAVE STATUS', Read_Master_Log_Pos, 1)
+    --let $exec_log_pos= query_get_value('SHOW SLAVE STATUS', Exec_Master_Log_Pos, 1)
+    if ($read_log_pos == $exec_log_pos)
+    {
+      --let $wait_timeout= 0
+    }
+    if ($read_log_pos != $exec_log_pos)
+    {
+      --sleep 0.1
+      --dec $wait_timeout
+    }
+  }
+  if (!$connected)
+  {
+    --sleep 0.1
+    --dec $wait_timeout
+  }
+}
+if (`SELECT NOT $connected OR $read_log_pos != $exec_log_pos`)
+{
+  --die Timeout wait for IO thread to connect and SQL thread to catch up with IO thread
+}
+
+--source include/stop_slave.inc
+
+--connection server_1
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
+INSERT INTO t1 VALUES (26);
+INSERT INTO t1 VALUES (27);
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection server_2
+--disable_query_log
+eval SELECT IF(INSTR(@@gtid_binlog_pos, '$saved_gtid'), "Binlog pos ok", CONCAT("Unexpected binlog pos: ", @@gtid_binlog_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_slave_pos, '$saved_gtid'), "Slave pos ok", CONCAT("Unexpected slave pos: ", @@gtid_slave_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+eval SELECT IF(INSTR(@@gtid_current_pos, '$saved_gtid'), "Current pos ok", CONCAT("Unexpected current pos: ", @@gtid_current_pos, "; does not contain the GTID $saved_gtid.")) AS gtid_check;
+--enable_query_log
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * from t1 WHERE a > 10 ORDER BY a;
+

 --connection server_1
 DROP TABLE t1;

--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
@@ -1717,6 +1717,11 @@ void rpl_group_info::cleanup_context(THD *thd, bool error)
    trans_rollback_stmt(thd); // if a "statement transaction"
    /* trans_rollback() also resets OPTION_GTID_BEGIN */
    trans_rollback(thd);      // if a "real transaction"
+    /*
+      Now that we have rolled back the transaction, make sure we do not
+      erroneously update the GTID position.
+    */
+    gtid_pending= false;
  }
  m_table_map.clear_tables();
  slave_close_thread_tables(thd);