Fix for MDEV-9670 server_id mysteriously set to 0

Problem was that in a circular replication setup the master remembers position to events it has generated itself when reading from a slave. If there are no new events in the queue from the slave, a Gtid_list_log_event is generated to remember the last skipped event. The problem happens if there is a network delay and we generate a Gtid_list_log_event in the middle of the transaction, in which case there will be an implicit comment and a new transaction with serverid=0 will be logged. The fix was to not generate any Gtid_list_log_events in the middle of a transaction.

Fix for MDEV-9670 server_id mysteriously set to 0
Problem was that in a circular replication setup the master remembers position to events it has generated itself when reading from a slave. If there are no new events in the queue from the slave, a Gtid_list_log_event is generated to remember the last skipped event. The problem happens if there is a network delay and we generate a Gtid_list_log_event in the middle of the transaction, in which case there will be an implicit comment and a new transaction with serverid=0 will be logged. The fix was to not generate any Gtid_list_log_events in the middle of a transaction.
946a07e8 · Andrei Elkin · Monty · 46d6f74c · 946a07e8 · 946a07e8
Commit 946a07e8 authored Jul 02, 2017 by Andrei Elkin Committed by Monty Jul 02, 2017
4 changed files
--- a/mysql-test/suite/rpl/r/circular_serverid0.result
+++ b/mysql-test/suite/rpl/r/circular_serverid0.result
+include/rpl_init.inc [topology=1->2->1]
+include/rpl_connect.inc [creating M4]
+include/rpl_connect.inc [creating M2]
+SET @old_debug= @@global.debug;
+STOP SLAVE;
+SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
+START SLAVE IO_THREAD;
+include/wait_for_slave_io_to_start.inc
+CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
+INSERT INTO `t1` VALUES (null, repeat('a', 1024)), (null, repeat('b', 1024));
+SET debug_sync='now WAIT_FOR start_sql_thread';
+START SLAVE SQL_THREAD;
+include/show_binlog_events.inc
+Log_name	Pos	Event_type	Server_id	End_log_pos	Info
+slave-bin.000001	#	Gtid	#	#	GTID #-#-#
+slave-bin.000001	#	Query	#	#	use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
+SET debug_sync='now SIGNAL go_on_relay_logging';
+include/show_binlog_events.inc
+Log_name	Pos	Event_type	Server_id	End_log_pos	Info
+slave-bin.000001	#	Gtid	#	#	GTID #-#-#
+slave-bin.000001	#	Query	#	#	use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
+slave-bin.000001	#	Gtid	#	#	BEGIN GTID #-#-#
+slave-bin.000001	#	Table_map	#	#	table_id: # (test.t1)
+slave-bin.000001	#	Write_rows_v1	#	#	table_id: #
+slave-bin.000001	#	Write_rows_v1	#	#	table_id: # flags: STMT_END_F
+slave-bin.000001	#	Xid	#	#	COMMIT /* XID */
+drop table t1;
+SET GLOBAL debug_dbug= @old_debug;
+SET debug_sync='RESET';
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/circular_serverid0.cnf
+++ b/mysql-test/suite/rpl/t/circular_serverid0.cnf
+!include ../my.cnf
+
+[mysqld.1]
+gtid-domain-id=4
+server-id=4
+#
+log-slave-updates
+slave-parallel-threads=0
+gtid-strict-mode=1
+gtid-ignore-duplicates=1
+
+#
+# Max-size row events to minimum with the idea to create
+# a number of Rows_log_event per Query.
+#
+binlog-row-event-max-size=1024
+
+[mysqld.2]
+gtid-domain-id=2
+server-id=2
+#
+log-slave-updates
+slave-parallel-threads=0
+gtid-strict-mode=1
+gtid-ignore-duplicates=1
+binlog-row-event-max-size=1024
+# The slave will be initialized with a @@global.dbug-var value
+skip-slave-start=1
+
+
--- a/mysql-test/suite/rpl/t/circular_serverid0.test
+++ b/mysql-test/suite/rpl/t/circular_serverid0.test
+#
+# Testing chain/circular replication scenario of MDEV-9670
+# The effect of the bug was that we got a commit with a GTID with server_id
+#
+
+--source include/have_binlog_format_row.inc
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+
+--let $rpl_topology= 1->2->1
+--source include/rpl_init.inc
+
+--let $rpl_connection_name= M4
+--let $rpl_server_number= 1
+--source include/rpl_connect.inc
+
+--let $rpl_connection_name= M2
+--let $rpl_server_number= 2
+--source include/rpl_connect.inc
+
+# The parameter reflects binlog-row-event-max-size @cnf.
+--let $row_size=1024
+
+SET @old_debug= @@global.debug;
+
+--connection M2
+STOP SLAVE;
+SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
+START SLAVE IO_THREAD;
+--source include/wait_for_slave_io_to_start.inc
+
+--connection M2
+# This query also creates a Gtid event group whose Gtid will remain in
+# ignored status for too long causing a following group split.
+
+CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
+--sync_slave_with_master M4
+
+# This INSERT will be logged as two Write_log events which the buggy
+# slave applier would split.
+
+--connection M4
+eval INSERT INTO `t1` VALUES (null, repeat('a', $row_size)), (null, repeat('b', $row_size));
+
+# START M2 IO thread and wait for its signal to follow with the SQL
+# thread start.  At this moment the SQL thread shall be having 2 and
+# "half" groups to execute.  The "hafl" one would be committed by the
+# buggy applier after which the IO is released to queue the rest of
+# the 3rd group which the SQL thread commits separately to complete
+# the split.
+
+--connection M2
+
+# wait for IO signal to start the SQL thread. IO will be hanging upon that.
+SET debug_sync='now WAIT_FOR start_sql_thread';
+
+# Now the slave server has relay log whose last group is incomplete.
+# An unfixed slave server would go to "insert" a "fake"
+# Gtid_list_log_event event which actually would commit the incomplete
+# group. However before to actual commit do_apply_event() hits some assert.
+# In the fixed server the fake Gtid_list_log_event is *not* inserted
+# in the middle of a group.
+START SLAVE SQL_THREAD;
+
+# Sleep for a little time to give SQL thread a chance to commit while
+# the IO thread is hanging (see
+# DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging"...) in
+# queue_event).  Alternatively to reproduce the case when buggy slave
+# wait for the 1st group commit
+
+#--let $count= 1
+#--let $table= t1
+#--source include/wait_until_rows_count.inc
+
+--sleep 2
+
+# Demonstrate either no split group in the correct slave or the 1nd
+# group in the buggy one
+--source include/show_binlog_events.inc
+
+# Release the IO thread
+SET debug_sync='now SIGNAL go_on_relay_logging';
+
+# Sync servers
+--sync_slave_with_master M4
+--connection M4
+--sync_slave_with_master M2
+--connection M2
+
+# Demonstrate replication goes correctly not to create any split, or
+# the 2nd group in the buggy slave
+--source include/show_binlog_events.inc
+
+#
+# Cleanup
+#
+--connection M4
+drop table t1;
+
+--connection M2
+SET GLOBAL debug_dbug= @old_debug;
+SET debug_sync='RESET';
+--source include/rpl_end.inc
--- a/sql/slave.cc
+++ b/sql/slave.cc
@@ -2363,6 +2363,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi)
    }
    if (rli->ign_gtids.count())
    {
+      DBUG_ASSERT(!rli->is_in_group());         // Ensure no active transaction
      glev= new Gtid_list_log_event(&rli->ign_gtids,
                                    Gtid_list_log_event::FLAG_IGN_GTIDS);
      rli->ign_gtids.reset();
@@ -5317,7 +5318,9 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
  bool gtid_skip_enqueue= false;
  bool got_gtid_event= false;
  rpl_gtid event_gtid;
-
+#ifndef DBUG_OFF
+  static uint dbug_rows_event_count= 0;
+#endif
  /*
    FD_q must have been prepared for the first R_a event
    inside get_master_version_and_clock()
@@ -5384,6 +5387,26 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
      (uchar)buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT /* a way to escape */)
    DBUG_RETURN(queue_old_event(mi,buf,event_len));

+#ifdef ENABLED_DEBUG_SYNC
+  /*
+    A (+d,dbug.rows_events_to_delay_relay_logging)-test is supposed to
+    create a few Write_log_events and after receiving the 1st of them
+    the IO thread signals to launch the SQL thread, and sets itself to
+    wait for a release signal.
+  */
+  DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging",
+                  if ((buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT_V1 ||
+                       buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT) &&
+                      ++dbug_rows_event_count == 2)
+                  {
+                    const char act[]=
+                      "now SIGNAL start_sql_thread "
+                      "WAIT_FOR go_on_relay_logging";
+                    DBUG_ASSERT(debug_sync_service);
+                    DBUG_ASSERT(!debug_sync_set_action(current_thd,
+                                                       STRING_WITH_LEN(act)));
+                  };);
+#endif
  mysql_mutex_lock(&mi->data_lock);

  switch ((uchar)buf[EVENT_TYPE_OFFSET]) {
@@ -6588,9 +6611,12 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
          DBUG_RETURN(ev);
        }

-        if (rli->ign_gtids.count())
+        if (rli->ign_gtids.count() && !rli->is_in_group())
        {
-          /* We generate and return a Gtid_list, to update gtid_slave_pos. */
+          /*
+            We generate and return a Gtid_list, to update gtid_slave_pos,
+            unless being in the middle of a group.
+          */
          DBUG_PRINT("info",("seeing ignored end gtids"));
          ev= new Gtid_list_log_event(&rli->ign_gtids,
                                      Gtid_list_log_event::FLAG_IGN_GTIDS);