Commit cb65cee8 authored by unknown

MDEV-26: Global transaction ID.

When a slave requested to start at some GTID, and that GTID was the very
last event (within its replication domain) in some binlog file, we did
not allow the binlog dump thread on the master to start from the
beginning of a following binlog file. This is a problem, since the
binlog file containing the GTID is likely to be purged if the
replication domain is unused for a long time.

With this fix, if the Gtid list event at the start of a binlog file
contains exactly the GTID requested by the slave, we allow the binlog
dump thread to start from this file, taking care not to skip any
events from that domain in the file.
parent e5b60f0a
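
For orientation before the diff itself, below is a minimal standalone sketch of the check this commit relaxes and the special case it adds. The types Gtid and SlaveState and the function can_start_here are simplified stand-ins invented for this illustration; the real logic lives in contains_all_slave_gtid() and gtid_find_binlog_file() in sql/sql_repl.cc.

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

// Simplified stand-ins for the server's rpl_gtid / slave_connection_state.
struct Gtid { uint32_t domain_id, server_id; uint64_t seq_no; };
using SlaveState= std::map<uint32_t, Gtid>;  // domain_id -> requested GTID

// May the dump thread start from the binlog whose initial Gtid_list event
// carries glist? After this commit, equality (the requested GTID was the
// very last event of the previous, possibly purged, binlog) accepts the
// file; before, it forced a scan back into older binlogs.
static bool can_start_here(SlaveState &state, const std::vector<Gtid> &glist)
{
  for (const Gtid &listed : glist)
  {
    auto it= state.find(listed.domain_id);
    if (it == state.end())
      return false;                          // domain unknown to the slave
    if (it->second.server_id == listed.server_id &&
        it->second.seq_no < listed.seq_no)   // was '<=' before the fix
      return false;                          // needed GTID is in an earlier file
  }
  // Special case added by this commit: when the requested GTID equals the
  // listed one, nothing in this file must be skipped, so the domain is
  // dropped from the state instead of waiting for the GTID to turn up.
  for (const Gtid &listed : glist)
  {
    auto it= state.find(listed.domain_id);
    if (it != state.end() && it->second.server_id == listed.server_id &&
        it->second.seq_no == listed.seq_no)
      state.erase(it);
  }
  return true;
}

int main()
{
  SlaveState state= { { 0, Gtid{ 0, 1, 100 } } };
  std::vector<Gtid> glist= { Gtid{ 0, 1, 100 } };  // same GTID in Gtid_list
  bool ok= can_start_here(state, glist);
  printf("ok=%d domains_left=%zu\n", (int)ok, state.size());  // ok=1, 0 left
  return 0;
}
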
@@ -129,5 +129,25 @@ a
SET SQL_LOG_BIN=0;
call mtr.add_suppression("Slave: Table 't1' already exists Error_code: 1050");
SET SQL_LOG_BIN=1;
*** Test reconnecting slave with GTID after purge logs on master. ***
FLUSH LOGS;
INSERT INTO t1 VALUES (4);
include/stop_slave.inc
FLUSH LOGS;
FLUSH LOGS;
PURGE BINARY LOGS TO 'master-bin.000004';
show binary logs;
Log_name File_size
master-bin.000004 #
INSERT INTO t1 VALUES (5);
CHANGE MASTER TO master_host = '127.0.0.1', master_port = MASTER_PORT;
include/start_slave.inc
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
5
DROP TABLE t1;
include/rpl_end.inc
@@ -211,6 +211,34 @@ call mtr.add_suppression("Slave: Table 't1' already exists Error_code: 1050");
SET SQL_LOG_BIN=1;
--echo *** Test reconnecting slave with GTID after purge logs on master. ***
--connection server_1
FLUSH LOGS;
INSERT INTO t1 VALUES (4);
--connection server_2
--let $wait_condition= SELECT COUNT(*) = 4 FROM t1
--source include/wait_condition.inc
--source include/stop_slave.inc
--connection server_1
FLUSH LOGS;
FLUSH LOGS;
--source include/wait_for_binlog_checkpoint.inc
PURGE BINARY LOGS TO 'master-bin.000004';
--source include/show_binary_logs.inc
INSERT INTO t1 VALUES (5);
--connection server_2
--replace_result $MASTER_MYPORT MASTER_PORT
eval CHANGE MASTER TO master_host = '127.0.0.1', master_port = $MASTER_MYPORT;
--source include/start_slave.inc
--let $wait_condition= SELECT COUNT(*) = 5 FROM t1
--source include/wait_condition.inc
SELECT * FROM t1 ORDER BY a;
# Clean up.
--connection server_1
DROP TABLE t1;
@@ -703,7 +703,7 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
to build an in-memory hash or stuff like that.
We need to check that slave did not request GTID D-S-N1, when the
-  Gtid_list_log_event for this binlog file has D-S-N2 with N2 >= N1.
+  Gtid_list_log_event for this binlog file has D-S-N2 with N2 > N1.
In addition, we need to check that we do not have a GTID D-S-N3 in the
Gtid_list_log_event where D is not present in the requested slave state at
@@ -727,7 +727,7 @@ contains_all_slave_gtid(slave_connection_state *st, Gtid_list_log_event *glev)
return false;
}
if (gtid->server_id == glev->list[i].server_id &&
-        gtid->seq_no <= glev->list[i].seq_no)
+        gtid->seq_no < glev->list[i].seq_no)
{
/*
The slave needs to receive gtid, but it is contained in an earlier
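
To pin down the one-character changes in the two hunks above, here is a toy boundary check, illustrative only: n1 is the seq_no the slave requested, n2 the matching entry in this binlog file's Gtid_list event.

#include <cassert>
#include <cstdint>

// The old condition rejected the file whenever N2 >= N1; the new one only
// when N2 > N1, so a slave positioned exactly at the last event of the
// previous binlog can now be served starting from this file.
static bool old_rejects(uint64_t n1, uint64_t n2) { return n2 >= n1; }
static bool new_rejects(uint64_t n1, uint64_t n2) { return n2 > n1; }

int main()
{
  assert(old_rejects(100, 100));   // old: pushed back to a possibly purged file
  assert(!new_rejects(100, 100));  // new: start here, delete the state entry
  assert(new_rejects(100, 101));   // a strictly newer list entry still rejects
  return 0;
}
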
@@ -909,6 +909,25 @@ end:
Returns the file name in out_name, which must be of size at least FN_REFLEN.
Returns NULL on ok, error message on error.
In case of non-error return, the returned binlog file is guaranteed to
contain the first event to be transmitted to the slave for every domain
present in our binlogs. It is still necessary to skip all GTIDs up to
and including the GTID requested by slave within each domain.
However, as a special case, if the event to be sent to the slave is the very
first event (within that domain) in the returned binlog, then nothing should
be skipped, so that domain is deleted from the passed in slave connection
state.
This is necessary in case the slave requests a GTID within a replication
domain that has long been inactive. The binlog file containing that GTID may
have been long since purged. However, as long as no GTIDs after that have
been purged, we have the GTID requested by slave in the Gtid_list_log_event
of the latest binlog. So we can start from there, as long as we delete the
corresponding entry in the slave state so we do not wrongly skip any events
that might turn up if that domain becomes active again, vainly looking for
the requested GTID that was already purged.
*/
static const char *
gtid_find_binlog_file(slave_connection_state *state, char *out_name)
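
The contract described in the comment above can be illustrated with a hypothetical, much simplified sender-side filter. SlaveState and should_skip are names invented for this sketch, and the real skipping logic in the dump thread is more involved:

#include <cstdint>
#include <map>

// domain_id -> seq_no of the GTID the slave requested in that domain.
// Domains removed by the special case described above are simply absent,
// which makes should_skip() send every event for them.
using SlaveState= std::map<uint32_t, uint64_t>;

// Skip events up to and including the requested GTID; once it is reached,
// drop the entry so all later events in the domain are sent.
static bool should_skip(SlaveState &state, uint32_t domain_id, uint64_t seq_no)
{
  auto it= state.find(domain_id);
  if (it == state.end())
    return false;            // nothing to skip in this domain
  bool skip= (seq_no <= it->second);
  if (seq_no >= it->second)
    state.erase(it);         // requested GTID reached; stop skipping
  return skip;
}

In this shape, a domain whose entry was deleted by gtid_find_binlog_file() streams from its first event in the file, while every other domain skips up to and including the GTID the slave already has.
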
@@ -958,7 +977,37 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name)
if (!glev || contains_all_slave_gtid(state, glev))
{
uint32 i;
strmake(out_name, buf, FN_REFLEN);
/*
As a special case, we allow to start from binlog file N if the
requested GTID is the last event (in the corresponding domain) in
binlog file (N-1), but then we need to remove that GTID from the slave
state, rather than skipping events waiting for it to turn up.
*/
for (i= 0; i < glev->count; ++i)
{
const rpl_gtid *gtid= state->find(glev->list[i].domain_id);
if (!gtid)
{
/* contains_all_slave_gtid() would have returned false if so. */
DBUG_ASSERT(0);
continue;
}
if (gtid->server_id == glev->list[i].server_id &&
gtid->seq_no == glev->list[i].seq_no)
{
/*
The slave requested to start from the very beginning of this
domain in this binlog file. So delete the entry from the state,
we do not need to skip anything.
*/
state->remove(gtid);
}
}
goto end;
}
delete glev;