Bug#24901077: RESET SLAVE ALL DOES NOT ALWAYS RESET SLAVE

Description: ============ If you have a relay log index file that lists some relay log files that do not exist, then RESET SLAVE ALL is not enough to get back to a clean state. Analysis: ========= In the bug scenario the slave server is in a stopped state and some of the relay logs have been deleted, but the relay log index file has not been updated. During slave server restart, replication initialization fails because some of the required relay logs are missing. The user executes the RESET SLAVE/RESET SLAVE ALL command to start a clean slave. As per the documentation, the RESET SLAVE command clears the master info and relay log info repositories, deletes all the relay log files, and starts a new relay log file. But in a scenario where the slave server's Relay_log_info object is not initialized, the slave will not purge the existing relay logs. Hence the index file remains in a bad state, and users will not be able to start the slave unless these files are cleared. Fix: === The RESET SLAVE/RESET SLAVE ALL commands should do the cleanup even in a scenario where Relay_log_info object initialization failed. Backported a flag named 'error_on_rli_init_info', which is required to identify a failure to initialize the slave's Relay_log_info object. This flag exists in MySQL-5.6 onwards as part of the BUG#14021292 fix. During RESET SLAVE/RESET SLAVE ALL execution this flag indicates the Relay_log_info initialization failure. In such a case, open the relay log index and relay log files and do the required cleanup.

Bug#24901077: RESET SLAVE ALL DOES NOT ALWAYS RESET SLAVE
Description: ============ If you have a relay log index file that lists some relay log files that do not exist, then RESET SLAVE ALL is not enough to get back to a clean state. Analysis: ========= In the bug scenario the slave server is in a stopped state and some of the relay logs have been deleted, but the relay log index file has not been updated. During slave server restart, replication initialization fails because some of the required relay logs are missing. The user executes the RESET SLAVE/RESET SLAVE ALL command to start a clean slave. As per the documentation, the RESET SLAVE command clears the master info and relay log info repositories, deletes all the relay log files, and starts a new relay log file. But in a scenario where the slave server's Relay_log_info object is not initialized, the slave will not purge the existing relay logs. Hence the index file remains in a bad state, and users will not be able to start the slave unless these files are cleared. Fix: === The RESET SLAVE/RESET SLAVE ALL commands should do the cleanup even in a scenario where Relay_log_info object initialization failed. Backported a flag named 'error_on_rli_init_info', which is required to identify a failure to initialize the slave's Relay_log_info object. This flag exists in MySQL-5.6 onwards as part of the BUG#14021292 fix. During RESET SLAVE/RESET SLAVE ALL execution this flag indicates the Relay_log_info initialization failure. In such a case, open the relay log index and relay log files and do the required cleanup.
e619295e · Sujatha Sivakumar · 9181a561 · e619295e · e619295e · e619295e
Commit e619295e authored Feb 28, 2017 by Sujatha Sivakumar
7 changed files
--- a/mysql-test/suite/rpl/r/rpl_reset_slave_fail.result
+++ b/mysql-test/suite/rpl/r/rpl_reset_slave_fail.result
+include/master-slave.inc
+[connection master]
+CREATE TABLE t1 (c1 INT);
+INSERT INTO t1 (c1) VALUES (1);
+include/stop_slave_sql.inc
+FLUSH LOGS;
+FLUSH LOGS;
+INSERT INTO t1 (c1) VALUES (2);
+include/sync_slave_io_with_master.inc
+call mtr.add_suppression("File '.*slave-relay-bin.");
+call mtr.add_suppression("Could not open log file");
+call mtr.add_suppression("Failed to open the relay log");
+call mtr.add_suppression("Failed to initialize the master info structure");
+include/rpl_stop_server.inc [server_number=2]
+# Removing  file(s)
+include/rpl_start_server.inc [server_number=2]
+START SLAVE;
+ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
+START SLAVE;
+ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
+RESET SLAVE;
+DROP TABLE t1;
+START SLAVE UNTIL MASTER_LOG_FILE= 'MASTER_LOG_FILE', MASTER_LOG_POS= MASTER_LOG_POS;;
+include/wait_for_slave_sql_to_stop.inc
+include/stop_slave_io.inc
+include/start_slave.inc
+include/diff_tables.inc [master:t1, slave:t1]
+DROP TABLE t1;
+include/rpl_end.inc
--- a/mysql-test/suite/rpl/t/rpl_reset_slave_fail.test
+++ b/mysql-test/suite/rpl/t/rpl_reset_slave_fail.test
+###############################################################################
+# Bug#24901077: RESET SLAVE ALL DOES NOT ALWAYS RESET SLAVE
+#
+# Problem:
+# =======
+# If you have a relay log index file that has ended up with
+# some relay log files that do not exist, then RESET SLAVE
+# ALL is not enough to get back to a clean state.
+###############################################################################
+# Remove all slave-relay-bin.0* files (do not remove slave-relay-bin.index).
+# During server restart, rli initialization will fail as there are no
+# relay logs.  Without the fix, RESET SLAVE does not do the required cleanup
+# because rli is not inited, and the subsequent START SLAVE fails.
+# Disable "Warning  1612  Being purged log ./slave-relay-bin.0* was not found"
+# because it is different on Unix and Windows systems.
+
+--source include/have_binlog_format_mixed.inc
+--source include/master-slave.inc
+
+--connection master
+CREATE TABLE t1 (c1 INT);
+INSERT INTO t1 (c1) VALUES (1);
+--sync_slave_with_master
+
+--connection slave
+--source include/stop_slave_sql.inc
+--let $MYSQLD_SLAVE_DATADIR= `select @@datadir`
+
+--connection master
+# Generate more relay logs on slave.
+FLUSH LOGS;
+FLUSH LOGS;
+INSERT INTO t1 (c1) VALUES (2);
+
+--source include/sync_slave_io_with_master.inc
+call mtr.add_suppression("File '.*slave-relay-bin.");
+call mtr.add_suppression("Could not open log file");
+call mtr.add_suppression("Failed to open the relay log");
+call mtr.add_suppression("Failed to initialize the master info structure");
+
+# Stop slave
+--let $rpl_server_number= 2
+--source include/rpl_stop_server.inc
+
+# Delete file(s)
+--echo # Removing $remove_pattern file(s)
+--let $remove_pattern= slave-relay-bin.0*
+--remove_files_wildcard $MYSQLD_SLAVE_DATADIR $remove_pattern
+
+# Start slave
+--let $rpl_server_number= 2
+--source include/rpl_start_server.inc
+
+# Start slave must fail because of the removed file(s).
+--error ER_MASTER_INFO
+START SLAVE;
+
+# Try a second time, it must fail again.
+--error ER_MASTER_INFO
+START SLAVE;
+
+# Retrieve master executed position before reset slave.
+--let $master_exec_file= query_get_value("SHOW SLAVE STATUS", Relay_Master_Log_File, 1)
+--let $master_exec_pos= query_get_value("SHOW SLAVE STATUS", Exec_Master_Log_Pos, 1)
+
+# Reset slave.
+# Disable "Warning  1612  Being purged log ./slave-relay-bin.0* was not found"
+# because it is different on Unix and Windows systems.
+--disable_warnings
+RESET SLAVE;
+--enable_warnings
+DROP TABLE t1;
+--replace_result $master_exec_file MASTER_LOG_FILE $master_exec_pos MASTER_LOG_POS
+--eval START SLAVE UNTIL MASTER_LOG_FILE= '$master_exec_file', MASTER_LOG_POS= $master_exec_pos;
+--source include/wait_for_slave_sql_to_stop.inc
+--source include/stop_slave_io.inc
+
+# Start slave.
+--source include/start_slave.inc
+
+--connection master
+--sync_slave_with_master
+# Check consistency.
+--let $diff_tables= master:t1, slave:t1
+--source include/diff_tables.inc
+
+# Cleanup
+--connection master
+DROP TABLE t1;
+--sync_slave_with_master
+--source include/rpl_end.inc
--- a/sql/rpl_mi.cc
+++ b/sql/rpl_mi.cc
-/* Copyright (c) 2006, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -551,7 +551,6 @@ void end_master_info(Master_info* mi)

  if (!mi->inited)
    DBUG_VOID_RETURN;
-  end_relay_log_info(&mi->rli);
  if (mi->fd >= 0)
  {
    end_io_cache(&mi->file);

--- a/sql/rpl_rli.cc
+++ b/sql/rpl_rli.cc
-/* Copyright (c) 2006, 2013, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -41,7 +41,8 @@ Relay_log_info::Relay_log_info(bool is_slave_recovery)
   no_storage(FALSE), replicate_same_server_id(::replicate_same_server_id),
   info_fd(-1), cur_log_fd(-1), relay_log(&sync_relaylog_period),
   sync_counter(0), is_relay_log_recovery(is_slave_recovery),
-   save_temporary_tables(0), cur_log_old_open_count(0), group_relay_log_pos(0), 
+   save_temporary_tables(0), cur_log_old_open_count(0),
+   error_on_rli_init_info(false), group_relay_log_pos(0),
   event_relay_log_pos(0),
 #if HAVE_purify
   is_fake(FALSE),
@@ -108,7 +109,7 @@ int init_relay_log_info(Relay_log_info* rli,
 			const char* info_fname)
 {
  char fname[FN_REFLEN+128];
-  int info_fd;
+  int info_fd= -1;
  const char* msg = 0;
  int error = 0;
  DBUG_ENTER("init_relay_log_info");
@@ -118,6 +119,8 @@ int init_relay_log_info(Relay_log_info* rli,
    DBUG_RETURN(0);
  fn_format(fname, info_fname, mysql_data_home, "", 4+32);
  mysql_mutex_lock(&rli->data_lock);
+  if (rli->error_on_rli_init_info)
+    goto err;
  info_fd = rli->info_fd;
  rli->cur_log_fd = -1;
  rli->slave_skip_counter=0;
@@ -351,11 +354,14 @@ Failed to open the existing relay log info file '%s' (errno %d)",
    goto err;
  }
  rli->inited= 1;
+  rli->error_on_rli_init_info= false;
  mysql_mutex_unlock(&rli->data_lock);
  DBUG_RETURN(error);

 err:
-  sql_print_error("%s", msg);
+  rli->error_on_rli_init_info= true;
+  if (msg)
+    sql_print_error("%s", msg);
  end_io_cache(&rli->info_file);
  if (info_fd >= 0)
    mysql_file_close(info_fd, MYF(0));
@@ -942,6 +948,8 @@ int purge_relay_logs(Relay_log_info* rli, THD *thd, bool just_reset,
                     const char** errmsg)
 {
  int error=0;
+  const char *ln;
+  char name_buf[FN_REFLEN];
  DBUG_ENTER("purge_relay_logs");

  /*
@@ -968,12 +976,34 @@ int purge_relay_logs(Relay_log_info* rli, THD *thd, bool just_reset,
  if (!rli->inited)
  {
    DBUG_PRINT("info", ("rli->inited == 0"));
-    DBUG_RETURN(0);
-  }
-
-  DBUG_ASSERT(rli->slave_running == 0);
-  DBUG_ASSERT(rli->mi->slave_running == 0);
+    if (rli->error_on_rli_init_info)
+    {
+      ln= rli->relay_log.generate_name(opt_relay_logname, "-relay-bin",
+                                       1, name_buf);

+      if (rli->relay_log.open_index_file(opt_relaylog_index_name, ln, TRUE))
+      {
+        sql_print_error("Unable to purge relay log files. Failed to open relay "
+                        "log index file:%s.", rli->relay_log.get_index_fname());
+        DBUG_RETURN(1);
+      }
+      if (rli->relay_log.open(ln, LOG_BIN, 0, SEQ_READ_APPEND, 0,
+                             (max_relay_log_size ? max_relay_log_size :
+                              max_binlog_size), 1, TRUE))
+      {
+        sql_print_error("Unable to purge relay log files. Failed to open relay "
+                        "log file:%s.", rli->relay_log.get_log_fname());
+        DBUG_RETURN(1);
+      }
+    }
+    else
+      DBUG_RETURN(0);
+  }
+  else
+  {
+    DBUG_ASSERT(rli->slave_running == 0);
+    DBUG_ASSERT(rli->mi->slave_running == 0);
+  }
  rli->slave_skip_counter=0;
  mysql_mutex_lock(&rli->data_lock);

@@ -1013,6 +1043,8 @@ int purge_relay_logs(Relay_log_info* rli, THD *thd, bool just_reset,
                              rli->group_relay_log_pos,
                              0 /* do not need data lock */, errmsg, 0);

+  if (!rli->inited && rli->error_on_rli_init_info)
+    rli->relay_log.close(LOG_CLOSE_INDEX | LOG_CLOSE_STOP_EVENT);
 err:
 #ifndef DBUG_OFF
  char buf[22];

--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
-/* Copyright (c) 2005, 2012, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2005, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -153,7 +153,14 @@ class Relay_log_info : public Slave_reporting_capability
    a different log under our feet
  */
  uint32 cur_log_old_open_count;
-  
+
+  /*
+    If error_on_rli_init_info is true when init_relay_log_info() is
+    called, a previous call to it terminated with an error; RESET
+    SLAVE must be executed and the underlying problem fixed manually.
+   */
+  bool error_on_rli_init_info;
+
  /*
    Let's call a group (of events) :
      - a transaction

--- a/sql/slave.cc
+++ b/sql/slave.cc
-/* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -881,6 +881,7 @@ void close_active_mi()
  if (active_mi)
  {
    end_master_info(active_mi);
+    end_relay_log_info(&active_mi->rli);
    delete active_mi;
    active_mi= 0;
  }
@@ -4165,6 +4166,7 @@ void end_relay_log_info(Relay_log_info* rli)
 {
  DBUG_ENTER("end_relay_log_info");

+  rli->error_on_rli_init_info= false;
  if (!rli->inited)
    DBUG_VOID_RETURN;
  if (rli->info_fd >= 0)

--- a/sql/sql_repl.cc
+++ b/sql/sql_repl.cc
-/* Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
+/* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -1313,6 +1313,7 @@ int reset_slave(THD *thd, Master_info* mi)

  // close master_info_file, relay_log_info_file, set mi->inited=rli->inited=0
  end_master_info(mi);
+  end_relay_log_info(&mi->rli);
  // and delete these two files
  fn_format(fname, master_info_file, mysql_data_home, "", 4+32);
  if (mysql_file_stat(key_file_master_info, fname, &stat_area, MYF(0)) &&