MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction

The retry of an event group in parallel replication set the wrong value for the end log position of the event that was retried (qev->future_event_relay_log_pos). It was too large by the size of the event, so it pointed into the middle of the following event. If the retry happened in the very last event of the event group, _and_ the SQL thread was stopped just after successfully retrying that event, then the SQL threads's relay log position would be left incorrect. Restarting the SQL thread could then try to read events from a garbage offset in the relay log, usually leading to an error about not being able to read the event.

MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction
The retry of an event group in parallel replication set the wrong value for the end log position of the event that was retried (qev->future_event_relay_log_pos). It was too large by the size of the event, so it pointed into the middle of the following event. If the retry happened in the very last event of the event group, _and_ the SQL thread was stopped just after successfully retrying that event, then the SQL threads's relay log position would be left incorrect. Restarting the SQL thread could then try to read events from a garbage offset in the relay log, usually leading to an error about not being able to read the event.
3dcd01e5 · Kristian Nielsen · d08b893b · 3dcd01e5 · 3dcd01e5 · 3dcd01e5
Commit 3dcd01e5 authored Nov 13, 2014 by Kristian Nielsen
3 changed files
--- a/mysql-test/suite/rpl/r/rpl_parallel_retry.result
+++ b/mysql-test/suite/rpl/r/rpl_parallel_retry.result
@@ -188,6 +188,52 @@ a	LENGTH(b)
 3	5012
 4	5000
 SET GLOBAL max_relay_log_size=@old_max;
+*** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
+include/stop_slave.inc
+BEGIN;
+INSERT INTO t1 VALUES (100, 0);
+INSERT INTO t1 VALUES (101, 0);
+INSERT INTO t1 VALUES (102, 0);
+INSERT INTO t1 VALUES (103, 0);
+COMMIT;
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+a	b
+100	0
+101	0
+102	0
+103	0
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
+include/start_slave.inc
+SET GLOBAL debug_dbug=@old_dbug;
+retries
+1
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+a	b
+100	0
+101	0
+102	0
+103	0
+include/stop_slave_sql.inc
+INSERT INTO t1 VALUES (104, 1);
+INSERT INTO t1 VALUES (105, 1);
+INSERT INTO t1 VALUES (106, 1);
+INSERT INTO t1 VALUES (107, 1);
+INSERT INTO t1 VALUES (108, 1);
+INSERT INTO t1 VALUES (109, 1);
+include/start_slave.inc
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+a	b
+100	0
+101	0
+102	0
+103	0
+104	1
+105	1
+106	1
+107	1
+108	1
+109	1
 include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;
 include/start_slave.inc

--- a/mysql-test/suite/rpl/t/rpl_parallel_retry.test
+++ b/mysql-test/suite/rpl/t/rpl_parallel_retry.test
@@ -135,7 +135,6 @@ SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parall
 let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
 START SLAVE;
 --let $slave_sql_errno= 1213
--let $slave_timeout= 10
 --source include/wait_for_slave_sql_error.inc
 SET GLOBAL debug_dbug=@old_dbug;
 let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
@@ -208,6 +207,58 @@ SELECT a, LENGTH(b) FROM t2 ORDER BY a;
 SET GLOBAL max_relay_log_size=@old_max;


+--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
+
+--connection server_2
+--source include/stop_slave.inc
+
+--connection server_1
+BEGIN;
+INSERT INTO t1 VALUES (100, 0);
+INSERT INTO t1 VALUES (101, 0);
+INSERT INTO t1 VALUES (102, 0);
+INSERT INTO t1 VALUES (103, 0);
+COMMIT;
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+--save_master_pos
+
+--connection server_2
+# Inject a DBUG error insert to cause the XID event of the single transaction
+# from the master to fail with a deadlock error and be retried.
+# The bug was that the retry of the XID would leave the relay log position
+# incorrect (off by the size of XID event).
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
+let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL debug_dbug=@old_dbug;
+let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
+--disable_query_log
+eval SELECT $new_retry - $old_retry AS retries;
+--enable_query_log
+
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+# Stop the SQL thread. When the bug was there to give the incorrect relay log
+# position, the restart of the SQL thread would read garbage data from the
+# middle of an event and fail with relay log IO error.
+--source include/stop_slave_sql.inc
+
+--connection server_1
+INSERT INTO t1 VALUES (104, 1);
+INSERT INTO t1 VALUES (105, 1);
+INSERT INTO t1 VALUES (106, 1);
+INSERT INTO t1 VALUES (107, 1);
+INSERT INTO t1 VALUES (108, 1);
+INSERT INTO t1 VALUES (109, 1);
+--save_master_pos
+
+--connection server_2
+--source include/start_slave.inc
+--sync_with_master
+SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
+
+
 --connection server_2
 --source include/stop_slave.inc
 SET GLOBAL slave_parallel_threads=@old_parallel_threads;

--- a/sql/rpl_parallel.cc
+++ b/sql/rpl_parallel.cc
@@ -446,7 +446,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
    ev->thd= thd;

    mysql_mutex_lock(&rpt->LOCK_rpl_thread);
-    qev= rpt->retry_get_qev(ev, orig_qev, log_name, cur_offset,
+    qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset,
                            cur_offset - old_offset);
    mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
    if (!qev)
@@ -776,6 +776,18 @@ handle_rpl_parallel_thread(void *arg)
      if (likely(!rgi->worker_error) && !skip_event_group)
      {
        ++rgi->retry_event_count;
+#ifndef DBUG_OFF
+        err= 0;
+        DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid",
+          if (event_type == XID_EVENT)
+          {
+            thd->clear_error();
+            thd->get_stmt_da()->reset_diagnostics_area();
+            my_error(ER_LOCK_DEADLOCK, MYF(0));
+            err= 1;
+          });
+        if (!err)
+#endif
        err= rpt_handle_event(qev, rpt);
        delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
        DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",