Commit 3dcd01e5 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction

The retry of an event group in parallel replication set the wrong value for
the end log position of the event that was retried
(qev->future_event_relay_log_pos). It was too large by the size of the event,
so it pointed into the middle of the following event.

If the retry happened in the very last event of the event group, _and_ the SQL
thread was stopped just after successfully retrying that event, then the SQL
threads's relay log position would be left incorrect. Restarting the SQL
thread could then try to read events from a garbage offset in the relay log,
usually leading to an error about not being able to read the event.
parent d08b893b
......@@ -188,6 +188,52 @@ a LENGTH(b)
3 5012
4 5000
SET GLOBAL max_relay_log_size=@old_max;
*** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
include/stop_slave.inc
BEGIN;
INSERT INTO t1 VALUES (100, 0);
INSERT INTO t1 VALUES (101, 0);
INSERT INTO t1 VALUES (102, 0);
INSERT INTO t1 VALUES (103, 0);
COMMIT;
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
include/start_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
retries
1
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
include/stop_slave_sql.inc
INSERT INTO t1 VALUES (104, 1);
INSERT INTO t1 VALUES (105, 1);
INSERT INTO t1 VALUES (106, 1);
INSERT INTO t1 VALUES (107, 1);
INSERT INTO t1 VALUES (108, 1);
INSERT INTO t1 VALUES (109, 1);
include/start_slave.inc
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
a b
100 0
101 0
102 0
103 0
104 1
105 1
106 1
107 1
108 1
109 1
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc
......
......@@ -135,7 +135,6 @@ SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_gtid_0_x_100,rpl_parall
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
START SLAVE;
--let $slave_sql_errno= 1213
--let $slave_timeout= 10
--source include/wait_for_slave_sql_error.inc
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
......@@ -208,6 +207,58 @@ SELECT a, LENGTH(b) FROM t2 ORDER BY a;
SET GLOBAL max_relay_log_size=@old_max;
--echo *** MDEV-7065: Incorrect relay log position in parallel replication after retry of transaction ***
--connection server_2
--source include/stop_slave.inc
--connection server_1
BEGIN;
INSERT INTO t1 VALUES (100, 0);
INSERT INTO t1 VALUES (101, 0);
INSERT INTO t1 VALUES (102, 0);
INSERT INTO t1 VALUES (103, 0);
COMMIT;
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
--save_master_pos
--connection server_2
# Inject a DBUG error insert to cause the XID event of the single transaction
# from the master to fail with a deadlock error and be retried.
# The bug was that the retry of the XID would leave the relay log position
# incorrect (off by the size of XID event).
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_simulate_temp_err_xid";
let $old_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--source include/start_slave.inc
--sync_with_master
SET GLOBAL debug_dbug=@old_dbug;
let $new_retry= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1);
--disable_query_log
eval SELECT $new_retry - $old_retry AS retries;
--enable_query_log
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
# Stop the SQL thread. When the bug was there to give the incorrect relay log
# position, the restart of the SQL thread would read garbage data from the
# middle of an event and fail with relay log IO error.
--source include/stop_slave_sql.inc
--connection server_1
INSERT INTO t1 VALUES (104, 1);
INSERT INTO t1 VALUES (105, 1);
INSERT INTO t1 VALUES (106, 1);
INSERT INTO t1 VALUES (107, 1);
INSERT INTO t1 VALUES (108, 1);
INSERT INTO t1 VALUES (109, 1);
--save_master_pos
--connection server_2
--source include/start_slave.inc
--sync_with_master
SELECT * FROM t1 WHERE a >= 100 ORDER BY a;
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
......
......@@ -446,7 +446,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
ev->thd= thd;
mysql_mutex_lock(&rpt->LOCK_rpl_thread);
qev= rpt->retry_get_qev(ev, orig_qev, log_name, cur_offset,
qev= rpt->retry_get_qev(ev, orig_qev, log_name, old_offset,
cur_offset - old_offset);
mysql_mutex_unlock(&rpt->LOCK_rpl_thread);
if (!qev)
......@@ -776,6 +776,18 @@ handle_rpl_parallel_thread(void *arg)
if (likely(!rgi->worker_error) && !skip_event_group)
{
++rgi->retry_event_count;
#ifndef DBUG_OFF
err= 0;
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_xid",
if (event_type == XID_EVENT)
{
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
my_error(ER_LOCK_DEADLOCK, MYF(0));
err= 1;
});
if (!err)
#endif
err= rpt_handle_event(qev, rpt);
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment