Commit e8e2ef47 authored by Kristian Nielsen's avatar Kristian Nielsen

Merge MDEV-8302 into 10.0

parents b74795b0 9b9c5e89
......@@ -1649,6 +1649,46 @@ include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debg;
SET GLOBAL max_relay_log_size= @old_max;
include/start_slave.inc
*** MDEV-8302: Duplicate key with parallel replication ***
include/stop_slave.inc
/* Inject a small sleep which makes the race easier to hit. */
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,inject_mdev8302";
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 20000;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t7 ORDER BY a;
a b
1 1
2 2
3 86
4 4
5 5
100 5
101 1
102 2
103 3
104 4
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t7 ORDER BY a;
a b
1 1
2 2
3 86
4 4
5 5
100 5
101 1
102 2
103 3
104 4
include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
include/start_slave.inc
......
......@@ -2314,6 +2314,60 @@ SET GLOBAL max_relay_log_size= @old_max;
--source include/start_slave.inc
--echo *** MDEV-8302: Duplicate key with parallel replication ***
--connection server_2
--source include/stop_slave.inc
/* Inject a small sleep which makes the race easier to hit. */
SET @old_dbug=@@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,inject_mdev8302";
--connection server_1
INSERT INTO t7 VALUES (100,1), (101,2), (102,3), (103,4), (104,5);
# Artificially create a bunch of group commits with conflicting transactions.
# The bug happened when T1 and T2 was in one group commit, and T3 was in the
# following group commit. T2 is a DELETE of a row with same primary key as a
# row that T3 inserts. T1 and T2 can conflict, causing T2 to be deadlock
# killed after starting to commit. The bug was that T2 could roll back before
# doing unmark_start_commit(); this could allow T3 to run before the retry
# of T2, causing duplicate key violation.
SET @old_dbug= @@SESSION.debug_dbug;
SET @commit_id= 20000;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
--let $n = 100
--disable_query_log
while ($n)
{
eval UPDATE t7 SET b=b+1 WHERE a=100+($n MOD 5);
eval DELETE FROM t7 WHERE a=100+($n MOD 5);
SET @commit_id = @commit_id + 1;
eval INSERT INTO t7 VALUES (100+($n MOD 5), $n);
SET @commit_id = @commit_id + 1;
dec $n;
}
--enable_query_log
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t7 ORDER BY a;
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t7 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
--source include/start_slave.inc
# Clean up.
--connection server_2
......
......@@ -1582,6 +1582,24 @@ int ha_rollback_trans(THD *thd, bool all)
DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
trans == &thd->transaction.stmt);
if (is_real_trans)
{
/*
In parallel replication, if we need to rollback during commit, we must
first inform following transactions that we are going to abort our commit
attempt. Otherwise those following transactions can run too early, and
possibly cause replication to fail. See comments in retry_event_group().
There were several bugs with this in the past that were very hard to
track down (MDEV-7458, MDEV-8302). So we add here an assertion for
rollback without signalling following transactions. And in release
builds, we explicitly do the signalling before rolling back.
*/
DBUG_ASSERT(!(thd->rgi_slave && thd->rgi_slave->did_mark_start_commit));
if (thd->rgi_slave && thd->rgi_slave->did_mark_start_commit)
thd->rgi_slave->unmark_start_commit();
}
if (thd->in_sub_stmt)
{
DBUG_ASSERT(0);
......
......@@ -4277,7 +4277,6 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
"mysql", rpl_gtid_slave_state_table_name.str,
errcode,
thd->get_stmt_da()->message());
trans_rollback(thd);
sub_id= 0;
thd->is_slave_error= 1;
goto end;
......@@ -7367,7 +7366,6 @@ int Xid_log_event::do_apply_event(rpl_group_info *rgi)
"%s.%s: %d: %s",
"mysql", rpl_gtid_slave_state_table_name.str, ec,
thd->get_stmt_da()->message());
trans_rollback(thd);
thd->is_slave_error= 1;
return err;
}
......
......@@ -226,6 +226,11 @@ static void
signal_error_to_sql_driver_thread(THD *thd, rpl_group_info *rgi, int err)
{
rgi->worker_error= err;
/*
In case we get an error during commit, inform following transactions that
we aborted our commit.
*/
rgi->unmark_start_commit();
rgi->cleanup_context(thd, true);
rgi->rli->abort_slave= true;
rgi->rli->stop_for_until= false;
......@@ -370,6 +375,7 @@ do_retry:
transaction we deadlocked with will not signal that it started to commit
until after the unmark.
*/
DBUG_EXECUTE_IF("inject_mdev8302", { my_sleep(20000);});
rgi->unmark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_retry_after_unmark");
......@@ -883,11 +889,26 @@ handle_rpl_parallel_thread(void *arg)
group_rgi= rgi;
group_ending= is_group_ending(qev->ev, event_type);
if (group_ending && likely(!rgi->worker_error))
{
/*
Do an extra check for (deadlock) kill here. This helps prevent a
lingering deadlock kill that occured during normal DML processing to
propagate past the mark_start_commit(). If we detect a deadlock only
after mark_start_commit(), we have to unmark, which has at least a
theoretical possibility of leaving a window where it looks like all
transactions in a GCO have started committing, while in fact one
will need to rollback and retry. This is not supposed to be possible
(since there is a deadlock, at least one transaction should be
blocked from reaching commit), but this seems a fragile ensurance,
and there were historically a number of subtle bugs in this area.
*/
if (!thd->killed)
{
DEBUG_SYNC(thd, "rpl_parallel_before_mark_start_commit");
rgi->mark_start_commit();
DEBUG_SYNC(thd, "rpl_parallel_after_mark_start_commit");
}
}
/*
If the SQL thread is stopping, we just skip execution of all the
......@@ -911,7 +932,17 @@ handle_rpl_parallel_thread(void *arg)
});
if (!err)
#endif
{
if (thd->check_killed())
{
thd->clear_error();
thd->get_stmt_da()->reset_diagnostics_area();
thd->send_kill_message();
err= 1;
}
else
err= rpt_handle_event(qev, rpt);
}
delete_or_keep_event_post_apply(rgi, event_type, qev->ev);
DBUG_EXECUTE_IF("rpl_parallel_simulate_temp_err_gtid_0_x_100",
err= dbug_simulate_tmp_error(rgi, thd););
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment