Commit c2dd88ac authored by Kristian Nielsen's avatar Kristian Nielsen

Merge MDEV-8031 into 10.1

parents fc1f301e b616991a
......@@ -1591,7 +1591,7 @@ a b
120 0
121 0
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
include/stop_slave.inc
......@@ -1620,7 +1620,88 @@ a b
130 0
131 0
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
INSERT INTO t3 VALUES (201,0), (202,0);
include/save_master_gtid.inc
include/sync_with_master_gtid.inc
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10200;
INSERT INTO t3 VALUES (203, 1);
INSERT INTO t3 VALUES (204, 1);
INSERT INTO t3 VALUES (205, 1);
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=205;
UPDATE t3 SET b=b+1 WHERE a=205;
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 3
202 4
203 4
204 4
205 3
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
include/start_slave.inc
*** Check getting deadlock killed inside open_binlog() during retry. ***
include/stop_slave.inc
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
SET @old_max= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size= 4096;
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10210;
Omit long queries that cause relaylog rotations and transaction retries...
SET SESSION debug_dbug=@old_dbug;
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/save_master_gtid.inc
include/start_slave.inc
include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
a b
201 6
202 8
203 7
204 7
205 5
include/stop_slave.inc
SET GLOBAL debug_dbug= @old_dbug;
SET GLOBAL max_relay_log_size= @old_max;
include/start_slave.inc
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
......
......@@ -2215,7 +2215,7 @@ SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
SELECT * FROM t3 WHERE a >= 120 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc
......@@ -2262,10 +2262,119 @@ SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
SELECT * FROM t3 WHERE a >= 130 ORDER BY a;
--source include/stop_slave.inc
SET GLOBAL debug_dbug= @old_debug;
SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc
# Test section for MDEV-8031: replicate a forced group commit full of
# conflicting updates while the slave runs with the inject_mdev8031 debug
# injection, then verify that the slave ends up with the same rows as the
# master.
--echo *** MDEV-8031: Parallel replication stops on "connection killed" error (probably incorrectly handled deadlock kill) ***
--connection server_1
# Seed the two rows that the conflicting updates below will fight over, and
# make sure the slave has them before it is stopped.
INSERT INTO t3 VALUES (201,0), (202,0);
--source include/save_master_gtid.inc
--connection server_2
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
# Remember the slave's global debug_dbug so it can be restored at the end of
# this section, then switch on the injection.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_mdev8031';
--connection server_1
# We artificially create a situation that hopefully resembles the original
# bug which was only seen "in the wild", and only once.
# Setup a fake group commit with lots of conflicts that will lead to deadlock
# kill. The slave DBUG injection causes the slave to be deadlock killed at
# a particular point during the retry, and then later do a small sleep at
# another critical point where the prior transaction then has a chance to
# complete. Finally an extra KILL check catches an unhandled, lingering
# deadlock kill. So rather artificial, but at least it exercises the
# relevant code paths.
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 10200;
INSERT INTO t3 VALUES (203, 1);
INSERT INTO t3 VALUES (204, 1);
INSERT INTO t3 VALUES (205, 1);
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=201;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=202;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=204;
UPDATE t3 SET b=b+1 WHERE a=203;
UPDATE t3 SET b=b+1 WHERE a=205;
UPDATE t3 SET b=b+1 WHERE a=205;
# Done generating the forced commit group; restore this session's debug_dbug.
SET SESSION debug_dbug=@old_dbug;
# Record the master's final row values for comparison with the slave below.
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/save_master_gtid.inc
--connection server_2
# Let the slave apply the group with the injection active; it must catch up
# and show the same rows as the master.
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/stop_slave.inc
# Put back the global debug_dbug value saved on this connection above.
SET GLOBAL debug_dbug= @old_dbug;
--source include/start_slave.inc
# Test section: the slave runs with the
# inject_retry_event_group_open_binlog_kill debug injection and a small relay
# log size limit while it replicates a forced group commit of conflicting,
# very long UPDATE statements. Afterwards the slave must hold the same rows
# as the master, and the changed global settings are restored.
--echo *** Check getting deadlock killed inside open_binlog() during retry. ***
--connection server_2
--source include/stop_slave.inc
# Save the slave's global debug_dbug in @old_dbug on this connection; the
# cleanup at the end of this section restores from the same variable.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= '+d,inject_retry_event_group_open_binlog_kill';
# Save and shrink the relay log size limit; the queries below are padded to
# be larger than this limit.
SET @old_max= @@GLOBAL.max_relay_log_size;
SET GLOBAL max_relay_log_size= 4096;
--connection server_1
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
# 8192 characters of padding, appended to each UPDATE as an SQL comment.
--let $large= `SELECT REPEAT("*", 8192)`
SET @commit_id= 10210;
--echo Omit long queries that cause relaylog rotations and transaction retries...
# Keep the padded statements out of the result file.
--disable_query_log
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=201 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=202 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=204 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=203 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
eval UPDATE t3 SET b=b+1 WHERE a=205 /* $large */;
--enable_query_log
# Done generating the forced commit group; restore this session's debug_dbug.
SET SESSION debug_dbug=@old_dbug;
# Record the master's final row values for comparison with the slave below.
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SELECT * FROM t3 WHERE a>=200 ORDER BY a;
--source include/stop_slave.inc
# Restore from @old_dbug, the variable this connection saved the global value
# into at the start of the section. Restoring from a variable that was never
# assigned would not put the saved setting back.
SET GLOBAL debug_dbug= @old_dbug;
SET GLOBAL max_relay_log_size= @old_max;
--source include/start_slave.inc
# Clean up.
--connection server_2
--source include/stop_slave.inc
......
......@@ -2,6 +2,7 @@
#include "rpl_parallel.h"
#include "slave.h"
#include "rpl_mi.h"
#include "sql_parse.h"
#include "debug_sync.h"
/*
......@@ -333,7 +334,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
IO_CACHE rlog;
LOG_INFO linfo;
File fd= (File)-1;
const char *errmsg= NULL;
const char *errmsg;
inuse_relaylog *ir= rgi->relay_log;
uint64 event_count;
uint64 events_to_execute= rgi->retry_event_count;
......@@ -349,6 +350,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
do_retry:
event_count= 0;
err= 0;
errmsg= NULL;
/*
If we already started committing before getting the deadlock (or other
......@@ -384,7 +386,16 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
*/
if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate that we get deadlock killed at this exact point. */
rgi->killed_for_retry= true;
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->killed= KILL_CONNECTION;
mysql_mutex_unlock(&thd->LOCK_thd_data);
});
rgi->cleanup_context(thd, 1);
thd->reset_killed();
thd->clear_error();
/*
If we retry due to a deadlock kill that occurred during the commit step, we
......@@ -428,10 +439,22 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
complete its commit.
*/
thd->clear_error();
thd->reset_killed();
if(thd->wait_for_commit_ptr)
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Inject a small sleep to give prior transaction a chance to commit. */
my_sleep(100000);
});
}
/*
Let us clear any lingering deadlock kill one more time, here after
wait_for_prior_commit() has completed. This should rule out any
possibility of an old deadlock kill lingering on beyond this point.
*/
thd->reset_killed();
strmake_buf(log_name, ir->name);
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
{
......@@ -447,6 +470,14 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
err= 1;
goto err;
}
DBUG_EXECUTE_IF("inject_mdev8031", {
/* Simulate pending KILL caught in read_relay_log_description_event(). */
if (thd->check_killed()) {
thd->send_kill_message();
err= 1;
goto err;
}
});
my_b_seek(&rlog, cur_offset);
do
......@@ -469,7 +500,7 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
{
errmsg= "slave SQL thread aborted because of I/O error";
err= 1;
goto err;
goto check_retry;
}
if (rlog.error > 0)
{
......@@ -498,10 +529,25 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
}
strmake_buf(log_name ,linfo.log_file_name);
DBUG_EXECUTE_IF("inject_retry_event_group_open_binlog_kill", {
if (retries < 2)
{
/* Simulate that we get deadlock killed during open_binlog(). */
mysql_reset_thd_for_next_command(thd);
rgi->killed_for_retry= true;
mysql_mutex_lock(&thd->LOCK_thd_data);
thd->killed= KILL_CONNECTION;
mysql_mutex_unlock(&thd->LOCK_thd_data);
thd->send_kill_message();
fd= (File)-1;
err= 1;
goto check_retry;
}
});
if ((fd= open_binlog(&rlog, log_name, &errmsg)) <0)
{
err= 1;
goto err;
goto check_retry;
}
/* Loop to try again on the new log file. */
}
......@@ -544,26 +590,31 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
if (retries == 0) err= dbug_simulate_tmp_error(rgi, thd););
DBUG_EXECUTE_IF("rpl_parallel_simulate_infinite_temp_err_gtid_0_x_100",
err= dbug_simulate_tmp_error(rgi, thd););
if (err)
if (!err)
continue;
check_retry:
convert_kill_to_deadlock_error(rgi);
if (has_temporary_error(thd))
{
convert_kill_to_deadlock_error(rgi);
if (has_temporary_error(thd))
++retries;
if (retries < slave_trans_retries)
{
++retries;
if (retries < slave_trans_retries)
if (fd >= 0)
{
end_io_cache(&rlog);
mysql_file_close(fd, MYF(MY_WME));
fd= (File)-1;
goto do_retry;
}
sql_print_error("Slave worker thread retried transaction %lu time(s) "
"in vain, giving up. Consider raising the value of "
"the slave_transaction_retries variable.",
slave_trans_retries);
goto do_retry;
}
goto err;
sql_print_error("Slave worker thread retried transaction %lu time(s) "
"in vain, giving up. Consider raising the value of "
"the slave_transaction_retries variable.",
slave_trans_retries);
}
goto err;
} while (event_count < events_to_execute);
err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment