Commit 18acbaf4 authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-31655: Parallel replication deadlock victim preference code errorneously removed

Restore code to make InnoDB choose the second transaction as a deadlock
victim if two transactions deadlock that need to commit in-order for
parallel replication. This code was erroneously removed when VATS was
implemented in InnoDB.

Also add a test case for InnoDB choosing the right deadlock victim.
Also fixes this bug, with testcase that reliably reproduces:

MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
Reviewed-by: default avatarMarko Mäkelä <marko.makela@mariadb.com>
Signed-off-by: default avatarKristian Nielsen <knielsen@knielsen-hq.org>
parent 3fee1b44
include/master-slave.inc
[connection master]
MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
connection server_1;
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
BEGIN;
COMMIT;
include/save_master_gtid.inc
connection server_2;
include/sync_with_master_gtid.inc
include/stop_slave.inc
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
set @@global.slave_parallel_threads= 5;
set @@global.slave_parallel_mode= conservative;
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
connection server_1;
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
SET @commit_id= 1+1000;
SET @commit_id= 2+1000;
SET @commit_id= 3+1000;
SET @commit_id= 4+1000;
SET @commit_id= 5+1000;
SET @commit_id= 6+1000;
SET @commit_id= 7+1000;
SET @commit_id= 8+1000;
SET @commit_id= 9+1000;
SET @commit_id= 10+1000;
SET SESSION debug_dbug= @old_dbug;
SELECT COUNT(*), SUM(a*100*b) FROM t1;
COUNT(*) SUM(a*100*b)
10 225000
include/save_master_gtid.inc
connection server_2;
include/start_slave.inc
include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug= @old_dbug;
SELECT COUNT(*), SUM(a*100*b) FROM t1;
COUNT(*) SUM(a*100*b)
10 225000
connection server_2;
include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
SET GLOBAL slave_parallel_mode=@old_parallel_mode;
include/start_slave.inc
connection server_1;
DROP TABLE t1;
include/rpl_end.inc
include/master-slave.inc
[connection master]
connection master;
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1(a INT) ENGINE=INNODB;
INSERT INTO t1 VALUES(1);
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= OPTIMISTIC;
set @@global.slave_transaction_retries= 2;
*** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
connection master;
SET @@gtid_seq_no= 100;
INSERT INTO t1 SELECT 1+a FROM t1;
SET @@gtid_seq_no= 200;
INSERT INTO t1 SELECT 2+a FROM t1;
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
include/save_master_gtid.inc
connection slave;
SET @save_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start";
SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200";
SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait";
SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill";
include/start_slave.inc
include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug= @save_dbug;
SELECT * FROM t1 ORDER BY a;
a
1
2
3
4
connection slave;
include/stop_slave.inc
SET @@global.slave_parallel_threads= 0;
SET @@global.slave_parallel_mode= optimistic;
SET @@global.slave_transaction_retries= 10;
include/start_slave.inc
connection master;
DROP TABLE t1;
include/rpl_end.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/master-slave.inc
--echo MDEV-31655: Parallel replication deadlock victim preference code erroneously removed
# The problem was that InnoDB would choose the wrong deadlock victim.
# Create a lot of transactions that can cause deadlocks, and use error
# injection to check that the first transactions in each group is never
# selected as deadlock victim.
--let $rows= 10
--let $transactions= 5
--let $gcos= 10
--connection server_1
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
BEGIN;
--disable_query_log
--let $i= 0
while ($i < 10) {
eval INSERT INTO t1 VALUES ($i, 0);
inc $i;
}
--enable_query_log
COMMIT;
--source include/save_master_gtid.inc
--connection server_2
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
eval set @@global.slave_parallel_threads= $transactions;
set @@global.slave_parallel_mode= conservative;
SET @old_dbug= @@GLOBAL.debug_dbug;
# This error injection will allow no retries for GTIDs divisible by 1000.
SET GLOBAL debug_dbug= "+d,rpl_mdev31655_zero_retries";
--connection server_1
SET @old_dbug= @@SESSION.debug_dbug;
SET SESSION debug_dbug="+d,binlog_force_commit_id";
--let $j= 1
while ($j <= $gcos) {
eval SET @commit_id= $j+1000;
--let $i= 0
while ($i < $transactions) {
--disable_query_log
eval SET SESSION gtid_seq_no= 1000 + 1000*$j + $i;
BEGIN;
--let $k= 0
while ($k < $rows) {
eval UPDATE t1 SET b=b+1 WHERE a=(($i+$k) MOD $rows);
inc $k;
}
COMMIT;
--enable_query_log
inc $i;
}
inc $j;
}
SET SESSION debug_dbug= @old_dbug;
SELECT COUNT(*), SUM(a*100*b) FROM t1;
--source include/save_master_gtid.inc
--connection server_2
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug= @old_dbug;
SELECT COUNT(*), SUM(a*100*b) FROM t1;
# Clean up.
--connection server_2
--source include/stop_slave.inc
SET GLOBAL slave_parallel_threads=@old_parallel_threads;
SET GLOBAL slave_parallel_mode=@old_parallel_mode;
--source include/start_slave.inc
--connection server_1
DROP TABLE t1;
--source include/rpl_end.inc
--source include/master-slave.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_binlog_format_statement.inc
--connection master
ALTER TABLE mysql.gtid_slave_pos ENGINE=InnoDB;
CREATE TABLE t1(a INT) ENGINE=INNODB;
INSERT INTO t1 VALUES(1);
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
--let $save_transaction_retries= `SELECT @@global.slave_transaction_retries`
--let $save_slave_parallel_threads= `SELECT @@global.slave_parallel_threads`
--let $save_slave_parallel_mode= `SELECT @@global.slave_parallel_mode`
set @@global.slave_parallel_threads= 2;
set @@global.slave_parallel_mode= OPTIMISTIC;
set @@global.slave_transaction_retries= 2;
--echo *** MDEV-28776: rpl.rpl_mark_optimize_tbl_ddl fails with timeout on sync_with_master
# This was a failure where a transaction T1 could deadlock multiple times
# with T2, eventually exceeding the default --slave-transaction-retries=10.
# Root cause was MDEV-31655, causing InnoDB to wrongly choose T1 as deadlock
# victim over T2. If thread scheduling is right, it was possible for T1 to
# repeatedly deadlock, roll back, and have time to grab an S lock again before
# T2 woke up and got its waiting X lock, thus repeating the same deadlock over
# and over.
# Once the bug is fixed, it is not possible to re-create the same execution
# and thread scheduling. Instead we inject small sleeps in a way that
# triggered the problem when the bug was there, to demonstrate that the
# problem no longer occurs.
--connection master
# T1
SET @@gtid_seq_no= 100;
INSERT INTO t1 SELECT 1+a FROM t1;
# T2
SET @@gtid_seq_no= 200;
INSERT INTO t1 SELECT 2+a FROM t1;
SELECT * FROM t1 ORDER BY a;
--source include/save_master_gtid.inc
--connection slave
SET @save_dbug= @@GLOBAL.debug_dbug;
# Inject various delays to hint thread scheduling to happen in the way that
# triggered MDEV-28776.
# Small delay starting T1 so it will be the youngest trx and be chosen over
# T2 as the deadlock victim by default in InnoDB.
SET GLOBAL debug_dbug="+d,rpl_parallel_delay_gtid_0_x_100_start";
# Small delay before taking insert X lock to give time for both T1 and T2 to
# get the S lock first and cause a deadlock.
SET GLOBAL debug_dbug="+d,rpl_write_record_small_sleep_gtid_100_200";
# Small delay after T2's wait on the X lock, to give time for T1 retry to
# re-aquire the T1 S lock first.
SET GLOBAL debug_dbug="+d,small_sleep_after_lock_wait";
# Delay deadlock kill of T2.
SET GLOBAL debug_dbug="+d,rpl_delay_deadlock_kill";
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
SET GLOBAL debug_dbug= @save_dbug;
SELECT * FROM t1 ORDER BY a;
# Cleanup.
--connection slave
--source include/stop_slave.inc
eval SET @@global.slave_parallel_threads= $save_slave_parallel_threads;
eval SET @@global.slave_parallel_mode= $save_slave_parallel_mode;
eval SET @@global.slave_transaction_retries= $save_transaction_retries;
--source include/start_slave.inc
--connection master
DROP TABLE t1;
--source include/rpl_end.inc
......@@ -1301,6 +1301,11 @@ handle_rpl_parallel_thread(void *arg)
bool did_enter_cond= false;
PSI_stage_info old_stage;
DBUG_EXECUTE_IF("rpl_parallel_delay_gtid_0_x_100_start", {
if (rgi->current_gtid.domain_id==0 &&
rgi->current_gtid.seq_no == 100)
my_sleep(10000);
});
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("hold_worker_on_schedule", {
if (rgi->current_gtid.domain_id == 0 &&
......@@ -1480,8 +1485,13 @@ handle_rpl_parallel_thread(void *arg)
err= dbug_simulate_tmp_error(rgi, thd););
if (unlikely(err))
{
ulong max_retries= slave_trans_retries;
convert_kill_to_deadlock_error(rgi);
if (has_temporary_error(thd) && slave_trans_retries > 0)
DBUG_EXECUTE_IF("rpl_mdev31655_zero_retries",
if ((rgi->current_gtid.seq_no % 1000) == 0)
max_retries= 0;
);
if (has_temporary_error(thd) && max_retries > 0)
err= retry_event_group(rgi, rpt, qev);
}
}
......
......@@ -506,6 +506,7 @@ static void bg_rpl_load_gtid_slave_state(void *)
static void bg_slave_kill(void *victim)
{
THD *to_kill= (THD *)victim;
DBUG_EXECUTE_IF("rpl_delay_deadlock_kill", my_sleep(1500000););
to_kill->awake(KILL_CONNECTION);
mysql_mutex_lock(&to_kill->LOCK_wakeup_ready);
to_kill->rgi_slave->killed_for_retry= rpl_group_info::RETRY_KILL_KILLED;
......
......@@ -5516,6 +5516,49 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
return 0;
}
/*
If the storage engine detects a deadlock, and needs to choose a victim
transaction to roll back, it can call this function to ask the upper
server layer for which of two possible transactions is prefered to be
aborted and rolled back.
In parallel replication, if two transactions are running in parallel and
one is fixed to commit before the other, then the one that commits later
will be prefered as the victim - chosing the early transaction as a victim
will not resolve the deadlock anyway, as the later transaction still needs
to wait for the earlier to commit.
The return value is -1 if the first transaction is prefered as a deadlock
victim, 1 if the second transaction is prefered, or 0 for no preference (in
which case the storage engine can make the choice as it prefers).
*/
extern "C" int
thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2)
{
rpl_group_info *rgi1, *rgi2;
if (!thd1 || !thd2)
return 0;
/*
If the transactions are participating in the same replication domain in
parallel replication, then request to select the one that will commit
later (in the fixed commit order from the master) as the deadlock victim.
*/
rgi1= thd1->rgi_slave;
rgi2= thd2->rgi_slave;
if (rgi1 && rgi2 &&
rgi1->is_parallel_exec &&
rgi1->rli == rgi2->rli &&
rgi1->current_gtid.domain_id == rgi2->current_gtid.domain_id)
return rgi1->gtid_sub_id < rgi2->gtid_sub_id ? 1 : -1;
/* No preferences, let the storage engine decide. */
return 0;
}
extern "C" int thd_non_transactional_update(const MYSQL_THD thd)
{
return(thd->transaction->all.modified_non_trans_table);
......
......@@ -80,6 +80,7 @@
#include "debug_sync.h" // DEBUG_SYNC
#include "debug.h" // debug_crash_here
#include <my_bit.h>
#include "rpl_rli.h"
#ifdef WITH_WSREP
#include "wsrep_trans_observer.h" /* wsrep_start_transction() */
......@@ -1828,6 +1829,12 @@ int write_record(THD *thd, TABLE *table, COPY_INFO *info, select_result *sink)
save_read_set= table->read_set;
save_write_set= table->write_set;
DBUG_EXECUTE_IF("rpl_write_record_small_sleep_gtid_100_200",
{
if (thd->rgi_slave && (thd->rgi_slave->current_gtid.seq_no == 100 ||
thd->rgi_slave->current_gtid.seq_no == 200))
my_sleep(20000);
});
if (info->handle_duplicates == DUP_REPLACE ||
info->handle_duplicates == DUP_UPDATE)
{
......
......@@ -63,6 +63,7 @@ ulong innodb_deadlock_report;
extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
#endif
/** Functor for accessing the embedded node within a table lock. */
......@@ -1958,6 +1959,14 @@ dberr_t lock_wait(que_thr_t *thr)
end_wait:
mysql_mutex_unlock(&lock_sys.wait_mutex);
DBUG_EXECUTE_IF("small_sleep_after_lock_wait",
{
if (!(type_mode & LOCK_TABLE) &&
(type_mode & LOCK_MODE_MASK) == LOCK_X &&
trx->error_state != DB_DEADLOCK && !trx_is_interrupted(trx)) {
my_sleep(20000);
}
});
thd_wait_end(trx->mysql_thd);
return trx->error_state;
......@@ -6309,6 +6318,28 @@ namespace Deadlock
}
}
ATTRIBUTE_COLD
/** Calculate a number used to compare deadlock victim candidates.
Bit 62 is used to prefer transaction that did not modified non-transactional
tables. Bits 1-61 are set to TRX_WEIGHT to prefer transactions with less locks
and less modified rows. Bit 0 is used to prefer orig_trx in case of a tie.
@param trx Transaction
@return a 64-bit unsigned, the lower the more preferred TRX is as a deadlock
victim */
static undo_no_t calc_victim_weight(trx_t *trx, const trx_t *orig_trx)
{
const undo_no_t trx_weight= (trx != orig_trx) | (TRX_WEIGHT(trx) << 1) |
(trx->mysql_thd &&
#ifdef WITH_WSREP
(thd_has_edited_nontrans_tables(trx->mysql_thd) ||
(trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false)))
#else
thd_has_edited_nontrans_tables(trx->mysql_thd)
#endif /* WITH_WSREP */
? 1ULL << 62 : 0);
return trx_weight;
}
ATTRIBUTE_COLD
/** Report a deadlock (cycle in the waits-for graph).
@param trx transaction waiting for a lock in this thread
......@@ -6332,24 +6363,7 @@ namespace Deadlock
static const char rollback_msg[]= "*** WE ROLL BACK TRANSACTION (%u)\n";
char buf[9 + sizeof rollback_msg];
/* If current_trx=true, trx is owned by this thread, and we can
safely invoke these without holding trx->mutex or lock_sys.latch.
If current_trx=false, a concurrent commit is protected by both
lock_sys.latch and lock_sys.wait_mutex. */
const undo_no_t trx_weight= TRX_WEIGHT(trx) |
(trx->mysql_thd &&
#ifdef WITH_WSREP
(thd_has_edited_nontrans_tables(trx->mysql_thd) ||
(trx->is_wsrep() && wsrep_thd_is_BF(trx->mysql_thd, false)))
#else
thd_has_edited_nontrans_tables(trx->mysql_thd)
#endif /* WITH_WSREP */
? 1ULL << 63 : 0);
trx_t *victim= nullptr;
undo_no_t victim_weight= ~0ULL;
unsigned victim_pos= 0, trx_pos= 0;
/* Here, lock elision does not make sense, because
for the output we are going to invoke system calls,
......@@ -6362,43 +6376,52 @@ namespace Deadlock
}
{
unsigned l= 0;
unsigned l= 1;
/* Now that we are holding lock_sys.wait_mutex again, check
whether a cycle still exists. */
trx_t *cycle= find_cycle(trx);
if (!cycle)
goto func_exit; /* One of the transactions was already aborted. */
victim= cycle;
undo_no_t victim_weight= calc_victim_weight(victim, trx);
unsigned victim_pos= l;
for (trx_t *next= cycle;;)
{
next= next->lock.wait_trx;
l++;
const undo_no_t next_weight= TRX_WEIGHT(next) |
(next->mysql_thd &&
#ifdef WITH_WSREP
(thd_has_edited_nontrans_tables(next->mysql_thd) ||
(next->is_wsrep() && wsrep_thd_is_BF(next->mysql_thd, false)))
const undo_no_t next_weight= calc_victim_weight(next, trx);
#ifdef HAVE_REPLICATION
const int pref=
thd_deadlock_victim_preference(victim->mysql_thd, next->mysql_thd);
/* Set bit 63 for any non-preferred victim to make such preference take
priority in the weight comparison.
-1 means victim is preferred. 1 means next is preferred. */
undo_no_t victim_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)(-pref);
undo_no_t next_not_pref= (1ULL << 63) & (undo_no_t)(int64_t)pref;
#else
thd_has_edited_nontrans_tables(next->mysql_thd)
#endif /* WITH_WSREP */
? 1ULL << 63 : 0);
if (next_weight < victim_weight)
undo_no_t victim_not_pref= 0;
undo_no_t next_not_pref= 0;
#endif
/* Single comparison to decide which of two transactions is preferred
as a deadlock victim.
- If thd_deadlock_victim_preference() returned non-zero, bit 63
comparison will decide the preferred one.
- Else if exactly one of them modified non-transactional tables,
bit 62 will decide.
- Else the TRX_WEIGHT in bits 1-61 will decide, if not equal.
- Else, if one of them is the original trx, bit 0 will decide.
- If all is equal, previous victim will arbitrarily be chosen. */
if ((next_weight|next_not_pref) < (victim_weight|victim_not_pref))
{
victim_weight= next_weight;
victim= next;
victim_pos= l;
}
if (next == victim)
trx_pos= l;
if (next == cycle)
break;
}
if (trx_pos && trx_weight == victim_weight)
{
victim= trx;
victim_pos= trx_pos;
}
/* Finally, display the deadlock */
switch (const auto r= static_cast<enum report>(innodb_deadlock_report)) {
case REPORT_OFF:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment