Commit 68cfcf9c authored by sjaakola's avatar sjaakola Committed by Jan Lindström

MDEV-29512 deadlock between commit monitor and THD::LOCK_thd_data mutex

This commit contains only a mtr test for reproducing the issue in MDEV-29512
The actual fix will be pushed in wsrep-lib repository

The hanging in MDEV-29512 happens when binlog purging is attempted, and there is
one local BF aborted transaction waiting for commit monitor.

The test will launch two node cluster and enable binlogging with expire log days,
to force binlog purging to happen.
A local transaction is executed so that will become BF abort victim, and has advanced
to replication stage waiting for commit monitor for final cleanup (to mark position in innodb)
after that, applier is released to complete the BF abort and due to binlog configuration,
starting the binlog purging. This is where the hanging would occur, if code is buggy
Reviewed-by: default avatarJan Lindström <jan.lindstrom@mariadb.com>
parent cd97523d
connection node_2;
connection node_1;
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000));
INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200));
INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200));
SET SESSION wsrep_sync_wait=0;
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1a;
SET SESSION wsrep_sync_wait=0;
connection node_1;
begin;
select f1,f2 from t1;
f1 f2
1 0
3 3
connection node_2;
UPDATE t1 SET f2=2 WHERE f1=3;
connection node_1a;
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
connection node_1;
UPDATE t1 SET f2=1 WHERE f1=3;
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_master_enter_sync';
COMMIT;
connection node_1a;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_master_enter_sync';
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
connection node_1;
ERROR 40001: Deadlock found when trying to get lock; try restarting transaction
select f1,f2 from t1;
f1 f2
1 0
3 2
DROP TABLE t1;
!include ../galera_2nodes.cnf
[mysqld]
log-bin
log-slave-updates
[mysqld.1]
log_bin
log_slave_updates
max-binlog-size=4096
expire-logs-days=1
[mysqld.2]
#
# This test is for reproducing the issue in:
# https://jira.mariadb.org/browse/MDEV-29512
#
# The hanging in MDEV-29512 happens when binlog purging is attempted, and there is
# one local BF aborted transaction waiting for commit monitor.
#
# The test will launch two node cluster and enable binlogging with expire log days,
# to force binlog purging to happen.
# A local transaction is executed so that will become BF abort victim, and has advanced
# to replication stage waiting for commit monitor for final cleanup (to mark position in innodb)
# after that, applier is released to complete the BF abort and due to binlog configuration,
# starting the binlog purging. This is where the hanging would occur, if code is buggy
#
--source include/galera_cluster.inc
--source include/have_innodb.inc
--source include/have_debug_sync.inc
--source include/galera_have_debug_sync.inc
#
# binlog size is limited to 4096 bytes, we will create enough events to
# cause binlog rotation
#
CREATE TABLE t1 (f1 INTEGER PRIMARY KEY, f2 int, f3 varchar(2000));
INSERT INTO t1 VALUES (1, 0, REPEAT('1234567890', 200));
INSERT INTO t1 VALUES (3, 3, REPEAT('1234567890', 200));
SET SESSION wsrep_sync_wait=0;
# set sync point for replication applier
SET GLOBAL DEBUG_DBUG = "d,sync.wsrep_apply_cb";
# Control connection to manage sync points for appliers
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1a
SET SESSION wsrep_sync_wait=0;
# starting local transaction, only select so far,
# write will happen later and this will be ordered after the transaction in node_2
--connection node_1
begin;
select f1,f2 from t1;
# send from node 2 an UPDATE transaction, which will BF abort the transaction in node_1
--connection node_2
--let $wait_condition=select count(*)=2 from t1
--source include/wait_condition.inc
UPDATE t1 SET f2=2 WHERE f1=3;
--connection node_1a
# wait to see the UPDATE from node_2 in apply_cb sync point
SET SESSION DEBUG_SYNC = "now WAIT_FOR sync.wsrep_apply_cb_reached";
--connection node_1
# now issuing conflicting update
UPDATE t1 SET f2=1 WHERE f1=3;
# Block the local commit, send final COMMIT and wait until it gets blocked
--let $galera_sync_point = commit_monitor_master_enter_sync
--source include/galera_set_sync_point.inc
--send COMMIT
--connection node_1a
# wait for the local commit to enter in commit monitor wait state
--let $galera_sync_point = commit_monitor_master_enter_sync
--source include/galera_wait_sync_point.inc
--source include/galera_clear_sync_point.inc
# release the local transaction to continue with commit
--let $galera_sync_point = commit_monitor_master_enter_sync
--source include/galera_signal_sync_point.inc
# and now release the applier, it should force local trx to abort
SET GLOBAL DEBUG_DBUG = "";
SET DEBUG_SYNC = "now SIGNAL signal.wsrep_apply_cb";
SET GLOBAL debug_dbug = NULL;
SET debug_sync='RESET';
--connection node_1
--error ER_LOCK_DEADLOCK
--reap
# wait until applying is complete
--let $wait_condition = SELECT COUNT(*)=1 FROM t1 WHERE f2=2
--source include/wait_condition.inc
# final read to verify what we got
select f1,f2 from t1;
DROP TABLE t1;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment