Commit ec05fea0 authored by Kristian Nielsen

MDEV-6549, failing to update gtid_slave_pos for a transaction that was retried.

The bug was that in some cases, if a replicated transaction was rolled back
due to deadlock, during the subsequent retry of that transaction, the
gtid_slave_pos would _not_ be updated with the new GTID, leaving the GTID
position of the slave incorrect.

Fix this by ensuring during the retry that we clear the flag that marks that
the GTID has already been recorded in gtid_slave_pos, so that the update of
gtid_slave_pos will be done again during the retry.

In the original bug, the symptom was an assertion due to OPTION_GTID_BEGIN not
being cleared during the retry of the transaction. The reason was some code in
handling of a COMMIT query event, which would not clear the flag when not
recording a GTID in gtid_slave_pos. This commit also fixes that code to always
clear the OPTION_GTID_BEGIN flag for clarity, though it is actually not
possible for OPTION_GTID_BEGIN to become set unless a GTID is pending for
update (after fixing the bug described above).
parent 354f3f1f
...@@ -793,6 +793,7 @@ SET debug_sync='now WAIT_FOR master_queued2'; ...@@ -793,6 +793,7 @@ SET debug_sync='now WAIT_FOR master_queued2';
SET debug_sync='now SIGNAL master_cont1'; SET debug_sync='now SIGNAL master_cont1';
SET debug_sync='RESET'; SET debug_sync='RESET';
include/start_slave.inc include/start_slave.inc
include/stop_slave.inc
SELECT * FROM t4 ORDER BY a; SELECT * FROM t4 ORDER BY a;
a b a b
1 NULL 1 NULL
...@@ -801,6 +802,42 @@ a b ...@@ -801,6 +802,42 @@ a b
5 NULL 5 NULL
6 6 6 6
7 NULL 7 NULL
DELETE FROM t4;
INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
UPDATE t4 SET b=NULL WHERE a=6;
SET debug_sync='now WAIT_FOR master_queued1';
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
DELETE FROM t4 WHERE b <= 1;
SET debug_sync='now WAIT_FOR master_queued2';
SET debug_sync='now SIGNAL master_cont1';
SET @old_format=@@GLOBAL.binlog_format;
SET debug_sync='RESET';
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,disable_thd_need_ordering_with";
include/start_slave.inc
SET GLOBAL debug_dbug=@old_dbug;
SELECT * FROM t4 ORDER BY a;
a b
1 NULL
2 2
3 NULL
4 4
5 NULL
6 NULL
SET @last_gtid= 'GTID';
SELECT IF(@@gtid_slave_pos LIKE CONCAT('%',@last_gtid,'%'), "GTID found ok",
CONCAT("GTID ", @last_gtid, " not found in gtid_slave_pos=", @@gtid_slave_pos))
AS result;
result
GTID found ok
SELECT "ROW FOUND" AS `Is the row found?`
FROM mysql.gtid_slave_pos
WHERE CONCAT(domain_id, "-", server_id, "-", seq_no) = @last_gtid;
Is the row found?
ROW FOUND
*** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication *** *** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication ***
include/stop_slave.inc include/stop_slave.inc
SET GLOBAL slave_parallel_threads=1; SET GLOBAL slave_parallel_threads=1;
......
...@@ -1246,8 +1246,76 @@ SET debug_sync='RESET'; ...@@ -1246,8 +1246,76 @@ SET debug_sync='RESET';
--connection server_2 --connection server_2
--source include/start_slave.inc --source include/start_slave.inc
--sync_with_master --sync_with_master
--source include/stop_slave.inc
SELECT * FROM t4 ORDER BY a;
# MDEV-6549, failing to update gtid_slave_pos for a transaction that was retried.
# The problem was that when a transaction updates the mysql.gtid_slave_pos
# table, it clears the flag that marks that there is a GTID position that
# needs to be updated. Then, if the transaction got killed after that due
# to a deadlock, the subsequent retry would fail to notice that the GTID needs
# to be recorded in gtid_slave_pos.
#
# (In the original bug report, the symptom was an assertion; this was however
# just a side effect of the missing update of gtid_slave_pos, which also
# happened to cause a missing clear of OPTION_GTID_BEGIN).
--connection server_1
# Reset t4 to a known state on the master before building the test case.
DELETE FROM t4;
INSERT INTO t4 VALUES (1,NULL), (2,2), (3,NULL), (4,4), (5, NULL), (6, 6);
# Create two transactions that can run in parallel on the slave but cause
# a deadlock if the second runs before the first.
--connection con1
# First transaction: at the sync point it signals master_queued1 and then
# waits for master_cont1. It is sent asynchronously and reaped further down.
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued1 WAIT_FOR master_cont1';
send UPDATE t4 SET b=NULL WHERE a=6;
--connection server_1
SET debug_sync='now WAIT_FOR master_queued1';
--connection con2
# Must use statement-based binlogging. Otherwise the transaction will not be
# binlogged at all, as it modifies no rows.
SET @old_format= @@SESSION.binlog_format;
SET binlog_format='statement';
# Second transaction: signals master_queued2 at the same sync point.
SET debug_sync='commit_after_release_LOCK_prepare_ordered SIGNAL master_queued2';
send DELETE FROM t4 WHERE b <= 1;
--connection server_1
# Once the second transaction has reached the sync point, release the first.
SET debug_sync='now WAIT_FOR master_queued2';
SET debug_sync='now SIGNAL master_cont1';
--connection con1
REAP;
--connection con2
REAP;
# NOTE(review): this overwrites @old_format with the GLOBAL binlog_format
# instead of restoring the session value saved above, so con2 stays in
# statement format. Presumably "SET binlog_format=@old_format" was intended;
# confirm before changing, and note that the recorded .result file echoes
# this statement and would have to be updated together with it.
SET @old_format=@@GLOBAL.binlog_format;
SET debug_sync='RESET';
# Remember the master position and the GTID of the last transaction issued on
# this connection (con2), for the slave-side checks.
--save_master_pos
--let $last_gtid= `SELECT @@last_gtid`
--connection server_2
# Use DBUG to disable the usual skip of gap locks for transactions that are
# run in parallel. This allows the deadlock to occur, which in turn triggers
# a retry of the second transaction and so exercises the code that was buggy
# and caused the gtid_slave_pos update to be skipped in the retry.
SET @old_dbug= @@GLOBAL.debug_dbug;
SET GLOBAL debug_dbug="+d,disable_thd_need_ordering_with";
--source include/start_slave.inc
--sync_with_master
# Restore the DBUG setting saved above.
SET GLOBAL debug_dbug=@old_dbug;
SELECT * FROM t4 ORDER BY a; SELECT * FROM t4 ORDER BY a;
# Check that the GTID of the second transaction was correctly recorded in
# gtid_slave_pos, in the variable as well as in the table.
--replace_result $last_gtid GTID
eval SET @last_gtid= '$last_gtid';
SELECT IF(@@gtid_slave_pos LIKE CONCAT('%',@last_gtid,'%'), "GTID found ok",
CONCAT("GTID ", @last_gtid, " not found in gtid_slave_pos=", @@gtid_slave_pos))
AS result;
SELECT "ROW FOUND" AS `Is the row found?`
FROM mysql.gtid_slave_pos
WHERE CONCAT(domain_id, "-", server_id, "-", seq_no) = @last_gtid;
--echo *** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication *** --echo *** MDEV-5938: Exec_master_log_pos not updated at log rotate in parallel replication ***
......
...@@ -4265,28 +4265,31 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi, ...@@ -4265,28 +4265,31 @@ int Query_log_event::do_apply_event(rpl_group_info *rgi,
Record any GTID in the same transaction, so slave state is Record any GTID in the same transaction, so slave state is
transactionally consistent. transactionally consistent.
*/ */
if (current_stmt_is_commit && rgi->gtid_pending) if (current_stmt_is_commit)
{ {
sub_id= rgi->gtid_sub_id;
rgi->gtid_pending= false;
gtid= rgi->current_gtid;
thd->variables.option_bits&= ~OPTION_GTID_BEGIN; thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
if (rpl_global_gtid_slave_state.record_gtid(thd, &gtid, sub_id, true, false)) if (rgi->gtid_pending)
{ {
int errcode= thd->get_stmt_da()->sql_errno(); sub_id= rgi->gtid_sub_id;
if (!is_parallel_retry_error(rgi, errcode)) rgi->gtid_pending= false;
rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
rgi->gtid_info(), gtid= rgi->current_gtid;
"Error during COMMIT: failed to update GTID state in " if (rpl_global_gtid_slave_state.record_gtid(thd, &gtid, sub_id, true, false))
"%s.%s: %d: %s", {
"mysql", rpl_gtid_slave_state_table_name.str, int errcode= thd->get_stmt_da()->sql_errno();
errcode, if (!is_parallel_retry_error(rgi, errcode))
thd->get_stmt_da()->message()); rli->report(ERROR_LEVEL, ER_CANNOT_UPDATE_GTID_STATE,
trans_rollback(thd); rgi->gtid_info(),
sub_id= 0; "Error during COMMIT: failed to update GTID state in "
thd->is_slave_error= 1; "%s.%s: %d: %s",
goto end; "mysql", rpl_gtid_slave_state_table_name.str,
errcode,
thd->get_stmt_da()->message());
trans_rollback(thd);
sub_id= 0;
thd->is_slave_error= 1;
goto end;
}
} }
} }
......
...@@ -318,6 +318,15 @@ do_retry: ...@@ -318,6 +318,15 @@ do_retry:
thd->wait_for_commit_ptr->unregister_wait_for_prior_commit(); thd->wait_for_commit_ptr->unregister_wait_for_prior_commit();
rgi->cleanup_context(thd, 1); rgi->cleanup_context(thd, 1);
/*
If we retry due to a deadlock kill that occurred during the commit step, we
might have already performed (but not committed) an update of table
mysql.gtid_slave_pos, and cleared the gtid_pending flag. Now we have
rolled back any such update, so we must set the gtid_pending flag back to
true so that we will do a new update when/if we succeed with the retry.
*/
rgi->gtid_pending= true;
mysql_mutex_lock(&rli->data_lock); mysql_mutex_lock(&rli->data_lock);
++rli->retried_trans; ++rli->retried_trans;
statistic_increment(slave_retried_transactions, LOCK_status); statistic_increment(slave_retried_transactions, LOCK_status);
......
...@@ -4346,6 +4346,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd) ...@@ -4346,6 +4346,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
{ {
rpl_group_info *rgi, *other_rgi; rpl_group_info *rgi, *other_rgi;
DBUG_EXECUTE_IF("disable_thd_need_ordering_with", return 1;);
if (!thd || !other_thd) if (!thd || !other_thd)
return 1; return 1;
rgi= thd->rgi_slave; rgi= thd->rgi_slave;
...@@ -4361,7 +4362,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd) ...@@ -4361,7 +4362,7 @@ thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd)
if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id) if (!rgi->commit_id || rgi->commit_id != other_rgi->commit_id)
return 1; return 1;
/* /*
These two threads are doing parallel replication within the same Otherwise, these two threads are doing parallel replication within the same
replication domain. Their commit order is already fixed, so we do not need replication domain. Their commit order is already fixed, so we do not need
gap locks or similar to otherwise enforce ordering (and in fact such locks gap locks or similar to otherwise enforce ordering (and in fact such locks
could lead to unnecessary deadlocks and transaction retry). could lead to unnecessary deadlocks and transaction retry).
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment