Commit 946a07e8 authored by Andrei Elkin's avatar Andrei Elkin Committed by Monty

Fix for MDEV-9670 server_id mysteriously set to 0

Problem was that in a circular replication setup the master remembers
position to events it has generated itself when reading from a slave.
If there are no new events in the queue from the slave, a
Gtid_list_log_event is generated to remember the last skipped event.
The problem happens if there is a network delay and we generate a
Gtid_list_log_event in the middle of the transaction, in which case there
will be an implicit comment and a new transaction with serverid=0 will be
logged.

The fix was to not generate any Gtid_list_log_events in the middle of a
transaction.
parent 46d6f74c
include/rpl_init.inc [topology=1->2->1]
include/rpl_connect.inc [creating M4]
include/rpl_connect.inc [creating M2]
SET @old_debug= @@global.debug;
STOP SLAVE;
SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
START SLAVE IO_THREAD;
include/wait_for_slave_io_to_start.inc
CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
INSERT INTO `t1` VALUES (null, repeat('a', 1024)), (null, repeat('b', 1024));
SET debug_sync='now WAIT_FOR start_sql_thread';
START SLAVE SQL_THREAD;
include/show_binlog_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-bin.000001 # Gtid # # GTID #-#-#
slave-bin.000001 # Query # # use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
SET debug_sync='now SIGNAL go_on_relay_logging';
include/show_binlog_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-bin.000001 # Gtid # # GTID #-#-#
slave-bin.000001 # Query # # use `test`; CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb
slave-bin.000001 # Gtid # # BEGIN GTID #-#-#
slave-bin.000001 # Table_map # # table_id: # (test.t1)
slave-bin.000001 # Write_rows_v1 # # table_id: #
slave-bin.000001 # Write_rows_v1 # # table_id: # flags: STMT_END_F
slave-bin.000001 # Xid # # COMMIT /* XID */
drop table t1;
SET GLOBAL debug_dbug= @old_debug;
SET debug_sync='RESET';
include/rpl_end.inc
!include ../my.cnf
[mysqld.1]
gtid-domain-id=4
server-id=4
#
log-slave-updates
slave-parallel-threads=0
gtid-strict-mode=1
gtid-ignore-duplicates=1
#
# Max-size row events to minimum with the idea to create
# a number of Rows_log_event per Query.
#
binlog-row-event-max-size=1024
[mysqld.2]
gtid-domain-id=2
server-id=2
#
log-slave-updates
slave-parallel-threads=0
gtid-strict-mode=1
gtid-ignore-duplicates=1
binlog-row-event-max-size=1024
# The slave will be initialized with a @@global.dbug-var value
skip-slave-start=1
#
# Testing chain/circular replication scenario of MDEV-9670
# The effect of the bug was that we got a commit with a GTID with server_id
#
--source include/have_binlog_format_row.inc
--source include/have_innodb.inc
--source include/have_debug.inc
--source include/have_debug_sync.inc
--let $rpl_topology= 1->2->1
--source include/rpl_init.inc
--let $rpl_connection_name= M4
--let $rpl_server_number= 1
--source include/rpl_connect.inc
--let $rpl_connection_name= M2
--let $rpl_server_number= 2
--source include/rpl_connect.inc
# The parameter reflects binlog-row-event-max-size @cnf.
--let $row_size=1024
SET @old_debug= @@global.debug;
--connection M2
STOP SLAVE;
SET GLOBAL debug_dbug= "+d,dbug.rows_events_to_delay_relay_logging";
START SLAVE IO_THREAD;
--source include/wait_for_slave_io_to_start.inc
--connection M2
# This query also creates a Gtid event group whose Gtid will remain in
# ignored status for too long causing a following group split.
CREATE TABLE t1 (a INT AUTO_INCREMENT PRIMARY KEY, b VARCHAR(30000)) ENGINE=innodb;
--sync_slave_with_master M4
# This INSERT will be logged as two Write_log events which the buggy
# slave applier would split.
--connection M4
eval INSERT INTO `t1` VALUES (null, repeat('a', $row_size)), (null, repeat('b', $row_size));
# START M2 IO thread and wait for its signal to follow with the SQL
# thread start. At this moment the SQL thread shall be having 2 and
# "half" groups to execute. The "hafl" one would be committed by the
# buggy applier after which the IO is released to queue the rest of
# the 3rd group which the SQL thread commits separately to complete
# the split.
--connection M2
# wait for IO signal to start the SQL thread. IO will be hanging upon that.
SET debug_sync='now WAIT_FOR start_sql_thread';
# Now the slave server has relay log whose last group is incomplete.
# An unfixed slave server would go to "insert" a "fake"
# Gtid_list_log_event event which actually would commit the incomplete
# group. However before to actual commit do_apply_event() hits some assert.
# In the fixed server the fake Gtid_list_log_event is *not* inserted
# in the middle of a group.
START SLAVE SQL_THREAD;
# Sleep for a little time to give SQL thread a chance to commit while
# the IO thread is hanging (see
# DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging"...) in
# queue_event). Alternatively to reproduce the case when buggy slave
# wait for the 1st group commit
#--let $count= 1
#--let $table= t1
#--source include/wait_until_rows_count.inc
--sleep 2
# Demonstrate either no split group in the correct slave or the 1nd
# group in the buggy one
--source include/show_binlog_events.inc
# Release the IO thread
SET debug_sync='now SIGNAL go_on_relay_logging';
# Sync servers
--sync_slave_with_master M4
--connection M4
--sync_slave_with_master M2
--connection M2
# Demonstrate replication goes correctly not to create any split, or
# the 2nd group in the buggy slave
--source include/show_binlog_events.inc
#
# Cleanup
#
--connection M4
drop table t1;
--connection M2
SET GLOBAL debug_dbug= @old_debug;
SET debug_sync='RESET';
--source include/rpl_end.inc
......@@ -2363,6 +2363,7 @@ static void write_ignored_events_info_to_relay_log(THD *thd, Master_info *mi)
}
if (rli->ign_gtids.count())
{
DBUG_ASSERT(!rli->is_in_group()); // Ensure no active transaction
glev= new Gtid_list_log_event(&rli->ign_gtids,
Gtid_list_log_event::FLAG_IGN_GTIDS);
rli->ign_gtids.reset();
......@@ -5317,7 +5318,9 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
bool gtid_skip_enqueue= false;
bool got_gtid_event= false;
rpl_gtid event_gtid;
#ifndef DBUG_OFF
static uint dbug_rows_event_count= 0;
#endif
/*
FD_q must have been prepared for the first R_a event
inside get_master_version_and_clock()
......@@ -5384,6 +5387,26 @@ static int queue_event(Master_info* mi,const char* buf, ulong event_len)
(uchar)buf[EVENT_TYPE_OFFSET] != FORMAT_DESCRIPTION_EVENT /* a way to escape */)
DBUG_RETURN(queue_old_event(mi,buf,event_len));
#ifdef ENABLED_DEBUG_SYNC
/*
A (+d,dbug.rows_events_to_delay_relay_logging)-test is supposed to
create a few Write_log_events and after receiving the 1st of them
the IO thread signals to launch the SQL thread, and sets itself to
wait for a release signal.
*/
DBUG_EXECUTE_IF("dbug.rows_events_to_delay_relay_logging",
if ((buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT_V1 ||
buf[EVENT_TYPE_OFFSET] == WRITE_ROWS_EVENT) &&
++dbug_rows_event_count == 2)
{
const char act[]=
"now SIGNAL start_sql_thread "
"WAIT_FOR go_on_relay_logging";
DBUG_ASSERT(debug_sync_service);
DBUG_ASSERT(!debug_sync_set_action(current_thd,
STRING_WITH_LEN(act)));
};);
#endif
mysql_mutex_lock(&mi->data_lock);
switch ((uchar)buf[EVENT_TYPE_OFFSET]) {
......@@ -6588,9 +6611,12 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
DBUG_RETURN(ev);
}
if (rli->ign_gtids.count())
if (rli->ign_gtids.count() && !rli->is_in_group())
{
/* We generate and return a Gtid_list, to update gtid_slave_pos. */
/*
We generate and return a Gtid_list, to update gtid_slave_pos,
unless being in the middle of a group.
*/
DBUG_PRINT("info",("seeing ignored end gtids"));
ev= new Gtid_list_log_event(&rli->ign_gtids,
Gtid_list_log_event::FLAG_IGN_GTIDS);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment