Commit d69e8357 authored by Brandon Nesterenko's avatar Brandon Nesterenko

MDEV-29639: Seconds_Behind_Master is incorrect for Delayed, Parallel Replicas

Problem
========
On a parallel, delayed replica, Seconds_Behind_Master will not be
calculated until after MASTER_DELAY seconds have passed and the
event has finished executing, resulting in potentially very large
values of Seconds_Behind_Master (which could be much larger than the
MASTER_DELAY parameter) for the entire duration the event is
delayed. This contradicts the documented MASTER_DELAY behavior,
which specifies how many seconds to withhold replicated events from
execution.

Solution
========
After a parallel replica idles, the first event after idling should
immediately update last_master_timestamp with the time that it began
execution on the primary.

Reviewed By
===========
Andrei Elkin <andrei.elkin@mariadb.com>
parent 2ed598ea
include/master-slave.inc
[connection master]
connection slave;
include/stop_slave.inc
change master to master_delay=3, master_use_gtid=Slave_Pos;
set @@GLOBAL.slave_parallel_threads=2;
include/start_slave.inc
connection master;
create table t1 (a int);
include/sync_slave_sql_with_master.inc
#
# Pt 1) Ensure SBM is updated immediately upon arrival of the next event
# Lock t1 on slave so the first received transaction does not complete/commit
connection slave;
LOCK TABLES t1 WRITE;
connection master;
# Sleep 2 to allow a buffer between events for SBM check
insert into t1 values (0);
include/save_master_gtid.inc
connection slave;
# Waiting for transaction to arrive on slave and begin SQL Delay..
# Validating SBM is updated on event arrival..
# ..done
connection slave;
UNLOCK TABLES;
include/sync_with_master_gtid.inc
#
# Pt 2) If the SQL thread has not entered an idle state, ensure
# following events do not update SBM
# Stop slave IO thread so it receives both events together on restart
connection slave;
include/stop_slave_io.inc
connection master;
# Sleep 2 to allow a buffer between events for SBM check
insert into t1 values (1);
# Sleep 3 to create gap between events
insert into t1 values (2);
connection slave;
LOCK TABLES t1 WRITE;
START SLAVE IO_THREAD;
# Wait for first transaction to complete SQL delay and begin execution..
# Validate SBM calculation doesn't use the second transaction because SQL thread shouldn't have gone idle..
# ..and that SBM wasn't calculated using prior committed transactions
# ..done
connection slave;
UNLOCK TABLES;
#
# Cleanup
# Reset master_delay
include/stop_slave.inc
CHANGE MASTER TO master_delay=0;
set @@GLOBAL.slave_parallel_threads=4;
include/start_slave.inc
connection master;
DROP TABLE t1;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/rpl_end.inc
# End of rpl_delayed_parallel_slave_sbm.test
#
# This test ensures that after a delayed parallel slave has idled, i.e.
# executed everything in its relay log, the next event group that the SQL
# thread reads from the relay log will immediately be used in the
# Seconds_Behind_Master. In particular, it ensures that the calculation for
# Seconds_Behind_Master is based on the timestamp of the new transaction,
# rather than the last committed transaction.
#
# References:
# MDEV-29639: Seconds_Behind_Master is incorrect for Delayed, Parallel
# Replicas
#
--source include/master-slave.inc
--connection slave
--source include/stop_slave.inc
--let $master_delay= 3
--eval change master to master_delay=$master_delay, master_use_gtid=Slave_Pos
--let $old_slave_threads= `SELECT @@GLOBAL.slave_parallel_threads`
set @@GLOBAL.slave_parallel_threads=2;
--source include/start_slave.inc
--connection master
create table t1 (a int);
--source include/sync_slave_sql_with_master.inc
--echo #
--echo # Pt 1) Ensure SBM is updated immediately upon arrival of the next event
--echo # Lock t1 on slave so the first received transaction does not complete/commit
--connection slave
LOCK TABLES t1 WRITE;
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trx_before_ins= `SELECT UNIX_TIMESTAMP()`
--let insert_ctr= 0
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--source include/save_master_gtid.inc
--connection slave
--echo # Waiting for transaction to arrive on slave and begin SQL Delay..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting until MASTER_DELAY seconds after master executed event';
--source include/wait_condition.inc
--echo # Validating SBM is updated on event arrival..
--let $sbm_trx1_arrive= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $seconds_since_idling= `SELECT UNIX_TIMESTAMP() - $ts_trx_before_ins`
if (`SELECT $sbm_trx1_arrive > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_trx1_arrive yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master should reset after idling
}
--echo # ..done
--connection slave
UNLOCK TABLES;
--source include/sync_with_master_gtid.inc
--echo #
--echo # Pt 2) If the SQL thread has not entered an idle state, ensure
--echo # following events do not update SBM
--echo # Stop slave IO thread so it receives both events together on restart
--connection slave
--source include/stop_slave_io.inc
--connection master
--echo # Sleep 2 to allow a buffer between events for SBM check
sleep 2;
--let $ts_trxpt2_before_ins= `SELECT UNIX_TIMESTAMP()`
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--echo # Sleep 3 to create gap between events
sleep 3;
--eval insert into t1 values ($insert_ctr)
--inc $insert_ctr
--let $ts_trx_after_ins= `SELECT UNIX_TIMESTAMP()`
--connection slave
LOCK TABLES t1 WRITE;
START SLAVE IO_THREAD;
--echo # Wait for first transaction to complete SQL delay and begin execution..
--let $wait_condition= SELECT count(*) FROM information_schema.processlist WHERE state LIKE 'Waiting for table metadata lock%' AND command LIKE 'Slave_Worker';
--source include/wait_condition.inc
--echo # Validate SBM calculation doesn't use the second transaction because SQL thread shouldn't have gone idle..
--let $sbm_after_trx_no_idle= query_get_value(SHOW SLAVE STATUS, Seconds_Behind_Master, 1)
--let $timestamp_trxpt2_arrive= `SELECT UNIX_TIMESTAMP()`
if (`SELECT $sbm_after_trx_no_idle < $timestamp_trxpt2_arrive - $ts_trx_after_ins`)
{
--let $cmpv= `SELECT $timestamp_trxpt2_arrive - $ts_trx_after_ins`
--echo # SBM $sbm_after_trx_no_idle was more recent than time since last transaction ($cmpv seconds)
--die Seconds_Behind_Master should not have used second transaction timestamp
}
--let $seconds_since_idling= `SELECT ($timestamp_trxpt2_arrive - $ts_trxpt2_before_ins)`
--echo # ..and that SBM wasn't calculated using prior committed transactions
if (`SELECT $sbm_after_trx_no_idle > ($seconds_since_idling + 1)`)
{
--echo # SBM was $sbm_after_trx_no_idle yet shouldn't have been larger than $seconds_since_idling + 1 (for possible negative clock_diff_with_master)
--die Seconds_Behind_Master calculation should not have used prior committed transaction
}
--echo # ..done
--connection slave
UNLOCK TABLES;
--echo #
--echo # Cleanup
--echo # Reset master_delay
--source include/stop_slave.inc
--eval CHANGE MASTER TO master_delay=0
--eval set @@GLOBAL.slave_parallel_threads=$old_slave_threads
--source include/start_slave.inc
--connection master
DROP TABLE t1;
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/rpl_end.inc
--echo # End of rpl_delayed_parallel_slave_sbm.test
...@@ -45,8 +45,7 @@ rpt_handle_event(rpl_parallel_thread::queued_event *qev, ...@@ -45,8 +45,7 @@ rpt_handle_event(rpl_parallel_thread::queued_event *qev,
rgi->event_relay_log_pos= qev->event_relay_log_pos; rgi->event_relay_log_pos= qev->event_relay_log_pos;
rgi->future_event_relay_log_pos= qev->future_event_relay_log_pos; rgi->future_event_relay_log_pos= qev->future_event_relay_log_pos;
strcpy(rgi->future_event_master_log_name, qev->future_event_master_log_name); strcpy(rgi->future_event_master_log_name, qev->future_event_master_log_name);
if (!(ev->is_artificial_event() || ev->is_relay_log_event() || if (event_can_update_last_master_timestamp(ev))
(ev->when == 0)))
rgi->last_master_timestamp= ev->when + (time_t)ev->exec_time; rgi->last_master_timestamp= ev->when + (time_t)ev->exec_time;
err= apply_event_and_update_pos_for_parallel(ev, thd, rgi); err= apply_event_and_update_pos_for_parallel(ev, thd, rgi);
......
...@@ -4125,10 +4125,10 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli, ...@@ -4125,10 +4125,10 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli,
the user might be surprised to see a claim that the slave is up to date the user might be surprised to see a claim that the slave is up to date
long before those queued events are actually executed. long before those queued events are actually executed.
*/ */
if (!rli->mi->using_parallel() && if ((!rli->mi->using_parallel()) && event_can_update_last_master_timestamp(ev))
!(ev->is_artificial_event() || ev->is_relay_log_event() || (ev->when == 0)))
{ {
rli->last_master_timestamp= ev->when + (time_t) ev->exec_time; rli->last_master_timestamp= ev->when + (time_t) ev->exec_time;
rli->sql_thread_caught_up= false;
DBUG_ASSERT(rli->last_master_timestamp >= 0); DBUG_ASSERT(rli->last_master_timestamp >= 0);
} }
...@@ -4177,6 +4177,17 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli, ...@@ -4177,6 +4177,17 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli,
if (rli->mi->using_parallel()) if (rli->mi->using_parallel())
{ {
if (unlikely((rli->last_master_timestamp == 0 ||
rli->sql_thread_caught_up) &&
event_can_update_last_master_timestamp(ev)))
{
if (rli->last_master_timestamp < ev->when)
{
rli->last_master_timestamp= ev->when;
rli->sql_thread_caught_up= false;
}
}
int res= rli->parallel.do_event(serial_rgi, ev, event_size); int res= rli->parallel.do_event(serial_rgi, ev, event_size);
/* /*
In parallel replication, we need to update the relay log position In parallel replication, we need to update the relay log position
...@@ -4192,7 +4203,7 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli, ...@@ -4192,7 +4203,7 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli,
This is the case for pre-10.0 events without GTID, and for handling This is the case for pre-10.0 events without GTID, and for handling
slave_skip_counter. slave_skip_counter.
*/ */
if (!(ev->is_artificial_event() || ev->is_relay_log_event() || (ev->when == 0))) if (event_can_update_last_master_timestamp(ev))
{ {
/* /*
Ignore FD's timestamp as it does not reflect the slave execution Ignore FD's timestamp as it does not reflect the slave execution
...@@ -4200,7 +4211,8 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli, ...@@ -4200,7 +4211,8 @@ static int exec_relay_log_event(THD* thd, Relay_log_info* rli,
data modification event execution last long all this time data modification event execution last long all this time
Seconds_Behind_Master is zero. Seconds_Behind_Master is zero.
*/ */
if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT) if (ev->get_type_code() != FORMAT_DESCRIPTION_EVENT &&
rli->last_master_timestamp < ev->when)
rli->last_master_timestamp= ev->when + (time_t) ev->exec_time; rli->last_master_timestamp= ev->when + (time_t) ev->exec_time;
DBUG_ASSERT(rli->last_master_timestamp >= 0); DBUG_ASSERT(rli->last_master_timestamp >= 0);
...@@ -7544,7 +7556,6 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size) ...@@ -7544,7 +7556,6 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
if (hot_log) if (hot_log)
mysql_mutex_unlock(log_lock); mysql_mutex_unlock(log_lock);
rli->sql_thread_caught_up= false;
DBUG_RETURN(ev); DBUG_RETURN(ev);
} }
if (opt_reckless_slave) // For mysql-test if (opt_reckless_slave) // For mysql-test
...@@ -7711,7 +7722,6 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size) ...@@ -7711,7 +7722,6 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
rli->relay_log.wait_for_update_relay_log(rli->sql_driver_thd); rli->relay_log.wait_for_update_relay_log(rli->sql_driver_thd);
// re-acquire data lock since we released it earlier // re-acquire data lock since we released it earlier
mysql_mutex_lock(&rli->data_lock); mysql_mutex_lock(&rli->data_lock);
rli->sql_thread_caught_up= false;
continue; continue;
} }
/* /*
...@@ -7902,12 +7912,19 @@ event(errno: %d cur_log->error: %d)", ...@@ -7902,12 +7912,19 @@ event(errno: %d cur_log->error: %d)",
{ {
sql_print_information("Error reading relay log event: %s", sql_print_information("Error reading relay log event: %s",
"slave SQL thread was killed"); "slave SQL thread was killed");
DBUG_RETURN(0); goto end;
} }
err: err:
if (errmsg) if (errmsg)
sql_print_error("Error reading relay log event: %s", errmsg); sql_print_error("Error reading relay log event: %s", errmsg);
end:
/*
Set that we are not caught up so if there is a hang/problem on restart,
Seconds_Behind_Master will still grow.
*/
rli->sql_thread_caught_up= false;
DBUG_RETURN(0); DBUG_RETURN(0);
} }
#ifdef WITH_WSREP #ifdef WITH_WSREP
......
...@@ -49,6 +49,7 @@ ...@@ -49,6 +49,7 @@
#include "rpl_filter.h" #include "rpl_filter.h"
#include "rpl_tblmap.h" #include "rpl_tblmap.h"
#include "rpl_gtid.h" #include "rpl_gtid.h"
#include "log_event.h"
#define SLAVE_NET_TIMEOUT 60 #define SLAVE_NET_TIMEOUT 60
...@@ -293,6 +294,17 @@ extern char *report_host, *report_password; ...@@ -293,6 +294,17 @@ extern char *report_host, *report_password;
extern I_List<THD> threads; extern I_List<THD> threads;
/*
Check that a binlog event (read from the relay log) is valid to update
last_master_timestamp. That is, a valid event is one with a consistent
timestamp which originated from a primary server.
*/
static inline bool event_can_update_last_master_timestamp(Log_event *ev)
{
return ev && !(ev->is_artificial_event() || ev->is_relay_log_event() ||
(ev->when == 0));
}
#else #else
#define close_active_mi() /* no-op */ #define close_active_mi() /* no-op */
#endif /* HAVE_REPLICATION */ #endif /* HAVE_REPLICATION */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment