Commit fa5c75ec authored by unknown's avatar unknown

MDEV-450: Deadlock between starting a slave and reading system variables

Starting the SQL thread might deadlock with reading the values of the
replication filtering options.

The deadlock is due to a lock order violation when the variables are
read or set. For example, reading replicate_ignore_table first
acquires LOCK_global_system_variables in sys_var::value_ptr and later
acquires LOCK_active_mi in Sys_var_rpl_filter::global_value_ptr. This
violates the order established when starting a SQL thread, where
LOCK_active_mi is acquired before start_slave, and ends up creating a
thread (handle_slave_sql) that allocates a THD handle whose
constructor acquires LOCK_global_system_variables in THD::init.

The solution is to unlock LOCK_global_system_variables before the
replication filtering options are set or read. This way the lock
order is preserved and the data being read/set is still protected
given that it acquires LOCK_active_mi.
parent 4fb39eec
include/master-slave.inc
[connection master]
# connection: slave
SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
STOP SLAVE;
include/wait_for_slave_to_stop.inc
# open an extra connection to the slave
# connection: slave2
# set debug synchronization point
SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
# attempt to set slave_net_timeout, will wait on sync point
SET @@GLOBAL.slave_net_timeout = 100;
# connection: slave
SET DEBUG_SYNC='now WAIT_FOR parked';
# connection: slave1
# attempt to start the SQL thread
START SLAVE SQL_THREAD;
# connection: slave
# wait until SQL thread has been started
# sleep a bit so that the SQL thread THD handle is initialized
# signal the set slave_net_timeout to continue
SET DEBUG_SYNC='now SIGNAL go';
# connection: slave2
# reap result of set slave_net_timeout
# connection: slave1
# reap result of starting the SQL thread
# disconnect: slave2
# connection: slave
# cleanup
SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;
include/rpl_end.inc
source include/master-slave.inc;
source include/have_debug_sync.inc;
--echo # connection: slave
connection slave;
SET @save_slave_net_timeout = @@GLOBAL.slave_net_timeout;
STOP SLAVE;
source include/wait_for_slave_to_stop.inc;
--echo # open an extra connection to the slave
connect(slave2,127.0.0.1,root,,test,$SLAVE_MYPORT,);
--echo # connection: slave2
--echo # set debug synchronization point
SET DEBUG_SYNC='fix_slave_net_timeout SIGNAL parked WAIT_FOR go';
--echo # attempt to set slave_net_timeout, will wait on sync point
--send SET @@GLOBAL.slave_net_timeout = 100
--echo # connection: slave
connection slave;
SET DEBUG_SYNC='now WAIT_FOR parked';
--echo # connection: slave1
connection slave1;
--echo # attempt to start the SQL thread
--send START SLAVE SQL_THREAD
--echo # connection: slave
connection slave;
--echo # wait until SQL thread has been started
let $wait_condition=
select count(*) = 1 from information_schema.processlist
where state = "Waiting for slave thread to start" and info = "START SLAVE SQL_THREAD";
--source include/wait_condition.inc
--echo # sleep a bit so that the SQL thread THD handle is initialized
sleep 2;
--echo # signal the set slave_net_timeout to continue
SET DEBUG_SYNC='now SIGNAL go';
--echo # connection: slave2
connection slave2;
--echo # reap result of set slave_net_timeout
--reap
--echo # connection: slave1
connection slave1;
--echo # reap result of starting the SQL thread
--reap
--echo # disconnect: slave2
disconnect slave2;
--echo # connection: slave
connection slave;
--echo # cleanup
SET @@GLOBAL.slave_net_timeout = @save_slave_net_timeout;
source include/rpl_end.inc;
......@@ -3005,6 +3005,8 @@ pthread_handler_t handle_slave_io(void *arg)
mysql= NULL ;
retry_count= 0;
thd= new THD; // note that contructor of THD uses DBUG_ !
mysql_mutex_lock(&mi->run_lock);
/* Inform waiting threads that slave has started */
mi->slave_run_id++;
......@@ -3013,7 +3015,6 @@ pthread_handler_t handle_slave_io(void *arg)
mi->events_till_disconnect = disconnect_slave_event_count;
#endif
thd= new THD; // note that contructor of THD uses DBUG_ !
THD_CHECK_SENTRY(thd);
mi->io_thd = thd;
......
......@@ -47,6 +47,7 @@
#include "sql_base.h" // close_cached_tables
#include <myisam.h>
#include "log_slow.h"
#include "debug_sync.h" // DEBUG_SYNC
#include "log_event.h"
#ifdef WITH_PERFSCHEMA_STORAGE_ENGINE
......@@ -3256,6 +3257,13 @@ bool Sys_var_rpl_filter::do_check(THD *thd, set_var *var)
{
bool status;
/*
We must not be holding LOCK_global_system_variables here, otherwise we can
deadlock with THD::init() which is invoked from within the slave threads
with opposite locking order.
*/
mysql_mutex_assert_not_owner(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
......@@ -3272,22 +3280,43 @@ bool Sys_var_rpl_filter::do_check(THD *thd, set_var *var)
return status;
}
bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
void Sys_var_rpl_filter::lock(void)
{
bool slave_running, status= false;
/*
Starting a slave thread causes the new thread to attempt to
acquire LOCK_global_system_variables (in THD::init) while
LOCK_active_mi is being held by the thread that initiated
the process. In order to not violate the lock order, unlock
LOCK_global_system_variables before grabbing LOCK_active_mi.
*/
mysql_mutex_unlock(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
}
if (! (slave_running= active_mi->rli.slave_running))
status= set_filter_value(var->save_result.string_value.str);
void Sys_var_rpl_filter::unlock(void)
{
mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);
mysql_mutex_lock(&LOCK_global_system_variables);
}
bool Sys_var_rpl_filter::global_update(THD *thd, set_var *var)
{
bool slave_running, status= false;
lock();
if (! (slave_running= active_mi->rli.slave_running))
status= set_filter_value(var->save_result.string_value.str);
if (slave_running)
my_error(ER_SLAVE_MUST_STOP, MYF(0));
unlock();
return slave_running || status;
}
......@@ -3326,8 +3355,7 @@ uchar *Sys_var_rpl_filter::global_value_ptr(THD *thd, LEX_STRING *base)
tmp.length(0);
mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
lock();
switch (opt_id) {
case OPT_REPLICATE_DO_DB:
......@@ -3350,8 +3378,7 @@ uchar *Sys_var_rpl_filter::global_value_ptr(THD *thd, LEX_STRING *base)
break;
}
mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);
unlock();
return (uchar *) thd->strmake(tmp.ptr(), tmp.length());
}
......@@ -3404,6 +3431,9 @@ static Sys_var_charptr Sys_slave_load_tmpdir(
static bool fix_slave_net_timeout(sys_var *self, THD *thd, enum_var_type type)
{
DEBUG_SYNC(thd, "fix_slave_net_timeout");
mysql_mutex_unlock(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
DBUG_PRINT("info", ("slave_net_timeout=%u mi->heartbeat_period=%.3f",
slave_net_timeout,
......@@ -3413,6 +3443,7 @@ static bool fix_slave_net_timeout(sys_var *self, THD *thd, enum_var_type type)
ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX,
ER(ER_SLAVE_HEARTBEAT_VALUE_OUT_OF_RANGE_MAX));
mysql_mutex_unlock(&LOCK_active_mi);
mysql_mutex_lock(&LOCK_global_system_variables);
return false;
}
static Sys_var_uint Sys_slave_net_timeout(
......@@ -3439,6 +3470,7 @@ static bool check_slave_skip_counter(sys_var *self, THD *thd, set_var *var)
}
static bool fix_slave_skip_counter(sys_var *self, THD *thd, enum_var_type type)
{
mysql_mutex_unlock(&LOCK_global_system_variables);
mysql_mutex_lock(&LOCK_active_mi);
mysql_mutex_lock(&active_mi->rli.run_lock);
/*
......@@ -3454,6 +3486,7 @@ static bool fix_slave_skip_counter(sys_var *self, THD *thd, enum_var_type type)
}
mysql_mutex_unlock(&active_mi->rli.run_lock);
mysql_mutex_unlock(&LOCK_active_mi);
mysql_mutex_lock(&LOCK_global_system_variables);
return 0;
}
static Sys_var_uint Sys_slave_skip_counter(
......
......@@ -588,6 +588,8 @@ class Sys_var_rpl_filter: public sys_var
protected:
uchar *global_value_ptr(THD *thd, LEX_STRING *base);
bool set_filter_value(const char *value);
void lock(void);
void unlock(void);
};
/**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment