Commit 95fdeb89 authored by Andrei Elkin's avatar Andrei Elkin

Bug#11763573 - 56299: MUTEX DEADLOCK WITH COM_BINLOG_DUMP, BINLOG PURGE, AND PROCESSLIST/KILL

The bug case is similar to one fixed earlier bug_49536.
Deadlock involving LOCK_log appears to be possible because the purge running thread
is holding LOCK_log whereas there is no sense of doing that and which fact was
exploited by the earlier bug fixes.

Fixed with small reengineering of rotate_and_purge(), adding two new methods and
setting up a policy to execute those instead of the former
rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED).
The policy for using rotate(), purge() is that if the caller acquires LOCK_log itself,
it should call rotate(), release the mutex and run purge().

Side effect of this patch is refining error message of bug@11747416 to print
the whole path.



mysql-test/suite/rpl/r/rpl_cant_read_event_incident.result:
  the file name printing is changed to a relative path instead of just the file name.
mysql-test/suite/rpl/r/rpl_log_pos.result:
  the file name printing is changed to a relative path instead of just the file name.
mysql-test/suite/rpl/r/rpl_manual_change_index_file.result:
  the file name printing is changed to a relative path instead of just the file name.
mysql-test/suite/rpl/r/rpl_packet.result:
  the file name printing is changed to a relative path instead of just the file name.
mysql-test/suite/rpl/r/rpl_rotate_purge_deadlock.result:
  new result file is added.
mysql-test/suite/rpl/t/rpl_cant_read_event_incident.test:
  The test of that bug can't satisfy windows and unix backslash interpretation so windows
  execution is chosen to bypass.
mysql-test/suite/rpl/t/rpl_rotate_purge_deadlock-master.opt:
  new opt file is added.
mysql-test/suite/rpl/t/rpl_rotate_purge_deadlock.test:
  regression test is added as well as verification of a 
  possible side effect of the fixes is tried.
sql/log.cc:
  LOCK_log is never taken during execution of log purging routine.
  The former MYSQL_BIN_LOG::rotate_and_purge is made to necessarily 
  acquiring and releasing LOCK_log. 
  If caller takes the mutex itself it has to use a new rotate(), purge() 
  methods combination and to never let purge() be run with LOCK_log grabbed.
  
  
  
  split apart to allow
  the caller to chose either it
  Simulation of concurrently rotating/purging threads is added.
sql/log.h:
  new rotate(), purge() methods are added to be used instead of
  the former rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED).
  rotate_and_purge() signature is changed. Caller should not call rotate_and_purge()
  but rather {rotate(), purge()} if LOCK_log is acquired by it.
sql/rpl_injector.cc:
  changes to reflect the new rotate_and_purge() signature.
sql/sql_class.h:
  unnecessary constants are removed.
sql/sql_parse.cc:
  changes to reflect the new rotate_and_purge() signature.
sql/sql_reload.cc:
  changes to reflect the new rotate_and_purge() signature.
sql/sql_repl.cc:
  followup for bug@11747416: the file name printing is changed to a relative 
  path instead of just the file name.
parent bf456a87
......@@ -11,7 +11,7 @@ reset slave;
start slave;
include/wait_for_slave_param.inc [Last_IO_Errno]
Last_IO_Errno = '1236'
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'binlog truncated in the middle of event; consider out of disk space on master; the last event was read from 'master-bin.000001' at 316, the last byte read was read from 'master-bin.000001' at 335.''
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'binlog truncated in the middle of event; consider out of disk space on master; the last event was read from './master-bin.000001' at 316, the last byte read was read from './master-bin.000001' at 335.''
reset master;
stop slave;
reset slave;
......
......@@ -9,7 +9,7 @@ change master to master_log_pos=MASTER_LOG_POS;
Read_Master_Log_Pos = '75'
start slave;
include/wait_for_slave_io_error.inc [errno=1236]
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'log event entry exceeded max_allowed_packet; Increase max_allowed_packet on master; the last event was read from 'master-bin.000001' at 75, the last byte read was read from 'master-bin.000001' at 94.''
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'log event entry exceeded max_allowed_packet; Increase max_allowed_packet on master; the last event was read from './master-bin.000001' at 75, the last byte read was read from './master-bin.000001' at 94.''
include/stop_slave_sql.inc
show master status;
File Position Binlog_Do_DB Binlog_Ignore_DB
......
......@@ -5,7 +5,7 @@ CREATE TABLE t1(c1 INT);
FLUSH LOGS;
call mtr.add_suppression('Got fatal error 1236 from master when reading data from binary log: .*could not find next log');
include/wait_for_slave_io_error.inc [errno=1236]
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'could not find next log; the last event was read from 'master-bin.000002' at 237, the last byte read was read from 'master-bin.000002' at 237.''
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'could not find next log; the last event was read from './master-bin.000002' at 237, the last byte read was read from './master-bin.000002' at 237.''
CREATE TABLE t2(c1 INT);
FLUSH LOGS;
CREATE TABLE t3(c1 INT);
......
......@@ -37,7 +37,7 @@ DROP TABLE t1;
CREATE TABLE t1 (f1 int PRIMARY KEY, f2 LONGTEXT, f3 LONGTEXT) ENGINE=MyISAM;
INSERT INTO t1(f1, f2, f3) VALUES(1, REPEAT('a', @@global.max_allowed_packet), REPEAT('b', @@global.max_allowed_packet));
include/wait_for_slave_io_error.inc [errno=1236]
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'log event entry exceeded max_allowed_packet; Increase max_allowed_packet on master; the last event was read from 'master-bin.000001' at 463, the last byte read was read from 'master-bin.000001' at 482.''
Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'log event entry exceeded max_allowed_packet; Increase max_allowed_packet on master; the last event was read from './master-bin.000001' at 463, the last byte read was read from './master-bin.000001' at 482.''
STOP SLAVE;
RESET SLAVE;
RESET MASTER;
......
include/master-slave.inc
[connection master]
show binary logs;
Log_name File_size
master-bin.000001 #
create table t1 (f text) engine=innodb;
SET DEBUG_SYNC = 'at_purge_logs_before_date WAIT_FOR rotated';
insert into t1 set f=repeat('a', 4096);
*** there must be two logs in the list ***
show binary logs;
Log_name File_size
master-bin.000001 #
master-bin.000002 #
insert into t1 set f=repeat('b', 4096);
*** there must be three logs in the list ***
show binary logs;
Log_name File_size
master-bin.000001 #
master-bin.000002 #
master-bin.000003 #
SET DEBUG_SYNC = 'now SIGNAL rotated';
SET DEBUG_SYNC = 'RESET';
SET DEBUG_SYNC = 'RESET';
SET DEBUG_SYNC = 'at_purge_logs_before_date WAIT_FOR rotated';
insert into t1 set f=repeat('b', 4096);
SET DEBUG_SYNC = 'now SIGNAL rotated';
SET DEBUG_SYNC = 'RESET';
SET DEBUG_SYNC = 'RESET';
drop table t1;
include/rpl_end.inc
......@@ -14,6 +14,11 @@
--source include/master-slave.inc
--source include/have_binlog_format_mixed.inc
#
# Bug#13050593 swallows `\' from Last_IO_Error
# todo: uncomment the filter once the bug is fixed.
#
--source include/not_windows.inc
call mtr.add_suppression("Error in Log_event::read_log_event()");
......
--max-binlog-size=4k --expire-logs-days=1
#
# Bug#11763573 - 56299: MUTEX DEADLOCK WITH COM_BINLOG_DUMP, BINLOG PURGE, AND PROCESSLIST/KILL
#
source include/master-slave.inc;
source include/have_debug_sync.inc;
source include/have_binlog_format_row.inc;
source include/have_innodb.inc;
#
# Testing that execution of two concurrent INSERTing connections both
# triggering the binlog rotation is correct even though their execution
# is interleaved.
# The test makes the first connection to complete the rotation part
# and yields control to the second connection that rotates as well and
# gets first on purging. And the fact of interleaving does not create
# any issue.
#
connection master;
source include/show_binary_logs.inc;
create table t1 (f text) engine=innodb;
SET DEBUG_SYNC = 'at_purge_logs_before_date WAIT_FOR rotated';
send insert into t1 set f=repeat('a', 4096);
connection master1;
let $wait_condition=
SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST
WHERE STATE like "debug sync point: at_purge_logs_before_date%";
--source include/wait_condition.inc
--echo *** there must be two logs in the list ***
source include/show_binary_logs.inc;
insert into t1 set f=repeat('b', 4096);
--echo *** there must be three logs in the list ***
source include/show_binary_logs.inc;
SET DEBUG_SYNC = 'now SIGNAL rotated';
SET DEBUG_SYNC = 'RESET';
# the first connection finally completes its INSERT
connection master;
reap;
SET DEBUG_SYNC = 'RESET';
sync_slave_with_master;
#
# Testing the reported deadlock involving DUMP, KILL and INSERT threads
#
connection master;
SET DEBUG_SYNC = 'at_purge_logs_before_date WAIT_FOR rotated';
send insert into t1 set f=repeat('b', 4096);
connection master1;
# make sure INSERT reaches waiting point
let $wait_condition=
SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST
WHERE STATE like "debug sync point: at_purge_logs_before_date%";
--source include/wait_condition.inc
# find and kill DUMP thread
let $_tid= `select id from information_schema.processlist where command = 'Binlog Dump' limit 1`;
--disable_query_log
eval kill query $_tid;
--enable_query_log
#
# Now the proof is that the new DUMP thread has executed
# a critical section of the deadlock without any regression and is UP
#
let $wait_condition=
SELECT COUNT(*) = 1 FROM INFORMATION_SCHEMA.PROCESSLIST
WHERE command = 'Binlog Dump' and STATE like "Master has sent all binlog to slave%";
--source include/wait_condition.inc
SET DEBUG_SYNC = 'now SIGNAL rotated';
SET DEBUG_SYNC = 'RESET';
connection master;
reap;
SET DEBUG_SYNC = 'RESET';
drop table t1;
sync_slave_with_master;
--source include/rpl_end.inc
......@@ -49,7 +49,7 @@
#include "sql_plugin.h"
#include "rpl_handler.h"
#include "debug_sync.h"
/* max size of the log message */
#define MAX_LOG_BUFFER_SIZE 1024
#define MAX_TIME_SIZE 32
......@@ -5002,20 +5002,30 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info)
{
bool synced;
if ((error= flush_and_sync(&synced)))
goto unlock;
if ((error= RUN_HOOK(binlog_storage, after_flush,
{
mysql_mutex_unlock(&LOCK_log);
}
else if ((error= RUN_HOOK(binlog_storage, after_flush,
(thd, log_file_name, file->pos_in_file, synced))))
{
sql_print_error("Failed to run 'after_flush' hooks");
goto unlock;
mysql_mutex_unlock(&LOCK_log);
}
else
{
bool check_purge;
signal_update();
rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
error= rotate(false, &check_purge);
mysql_mutex_unlock(&LOCK_log);
if (!error && check_purge)
purge();
}
unlock:
}
else
{
mysql_mutex_unlock(&LOCK_log);
}
}
if (error)
{
......@@ -5100,25 +5110,29 @@ bool general_log_write(THD *thd, enum enum_server_command command,
}
/**
The method executes rotation when LOCK_log is already acquired
by the caller.
@param force_rotate caller can request the log rotation
@param check_purge is set to true if rotation took place
@note
If rotation fails, for instance the server was unable
to create a new log file, we still try to write an
incident event to the current log.
@retval
nonzero - error
nonzero - error in rotating routine.
*/
int MYSQL_BIN_LOG::rotate_and_purge(uint flags)
int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
{
int error= 0;
DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
#ifdef HAVE_REPLICATION
bool check_purge= false;
#endif
if (!(flags & RP_LOCK_LOG_IS_ALREADY_LOCKED))
mysql_mutex_lock(&LOCK_log);
if ((flags & RP_FORCE_ROTATE) ||
(my_b_tell(&log_file) >= (my_off_t) max_size))
DBUG_ENTER("MYSQL_BIN_LOG::rotate");
//todo: fix the macro def and restore safe_mutex_assert_owner(&LOCK_log);
*check_purge= false;
if (force_rotate || (my_b_tell(&log_file) >= (my_off_t) max_size))
{
if ((error= new_file_without_locking()))
/**
......@@ -5133,24 +5147,59 @@ int MYSQL_BIN_LOG::rotate_and_purge(uint flags)
if (!write_incident(current_thd, FALSE))
flush_and_sync(0);
#ifdef HAVE_REPLICATION
check_purge= true;
#endif
*check_purge= true;
}
if (!(flags & RP_LOCK_LOG_IS_ALREADY_LOCKED))
mysql_mutex_unlock(&LOCK_log);
DBUG_RETURN(error);
}
/**
The method executes logs purging routine.
@retval
nonzero - error in rotating routine.
*/
void MYSQL_BIN_LOG::purge()
{
#ifdef HAVE_REPLICATION
/*
NOTE: Run purge_logs wo/ holding LOCK_log
as it otherwise will deadlock in ndbcluster_binlog_index_purge_file
*/
if (!error && check_purge && expire_logs_days)
if (expire_logs_days)
{
DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
time_t purge_time= my_time(0) - expire_logs_days*24*60*60;
if (purge_time >= 0)
{
purge_logs_before_date(purge_time);
}
}
#endif
}
/**
The method is a shortcut of @c rotate() and @c purge().
LOCK_log is acquired prior to rotate and is released after it.
@param force_rotate caller can request the log rotation
@retval
nonzero - error in rotating routine.
*/
int MYSQL_BIN_LOG::rotate_and_purge(bool force_rotate)
{
int error= 0;
DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
bool check_purge= false;
//todo: fix the macro def and restore safe_mutex_assert_not_owner(&LOCK_log);
mysql_mutex_lock(&LOCK_log);
error= rotate(force_rotate, &check_purge);
/*
NOTE: Run purge_logs wo/ holding LOCK_log because it does not need
the mutex. Otherwise causes various deadlocks.
*/
mysql_mutex_unlock(&LOCK_log);
if (!error && check_purge)
purge();
DBUG_RETURN(error);
}
......@@ -5352,11 +5401,18 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
{
if (!error && !(error= flush_and_sync(0)))
{
bool check_purge= false;
signal_update();
error= rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
error= rotate(false, &check_purge);
mysql_mutex_unlock(&LOCK_log);
if (!error && check_purge)
purge();
}
else
{
mysql_mutex_unlock(&LOCK_log);
}
}
DBUG_RETURN(error);
}
......@@ -5388,11 +5444,13 @@ bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
bool incident)
{
DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
mysql_mutex_lock(&LOCK_log);
DBUG_ASSERT(is_open());
if (likely(is_open())) // Should always be true
{
bool check_purge;
mysql_mutex_lock(&LOCK_log);
/*
We only bother to write to the binary log if there is anything
to write.
......@@ -5460,12 +5518,17 @@ bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
mysql_mutex_lock(&LOCK_prep_xids);
prepared_xids++;
mysql_mutex_unlock(&LOCK_prep_xids);
mysql_mutex_unlock(&LOCK_log);
}
else
if (rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED))
{
if (rotate(false, &check_purge))
goto err;
}
mysql_mutex_unlock(&LOCK_log);
if (check_purge)
purge();
}
}
DBUG_RETURN(0);
......
......@@ -455,7 +455,9 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
void make_log_name(char* buf, const char* log_ident);
bool is_active(const char* log_file_name);
int update_log_index(LOG_INFO* linfo, bool need_update_threads);
int rotate_and_purge(uint flags);
int rotate(bool force_rotate, bool* check_purge);
void purge();
int rotate_and_purge(bool force_rotate);
/**
Flush binlog cache and synchronize to disk.
......
......@@ -237,7 +237,7 @@ int injector::record_incident(THD *thd, Incident incident)
Incident_log_event ev(thd, incident);
if (int error= mysql_bin_log.write(&ev))
return error;
return mysql_bin_log.rotate_and_purge(RP_FORCE_ROTATE);
return mysql_bin_log.rotate_and_purge(true);
}
int injector::record_incident(THD *thd, Incident incident, LEX_STRING const message)
......@@ -245,5 +245,5 @@ int injector::record_incident(THD *thd, Incident incident, LEX_STRING const mess
Incident_log_event ev(thd, incident, message);
if (int error= mysql_bin_log.write(&ev))
return error;
return mysql_bin_log.rotate_and_purge(RP_FORCE_ROTATE);
return mysql_bin_log.rotate_and_purge(true);
}
......@@ -161,9 +161,6 @@ typedef struct st_user_var_events
bool unsigned_flag;
} BINLOG_USER_VAR_EVENT;
#define RP_LOCK_LOG_IS_ALREADY_LOCKED 1
#define RP_FORCE_ROTATE 2
/*
The COPY_INFO structure is used by INSERT/REPLACE code.
The schema of the row counting by the INSERT/INSERT ... ON DUPLICATE KEY
......
......@@ -2861,7 +2861,7 @@ case SQLCOM_PREPARE:
{
Incident_log_event ev(thd, incident);
(void) mysql_bin_log.write(&ev); /* error is ignored */
if (mysql_bin_log.rotate_and_purge(RP_FORCE_ROTATE))
if (mysql_bin_log.rotate_and_purge(true))
{
res= 1;
break;
......
......@@ -141,7 +141,7 @@ bool reload_acl_and_cache(THD *thd, unsigned long options,
tmp_write_to_binlog= 0;
if (mysql_bin_log.is_open())
{
if (mysql_bin_log.rotate_and_purge(RP_FORCE_ROTATE))
if (mysql_bin_log.rotate_and_purge(true))
*write_to_binlog= -1;
}
}
......
......@@ -1030,12 +1030,9 @@ impossible position";
detailing the fatal error message with coordinates
of the last position read.
*/
char b_start[FN_REFLEN], b_end[FN_REFLEN];
fn_format(b_start, coord->file_name, "", "", MY_REPLACE_DIR);
fn_format(b_end, log_file_name, "", "", MY_REPLACE_DIR);
my_snprintf(error_text, sizeof(error_text), fmt, errmsg,
b_start, (llstr(coord->pos, llbuff1), llbuff1),
b_end, (llstr(my_b_tell(&log), llbuff2), llbuff2));
coord->file_name, (llstr(coord->pos, llbuff1), llbuff1),
log_file_name, (llstr(my_b_tell(&log), llbuff2), llbuff2));
}
else
strcpy(error_text, errmsg);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment