"After Monty's review" changes to the fix for BUG#8325 "Deadlock in...

"After Monty's review" changes to the fix for BUG#8325 "Deadlock in replication thread stops replication":
- replace sleep() with safe_sleep(), which is thread-safe;
- sleep 0/1/2/3/4/5/5/5 seconds before successive retries, so that the slave falls less far behind;
- no message in the error log (deadlocks are sometimes too common); a global counter is used
  instead (SHOW STATUS LIKE 'slave_retried_transactions').
Plus a fix for libmysql/Makefile.shared.
parent 2a53e9d9
...@@ -44,6 +44,7 @@ dlenev@build.mysql.com ...@@ -44,6 +44,7 @@ dlenev@build.mysql.com
dlenev@jabberwock.localdomain dlenev@jabberwock.localdomain
dlenev@mysql.com dlenev@mysql.com
ejonore@mc03.ndb.mysql.com ejonore@mc03.ndb.mysql.com
gbichot@quadita2.mysql.com
gbichot@quadxeon.mysql.com gbichot@quadxeon.mysql.com
georg@beethoven.local georg@beethoven.local
georg@lmy002.wdf.sap.corp georg@lmy002.wdf.sap.corp
......
...@@ -94,7 +94,8 @@ clean-local: ...@@ -94,7 +94,8 @@ clean-local:
`echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \ `echo $(sql_cmn_objects) | sed "s;\.lo;.c;g"` \
$(CHARSET_SRCS) $(CHARSET_OBJS) \ $(CHARSET_SRCS) $(CHARSET_OBJS) \
$(mystringsextra) $(mysysheaders) $(vioheaders)\ $(mystringsextra) $(mysysheaders) $(vioheaders)\
../linked_client_sources net.c ../linked_libmysql_sources ../linked_libmysql_r_sources \
net.c
conf_to_src_SOURCES = conf_to_src.c conf_to_src_SOURCES = conf_to_src.c
conf_to_src_LDADD= conf_to_src_LDADD=
......
...@@ -8,6 +8,9 @@ create table t1 (a int not null, key(a)) engine=innodb; ...@@ -8,6 +8,9 @@ create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb; create table t3 (a int) engine=innodb;
create table t4 (a int) engine=innodb; create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
Variable_name Value
slave_transaction_retries 0
show create table t1; show create table t1;
Table Create Table Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
...@@ -20,6 +23,9 @@ t2 CREATE TABLE `t2` ( ...@@ -20,6 +23,9 @@ t2 CREATE TABLE `t2` (
`a` int(11) NOT NULL default '0', `a` int(11) NOT NULL default '0',
KEY `a` (`a`) KEY `a` (`a`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1 ) ENGINE=InnoDB DEFAULT CHARSET=latin1
show variables like 'slave_transaction_retries';
Variable_name Value
slave_transaction_retries 2
stop slave; stop slave;
begin; begin;
insert into t3 select * from t2 for update; insert into t3 select * from t2 for update;
......
...@@ -7,6 +7,8 @@ ...@@ -7,6 +7,8 @@
# (Guilhem) have seen the test manage to provoke lock wait timeout # (Guilhem) have seen the test manage to provoke lock wait timeout
# error but not deadlock error; that is ok as code deals with the two # error but not deadlock error; that is ok as code deals with the two
# errors in exactly the same way. # errors in exactly the same way.
# We don't run "show status like 'slave_retried_transactions'" because its
# result is not repeatable (it depends on sleeps).
source include/have_innodb.inc; source include/have_innodb.inc;
source include/master-slave.inc; source include/master-slave.inc;
...@@ -16,10 +18,12 @@ create table t1 (a int not null, key(a)) engine=innodb; ...@@ -16,10 +18,12 @@ create table t1 (a int not null, key(a)) engine=innodb;
create table t2 (a int not null, key(a)) engine=innodb; create table t2 (a int not null, key(a)) engine=innodb;
create table t3 (a int) engine=innodb; create table t3 (a int) engine=innodb;
create table t4 (a int) engine=innodb; create table t4 (a int) engine=innodb;
show variables like 'slave_transaction_retries';
sync_slave_with_master; sync_slave_with_master;
show create table t1; show create table t1;
show create table t2; show create table t2;
show variables like 'slave_transaction_retries';
stop slave; stop slave;
# 1) Test deadlock # 1) Test deadlock
......
...@@ -3062,8 +3062,17 @@ we force server id to 2, but this MySQL server will not act as a slave."); ...@@ -3062,8 +3062,17 @@ we force server id to 2, but this MySQL server will not act as a slave.");
#endif #endif
if (opt_bootstrap) /* If running with bootstrap, do not start replication. */ if (opt_bootstrap) /* If running with bootstrap, do not start replication. */
opt_skip_slave_start= 1; opt_skip_slave_start= 1;
/* init_slave() must be called after the thread keys are created */ /*
init_slave(); init_slave() must be called after the thread keys are created.
Some parts of the code (e.g. SHOW STATUS LIKE 'slave_running' and other
places) assume that active_mi != 0, so let's fail if it's 0 (out of
memory); a message has already been printed.
*/
if (init_slave() && !active_mi)
{
end_thr_alarm(1); // Don't allow alarms
unireg_abort(1);
}
if (opt_bootstrap) if (opt_bootstrap)
{ {
...@@ -5494,7 +5503,8 @@ struct show_var_st status_vars[]= { ...@@ -5494,7 +5503,8 @@ struct show_var_st status_vars[]= {
{"Select_range_check", (char*) &select_range_check_count, SHOW_LONG}, {"Select_range_check", (char*) &select_range_check_count, SHOW_LONG},
{"Select_scan", (char*) &select_scan_count, SHOW_LONG}, {"Select_scan", (char*) &select_scan_count, SHOW_LONG},
{"Slave_open_temp_tables", (char*) &slave_open_temp_tables, SHOW_LONG}, {"Slave_open_temp_tables", (char*) &slave_open_temp_tables, SHOW_LONG},
{"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING}, {"Slave_running", (char*) 0, SHOW_SLAVE_RUNNING},
{"Slave_retried_transactions",(char*) 0, SHOW_SLAVE_RETRIED_TRANS},
{"Slow_launch_threads", (char*) &slow_launch_threads, SHOW_LONG}, {"Slow_launch_threads", (char*) &slow_launch_threads, SHOW_LONG},
{"Slow_queries", (char*) &long_query_count, SHOW_LONG}, {"Slow_queries", (char*) &long_query_count, SHOW_LONG},
{"Sort_merge_passes", (char*) &filesort_merge_passes, SHOW_LONG}, {"Sort_merge_passes", (char*) &filesort_merge_passes, SHOW_LONG},
......
...@@ -27,6 +27,7 @@ ...@@ -27,6 +27,7 @@
#include <my_dir.h> #include <my_dir.h>
#include <sql_common.h> #include <sql_common.h>
#define MAX_SLAVE_RETRY_PAUSE 5
bool use_slave_mask = 0; bool use_slave_mask = 0;
MY_BITMAP slave_error_mask; MY_BITMAP slave_error_mask;
...@@ -2335,7 +2336,7 @@ st_relay_log_info::st_relay_log_info() ...@@ -2335,7 +2336,7 @@ st_relay_log_info::st_relay_log_info()
ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0), ignore_log_space_limit(0), last_master_timestamp(0), slave_skip_counter(0),
abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0), abort_pos_wait(0), slave_run_id(0), sql_thd(0), last_slave_errno(0),
inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE), inited(0), abort_slave(0), slave_running(0), until_condition(UNTIL_NONE),
until_log_pos(0) until_log_pos(0), retried_trans(0)
{ {
group_relay_log_name[0]= event_relay_log_name[0]= group_relay_log_name[0]= event_relay_log_name[0]=
group_master_log_name[0]= 0; group_master_log_name[0]= 0;
...@@ -2980,9 +2981,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -2980,9 +2981,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
init_master_info()). init_master_info()).
b) init_relay_log_pos(), because the BEGIN may be an older relay log. b) init_relay_log_pos(), because the BEGIN may be an older relay log.
*/ */
if (rli->trans_retries--) if (rli->trans_retries < slave_trans_retries)
{ {
sql_print_information("Slave SQL thread retries transaction");
if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL)) if (init_master_info(rli->mi, 0, 0, 0, SLAVE_SQL))
sql_print_error("Failed to initialize the master info structure"); sql_print_error("Failed to initialize the master info structure");
else if (init_relay_log_pos(rli, else if (init_relay_log_pos(rli,
...@@ -2994,8 +2994,16 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -2994,8 +2994,16 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
else else
{ {
exec_res= 0; exec_res= 0;
sleep(2); // chance for concurrent connection to get more locks /* chance for concurrent connection to get more locks */
} safe_sleep(thd, min(rli->trans_retries, MAX_SLAVE_RETRY_PAUSE),
(CHECK_KILLED_FUNC)sql_slave_killed, (void*)rli);
pthread_mutex_lock(&rli->data_lock); // because of SHOW STATUS
rli->trans_retries++;
rli->retried_trans++;
pthread_mutex_unlock(&rli->data_lock);
DBUG_PRINT("info", ("Slave retries transaction "
"rli->trans_retries: %lu", rli->trans_retries));
}
} }
else else
sql_print_error("Slave SQL thread retried transaction %lu time(s) " sql_print_error("Slave SQL thread retried transaction %lu time(s) "
...@@ -3004,17 +3012,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli) ...@@ -3004,17 +3012,8 @@ static int exec_relay_log_event(THD* thd, RELAY_LOG_INFO* rli)
slave_trans_retries); slave_trans_retries);
} }
if (!((thd->options & OPTION_BEGIN) && opt_using_transactions)) if (!((thd->options & OPTION_BEGIN) && opt_using_transactions))
{ rli->trans_retries= 0; // restart from fresh
rli->trans_retries= slave_trans_retries; // restart from fresh }
/*
TODO: when merged into 5.0, when slave does auto-rollback if
corrupted binlog, this should reset the retry counter too
(any rollback should). In fact it will work, as here we are just out
of a Format_description_log_event::exec_event() which rolled back.
But check repl code in 5.0 for new ha_rollback calls, just in case.
*/
}
}
return exec_res; return exec_res;
} }
else else
...@@ -3426,7 +3425,7 @@ extern "C" pthread_handler_decl(handle_slave_sql,arg) ...@@ -3426,7 +3425,7 @@ extern "C" pthread_handler_decl(handle_slave_sql,arg)
pthread_mutex_lock(&rli->log_space_lock); pthread_mutex_lock(&rli->log_space_lock);
rli->ignore_log_space_limit= 0; rli->ignore_log_space_limit= 0;
pthread_mutex_unlock(&rli->log_space_lock); pthread_mutex_unlock(&rli->log_space_lock);
rli->trans_retries= slave_trans_retries; // start from "no error" rli->trans_retries= 0; // start from "no error"
if (init_relay_log_pos(rli, if (init_relay_log_pos(rli,
rli->group_relay_log_name, rli->group_relay_log_name,
......
...@@ -295,7 +295,14 @@ typedef struct st_relay_log_info ...@@ -295,7 +295,14 @@ typedef struct st_relay_log_info
UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1 UNTIL_LOG_NAMES_CMP_EQUAL= 0, UNTIL_LOG_NAMES_CMP_GREATER= 1
} until_log_names_cmp_result; } until_log_names_cmp_result;
ulong trans_retries; /*
trans_retries varies between 0 and slave_transaction_retries and counts how
many times the slave has retried the present transaction; it is reset to 0
when the transaction finally succeeds. retried_trans is a cumulative
counter: how many times the slave has retried a transaction (any) since
the slave started.
*/
ulong trans_retries, retried_trans;
st_relay_log_info(); st_relay_log_info();
~st_relay_log_info(); ~st_relay_log_info();
......
...@@ -1887,6 +1887,19 @@ int mysqld_show(THD *thd, const char *wild, show_var_st *variables, ...@@ -1887,6 +1887,19 @@ int mysqld_show(THD *thd, const char *wild, show_var_st *variables,
pthread_mutex_unlock(&LOCK_active_mi); pthread_mutex_unlock(&LOCK_active_mi);
break; break;
} }
case SHOW_SLAVE_RETRIED_TRANS:
{
/*
TODO: in 5.1 with multimaster, have one such counter per line in SHOW
SLAVE STATUS, and have the sum over all lines here.
*/
pthread_mutex_lock(&LOCK_active_mi);
pthread_mutex_lock(&active_mi->rli.data_lock);
end= int10_to_str(active_mi->rli.retried_trans, buff, 10);
pthread_mutex_unlock(&active_mi->rli.data_lock);
pthread_mutex_unlock(&LOCK_active_mi);
break;
}
#endif /* HAVE_REPLICATION */ #endif /* HAVE_REPLICATION */
case SHOW_OPENTABLES: case SHOW_OPENTABLES:
end= int10_to_str((long) cached_tables(), buff, 10); end= int10_to_str((long) cached_tables(), buff, 10);
......
...@@ -180,7 +180,7 @@ enum SHOW_TYPE ...@@ -180,7 +180,7 @@ enum SHOW_TYPE
SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL, SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL,
SHOW_SSL_GET_CIPHER_LIST, SHOW_SSL_GET_CIPHER_LIST,
#endif /* HAVE_OPENSSL */ #endif /* HAVE_OPENSSL */
SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS,
SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment