Commit 1bd7662b authored by unknown

When the I/O thread was stopped while copying a long transaction, and restarted,
Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it
by moving the test for corruption to Start_log_event::exec_event().
Changed Rotate_log_event::exec_event() to not increment positions when the
event is seen in the middle of a transaction.
I did a separate commit in 4.1 (so this should not be merged to 4.0) because
code is a bit different in 4.1.
Added a test to see if the slave detects when the master died while writing a
transaction to the binlog (uses a forged truncated binlog I made).


sql/log_event.cc:
  When the I/O thread was stopped while copying a long transaction, and restarted,
  Rotate_log_event::exec_event() believed that the relay log was corrupted. Fixed it
  by moving the test for corruption to Start_log_event::exec_event(). 
  Changed Rotate_log_event::exec_event() to not increment positions when the
  event is seen in the middle of a transaction.
parent 7a58bfee
slave stop;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
reset master;
reset slave;
drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
slave start;
stop slave;
flush logs;
reset slave;
start slave;
show slave status;
Master_Host Master_User Master_Port Connect_retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_do_db Replicate_ignore_db Last_errno Last_error Skip_counter Exec_master_log_pos Relay_log_space
127.0.0.1 root MASTER_PORT 1 master-bin.002 4 slave-relay-bin.002 161 master-bin.001 Yes No 0 there is an unfinished transaction in the relay log (could find neither COMMIT nor ROLLBACK in the relay log); it could be that the master died while writing the transaction to its binary log. Now the slave is rolling back the transaction. 0 79 317
reset master;
# We are testing if a binlog which contains BEGIN but not COMMIT (the master died
# while writing the transaction to the binlog) triggers an error on slave.
# So we use such a truncated binlog and simulate that the master restarted after
# this.
source include/master-slave.inc;
connection slave;
stop slave;
connection master;
# Make the master start a new binlog, then rename the existing master-bin.001
# to master-bin.002 and install the forged truncated binlog as master-bin.001,
# so the truncated file is followed by a regular one (the "restarted" master).
flush logs;
system mv -f var/log/master-bin.001 var/log/master-bin.002;
system cp std_data/trunc_binlog.001 var/log/master-bin.001;
connection slave;
# Reset the slave's replication state before restarting it, so that it
# presumably fetches the binlogs from the beginning (truncated one first).
reset slave;
start slave;
# can't sync_with_master so we must sleep
sleep 3;
# Mask the actual master port so the recorded result is stable across runs.
--replace_result $MASTER_MYPORT MASTER_PORT
show slave status;
connection master;
reset master;
...@@ -2091,6 +2091,23 @@ int Start_log_event::exec_event(struct st_relay_log_info* rli) ...@@ -2091,6 +2091,23 @@ int Start_log_event::exec_event(struct st_relay_log_info* rli)
*/ */
close_temporary_tables(thd); close_temporary_tables(thd);
cleanup_load_tmpdir(); cleanup_load_tmpdir();
/*
As a transaction NEVER spans 2 or more binlogs:
if we have an active transaction at this point, the master died while
writing the transaction to the binary log, i.e. while flushing the binlog
cache to the binlog. As the write was started, the transaction had been
committed on the master, so we lack the information to replay this
transaction on the slave; all we can do is stop with error.
*/
if (rli->inside_transaction)
{
slave_print_error(rli, 0,
"there is an unfinished transaction in the relay log \
(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \
the master died while writing the transaction to its binary log. Now the slave \
is rolling back the transaction.");
return(1);
}
break; break;
/* /*
Now the older formats; in that case load_tmpdir is cleaned up by the I/O Now the older formats; in that case load_tmpdir is cleaned up by the I/O
...@@ -2166,51 +2183,34 @@ int Stop_log_event::exec_event(struct st_relay_log_info* rli) ...@@ -2166,51 +2183,34 @@ int Stop_log_event::exec_event(struct st_relay_log_info* rli)
We can't rotate the slave as this will cause infinitive rotations We can't rotate the slave as this will cause infinitive rotations
in a A -> B -> A setup. in a A -> B -> A setup.
NOTES
As a transaction NEVER spans on 2 or more binlogs:
if we have an active transaction at this point, the master died while
writing the transaction to the binary log, i.e. while flushing the binlog
cache to the binlog. As the write was started, the transaction had been
committed on the master, so we lack of information to replay this
transaction on the slave; all we can do is stop with error.
If we didn't detect it, then positions would start to become garbage (as we
are incrementing rli->relay_log_pos whereas we are in a transaction: the new
rli->relay_log_pos will be
relay_log_pos of the BEGIN + size of the Rotate event = garbage.
Since MySQL 4.0.14, the master ALWAYS sends a Rotate event when it starts
sending the next binlog, so we are sure to receive a Rotate event just
after the end of the "dead master"'s binlog; so this exec_event() is the
right place to catch the problem. If we would wait until
Start_log_event::exec_event() it would be too late, rli->relay_log_pos would
already be garbage.
RETURN VALUES RETURN VALUES
0 ok 0 ok
*/ */
int Rotate_log_event::exec_event(struct st_relay_log_info* rli) int Rotate_log_event::exec_event(struct st_relay_log_info* rli)
{ {
char* log_name = rli->master_log_name;
DBUG_ENTER("Rotate_log_event::exec_event"); DBUG_ENTER("Rotate_log_event::exec_event");
pthread_mutex_lock(&rli->data_lock); pthread_mutex_lock(&rli->data_lock);
/*
if (rli->inside_transaction) If we are in a transaction: the only normal case is when the I/O thread was
copying a big transaction, then it was stopped and restarted: we have this
in the relay log:
BEGIN
...
ROTATE (a fake one)
...
COMMIT or ROLLBACK
In that case, we don't want to touch the coordinates which correspond to the
beginning of the transaction.
*/
if (!rli->inside_transaction)
{ {
slave_print_error(rli, 0, memcpy(rli->master_log_name, new_log_ident, ident_len+1);
"there is an unfinished transaction in the relay log \ rli->master_log_pos= pos;
(could find neither COMMIT nor ROLLBACK in the relay log); it could be that \ DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
the master died while writing the transaction to its binary log. Now the slave \
is rolling back the transaction.");
pthread_mutex_unlock(&rli->data_lock);
DBUG_RETURN(1);
} }
memcpy(log_name, new_log_ident, ident_len+1);
rli->master_log_pos = pos;
rli->relay_log_pos += get_event_len(); rli->relay_log_pos += get_event_len();
DBUG_PRINT("info", ("master_log_pos: %d", (ulong) rli->master_log_pos));
pthread_mutex_unlock(&rli->data_lock); pthread_mutex_unlock(&rli->data_lock);
pthread_cond_broadcast(&rli->data_cond); pthread_cond_broadcast(&rli->data_cond);
flush_relay_log_info(rli); flush_relay_log_info(rli);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment