Commit fb537354 authored by Luis Soares's avatar Luis Soares

BUG#11760927: 53375: RBR + NO PK => HIGH LOAD ON SLAVE (TABLE

              SCAN/CPU) => SLAVE FAILURE

When a statement containing a large amount of ROWs to be applied on
the slave, and the slave's table does not contain a PK, it can take a
considerable amount of time to find and change all the rows that are
to be changed.

The proper slave enhancement will be implemented in WL 5597. However,
in this bug we are making it clear to the user what the problem is, by
printing a message to the error log if the execution time, for a given
statement in RBR, takes more than LONG_FIND_ROW_THRESHOLD (set to 60
seconds). This shall help the DBA to diagnose what's happening when
facing a slave server that is quiet for no apparent reason...

The note is only printed to the error log if log_warnings is set to be
greater than 1.


sql/log_event.cc:
  Core of the patch. 
  
  In Rows_log_event::do_apply_event, sets STMT start 
  timestamp.
  
  In Rows_log_event::find_row, if a PK is not used, then the start 
  timestamp is checked to see if the time spent on this STMT is enough
  to justify the printing of a note (only if it was not printed before).
sql/log_event.h:
  Defining LONG_FIND_ROW_THRESHOLD.
sql/rpl_rli.cc:
  Resets long_find_row state in rli->context_cleanup().
sql/rpl_rli.h:
  Two new rli properties that are necessary to control when to
  emit a note in the error log: 1) timestamp that states when the
  ROW statement started; 2) flag indicating whether the note has
  been emitted for the current statement or not. Also deployed
  accessors.
parent 0b516c98
include/master-slave.inc
[connection master]
include/stop_slave.inc
SET GLOBAL log_warnings = 2;
SET GLOBAL debug="d,inject_long_find_row_note";
include/start_slave.inc
CREATE TABLE t1 (c1 INT);
INSERT INTO t1 VALUES (1), (2);
UPDATE t1 SET c1= 1000 WHERE c1=2;
DELETE FROM t1;
DROP TABLE t1;
# Check if any note related to long DELETE_ROWS and UPDATE_ROWS appears in the error log
Occurrences: update=1, delete=1
include/stop_slave.inc
SET GLOBAL debug = '';
SET GLOBAL log_warnings = 1;
include/start_slave.inc
include/rpl_end.inc
#
# Bug#11760927: 53375: RBR + NO PK => HIGH LOAD ON SLAVE (TABLE SCAN/CPU) => SLAVE FAILURE
#
--source include/master-slave.inc
--source include/have_binlog_format_row.inc
--source include/have_debug.inc
# SETUP
# - setup log_warnings and debug
--connection slave
--source include/stop_slave.inc
--let $debug_save= `SELECT @@GLOBAL.debug`
--let $log_warnings_save= `SELECT @@GLOBAL.log_warnings`
SET GLOBAL log_warnings = 2;
let $log_error_= `SELECT @@GLOBAL.log_error`;
if(!$log_error_)
{
# MySQL Server on windows is started with --console and thus
# does not know the location of its .err log, use default location
let $log_error_ = $MYSQLTEST_VARDIR/log/mysqld.2.err;
}
# Assign env variable LOG_ERROR
let LOG_ERROR=$log_error_;
# force printing the notes to the error log
SET GLOBAL debug="d,inject_long_find_row_note";
--source include/start_slave.inc
# test
--connection master
CREATE TABLE t1 (c1 INT);
--sync_slave_with_master
--connection master
INSERT INTO t1 VALUES (1), (2);
UPDATE t1 SET c1= 1000 WHERE c1=2;
DELETE FROM t1;
DROP TABLE t1;
--sync_slave_with_master
--echo # Check if any note related to long DELETE_ROWS and UPDATE_ROWS appears in the error log
perl;
use strict;
my $log_error= $ENV{'LOG_ERROR'} or die "LOG_ERROR not set";
open(FILE, "$log_error") or die("Unable to open $log_error: $!\n");
my $upd_count = () = grep(/The slave is applying a ROW event on behalf of an UPDATE statement on table t1 and is currently taking a considerable amount/g,<FILE>);
seek(FILE, 0, 0) or die "Can't seek to beginning of file: $!";
my $del_count = () = grep(/The slave is applying a ROW event on behalf of a DELETE statement on table t1 and is currently taking a considerable amount/g,<FILE>);
print "Occurrences: update=$upd_count, delete=$del_count\n";
close(FILE);
EOF
# cleanup
--source include/stop_slave.inc
--eval SET GLOBAL debug = '$debug_save'
--eval SET GLOBAL log_warnings = $log_warnings_save
--source include/start_slave.inc
--source include/rpl_end.inc
......@@ -7761,6 +7761,12 @@ int Rows_log_event::do_apply_event(Relay_log_info const *rli)
// row processing loop
/*
set the initial time of this ROWS statement if it was not done
before in some other ROWS event.
*/
const_cast<Relay_log_info*>(rli)->set_row_stmt_start_timestamp();
while (error == 0 && m_curr_row < m_rows_end)
{
/* in_use can have been set to NULL in close_tables_for_reopen */
......@@ -9252,6 +9258,51 @@ static bool record_compare(TABLE *table)
return result;
}
/*
Check if we are already spending too much time on this statement.
if we are, warn user that it might be because table does not have
a PK, but only if the warning was not printed before for this STMT.
@param type The event type code.
@param table_name The name of the table that the slave is
operating.
@param is_index_scan States whether the slave is doing an index scan
or not.
@param rli The relay metadata info.
*/
static inline
void issue_long_find_row_warning(Log_event_type type,
const char *table_name,
bool is_index_scan,
const Relay_log_info *rli)
{
if ((global_system_variables.log_warnings > 1 &&
!const_cast<Relay_log_info*>(rli)->is_long_find_row_note_printed()))
{
time_t now= my_time(0);
time_t stmt_ts= const_cast<Relay_log_info*>(rli)->get_row_stmt_start_timestamp();
DBUG_EXECUTE_IF("inject_long_find_row_note",
stmt_ts-=(LONG_FIND_ROW_THRESHOLD*2););
time_t delta= (now - stmt_ts);
if (delta > LONG_FIND_ROW_THRESHOLD)
{
const_cast<Relay_log_info*>(rli)->set_long_find_row_note_printed();
const char* evt_type= type == DELETE_ROWS_EVENT ? " DELETE" : "n UPDATE";
const char* scan_type= is_index_scan ? "scanning an index" : "scanning the table";
sql_print_information("The slave is applying a ROW event on behalf of a%s statement "
"on table %s and is currently taking a considerable amount "
"of time (%lu seconds). This is due to the fact that it is %s "
"while looking up records to be processed. Consider adding a "
"primary key (or unique key) to the table to improve "
"performance.", evt_type, table_name, delta, scan_type);
}
}
}
/**
Locate the current row in event's table.
......@@ -9287,6 +9338,7 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
TABLE *table= m_table;
int error= 0;
bool is_table_scan= false, is_index_scan= false;
/*
rpl_row_tabledefs.test specifies that
......@@ -9452,6 +9504,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
}
}
is_index_scan=true;
/*
In case key is not unique, we still have to iterate over records found
and find the one which is identical to the row given. A copy of the
......@@ -9508,6 +9562,8 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
goto err;
}
is_table_scan= true;
/* Continue until we find the right record or have made a full loop */
do
{
......@@ -9561,10 +9617,18 @@ int Rows_log_event::find_row(const Relay_log_info *rli)
goto err;
}
ok:
if (is_table_scan || is_index_scan)
issue_long_find_row_warning(get_type_code(), m_table->alias,
is_index_scan, rli);
table->default_column_bitmaps();
DBUG_RETURN(0);
err:
if (is_table_scan || is_index_scan)
issue_long_find_row_warning(get_type_code(), m_table->alias,
is_index_scan, rli);
table->default_column_bitmaps();
DBUG_RETURN(error);
}
......
......@@ -53,6 +53,7 @@
class String;
#define PREFIX_SQL_LOAD "SQL_LOAD-"
#define LONG_FIND_ROW_THRESHOLD 60 /* seconds */
/**
Either assert or return an error.
......
......@@ -1245,6 +1245,15 @@ void Relay_log_info::cleanup_context(THD *thd, bool error)
*/
thd->variables.option_bits&= ~OPTION_NO_FOREIGN_KEY_CHECKS;
thd->variables.option_bits&= ~OPTION_RELAXED_UNIQUE_CHECKS;
/*
Reset state related to long_find_row notes in the error log:
- timestamp
- flag that decides whether the slave prints or not
*/
reset_row_stmt_start_timestamp();
unset_long_find_row_note_printed();
DBUG_VOID_RETURN;
}
......
......@@ -452,8 +452,49 @@ class Relay_log_info : public Slave_reporting_capability
(m_flags & (1UL << IN_STMT));
}
time_t get_row_stmt_start_timestamp()
{
return row_stmt_start_timestamp;
}
time_t set_row_stmt_start_timestamp()
{
if (row_stmt_start_timestamp == 0)
row_stmt_start_timestamp= my_time(0);
return row_stmt_start_timestamp;
}
void reset_row_stmt_start_timestamp()
{
row_stmt_start_timestamp= 0;
}
void set_long_find_row_note_printed()
{
long_find_row_note_printed= true;
}
void unset_long_find_row_note_printed()
{
long_find_row_note_printed= false;
}
bool is_long_find_row_note_printed()
{
return long_find_row_note_printed;
}
private:
uint32 m_flags;
/*
Runtime state for printing a note when slave is taking
too long while processing a row event.
*/
time_t row_stmt_start_timestamp;
bool long_find_row_note_printed;
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment