Commit 515b2203 authored by Ajo Robert's avatar Ajo Robert

Bug #18075170 SQL NODE RESTART REQUIRED TO

AVOID DEADLOCK AFTER RESTORE

Analysis
--------
Accessing a restored NDB table in an active multi-statement
transaction resulted in a "Deadlock found" error.

MySQL Server needs to discover the metadata of an NDB table from
the data nodes after the table is restored from backup. Metadata
discovery happens on the first access to the restored table.
The current code mandates that this statement be the first one
in the transaction, because discovery needs an exclusive
metadata lock on the table. A lock upgrade at this point can
lead to an MDL deadlock, and the code was written at a time
when the MDL deadlock detector was not present. When
discovery is attempted in a statement other than the first
one in the transaction, an ER_LOCK_DEADLOCK error is reported
pessimistically.

Fix:
---
Removed the constraint, as any potential deadlock will be
handled by the deadlock detector. Also changed the discovery code
to keep the metadata locks of the active transaction.

The same issue was present in the table auto-repair scenario. The
same fix is applied in the repair path as well.
parent e7b6e814
# #
# Test of MyISAM MRG tables with corrupted children. # Tests for corrupted MyISAM tables and MyISAMMRG tables with corrupted
# children..
#
# Run with --myisam-recover=force option. # Run with --myisam-recover=force option.
# #
# Preparation: we need to make sure that the merge parent # Preparation: we need to make sure that the merge parent
...@@ -44,20 +46,20 @@ drop procedure p_create; ...@@ -44,20 +46,20 @@ drop procedure p_create;
# Switching to connection 'default' # Switching to connection 'default'
# #
# #
# We have to disable the ps-protocol, to avoid # We have to disable the ps-protocol, to avoid
# "Prepared statement needs to be re-prepared" errors # "Prepared statement needs to be re-prepared" errors
# -- table def versions change all the time with full table cache. # -- table def versions change all the time with full table cache.
# #
drop table if exists t1, t1_mrg, t1_copy; drop table if exists t1, t1_mrg, t1_copy;
# #
# Prepare a MERGE engine table, that refers to a corrupted # Prepare a MERGE engine table, that refers to a corrupted
# child. # child.
# #
create table t1 (a int, key(a)) engine=myisam; create table t1 (a int, key(a)) engine=myisam;
create table t1_mrg (a int) union (t1) engine=merge; create table t1_mrg (a int) union (t1) engine=merge;
# #
# Create a table with a corrupted index file: # Create a table with a corrupted index file:
# save an old index file, insert more rows, # save an old index file, insert more rows,
# overwrite the new index file with the old one. # overwrite the new index file with the old one.
# #
insert into t1 (a) values (1), (2), (3); insert into t1 (a) values (1), (2), (3);
...@@ -101,3 +103,48 @@ execute stmt; ...@@ -101,3 +103,48 @@ execute stmt;
deallocate prepare stmt; deallocate prepare stmt;
set @@global.table_definition_cache=default; set @@global.table_definition_cache=default;
set @@global.table_open_cache=default; set @@global.table_open_cache=default;
#
# 18075170 - sql node restart required to avoid deadlock after
# restore
#
# Check that auto-repair for MyISAM tables can now happen in the
# middle of transaction, without aborting it.
create table t1 (a int, key(a)) engine=myisam;
create table t2 (a int);
insert into t2 values (1);
# Create a table with a corrupted index file:
# save an old index file, insert more rows,
# overwrite the new index file with the old one.
insert into t1 (a) values (1);
flush table t1;
insert into t1 (a) values (4);
flush table t1;
# Check table is needed to mark the table as crashed.
check table t1;
Table Op Msg_type Msg_text
test.t1 check warning Size of datafile is: 14 Should be: 7
test.t1 check error Record-count is not ok; is 2 Should be: 1
test.t1 check warning Found 2 key parts. Should be: 1
test.t1 check error Corrupt
# At this point we have a corrupt t1
set autocommit = 0;
select * from t2;
a
1
# Without fix select from t1 will break the transaction. After the fix
# transaction should be active and should hold lock on table t2. Alter
# table from con2 will wait only if the transaction is not broken.
select * from t1;
a
1
4
Warnings:
Error 145 Table './test/t1' is marked as crashed and should be repaired
Error 1194 Table 't1' is marked as crashed and should be repaired
Error 1034 Number of rows changed from 1 to 2
ALTER TABLE t2 ADD val INT;
# With fix we should have alter table waiting for t2 lock here.
ROLLBACK;
SET autocommit = 1;
# Cleanup
drop table t1, t2;
#
# 18075170 - sql node restart required to avoid deadlock after
# restore
#
CREATE TABLE t1 (id INT) ENGINE=NDBCluster;
CREATE TABLE t2 (id INT) ENGINE=NDBCluster;
INSERT INTO t1 VALUES (1);
INSERT INTO t2 VALUES (1);
DROP TABLE t1;
DROP TABLE t2;
SET autocommit = 0;
SELECT * FROM t1;
id
1
SELECT * FROM t2;
id
1
ROLLBACK;
SET autocommit = 1;
drop table t1;
drop table t2;
SET autocommit = 0;
SELECT * FROM t1;
id
1
SELECT * FROM t2;
id
1
ALTER TABLE t1 ADD val INT;
ROLLBACK;
SET autocommit = 1;
drop table t1;
drop table t2;
# Regression test for Bug #18075170: accessing NDB tables that were restored
# from backup inside a multi-statement transaction must not fail with a
# deadlock error, and metadata locks taken by earlier statements of the
# transaction must survive the discovery of a restored table.
-- source include/have_ndb.inc
-- source include/count_sessions.inc
--echo #
--echo # 18075170 - sql node restart required to avoid deadlock after
--echo # restore
--echo #
# Test the auto-discover path within a transaction
# and make sure the transaction is not broken.
CREATE TABLE t1 (id INT) ENGINE=NDBCluster;
CREATE TABLE t2 (id INT) ENGINE=NDBCluster;
INSERT INTO t1 VALUES (1);
INSERT INTO t2 VALUES (1);
# NOTE(review): ndb_backup.inc presumably takes the backup that the restore
# steps below rely on -- confirm against the include file.
-- source include/ndb_backup.inc
# Drop both tables so that, after the restore, the server has to discover
# them again on first access.
DROP TABLE t1;
DROP TABLE t2;
-- source include/ndb_restore_master.inc
SET autocommit = 0;
# First statement of the transaction: t1 is accessed for the first time
# after the restore.
SELECT * FROM t1;
# Without the fix, the select below resulted in a DEADLOCK error because t2
# is first accessed in a statement that is not the first one of the
# transaction. With the fix, the select should succeed.
SELECT * FROM t2;
ROLLBACK;
SET autocommit = 1;
drop table t1;
drop table t2;
#
# Checking lock preservation in transaction
#
# Reuse the existing backup to create the scenario. The tables were dropped as
# part of the cleanup of the test above, so restoring the backup brings the
# system to the required state.
-- source include/ndb_restore_master.inc
SET autocommit = 0;
SELECT * FROM t1;
SELECT * FROM t2;
# Issue the ALTER from a second connection without waiting for its result,
# so that the default connection can observe it while it is blocked.
connect(con2, localhost, root);
--SEND ALTER TABLE t1 ADD val INT
connection default;
# The ALTER from con2 will be in a waiting state because the active
# transaction on the default connection holds a lock on t1. We check for this
# condition and then release the lock by rolling back the active transaction.
let $wait_condition=
SELECT count(*) = 1 FROM information_schema.processlist WHERE state
LIKE "Waiting%" AND info = "ALTER TABLE t1 ADD val INT";
--source include/wait_condition.inc
ROLLBACK;
SET autocommit = 1;
# Collect the result of the ALTER that was sent earlier from con2.
connection con2;
--REAP
disconnect con2;
connection default;
drop table t1;
drop table t2;
# Wait until all disconnects are completed
-- source include/wait_until_count_sessions.inc
--source include/count_sessions.inc
--echo #
--echo # Tests for corrupted MyISAM tables and MyISAMMRG tables with corrupted
--echo # children..
--echo # --echo #
--echo # Test of MyISAM MRG tables with corrupted children.
--echo # Run with --myisam-recover=force option. --echo # Run with --myisam-recover=force option.
--echo # --echo #
--echo # Preparation: we need to make sure that the merge parent --echo # Preparation: we need to make sure that the merge parent
...@@ -57,10 +61,10 @@ eval $lock; ...@@ -57,10 +61,10 @@ eval $lock;
--echo # --echo #
connection default; connection default;
--echo # --echo #
--echo # We have to disable the ps-protocol, to avoid --echo # We have to disable the ps-protocol, to avoid
--echo # "Prepared statement needs to be re-prepared" errors --echo # "Prepared statement needs to be re-prepared" errors
--echo # -- table def versions change all the time with full table cache. --echo # -- table def versions change all the time with full table cache.
--echo # --echo #
--disable_ps_protocol --disable_ps_protocol
--disable_warnings --disable_warnings
drop table if exists t1, t1_mrg, t1_copy; drop table if exists t1, t1_mrg, t1_copy;
...@@ -69,12 +73,12 @@ let $MYSQLD_DATADIR=`select @@datadir`; ...@@ -69,12 +73,12 @@ let $MYSQLD_DATADIR=`select @@datadir`;
--echo # --echo #
--echo # Prepare a MERGE engine table, that refers to a corrupted --echo # Prepare a MERGE engine table, that refers to a corrupted
--echo # child. --echo # child.
--echo # --echo #
create table t1 (a int, key(a)) engine=myisam; create table t1 (a int, key(a)) engine=myisam;
create table t1_mrg (a int) union (t1) engine=merge; create table t1_mrg (a int) union (t1) engine=merge;
--echo # --echo #
--echo # Create a table with a corrupted index file: --echo # Create a table with a corrupted index file:
--echo # save an old index file, insert more rows, --echo # save an old index file, insert more rows,
--echo # overwrite the new index file with the old one. --echo # overwrite the new index file with the old one.
--echo # --echo #
insert into t1 (a) values (1), (2), (3); insert into t1 (a) values (1), (2), (3);
...@@ -111,3 +115,64 @@ set @@global.table_open_cache=default; ...@@ -111,3 +115,64 @@ set @@global.table_open_cache=default;
disconnect con1; disconnect con1;
connection default; connection default;
--enable_ps_protocol --enable_ps_protocol
--echo #
--echo # 18075170 - sql node restart required to avoid deadlock after
--echo # restore
--echo #
--echo # Check that auto-repair for MyISAM tables can now happen in the
--echo # middle of transaction, without aborting it.
connection default;
create table t1 (a int, key(a)) engine=myisam;
create table t2 (a int);
insert into t2 values (1);
--echo # Create a table with a corrupted index file:
--echo # save an old index file, insert more rows,
--echo # overwrite the new index file with the old one.
insert into t1 (a) values (1);
flush table t1;
--copy_file $MYSQLD_DATADIR/test/t1.MYI $MYSQLD_DATADIR/test/t1_copy.MYI
insert into t1 (a) values (4);
flush table t1;
--remove_file $MYSQLD_DATADIR/test/t1.MYI
--copy_file $MYSQLD_DATADIR/test/t1_copy.MYI $MYSQLD_DATADIR/test/t1.MYI
--remove_file $MYSQLD_DATADIR/test/t1_copy.MYI
--echo # Check table is needed to mark the table as crashed.
check table t1;
--echo # At this point we have a corrupt t1
set autocommit = 0;
select * from t2;
--echo # Without fix select from t1 will break the transaction. After the fix
--echo # transaction should be active and should hold lock on table t2. Alter
--echo # table from con2 will wait only if the transaction is not broken.
select * from t1;
connect(con2, localhost, root);
--SEND ALTER TABLE t2 ADD val INT
connection default;
--echo # With fix we should have alter table waiting for t2 lock here.
let $wait_condition=
SELECT count(*) = 1 FROM information_schema.processlist WHERE state
LIKE "Waiting%" AND info = "ALTER TABLE t2 ADD val INT";
--source include/wait_condition.inc
ROLLBACK;
SET autocommit = 1;
connection con2;
--REAP
connection default;
disconnect con2;
--echo # Cleanup
drop table t1, t2;
# Wait till all disconnects are completed
-- source include/wait_until_count_sessions.inc
...@@ -3972,10 +3972,11 @@ request_backoff_action(enum_open_table_action action_arg, ...@@ -3972,10 +3972,11 @@ request_backoff_action(enum_open_table_action action_arg,
* We met a broken table that needs repair, or a table that * We met a broken table that needs repair, or a table that
is not present on this MySQL server and needs re-discovery. is not present on this MySQL server and needs re-discovery.
To perform the action, we need an exclusive metadata lock on To perform the action, we need an exclusive metadata lock on
the table. Acquiring an X lock while holding other shared the table. Acquiring X lock while holding other shared
locks is very deadlock-prone. If this is a multi- statement locks can easily lead to deadlocks. We rely on MDL deadlock
transaction that holds metadata locks for completed detector to discover them. If this is a multi-statement
statements, we don't do it, and report an error instead. transaction that holds metadata locks for completed statements,
we should keep these locks after discovery/repair.
The action type in this case is OT_DISCOVER or OT_REPAIR. The action type in this case is OT_DISCOVER or OT_REPAIR.
* Our attempt to acquire an MDL lock lead to a deadlock, * Our attempt to acquire an MDL lock lead to a deadlock,
detected by the MDL deadlock detector. The current detected by the MDL deadlock detector. The current
...@@ -4016,7 +4017,7 @@ request_backoff_action(enum_open_table_action action_arg, ...@@ -4016,7 +4017,7 @@ request_backoff_action(enum_open_table_action action_arg,
keep tables open between statements and a livelock keep tables open between statements and a livelock
is not possible. is not possible.
*/ */
if (action_arg != OT_REOPEN_TABLES && m_has_locks) if (action_arg == OT_BACKOFF_AND_RETRY && m_has_locks)
{ {
my_error(ER_LOCK_DEADLOCK, MYF(0)); my_error(ER_LOCK_DEADLOCK, MYF(0));
m_thd->mark_transaction_to_rollback(true); m_thd->mark_transaction_to_rollback(true);
...@@ -4043,6 +4044,32 @@ request_backoff_action(enum_open_table_action action_arg, ...@@ -4043,6 +4044,32 @@ request_backoff_action(enum_open_table_action action_arg,
} }
/**
  Internal error handler installed while a table is being discovered or
  repaired.

  Its only job is to react to ER_LOCK_DEADLOCK: when that error is raised,
  the current transaction is marked for rollback. The handler never
  consumes a condition, so every error is still reported to the user or
  passed on to the other installed handlers.
*/
class MDL_deadlock_discovery_repair_handler : public Internal_error_handler
{
public:
  /**
    Inspect a raised condition.

    @param thd        Session the condition was raised in; its transaction
                      is marked for rollback on a deadlock error.
    @param sql_errno  Error number of the condition; the only argument
                      besides thd that is examined.
    @param sqlstate   Unused.
    @param level      Unused.
    @param msg        Unused.
    @param cond_hdl   Unused.

    @return Always false, i.e. the condition is not treated as handled.
  */
  virtual bool handle_condition(THD *thd,
                                uint sql_errno,
                                const char* sqlstate,
                                MYSQL_ERROR::enum_warning_level level,
                                const char* msg,
                                MYSQL_ERROR ** cond_hdl)
  {
    const bool is_deadlock= (sql_errno == ER_LOCK_DEADLOCK);

    /* Only a deadlock error marks the transaction for rollback. */
    if (is_deadlock)
      thd->mark_transaction_to_rollback(true);

    /*
      Never claim the condition as handled: the error must still be
      reported or processed by other handlers.
    */
    return false;
  }
};
/** /**
Recover from failed attempt of open table by performing requested action. Recover from failed attempt of open table by performing requested action.
...@@ -4058,6 +4085,12 @@ Open_table_context:: ...@@ -4058,6 +4085,12 @@ Open_table_context::
recover_from_failed_open() recover_from_failed_open()
{ {
bool result= FALSE; bool result= FALSE;
MDL_deadlock_discovery_repair_handler handler;
/*
Install error handler to mark transaction to rollback on DEADLOCK error.
*/
m_thd->push_internal_handler(&handler);
/* Execute the action. */ /* Execute the action. */
switch (m_action) switch (m_action)
{ {
...@@ -4079,7 +4112,12 @@ recover_from_failed_open() ...@@ -4079,7 +4112,12 @@ recover_from_failed_open()
m_thd->warning_info->clear_warning_info(m_thd->query_id); m_thd->warning_info->clear_warning_info(m_thd->query_id);
m_thd->clear_error(); // Clear error message m_thd->clear_error(); // Clear error message
m_thd->mdl_context.release_transactional_locks(); /*
Rollback to start of the current statement to release exclusive lock
on table which was discovered but preserve locks from previous statements
in current transaction.
*/
m_thd->mdl_context.rollback_to_savepoint(start_of_statement_svp());
break; break;
} }
case OT_REPAIR: case OT_REPAIR:
...@@ -4093,12 +4131,18 @@ recover_from_failed_open() ...@@ -4093,12 +4131,18 @@ recover_from_failed_open()
m_failed_table->table_name, FALSE); m_failed_table->table_name, FALSE);
result= auto_repair_table(m_thd, m_failed_table); result= auto_repair_table(m_thd, m_failed_table);
m_thd->mdl_context.release_transactional_locks(); /*
Rollback to start of the current statement to release exclusive lock
on table which was repaired but preserve locks from previous statements
in current transaction.
*/
m_thd->mdl_context.rollback_to_savepoint(start_of_statement_svp());
break; break;
} }
default: default:
DBUG_ASSERT(0); DBUG_ASSERT(0);
} }
m_thd->pop_internal_handler();
/* /*
Reset the pointers to conflicting MDL request and the Reset the pointers to conflicting MDL request and the
TABLE_LIST element, set when we need auto-discovery or repair, TABLE_LIST element, set when we need auto-discovery or repair,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment