Commit e2124156 authored by sjaakola's avatar sjaakola Committed by Jan Lindström

MDEV-25551 applying crash with tables without PK

The underlying problem with MDEV-25551 turned out to be that
transactions having changes for tables with no primary key,
were not safe to apply in parallel. This is due to excessive locking
in innodb side, and even non related row modifications could end up
in lock conflict during applying.

The fix for MDEV-25551 has disabled parallel applying for tables with no PK.
This fix depends on change for wsrep-lib, where a separate PR allows
application to modify transaction flags in wsrep-lib.

This commit has also separate mtr test for verifying that transactions
modifying a table with no primary key, will not apply in parallel.
This test is a modified version of initial test created by Gabor Orosz,
the reporterr of MDEV-25551.
Another mtr test was added in galera_sr suite, for testing if modifying
tables with no primary key would causes issues for streaming replication
use cases.
Reviewed-by: default avatarJan Lindström <jan.lindstrom@mariadb.com>
parent 1dea7f79
connection node_2;
connection node_1;
CREATE TABLE t1 (f1 VARCHAR(32) NOT NULL) ENGINE=InnoDB;
INSERT INTO t1 (f1) VALUES ('0e66c5227a8a');
INSERT INTO t1 (f1) VALUES ('c6c112992c9');
CREATE TABLE t2 (i int primary key);
connection node_2;
SET SESSION wsrep_sync_wait = 0;
SET GLOBAL wsrep_slave_threads = 2;
***************************************************************
scenario 1, conflicting UPDATE
***************************************************************
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync';
connection node_1;
START TRANSACTION;
UPDATE t1 SET f1='5ffceebfada' WHERE t1.f1 = 'c6c112992c9';
COMMIT;
connection node_2;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
connection node_1;
START TRANSACTION;
UPDATE t1 SET f1='4ffceebfcdc' WHERE t1.f1 = '0e66c5227a8a';
COMMIT;
connection node_2;
distance
1
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync';
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
***************************************************************
scenario 2, conflicting DELETE
***************************************************************
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync';
connection node_1;
START TRANSACTION;
INSERT INTO t2 VALUES (1);
DELETE FROM t1 WHERE f1='5ffceebfada';
COMMIT;
connection node_2;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
connection node_1;
START TRANSACTION;
INSERT INTO t2 VALUES (2);
DELETE FROM t1 WHERE f1='4ffceebfcdc';
COMMIT;
connection node_2;
distance
1
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync';
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
connection node_1;
SET GLOBAL wsrep_slave_threads = DEFAULT;
DROP TABLE t1;
DROP TABLE t2;
connection node_2;
SET GLOBAL wsrep_slave_threads = DEFAULT;
#
# This test is a modified version of Gabor Orosz (GOro) test in jira tracker:
# https://jira.mariadb.org/browse/MDEV-25551
#
# The underlying problem with MDEV-25551 turned out to be that
# transactions having changes for tables with no primary key,
# were not safe to apply in parallel. This is due to excessive locking
# in innodb side, and even non related row modifications could end up
# in lock conflict during applying.
#
# The test creates a table with no primary key definition and executes two
# transactions (in node1) modifying separate rows in the table. In node2
# first applier is paused before commit phase, and second transaction is
# then submitted to see if it can interfere with the first transaciton.
# The fix for MDEV-25551 has disabled parallel applying for tables with no PK,
# and in the test applying of the send trasnaction should not even start, before
# the fisrt trkansaction is released from the sync point.
# The test also verifies that certification depedency status reflects the fact
# that the two transactions depend on each other.
#
# The test has two scenarios where both UPDATE and DELETE statements are verified
# to disable parallel applying
#
--source include/galera_cluster.inc
--source include/have_debug_sync.inc
--source include/galera_have_debug_sync.inc
# Setup
CREATE TABLE t1 (f1 VARCHAR(32) NOT NULL) ENGINE=InnoDB;
INSERT INTO t1 (f1) VALUES ('0e66c5227a8a');
INSERT INTO t1 (f1) VALUES ('c6c112992c9');
CREATE TABLE t2 (i int primary key);
--connection node_2
SET SESSION wsrep_sync_wait = 0;
--let $wait_condition = SELECT COUNT(*)=2 FROM t1;
--source include/wait_condition.inc
# Ensure that we have enough applier threads to process transactions in parallel
SET GLOBAL wsrep_slave_threads = 2;
--echo ***************************************************************
--echo scenario 1, conflicting UPDATE
--echo ***************************************************************
# Set up a synchronization point to catch the first transaction
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_set_sync_point.inc
--connection node_1
# Invoke the first transaction
START TRANSACTION;
UPDATE t1 SET f1='5ffceebfada' WHERE t1.f1 = 'c6c112992c9';
COMMIT;
--connection node_2
# Wait for the first transaction to apply until commit phase
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
# remember status for received replication counter and certification dependency distance
--let $expected_wsrep_received = `SELECT VARIABLE_VALUE+1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_received'`
--let $cert_deps_distance = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'`
--connection node_1
# Invoke the second transaction
START TRANSACTION;
UPDATE t1 SET f1='4ffceebfcdc' WHERE t1.f1 = '0e66c5227a8a';
COMMIT;
# sleep is probably obsolete here, but it is good to give the latter update time to
# proceed in applying in node 2. In buggy version the update will start applying
# and cause conflict there.
--sleep 5
--connection node_2
# Wait for the second transaction to appear in repliaction queue
--let $wait_condition = SELECT VARIABLE_VALUE= $expected_wsrep_received FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_received';
--source include/wait_condition.inc
# verify that certification dependency distance has dropped
--disable_query_log
--eval SELECT VARIABLE_VALUE < $cert_deps_distance as 'distance' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'
--enable_query_log
# if deps distance dropped, it is indirect evidence that parallel applying was not approved
# Let the first transaction to proceed
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_signal_sync_point.inc
# second applier should now hit sync point
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
--echo ***************************************************************
--echo scenario 2, conflicting DELETE
--echo ***************************************************************
# Set up a synchronization point to catch the first transaction
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_set_sync_point.inc
--connection node_1
# Invoke the first transaction, mix this with insert to table having PK
START TRANSACTION;
INSERT INTO t2 VALUES (1);
DELETE FROM t1 WHERE f1='5ffceebfada';
COMMIT;
--connection node_2
# Wait for the first transaction to apply until commit phase
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
# remember status for received replication counter and certification dependency distance
--let $expected_wsrep_received = `SELECT VARIABLE_VALUE+1 FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_received'`
--let $cert_deps_distance = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'`
--connection node_1
# Invoke the second transaction, again mix this with insert to table having PK
START TRANSACTION;
INSERT INTO t2 VALUES (2);
DELETE FROM t1 WHERE f1='4ffceebfcdc';
COMMIT;
# sleep is probably obsolete here, but it is good to give the latter update time to
# proceed in applying in node 2. In buggy version the update will start applying
# and cause conflict there.
--sleep 5
--connection node_2
# Wait for the second transaction to appear in repliaction queue
--let $wait_condition = SELECT VARIABLE_VALUE= $expected_wsrep_received FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_received';
--source include/wait_condition.inc
# verify that certification dependency distance has dropped
--disable_query_log
--eval SELECT VARIABLE_VALUE < $cert_deps_distance as 'distance' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'
--enable_query_log
# if deps distance dropped, it is indirect evidence that parallel applying was not approved
# Let the first transaction to proceed
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_signal_sync_point.inc
# second applier should now hit sync point
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
# Teardown
--connection node_1
SET GLOBAL wsrep_slave_threads = DEFAULT;
DROP TABLE t1;
DROP TABLE t2;
--connection node_2
SET GLOBAL wsrep_slave_threads = DEFAULT;
connection node_2;
connection node_1;
connection node_2;
SET SESSION wsrep_sync_wait = 0;
SET GLOBAL wsrep_slave_threads = 2;
flush status;
connection node_1;
CREATE TABLE t1 (f1 int, f2 int) ENGINE=InnoDB;
CREATE TABLE t2 (f1 int primary key, f2 int) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1,0);
INSERT INTO t1 VALUES (2,0);
INSERT INTO t2 VALUES (1,0);
INSERT INTO t2 VALUES (2,0);
connection node_2;
connection node_1;
set session wsrep_trx_fragment_size=1;
START TRANSACTION;
UPDATE t1 SET f2=1 where f1=1;
connection node_2;
distance
1
SET GLOBAL wsrep_provider_options = 'dbug=d,commit_monitor_slave_enter_sync';
connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1;
connection node_1a;
update t2 set f2=1 where f1=1;
connection node_2;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'dbug=d,apply_monitor_slave_enter_sync';
connection node_1;
UPDATE t2 set f2=2 where f1=2;
connection node_2;
SET SESSION wsrep_on = 0;
SET SESSION wsrep_on = 1;
SET GLOBAL wsrep_provider_options = 'signal=commit_monitor_slave_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
SET GLOBAL wsrep_provider_options = 'signal=apply_monitor_slave_enter_sync';
SET GLOBAL wsrep_provider_options = 'dbug=';
connection node_1;
COMMIT;
connection node_1;
SET GLOBAL wsrep_slave_threads = DEFAULT;
DROP TABLE t1;
DROP TABLE t2;
connection node_2;
SET GLOBAL wsrep_slave_threads = DEFAULT;
#
# This test is a modified version of Gabor Orosz (GOro) test in jira tracker:
# https://jira.mariadb.org/browse/MDEV-25551
#
# The underlying problem with MDEV-25551 turned out to be that
# transactions having changes for tables with no primary key,
# were not safe to apply in parallel. This is due to excessive locking
# in innodb side, and even non related row modifications could end up
# in lock conflict during applying.
#
# The test verifies that a transaction executing a streaming replication
# will disable parallel applying if it modifies a table with no primary key.
# And, if PA was disabled temporarily, it will be relaxed if next fragment
# contains changes for table with primary key.
#
--source include/galera_cluster.inc
--source include/have_debug_sync.inc
--source include/galera_have_debug_sync.inc
# Setup
--connection node_2
SET SESSION wsrep_sync_wait = 0;
# Ensure that we have enough applier threads to process transactions in parallel
SET GLOBAL wsrep_slave_threads = 2;
flush status;
--connection node_1
CREATE TABLE t1 (f1 int, f2 int) ENGINE=InnoDB;
CREATE TABLE t2 (f1 int primary key, f2 int) ENGINE=InnoDB;
INSERT INTO t1 VALUES (1,0);
INSERT INTO t1 VALUES (2,0);
INSERT INTO t2 VALUES (1,0);
INSERT INTO t2 VALUES (2,0);
--connection node_2
--let $wait_condition = SELECT COUNT(*)=2 FROM t2;
--source include/wait_condition.inc
# remember status for received replication counter and certification dependency distance
--let $cert_deps_distance = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'`
--connection node_1
# Invoke the first transaction
set session wsrep_trx_fragment_size=1;
START TRANSACTION;
UPDATE t1 SET f2=1 where f1=1;
--connection node_2
# verify that certification dependency distance has dropped
--disable_query_log
--eval SELECT VARIABLE_VALUE < $cert_deps_distance as 'distance' FROM INFORMATION_SCHEMA.GLOBAL_STATUS WHERE VARIABLE_NAME = 'wsrep_cert_deps_distance'
--enable_query_log
# if deps distance dropped, it is indirect evidence that parallel applying was not approved
# Try next that PA retricting is relaxed, if next fragment updates table t1 with primary key
# wsrep_cert_deps_distance cannot be trsuted in this test phase, we verify parallel applying
# by setting sync point for applier thread
# Set up a synchronization point to catch update on t2
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_set_sync_point.inc
--connect node_1a, 127.0.0.1, root, , test, $NODE_MYPORT_1
--connection node_1a
update t2 set f2=1 where f1=1;
--connection node_2
# Wait for the update t2 to apply until commit phase
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
# Set up a synchronization point to catch the SR trx applying
--let $galera_sync_point = apply_monitor_slave_enter_sync
--source include/galera_set_sync_point.inc
--connection node_1
# continue SR transaction, and now update t2, which has PK
UPDATE t2 set f2=2 where f1=2;
--connection node_2
# Wait for the update t2 to apply until commit phase
--let $galera_sync_point = apply_monitor_slave_enter_sync commit_monitor_slave_enter_sync
--source include/galera_wait_sync_point.inc
# Let the first transaction to proceed
--let $galera_sync_point = commit_monitor_slave_enter_sync
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
--let $galera_sync_point = apply_monitor_slave_enter_sync
--source include/galera_signal_sync_point.inc
--source include/galera_clear_sync_point.inc
--connection node_1
COMMIT;
# Teardown
--connection node_1
SET GLOBAL wsrep_slave_threads = DEFAULT;
DROP TABLE t1;
DROP TABLE t2;
--connection node_2
SET GLOBAL wsrep_slave_threads = DEFAULT;
......@@ -6808,8 +6808,21 @@ int handler::ha_update_row(const uchar *old_data, const uchar *new_data)
rows_changed++;
error= binlog_log_row(table, old_data, new_data, log_func);
#ifdef WITH_WSREP
THD *thd= ha_thd();
bool is_wsrep= WSREP(thd);
/* for SR, the followin wsrep_after_row() may replicate a fragment, so we have to
declare potential PA unsafe before that*/
if (table->s->primary_key == MAX_KEY &&
is_wsrep && wsrep_thd_is_local(thd))
{
WSREP_DEBUG("marking trx as PA unsafe pk %d", table->s->primary_key);
if (thd->wsrep_cs().mark_transaction_pa_unsafe())
{
WSREP_DEBUG("session does not have active transaction, can not mark as PA unsafe");
}
}
if (table_share->tmp_table == NO_TMP_TABLE &&
WSREP(ha_thd()) && (error= wsrep_after_row(ha_thd())))
is_wsrep && (error= wsrep_after_row(thd)))
{
return error;
}
......@@ -6870,8 +6883,21 @@ int handler::ha_delete_row(const uchar *buf)
rows_changed++;
error= binlog_log_row(table, buf, 0, log_func);
#ifdef WITH_WSREP
THD *thd= ha_thd();
bool is_wsrep= WSREP(thd);
/* for SR, the followin wsrep_after_row() may replicate a fragment, so we have to
declare potential PA unsafe before that*/
if (table->s->primary_key == MAX_KEY &&
is_wsrep && wsrep_thd_is_local(thd))
{
WSREP_DEBUG("marking trx as PA unsafe pk %d", table->s->primary_key);
if (thd->wsrep_cs().mark_transaction_pa_unsafe())
{
WSREP_DEBUG("session does not have active transaction, can not mark as PA unsafe");
}
}
if (table_share->tmp_table == NO_TMP_TABLE &&
WSREP(ha_thd()) && (error= wsrep_after_row(ha_thd())))
is_wsrep && (error= wsrep_after_row(thd)))
{
return error;
}
......
Subproject commit f271ad0c6e3c647df83c1d5ec9cd26d77cef2337
Subproject commit 85b815032145ef5c18b7a8931d84becf6df8cd18
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment