Commit fe945067 authored by Sujatha

MDEV-25502: rpl.rpl_perfschema_applier_status_by_worker failed in bb with: Test assertion failed

Problem:
=======
The test fails with three different symptoms:
connection slave;
Assertion text: 'Last_Seen_Transaction should show .'
Assertion condition: '"0-1-1" = ""'
Assertion condition, interpolated: '"0-1-1" = ""'
Assertion result: '0'

connection slave;
Assertion text: 'Value returned by SSS and PS table for Last_Error_Number
                 should be same.'
Assertion condition: '"1146" = "0"'
Assertion condition, interpolated: '"1146" = "0"'
Assertion result: '0'

connection slave;
Assertion text: 'Value returned by PS table for worker_idle_time should be
                >= 1'
Assertion condition: '"0" >= "1"'
Assertion condition, interpolated: '"0" >= "1"'
Assertion result: '0'

Fix1:
====
The performance schema table's Last_Seen_Transaction column is compared with
'SELECT @@gtid_slave_pos'. Since DDLs are not transactional, the change to the
user table and the update of the gtid_slave_pos table are not guaranteed to be
synchronous. To fix the issue, the Gtid_IO_Pos value from the SHOW SLAVE STATUS
command is used instead to verify the correctness of the performance schema
specific Last_Seen_Transaction.

Fix2:
====
On error, the worker thread information is stored as part of a backup pool.
Access to this backup pool should be protected by the 'LOCK_rpl_thread_pool'
mutex so that a simultaneous START SLAVE cannot destroy the backup pool while
it is being queried by the performance schema.
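
As an illustration of this locking pattern, here is a minimal self-contained
sketch (not the server code: BackupPool, WorkerInfo, rebuild_backup_pool()
and scan_for_pfs() are made-up names, and std::mutex stands in for
LOCK_rpl_thread_pool). The writer that rebuilds the backup pool and the
reader that scans it on behalf of the performance schema take the same mutex:

  // Minimal sketch of the locking pattern only; not MariaDB code.
  #include <iostream>
  #include <mutex>
  #include <string>
  #include <utility>
  #include <vector>

  struct WorkerInfo
  {
    int last_error_number;
    std::string last_error_message;
  };

  struct BackupPool
  {
    std::mutex lock;                  // stands in for LOCK_rpl_thread_pool
    std::vector<WorkerInfo> workers;  // snapshot kept after workers stop
  };

  // START SLAVE destroys and rebuilds the snapshot, so it takes the lock.
  void rebuild_backup_pool(BackupPool &bkp, std::vector<WorkerInfo> fresh)
  {
    std::lock_guard<std::mutex> guard(bkp.lock);
    bkp.workers= std::move(fresh);
  }

  // The performance schema scan holds the same lock, so the snapshot cannot
  // be destroyed while it is being read.
  void scan_for_pfs(BackupPool &bkp)
  {
    std::lock_guard<std::mutex> guard(bkp.lock);
    for (const WorkerInfo &w : bkp.workers)
      std::cout << w.last_error_number << ": " << w.last_error_message << "\n";
  }

  int main()
  {
    BackupPool bkp;
    rebuild_backup_pool(bkp, {{1146, "Table 'test.t1' doesn't exist"}});
    scan_for_pfs(bkp);
    return 0;
  }

Here std::lock_guard is only the sketch's stand-in for the explicit
mysql_mutex_lock()/mysql_mutex_unlock() pairs used in the patch below.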

Fix3:
====
When a worker is waiting for events and the performance schema table is
queried, at present the table just returns the difference between current_time
and start_time. This is incorrect; it should be worker_idle_time +
(current_time - start_time).

For example, suppose a worker thread was idle for 10 seconds and then got
events to process. Upon completion it goes back to the idle state; if the pfs
table is queried now, it should return the current idle time plus the
accumulated worker_idle_time.
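
The intended accounting can be sketched as a small accumulator (again a
simplified, hypothetical model rather than the server code: IdleClock and
now_microseconds() are made-up names, while worker_idle_time, start_time and
get_worker_idle_time() mirror the members touched by the patch):

  // Simplified model of the idle-time accounting; not MariaDB code.
  #include <chrono>
  #include <cstdint>
  #include <iostream>
  #include <thread>

  static uint64_t now_microseconds()
  {
    using namespace std::chrono;
    return duration_cast<microseconds>(
        steady_clock::now().time_since_epoch()).count();
  }

  struct IdleClock
  {
    uint64_t worker_idle_time= 0;  // sum of all completed idle periods
    uint64_t start_time= 0;        // non-zero only while currently idle

    void begin_idle() { start_time= now_microseconds(); }

    void end_idle()                // events arrived: close the idle period
    {
      worker_idle_time+= now_microseconds() - start_time;
      start_time= 0;
    }

    // What the P_S table should report: while idle, the in-progress period
    // is added to the accumulated total instead of replacing it.
    uint64_t get_worker_idle_time() const
    {
      if (start_time)
        return worker_idle_time + (now_microseconds() - start_time);
      return worker_idle_time;
    }
  };

  int main()
  {
    IdleClock c;
    c.begin_idle();
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
    c.end_idle();                  // ~100000 microseconds accumulated
    c.begin_idle();                // idle again
    std::cout << c.get_worker_idle_time() << " microseconds idle so far\n";
    return 0;
  }

With this shape the example above works out: a worker that was idle for 10
seconds, processed events, and became idle again reports those 10 seconds
plus the length of the current idle period, not just the current period.
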
parent e3b33842
@@ -22,9 +22,11 @@ include/assert.inc [Service_State should be "ON" on a fresh slave server.]
 include/assert.inc [Last_Seen_Transaction should show "" if no transaction applierd]
 connection master;
 CREATE TABLE t1 (a INT) ENGINE=InnoDB;
+include/save_master_gtid.inc
 connection slave;
+include/sync_with_master_gtid.inc
 include/assert.inc [Channel_name must be slave1]
-include/assert.inc [Last_Seen_Transaction should show 0-1-1.]
+include/assert.inc [Last_Seen_Transaction should show 0-1-1]
 include/assert.inc [Value returned by SSS and PS table for Last_Error_Number should be same.]
 include/assert.inc [Value returned by SSS and PS table for Last_Error_Message should both be empty.]
 include/assert.inc [Value returned by PS table for Last_Error_Timestamp should be 0000-00-00 00:00:00.]
@@ -29,7 +29,7 @@
 # STOP SLAVE.
 #
 # ==== Related Bugs and Worklogs ====
-#
+#
 # MDEV-20220: Merge 5.7 P_S replication table 'replication_applier_status_by_worker
 #
 --source include/have_perfschema.inc
@@ -83,10 +83,10 @@ source include/assert.inc;
 --connection master
 CREATE TABLE t1 (a INT) ENGINE=InnoDB;
---save_master_pos
+--source include/save_master_gtid.inc
---connection slave
---sync_with_master 0,'slave1'
+connection slave;
+--source include/sync_with_master_gtid.inc
 let $ps_value= query_get_value(select channel_name from performance_schema.replication_applier_status_by_worker, channel_name, 1);
 let $assert_text= Channel_name must be slave1;
@@ -94,9 +94,9 @@ let $assert_cond= "$ps_value"= "slave1";
 source include/assert.inc;
 let $ps_value= query_get_value(select Last_Seen_Transaction from performance_schema.replication_applier_status_by_worker, Last_Seen_Transaction, 1);
-let $slave_gtid_pos= `SELECT @@gtid_slave_pos`;
-let $assert_text= Last_Seen_Transaction should show $slave_gtid_pos.;
-let $assert_cond= "$ps_value" = "$slave_gtid_pos";
+let $sss_value= query_get_value(SHOW SLAVE STATUS, Gtid_IO_Pos, 1);
+let $assert_text= Last_Seen_Transaction should show $sss_value;
+let $assert_cond= "$ps_value" = "$sss_value";
 source include/assert.inc;
 let $sss_value= query_get_value(SHOW SLAVE STATUS, Last_SQL_Errno, 1);
@@ -187,7 +187,7 @@ struct rpl_parallel_thread {
   ulonglong get_worker_idle_time()
   {
     if (start_time)
-      return compute_time_lapsed();
+      return (worker_idle_time + compute_time_lapsed());
     else
       return worker_idle_time;
   }
@@ -92,9 +92,9 @@ ha_rows table_replication_applier_status_by_worker::get_row_count()
 int table_replication_applier_status_by_worker::rnd_next(void)
 {
-  if (global_rpl_thread_pool.inited && global_rpl_thread_pool.count)
+  rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
+  if (pool->inited && pool->count)
   {
-    rpl_parallel_thread_pool *pool= &global_rpl_thread_pool;
     mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
     uint worker_count= pool->count;
     for (m_pos.set_at(&m_next_pos);
@@ -111,7 +111,8 @@ int table_replication_applier_status_by_worker::rnd_next(void)
   }
   else
   {
-    struct pool_bkp_for_pfs *bkp_pool= &global_rpl_thread_pool.pfs_bkp;
+    mysql_mutex_lock(&pool->LOCK_rpl_thread_pool);
+    struct pool_bkp_for_pfs *bkp_pool= &pool->pfs_bkp;
     if (bkp_pool->inited && bkp_pool->count)
     {
       for (m_pos.set_at(&m_next_pos);
@@ -121,9 +122,11 @@ int table_replication_applier_status_by_worker::rnd_next(void)
         rpl_parallel_thread *rpt= bkp_pool->rpl_thread_arr[m_pos.m_index];
         make_row(rpt);
         m_next_pos.set_after(&m_pos);
+        mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
         return 0;
       }
     }
+    mysql_mutex_unlock(&pool->LOCK_rpl_thread_pool);
   }
   return HA_ERR_END_OF_FILE;
 }