Commit d039346a authored by Kristian Nielsen's avatar Kristian Nielsen

MDEV-4991: GTID binlog indexing

Improve the performance of slave connect using B+-Tree indexes on each binlog
file. The index allows fast lookup of a GTID position to the corresponding
offset in the binlog file, as well as lookup of a position to find the
corresponding GTID position.

This eliminates a costly sequential scan of the starting binlog file
to find the GTID starting position when a slave connects. This is
especially costly if the binlog file is not cached in memory (IO
cost), or if it is encrypted or a lot of slaves connect simultaneously
(CPU cost).

The size of the index files is generally less than 1% of the binlog data, so
not expected to be an issue.

Most of the work writing the index is done as a background task, in
the binlog background thread. This minimises the performance impact on
transaction commit. A simple global mutex is used to protect index
reads and (background) index writes; this is fine as slave connect is
a relatively infrequent operation.

Here are the user-visible options and status variables. The feature is on by
default and is expected to need no tuning or configuration for most users.

binlog_gtid_index
  On by default. Can be used to disable the indexes for testing purposes.

binlog_gtid_index_page_size (default 4096)
  Page size to use for the binlog GTID index. This is the size of the nodes
  in the B+-tree used internally in the index. A very small page-size (64 is
  the minimum) will be less efficient, but can be used to stress the
  BTree-code during testing.

binlog_gtid_index_span_min (default 65536)
  Control sparseness of the binlog GTID index. If set to N, at most one
  index record will be added for every N bytes of binlog file written.
  This can be used to reduce the number of records in the index, at
  the cost only of having to scan a few more events in the binlog file
  before finding the target position

Two status variables are available to monitor the use of the GTID indexes:

  Binlog_gtid_index_hit
  Binlog_gtid_index_miss

The "hit" status increments for each successful lookup in a GTID index.
The "miss" increments when a lookup is not possible. This indicates that the
index file is missing (eg. binlog written by old server version
without GTID index support), or corrupt.
Signed-off-by: default avatarKristian Nielsen <knielsen@knielsen-hq.org>
parent 20741b92
......@@ -126,7 +126,7 @@ SET(SQL_EMBEDDED_SOURCES emb_qcache.cc libmysqld.c lib_sql.cc
../sql/sql_expression_cache.cc
../sql/my_apc.cc ../sql/my_apc.h
../sql/my_json_writer.cc ../sql/my_json_writer.h
../sql/rpl_gtid.cc
../sql/rpl_gtid.cc ../sql/gtid_index.cc
../sql/sql_explain.cc ../sql/sql_explain.h
../sql/sql_analyze_stmt.cc ../sql/sql_analyze_stmt.h
../sql/compat56.cc
......
......@@ -98,6 +98,17 @@ The following specify which files/extra groups are read (specified before remain
involve user-defined functions (i.e. UDFs) or the UUID()
function; for those, row-based binary logging is
automatically used.
--binlog-gtid-index Enable the creation of a GTID index for every binlog
file, and the use of such index for speeding up GTID
lookup in the binlog.
(Defaults to on; use --skip-binlog-gtid-index to disable.)
--binlog-gtid-index-page-size=#
Page size to use for the binlog GTID index.
--binlog-gtid-index-span-min=#
Control sparseness of the binlog GTID index. If set to N,
at most one index record will be added for every N bytes
of binlog file written, to reduce the size of the index.
Normally does not need tuning.
--binlog-ignore-db=name
Tells the master that updates to the given database
should not be logged to the binary log.
......@@ -1597,6 +1608,9 @@ binlog-direct-non-transactional-updates FALSE
binlog-expire-logs-seconds 0
binlog-file-cache-size 16384
binlog-format MIXED
binlog-gtid-index TRUE
binlog-gtid-index-page-size 4096
binlog-gtid-index-span-min 65536
binlog-legacy-event-pos FALSE
binlog-optimize-thread-scheduling TRUE
binlog-row-event-max-size 8192
......
SET GLOBAL binlog_gtid_index= 0;
SET GLOBAL binlog_gtid_index= 1;
SET @gtid1= @@gtid_binlog_pos;
CREATE TABLE t1 (a INT PRIMARY KEY);
SET @gtid2= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (1);
SET @gtid3= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (2);
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
SET @gtid4= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (5);
SET @gtid5= @@gtid_binlog_pos;
SET @gtid6= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (106);
INSERT INTO t1 VALUES (107);
Ok
1
Ok
1
Ok
1
Ok
1
Ok
1
Ok
1
FLUSH BINARY LOGS;
Ok
1
Ok
1
Ok
1
Ok
1
Ok
1
Ok
1
*** Test that purge deletes the gtid index files. ***
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (200);
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (201);
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (202);
PURGE BINARY LOGS TO 'FILE';
*** Test missed index lookup due to missing or corrupt index file.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
INSERT INTO t1 VALUES (301);
INSERT INTO t1 VALUES (302);
INSERT INTO t1 VALUES (303);
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (304);
INSERT INTO t1 VALUES (305);
FLUSH NO_WRITE_TO_BINLOG STATUS;
+++ Initial status:
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 0
Binlog_gtid_index_miss 0
+++ GTID Lookup in good index.
Gtid_Lookup_Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 1
Binlog_gtid_index_miss 0
+++ GTID Lookup, index file is missing.
Gtid_Lookup_Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 1
Binlog_gtid_index_miss 1
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
INSERT INTO t1 VALUES (306);
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (307);
INSERT INTO t1 VALUES (308);
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+++ GTID Lookup, first page of index is corrupt.
Gtid_Lookup_Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 1
Binlog_gtid_index_miss 2
SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 1;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
INSERT INTO t1 VALUES (310);
INSERT INTO t1 VALUES (311);
INSERT INTO t1 VALUES (312);
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (313);
INSERT INTO t1 VALUES (314);
INSERT INTO t1 VALUES (315);
INSERT INTO t1 VALUES (316);
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
+++ GTID Lookup, root page of index is corrupt.
Gtid_Lookup_Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 1
Binlog_gtid_index_miss 3
*** Test BINLOG_GTID_POS() with too-large offset.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
INSERT INTO t1 VALUES (401);
INSERT INTO t1 VALUES (402);
+++ Test the hot index.
SELECT BINLOG_GTID_POS('FILE', 100000000);
BINLOG_GTID_POS('FILE', 100000000)
NULL
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 2
Binlog_gtid_index_miss 3
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
+++ Test the cold index.
SELECT BINLOG_GTID_POS('FILE', 100000000);
BINLOG_GTID_POS('FILE', 100000000)
NULL
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 3
Binlog_gtid_index_miss 3
DROP TABLE t1;
*** Test that binlog GTID index is recovered after a crash.
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
Ok
1
Ok
1
Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 3
Binlog_gtid_index_miss 0
*** Crash the server, check that GTID index can be used after restart.
SET debug_dbug="d,crash_shutdown";
shutdown;
ERROR HY000: Lost connection to server during query
FLUSH NO_WRITE_TO_BINLOG STATUS;
Ok
1
Ok
1
Ok
1
SHOW STATUS LIKE 'binlog_gtid_index_%';
Variable_name Value
Binlog_gtid_index_hit 3
Binlog_gtid_index_miss 0
DROP TABLE t1;
--source include/have_binlog_format_mixed.inc
SET GLOBAL binlog_gtid_index= 0;
SET GLOBAL binlog_gtid_index= 1;
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
--let $pos1= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid1= @@gtid_binlog_pos;
CREATE TABLE t1 (a INT PRIMARY KEY);
--let $pos2= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid2= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (1);
--let $pos3= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid3= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (2);
INSERT INTO t1 VALUES (3);
INSERT INTO t1 VALUES (4);
--let $pos4= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid4= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (5);
--let $pos5= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid5= @@gtid_binlog_pos;
--disable_query_log
--let $i=0
while ($i < 100) {
eval INSERT INTO t1 VALUES (6 + $i);
inc $i;
}
--enable_query_log
--let $pos6= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid6= @@gtid_binlog_pos;
INSERT INTO t1 VALUES (106);
INSERT INTO t1 VALUES (107);
# Test first the hot and then the cold index.
--let $i= 0
while ($i < 2) {
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos1) = @gtid1 AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos2) = @gtid2 AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos3) = @gtid3 AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos4) = @gtid4 AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos5) = @gtid5 AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos6) = @gtid6 AS Ok;
--enable_query_log
inc $i;
if ($i == 1) {
FLUSH BINARY LOGS;
}
}
--echo *** Test that purge deletes the gtid index files. ***
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (200);
--let $file2= query_get_value(SHOW MASTER STATUS, File, 1)
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (201);
--let $file3= query_get_value(SHOW MASTER STATUS, File, 1)
FLUSH BINARY LOGS;
INSERT INTO t1 VALUES (202);
--let $file4= query_get_value(SHOW MASTER STATUS, File, 1)
--replace_result $file3 FILE
eval PURGE BINARY LOGS TO '$file3';
--let $MYSQLD_DATADIR= `select @@datadir`
--error 1
--file_exists $MYSQLD_DATADIR/$file.idx
--error 1
--file_exists $MYSQLD_DATADIR/$file2.idx
--file_exists $MYSQLD_DATADIR/$file3.idx
--file_exists $MYSQLD_DATADIR/$file4.idx
--echo *** Test missed index lookup due to missing or corrupt index file.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
INSERT INTO t1 VALUES (301);
INSERT INTO t1 VALUES (302);
INSERT INTO t1 VALUES (303);
--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (304);
INSERT INTO t1 VALUES (305);
# BINLOG_GTID_POS() has a side effect: it increments binlog_gtid_index_hit
--disable_ps2_protocol
FLUSH NO_WRITE_TO_BINLOG STATUS;
--echo +++ Initial status:
SHOW STATUS LIKE 'binlog_gtid_index_%';
--echo +++ GTID Lookup in good index.
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
--remove_file $MYSQLD_DATADIR/$file.idx
--echo +++ GTID Lookup, index file is missing.
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
INSERT INTO t1 VALUES (306);
--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (307);
INSERT INTO t1 VALUES (308);
# Rotate again so we hit an on-disk index file, not the "hot" index.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
# Corrupt the flag byte of the first page with an unused bit.
--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx
--perl
use strict;
use warnings;
use Fcntl qw(:DEFAULT :seek);
sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR
or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n";
# Corrupt the flag byte with an unused flag.
sysseek(F, 16, SEEK_SET)
or die "Cannot seek file: $!\n";
my $buf;
sysread(F, $buf, 1)
or die "Cannot read file: $!\n";
$buf= chr(ord($buf) | 0x80);
sysseek(F, 16, SEEK_SET)
or die "Cannot seek file: $!\n";
syswrite(F, $buf, 1) == 1
or die "Cannot write file: $!\n";
close F;
EOF
--echo +++ GTID Lookup, first page of index is corrupt.
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
# Corrupt the last byte of the root page.
# Set a small page-size so we test corruption in something not the header page.
SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 1;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
INSERT INTO t1 VALUES (310);
INSERT INTO t1 VALUES (311);
INSERT INTO t1 VALUES (312);
--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
SET @gtid_pos= @@GLOBAL.gtid_binlog_pos;
INSERT INTO t1 VALUES (313);
INSERT INTO t1 VALUES (314);
INSERT INTO t1 VALUES (315);
INSERT INTO t1 VALUES (316);
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
--let FILE_TO_CORRUPT= $MYSQLD_DATADIR/$file.idx
--perl
use strict;
use warnings;
use Fcntl qw(:DEFAULT :seek);
sysopen F, $ENV{FILE_TO_CORRUPT}, O_RDWR
or die "Cannot open file $ENV{FILE_TO_CORRUPT}: $!\n";
# Tricky: The index is written asynchroneously, it may still be incomplete.
# So wait for the file to be written completely with a root node at the end.
my $count= 0;
for (;;) {
my $end= sysseek(F, 0, SEEK_END);
if ($end > 0 && ($end % 64) == 0) {
# The index file is non-empty with a full page at the end, test if the
# root page has been fully written. This is seen as bit 2 (PAGE_FLAG_LAST)
# and bit 3 (PAGE_FLAG_ROOT) being set (0xc).
my $flag;
if (sysseek(F, -64, SEEK_CUR) &&
sysread(F, $flag, 1) &&
(ord($flag) & 0xc) == 0xc) {
last;
}
}
die "Timeout waiting for GTID index to be non-empty\n"
if ++$count >= 500;
# Simple way to do sub-second sleep.
select(undef, undef, undef, 0.050);
}
# Corrupt the flag byte with an unused flag.
sysseek(F, -2, SEEK_END)
or die "Cannot seek file: $!\n";
my $buf;
sysread(F, $buf, 1)
or die "Cannot read file: $!\n";
$buf= chr(ord($buf) ^ 0x4);
sysseek(F, -2, SEEK_END)
or die "Cannot seek file: $!\n";
syswrite(F, $buf, 1) == 1
or die "Cannot write file: $!\n";
close F;
EOF
--echo +++ GTID Lookup, root page of index is corrupt.
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos) = @gtid_pos AS Gtid_Lookup_Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
--echo *** Test BINLOG_GTID_POS() with too-large offset.
# New binlog to skip the now corrupted one.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
INSERT INTO t1 VALUES (401);
INSERT INTO t1 VALUES (402);
--echo +++ Test the hot index.
--replace_result $file FILE
eval SELECT BINLOG_GTID_POS('$file', 100000000);
SHOW STATUS LIKE 'binlog_gtid_index_%';
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
--echo +++ Test the cold index.
--replace_result $file FILE
eval SELECT BINLOG_GTID_POS('$file', 100000000);
SHOW STATUS LIKE 'binlog_gtid_index_%';
--enable_ps2_protocol
DROP TABLE t1;
--binlog-gtid-index-page-size=128 --binlog-gtid-index-span-min=1
--source include/have_innodb.inc
# Don't test this under valgrind, memory leaks will occur
--source include/not_valgrind.inc
# Avoid CrashReporter popup on Mac
--source include/not_crashrep.inc
# Binary must be compiled with debug for crash to occur
--source include/have_debug.inc
--source include/have_binlog_format_row.inc
# We have an .opt file that sets a small page size and disables sparseness,
# so we get something non-trivial in the GTID index even with a small amount
# of binlogged events.
--echo *** Test that binlog GTID index is recovered after a crash.
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
--disable_query_log
INSERT INTO t1 VALUES (0, 0);
INSERT INTO t1 VALUES (1, 0);
INSERT INTO t1 VALUES (2, 0);
--let $i= 10
while ($i < 20) {
eval INSERT INTO t1 VALUES ($i, 0);
inc $i;
}
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
--let $pos1= query_get_value(SHOW MASTER STATUS, Position, 1)
--let $gtid1= `SELECT @@gtid_binlog_pos`
while ($i < 30) {
eval INSERT INTO t1 VALUES ($i, 0);
inc $i;
}
--let $pos2= query_get_value(SHOW MASTER STATUS, Position, 1)
--let $gtid2= `SELECT @@gtid_binlog_pos`
while ($i < 40) {
eval INSERT INTO t1 VALUES ($i, 0);
inc $i;
}
--let $pos3= query_get_value(SHOW MASTER STATUS, Position, 1)
--let $gtid3= `SELECT @@gtid_binlog_pos`
INSERT INTO t1 VALUES (50, 0);
INSERT INTO t1 VALUES (51, 0);
--disable_ps2_protocol
FLUSH NO_WRITE_TO_BINLOG STATUS;
eval SELECT BINLOG_GTID_POS('$file', $pos1) = "$gtid1" AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos2) = "$gtid2" AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos3) = "$gtid3" AS Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
--enable_ps2_protocol
--echo *** Crash the server, check that GTID index can be used after restart.
--source include/crash_mysqld.inc
--disable_ps2_protocol
FLUSH NO_WRITE_TO_BINLOG STATUS;
--disable_query_log
eval SELECT BINLOG_GTID_POS('$file', $pos1) = "$gtid1" AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos2) = "$gtid2" AS Ok;
eval SELECT BINLOG_GTID_POS('$file', $pos3) = "$gtid3" AS Ok;
--enable_query_log
SHOW STATUS LIKE 'binlog_gtid_index_%';
--enable_ps2_protocol
DROP TABLE t1;
......@@ -6,6 +6,7 @@ connection server_2;
include/stop_slave.inc
CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS;
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'");
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error");
#####################################################
# Part 1: unencrypted master
#####################################################
......@@ -58,10 +59,11 @@ INSERT INTO table3_no_encryption SELECT NULL,NOW(),b FROM table3_no_encryption;
connection server_2;
start slave;
include/wait_for_slave_io_error.inc [errno=1236]
# Ensuring slave was unable to replicate any transactions..
# Ensuring slave was unable to replicate any encrypted transactions..
# ..success
SHOW TABLES;
Tables_in_test
table1_no_encryption
include/stop_slave_sql.inc
reset slave;
##########
......@@ -80,5 +82,7 @@ COUNT(*)
4
DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption;
connection server_2;
RESET MASTER;
SET GLOBAL gtid_slave_pos= '';
include/start_slave.inc
include/rpl_end.inc
......@@ -36,6 +36,7 @@
CHANGE MASTER TO MASTER_USE_GTID=SLAVE_POS;
--enable_connect_log
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not set up decryption for binlog.'");
call mtr.add_suppression(" Got fatal error 1236 from master when reading data from binary log: 'Could not decrypt binlog: encryption key error");
--echo #####################################################
--echo # Part 1: unencrypted master
......@@ -55,6 +56,7 @@ FLUSH BINARY LOGS;
SET binlog_format=ROW;
INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption;
INSERT INTO table1_no_encryption SELECT NULL,NOW(),b FROM table1_no_encryption;
--let $last_unencrypted_gtid= `SELECT @@gtid_binlog_pos`
# Make sure that binary logs are not encrypted
......@@ -120,11 +122,11 @@ start slave;
--let $slave_io_errno= 1236
--source include/wait_for_slave_io_error.inc
--echo # Ensuring slave was unable to replicate any transactions..
--echo # Ensuring slave was unable to replicate any encrypted transactions..
--let $gsp= `SELECT @@global.gtid_slave_pos`
if (`SELECT strcmp("$gsp","")`)
if (`SELECT strcmp("$gsp","$last_unencrypted_gtid")`)
{
die Slave without encryption configured should fail to read encrypted binlog;
die Slave without encryption configured should fail to read encrypted binlog (expected $last_unencrypted_gtid but got $gsp);
}
--echo # ..success
......@@ -151,5 +153,7 @@ DROP TABLE table1_no_encryption, table2_to_encrypt, table3_no_encryption;
--connection server_2
--disable_connect_log
RESET MASTER;
SET GLOBAL gtid_slave_pos= '';
--source include/start_slave.inc
--source include/rpl_end.inc
......@@ -14,6 +14,9 @@ CREATE TABLE t1(id int not null primary key) engine=innodb;
INSERT INTO t1 values (1);
show global variables like '%gtid%';
Variable_name Value
binlog_gtid_index ON
binlog_gtid_index_page_size 4096
binlog_gtid_index_span_min 65536
gtid_binlog_pos 1-11-2
gtid_binlog_state 1-11-2
gtid_cleanup_batch_size 64
......@@ -29,6 +32,9 @@ connection node_2;
SET SESSION wsrep_sync_wait=15;
show global variables like '%gtid%';
Variable_name Value
binlog_gtid_index ON
binlog_gtid_index_page_size 4096
binlog_gtid_index_span_min 65536
gtid_binlog_pos 0-12-1,1-11-2
gtid_binlog_state 0-12-1,1-11-2
gtid_cleanup_batch_size 64
......@@ -55,6 +61,9 @@ Error 1231 Variable 'server_id' can't be set to the value of '200'
INSERT INTO t1 values(2);
show global variables like '%gtid%';
Variable_name Value
binlog_gtid_index ON
binlog_gtid_index_page_size 4096
binlog_gtid_index_span_min 65536
gtid_binlog_pos 0-12-1,1-11-3
gtid_binlog_state 0-12-1,1-11-3
gtid_cleanup_batch_size 64
......@@ -69,6 +78,9 @@ wsrep_gtid_mode ON
connection node_1;
show global variables like '%gtid%';
Variable_name Value
binlog_gtid_index ON
binlog_gtid_index_page_size 4096
binlog_gtid_index_span_min 65536
gtid_binlog_pos 1-11-3
gtid_binlog_state 1-11-3
gtid_cleanup_batch_size 64
......
......@@ -8,12 +8,12 @@ wait/synch/mutex/sql/Ack_receiver::mutex YES YES
wait/synch/mutex/sql/Cversion_lock YES YES
wait/synch/mutex/sql/Delayed_insert::mutex YES YES
wait/synch/mutex/sql/Event_scheduler::LOCK_scheduler_state YES YES
wait/synch/mutex/sql/Gtid_index_writer::gtid_index_mutex YES YES
wait/synch/mutex/sql/gtid_waiting::LOCK_gtid_waiting YES YES
wait/synch/mutex/sql/hash_filo::lock YES YES
wait/synch/mutex/sql/HA_DATA_PARTITION::LOCK_auto_inc YES YES
wait/synch/mutex/sql/LOCK_active_mi YES YES
wait/synch/mutex/sql/LOCK_after_binlog_sync YES YES
wait/synch/mutex/sql/LOCK_audit_mask YES YES
select * from performance_schema.setup_instruments
where name like 'Wait/Synch/Rwlock/sql/%'
and name not in (
......
......@@ -23,6 +23,7 @@ from performance_schema.file_summary_by_instance
where file_name like "%master-%" order by file_name;
FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE
master-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY
master-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY
master-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY
select * from performance_schema.file_summary_by_instance
where file_name like "%slave-%" order by file_name;
......@@ -112,6 +113,7 @@ where file_name like "%slave-%"
order by file_name;
FILE_NAME EVENT_NAME COUNT_READ COUNT_WRITE SUM_NUMBER_OF_BYTES_READ SUM_NUMBER_OF_BYTES_WRITE
slave-bin.000001 wait/io/file/sql/binlog MANY MANY MANY MANY
slave-bin.000001.idx wait/io/file/sql/gtid_index NONE MANY NONE MANY
slave-bin.index wait/io/file/sql/binlog_index MANY MANY MANY MANY
slave-relay-bin.000001 wait/io/file/sql/relaylog MANY MANY MANY MANY
slave-relay-bin.000002 wait/io/file/sql/relaylog MANY MANY MANY MANY
......
# Include file for main test rpl.rpl_gtid_index.
# Test GTID indexes with given parameters.
#
# Parameters:
# $NUM_POS Number of GTIDs/binlog positions to create
# $NUM_DOMAIN Number of different domains to use
# $NUM_SERVER Number of different server_id to use
# $NUM_SLAVE_CONNECTS How many GTID slave connect positions to test
# $RND_SEED Random seed
--echo *** Testing $NUM_POS GTIDs with $NUM_SLAVE_CONNECTS test connects
--connection master
DELETE FROM t1 WHERE a >= 1000;
# Rotate binlogs to make new GTID index settings take effect.
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
# Prepare some random values, but deterministic between test runs.
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
eval
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor($NUM_DOMAIN*POW(rand($RND_SEED),2)),
100 + $NUM_SERVER*@tmp + floor($NUM_SERVER*rand($RND_SEED))
FROM seq_1_to_$NUM_POS;
# Let's check that the test data is deterministic.
# If this changes due to some server changes, it's fine, the .result can just
# be updated. But we want it to be identical between test runs on same code,
# to facilitate debugging test failures.
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
# Create some data for the binlog (and GTID index), recording the correct
# binlog positions and GTIDs.
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
--let $gtid= `SELECT @@last_gtid`
--source include/save_master_gtid.inc
--connection slave
--source include/sync_with_master_gtid.inc
--source include/stop_slave.inc
--connection master
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
--let $i= 0
--let $rotate_point= `SELECT floor($NUM_POS/2)`
--let $base_count= `SELECT COUNT(*) FROM t1`
--disable_query_log
while ($i < $NUM_POS) {
--let $file= query_get_value(SHOW MASTER STATUS, File, 1)
--let $pos= query_get_value(SHOW MASTER STATUS, Position, 1)
--let $gtid_pos= `SELECT @@gtid_binlog_pos`
--let $row_count= `SELECT $base_count + $i`
eval SET gtid_domain_id= (SELECT domain_id FROM rand_data WHERE idx=$i+1);
eval SET server_id= (SELECT server_id FROM rand_data WHERE idx=$i+1);
BEGIN;
eval INSERT INTO gtid_data(idx, gtid, gtid_pos, file, pos, row_count)
VALUES ($i, '$gtid', '$gtid_pos', '$file', $pos, $row_count);
eval INSERT INTO t1 VALUES ($i + 1000, 0);
COMMIT;
--let $gtid= `SELECT @@last_gtid`
inc $i;
if ($i==$rotate_point) {
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
}
}
--enable_query_log
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
# Test that BINLOG_GTID_POS returns correct positions for every GTID position.
--echo *** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
# Prepare to rewind the slave to this point to test again on same binlog.
--connection slave
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
--echo *** Now connect the slave to each position in turn, and test that
--echo *** the right amount of data is replicated at each point.
--let $old_silent= $keep_include_silent
--let $keep_include_silent= 1
--let $i= 0
--disable_query_log
while ($i < $NUM_POS) {
--connection master
--let $gtid_pos= `SELECT gtid_pos FROM gtid_data WHERE idx=$i`
--let $master_count= `SELECT row_count FROM gtid_data WHERE idx=$i`
--connection slave
--disable_result_log
eval START SLAVE UNTIL master_gtid_pos='$gtid_pos';
--enable_result_log
--let $res= `SELECT MASTER_GTID_WAIT('$gtid_pos')`
if ($res != 0) {
--die "FAIL: MASTER_GTID_WAIT($gtid_pos) returned $res, should have been 0"
}
--source include/wait_for_slave_to_stop.inc
--let $slave_count = `SELECT COUNT(*) FROM t1`
if ($master_count != $slave_count) {
SELECT * FROM gtid_data ORDER BY file, pos;
SELECT * FROM t1 ORDER BY a;
--die "Not all rows replicated. $master_count on master but $slave_count on slave."
}
--let $i= `SELECT $i + ceil($NUM_POS / $NUM_SLAVE_CONNECTS)`
}
--enable_query_log
--echo *** Test slave connecting to some GTID positions where the position in
--echo *** the master's binlog is different between the different domains.
--echo *** Revind the slave and test on the same binlog data from the master as before.
--connection slave
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
--let $i= 0
--disable_query_log
while ($i <= $NUM_DOMAIN) {
# Build a GTID position from GTIDs that are picked at different locations
# in the gtid_data table for each domain.
--connection master
let $until_pos=`
SELECT GROUP_CONCAT(gtid SEPARATOR ',')
FROM gtid_data
WHERE idx IN (
SELECT MAX(gtid_data.idx) AS pick
FROM gtid_data
INNER JOIN rand_data ON (rand_data.idx = gtid_data.idx)
WHERE gtid_data.idx*$NUM_DOMAIN <= (domain_id + $i)*$NUM_POS
GROUP BY domain_id
)`;
--connection slave
--disable_result_log
eval START SLAVE UNTIL master_gtid_pos='$until_pos';
--enable_result_log
--let $res= `SELECT MASTER_GTID_WAIT('$until_pos')`
if ($res != 0) {
--die "FAIL: MASTER_GTID_WAIT($until_pos) returned $res, should have been 0"
}
--source include/wait_for_slave_to_stop.inc
inc $i;
}
--enable_query_log
--let $keep_include_silent= $old_silent
# Check that everything was replicated (nothing skipped).
# We have one less row on the slave since the last UNTIL is the one before
# the master inserted the last row.
--connection master
--let $master_count= `SELECT COUNT(*)-1 FROM t1`
--connection slave
--let $slave_count= `SELECT COUNT(*) FROM t1`
if ($master_count != $slave_count) {
SELECT * FROM gtid_data ORDER BY file, pos;
SELECT * FROM t1 ORDER BY a;
--die "Not all rows replicated. $master_count on master but $slave_count on slave."
}
--connection master
DROP TABLE gtid_data, rand_data;
--source include/save_master_gtid.inc
--connection slave
--source include/start_slave.inc
--source include/sync_with_master_gtid.inc
--connection master
......@@ -28,7 +28,6 @@ include/show_events.inc
Log_name Pos Event_type Server_id End_log_pos Info
slave-relay-bin.000002 # Rotate # # master-bin.000001;pos=POS
slave-relay-bin.000002 # Format_desc # # SERVER_VERSION, BINLOG_VERSION
slave-relay-bin.000002 # Gtid_list # # []
slave-relay-bin.000002 # Binlog_checkpoint # # master-bin.000001
slave-relay-bin.000002 # Gtid # # GTID #-#-#
slave-relay-bin.000002 # Gtid_list # # [#-#-#]
......
include/master-slave.inc
[connection master]
connection slave;
include/stop_slave.inc
CHANGE MASTER TO master_use_gtid= slave_pos;
include/start_slave.inc
connection master;
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES (0, 0);
*** Test looking up a lot of different event positions and GTIDs.
CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC
BEGIN
DECLARE g VARCHAR(255);
IF a IS NULL OR b IS NULL OR LENGTH(a) != LENGTH(b) THEN
RETURN FALSE;
END IF;
SET a= CONCAT(a, ',');
SET b= CONCAT(',', b, ',');
WHILE LENGTH(a) > 0 DO
SET g= REGEXP_SUBSTR(a, '^[^,]+,');
SET a= SUBSTRING(a, LENGTH(g)+1);
SET b= REPLACE(b, CONCAT(',', g), ',');
END WHILE;
RETURN b = ',';
END //
SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
*** A fair amount of work with default GTID index settings.
*** Testing 200 GTIDs with 50 test connects
connection master;
DELETE FROM t1 WHERE a >= 1000;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor(5*POW(rand(42),2)),
100 + 5*@tmp + floor(5*rand(42))
FROM seq_1_to_200;
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
COUNT(*) SUM(domain_id) SUM(server_id)
201 285 21852
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection master;
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
COUNT(*)
200
*** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
idx gtid_pos BINLOG_GTID_POS(file, pos)
connection slave;
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
*** Now connect the slave to each position in turn, and test that
*** the right amount of data is replicated at each point.
*** Test slave connecting to some GTID positions where the position in
*** the master's binlog is different between the different domains.
*** Revind the slave and test on the same binlog data from the master as before.
connection slave;
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
connection master;
connection slave;
connection master;
DROP TABLE gtid_data, rand_data;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
connection master;
*** A lot of GTIDs with small btree pages to stress the Btree code.
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 1;
*** Testing 1000 GTIDs with 50 test connects
connection master;
DELETE FROM t1 WHERE a >= 1000;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor(10*POW(rand(150),2)),
100 + 5*@tmp + floor(5*rand(150))
FROM seq_1_to_1000;
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
COUNT(*) SUM(domain_id) SUM(server_id)
1001 2881 116394
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection master;
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
COUNT(*)
1000
*** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
idx gtid_pos BINLOG_GTID_POS(file, pos)
connection slave;
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
*** Now connect the slave to each position in turn, and test that
*** the right amount of data is replicated at each point.
*** Test slave connecting to some GTID positions where the position in
*** the master's binlog is different between the different domains.
*** Revind the slave and test on the same binlog data from the master as before.
connection slave;
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
connection master;
connection slave;
connection master;
DROP TABLE gtid_data, rand_data;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
connection master;
*** Small page size with sparse index.
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 2048;
*** Testing 200 GTIDs with 50 test connects
connection master;
DELETE FROM t1 WHERE a >= 1000;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor(10*POW(rand(666),2)),
100 + 5*@tmp + floor(5*rand(666))
FROM seq_1_to_200;
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
COUNT(*) SUM(domain_id) SUM(server_id)
201 599 23410
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection master;
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
COUNT(*)
200
*** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
idx gtid_pos BINLOG_GTID_POS(file, pos)
connection slave;
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
*** Now connect the slave to each position in turn, and test that
*** the right amount of data is replicated at each point.
*** Test slave connecting to some GTID positions where the position in
*** the master's binlog is different between the different domains.
*** Revind the slave and test on the same binlog data from the master as before.
connection slave;
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
connection master;
connection slave;
connection master;
DROP TABLE gtid_data, rand_data;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
connection master;
*** Medium page size.
SET GLOBAL binlog_gtid_index_page_size= 512;
SET GLOBAL binlog_gtid_index_span_min= 512;
*** Testing 200 GTIDs with 50 test connects
connection master;
DELETE FROM t1 WHERE a >= 1000;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor(10*POW(rand(1024),2)),
100 + 5*@tmp + floor(5*rand(1024))
FROM seq_1_to_200;
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
COUNT(*) SUM(domain_id) SUM(server_id)
201 555 23160
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection master;
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
COUNT(*)
200
*** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
idx gtid_pos BINLOG_GTID_POS(file, pos)
connection slave;
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
*** Now connect the slave to each position in turn, and test that
*** the right amount of data is replicated at each point.
*** Test slave connecting to some GTID positions where the position in
*** the master's binlog is different between the different domains.
*** Revind the slave and test on the same binlog data from the master as before.
connection slave;
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
connection master;
connection slave;
connection master;
DROP TABLE gtid_data, rand_data;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
connection master;
*** Large page size.
SET GLOBAL binlog_gtid_index_page_size= 16384;
SET GLOBAL binlog_gtid_index_span_min= 1;
*** Testing 200 GTIDs with 50 test connects
connection master;
DELETE FROM t1 WHERE a >= 1000;
FLUSH NO_WRITE_TO_BINLOG BINARY LOGS;
CREATE TABLE rand_data(idx INT PRIMARY KEY, domain_id INT, server_id INT)
ENGINE=InnoDB;
INSERT INTO rand_data(idx, domain_id, server_id) VALUES (0, 0, 1);
INSERT INTO rand_data(idx, domain_id, server_id)
SELECT seq,
@tmp:=floor(10*POW(rand(12345),2)),
100 + 5*@tmp + floor(5*rand(12345))
FROM seq_1_to_200;
SELECT COUNT(*), SUM(domain_id), SUM(server_id) FROM rand_data;
COUNT(*) SUM(domain_id) SUM(server_id)
201 571 23252
CREATE TABLE gtid_data(
idx INT PRIMARY KEY,
gtid VARCHAR(44),
gtid_pos VARCHAR(255),
file VARCHAR(100),
pos INT,
row_count INT,
KEY(file, pos)) ENGINE=InnoDB;
include/save_master_gtid.inc
connection slave;
include/sync_with_master_gtid.inc
include/stop_slave.inc
connection master;
SET @orig_domain_id= @@gtid_domain_id;
SET @orig_server_id= @@server_id;
SET gtid_domain_id= @orig_domain_id;
SET server_id= @orig_server_id;
SELECT COUNT(*) FROM gtid_data;
COUNT(*)
200
*** The result should be empty, otherwise some result is wrong:
SELECT idx, gtid_pos, BINLOG_GTID_POS(file, pos)
FROM gtid_data
WHERE NOT gtid_eq(CONVERT(gtid_pos USING utf8),BINLOG_GTID_POS(file, pos))
ORDER BY idx;
idx gtid_pos BINLOG_GTID_POS(file, pos)
connection slave;
SET @orig_pos= @@GLOBAL.gtid_slave_pos;
SET @orig_t1_limit= (SELECT MAX(a) FROM t1);
*** Now connect the slave to each position in turn, and test that
*** the right amount of data is replicated at each point.
*** Test slave connecting to some GTID positions where the position in
*** the master's binlog is different between the different domains.
*** Revind the slave and test on the same binlog data from the master as before.
connection slave;
SET sql_log_bin= 0;
TRUNCATE gtid_data;
DELETE FROM t1 WHERE a > @orig_t1_limit;
SET sql_log_bin= 1;
SET GLOBAL gtid_slave_pos= @orig_pos;
connection master;
connection slave;
connection master;
DROP TABLE gtid_data, rand_data;
include/save_master_gtid.inc
connection slave;
include/start_slave.inc
include/sync_with_master_gtid.inc
connection master;
connection master;
SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
DROP TABLE t1;
DROP FUNCTION gtid_eq;
include/rpl_end.inc
......@@ -24,6 +24,7 @@ CHANGE MASTER TO MASTER_USE_GTID=slave_pos;
--echo #
--echo # Initialize test data
--connection master
--source include/wait_for_binlog_checkpoint.inc
create table t1 (a int);
SET @@session.server_id= 3;
create table t2 (a int);
......
--source include/have_sequence.inc
--source include/have_innodb.inc
--source include/master-slave.inc
--source include/have_binlog_format_mixed.inc
--connection slave
--source include/stop_slave.inc
CHANGE MASTER TO master_use_gtid= slave_pos;
--source include/start_slave.inc
--connection master
CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=InnoDB;
INSERT INTO t1 VALUES (0, 0);
--echo *** Test looking up a lot of different event positions and GTIDs.
# A function for comparing GTID positions.
# Handles that the domain_id order is different in the two strings.
# Works by repeatedly removing one GTID from each string. If the strings have
# the same length and nothing is left at the end, then they are identical.
delimiter //;
CREATE FUNCTION gtid_eq(a VARCHAR(255), b VARCHAR(255)) RETURNS BOOLEAN DETERMINISTIC
BEGIN
DECLARE g VARCHAR(255);
IF a IS NULL OR b IS NULL OR LENGTH(a) != LENGTH(b) THEN
RETURN FALSE;
END IF;
SET a= CONCAT(a, ',');
SET b= CONCAT(',', b, ',');
WHILE LENGTH(a) > 0 DO
SET g= REGEXP_SUBSTR(a, '^[^,]+,');
SET a= SUBSTRING(a, LENGTH(g)+1);
SET b= REPLACE(b, CONCAT(',', g), ',');
END WHILE;
RETURN b = ',';
END //
delimiter ;//
SET @old_page_size= @@GLOBAL.binlog_gtid_index_page_size;
SET @old_span_min= @@GLOBAL.binlog_gtid_index_span_min;
--echo *** A fair amount of work with default GTID index settings.
--let $NUM_POS= 200
--let $NUM_DOMAIN= 5
--let $NUM_SERVER= 5
--let $NUM_SLAVE_CONNECTS= 50
--let $RND_SEED= 42
--source suite/rpl/include/rpl_gtid_index.inc
--echo *** A lot of GTIDs with small btree pages to stress the Btree code.
--let $NUM_POS= 1000
--let $NUM_DOMAIN= 10
--let $RND_SEED= 150
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 1;
--source suite/rpl/include/rpl_gtid_index.inc
--echo *** Small page size with sparse index.
--let $NUM_POS= 200
--let $RND_SEED= 666
SET GLOBAL binlog_gtid_index_page_size= 64;
SET GLOBAL binlog_gtid_index_span_min= 2048;
--source suite/rpl/include/rpl_gtid_index.inc
--echo *** Medium page size.
--let $NUM_POS= 200
--let $RND_SEED= 1024
SET GLOBAL binlog_gtid_index_page_size= 512;
SET GLOBAL binlog_gtid_index_span_min= 512;
--source suite/rpl/include/rpl_gtid_index.inc
--echo *** Large page size.
--let $NUM_POS= 200
--let $RND_SEED= 12345
SET GLOBAL binlog_gtid_index_page_size= 16384;
SET GLOBAL binlog_gtid_index_span_min= 1;
--source suite/rpl/include/rpl_gtid_index.inc
# Cleanup.
--connection master
SET GLOBAL binlog_gtid_index_page_size= @old_page_size;
SET GLOBAL binlog_gtid_index_span_min= @old_span_min;
DROP TABLE t1;
DROP FUNCTION gtid_eq;
--source include/rpl_end.inc
......@@ -432,6 +432,36 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST MIXED,STATEMENT,ROW
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Enable the creation of a GTID index for every binlog file, and the use of such index for speeding up GTID lookup in the binlog.
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME BINLOG_GTID_INDEX_PAGE_SIZE
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Page size to use for the binlog GTID index.
NUMERIC_MIN_VALUE 64
NUMERIC_MAX_VALUE 16777216
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX_SPAN_MIN
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written, to reduce the size of the index. Normally does not need tuning.
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 1073741824
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_OPTIMIZE_THREAD_SCHEDULING
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
......
......@@ -452,6 +452,36 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST MIXED,STATEMENT,ROW
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE BOOLEAN
VARIABLE_COMMENT Enable the creation of a GTID index for every binlog file, and the use of such index for speeding up GTID lookup in the binlog.
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST OFF,ON
READ_ONLY NO
COMMAND_LINE_ARGUMENT OPTIONAL
VARIABLE_NAME BINLOG_GTID_INDEX_PAGE_SIZE
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Page size to use for the binlog GTID index.
NUMERIC_MIN_VALUE 64
NUMERIC_MAX_VALUE 16777216
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_GTID_INDEX_SPAN_MIN
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Control sparseness of the binlog GTID index. If set to N, at most one index record will be added for every N bytes of binlog file written, to reduce the size of the index. Normally does not need tuning.
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 1073741824
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME BINLOG_IGNORE_DB
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
......
......@@ -165,7 +165,7 @@ SET (SQL_SOURCE
gcalc_slicescan.cc gcalc_tools.cc
my_apc.cc mf_iocache_encr.cc item_jsonfunc.cc
my_json_writer.cc json_schema.cc json_schema_helper.cc
rpl_gtid.cc rpl_parallel.cc
rpl_gtid.cc gtid_index.cc rpl_parallel.cc
semisync.cc semisync_master.cc semisync_slave.cc
semisync_master_ack_receiver.cc
sp_instr.cc
......
/*
Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#include "gtid_index.h"
#include "sql_const.h"
#include "log.h"
static const uchar GTID_INDEX_MAGIC[4]= {
254, 254, 12, 1,
};
Gtid_index_writer *Gtid_index_writer::hot_index_list= nullptr;
/* gtid_index_mutex is inited in MYSQL_LOG::init_pthread_objects(). */
mysql_mutex_t Gtid_index_writer::gtid_index_mutex;
Gtid_index_writer::Gtid_index_writer(const char *filename, uint32 offset,
rpl_binlog_state_base *binlog_state,
uint32 opt_page_size,
my_off_t opt_span_min)
: offset_min_threshold(opt_span_min),
nodes(nullptr), previous_offset(0),
max_level(0), index_file(-1),
error_state(false), file_header_written(false), in_hot_index_list(false)
{
uint32 count;
rpl_gtid *gtid_list;
page_size= opt_page_size;
pending_state.init();
if (alloc_level_if_missing(0))
{
give_error("Out of memory allocating node list");
return;
}
/*
Lock the index mutex at this point just before we create the new index
file on disk. From this point on, and until the index is fully written,
the reader will find us in the "hot index" list and will be able to read
from the index while it's still being constructed.
*/
lock_gtid_index();
build_index_filename(filename);
int create_flags= O_WRONLY|O_TRUNC|O_BINARY|O_EXCL;
index_file= mysql_file_create(key_file_gtid_index, index_file_name,
CREATE_MODE, create_flags, MYF(0));
if (index_file < 0 && my_errno == EEXIST)
{
/*
It shouldn't happen that an old GTID index file remains, as we remove
them as part of RESET MASTER and PURGE BINARY LOGS. But if it happens
due to some external file copy of the user or something, delete any old
GTID index file first.
*/
sql_print_information("Old GTID index file found '%s', deleting",
index_file_name);
my_errno= 0;
mysql_file_delete(key_file_gtid_index, index_file_name, MYF(0));
index_file= mysql_file_create(key_file_gtid_index, index_file_name,
CREATE_MODE, create_flags, MYF(0));
}
if (index_file < 0)
{
give_error("Failed to open new index file for writing");
goto err;
}
/*
Write out an initial index record, i.e. corresponding to the GTID_LIST
event / binlog state at the start of the binlog file.
*/
count= binlog_state->count_nolock();
gtid_list= gtid_list_buffer(count);
if (count > 0)
{
if (!gtid_list)
goto err;
binlog_state->get_gtid_list_nolock(gtid_list, count);
}
write_record(offset, gtid_list, count);
insert_in_hot_index();
err:
unlock_gtid_index();
}
Gtid_index_writer::~Gtid_index_writer()
{
if (in_hot_index_list)
{
lock_gtid_index();
close();
unlock_gtid_index();
}
if (index_file > 0)
{
/*
Should have been closed by call to Gtid_index_writer::close().
We can at least avoid leaking file descriptor.
*/
mysql_file_close(index_file, MYF(0));
}
if (nodes)
{
for (uint32 i= 0; i <= max_level; ++i)
delete nodes[i];
my_free(nodes);
}
/*
state.free() is not needed here, will be called from rpl_binlog_state_base
destructor.
*/
}
void
Gtid_index_writer::gtid_index_init()
{
mysql_mutex_init(key_gtid_index_lock, &gtid_index_mutex, MY_MUTEX_INIT_SLOW);
}
void
Gtid_index_writer::gtid_index_cleanup()
{
mysql_mutex_destroy(&gtid_index_mutex);
}
const Gtid_index_writer *
Gtid_index_writer::find_hot_index(const char *file_name)
{
mysql_mutex_assert_owner(&gtid_index_mutex);
for (const Gtid_index_writer *p= hot_index_list; p; p= p->next_hot_index)
{
if (0 == strcmp(file_name, p->index_file_name))
return p;
}
return nullptr;
}
void
Gtid_index_writer::insert_in_hot_index()
{
mysql_mutex_assert_owner(&gtid_index_mutex);
next_hot_index= hot_index_list;
hot_index_list= this;
in_hot_index_list= true;
}
void
Gtid_index_writer::remove_from_hot_index()
{
mysql_mutex_assert_owner(&gtid_index_mutex);
Gtid_index_writer **next_ptr_ptr= &hot_index_list;
for (;;)
{
Gtid_index_writer *p= *next_ptr_ptr;
if (!p)
break;
if (p == this)
{
*next_ptr_ptr= p->next_hot_index;
break;
}
next_ptr_ptr= &p->next_hot_index;
}
next_hot_index= nullptr;
in_hot_index_list= false;
}
void
Gtid_index_writer::process_gtid(uint32 offset, const rpl_gtid *gtid)
{
rpl_gtid *gtid_list;
uint32 gtid_count;
if (process_gtid_check_batch(offset, gtid, &gtid_list, &gtid_count))
return; // Error
if (gtid_list)
async_update(offset, gtid_list, gtid_count);
}
int
Gtid_index_writer::process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
rpl_gtid **out_gtid_list,
uint32 *out_gtid_count)
{
uint32 count;
rpl_gtid *gtid_list;
mysql_mutex_assert_not_owner(&gtid_index_mutex);
if (unlikely(pending_state.update_nolock(gtid)))
{
give_error("Out of memory processing GTID for binlog GTID index");
return 1;
}
/*
Sparse index; we record only selected GTIDs, and scan the binlog forward
from there to find the exact spot.
*/
if (offset - previous_offset < offset_min_threshold)
{
*out_gtid_list= nullptr;
*out_gtid_count= 0;
return 0;
}
count= pending_state.count_nolock();
DBUG_ASSERT(count > 0 /* Since we just updated with a GTID. */);
gtid_list= (rpl_gtid *)
my_malloc(key_memory_binlog_gtid_index, count*sizeof(*gtid_list), MYF(0));
if (unlikely(!gtid_list))
{
give_error("Out of memory allocating GTID list for binlog GTID index");
return 1;
}
if (unlikely(pending_state.get_gtid_list_nolock(gtid_list, count)))
{
/* Shouldn't happen as we allocated the list with the correct length. */
DBUG_ASSERT(false);
give_error("Internal error allocating GTID list for binlog GTID index");
my_free(gtid_list);
return 1;
}
pending_state.reset_nolock();
previous_offset= offset;
*out_gtid_list= gtid_list;
*out_gtid_count= count;
return 0;
}
int
Gtid_index_writer::async_update(uint32 event_offset,
rpl_gtid *gtid_list,
uint32 gtid_count)
{
lock_gtid_index();
int res= write_record(event_offset, gtid_list, gtid_count);
unlock_gtid_index();
my_free(gtid_list);
return res;
}
void
Gtid_index_writer::close()
{
lock_gtid_index();
if (!error_state)
{
/*
Write out the remaining pending pages, and insert the final child pointer
in interior nodes.
*/
for (uint32 level= 0; ; ++level)
{
uint32 node_ptr= write_current_node(level, level==max_level);
nodes[level]->reset();
if (!node_ptr || level >= max_level)
break;
add_child_ptr(level+1, node_ptr);
}
}
remove_from_hot_index();
unlock_gtid_index();
if (!error_state)
{
if (mysql_file_sync(index_file, MYF(0)))
give_error("Error syncing index file to disk");
}
mysql_file_close(index_file, MYF(0));
index_file= (File)-1;
}
Gtid_index_base::Index_node_base::Index_node_base()
: first_page(nullptr), current_page(nullptr), current_ptr(nullptr)
{
}
Gtid_index_base::Index_node_base::~Index_node_base()
{
free_pages();
}
void
Gtid_index_base::Index_node_base::free_pages()
{
for (Node_page *p= first_page; p; )
{
Node_page *q= p->next;
my_free(p);
p= q;
}
}
void
Gtid_index_base::Index_node_base::reset()
{
free_pages();
first_page= current_page= nullptr;
}
Gtid_index_base::Gtid_index_base()
: gtid_buffer(nullptr), gtid_buffer_alloc(0)
{
}
Gtid_index_base::~Gtid_index_base()
{
if (gtid_buffer_alloc > 0)
my_free(gtid_buffer);
}
void
Gtid_index_base::make_gtid_index_file_name(char *out_name, size_t bufsize,
const char *base_filename)
{
char *p= strmake(out_name, base_filename, bufsize-1);
size_t remain= bufsize - (p - out_name);
strmake(p, ".idx", remain-1);
}
void
Gtid_index_base::build_index_filename(const char *filename)
{
make_gtid_index_file_name(index_file_name, sizeof(index_file_name), filename);
}
rpl_gtid *
Gtid_index_base::gtid_list_buffer(uint32 count)
{
if (gtid_buffer_alloc >= count)
return gtid_buffer;
rpl_gtid *new_buffer= (rpl_gtid *)
my_malloc(key_memory_binlog_gtid_index, count*sizeof(*new_buffer), MYF(0));
if (!new_buffer)
{
give_error("Out of memory allocating buffer for GTID list");
return NULL;
}
my_free(gtid_buffer);
gtid_buffer= new_buffer;
gtid_buffer_alloc= count;
return new_buffer;
}
Gtid_index_writer::Index_node::Index_node(uint32 level_)
: num_records(0), level(level_), force_spill_page(false)
{
state.init();
}
Gtid_index_writer::Index_node::~Index_node()
{
free_pages();
}
uint32
Gtid_index_writer::write_current_node(uint32 level, bool is_root)
{
Index_node *n= nodes[level];
uint32 node_pos= (uint32)mysql_file_tell(index_file, MYF(0));
for (Node_page *p= n->first_page; p ; p= p->next)
{
if (unlikely(is_root))
*(p->flag_ptr) |= PAGE_FLAG_ROOT;
if (likely(!p->next))
*(p->flag_ptr) |= PAGE_FLAG_LAST;
int4store(p->page + page_size - CHECKSUM_LEN,
my_checksum(0, p->page, page_size - CHECKSUM_LEN));
if (mysql_file_write(index_file, p->page, page_size, MYF(MY_NABP)))
{
give_error("Error writing index page");
return 0;
}
}
DBUG_ASSERT(node_pos % page_size == 0);
/* Page numbers are +1 just so that zero can denote invalid page pointer. */
return 1 + (node_pos / (uint32)page_size);
}
void
Gtid_index_writer::Index_node::reset()
{
Index_node_base::reset();
state.reset_nolock();
num_records= 0;
force_spill_page= false;
}
/*
Make sure there is requested space in the current page, by allocating a
new spill page if necessary.
*/
int
Gtid_index_writer::reserve_space(Index_node *n, size_t bytes)
{
DBUG_ASSERT(bytes <= page_size);
if (likely(n->current_page) &&
likely(n->current_ptr - n->current_page->page + bytes <=
(page_size - CHECKSUM_LEN)))
return 0;
/* Not enough room, allocate a spill page. */
Node_page *page= alloc_page();
n->force_spill_page= false;
if (!page)
return 1;
n->current_ptr=
init_header(page, n->level==0, !n->current_page);
if (n->current_page)
n->current_page->next= page;
else
n->first_page= page;
n->current_page= page;
return 0;
}
int
Gtid_index_writer::do_write_record(uint32 level,
uint32 event_offset,
const rpl_gtid *gtid_list,
uint32 gtid_count)
{
DBUG_ASSERT(level <= max_level);
Index_node *n= nodes[level];
if (reserve_space(n, 8))
return 1;
/* Store the count as +1, so that 0 can mean "no more records". */
int4store(n->current_ptr, gtid_count+1);
int4store(n->current_ptr+4, event_offset);
n->current_ptr+= 8;
for (uint32 i= 0; i < gtid_count; ++i)
{
if (reserve_space(n, 16))
return 1;
int4store(n->current_ptr, gtid_list[i].domain_id);
int4store(n->current_ptr+4, gtid_list[i].server_id);
int8store(n->current_ptr+8, gtid_list[i].seq_no);
n->current_ptr+= 16;
}
++n->num_records;
return 0;
}
/*
Add a child pointer to the current node on LEVEL.
The first page has node_ptr=1 just so that a zero node_ptr can be used as
a no/invalid value (effectively node_ptr points to the end of the target
page, in unit of pages).
Adding a child pointer shouldn't spill to a new page, code must make sure that
there is always room for the final child pointer in current non-leaf node.
*/
int
Gtid_index_writer::add_child_ptr(uint32 level, my_off_t node_offset)
{
DBUG_ASSERT(level <= max_level);
DBUG_ASSERT(node_offset > 0);
Index_node *n= nodes[level];
if (reserve_space(n, 4))
return 1;
DBUG_ASSERT(n->current_page);
DBUG_ASSERT((size_t)(n->current_ptr - n->current_page->page + 4) <=
page_size - CHECKSUM_LEN);
int4store(n->current_ptr, node_offset);
n->current_ptr+= 4;
return 0;
}
/*
Write one index record to the GTID index, flushing nodes and allocating
new nodes as necessary.
*/
int
Gtid_index_writer::write_record(uint32 event_offset,
const rpl_gtid *gtid_list,
uint32 gtid_count)
{
if (error_state)
return 1; /* Avoid continuing on a possibly corrupt state. */
uint32 level= 0;
/*
The most frequent case is when there is room in the current page for the
current position to be written, in which case we exit early in the first
iteration of the following loop.
In the general case, we move up through the path to the root, writing
lower-level node page to disk and adding child pointers in higher-level
nodes, until we reach a node that has room. This final node may be a
freshly allocated new root node in the few times when the height of the
tree increases.
*/
for (;;)
{
Index_node *n= nodes[level];
if (update_gtid_state(&n->state, gtid_list, gtid_count))
return give_error("Out of memory updating the local GTID state");
if (check_room(level, gtid_count))
{
/* There is room in the node, just add the index record. */
return do_write_record(level, event_offset, gtid_list, gtid_count);
}
/*
This node is full:
- First, write out this node to disk.
- Add a child pointer in the parent node (allocating one if needed).
- On level 0, allocate a new leaf node and add the index record there.
- On levels >0, skip the last index record when the node gets full
(B+-Tree has (k-1) keys for k child pointers).
- Loop to the parent node to add an index record there.
*/
uint32 node_ptr= write_current_node(level, false);
if (!node_ptr)
return 1;
if (alloc_level_if_missing(level+1) ||
add_child_ptr(level+1, node_ptr))
return 1;
uint32 new_count= n->state.count_nolock();
rpl_gtid *new_gtid_list= gtid_list_buffer(new_count);
if (new_count > 0 && !new_gtid_list)
return 1;
if (n->state.get_gtid_list_nolock(new_gtid_list, new_count))
return give_error("Internal error processing GTID state");
n->reset();
if (level == 0)
{
if (do_write_record(level, event_offset, new_gtid_list, new_count))
return 1;
}
else
{
/*
Allocate a page for the node. This is mostly to help the reader of hot
index to not see NULL pointers, and we will need the page later anyway
to put at least one child pointer to the level below.
*/
if (reserve_space(n, 4))
return 1;
}
gtid_list= new_gtid_list;
gtid_count= new_count;
++level;
}
// NotReached.
}
bool
Gtid_index_writer::check_room(uint32 level, uint32 gtid_count)
{
Index_node *n= nodes[level];
/* There's always room in an empty (to-be-allocated) page. */
if (!n->current_page || n->num_records == 0)
return true;
/*
Make sure we use at least 1/2 a page of room after the initial record,
setting a flag to allocate a spill page later if needed.
*/
size_t avail= page_size - CHECKSUM_LEN - (n->current_ptr - n->current_page->page);
if (n->num_records==1 && avail < page_size/2)
{
n->force_spill_page= true;
return true;
}
if (n->force_spill_page)
return true;
size_t needed= 8 + 16*gtid_count;
/* Non-leaf pages need extra 4 bytes for a child pointer. */
if (level > 0)
needed+= 4;
return needed <= avail;
}
int
Gtid_index_writer::alloc_level_if_missing(uint32 level)
{
if (likely(nodes))
{
if (likely(max_level >= level))
return 0;
DBUG_ASSERT(level == max_level+1); // Alloc one at a time
}
Index_node *node= new Index_node(level);
if (!node)
return give_error("Out of memory allocating new node");
Index_node **new_nodes= (Index_node **)
my_realloc(key_memory_binlog_gtid_index, nodes, (level+1)*sizeof(*nodes),
MYF(MY_ALLOW_ZERO_PTR|MY_ZEROFILL));
if (!new_nodes)
{
delete node;
return give_error("Out of memory allocating larger node list");
}
new_nodes[level]= node;
nodes= new_nodes;
max_level= level;
return 0;
}
/*
Initialize the start of a data page.
This is at the start of a page, except for the very first page where it
comes after the global file header.
Format:
0 flags.
1-3 unused padding/reserved.
The argument FIRST denotes if this is the first page (if false it is a
continuation page).
*/
uchar *
Gtid_index_writer::init_header(Node_page *page, bool is_leaf, bool is_first)
{
uchar *p= page->page;
bool is_file_header= !file_header_written;
if (unlikely(is_file_header))
{
memcpy(p, GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC));
p+= sizeof(GTID_INDEX_MAGIC);
*p++= GTID_INDEX_VERSION_MAJOR;
*p++= GTID_INDEX_VERSION_MINOR;
/* Flags/padding currently unused. */
*p++= 0;
*p++= 0;
int4store(p, page_size);
p+= 4;
DBUG_ASSERT(p == page->page + GTID_INDEX_FILE_HEADER_SIZE);
file_header_written= true;
}
uchar flags= 0;
if (is_leaf)
flags|= PAGE_FLAG_IS_LEAF;
if (unlikely(!is_first))
flags|= PAGE_FLAG_IS_CONT;
page->flag_ptr= p;
*p++= flags;
/* Padding/reserved. */
p+= 3;
DBUG_ASSERT(p == page->page +
(is_file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0) +
GTID_INDEX_PAGE_HEADER_SIZE);
DBUG_ASSERT((size_t)(p - page->page) < page_size - CHECKSUM_LEN);
return p;
}
int
Gtid_index_base::update_gtid_state(rpl_binlog_state_base *state,
const rpl_gtid *gtid_list, uint32 gtid_count)
{
for (uint32 i= 0; i < gtid_count; ++i)
if (state->update_nolock(&gtid_list[i]))
return 1;
return 0;
}
Gtid_index_base::Node_page *Gtid_index_base::alloc_page()
{
Node_page *new_node= (Node_page *)
my_malloc(key_memory_binlog_gtid_index,
sizeof(Node_page) + page_size,
MYF(MY_ZEROFILL));
if (!new_node)
give_error("Out of memory for allocating index page");
return new_node;
}
int Gtid_index_writer::give_error(const char *msg)
{
if (!error_state)
{
sql_print_information("Error during binlog GTID index creation, will "
"fallback to slower sequential binlog scan. "
"Error is: %s", msg);
error_state= true;
}
return 1;
}
Gtid_index_reader::Gtid_index_reader()
: n(nullptr), index_file(-1),
file_open(false), index_valid(false), has_root_node(false),
version_major(0), version_minor(0)
{
current_state.init();
compare_state.init();
}
Gtid_index_reader::~Gtid_index_reader()
{
if (file_open)
mysql_file_close(index_file, MYF(0));
}
int
Gtid_index_reader::search_offset(uint32 in_offset,
uint32 *out_offset, uint32 *out_gtid_count)
{
in_search_offset= in_offset;
search_cmp_function= &Gtid_index_reader::search_cmp_offset;
return do_index_search(out_offset, out_gtid_count);
}
int
Gtid_index_reader::search_gtid_pos(slave_connection_state *in_gtid_pos,
uint32 *out_offset, uint32 *out_gtid_count)
{
in_search_gtid_pos= in_gtid_pos;
search_cmp_function= &Gtid_index_reader::search_cmp_gtid_pos;
int res= do_index_search(out_offset, out_gtid_count);
/* Let's not leave a dangling pointer to the caller's memory. */
in_search_gtid_pos= nullptr;
return res;
}
rpl_gtid *
Gtid_index_reader::search_gtid_list()
{
return gtid_buffer;
}
int
Gtid_index_reader::search_cmp_offset(uint32 offset,
rpl_binlog_state_base *state)
{
if (offset <= in_search_offset)
return 0;
else
return -1;
}
int
Gtid_index_reader::search_cmp_gtid_pos(uint32 offset,
rpl_binlog_state_base *state)
{
if (state->is_before_pos(in_search_gtid_pos))
return 0;
else
return -1;
}
int
Gtid_index_reader::next_page()
{
if (!read_page->next)
return 1;
read_page= read_page->next;
read_ptr= read_page->flag_ptr + 4;
return 0;
}
int
Gtid_index_reader::find_bytes(uint32 num_bytes)
{
if ((my_ptrdiff_t)(read_ptr - read_page->page + num_bytes) <=
(my_ptrdiff_t)(page_size - CHECKSUM_LEN))
return 0;
return next_page();
}
int
Gtid_index_reader::get_child_ptr(uint32 *out_child_ptr)
{
if (find_bytes(4))
return give_error("Corrupt index, short index node");
*out_child_ptr= (uint32)uint4korr(read_ptr);
read_ptr+= 4;
return 0;
}
/*
Read the start of an index record (count of GTIDs in the differential state
and offset).
Returns:
0 ok
1 EOF, no more data in this node
*/
int
Gtid_index_reader::get_offset_count(uint32 *out_offset, uint32 *out_gtid_count)
{
if (find_bytes(8))
return 1;
uint32 gtid_count= uint4korr(read_ptr);
if (gtid_count == 0)
{
/* 0 means invalid/no record (we store N+1 for N GTIDs in record). */
return 1;
}
*out_gtid_count= gtid_count - 1;
*out_offset= uint4korr(read_ptr + 4);
read_ptr+= 8;
return 0;
}
int
Gtid_index_reader::get_gtid_list(rpl_gtid *out_gtid_list, uint32 count)
{
for (uint32 i= 0; i < count; ++i)
{
if (find_bytes(16))
return give_error("Corrupt index, short index node");
out_gtid_list[i].domain_id= uint4korr(read_ptr);
out_gtid_list[i].server_id= uint4korr(read_ptr + 4);
out_gtid_list[i].seq_no= uint8korr(read_ptr + 8);
read_ptr+= 16;
}
return 0;
}
int
Gtid_index_reader::open_index_file(const char *binlog_filename)
{
close_index_file();
build_index_filename(binlog_filename);
if ((index_file= mysql_file_open(key_file_gtid_index, index_file_name,
O_RDONLY|O_BINARY, MYF(0))) < 0)
return 1; // No error for missing index (eg. upgrade)
file_open= true;
if (read_file_header())
return 1;
return 0;
}
void
Gtid_index_reader::close_index_file()
{
if (!file_open)
return;
mysql_file_close(index_file, MYF(0));
file_open= false;
index_valid= false;
}
int
Gtid_index_reader::do_index_search(uint32 *out_offset, uint32 *out_gtid_count)
{
/* In cold index, we require a complete index with a valid root node. */
if (!has_root_node)
return -1;
return do_index_search_root(out_offset, out_gtid_count);
}
int
Gtid_index_reader::do_index_search_root(uint32 *out_offset,
uint32 *out_gtid_count)
{
current_state.reset_nolock();
compare_state.reset_nolock();
/*
These states will be initialized to the full state stored at the start of
the root node and then incrementally updated.
*/
bool current_state_updated= false;
if (read_root_node())
return -1;
for (;;)
{
if (*n->first_page->flag_ptr & PAGE_FLAG_IS_LEAF)
break;
if (compare_state.load_nolock(&current_state))
{
give_error("Out of memory allocating GTID list");
return -1;
}
uint32 child_ptr;
if (get_child_ptr(&child_ptr))
return -1;
/* Scan over the keys in the node to find the child pointer to follow */
for (;;)
{
uint32 offset, gtid_count;
int res= get_offset_count(&offset, &gtid_count);
if (res == 1) // EOF?
{
/* Follow the right-most child pointer. */
if (read_node(child_ptr))
return -1;
break;
}
rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
uint32 child2_ptr;
if ((gtid_count > 0 && !gtid_list) ||
get_gtid_list(gtid_list, gtid_count) ||
get_child_ptr(&child2_ptr))
return -1;
if (update_gtid_state(&compare_state, gtid_list, gtid_count))
return -1;
int cmp= (this->*search_cmp_function)(offset, &compare_state);
if (cmp < 0)
{
/* Follow the left child of this key. */
if (read_node(child_ptr))
return -1;
break;
}
/* Continue to scan the next key. */
update_gtid_state(&current_state, gtid_list, gtid_count);
current_state_updated= true;
current_offset= offset;
child_ptr= child2_ptr;
}
}
return do_index_search_leaf(current_state_updated,
out_offset, out_gtid_count);
}
int Gtid_index_reader::do_index_search_leaf(bool current_state_updated,
uint32 *out_offset,
uint32 *out_gtid_count)
{
uint32 offset, gtid_count;
int res= get_offset_count(&offset, &gtid_count);
if (res == 1)
{
DBUG_ASSERT(0);
give_error("Corrupt index; empty leaf node");
return -1;
}
rpl_gtid *gtid_list= gtid_list_buffer(gtid_count);
if ((gtid_count > 0 && !gtid_list) ||
get_gtid_list(gtid_list, gtid_count))
return -1;
/*
The first key is ignored (already included in the current state), unless
it is the very first state in the index.
*/
if (!current_state_updated)
update_gtid_state(&current_state, gtid_list, gtid_count);
current_offset= offset;
if (compare_state.load_nolock(&current_state))
{
give_error("Out of memory allocating GTID state");
return -1;
}
int cmp= (this->*search_cmp_function)(offset, &compare_state);
if (cmp < 0)
return 0; // Search position is before start of index.
/* Scan over the keys in the leaf node. */
for (;;)
{
uint32 offset, gtid_count;
int res= get_offset_count(&offset, &gtid_count);
if (res == 1) // EOF?
{
/* Reached end of leaf, last key is the one searched for. */
break;
}
gtid_list= gtid_list_buffer(gtid_count);
if ((gtid_count > 0 && !gtid_list) ||
get_gtid_list(gtid_list, gtid_count))
return -1;
if (update_gtid_state(&compare_state, gtid_list, gtid_count))
return -1;
cmp= (this->*search_cmp_function)(offset, &compare_state);
if (cmp < 0)
{
/* Next key is larger, so current state is the one searched for. */
break;
}
update_gtid_state(&current_state, gtid_list, gtid_count);
current_offset= offset;
}
*out_offset= current_offset;
*out_gtid_count= current_state.count_nolock();
/* Save the result in the shared gtid list buffer. */
if ((!(gtid_list= gtid_list_buffer(*out_gtid_count)) && *out_gtid_count > 0) ||
current_state.get_gtid_list_nolock(gtid_list, *out_gtid_count))
return -1;
return 1;
}
/*
Read the file header and check that it's valid and that the format is not
too new a version for us to be able to read it.
*/
int
Gtid_index_reader::read_file_header()
{
if (!file_open)
return 1;
uchar buf[GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE];
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, 0, MY_SEEK_SET, MYF(0)) ||
mysql_file_read(index_file, buf,
GTID_INDEX_FILE_HEADER_SIZE + GTID_INDEX_PAGE_HEADER_SIZE,
MYF(MY_NABP)))
return give_error("Error reading page from index file");
if (memcmp(&buf[0], GTID_INDEX_MAGIC, sizeof(GTID_INDEX_MAGIC)))
return give_error("Corrupt index file, magic not found in header");
version_major= buf[4];
version_minor= buf[5];
/* We cannot safely read a major version we don't know about. */
if (version_major > GTID_INDEX_VERSION_MAJOR)
return give_error("Incompatible index file, version too high");
page_size= uint4korr(&buf[8]);
/* Verify checksum integrity of page_size and major/minor version. */
uint32 crc= my_checksum(0, buf, sizeof(buf));
uchar *buf3= (uchar *)
my_malloc(key_memory_binlog_gtid_index, page_size - sizeof(buf), MYF(0));
if (!buf3)
return give_error("Error allocating memory for index page");
int res= 0;
if (mysql_file_read(index_file, buf3, page_size - sizeof(buf), MYF(MY_NABP)))
res= give_error("Error reading page from index file");
else
{
crc= my_checksum(crc, buf3, page_size - sizeof(buf) - CHECKSUM_LEN);
if (crc != uint4korr(buf3 + page_size - sizeof(buf) - CHECKSUM_LEN))
res= give_error("Corrupt page, invalid checksum");
}
my_free(buf3);
if (res)
return res;
/*
Check that there is a valid root node at the end of the file.
If there is not, the index may be a "hot index" that is currently being
constructed. Or it was only partially written before server crash and not
recovered for some reason.
*/
uchar flags= buf[GTID_INDEX_PAGE_HEADER_SIZE];
constexpr uchar needed_flags= PAGE_FLAG_ROOT|PAGE_FLAG_LAST;
if ((flags & needed_flags) == needed_flags)
{
/* Special case: the index is a single page, which is the root node. */
has_root_node= true;
}
else
{
uchar buf2[GTID_INDEX_PAGE_HEADER_SIZE];
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
MY_SEEK_END, MYF(0)) ||
mysql_file_read(index_file, buf2, GTID_INDEX_PAGE_HEADER_SIZE,
MYF(MY_NABP)))
return give_error("Error reading root page from index file");
flags= buf2[0];
has_root_node= ((flags & needed_flags) == needed_flags);
/* No need to verify checksum here, will be done by read_root_node(). */
}
index_valid= true;
return 0;
}
int
Gtid_index_reader::verify_checksum(Gtid_index_base::Node_page *page)
{
uint32 calc_checksum= my_checksum(0, page->page, page_size - CHECKSUM_LEN);
uint32 read_checksum= uint4korr(page->page + page_size - CHECKSUM_LEN);
if (calc_checksum != read_checksum)
return give_error("Corrupt page, invalid checksum");
return 0;
}
Gtid_index_base::Node_page *
Gtid_index_reader::alloc_and_read_page()
{
Node_page *page= alloc_page();
if (!page)
{
give_error("Error allocating memory for index page");
return nullptr;
}
if (mysql_file_read(index_file, page->page, page_size, MYF(MY_NABP)))
{
my_free(page);
give_error("Error reading page from index file");
return nullptr;
}
if (verify_checksum(page))
{
my_free(page);
return nullptr;
}
return page;
}
int
Gtid_index_reader::read_root_node()
{
if (!index_valid || !has_root_node)
return 1;
cold_node.reset();
n= &cold_node;
/*
Read pages one by one from the back of the file until we have a complete
root node.
*/
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)page_size,
MY_SEEK_END, MYF(0)))
return give_error("Error seeking index file");
for (;;)
{
Node_page *page= alloc_and_read_page();
if (!page)
return 1;
if (mysql_file_tell(index_file, MYF(0)) == page_size)
page->flag_ptr= &page->page[GTID_INDEX_FILE_HEADER_SIZE];
else
page->flag_ptr= &page->page[0];
page->next= n->first_page;
n->first_page= page;
uchar flags= *page->flag_ptr;
if (unlikely(!(flags & PAGE_FLAG_ROOT)))
return give_error("Corrupt or truncated index, no root node found");
if (!(flags & PAGE_FLAG_IS_CONT))
break; // Found start of root node
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, -(int32)(2*page_size),
MY_SEEK_CUR, MYF(0)))
return give_error("Error seeking index file for multi-page root node");
}
read_page= n->first_page;
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
return 0;
}
int
Gtid_index_reader::read_node(uint32 page_ptr)
{
DBUG_ASSERT(page_ptr != 0 /* No zero child pointers in on-disk pages. */);
if (!index_valid || !page_ptr)
return 1;
return read_node_cold(page_ptr);
}
int
Gtid_index_reader::read_node_cold(uint32 page_ptr)
{
if (MY_FILEPOS_ERROR == mysql_file_seek(index_file, (page_ptr-1)*page_size,
MY_SEEK_SET, MYF(0)))
return give_error("Error seeking index file");
bool file_header= (page_ptr == 1);
cold_node.reset();
n= &cold_node;
Node_page **next_ptr_ptr= &n->first_page;
for (;;)
{
Node_page *page= alloc_and_read_page();
if (!page)
return 1;
page->flag_ptr= &page->page[file_header ? GTID_INDEX_FILE_HEADER_SIZE : 0];
file_header= false;
/* Insert the page at the end of the list. */
page->next= nullptr;
*next_ptr_ptr= page;
next_ptr_ptr= &page->next;
uchar flags= *(page->flag_ptr);
if (flags & PAGE_FLAG_LAST)
break;
}
read_page= n->first_page;
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
return 0;
}
int Gtid_index_reader::give_error(const char *msg)
{
sql_print_information("Error reading binlog GTID index, will "
"fallback to slower sequential binlog scan. "
"Error is: %s", msg);
return 1;
}
Gtid_index_reader_hot::Gtid_index_reader_hot()
: hot_writer(nullptr)
{
}
int
Gtid_index_reader_hot::get_child_ptr(uint32 *out_child_ptr)
{
if (find_bytes(4))
{
/*
If reading hot index, EOF or zero child ptr means the child pointer has
not yet been written. A zero out_child_ptr makes read_node() read the
hot node for the child.
*/
if (hot_writer)
{
*out_child_ptr= 0;
return 0;
}
return give_error("Corrupt index, short index node");
}
*out_child_ptr= (uint32)uint4korr(read_ptr);
read_ptr+= 4;
return 0;
}
int
Gtid_index_reader_hot::do_index_search(uint32 *out_offset,
uint32 *out_gtid_count)
{
/* Check for a "hot" index. */
Gtid_index_writer::lock_gtid_index();
hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
if (!hot_writer)
{
Gtid_index_writer::unlock_gtid_index();
/*
Check the index file header (and index end) again, in case it was
hot when open_index_file() was called, but became cold in the meantime.
*/
if (!has_root_node && Gtid_index_reader::read_file_header())
return -1;
}
int res= do_index_search_root(out_offset, out_gtid_count);
if (hot_writer)
{
hot_writer= nullptr;
Gtid_index_writer::unlock_gtid_index();
}
return res;
}
int
Gtid_index_reader_hot::read_file_header()
{
if (!file_open)
return 1;
Gtid_index_writer::lock_gtid_index();
hot_writer= Gtid_index_writer::find_hot_index(index_file_name);
if (!hot_writer)
Gtid_index_writer::unlock_gtid_index();
int res;
if (hot_writer && hot_writer->max_level == 0)
{
/*
No pages from the hot index have been written to disk, there's just a
single incomplete node at level 0.
We have to read the file header from the in-memory page.
*/
uchar *p= hot_writer->nodes[0]->first_page->page;
page_size= uint4korr(p + 8);
has_root_node= false;
index_valid= true;
res= 0;
}
else
res= Gtid_index_reader::read_file_header();
if (hot_writer)
{
hot_writer= nullptr;
Gtid_index_writer::unlock_gtid_index();
}
return res;
}
int
Gtid_index_reader_hot::read_root_node()
{
if (!index_valid)
return 1;
if (hot_writer)
{
hot_level= hot_writer->max_level;
return read_node_hot();
}
if (has_root_node)
{
return Gtid_index_reader::read_root_node();
}
return 1;
}
int
Gtid_index_reader_hot::read_node(uint32 page_ptr)
{
if (!index_valid || (!page_ptr && !hot_writer))
return 1;
if (hot_writer)
{
if (!page_ptr)
{
/*
The "hot" index is only partially written. Not yet written child pages
are indicated by zero child pointers. Such child pages are found from
the list of active nodes in the writer.
*/
if (hot_level <= 0)
{
DBUG_ASSERT(0 /* Should be no child pointer to follow on leaf page. */);
return give_error("Corrupt hot index (child pointer on leaf page");
}
DBUG_ASSERT(n == hot_writer->nodes[hot_level]);
--hot_level;
return read_node_hot();
}
/*
We started searching the "hot" index, but now we've reached a "cold"
part of the index that's already fully written. So leave the "hot index"
mode and continue reading pages from the on-disk index from here.
*/
hot_writer= nullptr;
Gtid_index_writer::unlock_gtid_index();
}
return read_node_cold(page_ptr);
}
int
Gtid_index_reader_hot::read_node_hot()
{
if (hot_writer->error_state)
return give_error("Cannot access hot index");
n= hot_writer->nodes[hot_level];
read_page= n->first_page;
/* The writer should allocate pages for all nodes. */
DBUG_ASSERT(read_page != nullptr);
if (!read_page)
return give_error("Page not available in hot index");
read_ptr= read_page->flag_ptr + GTID_INDEX_PAGE_HEADER_SIZE;
return 0;
}
/*
Copyright (c) 2023 Kristian Nielsen <knielsen@knielsen-hq.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef GTID_INDEX_H
#define GTID_INDEX_H
#include "my_global.h"
#include "mysqld.h"
#include "mariadb.h"
#include "rpl_gtid.h"
/*
This implements an on-disk index for each binlog file to speed up access to
the binlog at a specific offset or GTID position. This is primarily used when
a slave connects to the master, but also by user calling BINLOG_GTID_POS().
A connecting slave first scans the binlog files to find the last one with an
initial GTID_LIST event that lies before the starting GTID position. Then a
sequential scan of the binlog file is done until the requested GTID position
is found.
The binlog index conceptually extends this using index records corresponding
to different offset within one binlog file. Each record functions as if it
was the initial GTID_LIST event of a new binlog file, allowing the
sequential scan to start from the corresponding position. By having
sufficiently many index records, the scan will be fast.
The code that adds one record to the index is in two parts, a "sync" path
and an "async" path. The "sync" path in process_gtid_check_batch() does the
minimum amount of work which needs to run as part of transaction commit. The
actual writing of the index, in async_update(), can then be done as a
background task, minimizing the performance impact on transaction processing.
The "sync" and "async" paths each run single threaded, but can execute in
parallel with each other.
The index file is written incrementally together with the binlog file.
However there is no fsync()'s of the index file needed while writing. A
partially written index left by a crashing server will be re-written during
binlog recovery. A reader is allowed to use the index as it is begin written
(for the "hot" binlog file); such access is protected by mutex.
In case of lost or corrupt index, fallback to full sequential scan is done
(so performance will be affected but not correct functionality).
The index file is structured like a B+-tree. The index is append-only, so
also resembles a log-structured merge-tree, but with no merging of levels
needed as it covers a single fixed-size binlog file. This makes the building
of the tree relatively simple.
Keys in the tree consist of a GTID state (corresponding to a GTID_LIST
event) and the associated binlog file offset. All keys (except the first key
in each level of the tree) are delta-compressed to save space, holding only
the (domain_id, server_id) pairs that differ from the previous record.
The file is page-based. The first page contains the leftmost leaf node, and
the root node is at the end of the file. An incompletely written index file
can be detected by the last page in the file not being a root node page.
Nodes in the B+-tree usually fit in one page, but a node can be split across
multiple pages if GTID states are very large.
Page format:
The first page contains an extra file header:
Offset Size Description
0 4 MAGIC header identifying the file as a binlog index
4 1 Major version number. A new major version of the file format
is not readable by older server versions.
5 1 Minor version number. Formats differing only in minor version
are backwards compatible and can be read by older servers.
6 2 Padding/unused.
8 4 Page size.
Each page additionally contains this header:
Offset Size Description
0 1 Flags
1 3 Padding/unused
The last 4 bytes of each page is a 32-bit CRC.
An interior node is a sequence of
<child ptr> <key> <child ptr> <key> ... <key> <child ptr>
while a leaf node has only keys.
A child pointer is stored as 4 byte integer. The first page is 1, so that
0 can be used to denote "not present".
Format of a key:
Offset Size Description
0 4 Number of GTIDs in the key, plus 1. Or 0 for EOF.
4 4 Binlog file offset
8 4 Domain_id of first GTID
12 4 Server_id of first GTID
16 8 Seq_no of first GTID
... and so on for each GTID in the key.
A node typically fits in one page. But if the GTID state is very big (or
the page size very small), multiple pages may be used. When a node is split,
it can be split after a child pointer or before or after a GTID, but not
elsewhere.
Here is an example GTID index with page_size=64 containing 3 records:
Offset GTID state
0x11d [empty]
0x20e [0-1-1]
0x2ad [0-1-2]
The example contains 3 nodes, each stored in a single page. Two leaf nodes and
one interior root node.
Page 1 (leaf node page with file header):
fe fe 0c 01 "magic" identifying the file as a binlog GTID index
01 00 Major version 1, minor version 0
00 00 Padding / currently unused
40 00 00 00 Page size (64 bytes in this example)
05 Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
00 00 00 Padding / current unused
Key 1:
01 00 00 00 <GTID_count + 1> = 1 (entry has zero GTIDs in it)
1d 01 00 00 Binlog file offset = 0x11d
[Empty GTID state at the very start of the binlog]
Key 2:
02 00 00 00 GTID_count = 1
0e 02 00 00 Binlog file offset = 0x20e
00 00 00 00 01 00 00 00
01 00 00 00 00 00 00 00 GTID 0-1-1
00 00 00 00 00 Zero denotes end-of-node
00 00 00 00 00 00 00 (Unused space in the page)
0e 4f ac 43 Checksum / CRC
Page 2 (leaf node):
05 Flag PAGE_FLAG_IS_LEAF | PAGE_FLAG_LAST (single-page leaf node)
00 00 00 Unused
Key 1:
02 00 00 00 GTID_count = 1
ad 02 00 00 Binlog file offset = 0x2ad
00 00 00 00 01 00 00 00
02 00 00 00 00 00 00 00 GTID 0-1-2
00 00 00 00 End-of-node
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 (Unused space in the page)
0c 4e c2 b9 CRC
Page 3 (root node):
0c PAGE_FLAG_ROOT | PAGE_FLAG_LAST (interior root node)
00 00 00 Unused
Child pointer:
01 00 00 00 Pointer to page 1
Key for next child page:
02 00 00 00 GTID_count = 1
ad 02 00 00 Binlog offset = 0x2ad
00 00 00 00 01 00 00 00
02 00 00 00 00 00 00 00 GTID 0-1-2
Child pointer:
02 00 00 00 Pointer to page 2
00 00 0 000 Zero denotes end-of-node
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 (Unused)
8155 a3c7 CRC
Below is an example of the logical B-Tree structure of a larger GTID index
with a total of 12 keys.
We use S0, S1, ..., S11 to denote a key, which consists of a GTID state (as
seen in @@binlog_gtid_state and GTID_LIST_EVENT) and the associated binlog
file offset. D1, D2, ..., D11 denote the same keys, but delta-compressed, so
that D1 stores only those GTIDs that are not the same as in S0.
Pages are denoted by P1, P2, ..., P8. In the example, P1, P2, P3, P5, and P6
are leaf pages, the rest are interior node pages. P8 is the root node (the
root is always the last page in the index).
The contents of each page is listed in square brackets [...]. So P1[S0 D1 D2]
is a leaf page with 3 keys, and P7[P5 <D10+D11> P6] is an interior node page
with one key <D10+D11> and two child-page pointers to P5 and P6. The
notation <D10+D11> denotes the delta-compression of key S11 relative to S9;
all GTIDs in S11 that are not present in S9. In the code, this is computed
by combining D10 and D11, hence the use of the notation "D10+D11" instead of
the equivalent "S11-S9".
Here is the example B-Tree. It has 3 levels, with the leaf nodes at the top:
P1[S0 D1 D2] P2[D3 D4 D5] P3[D6 D7 D8] P5[D9 D10] P6[D11]
P4[P1 <S3> P2 <D4+D5+D6> P3] P7[P5 <D10+D11> P6]
P8[P4 <S9> P7]
To find eg. S4, we start from the root P8. S4<S9, so we follow the left child
pointer to P4. S4>S3 and S4<S6 (S6=(S3+D4+D5+D6)), so we follow the child
pointer to leaf page P2.
The index is written completely append-only; this is possible since keys are
always inserted in-order, at the end of the index. One page is kept in-memory
at each level of the B-Tree; when a new key no longer fits the page, it is
written out to disk and a new in-memory page is allocated for it.
Here are the operations that occur while writing the index file from the
above example. The left column is each key added to the index as the
corresponding GTID is written into the binlog file (<EOF> is when the index
is closed at binlog rotation). The right column are the operations performed,
as follows:
alloc(p) Allocate page p
add_key(p,k) Insert the key k into the page p
add_ptr(p,q) Insert a pointer to child page q in parent page p
write(p) Write out page p to disk at the end of the index file.
GTID STATE OPERATIONS
S0 alloc(P1) add_key(P1,S0)
D1 add_key(P1,D1)
D2 add_key(P1,D2)
D3 write(P1) alloc(P4) add_ptr(P4,P1)
alloc(P2) add_key(P2,D3) add_key(P4,S3)
D4 add_key(P2,D4)
D5 add_key(P2,D5)
D6 write(P2) add_ptr(P4,P2)
alloc(P3) add_key(P3,D6) add_key(P4,D4+D5+D6)
D7 add_key(P3,D7)
D8 add_key(P3,D8)
D9 write(P3) add_ptr(P4,P3)
alloc(P5) add_key(P5,D9)
write(P4) alloc(P8) add_ptr(P8,P4) alloc(P7) add_key(P8,S9)
D10 add_key(P5,D10)
D11 write(P5) add_ptr(P7,P5)
alloc(P6) add_key(P6,D11) add_key(P7,D10+D11)
<EOF> write(P6) add_ptr(P7,P6)
write(P7) add_ptr(P8,P7)
write(P8)
After adding each record to the index, there is exactly one partial page
allocated in-memory for each level present in the B-Tree; new pages being
allocated as old pages fill up and are written to disk.
*/
class Gtid_index_base
{
public:
/* +4 for ".idx" prefix. */
static constexpr size_t GTID_INDEX_FILENAME_MAX_SIZE= FN_REFLEN+4;
protected:
enum enum_page_flags {
/* Set for a leaf node page, cleared for an interior node page. */
PAGE_FLAG_IS_LEAF= 1,
/* This is a continuation page. */
PAGE_FLAG_IS_CONT= 2,
/* No continuation page follows (the last page in a group). */
PAGE_FLAG_LAST= 4,
/*
Flag set to mark the root node. (The root node is normally the last page
in the index file, but having an explicit flag allows us to detect a
partially written index file with the root node missing.
*/
PAGE_FLAG_ROOT= 8,
};
/*
Minor version increment represents a backwards-compatible format (can be
read by any server version that knows the format of the major version).
Major version increment means a server should not attempt to read from the
index.
*/
static constexpr uchar GTID_INDEX_VERSION_MAJOR= 1;
static constexpr uchar GTID_INDEX_VERSION_MINOR= 0;
static constexpr size_t GTID_INDEX_FILE_HEADER_SIZE= 12;
static constexpr size_t GTID_INDEX_PAGE_HEADER_SIZE= 4;
static constexpr size_t CHECKSUM_LEN= 4;
#ifdef _MSC_VER
/*
Flexible array member is part of C99, but it is not standard in C++.
All the compilers and platforms we support do support it, though.
Just we need to disable on Windows a warning about using a non-standard
C++ extension.
*/
#pragma warning(disable : 4200)
#endif
struct Node_page
{
Node_page *next;
/* Pointer to allow to update the "flags" byte at page writeout. */
uchar *flag_ptr;
/* Flexible array member; will be allocated to opt_gtid_index_page_size. */
uchar page[];
};
struct Index_node_base
{
Node_page *first_page;
Node_page *current_page;
/* The current_ptr is only valid if current_page != 0. */
uchar *current_ptr;
Index_node_base();
~Index_node_base();
void free_pages();
void reset();
};
public:
static void make_gtid_index_file_name(char *out_name, size_t bufsize,
const char *base_filename);
protected:
int update_gtid_state(rpl_binlog_state_base *state,
const rpl_gtid *gtid_list, uint32 gtid_count);
Node_page *alloc_page();
rpl_gtid *gtid_list_buffer(uint32 count);
void build_index_filename(const char *filename);
virtual int give_error(const char *msg) = 0;
/*
A buffer to hold a gtid_list temporarily.
Increased as needed to hold largest needed list.
*/
rpl_gtid *gtid_buffer;
uint32 gtid_buffer_alloc;
size_t page_size;
public:
char index_file_name[GTID_INDEX_FILENAME_MAX_SIZE];
protected:
Gtid_index_base();
virtual ~Gtid_index_base();
};
class Gtid_index_writer : public Gtid_index_base
{
private:
const my_off_t offset_min_threshold;
struct Index_node : public Index_node_base
{
rpl_binlog_state_base state;
uint32 num_records;
uint32 level;
bool force_spill_page;
Index_node(uint32 level_);
~Index_node();
void reset();
};
public:
static void gtid_index_init();
static void gtid_index_cleanup();
protected:
friend class Gtid_index_reader_hot;
static void lock_gtid_index() { mysql_mutex_lock(&gtid_index_mutex); }
static void unlock_gtid_index() { mysql_mutex_unlock(&gtid_index_mutex); }
static const Gtid_index_writer *find_hot_index(const char *file_name);
public:
Gtid_index_writer(const char *filename, uint32 offset,
rpl_binlog_state_base *binlog_state,
uint32 opt_page_size, my_off_t opt_span_min);
virtual ~Gtid_index_writer();
void process_gtid(uint32 offset, const rpl_gtid *gtid);
int process_gtid_check_batch(uint32 offset, const rpl_gtid *gtid,
rpl_gtid **out_gtid_list,
uint32 *out_gtid_count);
int async_update(uint32 event_offset, rpl_gtid *gtid_list, uint32 gtid_count);
void close();
private:
void insert_in_hot_index();
void remove_from_hot_index();
uint32 write_current_node(uint32 level, bool is_root);
int reserve_space(Index_node *n, size_t bytes);
int do_write_record(uint32 level, uint32 event_offset,
const rpl_gtid *gtid_list, uint32 gtid_count);
int add_child_ptr(uint32 level, my_off_t node_offset);
int write_record(uint32 event_offset, const rpl_gtid *gtid_list,
uint32 gtid_count);
bool check_room(uint32 level, uint32 gtid_count);
int alloc_level_if_missing(uint32 level);
uchar *init_header(Node_page *page, bool is_leaf, bool is_first);
int give_error(const char *msg) override;
static mysql_mutex_t gtid_index_mutex;
static Gtid_index_writer *hot_index_list;
rpl_binlog_state_base pending_state;
/* Next pointer for the hot_index_list linked list. */
Gtid_index_writer *next_hot_index;
/* The currently being built index nodes, from leaf[0] to root[max_level]. */
Index_node **nodes;
my_off_t previous_offset;
uint32 max_level;
File index_file;
/*
This is set if we encounter an error (such as out-of-memory or I/O error).
Then we will no longer do any updates to the index, to prevent leaving a
corrupt index. This is not fatal; the partial index will work up to where
it got the error, and the code can fall-back to sequential scan of the
binlog.
*/
bool error_state;
/* Flag to help put the file header at the start of the very first page. */
bool file_header_written;
/* Flag set while this object is visible in the "hot index" list. */
bool in_hot_index_list;
};
class Gtid_index_reader : public Gtid_index_base
{
public:
Gtid_index_reader();
virtual ~Gtid_index_reader();
int open_index_file(const char *binlog_filename);
void close_index_file();
/*
The search functions take either a binlog offset or GTID position to search
for. They return:
0 for "not found" (searched position is earlier than start of index).
1 for "found"
-1 for error.
When found, the returned position is the last position in the index that
lies at or before the searched position. The offset of the returned
position is written to *out_offset. The number of GTIDs in the returned
GTID state is written to *out_gtid_count; the list of found GTIDs can be
accessed with search_gtid_list() and is valid only until next search or
freeing of the Gtid_index_reader object.
*/
int search_offset(uint32 in_offset, uint32 *out_offset,
uint32 *out_gtid_count);
int search_gtid_pos(slave_connection_state *in_gtid_pos, uint32 *out_offset,
uint32 *out_gtid_count);
rpl_gtid *search_gtid_list();
protected:
int search_cmp_offset(uint32 offset, rpl_binlog_state_base *state);
int search_cmp_gtid_pos(uint32 offset, rpl_binlog_state_base *state);
virtual int do_index_search(uint32 *out_offset, uint32 *out_gtid_count);
int do_index_search_root(uint32 *out_offset, uint32 *out_gtid_count);
int do_index_search_leaf(bool current_state_updated,
uint32 *out_offset, uint32 *out_gtid_count);
int next_page();
int find_bytes(uint32 num_bytes);
virtual int get_child_ptr(uint32 *out_child_ptr);
int get_offset_count(uint32 *out_offset, uint32 *out_gtid_count);
int get_gtid_list(rpl_gtid *out_gtid_list, uint32 count);
virtual int read_file_header();
int verify_checksum(Node_page *page);
Node_page *alloc_and_read_page();
virtual int read_root_node();
virtual int read_node(uint32 page_ptr);
int read_node_cold(uint32 page_ptr);
int give_error(const char *msg) override;
rpl_binlog_state_base current_state;
rpl_binlog_state_base compare_state;
Index_node_base cold_node;
/* n points to either cold node or hot node in writer. */
Index_node_base *n;
int (Gtid_index_reader::* search_cmp_function)(uint32, rpl_binlog_state_base *);
slave_connection_state *in_search_gtid_pos;
Node_page *read_page;
uchar *read_ptr;
File index_file;
uint32 current_offset;
uint32 in_search_offset;
bool file_open;
bool index_valid;
bool has_root_node;
uchar version_major;
uchar version_minor;
};
/*
Sub-class of Gtid_index_reader that can additionally access in-memory "hot"
pages of the index, which are partially filled pages of the current binlog
file, not yet written to disk.
*/
class Gtid_index_reader_hot : public Gtid_index_reader
{
public:
Gtid_index_reader_hot();
virtual ~Gtid_index_reader_hot() { }
private:
int do_index_search(uint32 *out_offset, uint32 *out_gtid_count) override;
int get_child_ptr(uint32 *out_child_ptr) override;
int read_file_header() override;
int read_root_node() override;
int read_node(uint32 page_ptr) override;
int read_node_hot();
/* Pointer to the writer object, if we're reading a hot index. */
const Gtid_index_writer *hot_writer;
/* The level we are currently reading in the hot writer .*/
uint32 hot_level;
};
#endif /* GTID_INDEX_H */
......@@ -40,6 +40,7 @@
#include "sql_audit.h"
#include "mysqld.h"
#include "ddl_log.h"
#include "gtid_index.h"
#include <my_dir.h>
#include <m_ctype.h> // For test_if_number
......@@ -164,12 +165,44 @@ static SHOW_VAR binlog_status_vars_detail[]=
Variables for the binlog background thread.
Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
*/
struct Binlog_background_job
{
union
{
MYSQL_BIN_LOG::xid_count_per_binlog *notify_entry;
struct {
Gtid_index_writer *gi;
rpl_gtid *gtid_list;
uint32 gtid_count;
uint32 offset;
} gtid_index_data;
};
Binlog_background_job *next;
enum enum_job_type {
CHECKPOINT_NOTIFY,
GTID_INDEX_UPDATE,
GTID_INDEX_CLOSE,
SENTINEL
} job_type;
};
static bool binlog_background_thread_started= false;
static bool binlog_background_thread_stop= false;
static MYSQL_BIN_LOG::xid_count_per_binlog *
binlog_background_thread_queue= NULL;
static bool binlog_background_thread_sentinel= false;
static Binlog_background_job *binlog_background_thread_queue= NULL;
static Binlog_background_job **binlog_background_thread_endptr=
&binlog_background_thread_queue;
static Binlog_background_job *binlog_background_freelist= NULL;
static bool start_binlog_background_thread();
static int queue_binlog_background_checkpoint_notify(
MYSQL_BIN_LOG::xid_count_per_binlog *entry);
static int queue_binlog_background_gtid_index_update(Gtid_index_writer *gi,
uint32 offset,
rpl_gtid *gtid_list,
uint32 count);
static int queue_binlog_background_gtid_index_close(Gtid_index_writer *gi);
static int queue_binlog_background_sentinel();
static void binlog_background_wait_for_sentinel();
static rpl_binlog_state rpl_global_gtid_binlog_state;
......@@ -3418,7 +3451,7 @@ MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
group_commit_queue(0), group_commit_queue_busy(FALSE),
num_commits(0), num_group_commits(0),
group_commit_trigger_count(0), group_commit_trigger_timeout(0),
group_commit_trigger_lock_wait(0),
group_commit_trigger_lock_wait(0), gtid_index(nullptr),
sync_period_ptr(sync_period), sync_counter(0),
state_file_deleted(false), binlog_state_recover_done(false),
is_relay_log(0), relay_signal_cnt(0),
......@@ -3861,6 +3894,26 @@ bool MYSQL_BIN_LOG::open(const char *log_name,
if (write_event(&gl_ev))
goto err;
/* Open an index file for this binlog file. */
DBUG_ASSERT(!gtid_index); /* Binlog close should clear it. */
if (gtid_index)
delete gtid_index;
if (opt_binlog_gtid_index)
{
my_off_t offset= my_b_tell(&log_file);
gtid_index=
new Gtid_index_writer(log_file_name, (uint32)offset,
&rpl_global_gtid_binlog_state,
(uint32)opt_binlog_gtid_index_page_size,
(my_off_t)opt_binlog_gtid_index_span_min);
if (!gtid_index)
sql_print_information("Could not create GTID index for binlog "
"file '%s'. Accesses to this binlog file will "
"fallback to slower sequential scan.");
}
else
gtid_index= nullptr;
/* Output a binlog checkpoint event at the start of the binlog file. */
/*
......@@ -4410,12 +4463,31 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
no new ones will be written. So we can proceed to delete the logs.
*/
mysql_mutex_unlock(&LOCK_xid_list);
/*
Push a sentinel through the binlog background thread and wait for it to
return. When it does, we know that no more GTID index operations are
pending as we are holding LOCK_log.
(This is normally already the case as we pushed a binlog checkpoint
request through. But if no XID-capable engines are enabled (eg. running
without InnoDB), then that is a no-op).
*/
queue_binlog_background_sentinel();
binlog_background_wait_for_sentinel();
}
/* Save variables so that we can reopen the log */
save_name=name;
name=0; // Protect against free
close(LOG_CLOSE_TO_BE_OPENED);
/*
Close the active log.
Close the active GTID index synchroneously. We don't want the close
running in the background while we delete the gtid index file. And we just
pushed a sentinel through the binlog background thread while holding
LOCK_log, so no other GTID index operations can be pending.
*/
close(LOG_CLOSE_TO_BE_OPENED|LOG_CLOSE_SYNC_GTID_INDEX);
last_used_log_number= 0; // Reset log number cache
......@@ -4440,6 +4512,28 @@ bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
for (;;)
{
/* Delete any GTID index file. */
char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf),
linfo.log_file_name);
if (my_delete(buf, MYF(0)))
{
/* If ENOENT, the GTID index file is already deleted or never existed. */
if (my_errno != ENOENT)
{
if (thd)
{
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE),
buf, my_errno);
}
sql_print_information("Failed to delete file '%s' (errno=%d)",
buf, my_errno);
}
my_errno= 0;
}
/* Delete the binlog file. */
if (unlikely((error= my_delete(linfo.log_file_name, MYF(0)))))
{
if (my_errno == ENOENT)
......@@ -4950,6 +5044,7 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
int error= 0;
LOG_INFO log_info;
LOG_INFO check_log_info;
char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
DBUG_ASSERT(my_b_inited(&purge_index_file));
......@@ -4983,6 +5078,24 @@ int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
/* Get rid of the trailing '\n' */
log_info.log_file_name[length-1]= 0;
Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf),
log_info.log_file_name);
if (my_delete(buf, MYF(0)))
{
/* If ENOENT, the GTID index file is already deleted or never existed. */
if (my_errno != ENOENT)
{
if (thd)
{
push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
ER_CANT_DELETE_FILE, ER_THD(thd, ER_CANT_DELETE_FILE),
buf, my_errno);
}
sql_print_information("Failed to delete file '%s'", buf);
}
my_errno= 0;
}
if (unlikely(!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s,
MYF(0))))
{
......@@ -6962,6 +7075,8 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
{
bool synced;
update_gtid_index((uint32)offset, thd->get_last_commit_gtid());
if ((error= flush_and_sync(&synced)))
{
}
......@@ -7039,6 +7154,30 @@ bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
}
void
MYSQL_BIN_LOG::update_gtid_index(uint32 offset, rpl_gtid gtid)
{
if (!unlikely(gtid_index))
return;
rpl_gtid *gtid_list;
uint32 gtid_count;
int err= gtid_index->process_gtid_check_batch(offset, &gtid,
&gtid_list, &gtid_count);
if (err)
return;
if (gtid_list)
{
/*
Perform the GTID index update in the binlog background thread,
as we are running under the critical LOCK_log mutex.
*/
if (queue_binlog_background_gtid_index_update(gtid_index, offset,
gtid_list, gtid_count))
my_free(gtid_list);
}
}
int error_log_print(enum loglevel level, const char *format,
va_list args)
{
......@@ -8477,6 +8616,8 @@ MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
commit_offset= my_b_write_tell(&log_file);
update_gtid_index((uint32)commit_offset,
current->thd->get_last_commit_gtid());
cache_mngr->last_commit_pos_offset= commit_offset;
if ((cache_mngr->using_xa && cache_mngr->xa_xid) || current->need_unlog)
{
......@@ -9063,6 +9204,33 @@ void MYSQL_BIN_LOG::close(uint exiting)
}
#endif /* HAVE_REPLICATION */
if (!is_relay_log && likely(gtid_index))
{
if (exiting & (LOG_CLOSE_STOP_EVENT|LOG_CLOSE_SYNC_GTID_INDEX))
{
/*
The binlog background thread is already stopped just close the final
GTID index synchronously. Or caller explicitly requested synchronous
close of the GTID index.
*/
gtid_index->close();
delete gtid_index;
}
else
{
/*
Queue a close on the current GTID index.
Important that this is queued _before_ the checkpoint request is sent
(and thus before chechpoint notifications can be queued); this way, if
we crash before the GTID index is synced to disk, the checkpoint will
still be pending and the binlog file will be scanned during crash
recovery and the GTID index recovered.
*/
queue_binlog_background_gtid_index_close(gtid_index);
}
gtid_index= nullptr;
}
/* don't pwrite in a file opened with O_APPEND - it doesn't work */
if (log_file.type == WRITE_CACHE && !(exiting & LOG_CLOSE_DELAYED_CLOSE))
{
......@@ -10656,22 +10824,7 @@ void
TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
{
xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
bool found_entry= false;
mysql_mutex_lock(&LOCK_binlog_background_thread);
/* count the same notification kind from different engines */
for (xid_count_per_binlog *link= binlog_background_thread_queue;
link && !found_entry; link= link->next_in_queue)
{
if ((found_entry= (entry == link)))
entry->notify_count++;
}
if (!found_entry)
{
entry->next_in_queue= binlog_background_thread_queue;
binlog_background_thread_queue= entry;
}
mysql_cond_signal(&COND_binlog_background_thread);
mysql_mutex_unlock(&LOCK_binlog_background_thread);
queue_binlog_background_checkpoint_notify(entry);
}
/*
......@@ -10690,7 +10843,9 @@ pthread_handler_t
binlog_background_thread(void *arg __attribute__((unused)))
{
bool stop;
MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
Binlog_background_job *queue, *next;
Binlog_background_job *freelist= nullptr;
Binlog_background_job **freelist_endptr= &freelist;
THD *thd;
my_thread_init();
DBUG_ENTER("binlog_background_thread");
......@@ -10734,6 +10889,18 @@ binlog_background_thread(void *arg __attribute__((unused)))
*/
THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks);
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
/*
Put back our job objects in the freelist, now that we own the mutex again.
*/
if (freelist)
{
*freelist_endptr= binlog_background_freelist;
binlog_background_freelist= freelist;
freelist= nullptr;
freelist_endptr= &freelist;
}
for (;;)
{
stop= binlog_background_thread_stop;
......@@ -10752,6 +10919,7 @@ binlog_background_thread(void *arg __attribute__((unused)))
}
/* Grab the queue, if any. */
binlog_background_thread_queue= NULL;
binlog_background_thread_endptr= &binlog_background_thread_queue;
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
/* Process any incoming commit_checkpoint_notify() calls. */
......@@ -10767,17 +10935,40 @@ binlog_background_thread(void *arg __attribute__((unused)))
#endif
while (queue)
{
long count= queue->notify_count;
switch (queue->job_type)
{
case Binlog_background_job::CHECKPOINT_NOTIFY:
THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
/* Set the thread start time */
thd->set_time();
/* Grab next pointer first, as mark_xid_done() may free the element. */
next= queue->next_in_queue;
queue->notify_count= 0;
for (long i= 0; i <= count; i++)
mysql_bin_log.mark_xid_done(queue->binlog_id, true);
queue= next;
mysql_bin_log.mark_xid_done(queue->notify_entry->binlog_id, true);
break;
case Binlog_background_job::GTID_INDEX_UPDATE:
queue->gtid_index_data.gi->
async_update(queue->gtid_index_data.offset,
queue->gtid_index_data.gtid_list,
queue->gtid_index_data.gtid_count);
break;
case Binlog_background_job::GTID_INDEX_CLOSE:
queue->gtid_index_data.gi->close();
delete queue->gtid_index_data.gi;
break;
case Binlog_background_job::SENTINEL:
/*
The sentinel is a way to signal to reset_logs() that all pending
background jobs prior to the sentinel have been processed.
*/
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
DBUG_ASSERT(binlog_background_thread_sentinel);
binlog_background_thread_sentinel= false;
mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
break;
}
#ifdef ENABLED_DEBUG_SYNC
DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
......@@ -10786,6 +10977,12 @@ binlog_background_thread(void *arg __attribute__((unused)))
STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
);
#endif
next= queue->next;
queue->next= nullptr;
*freelist_endptr= queue;
freelist_endptr= &queue->next;
queue= next;
}
if (stop)
......@@ -10794,6 +10991,13 @@ binlog_background_thread(void *arg __attribute__((unused)))
THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread);
while (freelist)
{
next= freelist->next;
my_free(freelist);
freelist= next;
}
/* No need to use mutex as thd is not linked into other threads */
THD_count::count++;
delete thd;
......@@ -10802,6 +11006,12 @@ binlog_background_thread(void *arg __attribute__((unused)))
/* Signal that we are (almost) stopped. */
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
while (binlog_background_freelist)
{
next= binlog_background_freelist->next;
my_free(binlog_background_freelist);
binlog_background_freelist= next;
}
binlog_background_thread_stop= false;
mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
......@@ -10845,6 +11055,139 @@ start_binlog_background_thread()
return 0;
}
static Binlog_background_job *
get_binlog_background_job()
{
Binlog_background_job *job;
mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread);
if ((job= binlog_background_freelist) != nullptr)
binlog_background_freelist= job->next;
else
job= (Binlog_background_job *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*job),
MYF(MY_WME));
return job;
}
static void
queue_binlog_background_job(Binlog_background_job *job)
{
mysql_mutex_assert_owner(&mysql_bin_log.LOCK_binlog_background_thread);
job->next= nullptr;
*binlog_background_thread_endptr= job;
binlog_background_thread_endptr= &job->next;
mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread);
}
static int
queue_binlog_background_checkpoint_notify(
MYSQL_BIN_LOG::xid_count_per_binlog *entry)
{
int res;
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
Binlog_background_job *job= get_binlog_background_job();
if (!job)
res= 1;
else
{
job->job_type= Binlog_background_job::CHECKPOINT_NOTIFY;
job->notify_entry= entry;
queue_binlog_background_job(job);
res= 0;
}
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
return res;
}
static int
queue_binlog_background_gtid_index_update(Gtid_index_writer *gi, uint32 offset,
rpl_gtid *gtid_list, uint32 count)
{
int res;
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
Binlog_background_job *job= get_binlog_background_job();
if (!unlikely(job))
res= 1;
else
{
job->job_type= Binlog_background_job::GTID_INDEX_UPDATE;
job->gtid_index_data.gi= gi;
job->gtid_index_data.gtid_list= gtid_list;
job->gtid_index_data.gtid_count= count;
job->gtid_index_data.offset= offset;
queue_binlog_background_job(job);
res= 0;
}
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
return res;
}
static int
queue_binlog_background_gtid_index_close(Gtid_index_writer *gi)
{
int res;
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
Binlog_background_job *job= get_binlog_background_job();
if (!job)
return 1;
else
{
job->job_type= Binlog_background_job::GTID_INDEX_CLOSE;
job->gtid_index_data.gi= gi;
queue_binlog_background_job(job);
res= 0;
}
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
return res;
}
static int
queue_binlog_background_sentinel()
{
int res;
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
DBUG_ASSERT(!binlog_background_thread_sentinel);
Binlog_background_job *job= get_binlog_background_job();
if (!job)
return 1;
else
{
binlog_background_thread_sentinel= true;
job->job_type= Binlog_background_job::SENTINEL;
queue_binlog_background_job(job);
res= 0;
}
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
return res;
}
static void
binlog_background_wait_for_sentinel()
{
mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
while(binlog_background_thread_sentinel)
mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end,
&mysql_bin_log.LOCK_binlog_background_thread);
mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);
}
#ifdef HAVE_REPLICATION
class Recovery_context
{
......@@ -11111,7 +11454,7 @@ bool Recovery_context::reset_truncate_coord(my_off_t pos)
for (uint i= 0; i < gtid_maybe_to_truncate->elements(); i++)
{
rpl_gtid gtid= gtid_maybe_to_truncate->at(i);
if (rpl_global_gtid_binlog_state.update_nolock(&gtid, false))
if (rpl_global_gtid_binlog_state.update_nolock(&gtid))
return true;
}
gtid_maybe_to_truncate->clear();
......@@ -11376,6 +11719,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
Format_description_log_event *fdle, bool do_xa)
{
Log_event *ev= NULL;
Gtid_index_writer *gtid_index_recover= NULL;
HASH xids, ddl_log_ids;
MEM_ROOT mem_root;
char binlog_checkpoint_name[FN_REFLEN];
......@@ -11512,6 +11856,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
/* Initialise the binlog state from the Gtid_list event. */
if (rpl_global_gtid_binlog_state.load(glev->list, glev->count))
goto err2;
if (opt_binlog_gtid_index)
gtid_index_recover= recover_gtid_index_start(last_log_name, end_pos);
}
break;
......@@ -11551,7 +11897,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
(((Query_log_event *)ev)->is_commit() ||
((Query_log_event *)ev)->is_rollback()))));
if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid, false))
recover_gtid_index_process(gtid_index_recover, end_pos, &ctx.last_gtid);
if (rpl_global_gtid_binlog_state.update_nolock(&ctx.last_gtid))
goto err2;
ctx.last_gtid_valid= false;
}
......@@ -11560,6 +11907,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
delete ev;
ev= NULL;
} // end of while
recover_gtid_index_end(gtid_index_recover);
gtid_index_recover= NULL;
cur_log= &log;
/*
......@@ -11645,6 +11994,7 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
err2:
delete ev;
recover_gtid_index_abort(gtid_index_recover);
if (file >= 0)
{
end_io_cache(&log);
......@@ -11663,6 +12013,104 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
}
/*
Start recovery of the GTID index for a binlog file.
The old index is deleted and a new index is rebuilt while scanning the
binlog file during binlog recovery.
Errors are not fatal, as the code can fallback to slower full binlog file
scan when no GTID index is available.
@param base_name File name of the binlog file.
@param offset End log pos of the GTID_LIST log event of the binlog file.
@return Gtid_index_writer object or NULL.
*/
Gtid_index_writer *
MYSQL_BIN_LOG::recover_gtid_index_start(const char *base_name, my_off_t offset)
{
char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
Gtid_index_base::make_gtid_index_file_name(buf, sizeof(buf), base_name);
if (my_delete(buf, MYF(0)))
{
/* If ENOENT, the GTID index file is already deleted or never existed. */
if (my_errno != ENOENT)
{
sql_print_information("Failed to delete file '%s' (errno=%d)", buf, my_errno);
}
my_errno= 0;
}
Gtid_index_writer *gi=
new Gtid_index_writer(base_name, (uint32)offset,
&rpl_global_gtid_binlog_state,
(uint32)opt_binlog_gtid_index_page_size,
(my_off_t)opt_binlog_gtid_index_span_min);
return gi;
}
/*
Process one GTID during GTID index recovery.
@param gi Gtid_index_writer object or NULL.
@param offset End log pos of the GTID event.
@param gev GTID log event to process.
@return nothing
*/
void
MYSQL_BIN_LOG::recover_gtid_index_process(Gtid_index_writer *gi,
my_off_t offset,
const rpl_gtid *gtid)
{
if (gi)
{
gi->process_gtid((uint32)offset, gtid);
}
}
/*
Complete the recovery of one GTID index, syncing and closing it.
@param gi Gtid_index_writer object or NULL.
@return nothing
*/
void
MYSQL_BIN_LOG::recover_gtid_index_end(Gtid_index_writer *gi)
{
if (gi)
{
gi->close();
delete gi;
}
}
/*
Abort the recovery of one GTID index, deleting any partially recovered index.
@param gi Gtid_index_writer object or NULL.
@return nothing
*/
void
MYSQL_BIN_LOG::recover_gtid_index_abort(Gtid_index_writer *gi)
{
if (gi)
{
char buf[Gtid_index_base::GTID_INDEX_FILENAME_MAX_SIZE];
strmake(buf, gi->index_file_name, sizeof(buf)-1);
/*
Delete first the Gtid_index_writer object and then the partial index
(the writer still has the index file open and active until destructed).
*/
delete(gi);
my_delete(buf, MYF(0));
}
}
int
MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
......
......@@ -21,8 +21,10 @@
#include "rpl_constants.h"
class Relay_log_info;
class Gtid_index_writer;
class Format_description_log_event;
class Gtid_log_event;
bool reopen_fstreams(const char *filename, FILE *outstream, FILE *errstream);
void setup_log_handling();
......@@ -240,6 +242,7 @@ extern TC_LOG_DUMMY tc_log_dummy;
#define LOG_CLOSE_TO_BE_OPENED 2
#define LOG_CLOSE_STOP_EVENT 4
#define LOG_CLOSE_DELAYED_CLOSE 8
#define LOG_CLOSE_SYNC_GTID_INDEX 16
/*
Maximum unique log filename extension.
......@@ -711,6 +714,9 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
ulonglong group_commit_trigger_count, group_commit_trigger_timeout;
ulonglong group_commit_trigger_lock_wait;
/* Binlog GTID index. */
Gtid_index_writer *gtid_index;
/* pointer to the sync period variable, for binlog this will be
sync_binlog_period, for relay log this will be
sync_relay_log_period
......@@ -720,6 +726,13 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool state_file_deleted;
bool binlog_state_recover_done;
Gtid_index_writer *recover_gtid_index_start(const char *base_name,
my_off_t offset);
void recover_gtid_index_process(Gtid_index_writer *gi, my_off_t offset,
const rpl_gtid *gtid);
void recover_gtid_index_end(Gtid_index_writer *gi);
void recover_gtid_index_abort(Gtid_index_writer *gi);
inline uint get_sync_period()
{
return *sync_period_ptr;
......@@ -739,6 +752,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
bool write_transaction_to_binlog_events(group_commit_entry *entry);
void trx_group_commit_leader(group_commit_entry *leader);
bool is_xidlist_idle_nolock();
void update_gtid_index(uint32 offset, rpl_gtid gtid);
public:
int new_file_without_locking();
/*
......@@ -759,11 +774,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private Event_log
ulong binlog_id;
/* Total prepared XIDs and pending checkpoint requests in this binlog. */
long xid_count;
long notify_count;
/* For linking in requests to the binlog background thread. */
xid_count_per_binlog *next_in_queue;
xid_count_per_binlog(char *log_file_name, uint log_file_name_len)
:binlog_id(0), xid_count(0), notify_count(0)
:binlog_id(0), xid_count(0)
{
binlog_name_len= log_file_name_len;
binlog_name= (char *) my_malloc(PSI_INSTRUMENT_ME, binlog_name_len, MYF(MY_ZEROFILL));
......
......@@ -83,6 +83,7 @@
#include "wsrep_server_state.h"
#endif /* WITH_WSREP */
#include "proxy_protocol.h"
#include "gtid_index.h"
#include "sql_callback.h"
#include "threadpool.h"
......@@ -443,6 +444,9 @@ my_bool sp_automatic_privileges= 1;
ulong opt_binlog_rows_event_max_size;
ulong binlog_row_metadata;
my_bool opt_binlog_gtid_index= TRUE;
uint opt_binlog_gtid_index_page_size= 4096;
uint opt_binlog_gtid_index_span_min= 65536;
my_bool opt_master_verify_checksum= 0;
my_bool opt_slave_sql_verify_checksum= 1;
const char *binlog_format_names[]= {"MIXED", "STATEMENT", "ROW", NullS};
......@@ -491,6 +495,7 @@ ulong malloc_calls;
ulong specialflag=0;
ulong binlog_cache_use= 0, binlog_cache_disk_use= 0;
ulong binlog_stmt_cache_use= 0, binlog_stmt_cache_disk_use= 0;
ulong binlog_gtid_index_hit= 0, binlog_gtid_index_miss= 0;
ulong max_connections, max_connect_errors;
uint max_password_errors;
ulong extra_max_connections;
......@@ -896,7 +901,7 @@ PSI_file_key key_file_binlog, key_file_binlog_cache, key_file_binlog_index,
PSI_file_key key_file_query_log, key_file_slow_log;
PSI_file_key key_file_relaylog, key_file_relaylog_index,
key_file_relaylog_cache, key_file_relaylog_index_cache;
PSI_file_key key_file_binlog_state;
PSI_file_key key_file_binlog_state, key_file_gtid_index;
#ifdef HAVE_PSI_INTERFACE
#ifdef HAVE_MMAP
......@@ -921,6 +926,7 @@ PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_LOCK_status, key_LOCK_temp_pool,
key_LOCK_system_variables_hash, key_LOCK_thd_data, key_LOCK_thd_kill,
key_LOCK_user_conn, key_LOCK_uuid_short_generator, key_LOG_LOCK_log,
key_gtid_index_lock,
key_master_info_data_lock, key_master_info_run_lock,
key_master_info_sleep_lock, key_master_info_start_stop_lock,
key_master_info_start_alter_lock,
......@@ -1007,6 +1013,7 @@ static PSI_mutex_info all_server_mutexes[]=
{ &key_LOCK_user_conn, "LOCK_user_conn", PSI_FLAG_GLOBAL},
{ &key_LOCK_uuid_short_generator, "LOCK_uuid_short_generator", PSI_FLAG_GLOBAL},
{ &key_LOG_LOCK_log, "LOG::LOCK_log", 0},
{ &key_gtid_index_lock, "Gtid_index_writer::gtid_index_mutex", 0},
{ &key_master_info_data_lock, "Master_info::data_lock", 0},
{ &key_master_info_start_stop_lock, "Master_info::start_stop_lock", 0},
{ &key_master_info_run_lock, "Master_info::run_lock", 0},
......@@ -2011,6 +2018,7 @@ static void clean_up(bool print_message)
injector::free_instance();
mysql_bin_log.cleanup();
Gtid_index_writer::gtid_index_cleanup();
my_tz_free();
my_dboptions_cache_free();
......@@ -3962,6 +3970,7 @@ static int init_common_variables()
inited before MY_INIT(). So we do it here.
*/
mysql_bin_log.init_pthread_objects();
Gtid_index_writer::gtid_index_init();
/* TODO: remove this when my_time_t is 64 bit compatible */
if (!IS_TIME_T_VALID_FOR_TIMESTAMP(server_start_time))
......@@ -7396,6 +7405,8 @@ SHOW_VAR status_vars[]= {
{"Binlog_bytes_written", (char*) offsetof(STATUS_VAR, binlog_bytes_written), SHOW_LONGLONG_STATUS},
{"Binlog_cache_disk_use", (char*) &binlog_cache_disk_use, SHOW_LONG},
{"Binlog_cache_use", (char*) &binlog_cache_use, SHOW_LONG},
{"Binlog_gtid_index_hit", (char*) &binlog_gtid_index_hit, SHOW_LONG},
{"Binlog_gtid_index_miss", (char*) &binlog_gtid_index_miss, SHOW_LONG},
{"Binlog_stmt_cache_disk_use",(char*) &binlog_stmt_cache_disk_use, SHOW_LONG},
{"Binlog_stmt_cache_use", (char*) &binlog_stmt_cache_use, SHOW_LONG},
{"Busy_time", (char*) offsetof(STATUS_VAR, busy_time), SHOW_DOUBLE_STATUS},
......@@ -7821,6 +7832,7 @@ static int mysql_init_variables(void)
delayed_insert_errors= thread_created= 0;
specialflag= 0;
binlog_cache_use= binlog_cache_disk_use= 0;
binlog_gtid_index_hit= binlog_gtid_index_miss= 0;
max_used_connections= slow_launch_threads = 0;
max_used_connections_time= 0;
mysqld_user= mysqld_chroot= opt_init_file= opt_bin_logname = 0;
......@@ -9219,7 +9231,8 @@ static PSI_file_info all_server_files[]=
{ &key_file_trg, "trigger_name", 0},
{ &key_file_trn, "trigger", 0},
{ &key_file_init, "init", 0},
{ &key_file_binlog_state, "binlog_state", 0}
{ &key_file_binlog_state, "binlog_state", 0},
{ &key_file_gtid_index, "gtid_index", 0}
};
#endif /* HAVE_PSI_INTERFACE */
......@@ -9413,6 +9426,7 @@ PSI_memory_key key_memory_acl_cache;
PSI_memory_key key_memory_acl_mem;
PSI_memory_key key_memory_acl_memex;
PSI_memory_key key_memory_binlog_cache_mngr;
PSI_memory_key key_memory_binlog_gtid_index;
PSI_memory_key key_memory_binlog_pos;
PSI_memory_key key_memory_binlog_recover_exec;
PSI_memory_key key_memory_binlog_statement_buffer;
......@@ -9652,6 +9666,7 @@ static PSI_memory_info all_server_memory[]=
// { &key_memory_Slave_job_group_group_relay_log_name, "Slave_job_group::group_relay_log_name", 0},
{ &key_memory_Relay_log_info_group_relay_log_name, "Relay_log_info::group_relay_log_name", 0},
{ &key_memory_binlog_cache_mngr, "binlog_cache_mngr", 0},
{ &key_memory_binlog_gtid_index, "binlog_gtid_index", 0},
{ &key_memory_Row_data_memory_memory, "Row_data_memory::memory", 0},
// { &key_memory_Gtid_set_to_string, "Gtid_set::to_string", 0},
// { &key_memory_Gtid_state_to_string, "Gtid_state::to_string", 0},
......
......@@ -217,6 +217,7 @@ extern ulonglong thd_startup_options;
extern my_thread_id global_thread_id;
extern ulong binlog_cache_use, binlog_cache_disk_use;
extern ulong binlog_stmt_cache_use, binlog_stmt_cache_disk_use;
extern ulong binlog_gtid_index_hit, binlog_gtid_index_miss;
extern ulong aborted_threads, aborted_connects, aborted_connects_preauth;
extern ulong delayed_insert_timeout;
extern ulong delayed_insert_limit, delayed_queue_size;
......@@ -249,6 +250,9 @@ extern ulonglong slave_max_statement_time;
extern double slave_max_statement_time_double;
extern ulong opt_binlog_rows_event_max_size;
extern ulong binlog_row_metadata;
extern my_bool opt_binlog_gtid_index;
extern uint opt_binlog_gtid_index_page_size;
extern uint opt_binlog_gtid_index_span_min;
extern ulong thread_cache_size;
extern ulong stored_program_cache_size;
extern ulong opt_slave_parallel_threads;
......@@ -333,7 +337,7 @@ extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
key_LOCK_rpl_status, key_LOCK_server_started,
key_LOCK_status, key_LOCK_optimizer_costs,
key_LOCK_thd_data, key_LOCK_thd_kill,
key_LOCK_user_conn, key_LOG_LOCK_log,
key_LOCK_user_conn, key_LOG_LOCK_log, key_gtid_index_lock,
key_master_info_data_lock, key_master_info_run_lock,
key_master_info_sleep_lock, key_master_info_start_stop_lock,
key_master_info_start_alter_lock,
......@@ -411,7 +415,7 @@ extern PSI_file_key key_file_relaylog, key_file_relaylog_index,
key_file_relaylog_cache, key_file_relaylog_index_cache;
extern PSI_socket_key key_socket_tcpip, key_socket_unix,
key_socket_client_connection;
extern PSI_file_key key_file_binlog_state;
extern PSI_file_key key_file_binlog_state, key_file_gtid_index;
#ifdef HAVE_PSI_INTERFACE
void init_server_psi_keys();
......@@ -456,6 +460,7 @@ extern PSI_memory_key key_memory_user_var_entry_value;
extern PSI_memory_key key_memory_Slave_job_group_group_relay_log_name;
extern PSI_memory_key key_memory_Relay_log_info_group_relay_log_name;
extern PSI_memory_key key_memory_binlog_cache_mngr;
extern PSI_memory_key key_memory_binlog_gtid_index;
extern PSI_memory_key key_memory_Row_data_memory_memory;
extern PSI_memory_key key_memory_errmsgs;
extern PSI_memory_key key_memory_Event_queue_element_for_exec_names;
......
......@@ -376,6 +376,15 @@ constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_ROW_METADATA=
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_LEGACY_EVENT_POS=
SUPER_ACL | BINLOG_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX=
BINLOG_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_PAGE_SIZE=
BINLOG_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MIN=
BINLOG_ADMIN_ACL;
constexpr privilege_t PRIV_SET_SYSTEM_GLOBAL_VAR_EXPIRE_LOGS_DAYS=
BINLOG_ADMIN_ACL;
......
......@@ -1542,19 +1542,18 @@ rpl_slave_state::alloc_gtid_pos_table(LEX_CSTRING *table_name, void *hton,
}
void rpl_binlog_state::init()
void
rpl_binlog_state_base::init()
{
my_hash_init(PSI_INSTRUMENT_ME, &hash, &my_charset_bin, 32,
offsetof(element, domain_id), sizeof(element::domain_id),
NULL, my_free, HASH_UNIQUE);
my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0));
mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state,
MY_MUTEX_INIT_SLOW);
initialized= 1;
}
void
rpl_binlog_state::reset_nolock()
rpl_binlog_state_base::reset_nolock()
{
uint32 i;
......@@ -1565,72 +1564,67 @@ rpl_binlog_state::reset_nolock()
void
rpl_binlog_state::reset()
{
mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
mysql_mutex_unlock(&LOCK_binlog_state);
}
void rpl_binlog_state::free()
rpl_binlog_state_base::free()
{
if (initialized)
{
initialized= 0;
reset_nolock();
my_hash_free(&hash);
delete_dynamic(&gtid_sort_array);
mysql_mutex_destroy(&LOCK_binlog_state);
}
}
rpl_binlog_state_base::~rpl_binlog_state_base()
{
free();
}
bool
rpl_binlog_state::load(struct rpl_gtid *list, uint32 count)
rpl_binlog_state_base::load_nolock(struct rpl_gtid *list, uint32 count)
{
uint32 i;
bool res= false;
mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
for (i= 0; i < count; ++i)
{
if (update_nolock(&(list[i]), false))
if (update_nolock(&(list[i])))
{
res= true;
break;
}
}
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data)
{
rpl_binlog_state *self= (rpl_binlog_state *)data;
return self->update_nolock(gtid, false);
}
bool
rpl_binlog_state::load(rpl_slave_state *slave_pos)
rpl_binlog_state_base::load_nolock(rpl_binlog_state_base *orig_state)
{
bool res= false;
ulong i, j;
HASH *h1= &orig_state->hash;
mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
if (slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0, false))
res= true;
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
for (i= 0; i < h1->records; ++i)
{
element *e= (element *)my_hash_element(h1, i);
HASH *h2= &e->hash;
const rpl_gtid *last_gtid= e->last_gtid;
for (j= 0; j < h2->records; ++j)
{
const rpl_gtid *gtid= (const rpl_gtid *)my_hash_element(h2, j);
if (gtid == last_gtid)
continue;
if (update_nolock(gtid))
return true;
}
if (likely(last_gtid) && update_nolock(last_gtid))
return true;
}
rpl_binlog_state::~rpl_binlog_state()
{
free();
return false;
}
......@@ -1640,10 +1634,13 @@ rpl_binlog_state::~rpl_binlog_state()
If the (domain_id, server_id) pair already exists, then the new GTID replaces
the old one for that domain id. Else a new entry is inserted.
Note that rpl_binlog_state_base::update_nolock() does not call my_error()
for out-of-memory, caller must do that if needed (eg. ER_OUT_OF_RESOURCES).
Returns 0 for ok, 1 for error.
*/
int
rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
rpl_binlog_state_base::update_nolock(const struct rpl_gtid *gtid)
{
element *elem;
......@@ -1651,13 +1648,6 @@ rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
(const uchar *)(&gtid->domain_id),
sizeof(gtid->domain_id))))
{
if (strict && elem->last_gtid && elem->last_gtid->seq_no >= gtid->seq_no)
{
my_error(ER_GTID_STRICT_OUT_OF_ORDER, MYF(0), gtid->domain_id,
gtid->server_id, gtid->seq_no, elem->last_gtid->domain_id,
elem->last_gtid->server_id, elem->last_gtid->seq_no);
return 1;
}
if (elem->seq_no_counter < gtid->seq_no)
elem->seq_no_counter= gtid->seq_no;
if (!elem->update_element(gtid))
......@@ -1666,17 +1656,267 @@ rpl_binlog_state::update_nolock(const struct rpl_gtid *gtid, bool strict)
else if (!alloc_element_nolock(gtid))
return 0;
my_error(ER_OUT_OF_RESOURCES, MYF(0));
return 1;
}
int
rpl_binlog_state_base::alloc_element_nolock(const rpl_gtid *gtid)
{
element *elem;
rpl_gtid *lookup_gtid;
/* First time we see this domain_id; allocate a new element. */
elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(0));
lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid),
MYF(0));
if (elem && lookup_gtid)
{
elem->domain_id= gtid->domain_id;
my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32,
offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id),
NULL, my_free, HASH_UNIQUE);
elem->last_gtid= lookup_gtid;
elem->seq_no_counter= gtid->seq_no;
memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid));
if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid))
{
lookup_gtid= NULL; /* Do not free. */
if (0 == my_hash_insert(&hash, (const uchar *)elem))
return 0;
}
my_hash_free(&elem->hash);
}
/* An error. */
if (elem)
my_free(elem);
if (lookup_gtid)
my_free(lookup_gtid);
return 1;
}
uint32
rpl_binlog_state_base::count_nolock()
{
uint32 c= 0;
uint32 i;
for (i= 0; i < hash.records; ++i)
c+= ((element *)my_hash_element(&hash, i))->hash.records;
return c;
}
int
rpl_binlog_state_base::get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size)
{
uint32 i, j, pos;
pos= 0;
for (i= 0; i < hash.records; ++i)
{
element *e= (element *)my_hash_element(&hash, i);
if (!e->last_gtid)
{
DBUG_ASSERT(e->hash.records==0);
continue;
}
for (j= 0; j <= e->hash.records; ++j)
{
const rpl_gtid *gtid;
if (j < e->hash.records)
{
gtid= (rpl_gtid *)my_hash_element(&e->hash, j);
if (gtid == e->last_gtid)
continue;
}
else
gtid= e->last_gtid;
if (pos >= list_size)
return 1;
memcpy(&gtid_list[pos++], gtid, sizeof(*gtid));
}
}
return 0;
}
rpl_gtid *
rpl_binlog_state_base::find_nolock(uint32 domain_id, uint32 server_id)
{
element *elem;
if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id,
sizeof(domain_id))))
return NULL;
return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id,
sizeof(server_id));
}
/*
Return true if this binlog state is before the position specified by the
passed-in slave_connection_state, false otherwise.
Note that if the GTID D-S-N is the last GTID added to the state in the
domain D, then the state is considered to come before the position D-S-N
within domain D.
*/
bool
rpl_binlog_state_base::is_before_pos(slave_connection_state *pos)
{
/*
First check each GTID in the slave position, if it comes after what is
in the state.
*/
for (uint32 i= 0; i < pos->hash.records; ++i)
{
const slave_connection_state::entry *e=
(const slave_connection_state::entry *)my_hash_element(&pos->hash, i);
/*
IF we have an entry with the same (domain_id, server_id),
AND either
( we are ahead in that server_id
OR we are identical, but there's some other server_id after)
THEN that position lies before our state.
*/
element *elem;
if ((elem= (element *)my_hash_search(&hash,
(const uchar *)&e->gtid.domain_id,
sizeof(e->gtid.domain_id))))
{
const rpl_gtid *g= (rpl_gtid *)
my_hash_search(&elem->hash, (const uchar *)&e->gtid.server_id,
sizeof(e->gtid.server_id));
if (g != nullptr &&
( g->seq_no > e->gtid.seq_no ||
( g->seq_no == e->gtid.seq_no && g != elem->last_gtid) ))
return false;
}
}
/*
Then check the state, if there are any domains present that are missing
from the position.
*/
for (uint32 i= 0; i < hash.records; ++i)
{
const element *elem= (const element *) my_hash_element(&hash, i);
if (likely(elem->hash.records > 0) &&
!pos->find(elem->domain_id))
return false;
}
/* Nothing in our state lies after anything in the position. */
return true;
}
void rpl_binlog_state::init()
{
rpl_binlog_state_base::init();
my_init_dynamic_array(PSI_INSTRUMENT_ME, &gtid_sort_array, sizeof(rpl_gtid), 8, 8, MYF(0));
mysql_mutex_init(key_LOCK_binlog_state, &LOCK_binlog_state,
MY_MUTEX_INIT_SLOW);
}
void
rpl_binlog_state::reset()
{
mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
mysql_mutex_unlock(&LOCK_binlog_state);
}
void rpl_binlog_state::free()
{
if (initialized)
{
rpl_binlog_state_base::free();
delete_dynamic(&gtid_sort_array);
mysql_mutex_destroy(&LOCK_binlog_state);
}
}
rpl_binlog_state::~rpl_binlog_state()
{
free();
}
bool
rpl_binlog_state::load(struct rpl_gtid *list, uint32 count)
{
mysql_mutex_lock(&LOCK_binlog_state);
bool res= load_nolock(list, count);
mysql_mutex_unlock(&LOCK_binlog_state);
if (res)
my_error(ER_OUT_OF_RESOURCES, MYF(0));
return res;
}
static int rpl_binlog_state_load_cb(rpl_gtid *gtid, void *data)
{
rpl_binlog_state *self= (rpl_binlog_state *)data;
return self->update_nolock(gtid);
}
bool
rpl_binlog_state::load(rpl_slave_state *slave_pos)
{
bool res= false;
mysql_mutex_lock(&LOCK_binlog_state);
reset_nolock();
if (slave_pos->iterate(rpl_binlog_state_load_cb, this, NULL, 0, false))
{
my_error(ER_OUT_OF_RESOURCES, MYF(0));
res= true;
}
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
int
rpl_binlog_state::update(const struct rpl_gtid *gtid, bool strict)
{
int res;
int res= 0;
element *elem;
mysql_mutex_lock(&LOCK_binlog_state);
res= update_nolock(gtid, strict);
if ((elem= (element *)my_hash_search(&hash,
(const uchar *)(&gtid->domain_id),
sizeof(gtid->domain_id))))
{
if (strict && elem->last_gtid && elem->last_gtid->seq_no >= gtid->seq_no)
{
my_error(ER_GTID_STRICT_OUT_OF_ORDER, MYF(0), gtid->domain_id,
gtid->server_id, gtid->seq_no, elem->last_gtid->domain_id,
elem->last_gtid->server_id, elem->last_gtid->seq_no);
res= 1;
}
else
{
if (elem->seq_no_counter < gtid->seq_no)
elem->seq_no_counter= gtid->seq_no;
if (elem->update_element(gtid))
res= 1;
}
}
else if (alloc_element_nolock(gtid))
{
my_error(ER_OUT_OF_RESOURCES, MYF(0));
res= 1;
}
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
......@@ -1762,43 +2002,6 @@ rpl_binlog_state::element::update_element(const rpl_gtid *gtid)
}
int
rpl_binlog_state::alloc_element_nolock(const rpl_gtid *gtid)
{
element *elem;
rpl_gtid *lookup_gtid;
/* First time we see this domain_id; allocate a new element. */
elem= (element *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*elem), MYF(MY_WME));
lookup_gtid= (rpl_gtid *)my_malloc(PSI_INSTRUMENT_ME, sizeof(*lookup_gtid),
MYF(MY_WME));
if (elem && lookup_gtid)
{
elem->domain_id= gtid->domain_id;
my_hash_init(PSI_INSTRUMENT_ME, &elem->hash, &my_charset_bin, 32,
offsetof(rpl_gtid, server_id), sizeof(rpl_gtid::domain_id),
NULL, my_free, HASH_UNIQUE);
elem->last_gtid= lookup_gtid;
elem->seq_no_counter= gtid->seq_no;
memcpy(lookup_gtid, gtid, sizeof(*lookup_gtid));
if (0 == my_hash_insert(&elem->hash, (const uchar *)lookup_gtid))
{
lookup_gtid= NULL; /* Do not free. */
if (0 == my_hash_insert(&hash, (const uchar *)elem))
return 0;
}
my_hash_free(&elem->hash);
}
/* An error. */
if (elem)
my_free(elem);
if (lookup_gtid)
my_free(lookup_gtid);
return 1;
}
/*
Check that a new GTID can be logged without creating an out-of-order
sequence number with existing GTIDs.
......@@ -1950,7 +2153,7 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src)
p= buf;
end= buf + len;
if (gtid_parser_helper(&p, end, &gtid) ||
update_nolock(&gtid, false))
update_nolock(&gtid))
{
res= 1;
break;
......@@ -1961,17 +2164,6 @@ rpl_binlog_state::read_from_iocache(IO_CACHE *src)
}
rpl_gtid *
rpl_binlog_state::find_nolock(uint32 domain_id, uint32 server_id)
{
element *elem;
if (!(elem= (element *)my_hash_search(&hash, (const uchar *)&domain_id,
sizeof(domain_id))))
return NULL;
return (rpl_gtid *)my_hash_search(&elem->hash, (const uchar *)&server_id,
sizeof(server_id));
}
rpl_gtid *
rpl_binlog_state::find(uint32 domain_id, uint32 server_id)
{
......@@ -2002,12 +2194,8 @@ rpl_binlog_state::find_most_recent(uint32 domain_id)
uint32
rpl_binlog_state::count()
{
uint32 c= 0;
uint32 i;
mysql_mutex_lock(&LOCK_binlog_state);
for (i= 0; i < hash.records; ++i)
c+= ((element *)my_hash_element(&hash, i))->hash.records;
uint32 c= count_nolock();
mysql_mutex_unlock(&LOCK_binlog_state);
return c;
......@@ -2017,41 +2205,8 @@ rpl_binlog_state::count()
int
rpl_binlog_state::get_gtid_list(rpl_gtid *gtid_list, uint32 list_size)
{
uint32 i, j, pos;
int res= 0;
mysql_mutex_lock(&LOCK_binlog_state);
pos= 0;
for (i= 0; i < hash.records; ++i)
{
element *e= (element *)my_hash_element(&hash, i);
if (!e->last_gtid)
{
DBUG_ASSERT(e->hash.records==0);
continue;
}
for (j= 0; j <= e->hash.records; ++j)
{
const rpl_gtid *gtid;
if (j < e->hash.records)
{
gtid= (rpl_gtid *)my_hash_element(&e->hash, j);
if (gtid == e->last_gtid)
continue;
}
else
gtid= e->last_gtid;
if (pos >= list_size)
{
res= 1;
goto end;
}
memcpy(&gtid_list[pos++], gtid, sizeof(*gtid));
}
}
end:
int res= get_gtid_list_nolock(gtid_list, list_size);
mysql_mutex_unlock(&LOCK_binlog_state);
return res;
}
......
......@@ -26,6 +26,11 @@
extern const LEX_CSTRING rpl_gtid_slave_state_table_name;
class String;
#ifdef MYSQL_SERVER
struct TABLE;
#endif
struct slave_connection_state;
#define PARAM_GTID(G) G.domain_id, G.server_id, G.seq_no
#define GTID_MAX_STR_LENGTH (10+1+10+1+20)
......@@ -296,8 +301,13 @@ struct rpl_slave_state
to know where to start when a master is changed to a slave. As a side
effect, it also allows to skip a hash lookup in the very common case of
logging a new GTID with same server id as last GTID.
The base class rpl_binlog_state_base contains just be basic data operations
to insert/update GTIDs, and is used eg. from Gtid_index_*. The main class
rpl_binlog_state builds server logic on top of that like mutex locking,
gtid_strict_mode handling, etc.
*/
struct rpl_binlog_state
struct rpl_binlog_state_base
{
struct element {
uint32 domain_id;
......@@ -309,29 +319,45 @@ struct rpl_binlog_state
int update_element(const rpl_gtid *gtid);
};
/* Mapping from domain_id to collection of elements. */
HASH hash;
my_bool initialized;
rpl_binlog_state_base() : initialized(0) {}
~rpl_binlog_state_base();
void init();
void reset_nolock();
void free();
bool load_nolock(struct rpl_gtid *list, uint32 count);
bool load_nolock(rpl_binlog_state_base *orig_state);
int update_nolock(const struct rpl_gtid *gtid);
int alloc_element_nolock(const rpl_gtid *gtid);
uint32 count_nolock();
int get_gtid_list_nolock(rpl_gtid *gtid_list, uint32 list_size);
rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id);
bool is_before_pos(slave_connection_state *pos);
};
struct rpl_binlog_state : public rpl_binlog_state_base
{
/* Mutex protecting access to the state. */
mysql_mutex_t LOCK_binlog_state;
my_bool initialized;
/* Auxiliary buffer to sort gtid list. */
DYNAMIC_ARRAY gtid_sort_array;
rpl_binlog_state() :initialized(0) {}
rpl_binlog_state() {}
~rpl_binlog_state();
void init();
void reset_nolock();
void reset();
void free();
bool load(struct rpl_gtid *list, uint32 count);
bool load(rpl_slave_state *slave_pos);
int update_nolock(const struct rpl_gtid *gtid, bool strict);
int update(const struct rpl_gtid *gtid, bool strict);
int update_with_next_gtid(uint32 domain_id, uint32 server_id,
rpl_gtid *gtid);
int alloc_element_nolock(const rpl_gtid *gtid);
bool check_strict_sequence(uint32 domain_id, uint32 server_id, uint64 seq_no,
bool no_error= false);
int bump_seq_no_if_needed(uint32 domain_id, uint64 seq_no);
......@@ -342,7 +368,6 @@ struct rpl_binlog_state
int get_most_recent_gtid_list(rpl_gtid **list, uint32 *size);
bool append_pos(String *str);
bool append_state(String *str);
rpl_gtid *find_nolock(uint32 domain_id, uint32 server_id);
rpl_gtid *find(uint32 domain_id, uint32 server_id);
rpl_gtid *find_most_recent(uint32 domain_id);
const char* drop_domain(DYNAMIC_ARRAY *ids, Gtid_list_log_event *glev, char*);
......
......@@ -1547,7 +1547,7 @@ Relay_log_info::update_relay_log_state(rpl_gtid *gtid_list, uint32 count)
int res= 0;
while (count)
{
if (relay_log_state.update_nolock(gtid_list, false))
if (relay_log_state.update_nolock(gtid_list))
res= 1;
++gtid_list;
--count;
......
......@@ -31,6 +31,7 @@
#include "semisync_master.h"
#include "semisync_slave.h"
#include "mysys_err.h"
#include "gtid_index.h"
enum enum_gtid_until_state {
......@@ -1286,6 +1287,100 @@ check_slave_start_position(binlog_send_info *info, const char **errormsg,
return err;
}
/*
Helper function for gtid_find_binlog_pos() below.
Check a binlog file against a slave position. Use a GTID index if present.
Returns:
0 This is the binlog file that contains the position. If *out_start_seek
is non-zero, it is the offset found in the GTID index at which to start
scanning the binlog file for events to send to the slave.
1 This binlog file is too new to contain the given slave position.
-1 Error, *out_errormsg contains error string.
The *out_glev event must be deleted by the caller if set non-null.
*/
static int
gtid_check_binlog_file(slave_connection_state *state,
Gtid_index_reader_hot *reader,
const binlog_file_entry *list,
bool *found_in_index, uint32 *out_start_seek,
uint32 *found_count,
char *out_name, Gtid_list_log_event **out_glev,
const char **out_errormsg)
{
Gtid_list_log_event *glev= nullptr;
char buf[FN_REFLEN];
File file;
IO_CACHE cache;
int res= -1;
*found_in_index= false;
*out_glev= nullptr;
*out_errormsg= nullptr;
/*
Try to lookup the GTID position in the gtid index.
If that doesn't work, read the Gtid_list_log_event at the start of the
binlog file to get the binlog state.
*/
if (normalize_binlog_name(buf, list->name.str, false))
{
*out_errormsg= "Failed to determine binlog file name while looking for "
"GTID position in binlog";
goto end;
}
if (likely(reader && !reader->open_index_file(buf)))
{
int lookup= reader->search_gtid_pos(state, out_start_seek, found_count);
reader->close_index_file();
if (lookup >= 0)
{
statistic_increment(binlog_gtid_index_hit, &LOCK_status);
if (lookup == 0)
res= 1;
else
{
strmake(out_name, buf, FN_REFLEN);
*found_in_index= true;
res= 0;
}
goto end;
}
/*
Error in the index lookup; fall back to reading the GTID_LIST event from
the binlog file and scan it from the beginning.
*/
}
statistic_increment(binlog_gtid_index_miss, &LOCK_status);
bzero((char*) &cache, sizeof(cache));
if (unlikely((file= open_binlog(&cache, buf, out_errormsg)) == (File)-1))
goto end;
*out_errormsg= get_gtid_list_event(&cache, &glev);
end_io_cache(&cache);
mysql_file_close(file, MYF(MY_WME));
if (unlikely(*out_errormsg))
goto end;
if (!glev || contains_all_slave_gtid(state, glev))
{
strmake(out_name, buf, FN_REFLEN);
*out_glev= glev;
*out_errormsg= nullptr;
res= 0;
}
else
{
delete glev;
res= 1;
}
end:
return res;
}
/*
Find the name of the binlog file to start reading for a slave that connects
using GTID state.
......@@ -1314,14 +1409,17 @@ check_slave_start_position(binlog_send_info *info, const char **errormsg,
the requested GTID that was already purged.
*/
static const char *
gtid_find_binlog_file(slave_connection_state *state, char *out_name,
slave_connection_state *until_gtid_state)
gtid_find_binlog_pos(slave_connection_state *state, char *out_name,
slave_connection_state *until_gtid_state,
rpl_binlog_state *until_binlog_state,
bool *found_in_index, uint32 *out_start_seek)
{
MEM_ROOT memroot;
binlog_file_entry *list;
Gtid_list_log_event *glev= NULL;
const char *errormsg= NULL;
char buf[FN_REFLEN];
Gtid_index_reader_hot *reader= NULL;
*found_in_index= false;
init_alloc_root(PSI_INSTRUMENT_ME, &memroot,
10*(FN_REFLEN+sizeof(binlog_file_entry)), 0,
......@@ -1332,48 +1430,41 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
if (opt_binlog_gtid_index)
reader= new Gtid_index_reader_hot();
while (list)
{
File file;
IO_CACHE cache;
uint32 found_count;
int res= gtid_check_binlog_file(state, reader, list, found_in_index,
out_start_seek, &found_count,
out_name, &glev, &errormsg);
if (res < 0)
goto end;
if (res == 0)
{
if (*found_in_index || glev)
{
uint32 i;
uint32 count;
rpl_gtid *gtids;
if (!list->next)
if (*found_in_index)
{
count= found_count;
gtids= reader->search_gtid_list();
/*
It should be safe to read the currently used binlog, as we will only
read the header part that is already written.
But if that does not work on windows, then we will need to cache the
event somewhere in memory I suppose - that could work too.
Load the initial GTID state corresponding to the position found in
the GTID index, as we will not have a GTID_LIST event to load it
from.
*/
until_binlog_state->load(gtids, count);
}
/*
Read the Gtid_list_log_event at the start of the binlog file to
get the binlog state.
*/
if (normalize_binlog_name(buf, list->name.str, false))
else
{
errormsg= "Failed to determine binlog file name while looking for "
"GTID position in binlog";
goto end;
count= glev->count;
gtids= glev->list;
}
bzero((char*) &cache, sizeof(cache));
if (unlikely((file= open_binlog(&cache, buf, &errormsg)) == (File)-1))
goto end;
errormsg= get_gtid_list_event(&cache, &glev);
end_io_cache(&cache);
mysql_file_close(file, MYF(MY_WME));
if (unlikely(errormsg))
goto end;
if (!glev || contains_all_slave_gtid(state, glev))
{
strmake(out_name, buf, FN_REFLEN);
if (glev)
{
uint32 i;
/*
As a special case, we allow to start from binlog file N if the
requested GTID is the last event (in the corresponding domain) in
......@@ -1385,9 +1476,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
from the UNTIL hash, to mark that such domains have already reached
their UNTIL condition.
*/
for (i= 0; i < glev->count; ++i)
for (i= 0; i < count; ++i)
{
const rpl_gtid *gtid= state->find(glev->list[i].domain_id);
const rpl_gtid *gtid= state->find(gtids[i].domain_id);
if (!gtid)
{
/*
......@@ -1400,8 +1491,8 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
further GTIDs in the Gtid_list.
*/
DBUG_ASSERT(0);
} else if (gtid->server_id == glev->list[i].server_id &&
gtid->seq_no == glev->list[i].seq_no)
} else if (gtid->server_id == gtids[i].server_id &&
gtid->seq_no == gtids[i].seq_no)
{
/*
The slave requested to start from the very beginning of this
......@@ -1412,9 +1503,9 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
}
if (until_gtid_state &&
(gtid= until_gtid_state->find(glev->list[i].domain_id)) &&
gtid->server_id == glev->list[i].server_id &&
gtid->seq_no <= glev->list[i].seq_no)
(gtid= until_gtid_state->find(gtids[i].domain_id)) &&
gtid->server_id == gtids[i].server_id &&
gtid->seq_no <= gtids[i].seq_no)
{
/*
We've already reached the stop position in UNTIL for this domain,
......@@ -1427,8 +1518,6 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
goto end;
}
delete glev;
glev= NULL;
list= list->next;
}
......@@ -1441,11 +1530,56 @@ gtid_find_binlog_file(slave_connection_state *state, char *out_name,
if (glev)
delete glev;
if (reader)
delete reader;
free_root(&memroot, MYF(0));
return errormsg;
}
static bool
gtid_index_lookup_pos(const char *name, uint32 offset, uint32 *out_start_seek,
slave_connection_state *out_gtid_state)
{
Gtid_index_reader_hot *reader= nullptr;
bool opened= false;
bool found= false;
uint32 found_offset, found_gtid_count;
rpl_gtid *found_gtids;
int res;
if (!(reader= new Gtid_index_reader_hot()) ||
reader->open_index_file(name))
{
statistic_increment(binlog_gtid_index_miss, &LOCK_status);
goto err;
}
opened= true;
res= reader->search_offset(offset, &found_offset, &found_gtid_count);
if (res <= 0)
{
statistic_increment(binlog_gtid_index_miss, &LOCK_status);
goto err;
}
statistic_increment(binlog_gtid_index_hit, &LOCK_status);
/* We found the position, initialize the state from the index. */
found_gtids= reader->search_gtid_list();
if (out_gtid_state->load(found_gtids, found_gtid_count))
goto err;
*out_start_seek= found_offset;
found= true;
err:
if (opened)
reader->close_index_file();
if (reader)
delete reader;
return found;
}
/*
Given an old-style binlog position with file name and file offset, find the
corresponding gtid position. If the offset is not at an event boundary, give
......@@ -1469,8 +1603,22 @@ gtid_state_from_pos(const char *name, uint32 offset,
int err;
String packet;
Format_description_log_event *fdev= NULL;
bool found_in_index;
uint32 UNINIT_VAR(start_seek);
bool seek_done= false;
if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
/*
Try to lookup the position in the binlog gtid index. If found (as it will
usually be unless the index is corrupted somehow), we can seek directly to
a point at or just before the desired location, saving an expensive scan
of the binlog file from the start.
*/
found_in_index= opt_binlog_gtid_index ?
gtid_index_lookup_pos(name, offset, &start_seek, gtid_state) :
false;
if (found_in_index)
found_gtid_list_event= true;
else if (unlikely(gtid_state->load((const rpl_gtid *)NULL, 0)))
{
errormsg= "Internal error (out of memory?) initializing slave state "
"while scanning binlog to find start position";
......@@ -1559,6 +1707,25 @@ gtid_state_from_pos(const char *name, uint32 offset,
errormsg= "Could not start decryption of binlog.";
goto end;
}
if (found_in_index && !seek_done)
{
/*
Just to avoid a redundant event read before hitting the next branch.
ToDo: share this code with the below somehow.
*/
my_b_seek(&cache, start_seek);
seek_done= true;
}
}
else if (found_in_index && !seek_done)
{
/*
After reading the format_description event and possibly
start_encryption, we can seek forward to avoid most or all of the scan
(depending on the sparseness of the index).
*/
my_b_seek(&cache, start_seek);
seek_done= true;
}
else if (unlikely(typ != FORMAT_DESCRIPTION_EVENT &&
!found_format_description_event))
......@@ -1570,7 +1737,7 @@ gtid_state_from_pos(const char *name, uint32 offset,
else if (typ == ROTATE_EVENT || typ == STOP_EVENT ||
typ == BINLOG_CHECKPOINT_EVENT)
continue; /* Continue looking */
else if (typ == GTID_LIST_EVENT)
else if (typ == GTID_LIST_EVENT && !found_in_index)
{
rpl_gtid *gtid_list;
bool status;
......@@ -1798,7 +1965,7 @@ send_event_to_slave(binlog_send_info *info, Log_event_type event_type,
}
});
if (info->until_binlog_state.update_nolock(&event_gtid, false))
if (info->until_binlog_state.update_nolock(&event_gtid))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return "Failed in internal GTID book-keeping: Out of memory";
......@@ -2198,6 +2365,8 @@ static int init_binlog_sender(binlog_send_info *info,
char search_file_name[FN_REFLEN];
const char *name=search_file_name;
bool found_in_index= false;
uint32 start_seek= 0;
if (info->using_gtid_state)
{
if (info->gtid_state.load(connect_gtid_state.ptr(),
......@@ -2223,17 +2392,27 @@ static int init_binlog_sender(binlog_send_info *info,
info->error= error;
return 1;
}
if ((info->errmsg= gtid_find_binlog_file(&info->gtid_state,
if ((info->errmsg= gtid_find_binlog_pos(&info->gtid_state,
search_file_name,
info->until_gtid_state)))
info->until_gtid_state,
&info->until_binlog_state,
&found_in_index, &start_seek)))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
return 1;
}
if (found_in_index)
{
/* Start from a position looked up in the binlog gtid index. */
*pos = start_seek;
}
else
{
/* start from beginning of binlog file */
*pos = 4;
}
}
else
{
if (log_ident[0])
......@@ -2865,6 +3044,7 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
ushort flags)
{
LOG_INFO linfo;
ulong ev_offset;
IO_CACHE log;
File file = -1;
......@@ -2990,6 +3170,34 @@ void mysql_binlog_send(THD* thd, char* log_ident, my_off_t pos,
if (info->until_gtid_state && info->until_gtid_state->count() == 0)
info->gtid_until_group= GTID_UNTIL_STOP_AFTER_STANDALONE;
if (info->using_gtid_state && pos > BIN_LOG_HEADER_SIZE &&
( info->gtid_state.is_pos_reached() ||
info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE ) )
{
/*
We are starting a GTID connect from a point not at the start of the
binlog file (from a GTID index lookup). Send a fake GTID_LIST event
in place of the real GTID_LIST that would normally be sent from the
start of the binlog file.
If we already reached the gtid UNTIL position, then set the
FLAG_UNTIL_REACHED in the GTID_LIST event and stop immediately.
*/
uint32 flag= 0;
if (info->gtid_until_group == GTID_UNTIL_STOP_AFTER_STANDALONE)
{
flag= Gtid_list_log_event::FLAG_UNTIL_REACHED;
info->should_stop= true;
}
Gtid_list_log_event glev(&info->until_binlog_state, flag);
if (reset_transmit_packet(info, info->flags, &ev_offset, &info->errmsg) ||
fake_gtid_list_event(info, &glev, &info->errmsg, (int32)pos))
{
info->error= ER_MASTER_FATAL_ERROR_READING_BINLOG;
goto err;
}
}
THD_STAGE_INFO(thd, stage_sending_binlog_event_to_slave);
if (send_one_binlog_file(info, &log, &linfo, pos))
break;
......
......@@ -6844,6 +6844,36 @@ Sys_binlog_row_metadata(
ON_UPDATE(NULL));
static Sys_var_on_access_global<Sys_var_mybool,
PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX>
Sys_binlog_gtid_index(
"binlog_gtid_index",
"Enable the creation of a GTID index for every binlog file, and the use "
"of such index for speeding up GTID lookup in the binlog.",
GLOBAL_VAR(opt_binlog_gtid_index), CMD_LINE(OPT_ARG),
DEFAULT(TRUE));
static Sys_var_on_access_global<Sys_var_uint,
PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_PAGE_SIZE>
Sys_binlog_gtid_index_page_size(
"binlog_gtid_index_page_size",
"Page size to use for the binlog GTID index.",
GLOBAL_VAR(opt_binlog_gtid_index_page_size), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(64, 1<<24), DEFAULT(4096), BLOCK_SIZE(1));
static Sys_var_on_access_global<Sys_var_uint,
PRIV_SET_SYSTEM_GLOBAL_VAR_BINLOG_GTID_INDEX_SPAN_MIN>
Sys_binlog_gtid_index_span_min(
"binlog_gtid_index_span_min",
"Control sparseness of the binlog GTID index. If set to N, at most one "
"index record will be added for every N bytes of binlog file written, "
"to reduce the size of the index. Normally does not need tuning.",
GLOBAL_VAR(opt_binlog_gtid_index_span_min), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(1, 1024*1024L*1024L), DEFAULT(65536), BLOCK_SIZE(1));
static bool check_pseudo_slave_mode(sys_var *self, THD *thd, set_var *var)
{
longlong previous_val= thd->variables.pseudo_slave_mode;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment