Commit 6ceb1a47 authored by unknown's avatar unknown

Merge bk-internal.mysql.com:/home/bk/mysql-maria

into  mysql.com:/home/my/mysql-maria


storage/maria/ha_maria.cc:
  Auto merged
storage/maria/ma_bitmap.c:
  Auto merged
storage/maria/ma_checkpoint.c:
  Auto merged
storage/maria/ma_close.c:
  Auto merged
storage/maria/ma_loghandler.c:
  Auto merged
storage/maria/ma_loghandler.h:
  Auto merged
storage/maria/ma_open.c:
  Auto merged
storage/maria/ma_pagecache.h:
  Auto merged
storage/maria/ma_write.c:
  Auto merged
storage/maria/maria_def.h:
  Auto merged
storage/maria/unittest/ma_pagecache_single.c:
  Auto merged
storage/maria/ma_blockrec.c:
  Manual merge
storage/maria/ma_page.c:
  Manual merge
storage/maria/ma_pagecache.c:
  Manual merge
storage/maria/ma_preload.c:
  Manual merge
storage/maria/ma_recovery.c:
  Manual merge
  Add _ma_unpin_all_pages() to all new UNDO redo_exec_hook's
parents 5d6ee443 644361af
# Maria help script.
# Cleans up all logs to give recovery a fresh start.
# API: none, just uses vardir, port and socket.
connection admin;
-- echo * shut down mysqld, removed logs, restarted it
append_file $MYSQLTEST_VARDIR/tmp/master0.expect;
wait-maria_empty_logs.inc
EOF
--exec $MYSQLADMIN --no-defaults -S $MASTER_MYSOCK -P $MASTER_MYPORT -u root --password= shutdown 2>&1;
remove_file $MYSQLTEST_VARDIR/master-data/maria_log_control;
remove_file $MYSQLTEST_VARDIR/master-data/maria_log.00000001;
-- error 0,1 # maybe there is just one log
remove_file $MYSQLTEST_VARDIR/master-data/maria_log.00000002;
# Hope there were not more than these logs.
-- error 0,1
remove_file $MYSQLTEST_VARDIR/master-data/maria_recovery.trace;
append_file $MYSQLTEST_VARDIR/tmp/master0.expect;
restart-maria_empty_logs.inc
EOF
--source include/wait_until_connected_again.inc
connection default;
# the effect of "use" is lost after a restart so we are back into db "test",
# because connection 'default' was created with db "test".
use mysqltest;
# Maria helper script
# Copies table' data and index file to other directory, or back, or compares.
# The other directory looks like a database directory, so that we can
# read copies from inside mysqld, that's also why we copy the frm.
# "mms" is a namespace for Maria_Make_Snapshot
# API:
# 1) set one of
# $mms_copy : to copy table from database to spare directory
# $mms_reverse : to copy it back
# $mms_compare_physically : to compare both byte-for-byte
# 2) set $mms_table_to_use to a number N: table will be mysqltest.tN
# 3) set $mms_purpose to say what this copy is for (influences the naming
# of the spare directory).
if ($mms_copy)
{
--echo * copied t$mms_table_to_use for $mms_purpose
copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD;
copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI;
copy_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.frm $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.frm;
}
if ($mms_reverse_copy)
{
# do not call this without flushing target table first!
--echo * copied t$mms_table_to_use back for $mms_purpose
-- error 0,1
remove_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD;
copy_file $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD;
-- error 0,1
remove_file $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI;
copy_file $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI;
}
if ($mms_compare_physically)
{
# After the UNDO phase this is normally impossible
# (UNDO execution has created new log records => pages have new LSNs).
# So, do this only when testing REDO phase.
# If UNDO phase, we nevertheless compare checksums
# (see maria_verify_recovery.inc).
--echo * compared t$mms_table_to_use to old version
diff_files $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAD $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAD;
# index file not yet recovered
# diff_files $MYSQLTEST_VARDIR/master-data/mysqltest/t$mms_table_to_use.MAI $MYSQLTEST_VARDIR/master-data/mysqltest_for_$mms_purpose/t$mms_table_to_use.MAI;
}
# Maria helper script
# Copies clean tables' data and index file to other directory
# Tables are t1...t[$mms_tables]
# They are later used as a reference to see if recovery works.
# API:
# set $mms_tables to N, the script will cover tables mysqltest.t1,...tN
connection admin;
let $mms_table_to_use=$mms_tables;
let $mms_purpose=comparison;
let $mms_copy=1;
--disable_query_log
--disable_warnings
eval drop database if exists mysqltest_for_$mms_purpose;
--enable_warnings
eval create database mysqltest_for_$mms_purpose;
--enable_query_log
while ($mms_table_to_use)
{
# to serve as a reference, table must be in a clean state
eval flush table t$mms_table_to_use;
-- source include/maria_make_snapshot.inc
dec $mms_table_to_use;
}
let $mms_copy=0;
connection default;
# Maria helper script
# Copies tables' data and index file to other directory, and control file.
# Tables are t1...t[$mms_tables].
# Later, mysqld is shutdown, and that snapshot is put back into the
# datadir, control file too ("flashing recovery's brain"), and recovery is let
# to run on it (see maria_verify_recovery.inc).
# API:
# set $mms_tables to N, the script will cover tables mysqltest.t1,...tN
connection admin;
let $mms_table_to_use=$mms_tables;
let $mms_purpose=feeding_recovery;
let $mms_copy=1;
--disable_query_log
--disable_warnings
eval drop database if exists mysqltest_for_$mms_purpose;
--enable_warnings
eval create database mysqltest_for_$mms_purpose;
--enable_query_log
while ($mms_table_to_use)
{
-- source include/maria_make_snapshot.inc
dec $mms_table_to_use;
}
let $mms_copy=0;
-- error 0,1
remove_file $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control;
copy_file $MYSQLTEST_VARDIR/master-data/maria_log_control $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control;
connection default;
# Maria helper script.
# Runs recovery, compare with expected table data.
# API:
# 1) set $mms_tables to N, the script will cover tables mysqltest.t1,...tN
# 2) set $mvr_debug_option to the crash way
# 3) set $mvr_restore_old_snapshot to 1 if you want recovery to run on
# an old copy of tables and of the control file, 0 for normal recovery.
# 4) set $mms_compare_physically to 1 if you want a physical byte-for-byte
# comparison with expected table. Checksum comparison is always done.
# "mvr" is a namespace for Maria_Verify_Recovery
connection admin;
# we may do a copy-back of tables before comparison, so save comparison
# request made by caller:
let $mms_compare_physically_save=$mms_compare_physically;
let $mms_compare_physically=0;
# warn mtr that mysqld is going to die and should not be restarted immediately
#append_file $MYSQLTEST_VARDIR/tmp/master0.expect;
#wait-maria_verify_recovery.inc
#EOF
# todo: remove this "system" and uncomment above when BUG#32296 is fixed
system echo wait-maria_verify_recovery.inc >> $MYSQLTEST_VARDIR/tmp/master0.expect;
# flush page cache and log, only log, or nothing, and kill mysqld with
# abort().
# When we restore an old snapshot, we could just kill mysqld nicely,
# but that would implicitely commit all work, which the tester may
# not want (tester may want to observe rollback happening).
eval SET SESSION debug=$mvr_debug_option;
--echo * crashing mysqld intentionally
--error 2013
set global maria_checkpoint_interval=1; # this will crash (DBUG magic)
if ($mvr_restore_old_snapshot)
{
# copy snapshot made by maria_make_snapshot_for_feeding_recovery back
# into datadir.
let $mms_table_to_use=$mms_tables;
let $mms_purpose=feeding_recovery;
let $mms_reverse_copy=1;
while ($mms_table_to_use)
{
-- source include/maria_make_snapshot.inc
dec $mms_table_to_use;
}
let $mms_reverse_copy=0;
# also copy back control file, to force recovery to start from an early
# point, ignoring further checkpoints.
-- error 0,1
remove_file $MYSQLTEST_VARDIR/master-data/maria_log_control;
copy_file $MYSQLTEST_VARDIR/tmp/mms_for_$mms_purpose.maria_log_control $MYSQLTEST_VARDIR/master-data/maria_log_control;
}
--echo * recovery happens
# let mtr restart mysqld (and thus execute the maria log)
#append_file $MYSQLTEST_VARDIR/tmp/master0.expect;
#restart-maria_verify_recovery.inc
#EOF
system echo restart-maria_verify_recovery.inc >> $MYSQLTEST_VARDIR/tmp/master0.expect;
--source include/wait_until_connected_again.inc
# Compare that tables of $mms_tables are identical to old.
# We always compare with CHECKSUM TABLE, and if requested (which makes sense
# only for testing the REDO phase, as UNDO phase generates new records so new
# LSNs on pages.) with a physical byte-for-byte comparison.
let $mms_table_to_use=$mms_tables;
let $mms_purpose=comparison;
let $mms_compare_physically=$mms_compare_physically_save;
while ($mms_table_to_use)
{
# Todo: remove this REPAIR when we have index recovery working.
# It is a quick repair, so that it will fail if data file is corrupted.
--echo * rebuilding index (until we have recovery of index)
eval repair table t$mms_table_to_use quick;
--echo * testing that checksum after recovery is as expected
let $new_checksum=`CHECKSUM TABLE t$mms_table_to_use`;
let $old_checksum=`CHECKSUM TABLE mysqltest_for_$mms_purpose.t$mms_table_to_use`;
# the $ text variables above are of the form "db.tablename\tchecksum",
# as db differs, we use substring().
eval select if(substring("$new_checksum",instr("$new_checksum",".t1")) = substring("$old_checksum",instr("$old_checksum",".t1")),"ok","failure");
# this script may compare physically or do nothing
-- source include/maria_make_snapshot.inc
dec $mms_table_to_use;
}
connection default;
# the effect of "use" is lost after a restart so we are back into db "test"
use mysqltest;
#
# Include this script to wait until the connection to the
# server has been restored or timeout occurs
# server has been restored or timeout occurs.
# You should have done --enable_reconnect first
--disable_result_log
--disable_query_log
let $counter= 500;
let $mysql_errno= 1;
while ($mysql_errno)
{
--error 0,2002,2006
--error 0,2002,2003,2006
show status;
dec $counter;
......
......@@ -886,15 +886,33 @@ sub check_expected_crash_and_restart($)
mtr_verbose("$mysqld->{'type'} $mysqld->{'idx'} exited, pid: $ret_pid");
$mysqld->{'pid'}= 0;
# Check if crash expected and restart if it was
# Check if crash expected, and restart if it was
my $expect_file= "$::opt_vardir/tmp/" . "$mysqld->{'type'}" .
"$mysqld->{'idx'}" . ".expect";
while ( 1 )
{
if ( -f $expect_file )
{
mtr_verbose("Crash was expected, file $expect_file exists");
my $expect_file_handler;
open($expect_file_handler, "<$expect_file") or die;
my @expect_lines= <$expect_file_handler>;
close $expect_file_handler;
# look at most recent order by the test
my $expect_content= pop @expect_lines;
chomp $expect_content;
if ( $expect_content =~ /^wait/ )
{
mtr_verbose("Test asks that we wait before restart");
# Millisceond sleep emulated with select
select(undef, undef, undef, (0.1));
next;
}
unlink($expect_file);
mysqld_start($mysqld, $mysqld->{'start_opts'},
$mysqld->{'start_slave_master_info'});
unlink($expect_file);
}
last;
}
return;
......@@ -914,8 +932,8 @@ sub check_expected_crash_and_restart($)
if ( -f $expect_file )
{
mtr_verbose("Crash was expected, file $expect_file exists");
ndbmgmd_start($cluster);
unlink($expect_file);
ndbmgmd_start($cluster);
}
return;
}
......@@ -933,9 +951,9 @@ sub check_expected_crash_and_restart($)
if ( -f $expect_file )
{
mtr_verbose("Crash was expected, file $expect_file exists");
unlink($expect_file);
ndbd_start($cluster, $ndbd->{'idx'},
$ndbd->{'start_extra_args'});
unlink($expect_file);
}
return;
}
......
drop database if exists mysqltest;
create database mysqltest;
use mysqltest;
* shut down mysqld, removed logs, restarted it
use mysqltest;
create table t1 (a varchar(1000)) engine=maria;
* TEST of REDO: see if recovery can reconstruct if we give it an old table
* copied t1 for feeding_recovery
insert into t1 values ("00000000");
flush table t1;
* copied t1 for comparison
SET SESSION debug="+d,maria_flush_whole_log,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* copied t1 back for feeding_recovery
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 488070860",instr("mysqltest.t1 488070860",".t1")) = substring("mysqltest_for_comparison.t1 488070860",instr("mysqltest_for_comparison.t1 488070860",".t1")),"ok","failure");
if(substring("mysqltest.t1 488070860",instr("mysqltest.t1 488070860",".t1")) = substring("mysqltest_for_comparison.t1 488070860",instr("mysqltest_for_comparison.t1 488070860",".t1")),"ok","failure")
ok
* compared t1 to old version
use mysqltest;
select * from t1;
a
00000000
* TEST of REDO+UNDO: normal recovery test (no moving tables under its feet)
insert into t1 values ("00000000");
flush table t1;
* copied t1 for comparison
lock tables t1 write;
insert into t1 values ("aaaaaaaaa");
SET SESSION debug="+d,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 976141720",instr("mysqltest.t1 976141720",".t1")) = substring("mysqltest_for_comparison.t1 976141720",instr("mysqltest_for_comparison.t1 976141720",".t1")),"ok","failure");
if(substring("mysqltest.t1 976141720",instr("mysqltest.t1 976141720",".t1")) = substring("mysqltest_for_comparison.t1 976141720",instr("mysqltest_for_comparison.t1 976141720",".t1")),"ok","failure")
ok
use mysqltest;
select * from t1;
a
00000000
00000000
insert into t1 values ("00000000");
flush table t1;
* copied t1 for comparison
lock tables t1 write;
insert into t1 values ("aaaaaaaaa");
SET SESSION debug="+d,maria_flush_whole_page_cache,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 1464212580",instr("mysqltest.t1 1464212580",".t1")) = substring("mysqltest_for_comparison.t1 1464212580",instr("mysqltest_for_comparison.t1 1464212580",".t1")),"ok","failure");
if(substring("mysqltest.t1 1464212580",instr("mysqltest.t1 1464212580",".t1")) = substring("mysqltest_for_comparison.t1 1464212580",instr("mysqltest_for_comparison.t1 1464212580",".t1")),"ok","failure")
ok
use mysqltest;
select * from t1;
a
00000000
00000000
00000000
insert into t1 values ("00000000");
flush table t1;
* copied t1 for comparison
lock tables t1 write;
insert into t1 values ("aaaaaaaaa");
SET SESSION debug="+d,maria_flush_states,maria_flush_whole_log,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 1952283440",instr("mysqltest.t1 1952283440",".t1")) = substring("mysqltest_for_comparison.t1 1952283440",instr("mysqltest_for_comparison.t1 1952283440",".t1")),"ok","failure");
if(substring("mysqltest.t1 1952283440",instr("mysqltest.t1 1952283440",".t1")) = substring("mysqltest_for_comparison.t1 1952283440",instr("mysqltest_for_comparison.t1 1952283440",".t1")),"ok","failure")
ok
use mysqltest;
select * from t1;
a
00000000
00000000
00000000
00000000
insert into t1 values ("00000000");
flush table t1;
* copied t1 for comparison
lock tables t1 write;
insert into t1 values ("aaaaaaaaa");
SET SESSION debug="+d,maria_flush_whole_log,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 2440354300",instr("mysqltest.t1 2440354300",".t1")) = substring("mysqltest_for_comparison.t1 2440354300",instr("mysqltest_for_comparison.t1 2440354300",".t1")),"ok","failure");
if(substring("mysqltest.t1 2440354300",instr("mysqltest.t1 2440354300",".t1")) = substring("mysqltest_for_comparison.t1 2440354300",instr("mysqltest_for_comparison.t1 2440354300",".t1")),"ok","failure")
ok
use mysqltest;
select * from t1;
a
00000000
00000000
00000000
00000000
00000000
drop table t1;
* shut down mysqld, removed logs, restarted it
use mysqltest;
CREATE TABLE t1 (
i int,
b blob default NULL,
c varchar(6000) default NULL
) ENGINE=MARIA CHECKSUM=1;
* copied t1 for feeding_recovery
INSERT INTO t1 VALUES (1, REPEAT('a', 5000), REPEAT('b', 5000));
UPDATE t1 SET i=3, b=CONCAT(b,'c') WHERE i=1;
SELECT LENGTH(b) FROM t1 WHERE i=3;
LENGTH(b)
5001
flush table t1;
* copied t1 for comparison
SET SESSION debug="+d,maria_flush_whole_log,maria_crash";
* crashing mysqld intentionally
set global maria_checkpoint_interval=1;
ERROR HY000: Lost connection to MySQL server during query
* copied t1 back for feeding_recovery
* recovery happens
* rebuilding index (until we have recovery of index)
repair table t1 quick;
Table Op Msg_type Msg_text
mysqltest.t1 repair status OK
* testing that checksum after recovery is as expected
select if(substring("mysqltest.t1 3472399915",instr("mysqltest.t1 3472399915",".t1")) = substring("mysqltest_for_comparison.t1 3472399915",instr("mysqltest_for_comparison.t1 3472399915",".t1")),"ok","failure");
if(substring("mysqltest.t1 3472399915",instr("mysqltest.t1 3472399915",".t1")) = substring("mysqltest_for_comparison.t1 3472399915",instr("mysqltest_for_comparison.t1 3472399915",".t1")),"ok","failure")
ok
use mysqltest;
SELECT LENGTH(b) FROM t1 WHERE i=3;
LENGTH(b)
5001
drop table t1;
drop database mysqltest_for_feeding_recovery;
drop database mysqltest_for_comparison;
drop database mysqltest;
--skip-stack-trace --skip-core-file
--source include/not_embedded.inc
# Don't test this under valgrind, memory leaks will occur as we crash
--source include/not_valgrind.inc
# Binary must be compiled with debug for crash to occur
--source include/have_debug.inc
--source include/have_maria.inc
--disable_warnings
drop database if exists mysqltest;
--enable_warnings
create database mysqltest;
# Include scripts can perform SQL. For it to not influence the main test
# they use a separate connection. This way if they use a DDL it would
# not autocommit in the main test.
connect (admin, 127.0.0.1, root,,mysqltest,,);
--enable_reconnect
connection default;
use mysqltest;
--enable_reconnect
# A sample test
-- source include/maria_empty_logs.inc
let $mms_tables=1;
create table t1 (a varchar(1000)) engine=maria;
--echo * TEST of REDO: see if recovery can reconstruct if we give it an old table
-- source include/maria_make_snapshot_for_feeding_recovery.inc
# Your committed statements here, which we expect to
# be reconstructed from the log
insert into t1 values ("00000000");
-- source include/maria_make_snapshot_for_comparison.inc
# we want recovery to run on the first snapshot made above
let $mvr_restore_old_snapshot=1;
# As we did only committed work, we test REDO applying, which could
# produce a physically identical table.
let $mms_compare_physically=1;
let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash";
# the script below will trigger recovery and compare checksums
-- source include/maria_verify_recovery.inc
let $mms_compare_physically=0;
# so a SELECT like this is pure visual effect, brings nothing.
select * from t1;
--echo * TEST of REDO+UNDO: normal recovery test (no moving tables under its feet)
# different types of crash => a loop; here are loop control variables
let $crash_no_flush=1;
let $crash_flush_whole_page_cache=0;
let $crash_flush_states=0;
let $crash_flush_whole_log=0;
let $crash_loop=1;
# we want recovery to use the tables as they were at time of crash
let $mvr_restore_old_snapshot=0;
# UNDO phase prevents physical comparison, normally,
# so we'll only use checksums to compare.
let $mms_compare_physically=0;
# Note that we don't remove logs between iterations. Test is
# cumulative (each new recovery processes more log records than the previous).
while ($crash_loop)
{
if ($crash_flush_whole_log)
{
let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash";
# set up what next iteration should do:
let $crash_flush_whole_log=0;
let $crash_loop=0;
}
if ($crash_flush_states)
{
let $mvr_debug_option="+d,maria_flush_states,maria_flush_whole_log,maria_crash";
let $crash_flush_states=0;
let $crash_flush_whole_log=1;
}
if ($crash_flush_whole_page_cache)
{
let $mvr_debug_option="+d,maria_flush_whole_page_cache,maria_crash";
let $crash_flush_whole_page_cache=0;
let $crash_flush_states=1;
}
if ($crash_no_flush)
{
let $mvr_debug_option="+d,maria_crash";
let $crash_no_flush=0;
let $crash_flush_whole_page_cache=1;
}
# Your committed statements here
insert into t1 values ("00000000");
-- source include/maria_make_snapshot_for_comparison.inc
# Your statements which we expect to be rolled back
lock tables t1 write;
insert into t1 values ("aaaaaaaaa");
-- source include/maria_verify_recovery.inc
select * from t1;
}
drop table t1;
# what did we compare above:
# - checksum: tells that the tables contain the same amount of rows
# and same data in rows
# - index: no, neither state nor pages were compared
# - bitmap pages: the REPAIR QUICK done above very probably checks
# that bitmap reflects page occupation; do we need to do physical
# compare?
# - page LSN: not compared; we should compare that page's LSN in new
# table is >= page's LSN in old table (it can be >, due to UNDO phase)
# we had a bug where new page's LSN was 0... todo.
#
# Test for this bug: an UPDATE purges and rewrites a tail page, and
# recovery applied the purge, stamped page with UNDO's LSN, thus
# the rewrite was ignored.
#
-- source include/maria_empty_logs.inc
let $mms_tables=1;
CREATE TABLE t1 (
i int,
b blob default NULL,
c varchar(6000) default NULL
) ENGINE=MARIA CHECKSUM=1;
-- source include/maria_make_snapshot_for_feeding_recovery.inc
INSERT INTO t1 VALUES (1, REPEAT('a', 5000), REPEAT('b', 5000));
UPDATE t1 SET i=3, b=CONCAT(b,'c') WHERE i=1;
SELECT LENGTH(b) FROM t1 WHERE i=3;
-- source include/maria_make_snapshot_for_comparison.inc
# we want recovery to run on the first snapshot made above
let $mvr_restore_old_snapshot=1;
let $mms_compare_physically=0;
let $mvr_debug_option="+d,maria_flush_whole_log,maria_crash";
-- source include/maria_verify_recovery.inc
SELECT LENGTH(b) FROM t1 WHERE i=3;
drop table t1;
# clean up everything
let $mms_purpose=feeding_recovery;
eval drop database mysqltest_for_$mms_purpose;
let $mms_purpose=comparison;
eval drop database mysqltest_for_$mms_purpose;
drop database mysqltest;
......@@ -1243,6 +1243,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
thd->proc_info= "Repair with keycache";
param.testflag &= ~T_REP_BY_SORT;
error= maria_repair(&param, file, fixed_name, param.testflag & T_QUICK);
/**
@todo RECOVERY BUG we do things with the index file
(maria_sort_index() after the above which already has logged the
record and bumped create_rename_lsn. Is it ok?
*/
}
param.testflag= testflag;
optimize_done= 1;
......@@ -1311,6 +1316,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
thd->proc_info= old_proc_info;
if (!thd->locked_tables)
{
/**
@todo RECOVERY BUG find why this is needed. Monty says it's because a
new non-transactional table is created by maria_repair(): find how this
new table's state influences the old one's.
*/
_ma_reenable_logging_for_table(file->s);
maria_lock_database(file, F_UNLCK);
}
......@@ -1991,20 +2001,6 @@ int ha_maria::external_lock(THD *thd, int lock_type)
goto skip_transaction;
if (lock_type != F_UNLCK)
{
if (!thd->transaction.on)
{
/*
No need to log REDOs/UNDOs. If this is an internal temporary table
which will be renamed to a permanent table (like in ALTER TABLE),
the rename happens after unlocking so will be durable (and the table
will get its create_rename_lsn).
Note: if we wanted to enable users to have an old backup and apply
tons of archived logs to roll-forward, we could then not disable
REDOs/UNDOs in this case.
*/
DBUG_PRINT("info", ("Disabling logging for table"));
_ma_tmp_disable_logging_for_table(file->s);
}
if (!trn) /* no transaction yet - open it now */
{
trn= trnman_new_trn(& thd->mysys_var->mutex,
......@@ -2025,6 +2021,20 @@ int ha_maria::external_lock(THD *thd, int lock_type)
trans_register_ha(thd, FALSE, maria_hton);
trnman_new_statement(trn);
}
if (!thd->transaction.on)
{
/*
No need to log REDOs/UNDOs. If this is an internal temporary table
which will be renamed to a permanent table (like in ALTER TABLE),
the rename happens after unlocking so will be durable (and the table
will get its create_rename_lsn).
Note: if we wanted to enable users to have an old backup and apply
tons of archived logs to roll-forward, we could then not disable
REDOs/UNDOs in this case.
*/
DBUG_PRINT("info", ("Disabling logging for table"));
_ma_tmp_disable_logging_for_table(file, TRUE);
}
}
else
{
......
......@@ -152,7 +152,8 @@ static inline my_bool write_changed_bitmap(MARIA_SHARE *share,
(uchar*) bitmap->map, PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY, 0));
PAGECACHE_WRITE_DELAY, 0,
LSN_IMPOSSIBLE));
}
/*
......@@ -571,6 +572,26 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
Inexistent or half-created page (could be crash in the middle of
_ma_bitmap_create_first(), before appending maria_bitmap_marker).
*/
/**
@todo RECOVERY BUG
We are updating data_file_length before writing any log record for the
row operation. What if now state is flushed by a checkpoint with the
new value, and crash before the checkpoint record is written, recovery
may not even open the table (no log records) so not fix
data_file_length ("WAL violation")?
Scenario: assume share->id==0, then:
thread 1 (here) thread 2 (checkpoint)
update data_file_length
copy state to memory, flush log
set share->id and write FILE_ID (not flushed)
see share->id!=0 so flush state
crash
FILE_ID will be missing, Recovery will not open table and not fix
data_file_length. This bug should be fixed with other "checkpoint vs
bitmap" bugs.
One possibility will be logging a standalone LOGREC_CREATE_BITMAP in a
separate transaction (using dummy_transaction_object).
*/
share->state.state.data_file_length= end_of_page;
bzero(bitmap->map, bitmap->block_size);
memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
......@@ -641,6 +662,12 @@ static my_bool _ma_change_bitmap_page(MARIA_HA *info,
if (bitmap->changed)
{
/**
@todo RECOVERY BUG this is going to flush the bitmap page possibly to
disk even though it could be over-allocated with not yet any REDO-UNDO
complete group (WAL violation: no way to undo the over-allocation if
crash). See also collect_tables().
*/
if (write_changed_bitmap(info->s, bitmap))
DBUG_RETURN(1);
bitmap->changed= 0;
......
......@@ -1433,7 +1433,15 @@ static my_bool write_tail(MARIA_HA *info,
/* Increase data file size, if extended */
position= (my_off_t) block->page * block_size;
if (info->state->data_file_length <= position)
{
/*
We are modifying a state member before writing the UNDO; this is a WAL
violation. But for data_file_length this is ok, as long as we change
data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
collect_tables()).
*/
info->state->data_file_length= position + block_size;
}
DBUG_ASSERT(share->pagecache->block_size == block_size);
if (!(res= pagecache_write(share->pagecache,
......@@ -1443,7 +1451,8 @@ static my_bool write_tail(MARIA_HA *info,
PAGECACHE_LOCK_READ,
block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
PAGECACHE_PIN,
PAGECACHE_WRITE_DELAY, &page_link.link)))
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE)))
{
page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
page_link.changed= 1;
......@@ -1547,7 +1556,7 @@ static my_bool write_full_pages(MARIA_HA *info,
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0))
0, LSN_IMPOSSIBLE))
DBUG_RETURN(1);
page++;
block->used= BLOCKUSED_USED;
......@@ -2351,7 +2360,8 @@ static my_bool write_block_record(MARIA_HA *info,
PAGECACHE_LOCK_READ,
head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
PAGECACHE_PIN,
PAGECACHE_WRITE_DELAY, &page_link.link))
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE))
goto disk_err;
page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
page_link.changed= 1;
......@@ -3172,7 +3182,8 @@ static my_bool delete_head_or_tail(MARIA_HA *info,
buff, share->page_type,
lock_at_write,
PAGECACHE_PIN_LEFT_PINNED,
PAGECACHE_WRITE_DELAY, &page_link.link))
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE))
DBUG_RETURN(1);
}
else /* page is now empty */
......@@ -3196,7 +3207,8 @@ static my_bool delete_head_or_tail(MARIA_HA *info,
buff, share->page_type,
lock_at_write,
PAGECACHE_PIN_LEFT_PINNED,
PAGECACHE_WRITE_DELAY, &page_link.link))
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE))
DBUG_RETURN(1);
DBUG_ASSERT(empty_space >= info->s->bitmap.sizes[0]);
......@@ -3257,6 +3269,7 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
ulonglong page;
uint record_number;
MARIA_SHARE *share= info->s;
LSN lsn= LSN_IMPOSSIBLE;
DBUG_ENTER("_ma_delete_block_record");
page= ma_recordpos_to_page(info->cur_row.lastpos);
......@@ -3273,7 +3286,6 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
if (share->now_transactional)
{
LSN lsn;
uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
DIRPOS_STORE_SIZE + HA_CHECKSUM_STORE_SIZE];
size_t row_length;
......@@ -3311,7 +3323,7 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
}
_ma_unpin_all_pages_and_finalize_row(info, info->trn->undo_lsn);
_ma_unpin_all_pages_and_finalize_row(info, lsn);
DBUG_RETURN(0);
err:
......@@ -5078,8 +5090,8 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
DBUG_ASSERT(rownr == 0);
if (rownr != 0)
goto err;
unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
unpin_method= PAGECACHE_PIN_LEFT_UNPINNED;
unlock_method= PAGECACHE_LOCK_WRITE;
unpin_method= PAGECACHE_PIN;
buff= info->keyread_buff;
info->keyread_buff_used= 1;
......@@ -5120,8 +5132,8 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
DBUG_RETURN(my_errno);
DBUG_RETURN(0);
}
unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
unpin_method= PAGECACHE_UNPIN;
unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
unpin_method= PAGECACHE_PIN_LEFT_PINNED;
if (((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
{
......@@ -5189,15 +5201,23 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
empty_space-= data_length;
int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
/* Write modified page */
lsn_store(buff, lsn);
/*
Write modified page. We don't update its LSN, and keep it pinned. When we
have processed all REDOs for this page in the current REDO's group, we
will stamp page with UNDO's LSN (if we stamped it now, a next REDO, in
this group, for this page, would be skipped) and unpin then.
*/
if (pagecache_write(share->pagecache,
&info->dfile, page, 0,
buff, PAGECACHE_PLAIN_PAGE,
unlock_method, unpin_method,
PAGECACHE_WRITE_DELAY, 0))
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE))
DBUG_RETURN(my_errno);
page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
push_dynamic(&info->pinned_pages, (void*) &page_link);
/* Fix bitmap */
if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
DBUG_RETURN(my_errno);
......@@ -5215,7 +5235,7 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
DBUG_RETURN(0);
err:
if (unlock_method == PAGECACHE_LOCK_WRITE_UNLOCK)
if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
pagecache_unlock_by_link(share->pagecache, page_link.link,
PAGECACHE_LOCK_WRITE_UNLOCK,
PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
......@@ -5302,18 +5322,23 @@ uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
goto err;
lsn_store(buff, lsn);
result= 0;
if (pagecache_write(share->pagecache,
&info->dfile, page, 0,
buff, PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN,
PAGECACHE_WRITE_DELAY, 0))
PAGECACHE_LOCK_LEFT_WRITELOCKED,
PAGECACHE_PIN_LEFT_PINNED,
PAGECACHE_WRITE_DELAY, 0,
LSN_IMPOSSIBLE))
result= my_errno;
else
{
page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
push_dynamic(&info->pinned_pages, (void*) &page_link);
/* This will work even if the page was marked as UNALLOCATED_PAGE */
if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
result= my_errno;
}
DBUG_RETURN(result);
......@@ -5360,7 +5385,12 @@ uint _ma_apply_redo_free_blocks(MARIA_HA *info,
start_page= page= page_korr(header);
header+= PAGE_STORE_SIZE;
page_range= pagerange_korr(header);
/* Page range may have this bit set to indicate a tail page */
page_range= pagerange_korr(header) & ~TAIL_BIT;
/** @todo RECOVERY BUG enable this assertion when newer tree pulled */
#if 0
DBUG_ASSERT(page_range > 0);
#endif
header+= PAGERANGE_STORE_SIZE;
DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range));
......
......@@ -176,7 +176,6 @@ static int really_execute_checkpoint(void)
LSN_IN_PARTS(checkpoint_start_log_horizon)));
lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
/*
STEP 2: fetch information about transactions.
We must fetch transactions before dirty pages. Indeed, a transaction
......@@ -346,6 +345,43 @@ int ma_checkpoint_init(ulong interval)
}
#ifndef DBUG_OFF
/**
Function used to test recovery: flush some table pieces and then caller
crashes.
@param what_to_flush 0: current bitmap and all data pages
1: state
*/
static void flush_all_tables(int what_to_flush)
{
LIST *pos; /**< to iterate over open tables */
pthread_mutex_lock(&THR_LOCK_maria);
for (pos= maria_open_list; pos; pos= pos->next)
{
MARIA_HA *info= (MARIA_HA*)pos->data;
if (info->s->now_transactional)
{
switch (what_to_flush)
{
case 0:
_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
FLUSH_KEEP, FLUSH_KEEP);
break;
case 1:
_ma_state_info_write(info->s, 1|4);
DBUG_PRINT("maria_flush_states",
("is_of_horizon: LSN (%lu,0x%lx)",
LSN_IN_PARTS(info->s->state.is_of_horizon)));
break;
}
}
}
pthread_mutex_unlock(&THR_LOCK_maria);
}
#endif
/**
@brief Destroys the checkpoint module
*/
......@@ -353,6 +389,32 @@ int ma_checkpoint_init(ulong interval)
void ma_checkpoint_end(void)
{
DBUG_ENTER("ma_checkpoint_end");
DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
{
DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
flush_all_tables(0);
});
DBUG_EXECUTE_IF("maria_flush_whole_log",
{
DBUG_PRINT("maria_flush_whole_log", ("now"));
translog_flush(translog_get_horizon());
});
/*
Note that for WAL reasons, maria_flush_states requires
maria_flush_whole_log.
*/
DBUG_EXECUTE_IF("maria_flush_states",
{
DBUG_PRINT("maria_flush_states", ("now"));
flush_all_tables(1);
});
DBUG_EXECUTE_IF("maria_crash",
{
DBUG_PRINT("maria_crash", ("now"));
fflush(DBUG_FILE);
abort();
});
if (checkpoint_inited)
{
pthread_mutex_lock(&LOCK_checkpoint);
......@@ -501,11 +563,6 @@ filter_flush_data_file_evenly(enum pagecache_page_type type,
@note MikaelR questioned why the same thread does two different jobs, the
risk could be that while a checkpoint happens no LRD flushing happens.
@note MikaelR noted that he observed that Linux's file cache may never
fsync to disk until this cache is full, at which point it decides to empty
the cache, making the machine very slow. A solution was to fsync after
writing 2 MB.
*/
pthread_handler_t ma_checkpoint_background(void *arg)
......@@ -622,6 +679,13 @@ pthread_handler_t ma_checkpoint_background(void *arg)
if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
break; /* and we will continue with the same file */
dfile++; /* otherwise all this file is flushed, move to next file */
/*
MikaelR noted that he observed that Linux's file cache may never
fsync to disk until this cache is full, at which point it decides
to empty the cache, making the machine very slow. A solution was
to fsync after writing 2 MB. So we might want to fsync() here if
we wrote enough pages.
*/
}
filter_param.is_data_file= FALSE;
while (kfile != kfiles_end)
......@@ -866,9 +930,32 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
*/
}
translog_unlock();
/**
We are going to flush these states.
Before, all records describing how to undo such state must be
in the log (WAL). Usually this means UNDOs. In the special case of
data|key_file_length, recovery just needs to open the table to fix the
length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
understand it must open a table, is enough; so as long as
data|key_file_length is updated after writing any log record it's ok:
if we copied new value above, it means the record was before
state_copies_horizon and we flush such record below.
Apart from data|key_file_length which are easily recoverable from the
real file's size, all other state members must be updated only when
writing the UNDO; otherwise, if updated before, if their new value is
flushed by a checkpoint and there is a crash before UNDO is written,
their REDO group will be missing or at least incomplete and skipped
by recovery, so bad state value will stay. For example, setting
key_root before writing the UNDO: the table would have old index
pages (they were pinned at time of crash) and a new, thus wrong,
key_root.
@todo RECOVERY BUG check that all code honours that.
*/
if (translog_flush(state_copies_horizon))
goto err;
/* now we have cached states and they are WAL-safe*/
state_copies_end= state_copy;
state_copy= state_copies;
/* so now we have cached states */
}
/* locate our state among these cached ones */
......@@ -979,6 +1066,12 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
each checkpoint if the table was once written and then not anymore.
*/
}
/**
@todo RECOVERY BUG this is going to flush the bitmap page possibly to
disk even though it could be over-allocated with not yet any
REDO-UNDO complete group (WAL violation: no way to undo the
over-allocation if crash); see also _ma_change_bitmap_page().
*/
sync_error|=
_ma_flush_bitmap(share); /* after that, all is in page cache */
DBUG_ASSERT(share->pagecache == maria_pagecache);
......
......@@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info)
may be using the file at this point
IF using --external-locking, which does not apply to Maria.
*/
if ((share->changed && share->base.born_transactional) ||
(share->mode != O_RDONLY && maria_is_crashed(info)))
if (share->mode != O_RDONLY &&
((share->changed && share->base.born_transactional) ||
maria_is_crashed(info)))
{
/*
State must be written to file as it was not done at table's
......
......@@ -552,6 +552,11 @@ static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID=
{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
"long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG=
{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
NULL, NULL, NULL, 0,
"incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
static void loghandler_init()
......@@ -627,12 +632,14 @@ static void loghandler_init()
INIT_LOGREC_FILE_ID;
log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]=
INIT_LOGREC_LONG_TRANSACTION_ID;
for (i= LOGREC_LONG_TRANSACTION_ID + 1;
log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]=
INIT_LOGREC_INCOMPLETE_LOG;
for (i= LOGREC_INCOMPLETE_LOG + 1;
i < LOGREC_NUMBER_OF_TYPES;
i++)
log_record_type_descriptor[i].class= LOGRECTYPE_NOT_ALLOWED;
DBUG_EXECUTE("info",
check_translog_description_table(LOGREC_LONG_TRANSACTION_ID););
check_translog_description_table(LOGREC_INCOMPLETE_LOG););
};
......@@ -2083,6 +2090,7 @@ static my_bool translog_buffer_flush(struct st_translog_buffer *buffer)
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED, 0,
LSN_IMPOSSIBLE,
&translog_page_validator, (uchar*) &data))
{
UNRECOVERABLE_ERROR(("Can't write page (%lu,0x%lx) to pagecache",
......@@ -6570,7 +6578,8 @@ my_bool translog_flush(LSN lsn)
struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
/* we can't flush in future */
DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, lsn) >= 0);
if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0 ||
full_circle)
{
DBUG_PRINT("info", ("already flushed: (%lu,0x%lx)",
LSN_IN_PARTS(log_descriptor.flushed)));
......
......@@ -137,6 +137,7 @@ enum translog_record_type
LOGREC_REDO_REPAIR_TABLE,
LOGREC_FILE_ID,
LOGREC_LONG_TRANSACTION_ID,
LOGREC_INCOMPLETE_LOG,
LOGREC_RESERVED_FUTURE_EXTENSION= 63
};
#define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */
......
......@@ -1058,6 +1058,8 @@ uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)
is too new). Recovery does it by itself.
*/
share->state.is_of_horizon= translog_get_horizon();
DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)",
LSN_IN_PARTS(share->state.is_of_horizon)));
}
res= _ma_state_info_write_sub(share->kfile.file, &share->state, pWrite);
if (pWrite & 4)
......
......@@ -137,7 +137,8 @@ int _ma_write_keypage(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo,
lock == PAGECACHE_LOCK_LEFT_WRITELOCKED ?
PAGECACHE_PIN_LEFT_PINNED :
PAGECACHE_PIN,
PAGECACHE_WRITE_DELAY, &page_link.link);
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE);
if (lock == PAGECACHE_LOCK_WRITE)
{
......@@ -192,7 +193,6 @@ int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read)
_ma_store_keynr(info, buff, (uchar) MARIA_DELETE_KEY_NR);
mi_sizestore(buff + share->keypage_header, old_link);
share->state.changed|= STATE_NOT_SORTED_PAGES;
if (info->s->now_transactional)
{
LSN lsn;
......@@ -235,6 +235,7 @@ int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read)
share->page_type,
lock_method, pin_method,
PAGECACHE_WRITE_DELAY, &page_link.link,
LSN_IMPOSSIBLE,
0, share->keypage_header+8, 0, 0))
result= 1;
......
......@@ -633,6 +633,24 @@ static uint pagecache_fwrite(PAGECACHE *pagecache,
(pageno)<<(pagecache->shift), flags)
/**
@brief set rec_lsn of pagecache block (if it is needed)
@param block block where to set rec_lsn
@param first_REDO_LSN_for_page the LSN to set
*/
static inline void pagecache_set_block_rec_lsn(PAGECACHE_BLOCK_LINK *block,
LSN first_REDO_LSN_for_page)
{
if (block->rec_lsn == LSN_MAX)
block->rec_lsn= first_REDO_LSN_for_page;
else
DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
first_REDO_LSN_for_page) <= 0);
}
/*
next_power(value) is 2 at the power of (1+floor(log2(value)));
e.g. next_power(2)=4, next_power(3)=4.
......@@ -2488,7 +2506,12 @@ static void check_and_set_lsn(PAGECACHE *pagecache,
{
LSN old;
DBUG_ENTER("check_and_set_lsn");
DBUG_ASSERT(block->type == PAGECACHE_LSN_PAGE);
/*
In recovery, we can _ma_unpin_all_pages() to put a LSN on page, though
page would be PAGECACHE_PLAIN_PAGE (transactionality temporarily disabled
to not log REDOs).
*/
DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery);
old= lsn_korr(block->buffer + PAGE_LSN_OFFSET);
DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx) new lsn: (%lu, 0x%lx)",
LSN_IN_PARTS(old), LSN_IN_PARTS(lsn)));
......@@ -2570,12 +2593,7 @@ void pagecache_unlock(PAGECACHE *pagecache,
{
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK);
DBUG_ASSERT(pin == PAGECACHE_UNPIN);
if (block->rec_lsn == LSN_MAX)
block->rec_lsn= first_REDO_LSN_for_page;
else
DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
first_REDO_LSN_for_page) <= 0);
pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
}
if (lsn != LSN_IMPOSSIBLE)
check_and_set_lsn(pagecache, lsn, block);
......@@ -2755,11 +2773,7 @@ void pagecache_unlock_by_link(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
lock == PAGECACHE_LOCK_READ_UNLOCK);
DBUG_ASSERT(pin == PAGECACHE_UNPIN);
if (block->rec_lsn == LSN_MAX)
block->rec_lsn= first_REDO_LSN_for_page;
else
DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
first_REDO_LSN_for_page) <= 0);
pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
}
if (lsn != LSN_IMPOSSIBLE)
check_and_set_lsn(pagecache, lsn, block);
......@@ -2785,6 +2799,7 @@ void pagecache_unlock_by_link(PAGECACHE *pagecache,
(ulong) block));
}
pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
if (make_lock_and_pin(pagecache, block, lock, pin, 0))
DBUG_ASSERT(0); /* should not happend */
......@@ -3214,25 +3229,27 @@ my_bool pagecache_delete_pages(PAGECACHE *pagecache,
}
/*
Write a buffer into a cached file.
SYNOPSIS
pagecache_write_part()
pagecache pointer to a page cache data structure
file handler for the file to write data to
pageno number of the block of data in the file
level determines the weight of the data
buff buffer with the data
type type of the page
lock lock change
pin pin page
write_mode how to write page
link link to the page if we pin it
/**
@brief Writes a buffer into a cached file.
RETURN VALUE
0 if a success, 1 - otherwise.
@param pagecache pointer to a page cache data structure
@param file handler for the file to write data to
@param pageno number of the block of data in the file
@param level determines the weight of the data
@param buff buffer with the data
@param type type of the page
@param lock lock change
@param pin pin page
@param write_mode how to write page
@param link link to the page if we pin it
@param first_REDO_LSN_for_page the lsn to set rec_lsn
@param offset offset in the page
@param size size of data
@param validator read page validator
@param validator_data the validator data
@retval 0 if a success.
@retval 1 Error.
*/
/* description of how to change lock before and after write */
......@@ -3296,6 +3313,7 @@ my_bool pagecache_write_part(PAGECACHE *pagecache,
enum pagecache_page_pin pin,
enum pagecache_write_mode write_mode,
PAGECACHE_BLOCK_LINK **page_link,
LSN first_REDO_LSN_for_page,
uint offset, uint size,
pagecache_disk_read_validator validator,
uchar* validator_data)
......@@ -3430,6 +3448,16 @@ my_bool pagecache_write_part(PAGECACHE *pagecache,
block->status&= ~PCBLOCK_ERROR;
}
if (first_REDO_LSN_for_page)
{
/* single write action of the last write action */
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
lock == PAGECACHE_LOCK_LEFT_UNLOCKED);
DBUG_ASSERT(pin == PAGECACHE_UNPIN ||
pin == PAGECACHE_PIN_LEFT_UNPINNED);
pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
}
if (need_lock_change)
{
/*
......@@ -3612,6 +3640,10 @@ static int flush_cached_blocks(PAGECACHE *pagecache,
PCBLOCK_NUMBER(pagecache, block), (ulong)block,
block->pins));
DBUG_ASSERT(block->pins == 1);
/**
@todo If page is contiguous with next page to flush, group flushes in
one single my_pwrite().
*/
error= pagecache_fwrite(pagecache, file,
block->buffer,
block->hash_link->pageno,
......
......@@ -204,12 +204,12 @@ extern uchar *pagecache_valid_read(PAGECACHE *pagecache,
pagecache_disk_read_validator validator,
uchar* validator_data);
#define pagecache_write(P,F,N,L,B,T,O,I,M,K) \
pagecache_write_part(P,F,N,L,B,T,O,I,M,K,0,(P)->block_size,0,0)
#define pagecache_write(P,F,N,L,B,T,O,I,M,K,R) \
pagecache_write_part(P,F,N,L,B,T,O,I,M,K,R,0,(P)->block_size,0,0)
#define pagecache_inject(P,F,N,L,B,T,O,I,K,V,D) \
#define pagecache_inject(P,F,N,L,B,T,O,I,K,R,V,D) \
pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \
K,0,(P)->block_size,V,D)
K,R,0,(P)->block_size,V,D)
extern my_bool pagecache_write_part(PAGECACHE *pagecache,
PAGECACHE_FILE *file,
......@@ -221,6 +221,7 @@ extern my_bool pagecache_write_part(PAGECACHE *pagecache,
enum pagecache_page_pin pin,
enum pagecache_write_mode write_mode,
PAGECACHE_BLOCK_LINK **link,
LSN first_REDO_LSN_for_page,
uint offset,
uint size,
pagecache_disk_read_validator validator,
......
......@@ -86,7 +86,8 @@ int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DONE, 0))
PAGECACHE_WRITE_DONE, 0,
LSN_IMPOSSIBLE))
goto err;
}
pos+= block_length;
......
This diff is collapsed.
......@@ -30,5 +30,5 @@ int maria_recover(void);
int maria_apply_log(LSN lsn, enum maria_apply_log_way apply,
FILE *trace_file,
my_bool execute_undo_phase, my_bool skip_DDLs,
my_bool take_checkpoints);
my_bool take_checkpoints, uint *warnings_count);
C_MODE_END
......@@ -218,8 +218,15 @@ int maria_write(MARIA_HA *info, uchar *record)
info->cur_row.checksum;
}
if (share->base.auto_key)
{
/**
@todo RECOVERY BUG
if updated here, it's not recoverable (no mutex => checkpoint may see a
crazy value and flush it into the table's state on disk).
*/
set_if_bigger(info->s->state.auto_increment,
ma_retrieve_auto_increment(info, record));
}
info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN |
HA_STATE_ROW_CHANGED);
info->state->records+= !share->now_transactional; /*otherwise already done*/
......
......@@ -1033,8 +1033,8 @@ int _ma_update_create_rename_lsn(MARIA_SHARE *share,
int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share,
LSN lsn, my_bool do_sync);
#define _ma_tmp_disable_logging_for_table(S) \
{ (S)->now_transactional= FALSE; (S)->page_type= PAGECACHE_PLAIN_PAGE; }
void _ma_tmp_disable_logging_for_table(MARIA_HA *info,
my_bool log_incomplete);
#define _ma_reenable_logging_for_table(S) \
{ if (((S)->now_transactional= (S)->base.born_transactional)) \
(S)->page_type= PAGECACHE_LSN_PAGE; }
......
......@@ -38,6 +38,7 @@ int main(int argc, char **argv)
{
LSN lsn;
char **default_argv;
uint warnings_count;
MY_INIT(argv[0]);
my_progname_short= my_progname+dirname_length(my_progname);
......@@ -106,9 +107,13 @@ int main(int argc, char **argv)
if (maria_apply_log(lsn, opt_apply ? MARIA_LOG_APPLY :
(opt_check ? MARIA_LOG_CHECK :
MARIA_LOG_DISPLAY_HEADER), opt_silent ? NULL : stdout,
opt_apply_undo, FALSE, FALSE))
opt_apply_undo, FALSE, FALSE, &warnings_count))
goto err;
if (warnings_count == 0)
fprintf(stdout, "%s: SUCCESS\n", my_progname_short);
else
fprintf(stdout, "%s: DOUBTFUL (%u warnings, check previous output)\n",
my_progname_short, warnings_count);
goto end;
err:
......@@ -130,7 +135,8 @@ int main(int argc, char **argv)
static struct my_option my_long_options[] =
{
{"apply", 'a',
"Apply log to tables. Will display a lot of information if not run with --silent",
"Apply log to tables: modifies tables! you should make a backup first! "
" Displays a lot of information if not run with --silent",
(uchar **) &opt_apply, (uchar **) &opt_apply, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
{"check", 'c',
......@@ -143,7 +149,7 @@ static struct my_option my_long_options[] =
#endif
{"help", '?', "Display this help and exit.",
0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
{"only-display", 'o', "display brief info about records's header",
{"only-display", 'o', "display brief info read from records' header",
(uchar **) &opt_only_display, (uchar **) &opt_only_display, 0, GET_BOOL,
NO_ARG,0, 0, 0, 0, 0, 0},
{ "page_buffer_size", 'P', "",
......@@ -154,7 +160,7 @@ static struct my_option my_long_options[] =
{"silent", 's', "Print less information during apply/undo phase",
(uchar **) &opt_silent, (uchar **) &opt_silent, 0,
GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
{"undo", 'u', "Apply undos to tables. (disable with --disable-undo)",
{"undo", 'u', "Apply UNDO records to tables. (disable with --disable-undo)",
(uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0,
GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
{"version", 'V', "Print version and exit.",
......
......@@ -237,7 +237,7 @@ void writer(int num)
PAGECACHE_LOCK_WRITE_UNLOCK,
PAGECACHE_UNPIN,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
if (i % flush_divider == 0)
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
......@@ -380,7 +380,7 @@ int main(int argc __attribute__((unused)),
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
}
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
free(buffr);
......
......@@ -104,7 +104,7 @@ int simple_read_write_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_LEFT_UNLOCKED,
......@@ -140,7 +140,7 @@ int simple_read_change_write_read_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
/* test */
pagecache_read(&pagecache, &file1, 0, 3, (char*)buffw,
......@@ -153,7 +153,7 @@ int simple_read_change_write_read_test()
PAGECACHE_LOCK_WRITE_UNLOCK,
PAGECACHE_UNPIN,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
pagecache_read(&pagecache, &file1, 0, 3, (char*)buffr,
PAGECACHE_PLAIN_PAGE,
......@@ -194,7 +194,7 @@ int simple_pin_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
/* test */
if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
{
......@@ -210,14 +210,14 @@ int simple_pin_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
bfill(buffw + PAGE_SIZE/2, PAGE_SIZE/2, ((unsigned char) 129));
pagecache_write(&pagecache, &file1, 0, 3, (char*)buffw,
PAGECACHE_PLAIN_PAGE,
PAGECACHE_LOCK_WRITE_TO_READ,
PAGECACHE_PIN_LEFT_PINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
/*
We have to get error because one page of the file is pinned,
other page should be flushed
......@@ -272,7 +272,7 @@ int simple_delete_forget_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
/* test */
bfill(buffw, PAGE_SIZE, '\2');
......@@ -281,7 +281,7 @@ int simple_delete_forget_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
pagecache_delete(&pagecache, &file1, 0,
PAGECACHE_LOCK_WRITE, 0);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
......@@ -314,7 +314,7 @@ int simple_delete_flush_test()
PAGECACHE_LOCK_WRITE,
PAGECACHE_PIN,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
/* test */
bfill(buffw, PAGE_SIZE, '\2');
......@@ -323,7 +323,7 @@ int simple_delete_flush_test()
PAGECACHE_LOCK_LEFT_WRITELOCKED,
PAGECACHE_PIN_LEFT_PINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
pagecache_delete(&pagecache, &file1, 0,
PAGECACHE_LOCK_LEFT_WRITELOCKED, 1);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
......@@ -362,7 +362,7 @@ int simple_big_test()
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
}
desc[i].length= 0;
desc[i].content= '\0';
......
......@@ -185,7 +185,7 @@ int main(int argc __attribute__((unused)), char *argv[])
/* Suppressing of automatic record writing */
trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1);
plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1);
srandom(122334817L);
......@@ -335,6 +335,15 @@ int main(int argc __attribute__((unused)), char *argv[])
ok(1, "flush");
}
if (translog_flush(translog_get_horizon()))
{
fprintf(stderr, "Can't flush up to horizon\n", (ulong) i);
translog_destroy();
ok(0, "flush");
exit(1);
}
ok(1, "flush");
srandom(122334817L);
rc= 1;
......
......@@ -129,7 +129,7 @@ int main(int argc __attribute__((unused)), char *argv[])
PAGECACHE_LOCK_LEFT_UNLOCKED,
PAGECACHE_PIN_LEFT_UNPINNED,
PAGECACHE_WRITE_DELAY,
0);
0, LSN_IMPOSSIBLE);
flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
}
if ((stat= my_stat(first_translog_file, &st, MYF(0))) == 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment