Commit e0028938 authored by unknown's avatar unknown

WL#3071 - Maria checkpoint

Observe WAL for the table's state: all log records needed for undoing
uncommitted state must be in the log before we flush state.


storage/maria/ha_maria.cc:
  comments
storage/maria/ma_bitmap.c:
  Comment for why there is no bug
storage/maria/ma_blockrec.c:
  comment for why there is no bug
storage/maria/ma_checkpoint.c:
  Observe WAL for the table's state: all log records needed for undoing
  uncommitted state must be in the log before we flush state. I tested
  by hand that the bug existed (create table, insert one row into it
  but let that insert pause after increasing data_file_length, let
  checkpoint start but kill it after it has flushed state).
  Log contains nothing, table is not recovered though it has
  a too big data_file_length. With this bugfix, the log contains
  REDO so table is opened so data_file_length is corrected.
storage/maria/ma_close.c:
  If table is read-only we must never write to it.
  Should be a no-change in fact, as if read-only, share->changed is
  normally always false.
storage/maria/ma_recovery.c:
  documenting bug found by Monty. Print when fixing data_file_length.
parent 5ddc4d8f
...@@ -1288,6 +1288,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize) ...@@ -1288,6 +1288,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
thd->proc_info= "Repair with keycache"; thd->proc_info= "Repair with keycache";
param.testflag &= ~T_REP_BY_SORT; param.testflag &= ~T_REP_BY_SORT;
error= maria_repair(&param, file, fixed_name, param.testflag & T_QUICK); error= maria_repair(&param, file, fixed_name, param.testflag & T_QUICK);
/**
@todo RECOVERY BUG we do things with the index file
(maria_sort_index() after the above which already has logged the
record and bumped create_rename_lsn. Is it ok?
*/
} }
param.testflag= testflag; param.testflag= testflag;
optimize_done= 1; optimize_done= 1;
...@@ -1356,6 +1361,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize) ...@@ -1356,6 +1361,11 @@ int ha_maria::repair(THD *thd, HA_CHECK &param, bool do_optimize)
thd->proc_info= old_proc_info; thd->proc_info= old_proc_info;
if (!thd->locked_tables) if (!thd->locked_tables)
{ {
/**
@todo RECOVERY BUG find why this is needed. Monty says it's because a
new non-transactional table is created by maria_repair(): find how this
new table's state influences the old one's.
*/
_ma_reenable_logging_for_table(file->s); _ma_reenable_logging_for_table(file->s);
maria_lock_database(file, F_UNLCK); maria_lock_database(file, F_UNLCK);
} }
......
...@@ -533,6 +533,17 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share, ...@@ -533,6 +533,17 @@ static my_bool _ma_read_bitmap_page(MARIA_SHARE *share,
Inexistent or half-created page (could be crash in the middle of Inexistent or half-created page (could be crash in the middle of
_ma_bitmap_create_first(), before appending maria_bitmap_marker). _ma_bitmap_create_first(), before appending maria_bitmap_marker).
*/ */
/*
We are updating data_file_length before writing any log record for the
row operation. What if now state is flushed by a checkpoint with the
new value, and crash before the checkpoint record is written, recovery
may not even open the table (no log records) so not fix
data_file_length ("WAL violation")? In fact this is ok:
- checkpoint flushes state only if share->id!=0
- so if state was flushed, table had share->id!=0, so had a
LOGREC_FILE_ID (or was in previous checkpoint record), so recovery will
meet and open it and fix data_file_length.
*/
share->state.state.data_file_length= end_of_page; share->state.state.data_file_length= end_of_page;
bzero(bitmap->map, bitmap->block_size); bzero(bitmap->map, bitmap->block_size);
memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker), memcpy(bitmap->map + bitmap->block_size - sizeof(maria_bitmap_marker),
......
...@@ -1441,7 +1441,23 @@ static my_bool write_tail(MARIA_HA *info, ...@@ -1441,7 +1441,23 @@ static my_bool write_tail(MARIA_HA *info,
/* Increase data file size, if extended */ /* Increase data file size, if extended */
position= (my_off_t) block->page * block_size; position= (my_off_t) block->page * block_size;
if (info->state->data_file_length <= position) if (info->state->data_file_length <= position)
{
/*
We are modifying a state member before writing the UNDO; this is a WAL
violation: assume this setting is made, checkpoint flushes new state,
and crash happens before the UNDO is written: how to undo the bad state?
Fortunately for data_file_length this is ok: as long as we change
data_file_length after writing any REDO or UNDO we are safe:
- checkpoint flushes state only if it's older than
checkpoint_start_log_horizon, and flushes log up to that horizon first
- so if checkpoint flushed state with new data_file_length, REDO is in
log so LOGREC_FILE_ID too, recovery will meet and open the table thus
fix data_file_length to be the file's physical size.
Same property is currently true in all places of this file which change
data_file_length.
*/
info->state->data_file_length= position + block_size; info->state->data_file_length= position + block_size;
}
DBUG_ASSERT(share->pagecache->block_size == block_size); DBUG_ASSERT(share->pagecache->block_size == block_size);
if (!(res= pagecache_write(share->pagecache, if (!(res= pagecache_write(share->pagecache,
......
...@@ -176,7 +176,27 @@ static int really_execute_checkpoint(void) ...@@ -176,7 +176,27 @@ static int really_execute_checkpoint(void)
DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)", DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
LSN_IN_PARTS(checkpoint_start_log_horizon))); LSN_IN_PARTS(checkpoint_start_log_horizon)));
lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon); lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
/*
We are going to flush the state of some tables (in collect_tables()) if
it's older than checkpoint_start_log_horizon. Before, all records
describing how to undo this flushed state must be in the log
(WAL). Usually this means UNDOs. In the special case of data_file_length,
recovery just needs to open the table, so any LOGREC_FILE_ID/REDO/UNDO
allowing recovery to understand it must open a table, is enough.
*/
/**
Apart from data|key_file_length which are easily recoverable from the OS,
all other state members must be updated only when writing the UNDO;
otherwise, if updated before, if their new value is flushed by a
checkpoint and there is a crash before UNDO is written, their REDO group
will be missing or at least incomplete and skipped by recovery, so bad
state value will stay. For example, setting key_root before writing the
UNDO: the table would have old index page (they were pinned at time of
crash) and a new, thus wrong, key_root.
@todo RECOVERY BUG check that all code honours that.
*/
if (translog_flush(checkpoint_start_log_horizon))
goto err;
/* /*
STEP 2: fetch information about transactions. STEP 2: fetch information about transactions.
...@@ -891,11 +911,15 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) ...@@ -891,11 +911,15 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
dfile= share->bitmap.file; dfile= share->bitmap.file;
/* /*
Ignore table which has no logged writes (all its future log records will Ignore table which has no logged writes (all its future log records will
be found naturally by Recovery). Ignore obsolete shares (_before_ be found naturally by Recovery). This also avoids flushing
setting themselves to last_version=0 they already did all flush and a data_file_length changed too early by a client (before any log record
sync; if we flush their state now we may be flushing an obsolete state was written, giving no chance to recovery to meet and open the table,
onto a newer one (assuming the table has been reopened with a different see _ma_read_bitmap_page()).
share but of course same physical index file). Ignore obsolete shares (_before_ setting themselves to last_version=0
they already did all flush and sync; if we flush their state now we may
be flushing an obsolete state onto a newer one (assuming the table has
been reopened with a different share but of course same physical index
file).
*/ */
if ((share->id != 0) && (share->last_version != 0)) if ((share->id != 0) && (share->last_version != 0))
{ {
...@@ -950,6 +974,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon) ...@@ -950,6 +974,7 @@ static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
It may also be a share which got last_version==0 since we checked It may also be a share which got last_version==0 since we checked
last_version; in this case, it flushed its state and the LSN test last_version; in this case, it flushed its state and the LSN test
above will catch it. above will catch it.
Last, see comments at start of really_execute_checkpoint().
*/ */
} }
else else
......
...@@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info) ...@@ -86,8 +86,9 @@ int maria_close(register MARIA_HA *info)
may be using the file at this point may be using the file at this point
IF using --external-locking, which does not apply to Maria. IF using --external-locking, which does not apply to Maria.
*/ */
if ((share->changed && share->base.born_transactional) || if (share->mode != O_RDONLY &&
(share->mode != O_RDONLY && maria_is_crashed(info))) ((share->changed && share->base.born_transactional) ||
maria_is_crashed(info)))
{ {
/* /*
State must be written to file as it was not done at table's State must be written to file as it was not done at table's
......
...@@ -1066,8 +1066,16 @@ static int new_table(uint16 sid, const char *name, ...@@ -1066,8 +1066,16 @@ static int new_table(uint16 sid, const char *name,
tprint(tracef, ", length unknown\n"); tprint(tracef, ", length unknown\n");
goto end; goto end;
} }
if (share->state.state.data_file_length != dfile_len)
{
tprint(tracef, ", has wrong state.data_file_length (fixing it)");
share->state.state.data_file_length= dfile_len; share->state.state.data_file_length= dfile_len;
}
if (share->state.state.key_file_length != kfile_len)
{
tprint(tracef, ", has wrong state.key_file_length (fixing it)");
share->state.state.key_file_length= kfile_len; share->state.state.key_file_length= kfile_len;
}
if ((dfile_len % share->block_size) > 0) if ((dfile_len % share->block_size) > 0)
{ {
tprint(tracef, ", has too short last page\n"); tprint(tracef, ", has too short last page\n");
...@@ -1145,6 +1153,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD) ...@@ -1145,6 +1153,19 @@ prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
goto end; goto end;
} }
buff= log_record_buffer.str; buff= log_record_buffer.str;
/**
@todo RECOVERY BUG
we stamp page with UNDO's LSN. Assume an operation logs REDO-REDO-UNDO
where the two REDOs are about the same page. Then recovery applies first
REDO and skips second REDO which is wrong. Solutions:
a) when applying REDO, keep page pinned, don't stamp it, remember it;
when seeing UNDO, unpin pages and stamp them; for BLOB pages we cannot
pin them (too large for memory) so need an additional pass in REDO phase:
- find UNDO
- execute all REDOs about this UNDO but skipping REDOs for
or b) when applying REDO, stamp page with REDO's LSN (=> difference in
'cmp' between run-time and recovery, need a special 'cmp'...).
*/
if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn, if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
HEAD_PAGE, HEAD_PAGE,
buff + FILEID_STORE_SIZE, buff + FILEID_STORE_SIZE,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment