Commit fa05e9c9 authored by unknown's avatar unknown

WL#3071 - Maria checkpoint

Adding rec_lsn to Maria's page cache. Misc fixes to Checkpoint.


mysys/mf_pagecache.c:
  adding rec_lsn, the LSN when a page first became dirty.
  It is set when unlocking a page (TODO: should also be set when
  the unlocking is an implicit part of pagecache_write()).
  It is reset in link_to_file_list() and free_block()
  (one of which is used every time we flush a block).
  It is a ulonglong and not an LSN, because it is mainly used in comparisons,
  for which a ulonglong is more convenient than a struct.
storage/maria/ma_checkpoint.c:
  misc fixes to Checkpoint (updates now that the transaction manager
  and the page cache are more known)
storage/maria/ma_close.c:
  an important note for the future.
storage/maria/ma_least_recently_dirtied.c:
  comment
parent ad29d552
......@@ -295,6 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
ulonglong rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
......@@ -1202,6 +1203,7 @@ static void link_to_file_list(PAGECACHE *pagecache,
if (block->status & BLOCK_CHANGED)
{
block->status&= ~BLOCK_CHANGED;
block->rec_lsn= 0;
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
}
......@@ -2509,6 +2511,8 @@ void pagecache_unlock_page(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
DBUG_ASSERT(first_REDO_LSN_for_page > 0);
set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
......@@ -2671,6 +2675,8 @@ void pagecache_unlock(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
DBUG_ASSERT(first_REDO_LSN_for_page > 0);
set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
......@@ -3012,10 +3018,9 @@ restart:
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
free_block() will change the status of the block so no need to change
it here.
free_block() will change the status and rec_lsn of the block so no
need to change them here.
*/
}
/* Cache is locked, so we can release page before freeing it */
pagecache_make_lock_and_pin(pagecache, block,
......@@ -3328,6 +3333,7 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
#ifndef DBUG_OFF
block->type= PAGECACHE_EMPTY_PAGE;
#endif
block->rec_lsn= 0;
KEYCACHE_THREAD_TRACE("free block");
KEYCACHE_DBUG_PRINT("free_block",
("block is freed"));
......
......@@ -40,8 +40,7 @@
#include "share.h"
#include "log.h"
/* could also be called LSN_ERROR */
#define LSN_IMPOSSIBLE ((LSN)0)
#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */
#define LSN_MAX ((LSN)ULONGLONG_MAX)
/*
......@@ -57,9 +56,12 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,..
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
/* last submitted checkpoint request; cleared only when executed */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
static inline ulonglong read_non_atomic(ulonglong volatile *x);
/*
Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA
ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the
......@@ -67,6 +69,7 @@ CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
*/
my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
{
my_bool result;
DBUG_ENTER("execute_synchronous_checkpoint");
DBUG_ASSERT(level > NONE);
......@@ -76,43 +79,52 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
wait_on_checkpoint_done_cond();
synchronous_checkpoint_in_progress= level;
execute_checkpoint(level);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
synchronous_checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
/* Picks a checkpoint request, if there is one, and executes it */
/*
If no checkpoint is running, and there is a pending asynchronous checkpoint
request, executes it.
Is safe if multiple threads call it, though in first version only one will.
It's intended to be used by a thread which regularly calls this function;
this is why, if there is a request, it does not wait in a loop for
synchronous checkpoints to be finished, but just exits (because the thread
may want to do something useful meanwhile (flushing dirty pages for example)
instead of waiting).
*/
my_bool execute_asynchronous_checkpoint_if_any()
{
my_bool result;
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
lock(log_mutex);
if (likely(next_asynchronous_checkpoint_to_do == NONE))
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
(synchronous_checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
while (synchronous_checkpoint_in_progress)
wait_on_checkpoint_done_cond();
do_checkpoint:
level= next_asynchronous_checkpoint_to_do;
DBUG_ASSERT(level > NONE);
execute_checkpoint(level);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
if (next_asynchronous_checkpoint_to_do > level)
goto do_checkpoint; /* one more request was posted */
else
/* If only one thread calls this function, "<" can never happen below */
if (next_asynchronous_checkpoint_to_do <= level)
{
DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level);
next_asynchronous_checkpoint_to_do= NONE; /* all work done */
/* it's our request or weaker/equal ones, all work is done */
next_asynchronous_checkpoint_to_do= NONE;
}
/* otherwise if it is a stronger request, we'll deal with it at next call */
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
......@@ -123,17 +135,14 @@ do_checkpoint:
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
LSN candidate_max_rec_lsn_at_last_checkpoint;
/* to avoid { lock + no-op + unlock } in the common (==indirect) case */
my_bool need_log_mutex;
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint;
if (unlikely(need_log_mutex= (level > INDIRECT)))
if (unlikely(level > INDIRECT))
{
LSN copy_of_max_rec_lsn_at_last_checkpoint=
max_rec_lsn_at_last_checkpoint;
/* much I/O work to do, release log mutex */
unlock(log_mutex);
......@@ -149,51 +158,29 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
flush all pages which were already dirty at last checkpoint:
ensures that recovery will never start from before the next-to-last
checkpoint (two-checkpoint rule).
It is max, not min as the WL says (TODO update WL).
*/
flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint);
/* this will go full speed (normal scheduling, no sleep) */
break;
}
}
candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex);
lock(log_mutex);
/*
this portion cannot be done as a hook in write_log_record() for the
LOGREC_CHECKPOINT type because:
- at that moment we still have not written to the control file so cannot
mark the request as done; this could be solved by writing to the control
file in the hook but that would be an I/O under the log's mutex, bad.
- it would not be nice organisation of code (I tried it :).
*/
if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
{
/* checkpoint succeeded */
maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
/*
keep mutex locked because callers will want to clear mutex-protected
status variables
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
DBUG_RETURN(TRUE);
DBUG_RETURN(execute_checkpoint_indirect());
}
/*
Does an indirect checkpoint (collects data from data structures, writes into
a checkpoint log record).
Returns the largest LSN of the LRD when the checkpoint happened (this is a
fuzzy definition), or LSN_IMPOSSIBLE on error. That LSN is used for the
"two-checkpoint rule" (MEDIUM checkpoints).
Starts and ends while having log's mutex (released in the middle).
*/
LSN checkpoint_indirect(my_bool need_log_mutex)
my_bool execute_checkpoint_indirect()
{
DBUG_ENTER("checkpoint_indirect");
int error= 0;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
......@@ -202,25 +189,30 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
char *ptr;
LSN checkpoint_lsn;
LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
list_element *el; /* to scan lists */
ulong stored_LRD_size= 0;
DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
if (need_log_mutex)
lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */
safemutex_assert_owner(log_mutex);
checkpoint_start_lsn= log_read_end_lsn();
if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
/* STEP 1: fetch information about dirty pages */
/* note: this piece will move into mysys/mf_pagecache.c */
{
ulong stored_LRD_size= 0;
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
When we enter here, we must be sure that no "first_in_switch" situation
is happening or will happen (either we have to get rid of
first_in_switch in the code or, first_in_switch has to increment a
"danger" counter for Checkpoint to know it has to wait.
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
......@@ -230,7 +222,11 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
checkpoint record; the true number of page-LRD-info we'll store into the
record is stored_LRD_size.
*/
string1.length= 8+8+(8+8)*pagecache->blocks_changed;
/*
TODO: Ingo says blocks_changed is not a reliable number (see his
document); ask him.
*/
string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
if (NULL == (string1.str= my_malloc(string1.length)))
goto err;
ptr= string1.str;
......@@ -238,16 +234,6 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
ptr+= 8+8; /* don't store stored_LRD_size now, wait */
if (pagecache->blocks_changed > 0)
{
/*
There are different ways to scan the dirty blocks;
flush_all_key_blocks() uses a loop over pagecache->used_last->next_used,
and for each element of the loop, loops into
pagecache->changed_blocks[FILE_HASH(file of the element)].
This has the drawback that used_last includes non-dirty blocks, and it's
two loops over many elements. Here we try something simpler.
If there are no blocks in changed_blocks[file_hash], we should hit
zeroes and skip them.
*/
uint file_hash;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
......@@ -260,68 +246,102 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
{
/* no need to store it in the checkpoint record */
continue;
continue; /* no need to store it in the checkpoint record */
}
/* Q: two "block"s cannot have the same "hash_link", right? */
/*
In the current pagecache, rec_lsn is not set correctly:
1) it is set on pagecache_unlock(), too late (a page is dirty
(BLOCK_CHANGED) since the first pagecache_write()). It may however
be not too late, because until unlock(), the page's update is not
committed, so it's ok that REDOs for it be skipped at Recovery
(which is what happens with an unset rec_lsn). Note that this
relies on the assumption that a transaction never commits while
holding locks on pages.
2) sometimes the unlocking can be an implicit action of
pagecache_write(), without any call to pagecache_unlock(), then
rec_lsn is not set. That one is a critical problem.
TODO: fix this when Monty has explained how he writes BLOB pages.
*/
if (0 == block->rec_lsn)
abort(); /* always fail in all builds, in case it's problem 2) */
int8store(ptr, block->hash_link->file.file);
ptr+= 8;
int8store(ptr, block->hash_link->pageno);
ptr+= 8;
/* I assume rec_lsn will be member of "block", not of "hash_link" */
int8store(ptr, block->rec_lsn);
ptr+= 8;
stored_LRD_size++;
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
block->rec_lsn);
}
}
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
int8store(string1.str+8, stored_LRD_size);
string1.length= 8+8+(8+8)*stored_LRD_size;
string1.length= 8+8+(8+8+8)*stored_LRD_size;
}
/* STEP 2: fetch information about transactions */
/* note: this piece will move into trnman.c */
/*
If trx are in more than one list (e.g. three:
running transactions, committed transactions, purge queue), we can either
take mutexes of all three together or do crabbing.
But if an element can move from list 1 to list 3 without passing through
list 2, crabbing is dangerous.
Hopefully it's ok to take 3 mutexes together...
Otherwise I'll have to make sure I miss no important trx and I handle dups.
Transactions are in the "active list" (protected by a mutex) and in a
lock-free hash of "committed" (insertion protected by the same mutex,
deletion lock-free).
*/
lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */
string2.length= 8+(8+8)*trx_list->count;
{
TRN *trn;
ulong stored_trn_size= 0;
/* First, the active transactions */
pthread_mutex_lock(LOCK_trn_list);
string2.length= 8+(7+2+8+8+8)*trnman_active_transactions;
if (NULL == (string2.str= my_malloc(string2.length)))
goto err;
ptr= string2.str;
int8store(ptr, trx_list->count);
ptr+= 8;
for (el= trx_list->first; el; el= el->next)
for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
{
/* possibly latch el.rwlock */
*ptr= el->state;
ptr++;
int7store(ptr, el->long_trans_id);
/* we would latch trn.rwlock if it existed */
if (0 == trn->short_trid) /* trn is not inited, skip */
continue;
/* state is not needed for now (only when we have prepared trx) */
/* int7store does not exist but mi_int7store does */
int7store(ptr, trn->trid);
ptr+= 7;
int2store(ptr, el->short_trans_id);
int2store(ptr, trn->short_trid);
ptr+= 2;
int8store(ptr, el->undo_lsn);
int8store(ptr, trn->undo_lsn); /* is an LSN 7 or 8 bytes really? */
ptr+= 8;
int8store(ptr, el->undo_purge_lsn);
int8store(ptr, trn->undo_purge_lsn);
ptr+= 8;
/*
if no latch, use double variable of type ULONGLONG_CONSISTENT in
st_transaction, or even no need if Intel >=486
*/
int8store(ptr, el->first_undo_lsn);
int8store(ptr, read_non_atomic(&trn->first_undo_lsn));
ptr+= 8;
/* possibly unlatch el.rwlock */
stored_trn_size++;
}
pthread_mutex_unlock(LOCK_trn_list);
/*
Now the committed ones.
We need a function which scans the hash's list of elements in a
lock-free manner (a bit like lfind(), starting from bucket 0), and for
each node (committed transaction) stores the transaction's
information (trid, undo_purge_lsn, first_undo_lsn) into a buffer.
This big buffer is malloc'ed at the start, so the number of elements (or
an upper bound of it) found in the hash needs to be known in advance
(one solution is to keep LOCK_trn_list locked, ensuring that nodes are
only deleted).
*/
/*
TODO: if we see there exists no transaction (active and committed) we can
tell the lock-free structures to do some freeing (my_free()).
*/
int8store(string1.str, stored_trn_size);
string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
unlock(global_transactions_list_mutex);
/* STEP 3: fetch information about table files */
{
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
lock(global_share_list_mutex);
string3.length= 8+(8+8)*share_list->count;
......@@ -351,6 +371,7 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
these two are long ops (involving disk I/O) that's why we copied the
list, to not keep the list locked for long:
*/
/* TODO: what if the table pointer is gone/reused now? */
flush_bitmap_pages(el);
/* TODO: and also autoinc counter, logical file end, free page list */
......@@ -360,6 +381,7 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
*/
force_file(el);
}
}
/* LAST STEP: now write the checkpoint log record */
......@@ -389,11 +411,38 @@ err:
candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE;
end:
my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint);
/*
this portion cannot be done as a hook in write_log_record() for the
LOGREC_CHECKPOINT type because:
- at that moment we still have not written to the control file so cannot
mark the request as done; this could be solved by writing to the control
file in the hook but that would be an I/O under the log's mutex, bad.
- it would not be nice organisation of code (I tried it :).
*/
if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
{
/* checkpoint succeeded */
/*
TODO: compute log's low water mark (how to do that with our fuzzy
ARIES-like reads of data structures? TODO think about it :).
*/
lock(log_mutex);
/* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
lock(log_mutex);
DBUG_RETURN(TRUE);
/*
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
}
......@@ -433,7 +482,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
if (checkpoint_request < level)
if (next_asynchronous_checkpoint_to_do < level)
{
/* no equal or stronger running or to run, we post request */
/*
......@@ -445,7 +494,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
(see least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
checkpoint_request= level; /* post request */
next_asynchronous_checkpoint_to_do= level; /* post request */
}
/*
......@@ -457,3 +506,37 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
the end user.
*/
}
/*
  Reads a 64-bit variable without mutex and without atomic load.

  Usable only for a variable which transitions once from both 32-bit halves
  being zero to both 32-bit halves being non-zero, and never changes after
  that (like the transaction's first_undo_lsn). Under that assumption the
  returned value is always a correct (though maybe slightly old) value, even
  on 32-bit CPUs: it is either 0 (variable seen as not yet set) or a value
  whose two halves are both non-zero.

  NOTE: the precondition is essential. In the 32-bit branch below, a final
  value having exactly one zero half is indistinguishable from a torn read,
  so the loop would never terminate for such a value.
*/
static inline ulonglong read_non_atomic(ulonglong volatile *x)
{
#if ( SIZEOF_CHARP >= 8 )
  /* 64-bit CPU (right?), 64-bit reads are atomic */
  return *x;
#else
  /*
    32-bit CPU, 64-bit reads may give a mix of an old half and a new half (old
    low bits and new high bits, or the contrary).
    As the variable we read transitions from both halves being zero to both
    halves being non-zero, and never changes then, we can detect atomicity
    problems: a value with exactly one zero half can only be a torn read.
  */
  ulonglong y;
  for (;;) /* loop until no atomicity problems */
  {
    y= *x;
    /* accept "not set yet" (0) or "fully set" (both halves non-zero) */
    if (likely((0 == y) ||
               ((0 != (y >> 32)) && (0 != (y << 32)))))
      return y;
    /* Worth seeing it! */
    DBUG_PRINT("info",("atomicity problem"));
  }
#endif
}
......@@ -57,6 +57,12 @@ int maria_close(register MARIA_HA *info)
info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
}
flag= !--share->reopen;
/*
RECOVERYTODO:
Below we are going to make the table unknown to future checkpoints, so it
needs to have fsync'ed itself entirely (bitmap, pages, etc) at this
point.
*/
maria_open_list=list_delete(maria_open_list,&info->open_list);
pthread_mutex_unlock(&share->intern_lock);
......
......@@ -94,7 +94,10 @@ pthread_handler_decl background_flush_and_checkpoint_thread()
while (this_thread_not_killed)
{
if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
execute_asynchronous_checkpoint_if_any();
{
/* note that we don't care about the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
}
lock(global_LRD_mutex);
flush_one_group_from_LRD();
safemutex_assert_not_owner(global_LRD_mutex);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment