Commit fa05e9c9 authored by unknown

WL#3071 - Maria checkpoint

Adding rec_lsn to Maria's page cache. Misc fixes to Checkpoint.


mysys/mf_pagecache.c:
  adding rec_lsn, the LSN when a page first became dirty.
  It is set when unlocking a page (TODO: should also be set when
  the unlocking is an implicit part of pagecache_write()).
  It is reset in link_to_file_list() and free_block()
  (one of which is used every time we flush a block).
  It is a ulonglong and not an LSN, because it is destined for comparisons,
  for which a plain ulonglong is better than a struct.
storage/maria/ma_checkpoint.c:
  misc fixes to Checkpoint (updates now that the transaction manager
  and the page cache are better understood)
storage/maria/ma_close.c:
  an important note for the future.
storage/maria/ma_least_recently_dirtied.c:
  comment
parent ad29d552
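
A minimal C sketch of the rec_lsn protocol the hunks below implement (names follow the patch; locking and the real PAGECACHE_BLOCK_LINK layout are elided; set_if_bigger() is the standard MySQL macro from my_global.h):

typedef unsigned long long ulonglong;  /* as in MySQL's my_global.h */
#define set_if_bigger(a,b) do { if ((a) < (b)) (a)=(b); } while(0)

struct block                     /* stand-in for PAGECACHE_BLOCK_LINK */
{
  ulonglong rec_lsn;             /* LSN when the page first became dirty */
};

/* on PAGECACHE_LOCK_WRITE_UNLOCK of a dirty page */
static void on_write_unlock(struct block *b, ulonglong first_REDO_LSN_for_page)
{
  set_if_bigger(b->rec_lsn, first_REDO_LSN_for_page);
}

/* on flush (link_to_file_list()) or eviction (free_block()) */
static void on_clean(struct block *b)
{
  b->rec_lsn= 0;
}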
mysys/mf_pagecache.c
@@ -295,6 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
ulonglong rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
@@ -1202,6 +1203,7 @@ static void link_to_file_list(PAGECACHE *pagecache,
if (block->status & BLOCK_CHANGED)
{
block->status&= ~BLOCK_CHANGED;
block->rec_lsn= 0;
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
}
@@ -2509,6 +2511,8 @@ void pagecache_unlock_page(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
DBUG_ASSERT(first_REDO_LSN_for_page > 0);
set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
@@ -2671,6 +2675,8 @@ void pagecache_unlock(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
DBUG_ASSERT(first_REDO_LSN_for_page > 0);
set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
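
Because rec_lsn is reset to 0 whenever the block becomes clean, set_if_bigger() leaves in rec_lsn the largest first-REDO LSN seen across write-unlocks ("max, not min", as a comment in execute_checkpoint() notes further down). A hypothetical trace with invented LSN values:

#define set_if_bigger(a,b) do { if ((a) < (b)) (a)=(b); } while(0)

static void rec_lsn_trace(void)
{
  unsigned long long rec_lsn= 0; /* block is clean */
  set_if_bigger(rec_lsn, 100);   /* WRITE_UNLOCK, first REDO at LSN 100 -> 100 */
  set_if_bigger(rec_lsn, 250);   /* later WRITE_UNLOCK, REDO at LSN 250 -> 250 */
  rec_lsn= 0;                    /* flushed: link_to_file_list() resets it */
}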
@@ -3012,10 +3018,9 @@ restart:
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
free_block() will change the status of the block so no need to change
it here.
free_block() will change the status and rec_lsn of the block so no
need to change them here.
*/
}
/* Cache is locked, so we can release the page before freeing it */
pagecache_make_lock_and_pin(pagecache, block,
@@ -3328,6 +3333,7 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
#ifndef DBUG_OFF
block->type= PAGECACHE_EMPTY_PAGE;
#endif
block->rec_lsn= 0;
KEYCACHE_THREAD_TRACE("free block");
KEYCACHE_DBUG_PRINT("free_block",
("block is freed"));
storage/maria/ma_checkpoint.c
@@ -40,8 +40,7 @@
#include "share.h"
#include "log.h"
/* could also be called LSN_ERROR */
#define LSN_IMPOSSIBLE ((LSN)0)
#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */
#define LSN_MAX ((LSN)ULONGLONG_MAX)
/*
@@ -57,9 +56,12 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,..
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
/* last submitted checkpoint request; cleared only when executed */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
static inline ulonglong read_non_atomic(ulonglong volatile *x);
/*
Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA
ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the
@@ -67,6 +69,7 @@ CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
*/
my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
{
my_bool result;
DBUG_ENTER("execute_synchronous_checkpoint");
DBUG_ASSERT(level > NONE);
@@ -76,43 +79,52 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
wait_on_checkpoint_done_cond();
synchronous_checkpoint_in_progress= level;
execute_checkpoint(level);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
synchronous_checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
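
For readability, the same synchronization pattern rendered with plain pthreads (a sketch only: lock()/wait_on_checkpoint_done_cond()/broadcast() are this WL's shorthand, the types come from the file, and execute_checkpoint() is assumed to return with log_mutex held, as the assertion above requires):

#include <pthread.h>

static pthread_mutex_t log_mutex;
static pthread_cond_t  checkpoint_done_cond;

my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
{
  my_bool result;
  pthread_mutex_lock(&log_mutex);
  /* one checkpoint at a time: wait until no other one is running */
  while (synchronous_checkpoint_in_progress != NONE)
    pthread_cond_wait(&checkpoint_done_cond, &log_mutex);
  synchronous_checkpoint_in_progress= level;
  result= execute_checkpoint(level);   /* returns with log_mutex held */
  synchronous_checkpoint_in_progress= NONE;
  pthread_mutex_unlock(&log_mutex);
  pthread_cond_broadcast(&checkpoint_done_cond);
  return result;
}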
/* Picks a checkpoint request, if there is one, and executes it */
/*
If no checkpoint is running, and there is a pending asynchronous checkpoint
request, executes it.
It is safe if multiple threads call it, though in the first version only
one will. It is intended to be used by a thread which regularly calls this
function; this is why, if there is a request, it does not wait in a loop
for synchronous checkpoints to finish, but just exits (the thread may want
to do something useful meanwhile, flushing dirty pages for example,
instead of waiting).
*/
my_bool execute_asynchronous_checkpoint_if_any()
{
my_bool result;
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
lock(log_mutex);
if (likely(next_asynchronous_checkpoint_to_do == NONE))
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
(synchronous_checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
while (synchronous_checkpoint_in_progress)
wait_on_checkpoint_done_cond();
do_checkpoint:
level= next_asynchronous_checkpoint_to_do;
DBUG_ASSERT(level > NONE);
execute_checkpoint(level);
result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
if (next_asynchronous_checkpoint_to_do > level)
goto do_checkpoint; /* one more request was posted */
else
/* If only one thread calls this function, "<" can never happen below */
if (next_asynchronous_checkpoint_to_do <= level)
{
DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level);
next_asynchronous_checkpoint_to_do= NONE; /* all work done */
/* it's our request or weaker/equal ones, all work is done */
next_asynchronous_checkpoint_to_do= NONE;
}
/* otherwise if it is a stronger request, we'll deal with it at next call */
unlock(log_mutex);
broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
@@ -123,17 +135,14 @@ do_checkpoint:
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
LSN candidate_max_rec_lsn_at_last_checkpoint;
/* to avoid { lock + no-op + unlock } in the common (==indirect) case */
my_bool need_log_mutex;
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint;
if (unlikely(need_log_mutex= (level > INDIRECT)))
if (unlikely(level > INDIRECT))
{
LSN copy_of_max_rec_lsn_at_last_checkpoint=
max_rec_lsn_at_last_checkpoint;
/* much I/O work to do, release log mutex */
unlock(log_mutex);
@@ -149,51 +158,29 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
flush all pages which were already dirty at last checkpoint:
ensures that recovery will never start from before the next-to-last
checkpoint (two-checkpoint rule).
It is max, not min as the WL says (TODO update WL).
*/
flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint);
/* this will go full speed (normal scheduling, no sleep) */
break;
}
lock(log_mutex);
}
candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex);
lock(log_mutex);
/*
this portion cannot be done as a hook in write_log_record() for the
LOGREC_CHECKPOINT type because:
- at that moment we still have not written to the control file so cannot
mark the request as done; this could be solved by writing to the control
file in the hook but that would be an I/O under the log's mutex, bad.
- it would not be nice organisation of code (I tried it :).
*/
if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
{
/* checkpoint succeeded */
max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
/*
keep mutex locked because callers will want to clear mutex-protected
status variables
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
DBUG_RETURN(TRUE);
DBUG_RETURN(execute_checkpoint_indirect());
}
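
The MEDIUM branch above implements the two-checkpoint rule; boiled down to its essence (a paraphrase under the patch's names, ignoring the log_mutex juggling of the real code):

my_bool execute_medium_checkpoint(void)
{
  /*
    Every page with rec_lsn <= the previous checkpoint's max rec_lsn was
    already dirty at that checkpoint; flushing them all guarantees that no
    surviving dirty page predates it, so the REDO start point derived from
    the new checkpoint record can never lie before the next-to-last
    checkpoint.
  */
  flush_all_LRD_to_lsn(max_rec_lsn_at_last_checkpoint);
  return execute_checkpoint_indirect();
}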
/*
Does an indirect checkpoint (collects data from data structures and writes
it into a checkpoint log record).
Returns the largest LSN of the LRD when the checkpoint happened (this is a
fuzzy definition), or LSN_IMPOSSIBLE on error. That LSN is used for the
"two-checkpoint rule" (MEDIUM checkpoints).
Starts and ends while having log's mutex (released in the middle).
*/
LSN checkpoint_indirect(my_bool need_log_mutex)
my_bool execute_checkpoint_indirect()
{
DBUG_ENTER("checkpoint_indirect");
int error= 0;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
@@ -202,163 +189,198 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
char *ptr;
LSN checkpoint_lsn;
LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
list_element *el; /* to scan lists */
ulong stored_LRD_size= 0;
DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
if (need_log_mutex)
lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */
safemutex_assert_owner(log_mutex);
checkpoint_start_lsn= log_read_end_lsn();
if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
/* STEP 1: fetch information about dirty pages */
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
/*
This is an over-estimation, as in theory blocks_changed may contain
non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
checkpoint record; the true number of page-LRD-info we'll store into the
record is stored_LRD_size.
*/
string1.length= 8+8+(8+8)*pagecache->blocks_changed;
if (NULL == (string1.str= my_malloc(string1.length)))
goto err;
ptr= string1.str;
int8store(ptr, checkpoint_start_lsn);
ptr+= 8+8; /* don't store stored_LRD_size now, wait */
if (pagecache->blocks_changed > 0)
/* note: this piece will move into mysys/mf_pagecache.c */
{
ulong stored_LRD_size= 0;
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
When we enter here, we must be sure that no "first_in_switch" situation
is happening or will happen (either we have to get rid of
first_in_switch in the code, or first_in_switch has to increment a
"danger" counter for Checkpoint to know it has to wait).
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
/*
This is an over-estimation, as in theory blocks_changed may contain
non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
checkpoint record; the true number of page-LRD-info we'll store into the
record is stored_LRD_size.
*/
/*
There are different ways to scan the dirty blocks;
flush_all_key_blocks() uses a loop over pagecache->used_last->next_used,
and for each element of the loop, loops into
pagecache->changed_blocks[FILE_HASH(file of the element)].
This has the drawback that used_last includes non-dirty blocks, and it's
two loops over many elements. Here we try something simpler.
If there are no blocks in changed_blocks[file_hash], we just hit NULL
pointers and skip them.
TODO: Ingo says blocks_changed is not a reliable number (see his
document); ask him.
*/
uint file_hash;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
if (NULL == (string1.str= my_malloc(string1.length)))
goto err;
ptr= string1.str;
int8store(ptr, checkpoint_start_lsn);
ptr+= 8+8; /* don't store stored_LRD_size now, wait */
if (pagecache->blocks_changed > 0)
{
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
uint file_hash;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
DBUG_ASSERT(block->hash_link != NULL);
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
{
/* no need to store it in the checkpoint record */
continue;
DBUG_ASSERT(block->hash_link != NULL);
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
{
continue; /* no need to store it in the checkpoint record */
}
/*
In the current pagecache, rec_lsn is not set correctly:
1) it is set on pagecache_unlock(), too late (a page is dirty
(BLOCK_CHANGED) since the first pagecache_write()). It may, however,
not be too late, because until unlock() the page's update is not
committed, so it is ok for its REDOs to be skipped at Recovery
(which is what happens with an unset rec_lsn). Note that this
relies on the assumption that a transaction never commits while
holding locks on pages.
2) sometimes the unlocking can be an implicit action of
pagecache_write(), without any call to pagecache_unlock(), then
rec_lsn is not set. That one is a critical problem.
TODO: fix this when Monty has explained how he writes BLOB pages.
*/
if (0 == block->rec_lsn)
abort(); /* always fail in all builds, in case it's problem 2) */
int8store(ptr, block->hash_link->file.file);
ptr+= 8;
int8store(ptr, block->hash_link->pageno);
ptr+= 8;
int8store(ptr, block->rec_lsn);
ptr+= 8;
stored_LRD_size++;
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
block->rec_lsn);
}
/* Q: two "block"s cannot have the same "hash_link", right? */
int8store(ptr, block->hash_link->pageno);
ptr+= 8;
/* I assume rec_lsn will be member of "block", not of "hash_link" */
int8store(ptr, block->rec_lsn);
ptr+= 8;
stored_LRD_size++;
set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
block->rec_lsn);
}
}
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
int8store(string1.str+8, stored_LRD_size);
string1.length= 8+8+(8+8)*stored_LRD_size;
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
int8store(string1.str+8, stored_LRD_size);
string1.length= 8+8+(8+8+8)*stored_LRD_size;
}
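
For reference, the layout the int8store() calls above give string1, the dirty-pages section of the checkpoint record (all fields stored as 8 bytes):

/*
  string1 layout:
    offset  0 : checkpoint_start_lsn  (8 bytes)
    offset  8 : stored_LRD_size       (8 bytes; patched in after the scan,
                hence the early "ptr+= 8+8")
    offset 16 : stored_LRD_size repetitions of
                { file id (8) | pageno (8) | rec_lsn (8) }
  so the final string1.length is 8 + 8 + (8+8+8) * stored_LRD_size.
*/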
/* STEP 2: fetch information about transactions */
/* note: this piece will move into trnman.c */
/*
If trx are in more than one list (e.g. three:
running transactions, committed transactions, purge queue), we can either
take mutexes of all three together or do crabbing.
But if an element can move from list 1 to list 3 without passing through
list 2, crabbing is dangerous.
Hopefully it's ok to take 3 mutexes together...
Otherwise I'll have to make sure I miss no important trx and I handle dups.
Transactions are in the "active list" (protected by a mutex) and in a
lock-free hash of "committed" (insertion protected by the same mutex,
deletion lock-free).
*/
lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */
string2.length= 8+(8+8)*trx_list->count;
if (NULL == (string2.str= my_malloc(string2.length)))
goto err;
ptr= string2.str;
int8store(ptr, trx_list->count);
ptr+= 8;
for (el= trx_list->first; el; el= el->next)
{
/* possibly latch el.rwlock */
*ptr= el->state;
ptr++;
int7store(ptr, el->long_trans_id);
ptr+= 7;
int2store(ptr, el->short_trans_id);
ptr+= 2;
int8store(ptr, el->undo_lsn);
ptr+= 8;
int8store(ptr, el->undo_purge_lsn);
TRN *trn;
ulong stored_trn_size= 0;
/* First, the active transactions */
pthread_mutex_lock(LOCK_trn_list);
string2.length= 8+(7+2+8+8+8)*trnman_active_transactions;
if (NULL == (string2.str= my_malloc(string2.length)))
goto err;
ptr= string2.str;
ptr+= 8;
for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
{
/* we would latch trn.rwlock if it existed */
if (0 == trn->short_trid) /* trn is not inited, skip */
continue;
/* state is not needed for now (only when we have prepared trx) */
/* int7store does not exist but mi_int7store does */
int7store(ptr, trn->trid);
ptr+= 7;
int2store(ptr, trn->short_trid);
ptr+= 2;
int8store(ptr, trn->undo_lsn); /* is an LSN 7 or 8 bytes really? */
ptr+= 8;
int8store(ptr, trn->undo_purge_lsn);
ptr+= 8;
int8store(ptr, read_non_atomic(&trn->first_undo_lsn));
ptr+= 8;
/* possibly unlatch el.rwlock */
stored_trn_size++;
}
pthread_mutex_unlock(LOCK_trn_list);
/*
if no latch, use double variable of type ULONGLONG_CONSISTENT in
st_transaction, or even no need if Intel >=486
Now the committed ones.
We need a function which scans the hash's list of elements in a
lock-free manner (a bit like lfind(), starting from bucket 0), and for
each node (committed transaction) stores the transaction's
information (trid, undo_purge_lsn, first_undo_lsn) into a buffer.
This big buffer is malloc'ed at the start, so the number of elements (or
an upper bound of it) found in the hash needs to be known in advance
(one solution is to keep LOCK_trn_list locked, ensuring that nodes are
only deleted).
*/
int8store(ptr, el->first_undo_lsn);
ptr+= 8;
/* possibly unlatch el.rwlock */
/*
TODO: if we see there exists no transaction (active and committed) we can
tell the lock-free structures to do some freeing (my_free()).
*/
int8store(string2.str, stored_trn_size);
string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
unlock(global_transactions_list_mutex);
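
As noted above, int7store() does not exist but mi_int7store() (include/myisampack.h) does; it stores big-endian. A hypothetical equivalent, for illustration only (not the real macro):

/* store the low 7 bytes of v at ptr, most significant byte first */
static inline void int7store_sketch(unsigned char *ptr, unsigned long long v)
{
  int i;
  for (i= 6; i >= 0; i--)
  {
    ptr[i]= (unsigned char) (v & 0xFF);
    v>>= 8;
  }
}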
/* STEP 3: fetch information about table files */
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
lock(global_share_list_mutex);
string3.length= 8+(8+8)*share_list->count;
if (NULL == (string3.str= my_malloc(string3.length)))
goto err;
ptr= string3.str;
/* possibly latch each MARIA_SHARE, one by one, like this: */
pthread_mutex_lock(&share->intern_lock);
/*
We'll copy the file id (a bit like share->kfile), the file name
(like share->unique_file_name[_length]).
*/
make_copy_of_global_share_list_to_array;
pthread_mutex_unlock(&share->intern_lock);
unlock(global_share_list_mutex);
/* work on copy */
int8store(ptr, elements_in_array);
ptr+= 8;
for (el in array)
{
int8store(ptr, array[...].file_id);
ptr+= 8;
memcpy(ptr, array[...].file_name, ...);
ptr+= ...;
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
lock(global_share_list_mutex);
string3.length= 8+(8+8)*share_list->count;
if (NULL == (string3.str= my_malloc(string3.length)))
goto err;
ptr= string3.str;
/* possibly latch each MARIA_SHARE, one by one, like this: */
pthread_mutex_lock(&share->intern_lock);
/*
these two are long ops (involving disk I/O) that's why we copied the
list, to not keep the list locked for long:
We'll copy the file id (a bit like share->kfile), the file name
(like share->unique_file_name[_length]).
*/
flush_bitmap_pages(el);
/* TODO: and also autoinc counter, logical file end, free page list */
make_copy_of_global_share_list_to_array;
pthread_mutex_unlock(&share->intern_lock);
unlock(global_share_list_mutex);
/*
fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
second, so if you have touched 1000 files it's 7 seconds).
*/
force_file(el);
/* work on copy */
int8store(ptr, elements_in_array);
ptr+= 8;
for (el in array)
{
int8store(ptr, array[...].file_id);
ptr+= 8;
memcpy(ptr, array[...].file_name, ...);
ptr+= ...;
/*
these two are long ops (involving disk I/O) that's why we copied the
list, to not keep the list locked for long:
*/
/* TODO: what if the table pointer is gone/reused now? */
flush_bitmap_pages(el);
/* TODO: and also autoinc counter, logical file end, free page list */
/*
fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
second, so if you have touched 1000 files it's 7 seconds).
*/
force_file(el);
}
}
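
STEP 3 boiled down: hold the global list mutex only long enough to snapshot the shares, then do the slow I/O on the private copy. A sketch; SHARE_SNAPSHOT and snapshot_share_list() are invented stand-ins for the copy the comments describe:

static void checkpoint_shares_sketch(void)
{
  SHARE_SNAPSHOT *arr;   /* hypothetical copy of each share's id/name/fd */
  uint count, i;

  lock(global_share_list_mutex);      /* THR_LOCK_maria in practice */
  arr= snapshot_share_list(&count);   /* quick: memory copies only */
  unlock(global_share_list_mutex);

  for (i= 0; i < count; i++)
  {
    flush_bitmap_pages(&arr[i]);      /* long: disk writes */
    force_file(&arr[i]);              /* very long: fsync(), ~150/s max */
  }
}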
/* LAST STEP: now write the checkpoint log record */
@@ -389,11 +411,38 @@ err:
candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE;
end:
my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint);
/*
this portion cannot be done as a hook in write_log_record() for the
LOGREC_CHECKPOINT type because:
- at that moment we still have not written to the control file so cannot
mark the request as done; this could be solved by writing to the control
file in the hook but that would be an I/O under the log's mutex, bad.
- it would not be nice organisation of code (I tried it :).
*/
if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
{
/* checkpoint succeeded */
/*
TODO: compute log's low water mark (how to do that with our fuzzy
ARIES-like reads of data structures? TODO think about it :).
*/
lock(log_mutex);
/* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
lock(log_mutex);
DBUG_RETURN(TRUE);
/*
keep mutex locked upon exit because callers will want to clear
mutex-protected status variables
*/
}
@@ -433,7 +482,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
if (checkpoint_request < level)
if (next_asynchronous_checkpoint_to_do < level)
{
/* no equal or stronger running or to run, we post request */
/*
@@ -445,7 +494,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
(see least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
checkpoint_request= level; /* post request */
next_asynchronous_checkpoint_to_do= level; /* post request */
}
/*
@@ -457,3 +506,37 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
the end user.
*/
}
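
The posting rule above is monotonic: a pending weaker request gets upgraded, a pending stronger one is left alone. An invented sequence to illustrate:

request_asynchronous_checkpoint(INDIRECT); /* pending: INDIRECT */
request_asynchronous_checkpoint(MEDIUM);   /* MEDIUM > INDIRECT: upgraded */
request_asynchronous_checkpoint(INDIRECT); /* weaker: no effect */
/* the next execute_asynchronous_checkpoint_if_any() runs one MEDIUM */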
/*
If a 64-bit variable transitions from both halves being zero to both halves
being non-zero, and never changes after that (like the transaction's
first_undo_lsn), this function can be used to do a read of it (without
mutex, without atomic load) which always produces a correct (though maybe
slightly old) value (even on 32-bit CPUs).
*/
static inline ulonglong read_non_atomic(ulonglong volatile *x)
{
#if ( SIZEOF_CHARP >= 8 )
/* 64-bit CPU (right?), 64-bit reads are atomic */
return *x;
#else
/*
32-bit CPU, a 64-bit read may give a mix of the old and new halves (old
low bits and new high bits, or the contrary).
As the variable we read transitions from both halves being zero to both
halves being non-zero, and never changes then, we can detect atomicity
problems:
*/
ulonglong y;
for (;;) /* loop until no atomicity problems */
{
y= *x;
if (likely((0 == y) ||
((0 != (y >> 32)) && (0 != (y << 32)))))
return y;
/* Worth seeing it! */
DBUG_PRINT("info",("atomicity problem"));
}
#endif
}
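
Spelled out, the acceptance test in the 32-bit branch works because the variable goes 0 -> V exactly once, with both halves of V non-zero, so a torn read necessarily pairs a zero half with a non-zero half. An illustrative fragment (trn->first_undo_lsn is the intended user, per STEP 2 above):

/*
  torn read (old high= 0, new low)   : y >> 32 == 0  -> retried
  torn read (new high, old low= 0)   : y << 32 == 0  -> retried
  y == 0, or y == V with both halves non-zero -> accepted
*/
ulonglong lsn= read_non_atomic(&trn->first_undo_lsn); /* no mutex taken */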
storage/maria/ma_close.c
@@ -57,6 +57,12 @@ int maria_close(register MARIA_HA *info)
info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
}
flag= !--share->reopen;
/*
RECOVERYTODO:
Below we are going to make the table unknown to future checkpoints, so it
needs to have fsync'ed itself entirely (bitmap, pages, etc) at this
point.
*/
maria_open_list=list_delete(maria_open_list,&info->open_list);
pthread_mutex_unlock(&share->intern_lock);
storage/maria/ma_least_recently_dirtied.c
@@ -94,7 +94,10 @@ pthread_handler_decl background_flush_and_checkpoint_thread()
while (this_thread_not_killed)
{
if (((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1)) == 0)
execute_asynchronous_checkpoint_if_any();
{
/* note that we don't care about the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
}
lock(global_LRD_mutex);
flush_one_group_from_LRD();
safemutex_assert_not_owner(global_LRD_mutex);
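
The probing test in the loop above is a power-of-two modulus: the checkpoint check runs once every 2<<CHECKPOINT_PROBING_PERIOD_LOG2 flush-loop iterations. A standalone demonstration with an invented period:

#include <stdio.h>

#define CHECKPOINT_PROBING_PERIOD_LOG2 4  /* invented value for the demo */

int main(void)
{
  unsigned long flush_calls= 0, i;
  for (i= 0; i < 100; i++)
  {
    if (((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1)) == 0)
      printf("probe at iteration %lu\n", i);  /* i = 0, 32, 64, 96 */
  }
  return 0;
}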