Commit 95fbf242 authored by unknown's avatar unknown

WL#3071 Maria checkpoint

- cleanups, simplifications
- moving the construction of the "dirty pages table" into the
pagecache where it belongs (because it's the pagecache which knows
dirty pages). TODO: do the same soon for the "transactions table".
- fix for a small bug in the pagecache (failure to decrement "changed_blocks")


include/pagecache.h:
  prototype
mysys/mf_pagecache.c:
  m_string.h moves up for LEX_STRING to be known for pagecache.h.
  In pagecache_delete_page(), we must decrement "blocks_changed" even
  if we just delete the page without flushing it.
  A new function pagecache_collect_changed_blocks_with_LSN()
  (used by the Checkpoint module), which stores information about the
  changed blocks (a.k.a. "the dirty pages table") into a LEX_STRING.
  This function is not tested now, it will be when there is a Checkpoint.
storage/maria/ma_checkpoint.c:
  refining the checkpoint code: factoring functions, moving the
  construction of the "dirty pages table" into mf_pagecache.c
  (I'll do the same with the construction of the "transactions table"
  once Serg tells me what's the best way to do it).
storage/maria/ma_least_recently_dirtied.c:
  Simplifying the thread which does background flushing of
  least-recently-dirtied pages:
  - in first version that thread will not flush, just do checkpoints
  - in 2nd version, flushing should re-use existing page cache functions
  like flush_pagecache_blocks().
unittest/mysys/test_file.h:
  m_string.h moves up for LEX_STRING to be known in pagecache.h
parent b70f4f29
......@@ -221,6 +221,9 @@ extern my_bool pagecache_delete_page(PAGECACHE *pagecache,
enum pagecache_page_lock lock,
my_bool flush);
extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
extern my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
LEX_STRING *str,
LSN *max_lsn);
C_MODE_END
#endif /* _keycache_h */
......@@ -40,9 +40,9 @@
*/
#include "mysys_priv.h"
#include <m_string.h>
#include <pagecache.h>
#include "my_static.h"
#include <m_string.h>
#include <my_bit.h>
#include <errno.h>
#include <stdarg.h>
......@@ -295,7 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
ulonglong rec_lsn; /* LSN when first became dirty */
LSN rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
......@@ -2988,33 +2988,35 @@ my_bool pagecache_delete_page(PAGECACHE *pagecache,
goto restart;
}
if (block->status & BLOCK_CHANGED && flush)
if (block->status & BLOCK_CHANGED)
{
/* The block contains a dirty page - push it out of the cache */
KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
/*
The call is thread safe because only the current
thread might change the block->hash_link value
*/
DBUG_ASSERT(block->pins == 1);
error= pagecache_fwrite(pagecache,
&block->hash_link->file,
block->buffer,
block->hash_link->pageno,
block->type,
MYF(MY_NABP | MY_WAIT_IF_FULL));
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
pagecache->global_cache_write++;
if (error)
if (flush)
{
block->status|= BLOCK_ERROR;
goto err;
/* The block contains a dirty page - push it out of the cache */
KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
/*
The call is thread safe because only the current
thread might change the block->hash_link value
*/
DBUG_ASSERT(block->pins == 1);
error= pagecache_fwrite(pagecache,
&block->hash_link->file,
block->buffer,
block->hash_link->pageno,
block->type,
MYF(MY_NABP | MY_WAIT_IF_FULL));
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
pagecache->global_cache_write++;
if (error)
{
block->status|= BLOCK_ERROR;
goto err;
}
}
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
......@@ -3793,6 +3795,132 @@ int reset_key_cache_counters(const char *name, PAGECACHE *key_cache)
}
/*
  Allocates a buffer and stores in it some information about all dirty pages
  of type PAGECACHE_LSN_PAGE.

  SYNOPSIS
    pagecache_collect_changed_blocks_with_LSN()
    pagecache   pointer to the page cache
    str         (OUT) pointer to a LEX_STRING where the allocated buffer, and
                its size, will be put
    max_lsn     (OUT) pointer to a LSN where the maximum rec_lsn of all
                relevant dirty pages will be put

  DESCRIPTION
    Does the allocation because the caller cannot know the size itself.
    Memory freeing is done by the caller.
    Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
    are not interesting for a checkpoint record.
    The caller has the intention of doing checkpoints.

    Record layout produced: 8 bytes page count, then for each page
    4 bytes file id + 4 bytes page number + 8 bytes rec_lsn.

  RETURN
    0 on success
    1 on error
*/
my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
                                                  LEX_STRING *str,
                                                  LSN *max_lsn)
{
  /*
    BUGFIX: 'error' must be initialized. It was previously left
    uninitialized, and the "no interesting dirty pages" path
    (stored_LRD_size == 0) jumped straight to 'end' without ever assigning
    it, so the function returned an indeterminate value (undefined
    behavior).
  */
  my_bool error= 0;
  ulong stored_LRD_size= 0;
  uint file_hash;
  char *ptr;
  DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");

  *max_lsn= 0;
  /*
    We lock the entire cache but will be quick, just reading/writing a few MBs
    of memory at most.
    When we enter here, we must be sure that no "first_in_switch" situation
    is happening or will happen (either we have to get rid of
    first_in_switch in the code or, first_in_switch has to increment a
    "danger" counter for this function to know it has to wait). TODO.
  */
  pagecache_pthread_mutex_lock(&pagecache->cache_lock);

  /* First pass: count how many dirty pages are interesting */
  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
  {
    PAGECACHE_BLOCK_LINK *block;
    for (block= pagecache->changed_blocks[file_hash] ;
         block;
         block= block->next_changed)
    {
      /*
        Q: is there something subtle with block->hash_link: can it be NULL?
        does it have to be == hash_link->block... ?
      */
      DBUG_ASSERT(block->hash_link != NULL);
      DBUG_ASSERT(block->status & BLOCK_CHANGED);
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it */
      /*
        In the current pagecache, rec_lsn is not set correctly:
        1) it is set on pagecache_unlock(), too late (a page is dirty
        (BLOCK_CHANGED) since the first pagecache_write()). So in this
        scenario:
        thread1:                          thread2:
        write_REDO
        pagecache_write()                 checkpoint : rec_lsn not known
        pagecache_unlock(sets rec_lsn)
        commit
        crash,
        at recovery we will wrongly skip the REDO. It also affects the
        low-water mark's computation.
        2) sometimes the unlocking can be an implicit action of
        pagecache_write(), without any call to pagecache_unlock(), then
        rec_lsn is not set.
        1) and 2) are critical problems.
        TODO: fix this when Monty has explained how he writes BLOB pages.
      */
      if (0 == block->rec_lsn)
      {
        DBUG_ASSERT(0);
        goto err;
      }
      stored_LRD_size++;
    }
  }

  /* 8 bytes for the count, then (file_id:4, pageno:4, rec_lsn:8) per page */
  str->length= 8 + (4 + 4 + 8) * stored_LRD_size;
  if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
    goto err;
  ptr= str->str;
  int8store(ptr, stored_LRD_size);
  ptr+= 8;
  if (0 == stored_LRD_size)
    goto end;                          /* empty dirty-pages table: success */

  /* Second pass: serialize (file, pageno, rec_lsn) of each interesting page */
  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
  {
    PAGECACHE_BLOCK_LINK *block;
    for (block= pagecache->changed_blocks[file_hash] ;
         block;
         block= block->next_changed)
    {
      if (block->type != PAGECACHE_LSN_PAGE)
        continue; /* no need to store it in the checkpoint record */
      DBUG_ASSERT((4 == sizeof(block->hash_link->file.file)) &&
                  (4 == sizeof(block->hash_link->pageno)));
      int4store(ptr, block->hash_link->file.file);
      ptr+= 4;
      int4store(ptr, block->hash_link->pageno);
      ptr+= 4;
      int8store(ptr, (ulonglong)block->rec_lsn);
      ptr+= 8;
      set_if_bigger(*max_lsn, block->rec_lsn);
    }
  }
  error= 0;
  goto end;
err:
  error= 1;
end:
  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
  DBUG_RETURN(error);
}
#ifndef DBUG_OFF
/*
Test if disk-cache is ok
......
This diff is collapsed.
......@@ -36,162 +36,57 @@
#include "least_recently_dirtied.h"
/*
MikaelR suggested removing this global_LRD_mutex (I have a paper note of
comments), however at least for the first version we'll start with this
mutex (which will be a LOCK-based atomic_rwlock).
*/
pthread_mutex_t global_LRD_mutex;
/*
When we flush a page, we should pin page.
This "pin" is to protect against that:
I make copy,
you modify in memory and flush to disk and remove from LRD and from cache,
I write copy to disk,
checkpoint happens.
result: old page is on disk, page is absent from LRD, your REDO will be
wrongly ignored.
Pin: there can be multiple pins, flushing imposes that there are zero pins.
For example, pin could be a uint counter protected by the page's latch.
Maybe it's ok if when there is a page replacement, the replacer does not
remove page from the LRD (it would save global mutex); for that, background
flusher should be prepared to see pages in the LRD which are not in the page
cache (then just ignore them). However checkpoint will contain superfluous
entries and so do more work.
*/
#define PAGE_SIZE (16*1024) /* just as an example */
/*
Optimization:
LRD flusher should not flush pages one by one: to be fast, it flushes a
group of pages in sequential disk order if possible; a group of pages is just
FLUSH_GROUP_SIZE pages.
According to Monty, the key cache already does some grouping (investigate that).
*/
#define FLUSH_GROUP_SIZE 512 /* 8 MB */
/*
We don't want to probe for checkpoint requests all the time (it takes
the log mutex).
If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s
(1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every
16*8=128MB which is every 128/30=4.2second.
Using a power of 2 gives a fast modulo operation.
*/
#define CHECKPOINT_PROBING_PERIOD_LOG2 4
/*
This thread does background flush of pieces of the LRD, and all checkpoints.
This thread does background flush of pieces of the LRD, and serves
requests for asynchronous checkpoints.
Just launch it when engine starts.
MikaelR questioned why the same thread does two different jobs, the risk
could be that while a checkpoint happens no LRD flushing happens.
For now, we only do checkpoints - no LRD flushing (to be done when the
second version of the page cache is ready WL#3077).
Reasons to delay:
- Recovery will work (just slower)
- new page cache may be different, why do then re-do
- current pagecache probably has issues with flushing when somebody is
writing to the table being flushed - better avoid that.
*/
/*
  Background thread: serves asynchronous checkpoint requests and (in a later
  version) flushes groups of least-recently-dirtied pages. Launched once at
  engine start.

  NOTE(review): this span is a collapsed old/new diff rendering — the
  flush_one_group_from_LRD()/global_LRD_mutex lines and the plain
  execute_asynchronous_checkpoint_if_any()+sleep(5) lines come from two
  different revisions of the loop body; do not read it as one coherent
  function. Confirm against the real repository before editing.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
/* one reusable buffer for a whole flush group (freed at thread exit) */
char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
uint flush_calls= 0;
while (this_thread_not_killed)
{
/* probe for checkpoint requests only every 2^N-th call (log mutex is costly) */
if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
{
/* note that we don't care of the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
}
lock(global_LRD_mutex);
flush_one_group_from_LRD();
/* flush_one_group_from_LRD() is expected to have released the mutex */
safemutex_assert_not_owner(global_LRD_mutex);
/* note that we don't care of the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
sleep(5);
/*
We are a background thread, leave time for client threads or we would
monopolize the disk:
in the final version, we will not sleep but call flush_pages_from_LRD()
repeatedly. If there are no dirty pages, we'll make sure to not have a
tight loop probing for checkpoint requests.
*/
pthread_yield();
}
my_free(flush_group_buffer);
}
/* The rest of this file will not serve in first version */
/*
flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
flushes only the first pages of the LRD.
max_this_number could be FLUSH_CACHE (of mf_pagecache.c) for example.
*/
flush_one_group_from_LRD()
flush_pages_from_LRD(uint max_this_number, LSN max_this_lsn)
{
char *ptr;
safe_mutex_assert_owner(global_LRD_mutex);
for (page= 0; page<FLUSH_GROUP_SIZE; page++)
{
copy_element_to_array;
}
/*
One rule to better observe is "page must be flushed to disk before it is
removed from LRD" (otherwise checkpoint is incomplete info, corruption).
*/
unlock(global_LRD_mutex);
/* page id is concatenation of "file id" and "number of page in file" */
qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, by_page_id);
for (scan_array)
{
if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
{
/*
page disappeared since we made the copy (it was flushed to be
replaced), remove from array (memcpy tail of array over it)...
*/
continue;
}
memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
pin_page;
page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
}
for (scan_the_array)
{
/*
As an optimization, we try to identify contiguous-in-the-file segments (to
issue one big write()).
In non-optimized version, contiguous segment is always only one page.
*/
if ((next_page.page_id - this_page.page_id) == 1)
{
/*
this page and next page are in same file and are contiguous in the
file: add page to contiguous segment...
*/
continue; /* defer write() to next pages */
}
/* contiguous segment ends */
my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);
/*
note that if we had doublewrite, doublewrite buffer may prevent us from
doing this write() grouping (if doublewrite space is shorter).
*/
}
/*
Now remove pages from LRD. As we have pinned them, all pages that we
managed to pin are still in the LRD, in the same order, we can just cut
the LRD at the last element of "array". This is more efficient than
removing element by element (which would take LRD mutex many times) in the
loop above.
Build a list of pages to flush:
changed_blocks[i] is roughly sorted by descending rec_lsn,
so we could do a merge sort of changed_blocks[] lists, stopping after we
have the max_this_number first elements or after we have found a page with
rec_lsn > max_this_lsn.
Then do like pagecache_flush_blocks_int() does (beware! this time we are
not alone on the file! there may be dangers! TODO: sort this out).
*/
lock(global_LRD_mutex);
/* cut LRD by bending LRD->first, free cut portion... */
unlock(global_LRD_mutex);
for (scan_array)
{
/*
if the page has a property "modified since last flush" (i.e. which is
redundant with the presence of the page in the LRD, this property can
just be a pointer to the LRD element) we should reset it
(note that then the property would live slightly longer than
the presence in LRD).
*/
page_cache_unpin(page_id);
/*
order between unpin and removal from LRD is not clear, depends on what
pin actually is.
*/
}
free(array);
/*
MikaelR noted that he observed that Linux's file cache may never fsync to
disk until this cache is full, at which point it decides to empty the
......@@ -201,28 +96,11 @@ flush_one_group_from_LRD()
}
/*
Flushes all pages from the LRD up to approximately rec_lsn >= max_lsn.
This is approximate because we flush groups, and because the LRD list may
Note that when we flush all pages from the LRD up to rec_lsn >= max_lsn,
this is approximate because the LRD list may
not be exactly sorted by rec_lsn (because for a big row, all pages of the
row are inserted into the LRD with rec_lsn being the LSN of the REDO for the
first page, so if there are concurrent insertions, the last page of the big
row may have a smaller rec_lsn than the previous pages inserted by
concurrent inserters).
*/
/*
  Flushes LRD groups until the oldest remaining dirty page has
  rec_lsn >= max_lsn. Returns 0 on success, 1 if a group flush failed.
  NOTE(review): design sketch being removed by this commit — lock()/unlock()
  and LRD are placeholders, not real symbols in this file.
*/
int flush_all_LRD_to_lsn(LSN max_lsn)
{
lock(global_LRD_mutex);
/* MAX_LSN means "flush everything currently dirty"; pin it to the newest
   rec_lsn now so concurrent dirtying cannot make us flush forever */
if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
max_lsn= LRD->first->prev->rec_lsn;
while (LRD->first->rec_lsn < max_lsn)
{
if (flush_one_group_from_LRD()) /* will unlock LRD mutex */
return 1;
/*
The scheduler may preempt us here as we released the mutex; this is good.
*/
lock(global_LRD_mutex);
}
unlock(global_LRD_mutex);
return 0;
}
#include <m_string.h>
#include <pagecache.h>
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.