Commit 95fbf242 authored by unknown's avatar unknown

WL#3071 Maria checkpoint

- cleanups, simplifications
- moving the construction of the "dirty pages table" into the
pagecache where it belongs (because it's the pagecache which knows
dirty pages). TODO: do the same soon for the "transactions table".
- fix for a small bug in the pagecache (decrementation of "changed_blocks")


include/pagecache.h:
  prototype
mysys/mf_pagecache.c:
  m_string.h moves up for LEX_STRING to be known for pagecache.h.
  In pagecache_delete_page(), we must decrement "blocks_changed" even
  if we just delete the page without flushing it.
  A new function pagecache_collect_changed_blocks_with_LSN()
  (used by the Checkpoint module), which stores information about the
  changed blocks (a.k.a. "the dirty pages table") into a LEX_STRING.
  This function is not tested now, it will be when there is a Checkpoint.
storage/maria/ma_checkpoint.c:
  refining the checkpoint code: factoring functions, moving the
  construction of the "dirty pages table" into mf_pagecache.c
  (I'll do the same with the construction of the "transactions table"
  once Serg tells me what's the best way to do it).
storage/maria/ma_least_recently_dirtied.c:
  Simplifying the thread which does background flushing of
  least-recently-dirtied pages:
  - in first version that thread will not flush, just do checkpoints
  - in 2nd version, flushing should re-use existing page cache functions
  like flush_pagecache_blocks().
unittest/mysys/test_file.h:
  m_string.h moves up for LEX_STRING to be known in pagecache.h
parent b70f4f29
......@@ -221,6 +221,9 @@ extern my_bool pagecache_delete_page(PAGECACHE *pagecache,
enum pagecache_page_lock lock,
my_bool flush);
extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
extern my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
LEX_STRING *str,
LSN *max_lsn);
C_MODE_END
#endif /* _keycache_h */
......@@ -40,9 +40,9 @@
*/
#include "mysys_priv.h"
#include <m_string.h>
#include <pagecache.h>
#include "my_static.h"
#include <m_string.h>
#include <my_bit.h>
#include <errno.h>
#include <stdarg.h>
......@@ -295,7 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
ulonglong rec_lsn; /* LSN when first became dirty */
LSN rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
......@@ -2988,7 +2988,9 @@ my_bool pagecache_delete_page(PAGECACHE *pagecache,
goto restart;
}
if (block->status & BLOCK_CHANGED && flush)
if (block->status & BLOCK_CHANGED)
{
if (flush)
{
/* The block contains a dirty page - push it out of the cache */
......@@ -3014,7 +3016,7 @@ my_bool pagecache_delete_page(PAGECACHE *pagecache,
block->status|= BLOCK_ERROR;
goto err;
}
}
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
......@@ -3793,6 +3795,132 @@ int reset_key_cache_counters(const char *name, PAGECACHE *key_cache)
}
/*
Allocates a buffer and stores in it some information about all dirty pages
of type PAGECACHE_LSN_PAGE.
SYNOPSIS
pagecache_collect_changed_blocks_with_LSN()
pagecache pointer to the page cache
str (OUT) pointer to a LEX_STRING where the allocated buffer, and
its size, will be put
max_lsn (OUT) pointer to a LSN where the maximum rec_lsn of all
relevant dirty pages will be put
DESCRIPTION
Does the allocation because the caller cannot know the size itself.
Memory freeing is done by the caller.
Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
are not interesting for a checkpoint record.
The caller has the intention of doing checkpoints.
RETURN
0 on success
1 on error
*/
my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
LEX_STRING *str,
LSN *max_lsn)
{
my_bool error;
ulong stored_LRD_size= 0;
uint file_hash;
char *ptr;
DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");
*max_lsn= 0;
/*
We lock the entire cache but will be quick, just reading/writing a few MBs
of memory at most.
When we enter here, we must be sure that no "first_in_switch" situation
is happening or will happen (either we have to get rid of
first_in_switch in the code or, first_in_switch has to increment a
"danger" counter for this function to know it has to wait). TODO.
*/
pagecache_pthread_mutex_lock(&pagecache->cache_lock);
/* Count how many dirty pages are interesting */
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
{
/*
Q: is there somthing subtle with block->hash_link: can it be NULL?
does it have to be == hash_link->block... ?
*/
DBUG_ASSERT(block->hash_link != NULL);
DBUG_ASSERT(block->status & BLOCK_CHANGED);
if (block->type != PAGECACHE_LSN_PAGE)
continue; /* no need to store it */
/*
In the current pagecache, rec_lsn is not set correctly:
1) it is set on pagecache_unlock(), too late (a page is dirty
(BLOCK_CHANGED) since the first pagecache_write()). So in this
scenario:
thread1: thread2:
write_REDO
pagecache_write() checkpoint : reclsn not known
pagecache_unlock(sets rec_lsn)
commit
crash,
at recovery we will wrongly skip the REDO. It also affects the
low-water mark's computation.
2) sometimes the unlocking can be an implicit action of
pagecache_write(), without any call to pagecache_unlock(), then
rec_lsn is not set.
1) and 2) are critical problems.
TODO: fix this when Monty has explained how he writes BLOB pages.
*/
if (0 == block->rec_lsn)
{
DBUG_ASSERT(0);
goto err;
}
stored_LRD_size++;
}
}
str->length= 8+(4+4+8)*stored_LRD_size;
if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
goto err;
ptr= str->str;
int8store(ptr, stored_LRD_size);
ptr+= 8;
if (0 == stored_LRD_size)
goto end;
for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
PAGECACHE_BLOCK_LINK *block;
for (block= pagecache->changed_blocks[file_hash] ;
block;
block= block->next_changed)
{
if (block->type != PAGECACHE_LSN_PAGE)
continue; /* no need to store it in the checkpoint record */
DBUG_ASSERT((4 == sizeof(block->hash_link->file.file)) &&
(4 == sizeof(block->hash_link->pageno)));
int4store(ptr, block->hash_link->file.file);
ptr+= 4;
int4store(ptr, block->hash_link->pageno);
ptr+= 4;
int8store(ptr, (ulonglong)block->rec_lsn);
ptr+= 8;
set_if_bigger(*max_lsn, block->rec_lsn);
}
}
error= 0;
goto end;
err:
error= 1;
end:
pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
DBUG_RETURN(error);
}
#ifndef DBUG_OFF
/*
Test if disk-cache is ok
......
This diff is collapsed.
......@@ -36,162 +36,57 @@
#include "least_recently_dirtied.h"
/*
MikaelR suggested removing this global_LRD_mutex (I have a paper note of
comments), however at least for the first version we'll start with this
mutex (which will be a LOCK-based atomic_rwlock).
*/
pthread_mutex_t global_LRD_mutex;
/*
When we flush a page, we should pin page.
This "pin" is to protect against that:
I make copy,
you modify in memory and flush to disk and remove from LRD and from cache,
I write copy to disk,
checkpoint happens.
result: old page is on disk, page is absent from LRD, your REDO will be
wrongly ignored.
Pin: there can be multiple pins, flushing imposes that there are zero pins.
For example, pin could be a uint counter protected by the page's latch.
Maybe it's ok if when there is a page replacement, the replacer does not
remove page from the LRD (it would save global mutex); for that, background
flusher should be prepared to see pages in the LRD which are not in the page
cache (then just ignore them). However checkpoint will contain superfluous
entries and so do more work.
*/
#define PAGE_SIZE (16*1024) /* just as an example */
/*
Optimization:
LRD flusher should not flush pages one by one: to be fast, it flushes a
group of pages in sequential disk order if possible; a group of pages is just
FLUSH_GROUP_SIZE pages.
Key cache has groupping already somehow Monty said (investigate that).
*/
#define FLUSH_GROUP_SIZE 512 /* 8 MB */
/*
We don't want to probe for checkpoint requests all the time (it takes
the log mutex).
If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s
(1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every
16*8=128MB which is every 128/30=4.2second.
Using a power of 2 gives a fast modulo operation.
*/
#define CHECKPOINT_PROBING_PERIOD_LOG2 4
/*
This thread does background flush of pieces of the LRD, and all checkpoints.
This thread does background flush of pieces of the LRD, and serves
requests for asynchronous checkpoints.
Just launch it when engine starts.
MikaelR questioned why the same thread does two different jobs, the risk
could be that while a checkpoint happens no LRD flushing happens.
For now, we only do checkpoints - no LRD flushing (to be done when the
second version of the page cache is ready WL#3077).
Reasons to delay:
- Recovery will work (just slower)
- new page cache may be different, why do then re-do
- current pagecache probably has issues with flushing when somebody is
writing to the table being flushed - better avoid that.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
uint flush_calls= 0;
while (this_thread_not_killed)
{
if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
{
/* note that we don't care of the checkpoint's success */
(void)execute_asynchronous_checkpoint_if_any();
}
lock(global_LRD_mutex);
flush_one_group_from_LRD();
safemutex_assert_not_owner(global_LRD_mutex);
sleep(5);
/*
We are a background thread, leave time for client threads or we would
monopolize the disk:
in the final version, we will not sleep but call flush_pages_from_LRD()
repeatedly. If there are no dirty pages, we'll make sure to not have a
tight loop probing for checkpoint requests.
*/
pthread_yield();
}
my_free(flush_group_buffer);
}
/* The rest of this file will not serve in first version */
/*
flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
flushes only the first pages of the LRD.
max_this_number could be FLUSH_CACHE (of mf_pagecache.c) for example.
*/
flush_one_group_from_LRD()
flush_pages_from_LRD(uint max_this_number, LSN max_this_lsn)
{
char *ptr;
safe_mutex_assert_owner(global_LRD_mutex);
for (page= 0; page<FLUSH_GROUP_SIZE; page++)
{
copy_element_to_array;
}
/*
One rule to better observe is "page must be flushed to disk before it is
removed from LRD" (otherwise checkpoint is incomplete info, corruption).
*/
unlock(global_LRD_mutex);
/* page id is concatenation of "file id" and "number of page in file" */
qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, by_page_id);
for (scan_array)
{
if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
{
/*
page disappeared since we made the copy (it was flushed to be
replaced), remove from array (memcpy tail of array over it)...
*/
continue;
}
memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
pin_page;
page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
}
for (scan_the_array)
{
/*
As an optimization, we try to identify contiguous-in-the-file segments (to
issue one big write()).
In non-optimized version, contiguous segment is always only one page.
*/
if ((next_page.page_id - this_page.page_id) == 1)
{
/*
this page and next page are in same file and are contiguous in the
file: add page to contiguous segment...
*/
continue; /* defer write() to next pages */
}
/* contiguous segment ends */
my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);
/*
note that if we had doublewrite, doublewrite buffer may prevent us from
doing this write() grouping (if doublewrite space is shorter).
*/
}
/*
Now remove pages from LRD. As we have pinned them, all pages that we
managed to pin are still in the LRD, in the same order, we can just cut
the LRD at the last element of "array". This is more efficient that
removing element by element (which would take LRD mutex many times) in the
loop above.
*/
lock(global_LRD_mutex);
/* cut LRD by bending LRD->first, free cut portion... */
unlock(global_LRD_mutex);
for (scan_array)
{
/*
if the page has a property "modified since last flush" (i.e. which is
redundant with the presence of the page in the LRD, this property can
just be a pointer to the LRD element) we should reset it
(note that then the property would live slightly longer than
the presence in LRD).
*/
page_cache_unpin(page_id);
/*
order between unpin and removal from LRD is not clear, depends on what
pin actually is.
Build a list of pages to flush:
changed_blocks[i] is roughly sorted by descending rec_lsn,
so we could do a merge sort of changed_blocks[] lists, stopping after we
have the max_this_number first elements or after we have found a page with
rec_lsn > max_this_lsn.
Then do like pagecache_flush_blocks_int() does (beware! this time we are
not alone on the file! there may be dangers! TODO: sort this out).
*/
}
free(array);
/*
MikaelR noted that he observed that Linux's file cache may never fsync to
disk until this cache is full, at which point it decides to empty the
......@@ -201,28 +96,11 @@ flush_one_group_from_LRD()
}
/*
Flushes all page from LRD up to approximately rec_lsn>=max_lsn.
This is approximate because we flush groups, and because the LRD list may
Note that when we flush all page from LRD up to rec_lsn>=max_lsn,
this is approximate because the LRD list may
not be exactly sorted by rec_lsn (because for a big row, all pages of the
row are inserted into the LRD with rec_lsn being the LSN of the REDO for the
first page, so if there are concurrent insertions, the last page of the big
row may have a smaller rec_lsn than the previous pages inserted by
concurrent inserters).
*/
int flush_all_LRD_to_lsn(LSN max_lsn)
{
lock(global_LRD_mutex);
if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
max_lsn= LRD->first->prev->rec_lsn;
while (LRD->first->rec_lsn < max_lsn)
{
if (flush_one_group_from_LRD()) /* will unlock LRD mutex */
return 1;
/*
The scheduler may preempt us here as we released the mutex; this is good.
*/
lock(global_LRD_mutex);
}
unlock(global_LRD_mutex);
return 0;
}
#include <m_string.h>
#include <pagecache.h>
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment