Commit a52c4820 authored by inaam

branches/innodb+ rb://210

Introduce a new mutex to protect flush_list.
Redesign mtr_commit() so that the log_sys mutex is not held while all
mtr_memos are popped; it is released just after the modified blocks are
inserted into the flush_list. This should reduce contention on the
log_sys mutex.

Approved by: Heikki
parent 4b34fd14
......@@ -153,12 +153,12 @@ list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would noramlly be the case.
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for read-ahead mechanism
of pages, and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the
of the LRU list, we make sure that most of the buf_pool stays in the
main memory, undisturbed.
end of the LRU list, we make sure that most of the buf_pool stays
in the main memory, undisturbed.
The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
......@@ -172,6 +172,7 @@ The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by flush_list_mutex.
The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
......@@ -981,6 +982,7 @@ buf_pool_init(void)
/* 2. Initialize flushing fields
-------------------------------- */
mutex_create(&buf_pool->flush_list_mutex, SYNC_BUF_FLUSH_LIST);
for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
buf_pool->no_flush[i] = os_event_create(NULL);
}
......@@ -1407,6 +1409,7 @@ buf_pool_page_hash_rebuild(void)
buf_page_address_fold(b->space, b->offset), b);
}
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
......@@ -1434,6 +1437,7 @@ buf_pool_page_hash_rebuild(void)
}
}
buf_flush_list_mutex_exit();
buf_pool_mutex_exit();
}
......@@ -3534,11 +3538,6 @@ buf_validate(void)
}
n_lru++;
if (block->page.oldest_modification > 0) {
n_flush++;
}
break;
case BUF_BLOCK_NOT_USED:
......@@ -3577,6 +3576,10 @@ buf_validate(void)
ut_error;
break;
}
/* It is OK to read oldest_modification here because
we have acquired buf_pool_zip_mutex above which acts
as the 'block->mutex' for these bpages. */
ut_a(!b->oldest_modification);
ut_a(buf_page_hash_get(b->space, b->offset) == b);
......@@ -3584,23 +3587,23 @@ buf_validate(void)
n_zip++;
}
/* Check dirty compressed-only blocks. */
/* Check dirty blocks. */
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
ut_a(b->oldest_modification);
n_flush++;
switch (buf_page_get_state(b)) {
case BUF_BLOCK_ZIP_DIRTY:
ut_a(b->oldest_modification);
n_lru++;
n_flush++;
n_zip++;
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
case BUF_FLUSH_LRU:
......@@ -3633,6 +3636,10 @@ buf_validate(void)
ut_a(buf_page_hash_get(b->space, b->offset) == b);
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
buf_flush_list_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
if (n_lru + n_free > buf_pool->curr_size + n_zip) {
......@@ -3649,7 +3656,6 @@ buf_validate(void)
(ulong) n_free);
ut_error;
}
ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
......@@ -3690,6 +3696,7 @@ buf_print(void)
counts = mem_alloc(sizeof(ulint) * size);
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
fprintf(stderr,
"buf_pool size %lu\n"
......@@ -3716,6 +3723,8 @@ buf_print(void)
(ulong) buf_pool->stat.n_pages_created,
(ulong) buf_pool->stat.n_pages_written);
buf_flush_list_mutex_exit();
/* Count the number of blocks belonging to each index in the buffer */
n_found = 0;
......@@ -3839,6 +3848,7 @@ buf_get_latched_pages_number(void)
}
}
buf_flush_list_mutex_enter();
for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
b = UT_LIST_GET_NEXT(list, b)) {
ut_ad(b->in_flush_list);
......@@ -3864,6 +3874,7 @@ buf_get_latched_pages_number(void)
}
}
buf_flush_list_mutex_exit();
mutex_exit(&buf_pool_zip_mutex);
buf_pool_mutex_exit();
......@@ -3896,16 +3907,13 @@ buf_get_modified_ratio_pct(void)
{
ulint ratio;
buf_pool_mutex_enter();
/* This is for heuristics. No need to grab any mutex here. */
ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
/ (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+ UT_LIST_GET_LEN(buf_pool->free));
/* 1 + is there to avoid division by zero */
buf_pool_mutex_exit();
return(ratio);
}
......@@ -3924,6 +3932,7 @@ buf_print_io(
ut_ad(buf_pool);
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
fprintf(file,
"Buffer pool size %lu\n"
......@@ -3945,6 +3954,8 @@ buf_print_io(
+ buf_pool->init_flush[BUF_FLUSH_LIST],
(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
buf_flush_list_mutex_exit();
current_time = time(NULL);
time_elapsed = 0.001 + difftime(current_time,
buf_pool->last_printout_time);
......
......@@ -102,7 +102,7 @@ buf_flush_insert_in_flush_rbt(
const ib_rbt_node_t* c_node;
const ib_rbt_node_t* p_node;
ut_ad(buf_pool_mutex_own());
ut_ad(buf_flush_list_mutex_own());
/* Insert this buffer into the rbt. */
c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
......@@ -130,7 +130,8 @@ buf_flush_delete_from_flush_rbt(
ibool ret = FALSE;
ut_ad(buf_pool_mutex_own());
ut_ad(buf_flush_list_mutex_own());
ret = rbt_delete(buf_pool->flush_rbt, &bpage);
ut_ad(ret);
}
......@@ -159,6 +160,8 @@ buf_flush_block_cmp(
ut_ad(b1 != NULL);
ut_ad(b2 != NULL);
ut_ad(buf_flush_list_mutex_own());
ut_ad(b1->in_flush_list);
ut_ad(b2->in_flush_list);
......@@ -188,12 +191,12 @@ void
buf_flush_init_flush_rbt(void)
/*==========================*/
{
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
/* Create red black tree for speedy insertions in flush list. */
buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
buf_flush_block_cmp);
buf_pool_mutex_exit();
buf_flush_list_mutex_exit();
}
/********************************************************************//**
......@@ -203,7 +206,7 @@ void
buf_flush_free_flush_rbt(void)
/*==========================*/
{
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
......@@ -212,7 +215,7 @@ buf_flush_free_flush_rbt(void)
rbt_free(buf_pool->flush_rbt);
buf_pool->flush_rbt = NULL;
buf_pool_mutex_exit();
buf_flush_list_mutex_exit();
}
/********************************************************************//**
......@@ -221,31 +224,38 @@ UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
buf_block_t* block) /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn) /*!< in: oldest modification */
{
ut_ad(buf_pool_mutex_own());
ut_ad(!buf_pool_mutex_own());
ut_ad(mutex_own(&block->mutex));
buf_flush_list_mutex_enter();
ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
|| (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
<= block->page.oldest_modification));
<= lsn));
/* If we are in the recovery then we need to update the flush
red-black tree as well. */
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
buf_flush_insert_sorted_into_flush_list(block);
buf_flush_list_mutex_exit();
buf_flush_insert_sorted_into_flush_list(block, lsn);
return;
}
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(block->page.in_LRU_list);
ut_ad(block->page.in_page_hash);
ut_ad(!block->page.in_zip_hash);
ut_ad(!block->page.in_flush_list);
ut_d(block->page.in_flush_list = TRUE);
block->page.oldest_modification = lsn;
UT_LIST_ADD_FIRST(list, buf_pool->flush_list, &block->page);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
buf_flush_list_mutex_exit();
}
/********************************************************************//**
......@@ -256,19 +266,21 @@ UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
buf_block_t* block) /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn) /*!< in: oldest modification */
{
buf_page_t* prev_b;
buf_page_t* b;
ut_ad(buf_pool_mutex_own());
ut_ad(!buf_pool_mutex_own());
ut_ad(mutex_own(&block->mutex));
ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_ad(block->page.in_LRU_list);
ut_ad(block->page.in_page_hash);
ut_ad(!block->page.in_zip_hash);
buf_flush_list_mutex_enter();
ut_ad(!block->page.in_flush_list);
ut_d(block->page.in_flush_list = TRUE);
block->page.oldest_modification = lsn;
prev_b = NULL;
......@@ -304,6 +316,8 @@ buf_flush_insert_sorted_into_flush_list(
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
buf_flush_list_mutex_exit();
}
/********************************************************************//**
......@@ -388,6 +402,8 @@ buf_flush_remove(
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
ut_ad(bpage->in_flush_list);
buf_flush_list_mutex_enter();
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
/* clean compressed pages should not be on the flush list */
......@@ -419,14 +435,24 @@ buf_flush_remove(
bpage->oldest_modification = 0;
ut_d(UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
ut_ad(ut_list_node_313->in_flush_list)));
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
buf_flush_list_mutex_exit();
}
/*******************************************************************//**
Relocates a buffer control block on the flush_list.
Note that it is assumed that the contents of bpage has already been
copied to dpage. */
copied to dpage.
IMPORTANT: When this function is called, bpage and dpage are not
exact copies of each other. For example, they will have different
::state. Also, the ::list pointers in dpage may be stale. We need to
use the current list node (bpage) to do the list manipulation because
the list pointers could have changed between the time that we copied
the contents of bpage to dpage and the flush list manipulation
below. */
UNIV_INTERN
void
buf_flush_relocate_on_flush_list(
......@@ -441,6 +467,15 @@ buf_flush_relocate_on_flush_list(
ut_ad(mutex_own(buf_page_get_mutex(bpage)));
buf_flush_list_mutex_enter();
/* FIXME: At this point we hold both the buf_pool and flush_list
mutexes. Theoretically, removal of a block from the flush list is
only covered by the flush_list mutex, but currently we also
hold the buf_pool mutex in buf_flush_remove(); therefore this block
is guaranteed to be in the flush list. We need to check whether
this will work without the assumption that the block-removing code
holds the buf_pool mutex. */
ut_ad(bpage->in_flush_list);
ut_ad(dpage->in_flush_list);
......@@ -478,6 +513,8 @@ buf_flush_relocate_on_flush_list(
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ut_a(buf_flush_validate_low());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
buf_flush_list_mutex_exit();
}
/********************************************************************//**
......@@ -938,6 +975,7 @@ buf_flush_write_block_low(
relocated in the buffer pool or removed from flush_list or
LRU_list. */
ut_ad(!buf_pool_mutex_own());
ut_ad(!buf_flush_list_mutex_own());
ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
ut_ad(bpage->oldest_modification != 0);
......@@ -1133,17 +1171,19 @@ buf_flush_try_neighbors(
ulint count = 0;
ulint i;
ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
ut_ad(flush_type == BUF_FLUSH_LRU
|| flush_type == BUF_FLUSH_LIST);
if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
/* If there is little space, it is better not to flush any
block except from the end of the LRU list */
/* If there is little space, it is better not to flush
any block except from the end of the LRU list */
low = offset;
high = offset + 1;
} else {
/* When flushed, dirty blocks are searched in neighborhoods of
this size, and flushed along with the original page. */
/* When flushed, dirty blocks are searched in
neighborhoods of this size, and flushed along with the
original page. */
ulint buf_flush_area = ut_min(BUF_READ_AHEAD_AREA,
buf_pool->curr_size / 16);
......@@ -1184,11 +1224,12 @@ buf_flush_try_neighbors(
if (buf_flush_ready_for_flush(bpage, flush_type)
&& (i == offset || !bpage->buf_fix_count)) {
/* We only try to flush those
neighbors != offset where the buf fix count is
zero, as we then know that we probably can
latch the page without a semaphore wait.
Semaphore waits are expensive because we must
flush the doublewrite buffer before we start
neighbors != offset where the buf fix
count is zero, as we then know that we
probably can latch the page without a
semaphore wait. Semaphore waits are
expensive because we must flush the
doublewrite buffer before we start
waiting. */
buf_flush_page(bpage, flush_type);
......@@ -1207,6 +1248,206 @@ buf_flush_try_neighbors(
return(count);
}
/********************************************************************//**
Check if the block is modified and ready for flushing. If the block
is ready to flush then flush the page and try to flush its neighbors.
The caller must hold the buf_pool mutex; it is held again on return.
@return TRUE if the buf_pool mutex was released (and reacquired) during
this function, i.e. if a flush of the page and its neighbors was
attempted; FALSE if the page was not ready and the mutex was held
throughout. TRUE does not guarantee that any pages were actually
written. The number of pages written is added to the count. */
static
ibool
buf_flush_page_and_try_neighbors(
/*=============================*/
buf_page_t* bpage, /*!< in: buffer control block,
must be
buf_page_in_file(bpage) */
enum buf_flush flush_type, /*!< in: BUF_FLUSH_LRU
or BUF_FLUSH_LIST */
ulint* count) /*!< in/out: number of pages
flushed */
{
ibool flushed = FALSE;
mutex_t* block_mutex;
ut_ad(buf_pool_mutex_own());
/* The readiness check below is done under the block mutex. */
block_mutex = buf_page_get_mutex(bpage);
mutex_enter(block_mutex);
ut_a(buf_page_in_file(bpage));
if (buf_flush_ready_for_flush(bpage, flush_type)) {
ulint space;
ulint offset;
buf_pool_mutex_exit();
/* These fields are protected by both the
buffer pool mutex and block mutex. The block mutex
is still held at this point. */
space = buf_page_get_space(bpage);
offset = buf_page_get_page_no(bpage);
mutex_exit(block_mutex);
/* Try to flush also all the neighbors. Neither the
buffer pool mutex nor the block mutex is held by this
function during the call. */
*count += buf_flush_try_neighbors(space, offset,
flush_type);
buf_pool_mutex_enter();
flushed = TRUE;
} else {
/* Not ready for flushing: the buffer pool mutex was never
released on this path. */
mutex_exit(block_mutex);
}
ut_ad(buf_pool_mutex_own());
return(flushed);
}
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list.
In the case of an LRU flush the calling thread may own latches to
pages: to avoid deadlocks, this function must be written so that it
cannot end up waiting for these latches!
The caller must hold the buf_pool mutex (asserted on entry and on exit).
@return number of blocks for which the write request was queued. */
static
ulint
buf_flush_LRU_list_batch(
/*=====================*/
ulint max) /*!< in: max of blocks to flush */
{
buf_page_t* bpage;
ulint count = 0;
ut_ad(buf_pool_mutex_own());
do {
/* Start from the end of the list looking for a
suitable block to be flushed. */
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
/* Iterate backwards over the LRU list until
buf_flush_page_and_try_neighbors() returns TRUE for a
page or the start of the list is reached (bpage == NULL). */
while (bpage != NULL
&& !buf_flush_page_and_try_neighbors(
bpage, BUF_FLUSH_LRU, &count)) {
bpage = UT_LIST_GET_PREV(LRU, bpage);
}
/* A non-NULL bpage here means the inner loop stopped on a TRUE
return; the scan then restarts from the end of the LRU list
unless count has reached max. */
} while (bpage != NULL && count < max);
/* We keep track of all flushes happening as part of LRU
flush. When estimating the desired rate at which flush_list
should be flushed, we factor in this value. */
buf_lru_flush_page_count += count;
ut_ad(buf_pool_mutex_own());
return(count);
}
/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush_list.
The calling thread is not allowed to own any latches on pages!
The caller must hold the buf_pool mutex (asserted on entry and on exit).
@return number of blocks for which the write request was queued.
NOTE(review): nothing in this function returns ULINT_UNDEFINED;
presumably the check for an already running flush of the same type is
left to the caller -- confirm. */
static
ulint
buf_flush_flush_list_batch(
/*=======================*/
ulint min_n, /*!< in: wished minimum number
of blocks flushed (it is not
guaranteed that the actual
number is that big, though) */
ib_uint64_t lsn_limit) /*!< in: all blocks whose
oldest_modification is smaller
than this should be flushed (if
their number does not exceed
min_n) */
{
ulint len;
buf_page_t* bpage;
ulint count = 0;
ut_ad(buf_pool_mutex_own());
/* Repeat until we have flushed enough (count >= min_n), the
list is exhausted, or the traversal limit (len) is used up. */
do {
/* Start from the end of the list looking for a suitable
block to be flushed. */
buf_flush_list_mutex_enter();
/* We use len here because theoretically insertions can
happen in the flush_list below while we are traversing
it for a suitable candidate for flushing. We'd like to
set a limit on how far we are willing to traverse
the list. */
len = UT_LIST_GET_LEN(buf_pool->flush_list);
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
if (bpage) {
ut_a(bpage->oldest_modification > 0);
}
if (!bpage || bpage->oldest_modification >= lsn_limit) {
/* We have flushed enough */
buf_flush_list_mutex_exit();
break;
}
ut_a(bpage->oldest_modification > 0);
ut_ad(bpage->in_flush_list);
buf_flush_list_mutex_exit();
/* The list may change during the flushing and we cannot
safely preserve within this function a pointer to a
block in the list! */
while (bpage != NULL
&& len > 0
&& !buf_flush_page_and_try_neighbors(
bpage, BUF_FLUSH_LIST, &count)) {
buf_flush_list_mutex_enter();
/* If we are here that means that buf_pool
mutex was not released in
buf_flush_page_and_try_neighbors() above and
this guarantees that bpage didn't get
relocated since we released the flush_list
mutex above. There is a chance, however, that
the bpage got removed from flush_list (not
currently possible because buf_flush_remove()
also obtains buf_pool mutex but that may change
in future). To avoid this scenario we check
the oldest_modification and if it is zero
we start all over again. */
if (bpage->oldest_modification == 0) {
buf_flush_list_mutex_exit();
break;
}
/* Step towards the start of the flush_list while
holding the flush_list mutex. */
bpage = UT_LIST_GET_PREV(list, bpage);
ut_ad(!bpage || bpage->in_flush_list);
buf_flush_list_mutex_exit();
--len;
}
} while (count < min_n && bpage != NULL && len > 0);
ut_ad(buf_pool_mutex_own());
return(count);
}
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list or flush_list.
NOTE 1: in the case of an LRU flush the calling thread may own latches to
......@@ -1232,22 +1473,18 @@ buf_flush_batch(
(if their number does not exceed
min_n), otherwise ignored */
{
buf_page_t* bpage;
ulint page_count = 0;
ulint old_page_count;
ulint space;
ulint offset;
ulint count = 0;
ut_ad((flush_type == BUF_FLUSH_LRU)
|| (flush_type == BUF_FLUSH_LIST));
ut_ad(flush_type == BUF_FLUSH_LRU
|| flush_type == BUF_FLUSH_LIST);
#ifdef UNIV_SYNC_DEBUG
ut_ad((flush_type != BUF_FLUSH_LIST)
|| sync_thread_levels_empty_gen(TRUE));
#endif /* UNIV_SYNC_DEBUG */
buf_pool_mutex_enter();
if ((buf_pool->n_flush[flush_type] > 0)
|| (buf_pool->init_flush[flush_type] == TRUE)) {
if (buf_pool->n_flush[flush_type] > 0
|| buf_pool->init_flush[flush_type] == TRUE) {
/* There is already a flush batch of the same type running */
......@@ -1258,82 +1495,21 @@ buf_flush_batch(
buf_pool->init_flush[flush_type] = TRUE;
for (;;) {
flush_next:
/* If we have flushed enough, leave the loop */
if (page_count >= min_n) {
break;
}
/* Start from the end of the list looking for a suitable
block to be flushed. */
if (flush_type == BUF_FLUSH_LRU) {
bpage = UT_LIST_GET_LAST(buf_pool->LRU);
} else {
ut_ad(flush_type == BUF_FLUSH_LIST);
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
if (!bpage
|| bpage->oldest_modification >= lsn_limit) {
/* We have flushed enough */
break;
}
ut_ad(bpage->in_flush_list);
}
/* Note that after finding a single flushable page, we try to
flush also all its neighbors, and after that start from the
END of the LRU list or flush list again: the list may change
during the flushing and we cannot safely preserve within this
function a pointer to a block in the list! */
do {
mutex_t*block_mutex = buf_page_get_mutex(bpage);
ibool ready;
ut_a(buf_page_in_file(bpage));
mutex_enter(block_mutex);
ready = buf_flush_ready_for_flush(bpage, flush_type);
mutex_exit(block_mutex);
if (ready) {
space = buf_page_get_space(bpage);
offset = buf_page_get_page_no(bpage);
buf_pool_mutex_exit();
old_page_count = page_count;
/* Try to flush also all the neighbors */
page_count += buf_flush_try_neighbors(
space, offset, flush_type);
/* fprintf(stderr,
"Flush type %lu, page no %lu, neighb %lu\n",
flush_type, offset,
page_count - old_page_count); */
buf_pool_mutex_enter();
goto flush_next;
} else if (flush_type == BUF_FLUSH_LRU) {
bpage = UT_LIST_GET_PREV(LRU, bpage);
} else {
ut_ad(flush_type == BUF_FLUSH_LIST);
bpage = UT_LIST_GET_PREV(list, bpage);
ut_ad(!bpage || bpage->in_flush_list);
}
} while (bpage != NULL);
/* If we could not find anything to flush, leave the loop */
/* Note: The buffer pool mutex is released and reacquired within
the flush functions. */
switch(flush_type) {
case BUF_FLUSH_LRU:
count = buf_flush_LRU_list_batch(min_n);
break;
case BUF_FLUSH_LIST:
count = buf_flush_flush_list_batch(min_n, lsn_limit);
break;
default:
ut_error;
}
ut_ad(buf_pool_mutex_own());
buf_pool->init_flush[flush_type] = FALSE;
if (buf_pool->n_flush[flush_type] == 0) {
......@@ -1348,26 +1524,17 @@ buf_flush_batch(
buf_flush_buffered_writes();
#ifdef UNIV_DEBUG
if (buf_debug_prints && page_count > 0) {
ut_a(flush_type == BUF_FLUSH_LRU
|| flush_type == BUF_FLUSH_LIST);
if (buf_debug_prints && count > 0) {
fprintf(stderr, flush_type == BUF_FLUSH_LRU
? "Flushed %lu pages in LRU flush\n"
: "Flushed %lu pages in flush list flush\n",
(ulong) page_count);
(ulong) count);
}
#endif /* UNIV_DEBUG */
srv_buf_pool_flushed += page_count;
/* We keep track of all flushes happening as part of LRU
flush. When estimating the desired rate at which flush_list
should be flushed we factor in this value. */
if (flush_type == BUF_FLUSH_LRU) {
buf_lru_flush_page_count += page_count;
}
srv_buf_pool_flushed += count;
return(page_count);
return(count);
}
/******************************************************************//**
......@@ -1585,6 +1752,8 @@ buf_flush_validate_low(void)
buf_page_t* bpage;
const ib_rbt_node_t* rnode = NULL;
ut_ad(buf_flush_list_mutex_own());
UT_LIST_VALIDATE(list, buf_page_t, buf_pool->flush_list,
ut_ad(ut_list_node_313->in_flush_list));
......@@ -1600,7 +1769,16 @@ buf_flush_validate_low(void)
while (bpage != NULL) {
const ib_uint64_t om = bpage->oldest_modification;
ut_ad(bpage->in_flush_list);
ut_a(buf_page_in_file(bpage));
/* A page in flush_list can be in BUF_BLOCK_REMOVE_HASH
state. This happens when a page is in the middle of
being relocated. In that case the original descriptor
can have this state and still be in the flush list
waiting to acquire the flush_list_mutex to complete
the relocation. */
ut_a(buf_page_in_file(bpage)
|| buf_page_get_state(bpage)
== BUF_BLOCK_REMOVE_HASH);
ut_a(om > 0);
if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
......@@ -1634,11 +1812,11 @@ buf_flush_validate(void)
{
ibool ret;
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
ret = buf_flush_validate_low();
buf_pool_mutex_exit();
buf_flush_list_mutex_exit();
return(ret);
}
......
......@@ -2018,6 +2018,7 @@ buf_LRU_print(void)
while (bpage != NULL) {
mutex_enter(buf_page_get_mutex(bpage));
fprintf(stderr, "BLOCK space %lu page %lu ",
(ulong) buf_page_get_space(bpage),
(ulong) buf_page_get_page_no(bpage));
......@@ -2066,6 +2067,7 @@ buf_LRU_print(void)
break;
}
mutex_exit(buf_page_get_mutex(bpage));
bpage = UT_LIST_GET_NEXT(LRU, bpage);
}
......
......@@ -347,9 +347,8 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
ulint rw_latch); /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
mtr_t* mtr); /*!< in: mtr */
/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
......@@ -1102,8 +1101,9 @@ struct buf_page_struct{
UT_LIST_NODE_T(buf_page_t) list;
/*!< based on state, this is a
list node, protected only by
buf_pool_mutex, in one of the
list node, protected either by
buf_pool_mutex or by
flush_list_mutex, in one of the
following lists in buf_pool:
- BUF_BLOCK_NOT_USED: free
......@@ -1112,6 +1112,12 @@ struct buf_page_struct{
- BUF_BLOCK_ZIP_PAGE: zip_clean
- BUF_BLOCK_ZIP_FREE: zip_free[]
If bpage is part of flush_list
then the node pointers are
covered by flush_list_mutex.
Otherwise these pointers are
protected by buf_pool_mutex.
The contents of the list node
is undefined if !in_flush_list
&& state == BUF_BLOCK_FILE_PAGE,
......@@ -1122,10 +1128,15 @@ struct buf_page_struct{
#ifdef UNIV_DEBUG
ibool in_flush_list; /*!< TRUE if in buf_pool->flush_list;
when buf_pool_mutex is free, the
when flush_list_mutex is free, the
following should hold: in_flush_list
== (state == BUF_BLOCK_FILE_PAGE
|| state == BUF_BLOCK_ZIP_DIRTY) */
|| state == BUF_BLOCK_ZIP_DIRTY)
Writes to this field must be
covered by both block->mutex
and flush_list_mutex. Hence
reads can happen while holding
any one of the two mutexes */
ibool in_free_list; /*!< TRUE if in buf_pool->free; when
buf_pool_mutex is free, the following
should hold: in_free_list
......@@ -1135,7 +1146,8 @@ struct buf_page_struct{
/*!< log sequence number of
the youngest modification to
this block, zero if not
modified */
modified. Protected by block
mutex */
ib_uint64_t oldest_modification;
/*!< log sequence number of
the START of the log entry
......@@ -1143,7 +1155,12 @@ struct buf_page_struct{
modification to this block
which has not yet been flushed
on disk; zero if all
modifications are on disk */
modifications are on disk.
Writes to this field must be
covered by both block->mutex
and flush_list_mutex. Hence
reads can happen while holding
any one of the two mutexes */
/* @} */
/** @name LRU replacement algorithm fields
These fields are protected by buf_pool_mutex only (not
......@@ -1375,6 +1392,13 @@ struct buf_pool_struct{
/* @{ */
mutex_t flush_list_mutex;/*!< mutex protecting the
flush list access. This mutex
protects flush_list, flush_rbt
and bpage::list pointers when
the bpage is on flush_list. It
also protects writes to
bpage::oldest_modification */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
/*!< base node of the modified block
list */
......@@ -1400,7 +1424,8 @@ struct buf_pool_struct{
also be on the flush_list.
This tree is relevant only in
recovery and is set to NULL
once the recovery is over. */
once the recovery is over.
Protected by flush_list_mutex */
ulint freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
......@@ -1492,6 +1517,18 @@ Use these instead of accessing buf_pool_mutex directly. */
mutex_enter(&buf_pool_mutex); \
} while (0)
/** Test if the flush list mutex (buf_pool->flush_list_mutex) is owned.
Expands to a mutex_own() expression, so it can be used inside
assertions such as ut_ad(). */
#define buf_flush_list_mutex_own() mutex_own(&buf_pool->flush_list_mutex)
/** Acquire the flush list mutex (buf_pool->flush_list_mutex).
Expands to a single statement. */
#define buf_flush_list_mutex_enter() do { \
mutex_enter(&buf_pool->flush_list_mutex); \
} while (0)
/** Release the flush list mutex (buf_pool->flush_list_mutex).
Expands to a single statement. */
# define buf_flush_list_mutex_exit() do { \
mutex_exit(&buf_pool->flush_list_mutex); \
} while (0)
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/** Flag to forbid the release of the buffer pool mutex.
Protected by buf_pool_mutex. */
......
......@@ -121,7 +121,7 @@ buf_pool_get_oldest_modification(void)
buf_page_t* bpage;
ib_uint64_t lsn;
buf_pool_mutex_enter();
buf_flush_list_mutex_enter();
bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
......@@ -132,7 +132,7 @@ buf_pool_get_oldest_modification(void)
lsn = bpage->oldest_modification;
}
buf_pool_mutex_exit();
buf_flush_list_mutex_exit();
/* The returned answer may be out of date: the flush_list can
change after the mutex has been released. */
......@@ -1018,21 +1018,14 @@ void
buf_page_release(
/*=============*/
buf_block_t* block, /*!< in: buffer block */
ulint rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH,
ulint rw_latch) /*!< in: RW_S_LATCH, RW_X_LATCH,
RW_NO_LATCH */
mtr_t* mtr) /*!< in: mtr */
{
ut_ad(block);
ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
ut_a(block->page.buf_fix_count > 0);
if (rw_latch == RW_X_LATCH && mtr->modifications) {
buf_pool_mutex_enter();
buf_flush_note_modification(block, mtr);
buf_pool_mutex_exit();
}
mutex_enter(&block->mutex);
#ifdef UNIV_SYNC_DEBUG
......
......@@ -33,7 +33,8 @@ UNIV_INTERN
void
buf_flush_insert_into_flush_list(
/*=============================*/
buf_block_t* block); /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
Inserts a modified block into the flush list in the right sorted position.
This function is used by recovery, because there the modifications do not
......@@ -42,7 +43,8 @@ UNIV_INTERN
void
buf_flush_insert_sorted_into_flush_list(
/*====================================*/
buf_block_t* block); /*!< in/out: block which is modified */
buf_block_t* block, /*!< in/out: block which is modified */
ib_uint64_t lsn); /*!< in: oldest modification */
/********************************************************************//**
This function should be called at a mini-transaction commit, if a page was
......@@ -61,24 +63,26 @@ buf_flush_note_modification(
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(buf_pool_mutex_own());
ut_ad(!buf_pool_mutex_own());
ut_ad(!buf_flush_list_mutex_own());
ut_ad(mtr->start_lsn != 0);
ut_ad(mtr->modifications);
mutex_enter(&block->mutex);
ut_ad(block->page.newest_modification <= mtr->end_lsn);
block->page.newest_modification = mtr->end_lsn;
if (!block->page.oldest_modification) {
block->page.oldest_modification = mtr->start_lsn;
ut_ad(block->page.oldest_modification != 0);
buf_flush_insert_into_flush_list(block);
buf_flush_insert_into_flush_list(block, mtr->start_lsn);
} else {
ut_ad(block->page.oldest_modification <= mtr->start_lsn);
}
mutex_exit(&block->mutex);
++srv_buf_pool_write_requests;
}
......@@ -101,23 +105,22 @@ buf_flush_recv_note_modification(
ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
buf_pool_mutex_enter();
ut_ad(!buf_pool_mutex_own());
ut_ad(!buf_flush_list_mutex_own());
ut_ad(start_lsn != 0);
ut_ad(block->page.newest_modification <= end_lsn);
mutex_enter(&block->mutex);
block->page.newest_modification = end_lsn;
if (!block->page.oldest_modification) {
block->page.oldest_modification = start_lsn;
ut_ad(block->page.oldest_modification != 0);
buf_flush_insert_sorted_into_flush_list(block);
buf_flush_insert_sorted_into_flush_list(block, start_lsn);
} else {
ut_ad(block->page.oldest_modification <= start_lsn);
}
buf_pool_mutex_exit();
mutex_exit(&block->mutex);
}
#endif /* !UNIV_HOTBACKUP */
......@@ -475,8 +475,9 @@ or row lock! */
SYNC_SEARCH_SYS, as memory allocation
can call routines there! Otherwise
the level is SYNC_MEM_HASH. */
#define SYNC_BUF_POOL 150
#define SYNC_BUF_BLOCK 149
#define SYNC_BUF_POOL 150 /* Buffer pool mutex */
#define SYNC_BUF_BLOCK 149 /* Block mutex */
#define SYNC_BUF_FLUSH_LIST 145 /* Buffer flush list mutex */
#define SYNC_DOUBLEWRITE 140
#define SYNC_ANY_LATCH 135
#define SYNC_THR_LOCAL 133
......
......@@ -30,6 +30,7 @@ Created 11/26/1995 Heikki Tuuri
#endif
#include "buf0buf.h"
#include "buf0flu.h"
#include "page0types.h"
#include "mtr0log.h"
#include "log0log.h"
......@@ -38,7 +39,7 @@ Created 11/26/1995 Heikki Tuuri
# include "log0recv.h"
/*****************************************************************//**
Releases the item in the slot given. */
UNIV_INLINE
static
void
mtr_memo_slot_release(
/*==================*/
......@@ -48,14 +49,19 @@ mtr_memo_slot_release(
void* object;
ulint type;
ut_ad(mtr && slot);
ut_ad(mtr);
ut_ad(slot);
#ifndef UNIV_DEBUG
UT_NOT_USED(mtr);
#endif /* UNIV_DEBUG */
object = slot->object;
type = slot->type;
if (UNIV_LIKELY(object != NULL)) {
if (type <= MTR_MEMO_BUF_FIX) {
buf_page_release((buf_block_t*)object, type, mtr);
buf_page_release((buf_block_t*)object, type);
} else if (type == MTR_MEMO_S_LOCK) {
rw_lock_s_unlock((rw_lock_t*)object);
#ifdef UNIV_DEBUG
......@@ -73,13 +79,10 @@ mtr_memo_slot_release(
}
/**********************************************************//**
Releases the mlocks and other objects stored in an mtr memo. They are released
in the order opposite to which they were pushed to the memo. NOTE! It is
essential that the x-rw-lock on a modified buffer page is not released before
buf_page_note_modification is called for that page! Otherwise, some thread
might race to modify it, and the flush list sort order on lsn would be
destroyed. */
UNIV_INLINE
Releases the mlocks and other objects stored in an mtr memo.
They are released in the order opposite to which they were pushed
to the memo. */
static
void
mtr_memo_pop_all(
/*=============*/
......@@ -105,6 +108,58 @@ mtr_memo_pop_all(
}
}
/*****************************************************************//**
Notes the modification of the page held in the given memo slot.
If the slot is non-empty and its type is MTR_MEMO_PAGE_X_FIX, the
object stored in the slot is passed, as a buffer block, to
buf_flush_note_modification(). Slots that are empty or of any other
type are left untouched. The slot itself is not released here. */
static
void
mtr_memo_slot_note_modification(
/*============================*/
	mtr_t*			mtr,	/*!< in: mtr; must have modifications */
	mtr_memo_slot_t*	slot)	/*!< in: memo slot */
{
	ut_ad(mtr);
	ut_ad(mtr->magic_n == MTR_MAGIC_N);
	ut_ad(mtr->modifications);

	/* Nothing to note for an empty slot. */
	if (slot->object == NULL) {
		return;
	}

	/* Only slots of type MTR_MEMO_PAGE_X_FIX are reported. */
	if (slot->type != MTR_MEMO_PAGE_X_FIX) {
		return;
	}

	buf_flush_note_modification((buf_block_t*) slot->object, mtr);
}
/**********************************************************//**
Notes the modifications of all pages stored in an mtr memo. The memo
slots are visited in the order opposite to which they were pushed to
the memo, and each slot is handed to mtr_memo_slot_note_modification().
Nothing is released by this function. NOTE! It is essential that the
x-rw-lock on a modified buffer page is not released before
buf_flush_note_modification is called for that page! Otherwise, some
thread might race to modify it, and the flush list sort order on lsn
would be destroyed. */
static
void
mtr_memo_note_modifications(
/*========================*/
	mtr_t*	mtr)	/*!< in: mtr */
{
	dyn_array_t*	memo_array;
	ulint		pos;

	ut_ad(mtr);
	ut_ad(mtr->magic_n == MTR_MAGIC_N);
	/* Currently only used in commit */
	ut_ad(mtr->state == MTR_COMMITTING);

	memo_array = &mtr->memo;

	/* Start just past the last slot and step backwards one slot at a
	time until the beginning of the memo is reached. */
	for (pos = dyn_array_get_data_size(memo_array); pos > 0; ) {
		mtr_memo_slot_t*	cur_slot;

		pos -= sizeof(mtr_memo_slot_t);
		cur_slot = dyn_array_get_element(memo_array, pos);

		mtr_memo_slot_note_modification(mtr, cur_slot);
	}
}
/************************************************************//**
Writes the contents of a mini-transaction log, if any, to the database log. */
static
......@@ -137,7 +192,9 @@ mtr_log_reserve_and_write(
&mtr->start_lsn);
if (mtr->end_lsn) {
return;
/* Success. We have the log mutex.
Add pages to flush list and exit */
goto func_exit;
}
}
......@@ -161,6 +218,13 @@ mtr_log_reserve_and_write(
}
mtr->end_lsn = log_close();
func_exit:
if (mtr->modifications) {
mtr_memo_note_modifications(mtr);
}
log_release();
}
#endif /* !UNIV_HOTBACKUP */
......@@ -172,10 +236,6 @@ mtr_commit(
/*=======*/
mtr_t* mtr) /*!< in: mini-transaction */
{
#ifndef UNIV_HOTBACKUP
ibool write_log;
#endif /* !UNIV_HOTBACKUP */
ut_ad(mtr);
ut_ad(mtr->magic_n == MTR_MAGIC_N);
ut_ad(mtr->state == MTR_ACTIVE);
......@@ -184,25 +244,12 @@ mtr_commit(
#ifndef UNIV_HOTBACKUP
/* This is a dirty read, for debugging. */
ut_ad(!recv_no_log_write);
write_log = mtr->modifications && mtr->n_log_recs;
if (write_log) {
if (mtr->modifications && mtr->n_log_recs) {
mtr_log_reserve_and_write(mtr);
}
/* We first update the modification info to buffer pages, and only
after that release the log mutex: this guarantees that when the log
mutex is free, all buffer pages contain an up-to-date info of their
modifications. This fact is used in making a checkpoint when we look
at the oldest modification of any page in the buffer pool. It is also
required when we insert modified buffer pages in to the flush list
which must be sorted on oldest_modification. */
mtr_memo_pop_all(mtr);
if (write_log) {
log_release();
}
#endif /* !UNIV_HOTBACKUP */
ut_d(mtr->state = MTR_COMMITTED);
......@@ -241,6 +288,10 @@ mtr_rollback_to_savepoint(
slot = dyn_array_get_element(memo, offset);
ut_ad(slot->type != MTR_MEMO_MODIFY);
/* We do not call mtr_memo_slot_note_modification()
because there MUST be no changes made to the buffer
pages after the savepoint */
mtr_memo_slot_release(mtr, slot);
}
}
......@@ -272,7 +323,10 @@ mtr_memo_release(
slot = dyn_array_get_element(memo, offset);
if ((object == slot->object) && (type == slot->type)) {
if (object == slot->object && type == slot->type) {
if (mtr->modifications) {
mtr_memo_slot_note_modification(mtr, slot);
}
mtr_memo_slot_release(mtr, slot);
......
......@@ -1092,6 +1092,7 @@ sync_thread_add_level(
case SYNC_TRX_SYS_HEADER:
case SYNC_FILE_FORMAT_TAG:
case SYNC_DOUBLEWRITE:
case SYNC_BUF_FLUSH_LIST:
case SYNC_BUF_POOL:
case SYNC_SEARCH_SYS:
case SYNC_SEARCH_SYS_CONF:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment