Commit 3d9f3eed authored by marko

branches/innodb+: Implement the buf_pool_watch for DeleteBuffering in
the page hash table. This serves two purposes: it allows multiple
watches to be set at the same time (by multiple purge threads), and it
removes a race condition when the read of a block completes at about
the same time the buffer pool watch is being set.

buf_pool_watch_clear(): Rename to buf_pool_watch_unset(). Add
parameters space, offset.

buf_pool_watch_remove(): A helper function for removing the watch.

buf_pool_watch_is(): A predicate for testing if a block descriptor is
a sentinel for the buffer pool watch.

buf_pool_watch[BUF_POOL_WATCH_SIZE]: An array of sentinel block descriptors.
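
Given that the sentinels live in this fixed array, a plausible way to
implement the predicate (a sketch only, not necessarily the committed
buf0buf.c body) is a simple address-range check:

/* Sketch only: tests whether bpage points into the sentinel array. */
ibool
buf_pool_watch_is(
/*==============*/
	const buf_page_t*	bpage)	/*!< in: block */
{
	return(bpage >= &buf_pool_watch[0]
	       && bpage < &buf_pool_watch[BUF_POOL_WATCH_SIZE]);
}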

buf_pool_watch_set(): Add a parameter for the fold value, and return
the block if the page is already in the buffer pool. Allocate the
sentinel from buf_pool_watch[] if needed. Use buf_fix_count for
reference counting.
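
Since the buf0buf.c implementation is not included in this excerpt, the
following is a minimal sketch of how buf_pool_watch_set() could behave
under the rules above (illustrative only; the committed body may
differ):

/* Sketch only: not the committed implementation. */
buf_page_t*
buf_pool_watch_set(
/*===============*/
	ulint	space,	/*!< in: space id */
	ulint	offset,	/*!< in: page number */
	ulint	fold)	/*!< in: buf_page_address_fold(space, offset) */
{
	buf_page_t*	bpage;
	ulint		i;

	ut_ad(buf_pool_mutex_own());

	bpage = buf_page_hash_get_low(space, offset, fold);

	if (UNIV_LIKELY_NULL(bpage)) {
		if (!buf_pool_watch_is(bpage)) {
			/* The page is already in the buffer pool;
			no watch is needed. */
			return(bpage);
		}

		/* Another purge thread has already set a watch on
		this page; share the sentinel via its reference count. */
		bpage->buf_fix_count++;
		return(NULL);
	}

	/* Allocate a free sentinel from buf_pool_watch[] and insert
	it into buf_pool->page_hash.  An active watch is marked by
	state == BUF_BLOCK_ZIP_PAGE with zip.data == NULL. */
	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
		bpage = &buf_pool_watch[i];

		if (bpage->state == BUF_BLOCK_POOL_WATCH) {
			bpage->state = BUF_BLOCK_ZIP_PAGE;
			bpage->space = space;
			bpage->offset = offset;
			bpage->buf_fix_count = 1;
			HASH_INSERT(buf_page_t, hash,
				    buf_pool->page_hash, fold, bpage);
			return(NULL);
		}
	}

	/* All sentinels are in use; with at most one watch per purge
	thread this should be unreachable. */
	ut_error;
	return(NULL);
}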

enum buf_page_state: Add BUF_BLOCK_POOL_WATCH as a state alias that
shares the value of BUF_BLOCK_ZIP_FREE.

buf_page_hash_get_low(): A low-level variant of buf_page_hash_get()
that takes the fold value as a parameter and may return a watch
sentinel block. In callers, test the return value with
buf_pool_watch_is() (assert impossible cases with ut_ad(); handle
possible ones with if). When needed, invoke buf_pool_watch_remove(),
but preserve the buf_fix_count.
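
For example, a caller such as buf_page_init(), where finding a watch
sentinel is a possible outcome, might follow this pattern. The snippet
is only an illustration of the convention described above: the local
variable hash_page, the surrounding block being initialized, and the
argument list of buf_pool_watch_remove() are assumptions, not the
committed code.

	buf_page_t*	hash_page
		= buf_page_hash_get_low(space, offset, fold);

	if (UNIV_LIKELY_NULL(hash_page)) {
		if (buf_pool_watch_is(hash_page)) {
			/* Preserve the watch's reference count so that
			the purge threads holding the watch keep their
			references on the real block, then remove the
			sentinel from buf_pool->page_hash. */
			block->page.buf_fix_count += hash_page->buf_fix_count;

			/* Assumed argument list for the helper. */
			buf_pool_watch_remove(fold, hash_page);
		} else {
			/* A second control block for the same page
			would be a corruption; treat it as fatal. */
			ut_error;
		}
	}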

buf_page_hash_get(), buf_block_hash_get(): Return NULL for watch
sentinel blocks, to keep existing behaviour.

buf_page_init(): Add a parameter for the fold value.

ibuf_insert(): If a buffer pool watch exists for the block, refuse to
buffer subsequent operations, so that the purge that is being buffered
will not "overtake" later requests. Previously, we would notify the
watch in this case. Either way, the block would be read to the buffer
pool. In the current design, we can only notify the watch by actually
setting up a real block in buf_pool->page_hash.

rb://263 approved by Inaam Rana
parent 227df33e
......@@ -645,11 +645,11 @@ retry_page_get:
cursor->flag = BTR_CUR_DELETE_IBUF;
} else {
/* The purge could not be buffered. */
buf_pool_watch_clear();
buf_pool_watch_unset(space, page_no);
break;
}
buf_pool_watch_clear();
buf_pool_watch_unset(space, page_no);
goto func_exit;
default:
......
......@@ -457,6 +457,8 @@ buf_buddy_relocate(
return(FALSE);
}
ut_ad(!buf_pool_watch_is(bpage));
if (page_zip_get_size(&bpage->zip) != size) {
/* The block is of different size. We would
have to relocate all blocks covered by src.
......
......@@ -1454,8 +1454,10 @@ alloc:
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
const ulint fold = buf_page_address_fold(
bpage->space, bpage->offset);
buf_page_t* hash_b = buf_page_hash_get_low(
bpage->space, bpage->offset, fold);
ut_a(!buf_page_hash_get(bpage->space, bpage->offset));
ut_a(!hash_b);
b->state = b->oldest_modification
? BUF_BLOCK_ZIP_DIRTY
......@@ -1680,6 +1682,7 @@ buf_LRU_block_remove_hashed_page(
ibool zip) /*!< in: TRUE if should remove also the
compressed page of an uncompressed page */
{
ulint fold;
const buf_page_t* hashed_bpage;
ut_ad(bpage);
ut_ad(buf_pool_mutex_own());
......@@ -1763,7 +1766,9 @@ buf_LRU_block_remove_hashed_page(
break;
}
hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset);
fold = buf_page_address_fold(bpage->space, bpage->offset);
hashed_bpage = buf_page_hash_get_low(bpage->space, bpage->offset,
fold);
if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
fprintf(stderr,
......@@ -1795,9 +1800,7 @@ buf_LRU_block_remove_hashed_page(
ut_ad(!bpage->in_zip_hash);
ut_ad(bpage->in_page_hash);
ut_d(bpage->in_page_hash = FALSE);
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash,
buf_page_address_fold(bpage->space, bpage->offset),
bpage);
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE:
ut_ad(!bpage->in_free_list);
......
......@@ -3399,15 +3399,14 @@ ibuf_insert_low(
goto function_exit;
}
/* After this point, buf_pool_watch_occurred(space, page_no)
may still become true, but we do not have to care about it,
since we are holding a latch on the insert buffer leaf page
that contains buffered changes for (space, page_no). If
buf_pool_watch_occurred(space, page_no) becomes true,
buf_page_io_complete() for (space, page_no) will have to
acquire a latch on the same insert buffer leaf page, which it
cannot do until we have buffered the IBUF_OP_DELETE and done
mtr_commit(&mtr) to release the latch. */
/* After this point, the page could still be loaded to the
buffer pool, but we do not have to care about it, since we are
holding a latch on the insert buffer leaf page that contains
buffered changes for (space, page_no). If the page enters the
buffer pool, buf_page_io_complete() for (space, page_no) will
have to acquire a latch on the same insert buffer leaf page,
which it cannot do until we have buffered the IBUF_OP_DELETE
and done mtr_commit(&mtr) to release the latch. */
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a((buffered == 0) || ibuf_count_get(space, page_no));
......@@ -3602,7 +3601,7 @@ ibuf_insert(
case IBUF_USE_INSERT:
case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL:
goto notify;
goto check_watch;
case IBUF_USE_COUNT:
break;
}
......@@ -3617,7 +3616,7 @@ ibuf_insert(
case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL:
ut_ad(!no_counter);
goto notify;
goto check_watch;
case IBUF_USE_COUNT:
break;
}
......@@ -3632,7 +3631,7 @@ ibuf_insert(
case IBUF_USE_DELETE:
case IBUF_USE_ALL:
ut_ad(!no_counter);
goto skip_notify;
goto skip_watch;
case IBUF_USE_COUNT:
break;
}
......@@ -3644,23 +3643,39 @@ ibuf_insert(
/* unknown op or use */
ut_error;
notify:
/* If another thread buffers an insert on a page while
the purge is in progress, the purge for the same page
must not be buffered, because it could remove a record
that was re-inserted later.
check_watch:
/* If a thread attempts to buffer an insert on a page while a
purge is in progress on the same page, the purge must not be
buffered, because it could remove a record that was
re-inserted later. For simplicity, we block the buffering of
all operations on a page that has a purge pending.
We do not call this in the IBUF_OP_DELETE case,
because that would always trigger the buffer pool
watch during purge and thus prevent the buffering of
delete operations. We assume that IBUF_OP_DELETE
operations are only issued by the purge thread. */
We do not check this in the IBUF_OP_DELETE case, because that
would always trigger the buffer pool watch during purge and
thus prevent the buffering of delete operations. We assume
that the issuer of IBUF_OP_DELETE has called
buf_pool_watch_set(space, page_no). */
buf_pool_mutex_enter();
buf_pool_watch_notify(space, page_no);
buf_pool_mutex_exit();
{
buf_page_t* bpage;
ulint fold = buf_page_address_fold(space, page_no);
buf_pool_mutex_enter();
bpage = buf_page_hash_get_low(space, page_no, fold);
buf_pool_mutex_exit();
if (UNIV_LIKELY_NULL(bpage)) {
/* A buffer pool watch has been set or the
page has been read into the buffer pool.
Do not buffer the request. If a purge operation
is being buffered, have this request executed
directly on the page in the buffer pool after the
buffered entries for this page have been merged. */
return(FALSE);
}
}
skip_notify:
skip_watch:
entry_size = rec_get_converted_size(index, entry, 0);
if (entry_size
......
......@@ -86,6 +86,8 @@ The enumeration values must be 0..7. */
enum buf_page_state {
BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free
compressed page */
BUF_BLOCK_POOL_WATCH = 0, /*!< a sentinel for the buffer pool
watch, element of buf_pool_watch[] */
BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
compressed page */
BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
......@@ -290,8 +292,8 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
BUF_GET_NO_LATCH, BUF_GET_NOWAIT or
BUF_GET_IF_IN_POOL_WATCH */
BUF_GET_NO_LATCH or
BUF_GET_IF_IN_POOL_OR_WATCH */
const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */
......@@ -994,6 +996,16 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */
UNIV_INLINE
buf_page_t*
buf_page_hash_get_low(
/*==================*/
ulint space, /*!< in: space id */
ulint offset, /*!< in: offset of the page within space */
ulint fold); /*!< in: buf_page_address_fold(space, offset) */
/******************************************************************//**
Returns the control block of a file page, NULL if not found.
@return block, NULL if not found or not a real control block */
UNIV_INLINE
buf_page_t*
buf_page_hash_get(
/*==============*/
ulint space, /*!< in: space id */
......@@ -1015,30 +1027,48 @@ UNIV_INTERN
ulint
buf_get_free_list_len(void);
/*=======================*/
/********************************************************************
Stop watching if the marked page is read in. */
Determine if a block is a sentinel for a buffer pool watch.
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
UNIV_INTERN
void
buf_pool_watch_clear(void);
/*======================*/
/************************************************************************
Set watch occurred flag. */
ibool
buf_pool_watch_is(
/*==============*/
const buf_page_t* bpage) /*!< in: block */
__attribute__((nonnull, warn_unused_result));
/****************************************************************//**
Add watch for the given page to be read in. Caller must have the buffer pool
mutex reserved.
@return NULL if watch set, block if the page is in the buffer pool */
UNIV_INTERN
buf_page_t*
buf_pool_watch_set(
/*===============*/
ulint space, /*!< in: space id */
ulint offset, /*!< in: page number */
ulint fold) /*!< in: buf_page_address_fold(space, offset) */
__attribute__((warn_unused_result));
/****************************************************************//**
Stop watching if the page has been read in.
buf_pool_watch_set(space,offset) must have returned NULL before. */
UNIV_INTERN
void
buf_pool_watch_notify(
/*==================*/
ulint space, /*!< in: space id of page read in */
ulint offset);/*!< in: offset of page read in */
/********************************************************************
Check if the given page is being watched and has been read to the buffer
pool.
@return TRUE if the given page is being watched and it has been read in */
buf_pool_watch_unset(
/*=================*/
ulint space, /*!< in: space id */
ulint offset);/*!< in: page number */
/****************************************************************//**
Check if the page has been read in.
This may only be called after buf_pool_watch_set(space,offset)
has returned NULL and before invoking buf_pool_watch_unset(space,offset).
@return FALSE if the given page was not read in, TRUE if it was */
UNIV_INTERN
ibool
buf_pool_watch_occurred(
/*====================*/
ulint space, /*!< in: space id */
ulint page_no); /*!< in: page number */
ulint space, /*!< in: space id */
ulint offset) /*!< in: page number */
__attribute__((warn_unused_result));
#endif /* !UNIV_HOTBACKUP */
/** The common buffer control block structure
......@@ -1079,7 +1109,10 @@ struct buf_page_struct{
#endif /* !UNIV_HOTBACKUP */
page_zip_des_t zip; /*!< compressed page; zip.data
(but not the data it points to) is
also protected by buf_pool_mutex */
also protected by buf_pool_mutex;
state == BUF_BLOCK_ZIP_PAGE and
zip.data == NULL means an active
buf_pool_watch */
#ifndef UNIV_HOTBACKUP
buf_page_t* hash; /*!< node used in chaining to
buf_pool->page_hash or
......@@ -1434,18 +1467,7 @@ struct buf_pool_struct{
set to zero when a buffer block is
allocated */
/* @} */
/** @name Buffer pool watch
This is needed for implementing delete buffering. */
/* @{ */
/*--------------------------*/
ibool watch_active; /* if TRUE, set watch_occurred
when watch_space, watch_page_no
is read in. */
ulint watch_space; /* space id of watched page */
ulint watch_page_no; /* page number of watched page */
ibool watch_occurred; /* has watched page been read in */
/*--------------------------*/
/* @} */
/** @name LRU replacement algorithm fields */
/* @{ */
......
......@@ -902,21 +902,20 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */
UNIV_INLINE
buf_page_t*
buf_page_hash_get(
/*==============*/
buf_page_hash_get_low(
/*==================*/
ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */
ulint offset, /*!< in: offset of the page within space */
ulint fold) /*!< in: buf_page_address_fold(space, offset) */
{
buf_page_t* bpage;
ulint fold;
ut_ad(buf_pool);
ut_ad(buf_pool_mutex_own());
ut_ad(fold == buf_page_address_fold(space, offset));
/* Look for the page in the hash table */
fold = buf_page_address_fold(space, offset);
HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
&& buf_page_in_file(bpage)),
......@@ -931,6 +930,26 @@ buf_page_hash_get(
return(bpage);
}
/******************************************************************//**
Returns the control block of a file page, NULL if not found.
@return block, NULL if not found or not a real control block */
UNIV_INLINE
buf_page_t*
buf_page_hash_get(
/*==============*/
ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */
{
ulint fold = buf_page_address_fold(space, offset);
buf_page_t* bpage = buf_page_hash_get_low(space, offset, fold);
if (bpage && UNIV_UNLIKELY(buf_pool_watch_is(bpage))) {
bpage = NULL;
}
return(bpage);
}
/******************************************************************//**
Returns the control block of a file page, NULL if not found
or an uncompressed page frame does not exist.
......@@ -942,7 +961,11 @@ buf_block_hash_get(
ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */
{
return(buf_page_get_block(buf_page_hash_get(space, offset)));
buf_block_t* block;
block = buf_page_get_block(buf_page_hash_get(space, offset));
return(block);
}
/********************************************************************//**
......