Commit 3d9f3eed authored by marko's avatar marko

branches/innodb+: Implement the buf_pool_watch for DeleteBuffering in

the page hash table. This serves two purposes. It allows multiple
watches to be set at the same time (by multiple purge threads) and it
removes a race condition when the read of a block completes about the
time the buffer pool watch is being set.

buf_pool_watch_clear(): Rename to buf_pool_watch_unset(). Add
parameters space, offset.

buf_pool_watch_remove(): A helper function for removing the watch.

buf_pool_watch_is(): A predicate for testing if a block descriptor is
a sentinel for the buffer pool watch.

buf_pool_watch[BUF_POOL_WATCH_SIZE]: An array of sentinel block descriptors.

buf_pool_watch_set(): Add a parameter for the fold value, and return
the block if the block is in the buffer pool. Allocate the sentinel
from buf_pool_watch[] if needed. Use buf_fix_count for
reference-counting.

enum buf_block_state: Add BUF_BLOCK_POOL_WATCH as a state alias that
is shared with BUF_BLOCK_ZIP_FREE.

buf_page_hash_get_low(): A low-level variant of buf_page_hash_get()
that takes the fold value as a parameter and may return a watch
sentinel block. In callers, test the return value for
buf_pool_watch_is() [impossible cases with ut_ad(), possible ones with if].
When needed, invoke buf_pool_watch_remove() but preserve the buf_fix_count.

buf_page_hash_get(), buf_block_hash_get(): Return NULL for watch
sentinel blocks, to keep existing behaviour.

buf_page_init(): Add a parameter for the fold value.

ibuf_insert(): If a buffer pool watch exists for the block, refuse to
buffer subsequent operations, so that the purge that is being buffered
will not "overtake" later requests. Previously, we would notify the
watch in this case. Either way, the block would be read to the buffer
pool. In the current design, we can only notify the watch by actually
setting up a real block in buf_pool->page_hash.

rb://263 approved by Inaam Rana
parent 227df33e
...@@ -645,11 +645,11 @@ retry_page_get: ...@@ -645,11 +645,11 @@ retry_page_get:
cursor->flag = BTR_CUR_DELETE_IBUF; cursor->flag = BTR_CUR_DELETE_IBUF;
} else { } else {
/* The purge could not be buffered. */ /* The purge could not be buffered. */
buf_pool_watch_clear(); buf_pool_watch_unset(space, page_no);
break; break;
} }
buf_pool_watch_clear(); buf_pool_watch_unset(space, page_no);
goto func_exit; goto func_exit;
default: default:
......
...@@ -457,6 +457,8 @@ buf_buddy_relocate( ...@@ -457,6 +457,8 @@ buf_buddy_relocate(
return(FALSE); return(FALSE);
} }
ut_ad(!buf_pool_watch_is(bpage));
if (page_zip_get_size(&bpage->zip) != size) { if (page_zip_get_size(&bpage->zip) != size) {
/* The block is of different size. We would /* The block is of different size. We would
have to relocate all blocks covered by src. have to relocate all blocks covered by src.
......
This diff is collapsed.
...@@ -1454,8 +1454,10 @@ alloc: ...@@ -1454,8 +1454,10 @@ alloc:
buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
const ulint fold = buf_page_address_fold( const ulint fold = buf_page_address_fold(
bpage->space, bpage->offset); bpage->space, bpage->offset);
buf_page_t* hash_b = buf_page_hash_get_low(
bpage->space, bpage->offset, fold);
ut_a(!buf_page_hash_get(bpage->space, bpage->offset)); ut_a(!hash_b);
b->state = b->oldest_modification b->state = b->oldest_modification
? BUF_BLOCK_ZIP_DIRTY ? BUF_BLOCK_ZIP_DIRTY
...@@ -1680,6 +1682,7 @@ buf_LRU_block_remove_hashed_page( ...@@ -1680,6 +1682,7 @@ buf_LRU_block_remove_hashed_page(
ibool zip) /*!< in: TRUE if should remove also the ibool zip) /*!< in: TRUE if should remove also the
compressed page of an uncompressed page */ compressed page of an uncompressed page */
{ {
ulint fold;
const buf_page_t* hashed_bpage; const buf_page_t* hashed_bpage;
ut_ad(bpage); ut_ad(bpage);
ut_ad(buf_pool_mutex_own()); ut_ad(buf_pool_mutex_own());
...@@ -1763,7 +1766,9 @@ buf_LRU_block_remove_hashed_page( ...@@ -1763,7 +1766,9 @@ buf_LRU_block_remove_hashed_page(
break; break;
} }
hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset); fold = buf_page_address_fold(bpage->space, bpage->offset);
hashed_bpage = buf_page_hash_get_low(bpage->space, bpage->offset,
fold);
if (UNIV_UNLIKELY(bpage != hashed_bpage)) { if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
fprintf(stderr, fprintf(stderr,
...@@ -1795,9 +1800,7 @@ buf_LRU_block_remove_hashed_page( ...@@ -1795,9 +1800,7 @@ buf_LRU_block_remove_hashed_page(
ut_ad(!bpage->in_zip_hash); ut_ad(!bpage->in_zip_hash);
ut_ad(bpage->in_page_hash); ut_ad(bpage->in_page_hash);
ut_d(bpage->in_page_hash = FALSE); ut_d(bpage->in_page_hash = FALSE);
HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
buf_page_address_fold(bpage->space, bpage->offset),
bpage);
switch (buf_page_get_state(bpage)) { switch (buf_page_get_state(bpage)) {
case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_ZIP_PAGE:
ut_ad(!bpage->in_free_list); ut_ad(!bpage->in_free_list);
......
...@@ -3399,15 +3399,14 @@ ibuf_insert_low( ...@@ -3399,15 +3399,14 @@ ibuf_insert_low(
goto function_exit; goto function_exit;
} }
/* After this point, buf_pool_watch_occurred(space, page_no) /* After this point, the page could still be loaded to the
may still become true, but we do not have to care about it, buffer pool, but we do not have to care about it, since we are
since we are holding a latch on the insert buffer leaf page holding a latch on the insert buffer leaf page that contains
that contains buffered changes for (space, page_no). If buffered changes for (space, page_no). If the page enters the
buf_pool_watch_occurred(space, page_no) becomes true, buffer pool, buf_page_io_complete() for (space, page_no) will
buf_page_io_complete() for (space, page_no) will have to have to acquire a latch on the same insert buffer leaf page,
acquire a latch on the same insert buffer leaf page, which it which it cannot do until we have buffered the IBUF_OP_DELETE
cannot do until we have buffered the IBUF_OP_DELETE and done and done mtr_commit(&mtr) to release the latch. */
mtr_commit(&mtr) to release the latch. */
#ifdef UNIV_IBUF_COUNT_DEBUG #ifdef UNIV_IBUF_COUNT_DEBUG
ut_a((buffered == 0) || ibuf_count_get(space, page_no)); ut_a((buffered == 0) || ibuf_count_get(space, page_no));
...@@ -3602,7 +3601,7 @@ ibuf_insert( ...@@ -3602,7 +3601,7 @@ ibuf_insert(
case IBUF_USE_INSERT: case IBUF_USE_INSERT:
case IBUF_USE_INSERT_DELETE_MARK: case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL: case IBUF_USE_ALL:
goto notify; goto check_watch;
case IBUF_USE_COUNT: case IBUF_USE_COUNT:
break; break;
} }
...@@ -3617,7 +3616,7 @@ ibuf_insert( ...@@ -3617,7 +3616,7 @@ ibuf_insert(
case IBUF_USE_INSERT_DELETE_MARK: case IBUF_USE_INSERT_DELETE_MARK:
case IBUF_USE_ALL: case IBUF_USE_ALL:
ut_ad(!no_counter); ut_ad(!no_counter);
goto notify; goto check_watch;
case IBUF_USE_COUNT: case IBUF_USE_COUNT:
break; break;
} }
...@@ -3632,7 +3631,7 @@ ibuf_insert( ...@@ -3632,7 +3631,7 @@ ibuf_insert(
case IBUF_USE_DELETE: case IBUF_USE_DELETE:
case IBUF_USE_ALL: case IBUF_USE_ALL:
ut_ad(!no_counter); ut_ad(!no_counter);
goto skip_notify; goto skip_watch;
case IBUF_USE_COUNT: case IBUF_USE_COUNT:
break; break;
} }
...@@ -3644,23 +3643,39 @@ ibuf_insert( ...@@ -3644,23 +3643,39 @@ ibuf_insert(
/* unknown op or use */ /* unknown op or use */
ut_error; ut_error;
notify: check_watch:
/* If another thread buffers an insert on a page while /* If a thread attempts to buffer an insert on a page while a
the purge is in progress, the purge for the same page purge is in progress on the same page, the purge must not be
must not be buffered, because it could remove a record buffered, because it could remove a record that was
that was re-inserted later. re-inserted later. For simplicity, we block the buffering of
all operations on a page that has a purge pending.
We do not call this in the IBUF_OP_DELETE case, We do not check this in the IBUF_OP_DELETE case, because that
because that would always trigger the buffer pool would always trigger the buffer pool watch during purge and
watch during purge and thus prevent the buffering of thus prevent the buffering of delete operations. We assume
delete operations. We assume that IBUF_OP_DELETE that the issuer of IBUF_OP_DELETE has called
operations are only issued by the purge thread. */ buf_pool_watch_set(space, page_no). */
buf_pool_mutex_enter(); {
buf_pool_watch_notify(space, page_no); buf_page_t* bpage;
buf_pool_mutex_exit(); ulint fold = buf_page_address_fold(space, page_no);
buf_pool_mutex_enter();
bpage = buf_page_hash_get_low(space, page_no, fold);
buf_pool_mutex_exit();
if (UNIV_LIKELY_NULL(bpage)) {
/* A buffer pool watch has been set or the
page has been read into the buffer pool.
Do not buffer the request. If a purge operation
is being buffered, have this request executed
directly on the page in the buffer pool after the
buffered entries for this page have been merged. */
return(FALSE);
}
}
skip_notify: skip_watch:
entry_size = rec_get_converted_size(index, entry, 0); entry_size = rec_get_converted_size(index, entry, 0);
if (entry_size if (entry_size
......
...@@ -86,6 +86,8 @@ The enumeration values must be 0..7. */ ...@@ -86,6 +86,8 @@ The enumeration values must be 0..7. */
enum buf_page_state { enum buf_page_state {
BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free BUF_BLOCK_ZIP_FREE = 0, /*!< contains a free
compressed page */ compressed page */
BUF_BLOCK_POOL_WATCH = 0, /*!< a sentinel for the buffer pool
watch, element of buf_pool_watch[] */
BUF_BLOCK_ZIP_PAGE, /*!< contains a clean BUF_BLOCK_ZIP_PAGE, /*!< contains a clean
compressed page */ compressed page */
BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed
...@@ -290,8 +292,8 @@ buf_page_get_gen( ...@@ -290,8 +292,8 @@ buf_page_get_gen(
ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */ ulint rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /*!< in: guessed block or NULL */ buf_block_t* guess, /*!< in: guessed block or NULL */
ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL, ulint mode, /*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
BUF_GET_NO_LATCH, BUF_GET_NOWAIT or BUF_GET_NO_LATCH or
BUF_GET_IF_IN_POOL_WATCH */ BUF_GET_IF_IN_POOL_OR_WATCH */
const char* file, /*!< in: file name */ const char* file, /*!< in: file name */
ulint line, /*!< in: line where called */ ulint line, /*!< in: line where called */
mtr_t* mtr); /*!< in: mini-transaction */ mtr_t* mtr); /*!< in: mini-transaction */
...@@ -994,6 +996,16 @@ Returns the control block of a file page, NULL if not found. ...@@ -994,6 +996,16 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */ @return block, NULL if not found */
UNIV_INLINE UNIV_INLINE
buf_page_t* buf_page_t*
buf_page_hash_get_low(
/*==================*/
ulint space, /*!< in: space id */
ulint offset, /*!< in: offset of the page within space */
ulint fold); /*!< in: buf_page_address_fold(space, offset) */
/******************************************************************//**
Returns the control block of a file page, NULL if not found.
@return block, NULL if not found or not a real control block */
UNIV_INLINE
buf_page_t*
buf_page_hash_get( buf_page_hash_get(
/*==============*/ /*==============*/
ulint space, /*!< in: space id */ ulint space, /*!< in: space id */
...@@ -1015,30 +1027,48 @@ UNIV_INTERN ...@@ -1015,30 +1027,48 @@ UNIV_INTERN
ulint ulint
buf_get_free_list_len(void); buf_get_free_list_len(void);
/*=======================*/ /*=======================*/
/******************************************************************** /********************************************************************
Stop watching if the marked page is read in. */ Determine if a block is a sentinel for a buffer pool watch.
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
UNIV_INTERN UNIV_INTERN
void ibool
buf_pool_watch_clear(void); buf_pool_watch_is(
/*======================*/ /*==============*/
/************************************************************************ const buf_page_t* bpage) /*!< in: block */
Set watch occurred flag. */ __attribute__((nonnull, warn_unused_result));
/****************************************************************//**
Add watch for the given page to be read in. Caller must have the buffer pool
mutex reserved.
@return NULL if watch set, block if the page is in the buffer pool */
UNIV_INTERN
buf_page_t*
buf_pool_watch_set(
/*===============*/
ulint space, /*!< in: space id */
ulint offset, /*!< in: page number */
ulint fold) /*!< in: buf_page_address_fold(space, offset) */
__attribute__((warn_unused_result));
/****************************************************************//**
Stop watching if the page has been read in.
buf_pool_watch_set(space,offset) must have returned NULL before. */
UNIV_INTERN UNIV_INTERN
void void
buf_pool_watch_notify( buf_pool_watch_unset(
/*==================*/ /*=================*/
ulint space, /*!< in: space id of page read in */ ulint space, /*!< in: space id */
ulint offset);/*!< in: offset of page read in */ ulint offset);/*!< in: page number */
/******************************************************************** /****************************************************************//**
Check if the given page is being watched and has been read to the buffer Check if the page has been read in.
pool. This may only be called after buf_pool_watch_set(space,offset)
@return TRUE if the given page is being watched and it has been read in */ has returned NULL and before invoking buf_pool_watch_unset(space,offset).
@return FALSE if the given page was not read in, TRUE if it was */
UNIV_INTERN UNIV_INTERN
ibool ibool
buf_pool_watch_occurred( buf_pool_watch_occurred(
/*====================*/ /*====================*/
ulint space, /*!< in: space id */ ulint space, /*!< in: space id */
ulint page_no); /*!< in: page number */ ulint offset) /*!< in: page number */
__attribute__((warn_unused_result));
#endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_HOTBACKUP */
/** The common buffer control block structure /** The common buffer control block structure
...@@ -1079,7 +1109,10 @@ struct buf_page_struct{ ...@@ -1079,7 +1109,10 @@ struct buf_page_struct{
#endif /* !UNIV_HOTBACKUP */ #endif /* !UNIV_HOTBACKUP */
page_zip_des_t zip; /*!< compressed page; zip.data page_zip_des_t zip; /*!< compressed page; zip.data
(but not the data it points to) is (but not the data it points to) is
also protected by buf_pool_mutex */ also protected by buf_pool_mutex;
state == BUF_BLOCK_ZIP_PAGE and
zip.data == NULL means an active
buf_pool_watch */
#ifndef UNIV_HOTBACKUP #ifndef UNIV_HOTBACKUP
buf_page_t* hash; /*!< node used in chaining to buf_page_t* hash; /*!< node used in chaining to
buf_pool->page_hash or buf_pool->page_hash or
...@@ -1434,18 +1467,7 @@ struct buf_pool_struct{ ...@@ -1434,18 +1467,7 @@ struct buf_pool_struct{
set to zero when a buffer block is set to zero when a buffer block is
allocated */ allocated */
/* @} */ /* @} */
/** @name Buffer pool watch
This is needed for implementing delete buffering. */
/* @{ */
/*--------------------------*/
ibool watch_active; /* if TRUE, set watch_occurred
when watch_space, watch_page_no
is read in. */
ulint watch_space; /* space id of watched page */
ulint watch_page_no; /* page number of watched page */
ibool watch_occurred; /* has watched page been read in */
/*--------------------------*/
/* @} */
/** @name LRU replacement algorithm fields */ /** @name LRU replacement algorithm fields */
/* @{ */ /* @{ */
......
...@@ -902,21 +902,20 @@ Returns the control block of a file page, NULL if not found. ...@@ -902,21 +902,20 @@ Returns the control block of a file page, NULL if not found.
@return block, NULL if not found */ @return block, NULL if not found */
UNIV_INLINE UNIV_INLINE
buf_page_t* buf_page_t*
buf_page_hash_get( buf_page_hash_get_low(
/*==============*/ /*==================*/
ulint space, /*!< in: space id */ ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */ ulint offset, /*!< in: offset of the page within space */
ulint fold) /*!< in: buf_page_address_fold(space, offset) */
{ {
buf_page_t* bpage; buf_page_t* bpage;
ulint fold;
ut_ad(buf_pool); ut_ad(buf_pool);
ut_ad(buf_pool_mutex_own()); ut_ad(buf_pool_mutex_own());
ut_ad(fold == buf_page_address_fold(space, offset));
/* Look for the page in the hash table */ /* Look for the page in the hash table */
fold = buf_page_address_fold(space, offset);
HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage, HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
ut_ad(bpage->in_page_hash && !bpage->in_zip_hash ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
&& buf_page_in_file(bpage)), && buf_page_in_file(bpage)),
...@@ -931,6 +930,26 @@ buf_page_hash_get( ...@@ -931,6 +930,26 @@ buf_page_hash_get(
return(bpage); return(bpage);
} }
/******************************************************************//**
Returns the control block of a file page, NULL if not found.
@return block, NULL if not found or not a real control block */
UNIV_INLINE
buf_page_t*
buf_page_hash_get(
/*==============*/
	ulint	space,	/*!< in: space id */
	ulint	offset)	/*!< in: offset of the page within space */
{
	/* Delegate the lookup to the low-level variant, computing the
	fold value here so that callers need not know about it. */
	buf_page_t*	bpage = buf_page_hash_get_low(
		space, offset, buf_page_address_fold(space, offset));

	/* Hide buffer pool watch sentinels from callers: they are not
	real control blocks, so report "not found" for them. */
	if (bpage != NULL && UNIV_UNLIKELY(buf_pool_watch_is(bpage))) {

		return(NULL);
	}

	return(bpage);
}
/******************************************************************//** /******************************************************************//**
Returns the control block of a file page, NULL if not found Returns the control block of a file page, NULL if not found
or an uncompressed page frame does not exist. or an uncompressed page frame does not exist.
...@@ -942,7 +961,11 @@ buf_block_hash_get( ...@@ -942,7 +961,11 @@ buf_block_hash_get(
ulint space, /*!< in: space id */ ulint space, /*!< in: space id */
ulint offset) /*!< in: offset of the page within space */ ulint offset) /*!< in: offset of the page within space */
{ {
return(buf_page_get_block(buf_page_hash_get(space, offset))); buf_block_t* block;
block = buf_page_get_block(buf_page_hash_get(space, offset));
return(block);
} }
/********************************************************************//** /********************************************************************//**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment