Commit 46b1f500 authored by Marko Mäkelä

MDEV-23399: Remove buf_pool.flush_rbt

Normally, buf_pool.flush_list must be sorted by
buf_page_t::oldest_modification, so that log_checkpoint()
can choose MIN(oldest_modification) as the checkpoint LSN.
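
For illustration, a minimal sketch (a hypothetical helper, not the
actual log_checkpoint() code) of why the ordering matters: new entries
are added at the front of the flush_list, so with the list sorted, the
oldest modification sits at the tail and the checkpoint LSN is a
constant-time lookup.

  /* Sketch only: assumes flush_list is ordered newest-first. */
  lsn_t checkpoint_lsn_sketch()
  {
    mutex_enter(&buf_pool.flush_list_mutex);
    const buf_page_t *tail= UT_LIST_GET_LAST(buf_pool.flush_list);
    /* With no dirty pages, the checkpoint can advance to the log end. */
    const lsn_t lsn= tail ? tail->oldest_modification() : log_sys.get_lsn();
    mutex_exit(&buf_pool.flush_list_mutex);
    return lsn;
  }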

During recovery, buf_pool.flush_rbt used to guarantee the
ordering. However, we can allow the buf_pool.flush_list to
be in an arbitrary order during recovery, and simply ensure
that it is in the correct order by the time a log checkpoint
needs to be executed.
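
As a plain C++ analogy (not code from this commit), the relaxation
amounts to appending entries in arbitrary order and restoring the
order once, just before it is needed:

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  using lsn_t= uint64_t;
  struct dirty_page { lsn_t oldest_modification; uint64_t page_id; };

  /* During recovery, pages may be appended in any LSN order. */
  std::vector<dirty_page> flush_list;

  /* Before a checkpoint needs MIN(oldest_modification), sort newest
  first, the order that buf_flush_insert_into_flush_list() maintains. */
  void restore_order()
  {
    std::sort(flush_list.begin(), flush_list.end(),
              [](const dirty_page &a, const dirty_page &b)
              { return a.oldest_modification > b.oldest_modification; });
  }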

recv_sys_t::apply(): To keep it simple, we will always flush the
buffer pool at the end of each batch.
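
Schematically (a simplified sketch; the actual call sequence appears
in the recv_sys_t::apply() hunk below):

  void recv_apply_end_of_batch_sketch()
  {
    /* Write out all recovered pages; an empty flush_list is
    trivially in the correct order. */
    buf_flush_wait_LRU_batch_end();
    buf_flush_sync();
  }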

Note that log_checkpoint() will invoke recv_sys_t::apply() in case
a checkpoint is initiated during the last batch of recovery,
when we already allow writes to data pages and the redo log.
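
The interaction can be pictured as follows (hypothetical control flow,
not the actual log_checkpoint() implementation):

  bool log_checkpoint_sketch()
  {
    if (recv_recovery_is_on())
      /* Drain the recovered pages first; apply(true) ends with a
      full buffer pool flush, restoring the flush_list order. */
      recv_sys.apply(true);
    /* ... determine the checkpoint LSN and write the checkpoint ... */
    return true;
  }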

Reviewed by: Vladislav Vaintroub
parent b535a790
@@ -1607,12 +1607,6 @@ void buf_pool_t::close()
   mutex_free(&mutex);
   mutex_free(&flush_list_mutex);
-  if (flush_rbt)
-  {
-    rbt_free(flush_rbt);
-    flush_rbt= nullptr;
-  }
   for (buf_page_t *bpage= UT_LIST_GET_LAST(LRU), *prev_bpage= nullptr; bpage;
        bpage= prev_bpage)
   {
@@ -2113,7 +2107,6 @@ inline void buf_pool_t::resize()
 	ut_ad(curr_size == old_size);
 	ut_ad(n_chunks_new == n_chunks);
 	ut_ad(UT_LIST_GET_LEN(withdraw) == 0);
-	ut_ad(flush_rbt == NULL);
 	n_chunks_new = (new_instance_size << srv_page_size_shift)
 		/ srv_buf_pool_chunk_unit;
...
@@ -236,134 +236,6 @@ static void buf_flush_validate_skip()
 }
 #endif /* UNIV_DEBUG */
-/******************************************************************//**
-Insert a block in the flush_rbt and returns a pointer to its
-predecessor or NULL if no predecessor. The ordering is maintained
-on the basis of the <oldest_modification, space, offset> key.
-@return pointer to the predecessor or NULL if no predecessor. */
-static
-buf_page_t*
-buf_flush_insert_in_flush_rbt(
-/*==========================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be inserted. */
-{
-	const ib_rbt_node_t*	c_node;
-	const ib_rbt_node_t*	p_node;
-	buf_page_t*		prev = NULL;
-	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
-	ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-	/* Insert this buffer into the rbt. */
-	c_node = rbt_insert(buf_pool.flush_rbt, &bpage, &bpage);
-	ut_a(c_node != NULL);
-	/* Get the predecessor. */
-	p_node = rbt_prev(buf_pool.flush_rbt, c_node);
-	if (p_node != NULL) {
-		buf_page_t**	value;
-		value = rbt_value(buf_page_t*, p_node);
-		prev = *value;
-		ut_a(prev != NULL);
-	}
-	return(prev);
-}
-/*********************************************************//**
-Delete a bpage from the flush_rbt. */
-static
-void
-buf_flush_delete_from_flush_rbt(
-/*============================*/
-	buf_page_t*	bpage)	/*!< in: bpage to be removed. */
-{
-	ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-#ifdef UNIV_DEBUG
-	ibool	ret =
-#endif /* UNIV_DEBUG */
-	rbt_delete(buf_pool.flush_rbt, &bpage);
-	ut_ad(ret);
-}
-/*****************************************************************//**
-Compare two modified blocks in the buffer pool. The key for comparison
-is:
-key = <oldest_modification, space, offset>
-This comparison is used to maintian ordering of blocks in the
-buf_pool.flush_rbt.
-Note that for the purpose of flush_rbt, we only need to order blocks
-on the oldest_modification. The other two fields are used to uniquely
-identify the blocks.
-@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
-static
-int
-buf_flush_block_cmp(
-/*================*/
-	const void*	p1,	/*!< in: block1 */
-	const void*	p2)	/*!< in: block2 */
-{
-	const buf_page_t* b1 = *static_cast<const buf_page_t*const*>(p1);
-	const buf_page_t* b2 = *static_cast<const buf_page_t*const*>(p2);
-	ut_ad(b1 != NULL);
-	ut_ad(b2 != NULL);
-	ut_ad(mutex_own(&buf_pool.flush_list_mutex));
-	const lsn_t m1 = b1->oldest_modification(),
-		m2 = b2->oldest_modification();
-	ut_ad(m1);
-	ut_ad(m2);
-	if (m2 > m1) {
-		return(1);
-	} else if (m2 < m1) {
-		return(-1);
-	}
-	if (b2->id() > b1->id()) {
-		return 1;
-	}
-	if (b2->id() < b1->id()) {
-		return -1;
-	}
-	return 0;
-}
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void)
-/*==========================*/
-{
-	mutex_enter(&buf_pool.flush_list_mutex);
-	ut_ad(buf_pool.flush_rbt == NULL);
-	/* Create red black tree for speedy insertions in flush list. */
-	buf_pool.flush_rbt = rbt_create(
-		sizeof(buf_page_t*), buf_flush_block_cmp);
-	mutex_exit(&buf_pool.flush_list_mutex);
-}
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void)
-/*==========================*/
-{
-	mutex_enter(&buf_pool.flush_list_mutex);
-	ut_d(buf_flush_validate_low());
-	rbt_free(buf_pool.flush_rbt);
-	buf_pool.flush_rbt = NULL;
-	mutex_exit(&buf_pool.flush_list_mutex);
-}
 /** Insert a modified block into the flush list.
 @param[in,out]	block	modified block
 @param[in]	lsn	oldest modification */
@@ -380,32 +252,7 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn)
 		     block->physical_size());
 	incr_flush_list_size_in_bytes(block);
-	if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
-		ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
-		/* The field in_LRU_list is protected by buf_pool.mutex, which
-		we are not holding. However, while a block is in the flush
-		list, it is dirty and cannot be discarded, not from the
-		page_hash or from the LRU list. At most, the uncompressed
-		page frame of a compressed block may be discarded or created
-		(copying the block->page to or from a buf_page_t that is
-		dynamically allocated from buf_buddy_alloc()). Because those
-		transitions hold buf_pool.flush_list_mutex (via
-		buf_flush_relocate_on_flush_list()), there is no possibility
-		of a race condition in the assertions below. */
-		ut_ad(block->page.in_LRU_list);
-		/* buf_buddy_block_register() will take a block in the
-		BUF_BLOCK_MEMORY state, not a file page. */
-		ut_ad(!block->page.in_zip_hash);
-		if (buf_page_t* prev_b =
-		    buf_flush_insert_in_flush_rbt(&block->page)) {
-			UT_LIST_INSERT_AFTER(buf_pool.flush_list, prev_b, &block->page);
-			goto func_exit;
-		}
-	}
 	UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page);
-func_exit:
 	ut_d(buf_flush_validate_skip());
 	mutex_exit(&buf_pool.flush_list_mutex);
 }
@@ -430,14 +277,6 @@ void buf_flush_remove(buf_page_t* bpage)
 	the bpage from flush list. */
 	buf_pool.flush_hp.adjust(bpage);
 	UT_LIST_REMOVE(buf_pool.flush_list, bpage);
-	/* If the flush_rbt is active then delete from there as well. */
-	if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-	}
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on it in buf_flush_block_cmp(). */
 	bpage->clear_oldest_modification();
 	buf_pool.stat.flush_list_bytes -= bpage->physical_size();
@@ -467,7 +306,6 @@ buf_flush_relocate_on_flush_list(
 	buf_page_t*	dpage)	/*!< in/out: destination block */
 {
 	buf_page_t*	prev;
-	buf_page_t*	prev_b = NULL;
 	ut_ad(mutex_own(&buf_pool.mutex));
 	mutex_enter(&buf_pool.flush_list_mutex);
@@ -481,19 +319,10 @@ buf_flush_relocate_on_flush_list(
 	having the buf_pool mutex. */
 	ut_ad(dpage->oldest_modification());
-	/* If recovery is active we must swap the control blocks in
-	the flush_rbt as well. */
-	if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
-		buf_flush_delete_from_flush_rbt(bpage);
-		prev_b = buf_flush_insert_in_flush_rbt(dpage);
-	}
 	/* Important that we adjust the hazard pointer before removing
 	the bpage from the flush list. */
 	buf_pool.flush_hp.adjust(bpage);
-	/* Must be done after we have removed it from the flush_rbt
-	because we assert on it in buf_flush_block_cmp(). */
 	bpage->clear_oldest_modification();
 	prev = UT_LIST_GET_PREV(list, bpage);
@@ -506,9 +335,6 @@ buf_flush_relocate_on_flush_list(
 		UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage);
 	}
-	/* Just an extra check. Previous in flush_list
-	should be the same control block as in flush_rbt. */
-	ut_a(!buf_pool.flush_rbt || prev_b == prev);
 	ut_d(buf_flush_validate_low());
 	mutex_exit(&buf_pool.flush_list_mutex);
 }
@@ -2889,7 +2715,6 @@ struct Check {
 static void buf_flush_validate_low()
 {
 	buf_page_t*		bpage;
-	const ib_rbt_node_t*	rnode = NULL;
 	ut_ad(mutex_own(&buf_pool.flush_list_mutex));
@@ -2897,13 +2722,6 @@ static void buf_flush_validate_low()
 	bpage = UT_LIST_GET_FIRST(buf_pool.flush_list);
-	/* If we are in recovery mode i.e.: flush_rbt != NULL
-	then each block in the flush_list must also be present
-	in the flush_rbt. */
-	if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
-		rnode = rbt_first(buf_pool.flush_rbt);
-	}
 	while (bpage != NULL) {
 		const lsn_t	om = bpage->oldest_modification();
 		/* A page in buf_pool.flush_list can be in
@@ -2912,29 +2730,14 @@ static void buf_flush_validate_low()
 		original descriptor can have this state and still be
 		in the flush list waiting to acquire the
 		buf_pool.flush_list_mutex to complete the relocation. */
-		ut_a(bpage->in_file()
-		     || bpage->state() == BUF_BLOCK_REMOVE_HASH);
-		ut_a(om > 0);
+		ut_ad(bpage->in_file()
+		      || bpage->state() == BUF_BLOCK_REMOVE_HASH);
+		ut_ad(om > 0);
-		if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) {
-			buf_page_t**	prpage;
-			ut_a(rnode != NULL);
-			prpage = rbt_value(buf_page_t*, rnode);
-			ut_a(*prpage != NULL);
-			ut_a(*prpage == bpage);
-			rnode = rbt_next(buf_pool.flush_rbt, rnode);
-		}
 		bpage = UT_LIST_GET_NEXT(list, bpage);
-		ut_a(!bpage || om >= bpage->oldest_modification());
+		ut_ad(!bpage || recv_recovery_is_on()
+		      || om >= bpage->oldest_modification());
 	}
-	/* By this time we must have exhausted the traversal of
-	flush_rbt (if active) as well. */
-	ut_a(rnode == NULL);
 }
 /** Validate the flush list. */
...
@@ -39,7 +39,6 @@ Created 11/5/1995 Heikki Tuuri
 #include "hash0hash.h"
 #include "ut0byte.h"
 #include "page0types.h"
-#include "ut0rbt.h"
 #include "log0log.h"
 #include "srv0srv.h"
 #include <ostream>
@@ -1910,7 +1909,7 @@ class buf_pool_t
 	FlushListMutex	flush_list_mutex;/*!< mutex protecting the
 					flush list access. This mutex
-					protects flush_list, flush_rbt
+					protects flush_list
 					and bpage::list pointers when
 					the bpage is on flush_list. It
 					also protects writes to
@@ -1934,20 +1933,6 @@ class buf_pool_t
 					of the given type running;
 					os_event_set() and os_event_reset()
 					are protected by buf_pool_t::mutex */
-	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
-					exclusively during recovery to
-					speed up insertions in the
-					flush_list. This tree contains
-					blocks in order of
-					oldest_modification LSN and is
-					kept in sync with the
-					flush_list.
-					Each member of the tree MUST
-					also be on the flush_list.
-					This tree is relevant only in
-					recovery and is set to NULL
-					once the recovery is over.
-					Protected by flush_list_mutex */
 	unsigned	freed_page_clock;/*!< a sequence number used
 					to count the number of buffer
 					blocks removed from the end of
...
@@ -164,20 +164,6 @@ void buf_flush_wait_LRU_batch_end();
 void buf_flush_validate();
 #endif /* UNIV_DEBUG */
-/********************************************************************//**
-Initialize the red-black tree to speed up insertions into the flush_list
-during recovery process. Should be called at the start of recovery
-process before any page has been read/written. */
-void
-buf_flush_init_flush_rbt(void);
-/*==========================*/
-/********************************************************************//**
-Frees up the red-black tree. */
-void
-buf_flush_free_flush_rbt(void);
-/*==========================*/
 /** Write a flushable page from buf_pool to a file.
 buf_pool.mutex must be held.
 @param bpage buffer control block
...
@@ -2684,6 +2684,8 @@ void recv_sys_t::apply(bool last_batch)
 	ut_ad(!log_mutex_own());
 	mutex_exit(&mutex);
+	/* Instead of flushing, last_batch could sort the buf_pool.flush_list
+	in ascending order of buf_page_t::oldest_modification. */
 	buf_flush_wait_LRU_batch_end();
 	buf_flush_sync();
@@ -3271,10 +3273,6 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
 	ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0);
 	ut_d(mutex_exit(&buf_pool.flush_list_mutex));
-	/* Initialize red-black tree for fast insertions into the
-	flush_list during recovery process. */
-	buf_flush_init_flush_rbt();
 	if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
 		ib::info() << "innodb_force_recovery=6 skips redo log apply";
@@ -3567,8 +3565,6 @@ void recv_recovery_from_checkpoint_finish()
 	recv_sys.debug_free();
-	/* Free up the flush_rbt. */
-	buf_flush_free_flush_rbt();
 	/* Enable innodb_sync_debug checks */
 	ut_d(sync_check_enable());
 }
...