Commit 358a31df authored by Inaam Rana

Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME WITH LARGE BUFFER POOL

(Note: this is a backport of revno:3472 from mysql-trunk)

rb://845
approved by: Marko

  When dropping a table (with an .ibd file, i.e., with
  innodb_file_per_table set) we scan the entire LRU list to invalidate
  pages belonging to that table. This can be painful with a large
  buffer pool, as we hold buf_pool->mutex for the whole scan. Note that
  the severity of the problem does not depend on the size of the table:
  even for an empty table, a large and filled-up buffer pool forces us
  to scan a very long LRU list.
  
  The fix is to scan the flush_list instead, removing only the blocks
  belonging to the table and marking them as non-dirty. The blocks are
  left in the LRU list for eventual eviction due to aging. The
  flush_list is typically much smaller than the LRU list, but when it
  is very long we release buf_pool->mutex after every 1K pages scanned
  so that other threads are not blocked; see the sketch below.
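  A minimal, self-contained sketch of this batched flush_list scan
  follows; a pthread mutex stands in for buf_pool->mutex, and the names
  (flush_list_t, drop_dirty_pages, SCAN_BATCH_SIZE) are illustrative,
  not InnoDB's actual API:

```c
/* Illustrative sketch only; not InnoDB code. */
#include <pthread.h>
#include <sched.h>
#include <stddef.h>

#define SCAN_BATCH_SIZE 1024	/* mirrors BUF_LRU_DROP_SEARCH_SIZE */

typedef struct page {
	unsigned	space_id;	/* tablespace the dirty page belongs to */
	struct page*	prev;		/* towards the newest end of the list */
	struct page*	next;		/* towards the oldest end of the list */
} page_t;

typedef struct {
	pthread_mutex_t	mutex;		/* stand-in for buf_pool->mutex */
	page_t*		newest;		/* head: most recent modification */
	page_t*		oldest;		/* tail: oldest modification */
} flush_list_t;

/* Unlink pg from the list; InnoDB's buf_flush_remove() additionally
clears oldest_modification, which is what marks the page non-dirty. */
static void
flush_list_remove(flush_list_t* fl, page_t* pg)
{
	if (pg->prev != NULL) {
		pg->prev->next = pg->next;
	} else {
		fl->newest = pg->next;
	}
	if (pg->next != NULL) {
		pg->next->prev = pg->prev;
	} else {
		fl->oldest = pg->prev;
	}
}

/* Scan from the oldest end and drop every page of the given tablespace,
releasing the mutex after each SCAN_BATCH_SIZE pages scanned so that a
very long flush_list does not block other threads for long stretches. */
static void
drop_dirty_pages(flush_list_t* fl, unsigned space_id)
{
	size_t	scanned = 0;
	page_t*	pg;

	pthread_mutex_lock(&fl->mutex);

	for (pg = fl->oldest; pg != NULL; ) {
		page_t*	prev = pg->prev;

		if (pg->space_id == space_id) {
			/* The real patch leaves the block in the LRU
			list to be evicted by aging; here the node is
			simply unlinked from the flush list. */
			flush_list_remove(fl, pg);
		}

		if (++scanned >= SCAN_BATCH_SIZE) {
			/* The real patch first pins prev with
			BUF_IO_PIN so it cannot move while the mutex
			is released; this sketch is single-threaded. */
			pthread_mutex_unlock(&fl->mutex);
			sched_yield();
			pthread_mutex_lock(&fl->mutex);
			scanned = 0;
		}

		pg = prev;
	}

	pthread_mutex_unlock(&fl->mutex);
}
```

  In the patch itself the page at which the scan pauses is IO-pinned
  with BUF_IO_PIN before buf_pool->mutex is released, so it can neither
  be relocated nor removed from the flush_list while other threads run.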
  
  buf_page_[set|unset]_sticky(): Use the new IO-state BUF_IO_PIN to
  ensure that a block stays in the flush_list and the LRU list when we
  release buf_pool->mutex. Previously we abused BUF_IO_READ to achieve
  this.
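  Why a dedicated pin state instead of continuing to overload
  BUF_IO_READ? With a separate state, "is a real read pending?" and
  "is the block merely pinned in place?" stay distinguishable, so debug
  assertions can check each. A hypothetical sketch (the names io_fix,
  can_relocate and on_read_completed are illustrative, not InnoDB's):

```c
#include <assert.h>
#include <stdbool.h>

/* Mirrors the four buf_io_fix states after this patch. */
enum io_fix { IO_NONE, IO_READ, IO_WRITE, IO_PIN };

/* A block may be relocated, or removed from the flush_list, only when
it is neither under real I/O nor pinned. */
static bool
can_relocate(enum io_fix fix)
{
	return fix == IO_NONE;
}

/* A read-completion path must only ever see IO_READ. While BUF_IO_READ
doubled as a pin, an assertion like this could not be stated. */
static void
on_read_completed(enum io_fix fix)
{
	assert(fix == IO_READ);
	/* ... update the page state, wake up any waiters ... */
}

int
main(void)
{
	assert(!can_relocate(IO_PIN));	/* pinned: must stay put */
	assert(can_relocate(IO_NONE));	/* clean, unfixed: movable */
	on_read_completed(IO_READ);
	return 0;
}
```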
parent 0cd92281
......@@ -7,6 +7,7 @@ page_size
drop table t1;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size
+8192
create table t2(a text) engine=innodb;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size
......
......@@ -26,7 +26,7 @@ while ($i)
drop table t1;
-# no lazy eviction at drop table in 5.1 and 5.5 there should be no
+# because of lazy eviction at drop table in 5.5 there should be some
# used 8K pages
-- eval $query_i_s
......@@ -36,7 +36,7 @@ create table t2(a text) engine=innodb;
-- disable_query_log
--- let $i = 200
+-- let $i = 400
while ($i)
{
insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
......
......@@ -3888,6 +3888,9 @@ buf_pool_validate_instance(
ut_a(rw_lock_is_locked(&block->lock,
RW_LOCK_EX));
break;
+case BUF_IO_PIN:
+break;
}
n_lru++;
......@@ -3917,6 +3920,7 @@ buf_pool_validate_instance(
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
+case BUF_IO_PIN:
/* All clean blocks should be I/O-unfixed. */
break;
case BUF_IO_READ:
......@@ -3956,6 +3960,7 @@ buf_pool_validate_instance(
switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE:
case BUF_IO_READ:
+case BUF_IO_PIN:
break;
case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) {
......
......@@ -68,8 +68,12 @@ allowed to point to either end of the LRU list. */
/** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace
-in the buffer pool. Following is the size of that array. */
-#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024
+in the buffer pool. Following is the size of that array.
+We also release buf_pool->mutex after scanning this many pages of the
+flush_list when dropping a table. This is to ensure that other threads
+are not blocked for extended period of time when using very large
+buffer pools. */
+#define BUF_LRU_DROP_SEARCH_SIZE 1024
/** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */
......@@ -210,7 +214,7 @@ buf_LRU_drop_page_hash_batch(
ulint i;
ut_ad(arr != NULL);
-ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
for (i = 0; i < count; ++i) {
btr_search_drop_page_hash_when_freed(space_id, zip_size,
......@@ -244,7 +248,7 @@ buf_LRU_drop_page_hash_for_tablespace(
}
page_arr = ut_malloc(
-sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
buf_pool_mutex_enter(buf_pool);
num_entries = 0;
......@@ -283,10 +287,10 @@ next_page:
/* Store the page number so that we can drop the hash
index in a batch later. */
page_arr[num_entries] = bpage->offset;
-ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
++num_entries;
-if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page;
}
......@@ -331,37 +335,40 @@ next_page:
}
/******************************************************************//**
-Invalidates all pages belonging to a given tablespace inside a specific
+Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that
-tablespace. */
+tablespace. The pages still remain a part of LRU and are evicted from
+the list as they age towards the tail of the LRU. */
static
void
-buf_LRU_invalidate_tablespace_buf_pool_instance(
-/*============================================*/
+buf_LRU_remove_dirty_pages_for_tablespace(
+/*======================================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
ulint id) /*!< in: space id */
{
buf_page_t* bpage;
ibool all_freed;
+ulint i;
scan_again:
buf_pool_mutex_enter(buf_pool);
+buf_flush_list_mutex_enter(buf_pool);
all_freed = TRUE;
-bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
+bpage != NULL; ++i) {
-while (bpage != NULL) {
buf_page_t* prev_bpage;
mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage));
-prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+prev_bpage = UT_LIST_GET_PREV(list, bpage);
/* bpage->space and bpage->io_fix are protected by
-buf_pool_mutex and block_mutex. It is safe to check
-them while holding buf_pool_mutex only. */
+buf_pool->mutex and block_mutex. It is safe to check
+them while holding buf_pool->mutex only. */
if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to
......@@ -374,79 +381,83 @@ scan_again:
all_freed = FALSE;
goto next_page;
} else {
-block_mutex = buf_page_get_mutex(bpage);
-mutex_enter(block_mutex);
-}
-if (bpage->buf_fix_count > 0) {
+/* We have to release the flush_list_mutex to obey the
+latching order. We are however guaranteed that the page
+will stay in the flush_list because buf_flush_remove()
+needs buf_pool->mutex as well. */
+buf_flush_list_mutex_exit(buf_pool);
+block_mutex = buf_page_get_mutex(bpage);
+mutex_enter(block_mutex);
-mutex_exit(block_mutex);
-/* We cannot remove this page during
-this scan yet; maybe the system is
-currently reading it in, or flushing
-the modifications to the file */
+if (bpage->buf_fix_count > 0) {
+mutex_exit(block_mutex);
+buf_flush_list_mutex_enter(buf_pool);
+all_freed = FALSE;
+/* We cannot remove this page during
+this scan yet; maybe the system is
+currently reading it in, or flushing
+the modifications to the file */
+goto next_page;
+}
-all_freed = FALSE;
-goto next_page;
}
+ut_ad(mutex_own(block_mutex));
+ut_ad(bpage->oldest_modification != 0);
-#ifdef UNIV_DEBUG
-if (buf_debug_prints) {
-fprintf(stderr,
-"Dropping space %lu page %lu\n",
-(ulong) buf_page_get_space(bpage),
-(ulong) buf_page_get_page_no(bpage));
-}
-#endif
-if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
-/* This is a compressed-only block
-descriptor. Do nothing. */
-} else if (((buf_block_t*) bpage)->index) {
-ulint page_no;
-ulint zip_size;
+buf_flush_remove(bpage);
-buf_pool_mutex_exit(buf_pool);
-zip_size = buf_page_get_zip_size(bpage);
-page_no = buf_page_get_page_no(bpage);
mutex_exit(block_mutex);
+buf_flush_list_mutex_enter(buf_pool);
+next_page:
+bpage = prev_bpage;
-mutex_exit(block_mutex);
+if (!bpage) {
+break;
+}
-/* Note that the following call will acquire
-and release an X-latch on the page. */
+/* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
+loop we release buf_pool->mutex to let other threads
+do their job. */
+if (i < BUF_LRU_DROP_SEARCH_SIZE) {
+continue;
+}
-btr_search_drop_page_hash_when_freed(
-id, zip_size, page_no);
-goto scan_again;
+/* We IO-fix the block to make sure that the block
+stays in its position in the flush_list. */
+if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+/* Block is already IO-fixed. We don't
+want to change the value. Lets leave
+this block alone. */
+continue;
+}
-if (bpage->oldest_modification != 0) {
+buf_flush_list_mutex_exit(buf_pool);
+block_mutex = buf_page_get_mutex(bpage);
+mutex_enter(block_mutex);
+buf_page_set_sticky(bpage);
+mutex_exit(block_mutex);
-buf_flush_remove(bpage);
-}
+/* Now it is safe to release the buf_pool->mutex. */
+buf_pool_mutex_exit(buf_pool);
+os_thread_yield();
+buf_pool_mutex_enter(buf_pool);
-/* Remove from the LRU list. */
+mutex_enter(block_mutex);
+buf_page_unset_sticky(bpage);
+mutex_exit(block_mutex);
-if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
-!= BUF_BLOCK_ZIP_FREE) {
-buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
-mutex_exit(block_mutex);
-} else {
-/* The block_mutex should have been released
-by buf_LRU_block_remove_hashed_page() when it
-returns BUF_BLOCK_ZIP_FREE. */
-ut_ad(block_mutex == &buf_pool->zip_mutex);
-ut_ad(!mutex_own(block_mutex));
-}
-next_page:
-bpage = prev_bpage;
+buf_flush_list_mutex_enter(buf_pool);
+ut_ad(bpage->in_flush_list);
+i = 0;
}
buf_pool_mutex_exit(buf_pool);
+buf_flush_list_mutex_exit(buf_pool);
+ut_ad(buf_flush_validate(buf_pool));
if (!all_freed) {
os_thread_sleep(20000);
......@@ -477,7 +488,7 @@ buf_LRU_invalidate_tablespace(
buf_pool = buf_pool_from_array(i);
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
-buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id);
+buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
}
}
......@@ -1532,8 +1543,9 @@ alloc:
/* Prevent buf_page_get_gen() from
decompressing the block while we release
buf_pool->mutex and block_mutex. */
-b->buf_fix_count++;
-b->io_fix = BUF_IO_READ;
+mutex_enter(&buf_pool->zip_mutex);
+buf_page_set_sticky(b);
+mutex_exit(&buf_pool->zip_mutex);
}
buf_pool_mutex_exit(buf_pool);
......@@ -1573,8 +1585,7 @@ alloc:
if (b) {
mutex_enter(&buf_pool->zip_mutex);
-b->buf_fix_count--;
-buf_page_set_io_fix(b, BUF_IO_NONE);
+buf_page_unset_sticky(b);
mutex_exit(&buf_pool->zip_mutex);
}
......
......@@ -910,7 +910,27 @@ buf_block_set_io_fix(
/*=================*/
buf_block_t* block, /*!< in/out: control block */
enum buf_io_fix io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+buf_page_t* bpage); /*!< in/out: control block */
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+buf_page_t* bpage); /*!< in/out: control block */
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
......
......@@ -414,6 +414,7 @@ buf_page_get_io_fix(
case BUF_IO_NONE:
case BUF_IO_READ:
case BUF_IO_WRITE:
+case BUF_IO_PIN:
return(io_fix);
}
ut_error;
......@@ -464,6 +465,49 @@ buf_block_set_io_fix(
buf_page_set_io_fix(&block->page, io_fix);
}
+/*********************************************************************//**
+Makes a block sticky. A sticky block implies that even after we release
+the buf_pool->mutex and the block->mutex:
+* it cannot be removed from the flush_list
+* the block descriptor cannot be relocated
+* it cannot be removed from the LRU list
+Note that:
+* the block can still change its position in the LRU list
+* the next and previous pointers can change. */
+UNIV_INLINE
+void
+buf_page_set_sticky(
+/*================*/
+buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+bpage->io_fix = BUF_IO_PIN;
+}
+/*********************************************************************//**
+Removes stickiness of a block. */
+UNIV_INLINE
+void
+buf_page_unset_sticky(
+/*==================*/
+buf_page_t* bpage) /*!< in/out: control block */
+{
+#ifdef UNIV_DEBUG
+buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
+ut_ad(buf_pool_mutex_own(buf_pool));
+#endif
+ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);
+bpage->io_fix = BUF_IO_NONE;
+}
/********************************************************************//**
Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */
......
......@@ -57,7 +57,10 @@ enum buf_flush {
enum buf_io_fix {
BUF_IO_NONE = 0, /**< no pending I/O */
BUF_IO_READ, /**< read pending */
-BUF_IO_WRITE /**< write pending */
+BUF_IO_WRITE, /**< write pending */
+BUF_IO_PIN /**< disallow relocation of
+block and its removal from
+the flush_list */
};
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
......