Commit d9a3a6ed authored by Inaam Rana

Bug#11759044 - 51325: DROPPING AN EMPTY INNODB TABLE TAKES A LONG TIME WITH LARGE BUFFER POOL

(Note: this is a backport of revno:3472 from mysql-trunk)

rb://845
approved by: Marko

  When dropping a table (with an .ibd file, i.e., with
  innodb_file_per_table set) we scan the entire LRU to invalidate pages
  from that table. This can be painful in the case of large buffer pools,
  as we hold the buf_pool->mutex for the scan. Note that the gravity of
  the problem does not depend on the size of the table: even with an
  empty table but a large and filled-up buffer pool, we'll end up
  scanning a very long LRU list.
  
  The fix is to scan flush_list and just remove the blocks belonging to
  the table from the flush_list, marking them as non-dirty. The blocks
  are left in the LRU list for eventual eviction due to aging. The
  flush_list is typically much smaller than the LRU list but for cases
  where it is very long we have the solution of releasing the
  buf_pool->mutex after scanning 1K pages.
  
  buf_page_[set|unset]_sticky(): Use new IO-state BUF_IO_PIN to ensure
  that a block stays in the flush_list and LRU list when we release
  buf_pool->mutex. Previously we have been abusing BUF_IO_READ to achieve
  this.
parent f337ef3d
...@@ -7,6 +7,7 @@ page_size ...@@ -7,6 +7,7 @@ page_size
drop table t1; drop table t1;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0; SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size page_size
8192
create table t2(a text) engine=innodb; create table t2(a text) engine=innodb;
SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0; SELECT page_size FROM information_schema.innodb_cmpmem WHERE pages_used > 0;
page_size page_size
......
...@@ -26,7 +26,7 @@ while ($i) ...@@ -26,7 +26,7 @@ while ($i)
drop table t1; drop table t1;
# no lazy eviction at drop table in 5.1 and 5.5 there should be no # because of lazy eviction at drop table in 5.5 there should be some
# used 8K pages # used 8K pages
-- eval $query_i_s -- eval $query_i_s
...@@ -36,7 +36,7 @@ create table t2(a text) engine=innodb; ...@@ -36,7 +36,7 @@ create table t2(a text) engine=innodb;
-- disable_query_log -- disable_query_log
-- let $i = 200 -- let $i = 400
while ($i) while ($i)
{ {
insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000)); insert into t2 values(repeat('abcdefghijklmnopqrstuvwxyz',1000));
......
...@@ -3888,6 +3888,9 @@ buf_pool_validate_instance( ...@@ -3888,6 +3888,9 @@ buf_pool_validate_instance(
ut_a(rw_lock_is_locked(&block->lock, ut_a(rw_lock_is_locked(&block->lock,
RW_LOCK_EX)); RW_LOCK_EX));
break; break;
case BUF_IO_PIN:
break;
} }
n_lru++; n_lru++;
...@@ -3917,6 +3920,7 @@ buf_pool_validate_instance( ...@@ -3917,6 +3920,7 @@ buf_pool_validate_instance(
ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
switch (buf_page_get_io_fix(b)) { switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE: case BUF_IO_NONE:
case BUF_IO_PIN:
/* All clean blocks should be I/O-unfixed. */ /* All clean blocks should be I/O-unfixed. */
break; break;
case BUF_IO_READ: case BUF_IO_READ:
...@@ -3956,6 +3960,7 @@ buf_pool_validate_instance( ...@@ -3956,6 +3960,7 @@ buf_pool_validate_instance(
switch (buf_page_get_io_fix(b)) { switch (buf_page_get_io_fix(b)) {
case BUF_IO_NONE: case BUF_IO_NONE:
case BUF_IO_READ: case BUF_IO_READ:
case BUF_IO_PIN:
break; break;
case BUF_IO_WRITE: case BUF_IO_WRITE:
switch (buf_page_get_flush_type(b)) { switch (buf_page_get_flush_type(b)) {
......
...@@ -68,8 +68,12 @@ allowed to point to either end of the LRU list. */ ...@@ -68,8 +68,12 @@ allowed to point to either end of the LRU list. */
/** When dropping the search hash index entries before deleting an ibd /** When dropping the search hash index entries before deleting an ibd
file, we build a local array of pages belonging to that tablespace file, we build a local array of pages belonging to that tablespace
in the buffer pool. Following is the size of that array. */ in the buffer pool. Following is the size of that array.
#define BUF_LRU_DROP_SEARCH_HASH_SIZE 1024 We also release buf_pool->mutex after scanning this many pages of the
flush_list when dropping a table. This is to ensure that other threads
are not blocked for extended period of time when using very large
buffer pools. */
#define BUF_LRU_DROP_SEARCH_SIZE 1024
/** If we switch on the InnoDB monitor because there are too few available /** If we switch on the InnoDB monitor because there are too few available
frames in the buffer pool, we set this to TRUE */ frames in the buffer pool, we set this to TRUE */
...@@ -210,7 +214,7 @@ buf_LRU_drop_page_hash_batch( ...@@ -210,7 +214,7 @@ buf_LRU_drop_page_hash_batch(
ulint i; ulint i;
ut_ad(arr != NULL); ut_ad(arr != NULL);
ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE); ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
for (i = 0; i < count; ++i) { for (i = 0; i < count; ++i) {
btr_search_drop_page_hash_when_freed(space_id, zip_size, btr_search_drop_page_hash_when_freed(space_id, zip_size,
...@@ -244,7 +248,7 @@ buf_LRU_drop_page_hash_for_tablespace( ...@@ -244,7 +248,7 @@ buf_LRU_drop_page_hash_for_tablespace(
} }
page_arr = ut_malloc( page_arr = ut_malloc(
sizeof(ulint) * BUF_LRU_DROP_SEARCH_HASH_SIZE); sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE);
buf_pool_mutex_enter(buf_pool); buf_pool_mutex_enter(buf_pool);
num_entries = 0; num_entries = 0;
...@@ -283,10 +287,10 @@ next_page: ...@@ -283,10 +287,10 @@ next_page:
/* Store the page number so that we can drop the hash /* Store the page number so that we can drop the hash
index in a batch later. */ index in a batch later. */
page_arr[num_entries] = bpage->offset; page_arr[num_entries] = bpage->offset;
ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE); ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
++num_entries; ++num_entries;
if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) { if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
goto next_page; goto next_page;
} }
...@@ -331,37 +335,40 @@ next_page: ...@@ -331,37 +335,40 @@ next_page:
} }
/******************************************************************//** /******************************************************************//**
Invalidates all pages belonging to a given tablespace inside a specific Remove all dirty pages belonging to a given tablespace inside a specific
buffer pool instance when we are deleting the data file(s) of that buffer pool instance when we are deleting the data file(s) of that
tablespace. */ tablespace. The pages still remain a part of LRU and are evicted from
the list as they age towards the tail of the LRU. */
static static
void void
buf_LRU_invalidate_tablespace_buf_pool_instance( buf_LRU_remove_dirty_pages_for_tablespace(
/*============================================*/ /*======================================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */ buf_pool_t* buf_pool, /*!< buffer pool instance */
ulint id) /*!< in: space id */ ulint id) /*!< in: space id */
{ {
buf_page_t* bpage; buf_page_t* bpage;
ibool all_freed; ibool all_freed;
ulint i;
scan_again: scan_again:
buf_pool_mutex_enter(buf_pool); buf_pool_mutex_enter(buf_pool);
buf_flush_list_mutex_enter(buf_pool);
all_freed = TRUE; all_freed = TRUE;
bpage = UT_LIST_GET_LAST(buf_pool->LRU); for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list), i = 0;
bpage != NULL; ++i) {
while (bpage != NULL) {
buf_page_t* prev_bpage; buf_page_t* prev_bpage;
mutex_t* block_mutex = NULL; mutex_t* block_mutex = NULL;
ut_a(buf_page_in_file(bpage)); ut_a(buf_page_in_file(bpage));
prev_bpage = UT_LIST_GET_PREV(LRU, bpage); prev_bpage = UT_LIST_GET_PREV(list, bpage);
/* bpage->space and bpage->io_fix are protected by /* bpage->space and bpage->io_fix are protected by
buf_pool_mutex and block_mutex. It is safe to check buf_pool->mutex and block_mutex. It is safe to check
them while holding buf_pool_mutex only. */ them while holding buf_pool->mutex only. */
if (buf_page_get_space(bpage) != id) { if (buf_page_get_space(bpage) != id) {
/* Skip this block, as it does not belong to /* Skip this block, as it does not belong to
...@@ -374,79 +381,83 @@ scan_again: ...@@ -374,79 +381,83 @@ scan_again:
all_freed = FALSE; all_freed = FALSE;
goto next_page; goto next_page;
} else { }
/* We have to release the flush_list_mutex to obey the
latching order. We are however guaranteed that the page
will stay in the flush_list because buf_flush_remove()
needs buf_pool->mutex as well. */
buf_flush_list_mutex_exit(buf_pool);
block_mutex = buf_page_get_mutex(bpage); block_mutex = buf_page_get_mutex(bpage);
mutex_enter(block_mutex); mutex_enter(block_mutex);
if (bpage->buf_fix_count > 0) { if (bpage->buf_fix_count > 0) {
mutex_exit(block_mutex); mutex_exit(block_mutex);
buf_flush_list_mutex_enter(buf_pool);
/* We cannot remove this page during /* We cannot remove this page during
this scan yet; maybe the system is this scan yet; maybe the system is
currently reading it in, or flushing currently reading it in, or flushing
the modifications to the file */ the modifications to the file */
all_freed = FALSE; all_freed = FALSE;
goto next_page; goto next_page;
} }
}
ut_ad(mutex_own(block_mutex)); ut_ad(bpage->oldest_modification != 0);
#ifdef UNIV_DEBUG buf_flush_remove(bpage);
if (buf_debug_prints) {
fprintf(stderr,
"Dropping space %lu page %lu\n",
(ulong) buf_page_get_space(bpage),
(ulong) buf_page_get_page_no(bpage));
}
#endif
if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
/* This is a compressed-only block
descriptor. Do nothing. */
} else if (((buf_block_t*) bpage)->index) {
ulint page_no;
ulint zip_size;
buf_pool_mutex_exit(buf_pool);
zip_size = buf_page_get_zip_size(bpage);
page_no = buf_page_get_page_no(bpage);
mutex_exit(block_mutex); mutex_exit(block_mutex);
buf_flush_list_mutex_enter(buf_pool);
next_page:
bpage = prev_bpage;
/* Note that the following call will acquire if (!bpage) {
and release an X-latch on the page. */ break;
btr_search_drop_page_hash_when_freed(
id, zip_size, page_no);
goto scan_again;
} }
if (bpage->oldest_modification != 0) { /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
loop we release buf_pool->mutex to let other threads
do their job. */
if (i < BUF_LRU_DROP_SEARCH_SIZE) {
continue;
}
buf_flush_remove(bpage); /* We IO-fix the block to make sure that the block
stays in its position in the flush_list. */
if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
/* Block is already IO-fixed. We don't
want to change the value. Lets leave
this block alone. */
continue;
} }
/* Remove from the LRU list. */ buf_flush_list_mutex_exit(buf_pool);
block_mutex = buf_page_get_mutex(bpage);
mutex_enter(block_mutex);
buf_page_set_sticky(bpage);
mutex_exit(block_mutex);
if (buf_LRU_block_remove_hashed_page(bpage, TRUE) /* Now it is safe to release the buf_pool->mutex. */
!= BUF_BLOCK_ZIP_FREE) { buf_pool_mutex_exit(buf_pool);
buf_LRU_block_free_hashed_page((buf_block_t*) bpage); os_thread_yield();
buf_pool_mutex_enter(buf_pool);
mutex_enter(block_mutex);
buf_page_unset_sticky(bpage);
mutex_exit(block_mutex); mutex_exit(block_mutex);
} else {
/* The block_mutex should have been released buf_flush_list_mutex_enter(buf_pool);
by buf_LRU_block_remove_hashed_page() when it ut_ad(bpage->in_flush_list);
returns BUF_BLOCK_ZIP_FREE. */
ut_ad(block_mutex == &buf_pool->zip_mutex); i = 0;
ut_ad(!mutex_own(block_mutex));
}
next_page:
bpage = prev_bpage;
} }
buf_pool_mutex_exit(buf_pool); buf_pool_mutex_exit(buf_pool);
buf_flush_list_mutex_exit(buf_pool);
ut_ad(buf_flush_validate(buf_pool));
if (!all_freed) { if (!all_freed) {
os_thread_sleep(20000); os_thread_sleep(20000);
...@@ -477,7 +488,7 @@ buf_LRU_invalidate_tablespace( ...@@ -477,7 +488,7 @@ buf_LRU_invalidate_tablespace(
buf_pool = buf_pool_from_array(i); buf_pool = buf_pool_from_array(i);
buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
buf_LRU_invalidate_tablespace_buf_pool_instance(buf_pool, id); buf_LRU_remove_dirty_pages_for_tablespace(buf_pool, id);
} }
} }
...@@ -1532,8 +1543,9 @@ alloc: ...@@ -1532,8 +1543,9 @@ alloc:
/* Prevent buf_page_get_gen() from /* Prevent buf_page_get_gen() from
decompressing the block while we release decompressing the block while we release
buf_pool->mutex and block_mutex. */ buf_pool->mutex and block_mutex. */
b->buf_fix_count++; mutex_enter(&buf_pool->zip_mutex);
b->io_fix = BUF_IO_READ; buf_page_set_sticky(b);
mutex_exit(&buf_pool->zip_mutex);
} }
buf_pool_mutex_exit(buf_pool); buf_pool_mutex_exit(buf_pool);
...@@ -1573,8 +1585,7 @@ alloc: ...@@ -1573,8 +1585,7 @@ alloc:
if (b) { if (b) {
mutex_enter(&buf_pool->zip_mutex); mutex_enter(&buf_pool->zip_mutex);
b->buf_fix_count--; buf_page_unset_sticky(b);
buf_page_set_io_fix(b, BUF_IO_NONE);
mutex_exit(&buf_pool->zip_mutex); mutex_exit(&buf_pool->zip_mutex);
} }
......
...@@ -910,7 +910,27 @@ buf_block_set_io_fix( ...@@ -910,7 +910,27 @@ buf_block_set_io_fix(
/*=================*/ /*=================*/
buf_block_t* block, /*!< in/out: control block */ buf_block_t* block, /*!< in/out: control block */
enum buf_io_fix io_fix);/*!< in: io_fix state */ enum buf_io_fix io_fix);/*!< in: io_fix state */
/*********************************************************************//**
Makes a block sticky. A sticky block implies that even after we release
the buf_pool->mutex and the block->mutex:
* it cannot be removed from the flush_list
* the block descriptor cannot be relocated
* it cannot be removed from the LRU list
Note that:
* the block can still change its position in the LRU list
* the next and previous pointers can change. */
UNIV_INLINE
void
buf_page_set_sticky(
/*================*/
buf_page_t* bpage); /*!< in/out: control block */
/*********************************************************************//**
Removes stickiness of a block. */
UNIV_INLINE
void
buf_page_unset_sticky(
/*==================*/
buf_page_t* bpage); /*!< in/out: control block */
/********************************************************************//** /********************************************************************//**
Determine if a buffer block can be relocated in memory. The block Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */ can be dirty, but it must not be I/O-fixed or bufferfixed. */
......
...@@ -414,6 +414,7 @@ buf_page_get_io_fix( ...@@ -414,6 +414,7 @@ buf_page_get_io_fix(
case BUF_IO_NONE: case BUF_IO_NONE:
case BUF_IO_READ: case BUF_IO_READ:
case BUF_IO_WRITE: case BUF_IO_WRITE:
case BUF_IO_PIN:
return(io_fix); return(io_fix);
} }
ut_error; ut_error;
...@@ -464,6 +465,49 @@ buf_block_set_io_fix( ...@@ -464,6 +465,49 @@ buf_block_set_io_fix(
buf_page_set_io_fix(&block->page, io_fix); buf_page_set_io_fix(&block->page, io_fix);
} }
/*********************************************************************//**
Pins a block by marking it sticky (io_fix becomes BUF_IO_PIN). While a
block is sticky, the following hold even after buf_pool->mutex and the
block mutex have been released:
* the block is not removed from the flush_list
* the block descriptor is not relocated
* the block is not removed from the LRU list
What is NOT guaranteed while sticky:
* the position of the block within the LRU list
* the values of the block's next and previous pointers
The caller is expected to own buf_pool->mutex and the block mutex, and
the block must not be I/O-fixed on entry (all three asserted with
ut_ad). */
UNIV_INLINE
void
buf_page_set_sticky(
/*================*/
	buf_page_t*	bpage)	/*!< in/out: control block */
{
#ifdef UNIV_DEBUG
	/* The owning pool is looked up only to check its mutex. */
	buf_pool_t*	pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(pool));
#endif /* UNIV_DEBUG */
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));

	/* Only a block with no I/O fix at all may be pinned. */
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE);

	bpage->io_fix = BUF_IO_PIN;
}
/*********************************************************************//**
Unpins a block that was previously made sticky: io_fix goes from
BUF_IO_PIN back to BUF_IO_NONE. The caller is expected to own
buf_pool->mutex and the block mutex, and the block must currently be in
the BUF_IO_PIN state (all three asserted with ut_ad). */
UNIV_INLINE
void
buf_page_unset_sticky(
/*==================*/
	buf_page_t*	bpage)	/*!< in/out: control block */
{
#ifdef UNIV_DEBUG
	/* The owning pool is looked up only to check its mutex. */
	buf_pool_t*	pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(pool));
#endif /* UNIV_DEBUG */
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));

	/* Only a pinned block may be unpinned. */
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN);

	bpage->io_fix = BUF_IO_NONE;
}
/********************************************************************//** /********************************************************************//**
Determine if a buffer block can be relocated in memory. The block Determine if a buffer block can be relocated in memory. The block
can be dirty, but it must not be I/O-fixed or bufferfixed. */ can be dirty, but it must not be I/O-fixed or bufferfixed. */
......
...@@ -57,7 +57,10 @@ enum buf_flush { ...@@ -57,7 +57,10 @@ enum buf_flush {
enum buf_io_fix { enum buf_io_fix {
BUF_IO_NONE = 0, /**< no pending I/O */ BUF_IO_NONE = 0, /**< no pending I/O */
BUF_IO_READ, /**< read pending */ BUF_IO_READ, /**< read pending */
BUF_IO_WRITE /**< write pending */ BUF_IO_WRITE, /**< write pending */
BUF_IO_PIN /**< disallow relocation of
block and its removal from
the flush_list */
}; };
/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */ /** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment