Commit fceb78e6 authored by sunny's avatar sunny

branches/innodb+: Delete buffer port from branches/fts:r2283

parent 57fc4316
......@@ -558,6 +558,7 @@ btr_page_get_father_node_ptr(
its page x-latched */
mtr_t* mtr) /* in: mtr */
{
page_t* page;
dtuple_t* tuple;
rec_t* user_rec;
rec_t* node_ptr;
......@@ -574,7 +575,19 @@ btr_page_get_father_node_ptr(
ut_ad(dict_index_get_page(index) != page_no);
level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
user_rec = btr_cur_get_rec(cursor);
page = btr_cur_get_page(cursor);
if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
/* Empty pages can result from buffered delete operations.
The first record from the free list can be used to find the
father node. */
user_rec = page_header_get_ptr(page, PAGE_FREE);
ut_a(user_rec);
} else {
user_rec = btr_cur_get_rec(cursor);
}
ut_a(page_rec_is_user_rec(user_rec));
tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
......
......@@ -39,6 +39,14 @@ Created 10/16/1994 Heikki Tuuri
#include "lock0lock.h"
#include "zlib.h"
/* Btree operation types, introduced as part of delete buffering.
btr_cur_search_to_nth_level() derives one of these values from the
BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK bits of its latch_mode
argument and uses it to pick the matching buffered operation when the
leaf page is not in the buffer pool. */
typedef enum btr_op_enum {
/* None of the three operation flags was set in latch_mode. */
BTR_NO_OP = 0,
/* BTR_INSERT was set; buffered as IBUF_OP_INSERT. */
BTR_INSERT_OP,
/* BTR_DELETE was set; buffered as IBUF_OP_DELETE. */
BTR_DELETE_OP,
/* BTR_DELETE_MARK was set; buffered as IBUF_OP_DELETE_MARK. */
BTR_DELMARK_OP
} btr_op_t;
#ifdef UNIV_DEBUG
/* If the following is set to TRUE, this module prints a lot of
trace information of individual record operations */
......@@ -139,6 +147,8 @@ btr_rec_get_externally_stored_len(
rec_t* rec, /* in: record */
const ulint* offsets);/* in: array returned by rec_get_offsets() */
/**********************************************************
The following function is used to set the deleted bit of a record. */
UNIV_INLINE
......@@ -148,7 +158,7 @@ btr_rec_set_deleted_flag(
/* out: TRUE on success;
FALSE on page_zip overflow */
rec_t* rec, /* in/out: physical record */
page_zip_des_t* page_zip,/* in/out: compressed page (or NULL) */
page_zip_des_t* page_zip,/* in/out: compressed page (or NULL) */
ulint flag) /* in: nonzero if delete marked */
{
if (page_rec_is_comp(rec)) {
......@@ -306,25 +316,29 @@ btr_cur_search_to_nth_level(
RW_S_LATCH, or 0 */
mtr_t* mtr) /* in: mtr */
{
page_cur_t* page_cursor;
page_t* page;
buf_block_t* block;
ulint space;
buf_block_t* guess;
ulint height;
rec_t* node_ptr;
ulint page_no;
ulint space;
ulint up_match;
ulint up_bytes;
ulint low_match;
ulint low_bytes;
ulint height;
ulint savepoint;
ulint rw_latch;
ulint page_mode;
ulint insert_planned;
ulint buf_mode;
ulint estimate;
ulint zip_size;
ulint watch_leaf;
page_cur_t* page_cursor;
ulint ignore_sec_unique;
btr_op_t btr_op = BTR_NO_OP;
ulint root_height = 0; /* remove warning */
#ifdef BTR_CUR_ADAPT
btr_search_t* info;
#endif
......@@ -344,17 +358,38 @@ btr_cur_search_to_nth_level(
cursor->up_match = ULINT_UNDEFINED;
cursor->low_match = ULINT_UNDEFINED;
#endif
insert_planned = latch_mode & BTR_INSERT;
/* These flags are mutually exclusive; they are lumped together
with the latch mode for historical reasons. It's possible for
none of the flags to be set. */
if (latch_mode & BTR_INSERT) {
btr_op = BTR_INSERT_OP;
} else if (latch_mode & BTR_DELETE) {
btr_op = BTR_DELETE_OP;
} else if (latch_mode & BTR_DELETE_MARK) {
btr_op = BTR_DELMARK_OP;
}
watch_leaf = latch_mode & BTR_WATCH_LEAF;
estimate = latch_mode & BTR_ESTIMATE;
ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
| BTR_IGNORE_SEC_UNIQUE);
ut_ad(!insert_planned || (mode == PAGE_CUR_LE));
/* Turn the flags unrelated to the latch mode off. */
latch_mode &= ~(
BTR_INSERT
| BTR_DELETE_MARK
| BTR_DELETE
| BTR_ESTIMATE
| BTR_IGNORE_SEC_UNIQUE
| BTR_WATCH_LEAF);
cursor->flag = BTR_CUR_BINARY;
cursor->index = index;
cursor->leaf_in_buf_pool = FALSE;
cursor->ibuf_cnt = ULINT_UNDEFINED;
#ifndef BTR_CUR_ADAPT
guess = NULL;
#else
......@@ -367,9 +402,17 @@ btr_cur_search_to_nth_level(
#ifdef UNIV_SEARCH_PERF_STAT
info->n_searches++;
#endif
/* TODO: investigate if there is any real reason for forbidding
adaptive hash usage when watch_leaf is true.*/
/* Ibuf does not use adaptive hash; this is prevented by the
latch_mode check below. */
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& latch_mode <= BTR_MODIFY_LEAF
&& info->last_hash_succ
&& !estimate
&& !watch_leaf
#ifdef PAGE_CUR_LE_OR_EXTENDS
&& mode != PAGE_CUR_LE_OR_EXTENDS
#endif /* PAGE_CUR_LE_OR_EXTENDS */
......@@ -390,8 +433,9 @@ btr_cur_search_to_nth_level(
return;
}
#endif
#endif
#endif /* BTR_CUR_HASH_ADAPT */
#endif /* BTR_CUR_ADAPT */
btr_cur_n_non_sea++;
/* If the hash search did not succeed, do binary search down the
......@@ -456,154 +500,228 @@ btr_cur_search_to_nth_level(
/* Loop and search until we arrive at the desired level */
for (;;) {
ulint zip_size;
buf_block_t* block;
search_loop:
if (height == 0) {
if (watch_leaf) {
buf_mode = BUF_GET_IF_IN_POOL;
} else if (latch_mode <= BTR_MODIFY_LEAF) {
rw_latch = latch_mode;
if (btr_op != BTR_NO_OP
&& ibuf_should_try(index, ignore_sec_unique)) {
/* Try insert/delete mark/delete to the
insert/delete buffer if the page is not in
the buffer pool */
buf_mode = BUF_GET_IF_IN_POOL;
}
}
}
retry_page_get:
zip_size = dict_table_zip_size(index->table);
zip_size = dict_table_zip_size(index->table);
block = buf_page_get_gen(space, zip_size, page_no,
rw_latch, guess, buf_mode,
__FILE__, __LINE__,
mtr);
if (watch_leaf && height == 0) {
ut_a(buf_mode == BUF_GET_IF_IN_POOL);
buf_mode = BUF_GET_IF_IN_POOL_OR_WATCH;
}
block = buf_page_get_gen(
space, zip_size, page_no, rw_latch, guess, buf_mode,
__FILE__, __LINE__, mtr);
if (watch_leaf && height == 0) {
cursor->leaf_in_buf_pool = !!block;
/* We didn't find a page but we set a watch on it. */
if (block == NULL) {
/* This must be a search to perform an insert;
try insert to the insert buffer */
ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
ut_ad(insert_planned);
ut_ad(cursor->thr);
if (ibuf_should_try(index, ignore_sec_unique)
&& ibuf_insert(tuple, index, space, zip_size,
page_no, cursor->thr)) {
/* Insertion to the insert buffer succeeded */
cursor->flag = BTR_CUR_INSERT_TO_IBUF;
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
cursor->flag = BTR_CUR_ABORTED;
goto func_exit;
}
}
if (block == NULL) {
/* This must be a search to perform an insert/delete
mark/delete; try using the insert/delete buffer */
ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
ut_ad(cursor->thr);
if (ibuf_should_try(index, ignore_sec_unique)) {
switch (btr_op) {
case BTR_INSERT_OP:
if (ibuf_insert(IBUF_OP_INSERT, tuple, index,
space, zip_size, page_no,
cursor->thr)) {
cursor->flag = BTR_CUR_INSERT_TO_IBUF;
goto func_exit;
}
break;
case BTR_DELMARK_OP:
if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple,
index, space, zip_size,
page_no, cursor->thr)) {
cursor->flag = BTR_CUR_DEL_MARK_IBUF;
goto func_exit;
}
goto func_exit;
}
/* Insert to the insert buffer did not succeed:
retry page get */
break;
case BTR_DELETE_OP:
if (ibuf_insert(IBUF_OP_DELETE, tuple, index,
space, zip_size, page_no,
cursor->thr)) {
buf_mode = BUF_GET;
cursor->flag = BTR_CUR_DELETE_IBUF;
goto retry_page_get;
goto func_exit;
}
break;
default:
ut_error;
}
}
page = buf_block_get_frame(block);
/* Insert to the insert/delete buffer did not succeed, we
must read the page from disk. */
buf_mode = BUF_GET;
goto retry_page_get;
}
block->check_index_page_at_flush = TRUE;
page = buf_block_get_frame(block);
#ifdef UNIV_ZIP_DEBUG
if (rw_latch != RW_NO_LATCH) {
const page_zip_des_t* page_zip
= buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, page));
}
#endif /* UNIV_ZIP_DEBUG */
if (rw_latch != RW_NO_LATCH) {
const page_zip_des_t* page_zip;
block->check_index_page_at_flush = TRUE;
page_zip = buf_block_get_page_zip(block);
ut_a(!page_zip || page_zip_validate(page_zip, page));
}
#endif /* UNIV_ZIP_DEBUG */
#ifdef UNIV_SYNC_DEBUG
if (rw_latch != RW_NO_LATCH) {
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
}
if (rw_latch != RW_NO_LATCH) {
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
}
#endif
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));
ut_ad(0 == ut_dulint_cmp(index->id, btr_page_get_index_id(page)));
if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
/* We are in the root node */
if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
/* We are in the root node */
height = btr_page_get_level(page, mtr);
root_height = height;
cursor->tree_height = root_height + 1;
height = btr_page_get_level(page, mtr);
root_height = height;
cursor->tree_height = root_height + 1;
/* 1-level trees must be handled here
for BTR_WATCH_LEAF. */
if (watch_leaf && height == 0) {
cursor->leaf_in_buf_pool = TRUE;
}
#ifdef BTR_CUR_ADAPT
if (block != guess) {
info->root_guess = block;
}
#endif
if (block != guess) {
info->root_guess = block;
}
#endif
}
if (height == 0) {
if (rw_latch == RW_NO_LATCH) {
btr_cur_latch_leaves(page, space, zip_size,
page_no, latch_mode,
cursor, mtr);
}
if (height == 0) {
if (rw_latch == RW_NO_LATCH) {
if ((latch_mode != BTR_MODIFY_TREE)
&& (latch_mode != BTR_CONT_MODIFY_TREE)) {
btr_cur_latch_leaves(
page, space, zip_size, page_no, latch_mode,
cursor, mtr);
}
/* Release the tree s-latch */
if (latch_mode != BTR_MODIFY_TREE
&& latch_mode != BTR_CONT_MODIFY_TREE) {
mtr_release_s_latch_at_savepoint(
mtr, savepoint,
dict_index_get_lock(index));
}
/* Release the tree s-latch */
page_mode = mode;
mtr_release_s_latch_at_savepoint(
mtr, savepoint, dict_index_get_lock(index));
}
page_cur_search_with_match(block, index, tuple, page_mode,
&up_match, &up_bytes,
&low_match, &low_bytes,
page_cursor);
page_mode = mode;
}
if (estimate) {
btr_cur_add_path_info(cursor, height, root_height);
}
page_cur_search_with_match(
block, index, tuple, page_mode, &up_match, &up_bytes,
&low_match, &low_bytes, page_cursor);
/* If this is the desired level, leave the loop */
if (estimate) {
btr_cur_add_path_info(cursor, height, root_height);
}
ut_ad(height == btr_page_get_level(
page_cur_get_page(page_cursor), mtr));
/* If this is the desired level, leave the loop */
if (level == height) {
ut_ad(height == btr_page_get_level(page_cur_get_page(page_cursor),
mtr));
if (level > 0) {
/* x-latch the page */
page = btr_page_get(space, zip_size,
page_no, RW_X_LATCH, mtr);
ut_a((ibool)!!page_is_comp(page)
== dict_table_is_comp(index->table));
}
if (level == height) {
break;
if (level > 0) {
/* x-latch the page */
page = btr_page_get(
space, zip_size, page_no, RW_X_LATCH, mtr);
ut_a((ibool)!!page_is_comp(page)
== dict_table_is_comp(index->table));
}
ut_ad(height > 0);
goto loop_end;
}
height--;
ut_ad(height > 0);
if ((height == 0) && (latch_mode <= BTR_MODIFY_LEAF)) {
height--;
rw_latch = latch_mode;
node_ptr = page_cur_get_rec(page_cursor);
if (insert_planned
&& ibuf_should_try(index, ignore_sec_unique)) {
offsets = rec_get_offsets(
node_ptr, cursor->index, offsets, ULINT_UNDEFINED, &heap);
/* Try insert to the insert buffer if the
page is not in the buffer pool */
/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
buf_mode = BUF_GET_IF_IN_POOL;
}
}
if (index->type & DICT_IBUF && height == level) {
/* We're doing a search on an ibuf tree and we're one level
above the leaf page. (Assuming level == 0, which it should
be.) */
guess = NULL;
ulint is_min_rec;
node_ptr = page_cur_get_rec(page_cursor);
offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
ULINT_UNDEFINED, &heap);
/* Go to the child node */
page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
}
is_min_rec = rec_get_info_bits(node_ptr, 0)
& REC_INFO_MIN_REC_FLAG;
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
if (!is_min_rec) {
cursor->ibuf_cnt = ibuf_rec_get_fake_counter(node_ptr);
ut_a(cursor->ibuf_cnt <= 0xFFFF
|| cursor->ibuf_cnt == ULINT_UNDEFINED);
}
}
goto search_loop;
loop_end:
if (level == 0) {
cursor->low_match = low_match;
cursor->low_bytes = low_bytes;
......@@ -625,6 +743,11 @@ retry_page_get:
}
func_exit:
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
if (has_search_latch) {
rw_lock_s_lock(&btr_search_latch);
......@@ -686,8 +809,7 @@ btr_cur_open_at_index_side(
page_t* page;
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
__FILE__, __LINE__,
mtr);
__FILE__, __LINE__, mtr);
page = buf_block_get_frame(block);
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));
......@@ -806,8 +928,7 @@ btr_cur_open_at_rnd_pos(
block = buf_page_get_gen(space, zip_size, page_no,
RW_NO_LATCH, NULL, BUF_GET,
__FILE__, __LINE__,
mtr);
__FILE__, __LINE__, mtr);
page = buf_block_get_frame(block);
ut_ad(0 == ut_dulint_cmp(index->id,
btr_page_get_index_id(page)));
......@@ -2651,7 +2772,7 @@ btr_cur_del_mark_set_sec_rec(
}
/***************************************************************
Sets a secondary index record delete mark to FALSE. This function is only
Sets a secondary index record's delete mark to the given value. This function is only
used by the insert buffer insert merge mechanism. */
UNIV_INTERN
void
......@@ -2662,14 +2783,38 @@ btr_cur_del_unmark_for_ibuf(
corresponding to rec, or NULL
when the tablespace is
uncompressed */
ibool val, /* in: value to set */
mtr_t* mtr) /* in: mtr */
{
/* We do not need to reserve btr_search_latch, as the page has just
been read to the buffer pool and there cannot be a hash index to it. */
btr_rec_set_deleted_flag(rec, page_zip, FALSE);
btr_rec_set_deleted_flag(rec, page_zip, val);
btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
}
/***************************************************************
Sets a secondary index record's delete mark to the given value. This
function is only used by the insert buffer merge mechanism. */
void
btr_cur_set_deleted_flag_for_ibuf(
/*==============================*/
rec_t* rec, /* in: record */
page_zip_des_t* page_zip, /* in/out: compressed page
corresponding to rec, or NULL
when the tablespace is
uncompressed */
ibool val, /* in: value to set */
mtr_t* mtr) /* in: mtr */
{
/* We do not need to reserve btr_search_latch, as the page has just
been read to the buffer pool and there cannot be a hash index to it. */
btr_cur_del_mark_set_sec_rec_log(rec, FALSE, mtr);
rec_set_deleted_flag_new(rec, page_zip, val);
btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
}
/*==================== B-TREE RECORD REMOVE =========================*/
......@@ -2763,8 +2908,7 @@ btr_cur_optimistic_delete(
ut_a(!page_zip || page_zip_validate(page_zip, page));
#endif /* UNIV_ZIP_DEBUG */
if (dict_index_is_clust(cursor->index)
|| !page_is_leaf(page)) {
if (dict_index_is_clust(cursor->index) || !page_is_leaf(page)) {
/* The insert buffer does not handle
inserts to clustered indexes or to non-leaf
pages of secondary index B-trees. */
......
......@@ -213,6 +213,7 @@ buf_buddy_block_register(
buf_block_t* block) /* in: buffer frame to allocate */
{
const ulint fold = BUF_POOL_ZIP_FOLD(block);
ut_ad(buf_pool_mutex_own());
ut_ad(!mutex_own(&buf_pool_zip_mutex));
......@@ -224,6 +225,7 @@ buf_buddy_block_register(
ut_ad(!block->page.in_page_hash);
ut_ad(!block->page.in_zip_hash);
ut_d(block->page.in_zip_hash = TRUE);
HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
buf_buddy_n_frames++;
......@@ -278,23 +280,21 @@ buf_buddy_alloc_clean(
TRUE if storage was allocated from the LRU list
and buf_pool_mutex was temporarily released */
{
ulint count;
buf_page_t* bpage;
ut_ad(buf_pool_mutex_own());
ut_ad(!mutex_own(&buf_pool_zip_mutex));
if (buf_buddy_n_frames < buf_buddy_max_n_frames) {
goto free_LRU;
}
if (buf_buddy_n_frames >= buf_buddy_max_n_frames
&& ((BUF_BUDDY_LOW << i) >= PAGE_ZIP_MIN_SIZE
&& i < BUF_BUDDY_SIZES)) {
if (BUF_BUDDY_LOW << i >= PAGE_ZIP_MIN_SIZE
&& i < BUF_BUDDY_SIZES) {
/* Try to find a clean compressed-only page
of the same size. */
page_zip_des_t dummy_zip;
ulint j;
page_zip_des_t dummy_zip;
page_zip_set_size(&dummy_zip, BUF_BUDDY_LOW << i);
......@@ -335,9 +335,12 @@ buf_buddy_alloc_clean(
/* Free blocks from the end of the LRU list until enough space
is available. */
count = 0;
free_LRU:
for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); bpage;
bpage = UT_LIST_GET_PREV(LRU, bpage)) {
for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
bpage;
bpage = UT_LIST_GET_PREV(LRU, bpage), ++count) {
void* ret;
mutex_t* block_mutex = buf_page_get_mutex(bpage);
......@@ -440,20 +443,19 @@ buf_buddy_alloc_low(
}
/* Try replacing a clean page in the buffer pool. */
block = buf_buddy_alloc_clean(i, lru);
if (block) {
goto func_exit;
}
/* Try replacing an uncompressed page in the buffer pool. */
buf_pool_mutex_exit();
block = buf_LRU_get_free_block(0);
*lru = TRUE;
buf_pool_mutex_enter();
alloc_big:
buf_buddy_block_register(block);
......
......@@ -1346,6 +1346,69 @@ buf_pool_resize(void)
buf_pool_page_hash_rebuild();
}
/********************************************************************
Registers a watch on the page identified by (space, page_no) and resets
the "watch happened" flag. The caller must hold the buffer pool mutex. */
static
void
buf_pool_add_watch(
/*===============*/
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
{
	ut_ad(mutex_own(&buf_pool_mutex));

	/* Only a single watch is supported: one must not already be set. */
	ut_a(!buf_pool->watch_active);

	/* Describe the watched page first, then arm the watch. All of
	this happens while the buffer pool mutex is held. */
	buf_pool->watch_space = space;
	buf_pool->watch_page_no = page_no;
	buf_pool->watch_happened = FALSE;
	buf_pool->watch_active = TRUE;
}
/********************************************************************
Stops the current page watch by clearing the watch_active flag. The
buffer pool mutex is acquired and released inside this function. Note
that the flag is cleared unconditionally, whether or not the watched
page has been read in. */
UNIV_INTERN
void
buf_pool_remove_watch(void)
/*=======================*/
{
buf_pool_mutex_enter();
/* Sanity check: a watch is expected to be set when this is called. */
ut_ad(buf_pool->watch_active);
buf_pool->watch_active = FALSE;
buf_pool_mutex_exit();
}
/********************************************************************
Checks whether the given page is the one currently being watched and
whether it has been read into the buffer pool since the watch was set.
The buffer pool mutex is acquired and released inside this function. */
UNIV_INTERN
ibool
buf_pool_watch_happened(
/*====================*/
				/* out: TRUE if the given page is being
				watched and it has been read in */
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
{
	/* Declared with the function's return type rather than ulint. */
	ibool	ret;

	buf_pool_mutex_enter();

	/* All four conditions must hold: a watch is active, it is on
	exactly this (space, page_no), and the watched page has been
	flagged as read in. */
	ret = buf_pool->watch_active
		&& space == buf_pool->watch_space
		&& page_no == buf_pool->watch_page_no
		&& buf_pool->watch_happened;

	buf_pool_mutex_exit();

	return(ret);
}
/************************************************************************
Moves the block to the start of the LRU list if there is a danger
that the block would drift out of the buffer pool. */
......@@ -1763,7 +1826,8 @@ buf_page_get_gen(
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /* in: guessed block or NULL */
ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL,
BUF_GET_NO_LATCH, BUF_GET_NOWAIT */
BUF_GET_NO_LATCH, BUF_GET_NOWAIT or
BUF_GET_IF_IN_POOL_OR_WATCH*/
const char* file, /* in: file name */
ulint line, /* in: line where called */
mtr_t* mtr) /* in: mini-transaction */
......@@ -1778,11 +1842,17 @@ buf_page_get_gen(
|| (rw_latch == RW_X_LATCH)
|| (rw_latch == RW_NO_LATCH));
ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
|| (mode == BUF_GET_NO_LATCH) || (mode == BUF_GET_NOWAIT));
/* Check for acceptable modes. */
ut_ad(mode == BUF_GET
|| mode == BUF_GET_IF_IN_POOL
|| mode == BUF_GET_NO_LATCH
|| mode == BUF_GET_NOWAIT
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH);
ut_ad(zip_size == fil_space_get_zip_size(space));
#ifndef UNIV_LOG_DEBUG
ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset));
ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, mtr));
#endif
buf_pool->n_page_gets++;
loop:
......@@ -1818,9 +1888,14 @@ loop2:
if (block == NULL) {
/* Page not in buf_pool: needs to be read from file */
if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
buf_pool_add_watch(space, offset);
}
buf_pool_mutex_exit();
if (mode == BUF_GET_IF_IN_POOL) {
if (mode == BUF_GET_IF_IN_POOL
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
return(NULL);
}
......@@ -1837,7 +1912,18 @@ loop2:
must_read = buf_block_get_io_fix(block) == BUF_IO_READ;
if (must_read && mode == BUF_GET_IF_IN_POOL) {
if (must_read
&& (mode == BUF_GET_IF_IN_POOL
|| mode == BUF_GET_IF_IN_POOL_OR_WATCH)) {
/* The page is being read to buffer pool,
but we can't wait around for the read to
complete. */
if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
buf_pool_add_watch(space, offset);
}
/* The page is only being read to buffer */
buf_pool_mutex_exit();
......@@ -2140,7 +2226,7 @@ buf_page_optimistic_get_func(
ut_ad(!ibuf_inside()
|| ibuf_page(buf_block_get_space(block),
buf_block_get_zip_size(block),
buf_block_get_page_no(block)));
buf_block_get_page_no(block), mtr));
if (rw_latch == RW_S_LATCH) {
success = rw_lock_s_lock_func_nowait(&(block->lock),
......@@ -2392,6 +2478,25 @@ buf_page_init_low(
#endif /* UNIV_DEBUG_FILE_ACCESSES */
}
/************************************************************************
Marks the active watch as having happened when the page identified by
(space, offset) is the page being watched. Does nothing otherwise. The
caller must hold the buffer pool mutex. */
UNIV_INLINE
void
buf_page_notify_watch(
/*==================*/
	ulint	space,	/* in: space id of page read in */
	ulint	offset)	/* in: offset of page read in */
{
	ut_ad(buf_pool_mutex_own());

	/* Nothing to do unless a watch is set on exactly this page. */
	if (!buf_pool->watch_active
	    || space != buf_pool->watch_space
	    || offset != buf_pool->watch_page_no) {

		return;
	}

	buf_pool->watch_happened = TRUE;
}
#ifdef UNIV_HOTBACKUP
/************************************************************************
Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
......@@ -2481,6 +2586,7 @@ buf_page_init(
}
buf_page_init_low(&block->page);
buf_page_notify_watch(space, offset);
ut_ad(!block->page.in_zip_hash);
ut_ad(!block->page.in_page_hash);
......@@ -2531,7 +2637,8 @@ buf_page_init_for_read(
mtr_start(&mtr);
if (!ibuf_page_low(space, zip_size, offset, &mtr)) {
if (!recv_no_ibuf_operations
&& !ibuf_page(space, zip_size, offset, &mtr)) {
mtr_commit(&mtr);
......@@ -2583,7 +2690,9 @@ err_exit2:
if (block) {
bpage = &block->page;
mutex_enter(&block->mutex);
buf_page_init(space, offset, block);
buf_page_notify_watch(space, offset);
/* The block must be put to the LRU list, to the old blocks */
buf_LRU_add_block(bpage, TRUE/* to old blocks */);
......@@ -2650,11 +2759,15 @@ err_exit2:
mutex_enter(&buf_pool_zip_mutex);
UNIV_MEM_DESC(bpage->zip.data,
page_zip_get_size(&bpage->zip), bpage);
buf_page_init_low(bpage);
buf_page_notify_watch(space, offset);
bpage->state = BUF_BLOCK_ZIP_PAGE;
bpage->space = space;
bpage->offset = offset;
#ifdef UNIV_DEBUG
bpage->in_page_hash = FALSE;
bpage->in_zip_hash = FALSE;
......@@ -2748,6 +2861,7 @@ buf_page_create(
mutex_enter(&block->mutex);
buf_page_init(space, offset, block);
buf_page_notify_watch(space, offset);
/* The block must be put to the LRU list */
buf_LRU_add_block(&block->page, FALSE);
......@@ -3539,7 +3653,7 @@ buf_print_io(
fprintf(file,
"Buffer pool size %lu\n"
"Free buffers %lu\n"
"Free buffers %lu\n"
"Database pages %lu\n"
"Modified db pages %lu\n"
"Pending reads %lu\n"
......
......@@ -497,7 +497,7 @@ loop:
if (!buf_lru_switched_on_innodb_mon) {
/* Over 67 % of the buffer pool is occupied by lock
/* Over 67 % of the buffer pool is occupied by lock
heaps or the adaptive hash index. This may be a memory
leak! */
......
......@@ -191,8 +191,6 @@ struct fil_space_struct {
currently in the list above */
UT_LIST_NODE_T(fil_space_t) space_list;
/* list of all spaces */
ibuf_data_t* ibuf_data;
/* insert buffer data */
ulint magic_n;
};
......@@ -476,33 +474,6 @@ fil_space_get_type(
return(space->purpose);
}
/***********************************************************************
Looks up a file space by id and returns the ibuf data attached to it.
Only space id 0 is accepted. */
UNIV_INTERN
ibuf_data_t*
fil_space_get_ibuf_data(
/*====================*/
			/* out: ibuf data for this space */
	ulint	id)	/* in: space id */
{
	fil_system_t*	fsys = fil_system;
	fil_space_t*	found;

	ut_ad(fsys);
	ut_a(id == 0);

	/* The lookup itself is done under the fil_system mutex. */
	mutex_enter(&(fsys->mutex));
	found = fil_space_get_by_id(id);
	mutex_exit(&(fsys->mutex));

	/* The space must exist. */
	ut_a(found);

	return(found->ibuf_data);
}
/**************************************************************************
Checks if all the file nodes in a space are flushed. The caller must hold
the fil_system mutex. */
......@@ -1183,8 +1154,6 @@ try_again:
UT_LIST_INIT(space->chain);
space->magic_n = FIL_SPACE_MAGIC_N;
space->ibuf_data = NULL;
rw_lock_create(&space->latch, SYNC_FSP);
HASH_INSERT(fil_space_t, hash, system->spaces, id, space);
......@@ -1649,25 +1618,6 @@ fil_set_max_space_id_if_bigger(
mutex_exit(&(system->mutex));
}
/********************************************************************
Creates the ibuf data structure for the first space in the fil_system
space list (space 0 == the system tablespace) and stores it in that
space object. This can be called after the file space headers have been
created and the dictionary system has been initialized. */
UNIV_INTERN
void
fil_ibuf_init_at_db_start(void)
/*===========================*/
{
	fil_space_t*	first_space = UT_LIST_GET_FIRST(fil_system->space_list);

	/* The first space must exist and be an ordinary tablespace. */
	ut_a(first_space);
	ut_a(first_space->purpose == FIL_TABLESPACE);

	first_space->ibuf_data = ibuf_data_init_for_space(first_space->id);
}
/********************************************************************
Writes the flushed lsn and the latest archived log number to the page header
of the first page of a data file of the system tablespace (space 0),
......@@ -4266,13 +4216,13 @@ fil_io(
|| sync || is_log);
#ifdef UNIV_SYNC_DEBUG
ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
|| ibuf_page(space_id, zip_size, block_offset));
|| ibuf_page(space_id, zip_size, block_offset, NULL));
#endif
#endif
if (sync) {
mode = OS_AIO_SYNC;
} else if (type == OS_FILE_READ && !is_log
&& ibuf_page(space_id, zip_size, block_offset)) {
&& ibuf_page(space_id, zip_size, block_offset, NULL)) {
mode = OS_AIO_IBUF;
} else if (is_log) {
mode = OS_AIO_LOG;
......
......@@ -2191,8 +2191,8 @@ fseg_create_general(
/* This thread did not own the latch before this call: free
excess pages from the insert buffer free list */
if (space == 0) {
ibuf_free_excess_pages(0);
if (space == IBUF_SPACE_ID) {
ibuf_free_excess_pages();
}
}
......@@ -2759,8 +2759,8 @@ fseg_alloc_free_page_general(
/* This thread did not own the latch before this call: free
excess pages from the insert buffer free list */
if (space == 0) {
ibuf_free_excess_pages(0);
if (space == IBUF_SPACE_ID) {
ibuf_free_excess_pages();
}
}
......
......@@ -62,7 +62,28 @@ is in the compact format. The presence of this marker can be detected by
looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
The high-order bit of the character set field in the type info is the
"nullable" flag for the field. */
"nullable" flag for the field.
In versions >= TODO:
The optional marker byte at the start of the fourth field is replaced by
mandatory 3 fields, totaling 4 bytes:
1. 2 bytes: Counter field, used to sort records within a (space id, page
no) in the order they were added. This is needed so that for example the
sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
correctly.
2. 1 byte: Operation type (see ibuf_op_t).
3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
To ensure older records, which do not have counters to enforce correct
sorting, are merged before any new records, ibuf_insert checks if we're
trying to insert to a position that contains old-style records, and if so,
refuses the insert. Thus, ibuf pages are gradually converted to the new
format as their corresponding buffer pool pages are read into memory.
*/
/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
......@@ -137,17 +158,18 @@ access order rules. */
/* Buffer pool size per the maximum insert buffer size */
#define IBUF_POOL_SIZE_PER_MAX_SIZE 2
/* Table name for the insert buffer. */
#define IBUF_TABLE_NAME "SYS_IBUF_TABLE"
/* The insert buffer control structure */
UNIV_INTERN ibuf_t* ibuf = NULL;
static ulint ibuf_rnd = 986058871;
UNIV_INTERN ulint ibuf_flush_count = 0;
#ifdef UNIV_IBUF_COUNT_DEBUG
/* Dimensions for the ibuf_count array */
#define IBUF_COUNT_N_SPACES 500
#define IBUF_COUNT_N_PAGES 2000
#define IBUF_COUNT_N_SPACES 4
#define IBUF_COUNT_N_PAGES 130000
/* Buffered entry counts for file pages, used in debugging */
static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES];
......@@ -192,6 +214,22 @@ ibuf_count_check(
# error "IBUF_BITS_PER_PAGE must be an even number!"
#endif
/* Various constants for checking the type of an ibuf record and extracting
data from it. For details, see the description of the record format at the
top of this file. */
#define IBUF_REC_INFO_SIZE 4 /* Combined size of info fields at
the beginning of the fourth field */
/* Offsets for the fields at the beginning of the fourth field */
#define IBUF_REC_OFFSET_COUNTER 0
#define IBUF_REC_OFFSET_TYPE 2
#define IBUF_REC_OFFSET_FLAGS 3
/* Record flag masks */
#define IBUF_REC_COMPACT 0x1 /* Whether the record is compact */
/* The mutex used to block pessimistic inserts to ibuf trees */
static mutex_t ibuf_pessimistic_insert_mutex;
......@@ -230,15 +268,6 @@ because ibuf merge is done to a page when it is read in, and it is
still physically like the index page even if the index would have been
dropped! So, there seems to be no problem. */
#ifdef UNIV_DEBUG
/**********************************************************************
Validates the ibuf data structures when the caller owns ibuf_mutex. */
static
ibool
ibuf_validate_low(void);
/*===================*/
/* out: TRUE if ok */
#endif /* UNIV_DEBUG */
/**********************************************************************
Sets the flag in the current OS thread local storage denoting that it is
inside an insert buffer routine. */
......@@ -293,17 +322,14 @@ page_t*
ibuf_header_page_get(
/*=================*/
/* out: insert buffer header page */
ulint space, /* in: space id */
mtr_t* mtr) /* in: mtr */
{
buf_block_t* block;
ut_a(space == 0);
ut_ad(!ibuf_inside());
block = buf_page_get(space, 0, FSP_IBUF_HEADER_PAGE_NO,
RW_X_LATCH, mtr);
block = buf_page_get(
IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
......@@ -319,19 +345,17 @@ page_t*
ibuf_tree_root_get(
/*===============*/
/* out: insert buffer tree root page */
ibuf_data_t* data, /* in: ibuf data */
ulint space, /* in: space id */
mtr_t* mtr) /* in: mtr */
{
buf_block_t* block;
ut_a(space == 0);
ut_ad(ibuf_inside());
mtr_x_lock(dict_index_get_lock(data->index), mtr);
mtr_x_lock(dict_index_get_lock(ibuf->index), mtr);
block = buf_page_get(
IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr);
block = buf_page_get(space, 0, FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH,
mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
......@@ -374,116 +398,71 @@ ibuf_count_set(
#endif
/**********************************************************************
Allocates the global insert buffer control structure at database
startup, initializes its fields and the ibuf mutexes, and then
initializes the per-space ibuf data through fil_ibuf_init_at_db_start(). */
UNIV_INTERN
void
ibuf_init_at_db_start(void)
/*=======================*/
{
	ibuf = mem_alloc(sizeof(*ibuf));

	/* Start with an empty ibuf and an empty list of ibuf data structs. */
	ibuf->size = 0;
	UT_LIST_INIT(ibuf->data_list);

	/* Note that also a pessimistic delete can sometimes make a B-tree
	grow in size, as the references on the upper levels of the tree can
	change */
	ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
		/ IBUF_POOL_SIZE_PER_MAX_SIZE;

	mutex_create(&ibuf_pessimistic_insert_mutex,
		     SYNC_IBUF_PESS_INSERT_MUTEX);
	mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX);
	mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);

	fil_ibuf_init_at_db_start();
}
/**********************************************************************
Updates the size information in an ibuf data, assuming the segment size has
not changed. */
Updates the size information of the ibuf, assuming the segment size has not
changed. */
static
void
ibuf_data_sizes_update(
/*===================*/
ibuf_data_t* data, /* in: ibuf data struct */
const page_t* root, /* in: ibuf tree root */
ibuf_size_update(
/*=============*/
page_t* root, /* in: ibuf tree root */
mtr_t* mtr) /* in: mtr */
{
ulint old_size;
#ifdef UNIV_SYNC_DEBUG
ut_ad(mutex_own(&ibuf_mutex));
#endif /* UNIV_SYNC_DEBUG */
old_size = data->size;
data->free_list_len = flst_get_len(root + PAGE_HEADER
ibuf->free_list_len = flst_get_len(root + PAGE_HEADER
+ PAGE_BTR_IBUF_FREE_LIST, mtr);
data->height = 1 + btr_page_get_level(root, mtr);
ibuf->height = 1 + btr_page_get_level(root, mtr);
data->size = data->seg_size - (1 + data->free_list_len);
/* the '1 +' is the ibuf header page */
ut_ad(data->size < data->seg_size);
if (page_get_n_recs(root) == 0) {
data->empty = TRUE;
} else {
data->empty = FALSE;
}
ut_ad(ibuf->size + data->size >= old_size);
ibuf->size = ibuf->size + data->size - old_size;
ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
#if 0
fprintf(stderr, "ibuf size %lu, space ibuf size %lu\n",
ibuf->size, data->size);
#endif
ibuf->empty = page_get_n_recs(root) == 0;
}
/**********************************************************************
Creates the insert buffer data struct for a single tablespace. Reads the
root page of the insert buffer tree in the tablespace. This function can
be called only after the dictionary system has been initialized, as this
creates also the insert buffer table and index into this tablespace. */
Creates the insert buffer data structure at a database startup and initializes
the data structures for the insert buffer. */
UNIV_INTERN
ibuf_data_t*
ibuf_data_init_for_space(
/*=====================*/
/* out, own: ibuf data struct, linked to the list
in ibuf control structure */
ulint space) /* in: space id */
void
ibuf_init_at_db_start(void)
/*=======================*/
{
ibuf_data_t* data;
page_t* root;
page_t* header_page;
mtr_t mtr;
char* buf;
mem_heap_t* heap;
dict_table_t* table;
mem_heap_t* heap;
dict_index_t* index;
ulint n_used;
ulint error;
page_t* header_page;
ut_a(space == 0);
ibuf = mem_alloc(sizeof(ibuf_t));
memset(ibuf, 0, sizeof(*ibuf));
/* Note that also a pessimistic delete can sometimes make a B-tree
grow in size, as the references on the upper levels of the tree can
change */
ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
/ IBUF_POOL_SIZE_PER_MAX_SIZE;
mutex_create(&ibuf_pessimistic_insert_mutex,
SYNC_IBUF_PESS_INSERT_MUTEX);
data = mem_alloc(sizeof(ibuf_data_t));
mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX);
data->space = space;
mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
mtr_start(&mtr);
mutex_enter(&ibuf_mutex);
mtr_x_lock(fil_space_get_latch(space, NULL), &mtr);
mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr);
header_page = ibuf_header_page_get(space, &mtr);
header_page = ibuf_header_page_get(&mtr);
fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
&n_used, &mtr);
......@@ -491,33 +470,23 @@ ibuf_data_init_for_space(
ut_ad(n_used >= 2);
data->seg_size = n_used;
ibuf->seg_size = n_used;
{
buf_block_t* block = buf_page_get(
space, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
buf_block_t* block;
block = buf_page_get(
IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
RW_X_LATCH, &mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
root = buf_block_get_frame(block);
}
data->size = 0;
data->n_inserts = 0;
data->n_merges = 0;
data->n_merged_recs = 0;
ibuf_data_sizes_update(data, root, &mtr);
/*
if (!data->empty) {
fprintf(stderr,
"InnoDB: index entries found in the insert buffer\n");
} else {
fprintf(stderr,
"InnoDB: insert buffer empty\n");
}
*/
ibuf_size_update(root, &mtr);
mutex_exit(&ibuf_mutex);
mtr_commit(&mtr);
......@@ -525,42 +494,28 @@ ibuf_data_init_for_space(
ibuf_exit();
heap = mem_heap_create(450);
buf = mem_heap_alloc(heap, 50);
sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space);
/* use old-style record format for the insert buffer */
table = dict_mem_table_create(buf, space, 2, 0);
/* Use old-style record format for the insert buffer. */
table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0);
dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_BINARY, 0, 0);
dict_mem_table_add_col(table, heap, "TYPES", DATA_BINARY, 0, 0);
dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
table->id = ut_dulint_add(DICT_IBUF_ID_MIN, space);
table->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
dict_table_add_to_cache(table, heap);
mem_heap_free(heap);
index = dict_mem_index_create(
buf, "CLUST_IND", space,
DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 2);
dict_mem_index_add_field(index, "PAGE_NO", 0);
dict_mem_index_add_field(index, "TYPES", 0);
IBUF_TABLE_NAME, "CLUST_IND",
IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1);
index->id = ut_dulint_add(DICT_IBUF_ID_MIN, space);
dict_mem_index_add_field(index, "DUMMY_COLUMN", 0);
error = dict_index_add_to_cache(table, index,
FSP_IBUF_TREE_ROOT_PAGE_NO);
ut_a(error == DB_SUCCESS);
index->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
data->index = dict_table_get_first_index(table);
dict_index_add_to_cache(table, index, FSP_IBUF_TREE_ROOT_PAGE_NO);
mutex_enter(&ibuf_mutex);
UT_LIST_ADD_LAST(data_list, ibuf->data_list, data);
mutex_exit(&ibuf_mutex);
return(data);
ibuf->index = dict_table_get_first_index(table);
}
/*************************************************************************
......@@ -605,7 +560,7 @@ ibuf_parse_bitmap_init(
/*===================*/
/* out: end of log record or NULL */
byte* ptr, /* in: buffer */
byte* end_ptr __attribute__((unused)), /* in: buffer end */
byte* end_ptr UNIV_UNUSED, /* in: buffer end */
buf_block_t* block, /* in: block or NULL */
mtr_t* mtr) /* in: mtr or NULL */
{
......@@ -631,7 +586,7 @@ ibuf_bitmap_page_get_bits(
0 for uncompressed pages */
ulint bit, /* in: IBUF_BITMAP_FREE,
IBUF_BITMAP_BUFFERED, ... */
mtr_t* mtr __attribute__((unused)))
mtr_t* mtr UNIV_UNUSED)
/* in: mtr containing an
x-latch to the bitmap page */
{
......@@ -929,10 +884,8 @@ ibuf_update_free_bits_low(
performed to the page */
mtr_t* mtr) /* in/out: mtr */
{
ulint before;
ulint after;
ut_a(!buf_block_get_page_zip(block));
ulint before;
before = ibuf_index_page_calc_free_bits(0, max_ins_size);
......@@ -1033,7 +986,7 @@ ibuf_fixed_addr_page(
0 for uncompressed pages */
ulint page_no)/* in: page number */
{
return((space == 0 && page_no == IBUF_TREE_ROOT_PAGE_NO)
return((space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO)
|| ibuf_bitmap_page(zip_size, page_no));
}
......@@ -1046,68 +999,42 @@ ibuf_page(
/* out: TRUE if level 2 or level 3 page */
ulint space, /* in: space id */
ulint zip_size,/* in: compressed page size in bytes, or 0 */
ulint page_no)/* in: page number */
ulint page_no,/* in: page number */
mtr_t* mtr) /* in: mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed
address ibuf pages, or NULL, in which case a new
transaction is created. */
{
page_t* bitmap_page;
mtr_t mtr;
ibool ret;
if (recv_no_ibuf_operations) {
/* Recovery is running: no ibuf operations should be
performed */
return(FALSE);
}
mtr_t mtr_local;
page_t* bitmap_page;
ibool use_local_mtr = (mtr == NULL);
if (ibuf_fixed_addr_page(space, zip_size, page_no)) {
return(TRUE);
}
if (space != 0) {
/* Currently we only have an ibuf tree in space 0 */
} else if (space != IBUF_SPACE_ID) {
return(FALSE);
}
ut_ad(fil_space_get_type(space) == FIL_TABLESPACE);
ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE);
mtr_start(&mtr);
if (use_local_mtr) {
mtr = &mtr_local;
mtr_start(mtr);
}
bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
ret = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_IBUF, &mtr);
mtr_commit(&mtr);
IBUF_BITMAP_IBUF, mtr);
return(ret);
}
/***************************************************************************
Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. */
UNIV_INTERN
ibool
ibuf_page_low(
/*==========*/
/* out: TRUE if level 2 or level 3 page */
ulint space, /* in: space id */
ulint zip_size,/* in: compressed page size in bytes, or 0 */
ulint page_no,/* in: page number */
mtr_t* mtr) /* in: mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed
address ibuf pages */
{
page_t* bitmap_page;
if (ibuf_fixed_addr_page(space, zip_size, page_no)) {
return(TRUE);
if (use_local_mtr) {
mtr_commit(mtr);
}
bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
return(ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_IBUF, mtr));
return(ret);
}
/************************************************************************
......@@ -1178,6 +1105,185 @@ ibuf_rec_get_space(
return(0);
}
/********************************************************************
Get various information about an ibuf record. Each output pointer is
optional: pass NULL for any value the caller is not interested in. The
layout of the record is deduced from the length of its fourth field
modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE. */
static
void
ibuf_rec_get_info(
/*==============*/
	const rec_t*	rec,		/* in: ibuf record */
	ibuf_op_t*	op,		/* out: operation type, or NULL */
	ibool*		comp,		/* out: compact flag, or NULL */
	ulint*		info_len,	/* out: length of info fields at the
					start of the fourth field, or
					NULL */
	ulint*		counter)	/* out: counter value, or NULL;
					ULINT_UNDEFINED is stored if the
					record has no counter field */
{
	const byte*	types;
	ulint		fields;
	ulint		len;
	ulint		mod;

	/* Local variables to shadow arguments. */
	ibuf_op_t	op_local;
	ibool		comp_local;
	ulint		info_len_local;
	/* Only the IBUF_REC_INFO_SIZE branch below reads a counter from
	the record. Start from a defined value so that a caller passing a
	non-NULL counter for any other layout never receives an
	uninitialized value when the debug assertions are compiled out. */
	ulint		counter_local = ULINT_UNDEFINED;

	ut_ad(ibuf_inside());
	fields = rec_get_n_fields_old(rec);
	ut_a(fields > 4);

	types = rec_get_nth_field_old(rec, 3, &len);

	mod = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;

	if (mod == 0) {
		/* No info bytes at all: treated as a non-compact insert. */
		op_local = IBUF_OP_INSERT;
		comp_local = FALSE;
		info_len_local = 0;
		ut_ad(!counter);
	} else if (mod == 1) {
		/* A single info byte: treated as a compact insert. */
		op_local = IBUF_OP_INSERT;
		comp_local = TRUE;
		info_len_local = 1;
		ut_ad(!counter);
	} else if (mod == IBUF_REC_INFO_SIZE) {
		/* Full info block: operation type, flags and counter are
		all stored at the start of the field. */
		op_local = (ibuf_op_t)types[IBUF_REC_OFFSET_TYPE];
		comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
		info_len_local = IBUF_REC_INFO_SIZE;

		counter_local = mach_read_from_2(
			types + IBUF_REC_OFFSET_COUNTER);
	} else {
		ut_error;
	}

	ut_a(op_local < IBUF_OP_COUNT);

	/* What remains after the info bytes must be exactly one type
	descriptor per user field (all fields after the first four). */
	ut_a((len - info_len_local) ==
	     (fields - 4) * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);

	if (op) {
		*op = op_local;
	}

	if (comp) {
		*comp = comp_local;
	}

	if (info_len) {
		*info_len = info_len_local;
	}

	if (counter) {
		*counter = counter_local;
	}
}
/********************************************************************
Returns the operation type field of an ibuf record. A record whose
second field is longer than one byte is reported as IBUF_OP_INSERT;
for any other record the type is obtained from ibuf_rec_get_info(). */
static
ibuf_op_t
ibuf_rec_get_op_type(
/*=================*/
			/* out: operation type */
	rec_t*	rec)	/* in: ibuf record */
{
	ibuf_op_t	op_type;
	ulint		marker_len;

	ut_ad(ibuf_inside());
	ut_ad(rec_get_n_fields_old(rec) > 2);

	/* Only the length of the second field matters here. */
	(void) rec_get_nth_field_old(rec, 1, &marker_len);

	if (marker_len > 1) {
		/* This is a < 4.1.x format record */

		return(IBUF_OP_INSERT);
	}

	ibuf_rec_get_info(rec, &op_type, NULL, NULL, NULL);

	return(op_type);
}
/********************************************************************
Read the first two bytes from a record's fourth field (counter field in new
records; something else in older records). */
ulint
ibuf_rec_get_fake_counter(
/*======================*/
			/* out: "counter" field, or ULINT_UNDEFINED if
			the record has fewer than four fields or the
			fourth field is shorter than two bytes */
	rec_t*	rec)	/* in: ibuf record */
{
	byte*	field;
	ulint	field_len;
	ulint	value = ULINT_UNDEFINED;

	if (rec_get_n_fields_old(rec) >= 4) {

		field = rec_get_nth_field_old(rec, 3, &field_len);

		if (field_len >= 2) {
			value = mach_read_from_2(field);
		}
	}

	return(value);
}
/********************************************************************
Add accumulated operation counts to a permanent array, element by
element. Both arrays must be of size IBUF_OP_COUNT. */
static
void
ibuf_add_ops(
/*=========*/
	ulint*	arr,	/* in/out: array to modify */
	ulint*	ops)	/* in: operation counts */
{
	ulint*	dst = arr;
	ulint*	src = ops;
	ulint*	dst_end = arr + IBUF_OP_COUNT;

	for (; dst != dst_end; dst++, src++) {
		*dst += *src;
	}
}
/********************************************************************
Print operation counts as a comma-separated list of "name count" pairs,
without a trailing separator or newline. The array must be of size
IBUF_OP_COUNT. */
static
void
ibuf_print_ops(
/*===========*/
	ulint*	ops,	/* in: operation counts */
	FILE*	file)	/* in: file where to print */
{
	static const char* op_names[] = {
		"insert",
		"delete mark",
		"delete"
	};
	ulint	i;

	/* The name table must have one entry per operation type. */
	ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT);

	for (i = 0; i < IBUF_OP_COUNT; i++) {
		const char*	separator = "";

		if (i + 1 < IBUF_OP_COUNT) {
			/* Not the last entry yet. */
			separator = ", ";
		}

		fprintf(file, "%s %lu%s", op_names[i],
			(ulong) ops[i], separator);
	}
}
/************************************************************************
Creates a dummy index for inserting a record to a non-clustered index.
*/
......@@ -1192,12 +1298,11 @@ ibuf_dummy_index_create(
dict_table_t* table;
dict_index_t* index;
table = dict_mem_table_create("IBUF_DUMMY",
DICT_HDR_SPACE, n,
comp ? DICT_TF_COMPACT : 0);
table = dict_mem_table_create(
"IBUF_DUMMY", DICT_HDR_SPACE, n, comp ? DICT_TF_COMPACT : 0);
index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
DICT_HDR_SPACE, 0, n);
index = dict_mem_index_create(
"IBUF_DUMMY", "IBUF_DUMMY", DICT_HDR_SPACE, 0, n);
index->table = table;
......@@ -1217,12 +1322,14 @@ ibuf_dummy_index_add_col(
ulint len) /* in: length of the column */
{
ulint i = index->table->n_def;
dict_mem_table_add_col(index->table, NULL, NULL,
dtype_get_mtype(type),
dtype_get_prtype(type),
dtype_get_len(type));
dict_index_add_col(index, index->table,
dict_table_get_nth_col(index->table, i), len);
dict_mem_table_add_col(
index->table, NULL, NULL, dtype_get_mtype(type),
dtype_get_prtype(type), dtype_get_len(type));
dict_index_add_col(
index, index->table,
dict_table_get_nth_col(index->table, i), len);
}
/************************************************************************
Deallocates a dummy index for inserting a record to a non-clustered index.
......@@ -1242,6 +1349,67 @@ ibuf_dummy_index_free(
/*************************************************************************
Builds the entry to insert into a non-clustered index when we have the
corresponding record in an ibuf index. The record is read with the layout
used here for < 4.1.x records: the second field holds the type info and
the user fields start at the third field. */
UNIV_INLINE
dtuple_t*
ibuf_build_entry_pre_4_1_x(
/*=======================*/
					/* out, own: entry to insert to
					a non-clustered index; NOTE that
					as we copy pointers to fields in
					ibuf_rec, the caller must hold a
					latch to the ibuf_rec page as long
					as the entry is used! */
	const rec_t*	ibuf_rec,	/* in: record in an insert buffer */
	mem_heap_t*	heap,		/* in: heap where built */
	dict_index_t**	pindex)		/* out, own: dummy index that
					describes the entry */
{
	const byte*	type_info;
	ulint		type_info_len;
	ulint		n_user_fields;
	ulint		field_no;
	dtuple_t*	entry;

	ut_a(trx_doublewrite_must_reset_space_ids);
	ut_a(!trx_sys_multiple_tablespace_format);

	/* The first two fields of the record are not user data. */
	n_user_fields = rec_get_n_fields_old(ibuf_rec) - 2;

	entry = dtuple_create(heap, n_user_fields);

	type_info = rec_get_nth_field_old(ibuf_rec, 1, &type_info_len);

	ut_a(type_info_len == n_user_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);

	for (field_no = 0; field_no < n_user_fields; field_no++) {
		ulint		data_len;
		dfield_t*	dfield;
		const byte*	data;

		dfield = dtuple_get_nth_field(entry, field_no);

		/* Point the tuple field at the data inside the record;
		nothing is copied. */
		data = rec_get_nth_field_old(ibuf_rec, field_no + 2,
					     &data_len);

		dfield_set_data(dfield, data, data_len);

		dtype_read_for_order_and_null_size(
			dfield_get_type(dfield),
			type_info
			+ field_no * DATA_ORDER_NULL_TYPE_BUF_SIZE);
	}

	*pindex = ibuf_dummy_index_create(n_user_fields, FALSE);

	return(entry);
}
/*************************************************************************
Builds the entry used to
1) IBUF_OP_INSERT: insert into a non-clustered index
2) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
activate
3) IBUF_OP_DELETE: find the record we need to delete
when we have the corresponding record in an ibuf index. */
static
dtuple_t*
ibuf_build_entry_from_ibuf_rec(
......@@ -1263,7 +1431,9 @@ ibuf_build_entry_from_ibuf_rec(
const byte* types;
const byte* data;
ulint len;
ulint info_len;
ulint i;
ulint comp;
dict_index_t* index;
data = rec_get_nth_field_old(ibuf_rec, 1, &len);
......@@ -1271,29 +1441,7 @@ ibuf_build_entry_from_ibuf_rec(
if (len > 1) {
/* This is a < 4.1.x format record */
ut_a(trx_doublewrite_must_reset_space_ids);
ut_a(!trx_sys_multiple_tablespace_format);
n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
tuple = dtuple_create(heap, n_fields);
types = rec_get_nth_field_old(ibuf_rec, 1, &len);
ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
for (i = 0; i < n_fields; i++) {
field = dtuple_get_nth_field(tuple, i);
data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
dfield_set_data(field, data, len);
dtype_read_for_order_and_null_size(
dfield_get_type(field),
types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE);
}
*pindex = ibuf_dummy_index_create(n_fields, FALSE);
return(tuple);
return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex));
}
/* This is a >= 4.1.x format record */
......@@ -1308,16 +1456,12 @@ ibuf_build_entry_from_ibuf_rec(
types = rec_get_nth_field_old(ibuf_rec, 3, &len);
ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);
index = ibuf_dummy_index_create(
n_fields, len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
ibuf_rec_get_info(ibuf_rec, NULL, &comp, &info_len, NULL);
if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
/* compact record format */
len--;
ut_a(*types == 0);
types++;
}
index = ibuf_dummy_index_create(n_fields, comp);
len -= info_len;
types += info_len;
ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
......@@ -1343,9 +1487,51 @@ ibuf_build_entry_from_ibuf_rec(
ut_d(dict_table_add_system_columns(index->table, index->table->heap));
*pindex = index;
return(tuple);
}
/**********************************************************************
Get the data size: the sum of the lengths of the user fields of an ibuf
record. A field that is SQL NULL contributes the SQL NULL size of its
type, which is decoded from the type info array using the decoder and
entry size that match the record format. */
UNIV_INLINE
ulint
ibuf_rec_get_size(
/*==============*/
					/* out: size of fields */
	const rec_t*	rec,		/* in: ibuf record */
	const byte*	types,		/* in: type info of the fields,
					pointing at the entry of the first
					user field */
	ulint		n_fields,	/* in: number of fields */
	ibool		new_format)	/* in: TRUE if the user fields start
					at the fifth field and the type info
					uses DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
					byte entries; FALSE if they start at
					the third field and the entries are
					DATA_ORDER_NULL_TYPE_BUF_SIZE bytes */
{
	ulint	i;
	ulint	offset;
	ulint	type_size;
	ulint	size = 0;

	if (new_format) {
		/* Four leading fields precede the user fields. */
		offset = 4;
		type_size = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
	} else {
		/* Two leading fields precede the user fields. */
		offset = 2;
		type_size = DATA_ORDER_NULL_TYPE_BUF_SIZE;
	}

	for (i = 0; i < n_fields; i++) {
		ulint	len;

		/* Only the length of the field is needed. */
		(void) rec_get_nth_field_old(rec, i + offset, &len);

		if (len != UNIV_SQL_NULL) {
			size += len;
		} else {
			dtype_t		dtype;
			const byte*	type_info = types + i * type_size;

			/* The two formats encode the type differently, so
			the decoder has to match the entry size chosen
			above. */
			if (new_format) {
				dtype_new_read_for_order_and_null_size(
					&dtype, type_info);
			} else {
				dtype_read_for_order_and_null_size(
					&dtype, type_info);
			}

			size += dtype_get_sql_null_size(&dtype);
		}
	}

	return(size);
}
/************************************************************************
Returns the space taken by a stored non-clustered index entry if converted to
an index record. */
......@@ -1358,14 +1544,12 @@ ibuf_rec_get_volume(
page directory */
const rec_t* ibuf_rec)/* in: ibuf record */
{
dtype_t dtype;
ibool new_format = FALSE;
ulint data_size = 0;
ulint n_fields;
const byte* types;
const byte* data;
ulint len;
ulint i;
const byte* data;
const byte* types;
ulint n_fields;
ulint data_size = 0;
ibool new_format = FALSE;
ut_ad(ibuf_inside());
ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
......@@ -1383,54 +1567,52 @@ ibuf_rec_get_volume(
types = rec_get_nth_field_old(ibuf_rec, 1, &len);
ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
} else {
/* >= 4.1.x format record */
ibuf_op_t op;
ibool comp;
ulint info_len;
ut_a(trx_sys_multiple_tablespace_format);
ut_a(*data == 0);
types = rec_get_nth_field_old(ibuf_rec, 3, &len);
ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);
if (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
/* compact record format */
ibuf_rec_get_info(ibuf_rec, &op, &comp, &info_len, NULL);
if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
/* Delete-marking a record doesn't take any
additional space, and while deleting a record
actually frees up space, we have to play it safe and
pretend it takes no additional space (the record
might not exist, etc.). */
return(0);
} else if (comp) {
dtuple_t* entry;
ulint volume;
dict_index_t* dummy_index;
mem_heap_t* heap = mem_heap_create(500);
dtuple_t* entry = ibuf_build_entry_from_ibuf_rec(
entry = ibuf_build_entry_from_ibuf_rec(
ibuf_rec, heap, &dummy_index);
volume = rec_get_converted_size(dummy_index, entry, 0);
ibuf_dummy_index_free(dummy_index);
mem_heap_free(heap);
return(volume + page_dir_calc_reserved_space(1));
}
types += info_len;
n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
new_format = TRUE;
}
for (i = 0; i < n_fields; i++) {
if (new_format) {
data = rec_get_nth_field_old(ibuf_rec, i + 4, &len);
dtype_new_read_for_order_and_null_size(
&dtype, types + i
* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
} else {
data = rec_get_nth_field_old(ibuf_rec, i + 2, &len);
dtype_read_for_order_and_null_size(
&dtype, types + i
* DATA_ORDER_NULL_TYPE_BUF_SIZE);
}
if (len == UNIV_SQL_NULL) {
data_size += dtype_get_sql_null_size(&dtype);
} else {
data_size += len;
}
}
data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, new_format);
return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+ page_dir_calc_reserved_space(1));
......@@ -1447,40 +1629,38 @@ ibuf_entry_build(
index tree; NOTE that the original entry
must be kept because we copy pointers to its
fields */
ibuf_op_t op, /* in: operation type */
dict_index_t* index, /* in: non-clustered index */
const dtuple_t* entry, /* in: entry for a non-clustered index */
ulint space, /* in: space id */
ulint page_no,/* in: index page number where entry should
be inserted */
ulint counter,/* in: counter value */
mem_heap_t* heap) /* in: heap into which to build */
{
dtuple_t* tuple;
dfield_t* field;
const dfield_t* entry_field;
ulint n_fields;
ulint type_info_size;
byte* buf;
byte* buf2;
ulint i;
/* Starting from 4.1.x, we have to build a tuple whose
(1) first field is the space id,
(2) the second field a single marker byte (0) to tell that this
is a new format record,
(3) the third contains the page number, and
(4) the fourth contains the relevent type information of each data
field; the length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is
(a) 0 for b-trees in the old format, and
(b) 1 for b-trees in the compact format, the first byte of the field
being the marker (0);
(5) and the rest of the fields are copied from entry. All fields
in the tuple are ordered like the type binary in our insert buffer
tree. */
/* We have to build a tuple with the following fields:
1-4) These are described at the top of this file.
5) The rest of the fields are copied from the entry.
All fields in the tuple are ordered like the type binary in our
insert buffer tree. */
n_fields = dtuple_get_n_fields(entry);
tuple = dtuple_create(heap, n_fields + 4);
/* Store the space id in tuple */
/* 1) Space Id */
field = dtuple_get_nth_field(tuple, 0);
......@@ -1490,7 +1670,7 @@ ibuf_entry_build(
dfield_set_data(field, buf, 4);
/* Store the marker byte field in tuple */
/* 2) Marker byte */
field = dtuple_get_nth_field(tuple, 1);
......@@ -1502,7 +1682,7 @@ ibuf_entry_build(
dfield_set_data(field, buf, 1);
/* Store the page number in tuple */
/* 3) Page number */
field = dtuple_get_nth_field(tuple, 2);
......@@ -1512,14 +1692,20 @@ ibuf_entry_build(
dfield_set_data(field, buf, 4);
/* Store the type info in buf2, and add the fields from entry to
tuple */
buf2 = mem_heap_alloc(heap, n_fields
* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ dict_table_is_comp(index->table));
if (dict_table_is_comp(index->table)) {
*buf2++ = 0; /* write the compact format indicator */
}
/* 4) Type info, part #1 */
type_info_size = IBUF_REC_INFO_SIZE
+ n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
buf2 = mem_heap_alloc(heap, type_info_size);
mach_write_to_2(buf2 + IBUF_REC_OFFSET_COUNTER, counter);
buf2[IBUF_REC_OFFSET_TYPE] = (byte) op;
buf2[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
? IBUF_REC_COMPACT : 0;
/* 5+) Fields from the entry */
for (i = 0; i < n_fields; i++) {
ulint fixed_len;
const dict_field_t* ifield;
......@@ -1554,21 +1740,17 @@ ibuf_entry_build(
#endif /* UNIV_DEBUG */
dtype_new_store_for_order_and_null_size(
buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
buf2 + IBUF_REC_INFO_SIZE
+ i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
dfield_get_type(entry_field), fixed_len);
}
/* Store the type info in buf2 to field 3 of tuple */
/* 4) Type info, part #2 */
field = dtuple_get_nth_field(tuple, 3);
if (dict_table_is_comp(index->table)) {
buf2--;
}
dfield_set_data(field, buf2, type_info_size);
dfield_set_data(field, buf2, n_fields
* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+ dict_table_is_comp(index->table));
/* Set all the types in the new tuple binary */
dtuple_set_types_binary(tuple, n_fields + 4);
......@@ -1673,10 +1855,9 @@ Checks if there are enough pages in the free list of the ibuf tree that we
dare to start a pessimistic insert to the insert buffer. */
UNIV_INLINE
ibool
ibuf_data_enough_free_for_insert(
/*=============================*/
ibuf_data_enough_free_for_insert(void)
/*==================================*/
/* out: TRUE if enough free pages in list */
ibuf_data_t* data) /* in: ibuf data for the space */
{
ut_ad(mutex_own(&ibuf_mutex));
......@@ -1686,7 +1867,7 @@ ibuf_data_enough_free_for_insert(
inserts buffered for pages that we read to the buffer pool, without
any risk of running out of free space in the insert buffer. */
return(data->free_list_len >= data->size / 2 + 3 * data->height);
return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height);
}
/*************************************************************************
......@@ -1694,14 +1875,13 @@ Checks if there are enough pages in the free list of the ibuf tree that we
should remove them and free to the file space management. */
UNIV_INLINE
ibool
ibuf_data_too_much_free(
/*====================*/
ibuf_data_too_much_free(void)
/*=========================*/
/* out: TRUE if enough free pages in list */
ibuf_data_t* data) /* in: ibuf data for the space */
{
ut_ad(mutex_own(&ibuf_mutex));
return(data->free_list_len >= 3 + data->size / 2 + 3 * data->height);
return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height);
}
/*************************************************************************
......@@ -1709,12 +1889,10 @@ Allocates a new page from the ibuf file segment and adds it to the free
list. */
static
ulint
ibuf_add_free_page(
/*===============*/
ibuf_add_free_page(void)
/*====================*/
/* out: DB_SUCCESS, or DB_STRONG_FAIL
if no space left */
ulint space, /* in: space id */
ibuf_data_t* ibuf_data) /* in: ibuf data for the space */
{
mtr_t mtr;
page_t* header_page;
......@@ -1724,15 +1902,13 @@ ibuf_add_free_page(
page_t* root;
page_t* bitmap_page;
ut_a(space == 0);
mtr_start(&mtr);
/* Acquire the fsp latch before the ibuf header, obeying the latching
order */
mtr_x_lock(fil_space_get_latch(space, &zip_size), &mtr);
mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &zip_size), &mtr);
header_page = ibuf_header_page_get(space, &mtr);
header_page = ibuf_header_page_get(&mtr);
/* Allocate a new page: NOTE that if the page has been a part of a
non-clustered index which has subsequently been dropped, then the
......@@ -1744,9 +1920,10 @@ ibuf_add_free_page(
of a deadlock. This is the reason why we created a special ibuf
header page apart from the ibuf tree. */
page_no = fseg_alloc_free_page(header_page + IBUF_HEADER
+ IBUF_TREE_SEG_HEADER, 0, FSP_UP,
&mtr);
page_no = fseg_alloc_free_page(
header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
&mtr);
if (page_no == FIL_NULL) {
mtr_commit(&mtr);
......@@ -1754,11 +1931,15 @@ ibuf_add_free_page(
}
{
buf_block_t* block = buf_page_get(
space, 0, page_no, RW_X_LATCH, &mtr);
buf_block_t* block;
block = buf_page_get(
IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
#endif /* UNIV_SYNC_DEBUG */
page = buf_block_get_frame(block);
}
......@@ -1766,7 +1947,7 @@ ibuf_add_free_page(
mutex_enter(&ibuf_mutex);
root = ibuf_tree_root_get(ibuf_data, space, &mtr);
root = ibuf_tree_root_get(&mtr);
/* Add the page to the free list and update the ibuf size data */
......@@ -1776,16 +1957,18 @@ ibuf_add_free_page(
mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
MLOG_2BYTES, &mtr);
ibuf_data->seg_size++;
ibuf_data->free_list_len++;
ibuf->seg_size++;
ibuf->free_list_len++;
/* Set the bit indicating that this page is now an ibuf tree page
(level 2 page) */
bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
bitmap_page = ibuf_bitmap_get_map_page(
IBUF_SPACE_ID, page_no, zip_size, &mtr);
ibuf_bitmap_page_set_bits(
bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr);
ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_IBUF, TRUE, &mtr);
mtr_commit(&mtr);
mutex_exit(&ibuf_mutex);
......@@ -1799,10 +1982,8 @@ ibuf_add_free_page(
Removes a page from the free list and frees it to the fsp system. */
static
void
ibuf_remove_free_page(
/*==================*/
ulint space, /* in: space id */
ibuf_data_t* ibuf_data) /* in: ibuf data for the space */
ibuf_remove_free_page(void)
/*=======================*/
{
mtr_t mtr;
mtr_t mtr2;
......@@ -1813,15 +1994,13 @@ ibuf_remove_free_page(
page_t* root;
page_t* bitmap_page;
ut_a(space == 0);
mtr_start(&mtr);
/* Acquire the fsp latch before the ibuf header, obeying the latching
order */
mtr_x_lock(fil_space_get_latch(space, &zip_size), &mtr);
mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &zip_size), &mtr);
header_page = ibuf_header_page_get(space, &mtr);
header_page = ibuf_header_page_get(&mtr);
/* Prevent pessimistic inserts to insert buffer trees for a while */
mutex_enter(&ibuf_pessimistic_insert_mutex);
......@@ -1830,7 +2009,7 @@ ibuf_remove_free_page(
mutex_enter(&ibuf_mutex);
if (!ibuf_data_too_much_free(ibuf_data)) {
if (!ibuf_data_too_much_free()) {
mutex_exit(&ibuf_mutex);
......@@ -1845,11 +2024,10 @@ ibuf_remove_free_page(
mtr_start(&mtr2);
root = ibuf_tree_root_get(ibuf_data, space, &mtr2);
root = ibuf_tree_root_get(&mtr2);
page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
&mtr2)
.page;
&mtr2).page;
/* NOTE that we must release the latch on the ibuf tree root
because in fseg_free_page we access level 1 pages, and the root
......@@ -1867,26 +2045,31 @@ ibuf_remove_free_page(
page from it. */
fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
space, page_no, &mtr);
IBUF_SPACE_ID, page_no, &mtr);
#ifdef UNIV_DEBUG_FILE_ACCESSES
buf_page_reset_file_page_was_freed(space, page_no);
buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no);
#endif
ibuf_enter();
mutex_enter(&ibuf_mutex);
root = ibuf_tree_root_get(ibuf_data, space, &mtr);
root = ibuf_tree_root_get(&mtr);
ut_ad(page_no == flst_get_last(root + PAGE_HEADER
+ PAGE_BTR_IBUF_FREE_LIST, &mtr)
.page);
+ PAGE_BTR_IBUF_FREE_LIST, &mtr).page);
{
buf_block_t* block = buf_page_get(
space, 0, page_no, RW_X_LATCH, &mtr);
buf_block_t* block;
block = buf_page_get(
IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
page = buf_block_get_frame(block);
}
......@@ -1895,20 +2078,22 @@ ibuf_remove_free_page(
flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
ibuf_data->seg_size--;
ibuf_data->free_list_len--;
ibuf->seg_size--;
ibuf->free_list_len--;
mutex_exit(&ibuf_pessimistic_insert_mutex);
/* Set the bit indicating that this page is no more an ibuf tree page
(level 2 page) */
bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
bitmap_page = ibuf_bitmap_get_map_page(
IBUF_SPACE_ID, page_no, zip_size, &mtr);
ibuf_bitmap_page_set_bits(
bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_IBUF, FALSE, &mtr);
#ifdef UNIV_DEBUG_FILE_ACCESSES
buf_page_set_file_page_was_freed(space, page_no);
buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no);
#endif
mtr_commit(&mtr);
......@@ -1923,39 +2108,28 @@ thread calls fsp services to allocate a new file segment, or a new page to a
file segment, and the thread did not own the fsp latch before this call. */
UNIV_INTERN
void
ibuf_free_excess_pages(
/*===================*/
ulint space) /* in: compressed page size in bytes, or 0 */
ibuf_free_excess_pages(void)
/*=======================*/
{
ibuf_data_t* ibuf_data;
ulint i;
if (space != 0) {
fprintf(stderr,
"InnoDB: Error: calling ibuf_free_excess_pages"
" for space %lu\n", (ulong) space);
return;
}
#ifdef UNIV_SYNC_DEBUG
ut_ad(rw_lock_own(fil_space_get_latch(space, NULL), RW_LOCK_EX));
ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL),
RW_LOCK_EX));
#endif /* UNIV_SYNC_DEBUG */
ut_ad(rw_lock_get_x_lock_count(fil_space_get_latch(space, NULL)) == 1);
ut_ad(rw_lock_get_x_lock_count(
fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1);
ut_ad(!ibuf_inside());
/* NOTE: We require that the thread did not own the latch before,
because then we know that we can obey the correct latching order
for ibuf latches */
ibuf_data = fil_space_get_ibuf_data(space);
if (ibuf_data == NULL) {
/* Not yet initialized */
#if 0 /* defined UNIV_DEBUG */
fprintf(stderr,
"Ibuf for space %lu not yet initialized\n", space);
#endif
if (!ibuf) {
/* Not yet initialized; not sure if this is possible, but
does no harm to check for it. */
return;
}
......@@ -1967,7 +2141,7 @@ ibuf_free_excess_pages(
mutex_enter(&ibuf_mutex);
if (!ibuf_data_too_much_free(ibuf_data)) {
if (!ibuf_data_too_much_free()) {
mutex_exit(&ibuf_mutex);
......@@ -1976,7 +2150,7 @@ ibuf_free_excess_pages(
mutex_exit(&ibuf_mutex);
ibuf_remove_free_page(space, ibuf_data);
ibuf_remove_free_page();
}
}
......@@ -2051,14 +2225,13 @@ ibuf_get_merge_page_nos(
rec_space_id = ibuf_rec_get_space(rec);
if (rec_space_id != first_space_id
|| rec_page_no / IBUF_MERGE_AREA
!= first_page_no / IBUF_MERGE_AREA) {
|| (rec_page_no / IBUF_MERGE_AREA)
!= (first_page_no / IBUF_MERGE_AREA)) {
break;
}
} else if (rec_page_no != prev_page_no
|| rec_space_id != prev_space_id) {
if (rec_page_no != prev_page_no
|| rec_space_id != prev_space_id) {
n_pages++;
}
......@@ -2167,11 +2340,7 @@ ibuf_contract_ext(
issued read with the highest tablespace address
to complete */
{
ulint rnd_pos;
ibuf_data_t* data;
btr_pcur_t pcur;
ulint space;
ibool all_trees_empty;
ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED];
......@@ -2180,54 +2349,16 @@ ibuf_contract_ext(
mtr_t mtr;
*n_pages = 0;
loop:
ut_ad(!ibuf_inside());
mutex_enter(&ibuf_mutex);
ut_ad(ibuf_validate_low());
/* Choose an ibuf tree at random (though there really is only one tree
in the current implementation) */
ibuf_rnd += 865558671;
rnd_pos = ibuf_rnd % ibuf->size;
all_trees_empty = TRUE;
data = UT_LIST_GET_FIRST(ibuf->data_list);
for (;;) {
if (!data->empty) {
all_trees_empty = FALSE;
if (rnd_pos < data->size) {
break;
}
rnd_pos -= data->size;
}
data = UT_LIST_GET_NEXT(data_list, data);
if (data == NULL) {
if (all_trees_empty) {
mutex_exit(&ibuf_mutex);
return(0);
}
data = UT_LIST_GET_FIRST(ibuf->data_list);
}
}
mutex_enter(&ibuf_mutex);
ut_ad(data);
if (ibuf->empty) {
mutex_exit(&ibuf_mutex);
space = data->index->space;
return(0);
}
ut_a(space == 0); /* We currently only have an ibuf tree in
space 0 */
mtr_start(&mtr);
ibuf_enter();
......@@ -2235,13 +2366,16 @@ loop:
/* Open a cursor to a randomly chosen leaf of the tree, at a random
position within the leaf */
btr_pcur_open_at_rnd_pos(data->index, BTR_SEARCH_LEAF, &pcur, &mtr);
btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
if (0 == page_get_n_recs(btr_pcur_get_page(&pcur))) {
if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
/* When the ibuf tree is emptied completely, the last record
is removed using an optimistic delete and ibuf_size_update
is not called, causing ibuf->empty to remain FALSE. If we do
not reset it to TRUE here then database shutdown will hang
in the loop in ibuf_contract_for_n_pages. */
/* This tree is empty */
data->empty = TRUE;
ibuf->empty = TRUE;
ibuf_exit();
......@@ -2250,14 +2384,15 @@ loop:
mutex_exit(&ibuf_mutex);
goto loop;
return(0);
}
mutex_exit(&ibuf_mutex);
sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
space_ids, space_versions,
page_nos, &n_stored);
sum_sizes = ibuf_get_merge_page_nos(
TRUE, btr_pcur_get_rec(&pcur),
space_ids, space_versions, page_nos, &n_stored);
#if 0 /* defined UNIV_IBUF_DEBUG */
fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
sync, n_stored, sum_sizes);
......@@ -2441,13 +2576,18 @@ ibuf_get_volume_buffered(
}
{
buf_block_t* block = buf_page_get(
0, 0, prev_page_no, RW_X_LATCH, mtr);
buf_block_t* block;
block = buf_page_get(
IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
prev_page = buf_block_get_frame(block);
}
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_next(prev_page, mtr)
== page_get_page_no(page));
......@@ -2511,16 +2651,20 @@ count_later:
}
{
buf_block_t* block = buf_page_get(
0, 0, next_page_no, RW_X_LATCH, mtr);
buf_block_t* block;
block = buf_page_get(
IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, mtr);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
next_page = buf_block_get_frame(block);
}
#ifdef UNIV_BTR_DEBUG
ut_a(btr_page_get_prev(next_page, mtr)
== page_get_page_no(page));
ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page));
#endif /* UNIV_BTR_DEBUG */
rec = page_get_infimum_rec(next_page);
......@@ -2558,22 +2702,18 @@ ibuf_update_max_tablespace_id(void)
const rec_t* rec;
const byte* field;
ulint len;
ibuf_data_t* ibuf_data;
dict_index_t* ibuf_index;
btr_pcur_t pcur;
mtr_t mtr;
ibuf_data = fil_space_get_ibuf_data(0);
ibuf_index = ibuf_data->index;
ut_a(!dict_table_is_comp(ibuf_index->table));
ut_a(!dict_table_is_comp(ibuf->index->table));
ibuf_enter();
mtr_start(&mtr);
btr_pcur_open_at_index_side(FALSE, ibuf_index, BTR_SEARCH_LEAF,
&pcur, TRUE, &mtr);
btr_pcur_open_at_index_side(
FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
btr_pcur_move_to_prev(&pcur, &mtr);
if (btr_pcur_is_before_first_on_page(&pcur)) {
......@@ -2598,6 +2738,165 @@ ibuf_update_max_tablespace_id(void)
fil_set_max_space_id_if_bigger(max_space_id);
}
/********************************************************************
Helper function for ibuf_set_entry_counter. Computes the counter value a
new ibuf entry for (space, page_no) should carry, given an existing ibuf
record: if rec belongs to the same (space, page_no), the result is the
counter stored in rec plus one; otherwise it is 0. */
static
ulint
ibuf_set_entry_counter_low(
/*=======================*/
				/* out: new counter value */
	rec_t*	rec,		/* in: record */
	ulint	space,		/* in: space id */
	ulint	page_no)	/* in: page number */
{
	ulint	prev_counter;

	if (ibuf_rec_get_space(rec) != space
	    || ibuf_rec_get_page_no(rec) != page_no) {

		/* No entries in ibuf tree for (space, page_no). */
		return(0);
	}

	/* Only the counter is of interest; the other three output
	arguments are passed as NULL. */
	ibuf_rec_get_info(rec, NULL, NULL, NULL, &prev_counter);

	/* The stored counter must be strictly below 0xFFFF so that the
	incremented value is still at most 0xFFFF. */
	ut_a(prev_counter < 0xFFFF);

	return(prev_counter + 1);
}
/********************************************************************
Set the counter field in entry to the correct value based on the current
last record in ibuf for (space, page_no).

The counter is derived from the record the cursor search ended on (or from
the last record of the left sibling page when the cursor is on a page
infimum), and is then written into field 3 of the already built entry at
offset IBUF_REC_OFFSET_COUNTER. In the optimistic case the cursor may be
repositioned onto the left sibling page. */
static
ibool
ibuf_set_entry_counter(
/*===================*/
					/* out: FALSE if we should abort
					this insertion to ibuf */
	dtuple_t*	entry,		/* in: entry to patch */
	ulint		space,		/* in: space id of entry */
	ulint		page_no,	/* in: page number of entry */
	btr_pcur_t*	pcur,		/* in: pcur positioned on the record
					found by btr_pcur_open(.., entry,
					PAGE_CUR_LE, ..., pcur, ...) */
	ibool		is_optimistic,	/* in: is this an optimistic insert */
	mtr_t*		mtr)		/* in: mtr */
{
	ulint		counter;
	dfield_t*	field;
	void*		data;

	/* FIXME: if pcur (or the previous rec if we're on infimum) points
	to a record that has no counter field, return FALSE since we can't
	mix records with counters with records without counters. */

	/* pcur is expected to point to either a user rec or to a page's
	infimum record. */

	if (btr_pcur_is_on_user_rec(pcur)) {

		counter = ibuf_set_entry_counter_low(
			btr_pcur_get_rec(pcur), space, page_no);

	} else if (btr_pcur_is_before_first_in_tree(pcur, mtr)) {

		/* Ibuf tree is either completely empty, or the insert
		position is at the very first record of a non-empty tree. In
		either case we have no previous records for (space,
		page_no). */

		counter = 0;

	} else if (btr_pcur_is_before_first_on_page(pcur)) {
		btr_cur_t*	cursor = btr_pcur_get_btr_cur(pcur);

		if (cursor->low_match < 3) {
			/* If low_match < 3, we know that the father node
			pointer did not contain the searched for (space,
			page_no), which means that the search ended on the
			right page regardless of the counter value, and
			since we're at the infimum record, there are no
			existing records. */

			counter = 0;
		} else {
			rec_t*		rec;
			page_t*		page;
			buf_block_t*	block;
			page_t*		prev_page;
			ulint		prev_page_no;

			ut_a(cursor->ibuf_cnt != ULINT_UNDEFINED);

			page = btr_pcur_get_page(pcur);
			prev_page_no = btr_page_get_prev(page, mtr);

			ut_ad(prev_page_no != FIL_NULL);

			/* Latch the left sibling so that its last user
			record can be inspected. */
			block = buf_page_get(
				IBUF_SPACE_ID, 0, prev_page_no,
				RW_X_LATCH, mtr);

#ifdef UNIV_SYNC_DEBUG
			buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */

			prev_page = buf_block_get_frame(block);

			rec = page_rec_get_prev(
				page_get_supremum_rec(prev_page));

			ut_ad(page_rec_is_user_rec(rec));

			counter = ibuf_set_entry_counter_low(
				rec, space, page_no);

			if (counter < cursor->ibuf_cnt) {
				/* Search ended on the wrong page. */

				if (is_optimistic) {
					/* In an optimistic insert, we can
					shift the insert position to the left
					page, since it only needs an X-latch
					on the page itself, which the
					original search acquired for us. */

					btr_cur_position(
						ibuf->index, rec, block,
						btr_pcur_get_btr_cur(pcur));
				} else {
					/* We can't shift the insert
					position to the left page in a
					pessimistic insert since it would
					require an X-latch on the left
					page's left page, so we have to
					abort. */

					return(FALSE);
				}
			} else {
				/* The counter field in the father node is
				the same as we would insert; we don't know
				whether the insert should go to this page or
				the left page (the later fields can differ),
				so refuse the insert. */

				return(FALSE);
			}
		}
	} else {
		/* The cursor is neither on a user record nor on a page
		infimum, so no counter can be derived. Abort the buffering
		rather than patch the entry. The previous code left counter
		at an out-of-range sentinel (0xFFFF + 1) here and still
		handed it to the 2-byte store below.
		NOTE(review): presumably unreachable after a PAGE_CUR_LE
		search - confirm. */

		return(FALSE);
	}

	/* Patch counter value in already built entry. */
	field = dtuple_get_nth_field(entry, 3);
	data = dfield_get_data(field);

	mach_write_to_2((byte*) data + IBUF_REC_OFFSET_COUNTER, counter);

	return(TRUE);
}
/*************************************************************************
Makes an index insert to the insert buffer, instead of directly to the disk
page, if this is possible. */
......@@ -2607,6 +2906,7 @@ ibuf_insert_low(
/*============*/
/* out: DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */
ulint mode, /* in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
ibuf_op_t op, /* in: operation type */
const dtuple_t* entry, /* in: index entry to insert */
ulint entry_size,
/* in: rec_get_converted_size(index, entry) */
......@@ -2626,8 +2926,6 @@ ibuf_insert_low(
rec_t* ins_rec;
ibool old_bit_value;
page_t* bitmap_page;
ibuf_data_t* ibuf_data;
dict_index_t* ibuf_index;
page_t* root;
ulint err;
ibool do_merge;
......@@ -2642,18 +2940,12 @@ ibuf_insert_low(
ut_a(!dict_index_is_clust(index));
ut_ad(dtuple_check_typed(entry));
ut_ad(ut_is_2pow(zip_size));
ut_a(op < IBUF_OP_COUNT);
ut_a(trx_sys_multiple_tablespace_format);
do_merge = FALSE;
/* Currently the insert buffer of space 0 takes care of inserts to all
tablespaces */
ibuf_data = fil_space_get_ibuf_data(0);
ibuf_index = ibuf_data->index;
mutex_enter(&ibuf_mutex);
if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
......@@ -2680,7 +2972,7 @@ ibuf_insert_low(
mutex_enter(&ibuf_mutex);
while (!ibuf_data_enough_free_for_insert(ibuf_data)) {
while (!ibuf_data_enough_free_for_insert()) {
mutex_exit(&ibuf_mutex);
......@@ -2688,7 +2980,7 @@ ibuf_insert_low(
mutex_exit(&ibuf_pessimistic_insert_mutex);
err = ibuf_add_free_page(0, ibuf_data);
err = ibuf_add_free_page();
if (err == DB_STRONG_FAIL) {
......@@ -2707,11 +2999,16 @@ ibuf_insert_low(
heap = mem_heap_create(512);
/* Build the entry which contains the space id and the page number as
the first fields and the type information for other fields, and which
will be inserted to the insert buffer. */
/* Build the entry which contains the space id and the page number
as the first fields and the type information for other fields, and
which will be inserted to the insert buffer. Using a counter value
of 0xFFFF we find the last record for (space, page_no), from which
we can then read the counter value N and use N + 1 in the record we
insert. (We patch the ibuf_entry's counter field to the correct
value just before actually inserting the entry.) */
ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap);
ibuf_entry = ibuf_entry_build(
op, index, entry, space, page_no, 0xFFFF, heap);
/* Open a cursor to the insert buffer tree to calculate if we can add
the new entry to it without exceeding the free space limit for the
......@@ -2719,7 +3016,15 @@ ibuf_insert_low(
mtr_start(&mtr);
btr_pcur_open(ibuf_index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
/* Don't buffer deletes if the page has been read in to the buffer
pool. */
if (op == IBUF_OP_DELETE && buf_pool_watch_happened(space, page_no)) {
err = DB_STRONG_FAIL;
goto function_exit;
}
/* Find out the volume of already buffered inserts for the same index
page */
......@@ -2730,8 +3035,8 @@ ibuf_insert_low(
#endif
mtr_start(&bitmap_mtr);
bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
zip_size, &bitmap_mtr);
bitmap_page = ibuf_bitmap_get_map_page(
space, page_no, zip_size, &bitmap_mtr);
/* We check if the index page is suitable for buffered entries */
......@@ -2744,21 +3049,35 @@ ibuf_insert_low(
goto function_exit;
}
bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_FREE, &bitmap_mtr);
bits = ibuf_bitmap_page_get_bits(
bitmap_page, page_no, zip_size, IBUF_BITMAP_FREE, &bitmap_mtr);
if (buffered + entry_size + page_dir_calc_reserved_space(1)
> ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
mtr_commit(&bitmap_mtr);
/* It may not fit */
err = DB_STRONG_FAIL;
mtr_commit(&bitmap_mtr);
do_merge = TRUE;
ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur),
space_ids, space_versions,
page_nos, &n_stored);
ibuf_get_merge_page_nos(
FALSE, btr_pcur_get_rec(&pcur),
space_ids, space_versions, page_nos, &n_stored);
goto function_exit;
}
/* Patch correct counter value to the entry to insert. This can
change the insert position, which can result in the need to abort in
some cases. */
if (!ibuf_set_entry_counter(ibuf_entry, space, page_no, &pcur,
mode == BTR_MODIFY_PREV, &mtr)) {
err = DB_STRONG_FAIL;
mtr_commit(&bitmap_mtr);
goto function_exit;
}
......@@ -2768,6 +3087,7 @@ ibuf_insert_low(
old_bit_value = ibuf_bitmap_page_get_bits(
bitmap_page, page_no, zip_size,
IBUF_BITMAP_BUFFERED, &bitmap_mtr);
if (!old_bit_value) {
ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_BUFFERED, TRUE,
......@@ -2795,7 +3115,7 @@ ibuf_insert_low(
which would cause the x-latching of the root after that to
break the latching order. */
root = ibuf_tree_root_get(ibuf_data, 0, &mtr);
root = ibuf_tree_root_get(&mtr);
err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
| BTR_NO_UNDO_LOG_FLAG,
......@@ -2808,7 +3128,7 @@ ibuf_insert_low(
thr_get_trx(thr)->id);
}
ibuf_data_sizes_update(ibuf_data, root, &mtr);
ibuf_size_update(root, &mtr);
}
function_exit:
......@@ -2824,7 +3144,6 @@ function_exit:
}
#endif
if (mode == BTR_MODIFY_TREE) {
ut_ad(ibuf_validate_low());
mutex_exit(&ibuf_mutex);
mutex_exit(&ibuf_pessimistic_insert_mutex);
......@@ -2839,8 +3158,7 @@ function_exit:
mutex_enter(&ibuf_mutex);
if (err == DB_SUCCESS) {
ibuf_data->empty = FALSE;
ibuf_data->n_inserts++;
ibuf->empty = FALSE;
}
mutex_exit(&ibuf_mutex);
......@@ -2861,14 +3179,15 @@ function_exit:
}
/*************************************************************************
Makes an index insert to the insert buffer, instead of directly to the disk
page, if this is possible. Does not do insert if the index is clustered
or unique. */
Buffer an operation in the insert/delete buffer, instead of doing it
directly to the disk page, if this is possible. Does not do it if the index
is clustered or unique. */
UNIV_INTERN
ibool
ibuf_insert(
/*========*/
/* out: TRUE if success */
ibuf_op_t op, /* in: operation type */
const dtuple_t* entry, /* in: index entry to insert */
dict_index_t* index, /* in: index where to insert */
ulint space, /* in: space id where to insert */
......@@ -2878,25 +3197,26 @@ ibuf_insert(
{
ulint err;
ulint entry_size;
ibool comp = dict_table_is_comp(index->table);
ut_a(trx_sys_multiple_tablespace_format);
ut_ad(dtuple_check_typed(entry));
ut_ad(ut_is_2pow(zip_size));
ut_a(op < IBUF_OP_COUNT);
ut_a(!dict_index_is_clust(index));
entry_size = rec_get_converted_size(index, entry, 0);
if (entry_size
>= (page_get_free_space_of_empty(dict_table_is_comp(index->table))
/ 2)) {
if (entry_size >= (page_get_free_space_of_empty(comp) / 2)) {
return(FALSE);
}
err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size,
err = ibuf_insert_low(BTR_MODIFY_PREV, op, entry, entry_size,
index, space, zip_size, page_no, thr);
if (err == DB_FAIL) {
err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size,
err = ibuf_insert_low(BTR_MODIFY_TREE, op, entry, entry_size,
index, space, zip_size, page_no, thr);
}
......@@ -2970,8 +3290,8 @@ dump:
return;
}
low_match = page_cur_search(block, index, entry,
PAGE_CUR_LE, &page_cur);
low_match = page_cur_search(
block, index, entry, PAGE_CUR_LE, &page_cur);
if (low_match == dtuple_get_n_fields(entry)) {
buf_block_t* block;
......@@ -2981,7 +3301,7 @@ dump:
block = page_cur_get_block(&page_cur);
page_zip = buf_block_get_page_zip(block);
btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr);
btr_cur_set_deleted_flag_for_ibuf(rec, page_zip, FALSE, mtr);
} else {
rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr);
......@@ -3043,6 +3363,100 @@ dump:
}
}
/********************************************************************
Merge-time handler for a buffered delete-mark operation on a secondary
index: searches the index page in block for entry and, if the match count
returned by the search equals the number of fields in entry, sets the
delete mark on the record found. Otherwise does nothing. */
static
void
ibuf_set_del_mark(
/*==============*/
	dtuple_t*	entry,	/* in: entry */
	buf_block_t*	block,	/* in: block */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr */
{
	page_cur_t	cur;
	ulint		n_matched;
	rec_t*		found_rec;
	page_zip_des_t*	zip;

	ut_ad(ibuf_inside());
	ut_ad(dtuple_check_typed(entry));

	n_matched = page_cur_search(block, index, entry, PAGE_CUR_LE, &cur);

	if (n_matched != dtuple_get_n_fields(entry)) {
		/* No record matching every field of entry was found.
		This can happen benignly in some situations. */

		return;
	}

	found_rec = page_cur_get_rec(&cur);

	/* The compressed page descriptor is taken from the block the page
	cursor reports. */
	zip = buf_block_get_page_zip(page_cur_get_block(&cur));

	btr_cur_set_deleted_flag_for_ibuf(found_rec, zip, TRUE, mtr);
}
/********************************************************************
Merge-time handler for a buffered delete operation on a secondary index:
searches the index page in block for entry and, if the match count returned
by the search equals the number of fields in entry, removes the record found
from the page and updates the insert buffer free bits for the page.
Otherwise does nothing. */
static
void
ibuf_delete(
/*========*/
	dtuple_t*	entry,	/* in: entry */
	buf_block_t*	block,	/* in: block */
	dict_index_t*	index,	/* in: record descriptor */
	mtr_t*		mtr)	/* in: mtr */
{
	page_cur_t	cur;
	ulint		n_matched;
	rec_t*		victim;
	page_t*		frame;
	ulint		max_ins_size;
	mem_heap_t*	heap = NULL;
	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
	ulint*		offsets = offsets_;

	ut_ad(ibuf_inside());
	ut_ad(dtuple_check_typed(entry));

	n_matched = page_cur_search(block, index, entry, PAGE_CUR_LE, &cur);

	if (n_matched != dtuple_get_n_fields(entry)) {
		/* This can happen benignly in some situations: either when
		we crashed at just the right time, or on database startup
		when we redo some old log entries (due to worse stored
		position granularity on disk than in memory). */

		return;
	}

	victim = page_cur_get_rec(&cur);

	/* TODO: the below should probably be a separate function,
	it's a bastardized version of btr_cur_optimistic_delete. */

	rec_offs_init(offsets_);

	/* heap is only set if rec_get_offsets() had to allocate; it is
	released at the end of the function in that case. */
	offsets = rec_get_offsets(
		victim, index, offsets, ULINT_UNDEFINED, &heap);

	lock_update_delete(block, victim);

	frame = buf_block_get_frame(block);

	/* The size is sampled before the record is removed and is what
	gets passed to the free bits update afterwards. */
	max_ins_size = page_get_max_insert_size_after_reorganize(frame, 1);

	page_cur_delete_rec(&cur, index, offsets, mtr);

	ibuf_update_free_bits_low(block, max_ins_size, mtr);

	if (UNIV_LIKELY_NULL(heap)) {
		mem_heap_free(heap);
	}
}
/*************************************************************************
Deletes from ibuf the record on which pcur is positioned. If we have to
resort to a pessimistic delete, this function commits mtr and closes
......@@ -3063,7 +3477,6 @@ ibuf_delete_rec(
mtr_t* mtr) /* in: mtr */
{
ibool success;
ibuf_data_t* ibuf_data;
page_t* root;
ulint err;
......@@ -3088,11 +3501,6 @@ ibuf_delete_rec(
btr_pcur_commit_specify_mtr(pcur, mtr);
/* Currently the insert buffer of space 0 takes care of inserts to all
tablespaces */
ibuf_data = fil_space_get_ibuf_data(0);
mutex_enter(&ibuf_mutex);
mtr_start(mtr);
......@@ -3119,7 +3527,7 @@ ibuf_delete_rec(
btr_pcur_commit_specify_mtr(pcur, mtr);
fputs("InnoDB: Validating insert buffer tree:\n", stderr);
if (!btr_validate_index(ibuf_data->index, NULL)) {
if (!btr_validate_index(ibuf->index, NULL)) {
ut_error;
}
......@@ -3133,7 +3541,7 @@ ibuf_delete_rec(
return(TRUE);
}
root = ibuf_tree_root_get(ibuf_data, 0, mtr);
root = ibuf_tree_root_get(mtr);
btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur),
FALSE, mtr);
......@@ -3144,9 +3552,7 @@ ibuf_delete_rec(
#else
UT_NOT_USED(space);
#endif
ibuf_data_sizes_update(ibuf_data, root, mtr);
ut_ad(ibuf_validate_low());
ibuf_size_update(root, mtr);
btr_pcur_commit_specify_mtr(pcur, mtr);
......@@ -3159,11 +3565,11 @@ ibuf_delete_rec(
/*************************************************************************
When an index page is read from a disk to the buffer pool, this function
inserts to the page the possible index entries buffered in the insert buffer.
The entries are deleted from the insert buffer. If the page is not read, but
created in the buffer pool, this function deletes its buffered entries from
the insert buffer; there can exist entries for such a page if the page
belonged to an index which subsequently was dropped. */
applies any buffered operations to the page and deletes the entries from the
insert buffer. If the page is not read, but created in the buffer pool, this
function deletes its buffered entries from the insert buffer; there can
exist entries for such a page if the page belonged to an index which
subsequently was dropped. */
UNIV_INTERN
void
ibuf_merge_or_delete_for_page(
......@@ -3183,12 +3589,7 @@ ibuf_merge_or_delete_for_page(
{
mem_heap_t* heap;
btr_pcur_t pcur;
dtuple_t* entry;
dtuple_t* search_tuple;
rec_t* ibuf_rec;
page_t* bitmap_page;
ibuf_data_t* ibuf_data;
ulint n_inserts;
#ifdef UNIV_IBUF_DEBUG
ulint volume;
#endif
......@@ -3197,6 +3598,10 @@ ibuf_merge_or_delete_for_page(
ibool corruption_noticed = FALSE;
mtr_t mtr;
/* Counts for merged & discarded operations. */
ulint mops[IBUF_OP_COUNT];
ulint dops[IBUF_OP_COUNT];
ut_ad(!block || buf_block_get_space(block) == space);
ut_ad(!block || buf_block_get_page_no(block) == page_no);
ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
......@@ -3204,24 +3609,24 @@ ibuf_merge_or_delete_for_page(
if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
return;
}
} else if (trx_sys_hdr_page(space, page_no)) {
if (trx_sys_hdr_page(space, page_no)) {
return;
}
} else if (ibuf_fixed_addr_page(space, 0, page_no)
|| fsp_descr_page(0, page_no)) {
/* The following assumes that the uncompressed page size
is a power-of-2 multiple of zip_size. */
if (ibuf_fixed_addr_page(space, 0, page_no)
|| fsp_descr_page(0, page_no)) {
/* This assumes that the uncompressed page size
is a power-of-2 multiple of zip_size. */
return;
}
if (UNIV_LIKELY(update_ibuf_bitmap)) {
ut_a(ut_is_2pow(zip_size));
if (ibuf_fixed_addr_page(space, zip_size, page_no)
|| fsp_descr_page(zip_size, page_no)) {
return;
}
......@@ -3239,9 +3644,12 @@ ibuf_merge_or_delete_for_page(
block = NULL;
update_ibuf_bitmap = FALSE;
} else {
page_t* bitmap_page;
mtr_start(&mtr);
bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
zip_size, &mtr);
bitmap_page = ibuf_bitmap_get_map_page(
space, page_no, zip_size, &mtr);
if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no,
zip_size,
......@@ -3258,17 +3666,12 @@ ibuf_merge_or_delete_for_page(
}
mtr_commit(&mtr);
}
} else if (block) {
if (ibuf_fixed_addr_page(space, zip_size, page_no)
|| fsp_descr_page(zip_size, page_no)) {
return;
}
}
} else if (block
&& (ibuf_fixed_addr_page(space, zip_size, page_no)
|| fsp_descr_page(zip_size, page_no))) {
/* Currently the insert buffer of space 0 takes care of inserts to all
tablespaces */
ibuf_data = fil_space_get_ibuf_data(0);
return;
}
ibuf_enter();
......@@ -3294,6 +3697,8 @@ ibuf_merge_or_delete_for_page(
if (UNIV_UNLIKELY(fil_page_get_type(block->frame)
!= FIL_PAGE_INDEX)) {
page_t* bitmap_page;
corruption_noticed = TRUE;
ut_print_timestamp(stderr);
......@@ -3334,7 +3739,9 @@ ibuf_merge_or_delete_for_page(
}
}
n_inserts = 0;
memset(mops, 0, sizeof(mops));
memset(dops, 0, sizeof(dops));
#ifdef UNIV_IBUF_DEBUG
volume = 0;
#endif
......@@ -3342,11 +3749,14 @@ loop:
mtr_start(&mtr);
if (block) {
ibool success = buf_page_get_known_nowait(RW_X_LATCH, block,
BUF_KEEP_OLD,
__FILE__, __LINE__,
&mtr);
ibool success;
success = buf_page_get_known_nowait(
RW_X_LATCH, block,
BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);
ut_a(success);
#ifdef UNIV_SYNC_DEBUG
buf_block_dbg_add_level(block, SYNC_TREE_NODE);
#endif /* UNIV_SYNC_DEBUG */
......@@ -3354,8 +3764,10 @@ loop:
/* Position pcur in the insert buffer at the first entry for this
index page */
btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE,
BTR_MODIFY_LEAF, &pcur, &mtr);
btr_pcur_open_on_user_rec(
ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
&pcur, &mtr);
if (!btr_pcur_is_on_user_rec(&pcur)) {
ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
......@@ -3363,50 +3775,82 @@ loop:
}
for (;;) {
rec_t* rec;
ut_ad(btr_pcur_is_on_user_rec(&pcur));
ibuf_rec = btr_pcur_get_rec(&pcur);
rec = btr_pcur_get_rec(&pcur);
/* Check if the entry is for this index page */
if (ibuf_rec_get_page_no(ibuf_rec) != page_no
|| ibuf_rec_get_space(ibuf_rec) != space) {
if (ibuf_rec_get_page_no(rec) != page_no
|| ibuf_rec_get_space(rec) != space) {
if (block) {
page_header_reset_last_insert(
block->frame, page_zip, &mtr);
}
goto reset_bit;
}
if (UNIV_UNLIKELY(corruption_noticed)) {
fputs("InnoDB: Discarding record\n ", stderr);
rec_print_old(stderr, ibuf_rec);
rec_print_old(stderr, rec);
fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
} else if (block) {
/* Now we have at pcur a record which should be
inserted to the index page; NOTE that the call below
copies pointers to fields in ibuf_rec, and we must
keep the latch to the ibuf_rec page until the
copies pointers to fields in rec, and we must
keep the latch to the rec page until the
insertion is finished! */
dtuple_t* entry;
dulint max_trx_id;
dict_index_t* dummy_index;
dulint max_trx_id = page_get_max_trx_id(
page_align(ibuf_rec));
ibuf_op_t op = ibuf_rec_get_op_type(rec);
max_trx_id = page_get_max_trx_id(page_align(rec));
page_update_max_trx_id(block, page_zip, max_trx_id);
entry = ibuf_build_entry_from_ibuf_rec(
ibuf_rec, heap, &dummy_index);
rec, heap, &dummy_index);
#ifdef UNIV_IBUF_DEBUG
volume += rec_get_converted_size(dummy_index, entry, 0)
+ page_dir_calc_reserved_space(1);
ut_a(volume <= 4 * UNIV_PAGE_SIZE
/ IBUF_PAGE_SIZE_PER_FREE_SPACE);
if (op == IBUF_OP_INSERT) {
volume += rec_get_converted_size(
dummy_index, entry, 0);
volume += page_dir_calc_reserved_space(1);
ut_a(volume <= 4 * UNIV_PAGE_SIZE
/ IBUF_PAGE_SIZE_PER_FREE_SPACE);
}
#endif
ibuf_insert_to_index_page(entry, block,
dummy_index, &mtr);
switch (op) {
case IBUF_OP_INSERT:
ibuf_insert_to_index_page(
entry, block, dummy_index, &mtr);
break;
case IBUF_OP_DELETE_MARK:
ibuf_set_del_mark(
entry, block, dummy_index, &mtr);
break;
case IBUF_OP_DELETE:
ibuf_delete(entry, block, dummy_index, &mtr);
break;
default:
ut_error;
}
mops[op]++;
ibuf_dummy_index_free(dummy_index);
} else {
dops[ibuf_rec_get_op_type(rec)]++;
}
n_inserts++;
/* Delete the record from ibuf */
if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
&mtr)) {
......@@ -3414,9 +3858,7 @@ loop:
we start from the beginning again */
goto loop;
}
if (btr_pcur_is_after_last_on_page(&pcur)) {
} else if (btr_pcur_is_after_last_on_page(&pcur)) {
mtr_commit(&mtr);
btr_pcur_close(&pcur);
......@@ -3425,43 +3867,32 @@ loop:
}
reset_bit:
#ifdef UNIV_IBUF_COUNT_DEBUG
if (ibuf_count_get(space, page_no) > 0) {
/* btr_print_tree(ibuf_data->index->tree, 100);
ibuf_print(); */
}
#endif
if (UNIV_LIKELY(update_ibuf_bitmap)) {
bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
zip_size, &mtr);
ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
IBUF_BITMAP_BUFFERED, FALSE, &mtr);
page_t* bitmap_page;
bitmap_page = ibuf_bitmap_get_map_page(
space, page_no, zip_size, &mtr);
ibuf_bitmap_page_set_bits(
bitmap_page, page_no, zip_size,
IBUF_BITMAP_BUFFERED, FALSE, &mtr);
if (block) {
ulint old_bits = ibuf_bitmap_page_get_bits(
bitmap_page, page_no, zip_size,
IBUF_BITMAP_FREE, &mtr);
ulint new_bits = ibuf_index_page_calc_free(
zip_size, block);
#if 0 /* defined UNIV_IBUF_DEBUG */
fprintf(stderr, "Old bits %lu new bits %lu"
" max size %lu\n",
old_bits, new_bits,
page_get_max_insert_size_after_reorganize(
page, 1));
#endif
if (old_bits != new_bits) {
ibuf_bitmap_page_set_bits(bitmap_page, page_no,
zip_size,
IBUF_BITMAP_FREE,
new_bits, &mtr);
ibuf_bitmap_page_set_bits(
bitmap_page, page_no, zip_size,
IBUF_BITMAP_FREE, new_bits, &mtr);
}
}
}
#if 0 /* defined UNIV_IBUF_DEBUG */
fprintf(stderr,
"Ibuf merge %lu records volume %lu to page no %lu\n",
n_inserts, volume, page_no);
#endif
mtr_commit(&mtr);
btr_pcur_close(&pcur);
mem_heap_free(heap);
......@@ -3469,8 +3900,9 @@ reset_bit:
/* Protect our statistics keeping from race conditions */
mutex_enter(&ibuf_mutex);
ibuf_data->n_merges++;
ibuf_data->n_merged_recs += n_inserts;
ibuf->n_merges++;
ibuf_add_ops(ibuf->n_merged_ops, mops);
ibuf_add_ops(ibuf->n_discarded_ops, dops);
mutex_exit(&ibuf_mutex);
......@@ -3480,6 +3912,7 @@ reset_bit:
}
ibuf_exit();
#ifdef UNIV_IBUF_COUNT_DEBUG
ut_a(ibuf_count_get(space, page_no) == 0);
#endif
......@@ -3502,14 +3935,10 @@ ibuf_delete_for_discarded_space(
rec_t* ibuf_rec;
ulint page_no;
ibool closed;
ibuf_data_t* ibuf_data;
ulint n_inserts;
mtr_t mtr;
/* Currently the insert buffer of space 0 takes care of inserts to all
tablespaces */
ibuf_data = fil_space_get_ibuf_data(0);
/* Counts for discarded operations. */
ulint dops[IBUF_OP_COUNT];
heap = mem_heap_create(512);
......@@ -3518,7 +3947,7 @@ ibuf_delete_for_discarded_space(
search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
n_inserts = 0;
memset(dops, 0, sizeof(dops));
loop:
ibuf_enter();
......@@ -3526,8 +3955,10 @@ loop:
/* Position pcur in the insert buffer at the first entry for the
space */
btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE,
BTR_MODIFY_LEAF, &pcur, &mtr);
btr_pcur_open_on_user_rec(
ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
&pcur, &mtr);
if (!btr_pcur_is_on_user_rec(&pcur)) {
ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
......@@ -3547,7 +3978,7 @@ loop:
page_no = ibuf_rec_get_page_no(ibuf_rec);
n_inserts++;
dops[ibuf_rec_get_op_type(ibuf_rec)]++;
/* Delete the record from ibuf */
closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple,
......@@ -3577,51 +4008,14 @@ leave_loop:
/* Protect our statistics keeping from race conditions */
mutex_enter(&ibuf_mutex);
ibuf_data->n_merges++;
ibuf_data->n_merged_recs += n_inserts;
ibuf_add_ops(ibuf->n_discarded_ops, dops);
mutex_exit(&ibuf_mutex);
/*
fprintf(stderr,
"InnoDB: Discarded %lu ibuf entries for space %lu\n",
(ulong) n_inserts, (ulong) space);
*/
ibuf_exit();
mem_heap_free(heap);
}
#ifdef UNIV_DEBUG
/**********************************************************************
Validates the ibuf data structures when the caller owns ibuf_mutex: the
sizes of all entries on ibuf->data_list must add up to ibuf->size. */
static
ibool
ibuf_validate_low(void)
/*===================*/
			/* out: TRUE if ok */
{
	ibuf_data_t*	node;
	ulint		total = 0;

	ut_ad(mutex_own(&ibuf_mutex));

	/* Walk the whole list, accumulating the per-entry sizes. */
	for (node = UT_LIST_GET_FIRST(ibuf->data_list);
	     node;
	     node = UT_LIST_GET_NEXT(data_list, node)) {

		total += node->size;
	}

	/* A mismatch is an assertion failure, so a return from this
	function always carries TRUE. */
	ut_a(total == ibuf->size);

	return(TRUE);
}
#endif /* UNIV_DEBUG */
/**********************************************************************
Looks if the insert buffer is empty. */
UNIV_INTERN
......@@ -3630,7 +4024,6 @@ ibuf_is_empty(void)
/*===============*/
/* out: TRUE if empty */
{
ibuf_data_t* data;
ibool is_empty;
const page_t* root;
mtr_t mtr;
......@@ -3639,17 +4032,15 @@ ibuf_is_empty(void)
mutex_enter(&ibuf_mutex);
data = UT_LIST_GET_FIRST(ibuf->data_list);
mtr_start(&mtr);
root = ibuf_tree_root_get(data, 0, &mtr);
root = ibuf_tree_root_get(&mtr);
if (page_get_n_recs(root) == 0) {
is_empty = TRUE;
if (data->empty == FALSE) {
if (ibuf->empty == FALSE) {
fprintf(stderr,
"InnoDB: Warning: insert buffer tree is empty"
" but the data struct does not\n"
......@@ -3658,15 +4049,13 @@ ibuf_is_empty(void)
"InnoDB: run to completion.\n");
}
} else {
ut_a(data->empty == FALSE);
ut_a(ibuf->empty == FALSE);
is_empty = FALSE;
}
mtr_commit(&mtr);
ut_a(data->space == 0);
mutex_exit(&ibuf_mutex);
ibuf_exit();
......@@ -3682,39 +4071,42 @@ ibuf_print(
/*=======*/
FILE* file) /* in: file where to print */
{
ibuf_data_t* data;
#ifdef UNIV_IBUF_COUNT_DEBUG
ulint i;
#endif
mutex_enter(&ibuf_mutex);
data = UT_LIST_GET_FIRST(ibuf->data_list);
while (data) {
fprintf(file,
"Ibuf: size %lu, free list len %lu, seg size %lu,\n"
"%lu inserts, %lu merged recs, %lu merges\n",
(ulong) data->size,
(ulong) data->free_list_len,
(ulong) data->seg_size,
(ulong) data->n_inserts,
(ulong) data->n_merged_recs,
(ulong) data->n_merges);
fprintf(file,
"Ibuf: size %lu, free list len %lu, seg size %lu, %lu merges\n"
"total operations:\n ",
(ulong) ibuf->size,
(ulong) ibuf->free_list_len,
(ulong) ibuf->seg_size,
(ulong) ibuf->n_merges);
ibuf_print_ops(ibuf->n_ops, file);
fprintf(file, "\nmerged operations:\n ");
ibuf_print_ops(ibuf->n_merged_ops, file);
fprintf(file, "\ndiscarded operations:\n ");
ibuf_print_ops(ibuf->n_discarded_ops, file);
fputs("\n", file);
#ifdef UNIV_IBUF_COUNT_DEBUG
for (i = 0; i < IBUF_COUNT_N_PAGES; i++) {
if (ibuf_count_get(data->space, i) > 0) {
for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
ulint count = ibuf_count_get(i, j);
if (count > 0) {
fprintf(stderr,
"Ibuf count for page %lu is %lu\n",
(ulong) i,
(ulong)
ibuf_count_get(data->space, i));
"Ibuf count for space/page %lu/%lu"
" is %lu\n",
(ulong) i, (ulong) j, (ulong) count);
}
}
#endif
data = UT_LIST_GET_NEXT(data_list, data);
}
#endif /* UNIV_IBUF_COUNT_DEBUG */
mutex_exit(&ibuf_mutex);
}
......@@ -42,6 +42,8 @@ failure. */
#define BTR_SEARCH_PREV 35
#define BTR_MODIFY_PREV 36
/* BTR_INSERT, BTR_DELETE and BTR_DELETE_MARK are mutually exclusive. */
/* If this is ORed to the latch mode, it means that the search tuple will be
inserted to the index, at the searched position */
#define BTR_INSERT 512
......@@ -55,6 +57,19 @@ UNIQUE definition on secondary indexes when we decide if we can use the
insert buffer to speed up inserts */
#define BTR_IGNORE_SEC_UNIQUE 2048
/* Try to delete mark the record at the searched position using the
insert/delete buffer. */
#define BTR_DELETE_MARK 4096
/* Try to delete the record at the searched position using the insert/delete
buffer. */
#define BTR_DELETE 8192
/* If the leaf page is not in the buffer pool: don't read it in, set
cursor->leaf_in_buf_pool to FALSE, and set buf_pool_t::watch_* that
watches for the page to get read in. */
#define BTR_WATCH_LEAF 16384
/******************************************************************
Gets the root node of a tree and x-latches it. */
UNIV_INTERN
......
......@@ -118,7 +118,7 @@ btr_page_get_level(
/*===============*/
/* out: level, leaf level == 0 */
const page_t* page, /* in: index page */
mtr_t* mtr __attribute__((unused)))
mtr_t* mtr UNIV_UNUSED)
/* in: mini-transaction handle */
{
ut_ad(page && mtr);
......@@ -160,7 +160,7 @@ btr_page_get_next(
/*==============*/
/* out: next page number */
const page_t* page, /* in: index page */
mtr_t* mtr __attribute__((unused)))
mtr_t* mtr UNIV_UNUSED)
/* in: mini-transaction handle */
{
ut_ad(page && mtr);
......@@ -200,7 +200,7 @@ btr_page_get_prev(
/*==============*/
/* out: prev page number */
const page_t* page, /* in: index page */
mtr_t* mtr __attribute__((unused))) /* in: mini-transaction handle */
mtr_t* mtr UNIV_UNUSED) /* in: mini-transaction handle */
{
ut_ad(page && mtr);
......
......@@ -312,8 +312,8 @@ btr_cur_del_mark_set_sec_rec(
que_thr_t* thr, /* in: query thread */
mtr_t* mtr); /* in: mtr */
/***************************************************************
Sets a secondary index record delete mark to FALSE. This function is
only used by the insert buffer insert merge mechanism. */
Sets a secondary index record delete mark to the given value. This
function is only used by the insert buffer insert merge mechanism. */
UNIV_INTERN
void
btr_cur_del_unmark_for_ibuf(
......@@ -323,6 +323,7 @@ btr_cur_del_unmark_for_ibuf(
corresponding to rec, or NULL
when the tablespace is
uncompressed */
ibool val, /* value to set */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
Tries to compress a page of the tree if it seems useful. It is assumed
......@@ -572,7 +573,20 @@ btr_push_update_extern_fields(
const upd_t* update, /* in: update vector */
mem_heap_t* heap) /* in: memory heap */
__attribute__((nonnull));
/***************************************************************
Sets a secondary index record's delete mark to the given value. This
function is only used by the insert buffer merge mechanism. */
void
btr_cur_set_deleted_flag_for_ibuf(
/*==============================*/
rec_t* rec, /* in: record */
page_zip_des_t* page_zip, /* in/out: compressed page
corresponding to rec, or NULL
when the tablespace is
uncompressed */
ibool val, /* in: value to set */
mtr_t* mtr); /* in: mtr */
/*######################################################################*/
/* In the pessimistic delete, if the page data size drops below this
......@@ -657,6 +671,28 @@ struct btr_cur_struct {
NULL */
ulint fold; /* fold value used in the search if
flag is BTR_CUR_HASH */
/*----- Delete buffering -------*/
ulint ibuf_cnt; /* in searches done on insert buffer
trees, this contains the "counter"
value (the first two bytes of the
fourth field) extracted from the
page above the leaf page, from the
father node pointer that pointed to
the leaf page. In other words, it
contains the minimum counter value
for records to be inserted on the
chosen leaf page. If for some reason
this can't be read, or if the search
ended on the leftmost leaf page in
the tree (in which case the father
node pointer had the 'minimum
record' flag set), this is
ULINT_UNDEFINED. */
ibool leaf_in_buf_pool;
/* in: in searches done with
BTR_WATCH_LEAF, this is TRUE if the
leaf page is in the buffer pool,
FALSE otherwise. */
/*------------------------------*/
btr_path_t* path_arr; /* in estimating the number of
rows in range, we store in this array
......@@ -675,6 +711,13 @@ struct btr_cur_struct {
#define BTR_CUR_BINARY 3 /* success using the binary search */
#define BTR_CUR_INSERT_TO_IBUF 4 /* performed the intended insert to
the insert buffer */
#define BTR_CUR_DEL_MARK_IBUF 5 /* performed the intended delete
mark in the insert/delete buffer */
#define BTR_CUR_DELETE_IBUF 6 /* performed the intended delete in
the insert/delete buffer */
#define BTR_CUR_ABORTED 7 /* search with BTR_WATCH_LEAF
aborted due to leaf page not being
in buffer pool */
/* If pessimistic delete fails because of lack of file space,
there is still a good chance of success a little later: try this many times,
......
......@@ -79,6 +79,16 @@ btr_pcur_open(
btr_pcur_t* cursor, /* in: memory buffer for persistent cursor */
mtr_t* mtr); /* in: mtr */
/******************************************************************
Check if an operation was buffered. */
UNIV_INLINE
ibool
btr_pcur_was_buffered(
/*==================*/
/* out: TRUE if the operation was buffered
in the insert/delete buffer */
const btr_pcur_t* cursor);
/* in: persistent cursor */
/******************************************************************
Opens a persistent cursor to an index tree without initializing the
cursor. */
UNIV_INLINE
......
......@@ -506,6 +506,28 @@ btr_pcur_open(
cursor->trx_if_known = NULL;
}
/******************************************************************
Reports whether the operation requested through a persistent cursor ended
up in the insert/delete buffer, as recorded in the flag of the underlying
tree cursor. */
UNIV_INLINE
ibool
btr_pcur_was_buffered(
/*==================*/
				/* out: TRUE if the operation was buffered
				in the insert/delete buffer */
	const btr_pcur_t*	cursor)
				/* in: persistent cursor */
{
	/* The outcome is stored in the tree cursor of the persistent
	cursor */
	const btr_cur_t*	tree_cur = btr_pcur_get_btr_cur(cursor);

	/* Any of the three "done in the buffer" outcomes counts: buffered
	insert, buffered delete-mark or buffered delete */
	return(tree_cur->flag == BTR_CUR_INSERT_TO_IBUF
	       || tree_cur->flag == BTR_CUR_DEL_MARK_IBUF
	       || tree_cur->flag == BTR_CUR_DELETE_IBUF);
}
/******************************************************************
Opens a persistent cursor to an index tree without initializing the
cursor. */
......
......@@ -43,6 +43,10 @@ Created 11/5/1995 Heikki Tuuri
it is error-prone programming not to
set a latch, and it should be used
with care */
#define BUF_GET_IF_IN_POOL_OR_WATCH 15
/* Get the page only if it's in the
buffer pool, if not then set a watch
on the page. */
/* Modes for buf_page_get_known_nowait */
#define BUF_MAKE_YOUNG 51
#define BUF_KEEP_OLD 52
......@@ -165,20 +169,22 @@ read the contents of the page unless you know it is safe. Do not modify
the contents of the page! We have separated this case, because it is
error-prone programming not to set a latch, and it should be used
with care. */
#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
#define buf_page_get_with_no_latch(SP, ZS, OF, MTR) buf_page_get_gen(\
SP, ZS, OF, RW_NO_LATCH, NULL,\
BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
BUF_GET_NO_LATCH, \
__FILE__, __LINE__, MTR)
/******************************************************************
NOTE! The following macros should be used instead of buf_page_get_gen, to
improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed as LA! */
#define buf_page_get_nowait(SP, ZS, OF, LA, MTR) buf_page_get_gen(\
SP, ZS, OF, LA, NULL,\
BUF_GET_NOWAIT, __FILE__, __LINE__, MTR)
BUF_GET_NOWAIT, \
__FILE__, __LINE__, MTR)
/******************************************************************
NOTE! The following macros should be used instead of
buf_page_optimistic_get_func, to improve debugging. Only values RW_S_LATCH and
RW_X_LATCH are allowed as LA! */
#define buf_page_optimistic_get(LA, BL, MC, MTR) \
#define buf_page_optimistic_get(LA, BL, MC, MTR) \
buf_page_optimistic_get_func(LA, BL, MC, __FILE__, __LINE__, MTR)
/************************************************************************
This is the general function used to get optimistic access to a database
......@@ -258,7 +264,8 @@ buf_page_get_gen(
ulint rw_latch,/* in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
buf_block_t* guess, /* in: guessed block or NULL */
ulint mode, /* in: BUF_GET, BUF_GET_IF_IN_POOL,
BUF_GET_NO_LATCH */
BUF_GET_NO_LATCH, BUF_GET_NOWAIT or
BUF_GET_IF_IN_POOL_OR_WATCH */
const char* file, /* in: file name */
ulint line, /* in: line where called */
mtr_t* mtr); /* in: mini-transaction */
......@@ -952,8 +959,23 @@ UNIV_INTERN
ulint
buf_get_free_list_len(void);
/*=======================*/
/********************************************************************
Stop watching if the marked page is read in. */
void
buf_pool_remove_watch(void);
/*=======================*/
/********************************************************************
Check if the given page is being watched and has been read to the buffer
pool. */
ibool
buf_pool_watch_happened(
/*====================*/
/* out: TRUE if the given page is being
watched and it has been read in */
ulint space, /* in: space id */
ulint page_no); /* in: page number */
/* The common buffer control block structure
for compressed and uncompressed frames */
......@@ -1186,6 +1208,16 @@ struct buf_pool_struct{
buf_block_t file pages,
buf_page_in_file() == TRUE,
indexed by (space_id, offset) */
/*--------------------------*/ /* Delete buffering data */
ibool watch_active; /* if TRUE, set watch_happened to
TRUE when page watch_space/
watch_page_no is read in. */
ulint watch_space; /* space id of watched page */
ulint watch_page_no; /* page number of watched page */
ibool watch_happened; /* has watched page been read in */
/*--------------------------*/
hash_table_t* zip_hash; /* hash table of buf_block_t blocks
whose frames are allocated to the
zip buddy system,
......
......@@ -158,14 +158,6 @@ fil_space_get_type(
/* out: FIL_TABLESPACE or FIL_LOG */
ulint id); /* in: space id */
/***********************************************************************
Returns the ibuf data of a file space. */
UNIV_INTERN
ibuf_data_t*
fil_space_get_ibuf_data(
/*====================*/
/* out: ibuf data for this space */
ulint id); /* in: space id */
/***********************************************************************
Appends a new file to the chain of files of a space. File must be closed. */
UNIV_INTERN
void
......@@ -274,14 +266,6 @@ fil_set_max_space_id_if_bigger(
/*===========================*/
ulint max_id);/* in: maximum known id */
/********************************************************************
Initializes the ibuf data structure for space 0 == the system tablespace.
This can be called after the file space headers have been created and the
dictionary system has been initialized. */
UNIV_INTERN
void
fil_ibuf_init_at_db_start(void);
/*===========================*/
/********************************************************************
Writes the flushed lsn and the latest archived log number to the page
header of the first page of each data file in the system tablespace. */
UNIV_INTERN
......
......@@ -18,23 +18,21 @@ Created 7/19/1997 Heikki Tuuri
#include "ibuf0types.h"
#include "fsp0fsp.h"
/* Possible operations buffered in the insert/whatever buffer. See
ibuf_insert(). DO NOT CHANGE THE VALUES OF THESE, THEY ARE STORED ON DISK.
The last enumerator deliberately has no trailing comma: a trailing comma in
an enumerator list is not accepted by C90 or C++03 compilers. */
typedef enum {
	IBUF_OP_INSERT = 0,		/* buffered insert of an index entry */
	IBUF_OP_DELETE_MARK = 1,	/* buffered delete-marking of a
					record */
	IBUF_OP_DELETE = 2,		/* buffered removal of a record */

	/* Number of different operation types; also used as the dimension
	of the per-operation counter arrays. */
	IBUF_OP_COUNT = 3
} ibuf_op_t;
extern ibuf_t* ibuf;
/**********************************************************************
Creates the insert buffer data struct for a single tablespace. Reads the
root page of the insert buffer tree in the tablespace. This function can
be called only after the dictionary system has been initialized, as this
creates also the insert buffer table and index for this tablespace. */
UNIV_INTERN
ibuf_data_t*
ibuf_data_init_for_space(
/*=====================*/
/* out, own: ibuf data struct, linked to the list
in ibuf control structure. */
ulint space); /* in: space id */
/**********************************************************************
Creates the insert buffer data structure at a database startup and
initializes the data structures for the insert buffer of each tablespace. */
Creates the insert buffer data structure at a database startup. */
UNIV_INTERN
void
ibuf_init_at_db_start(void);
......@@ -165,38 +163,29 @@ ibuf_page(
/* out: TRUE if level 2 or level 3 page */
ulint space, /* in: space id */
ulint zip_size,/* in: compressed page size in bytes, or 0 */
ulint page_no);/* in: page number */
/***************************************************************************
Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages. */
UNIV_INTERN
ibool
ibuf_page_low(
/*==========*/
/* out: TRUE if level 2 or level 3 page */
ulint space, /* in: space id */
ulint zip_size,/* in: compressed page size in bytes, or 0 */
ulint page_no,/* in: page number */
mtr_t* mtr); /* in: mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed
address ibuf pages */
address ibuf pages, or NULL, in which case a new
mini-transaction is created. */
/***************************************************************************
Frees excess pages from the ibuf free list. This function is called when an OS
thread calls fsp services to allocate a new file segment, or a new page to a
file segment, and the thread did not own the fsp latch before this call. */
UNIV_INTERN
void
ibuf_free_excess_pages(
/*===================*/
ulint space); /* in: space id */
ibuf_free_excess_pages(void);
/*========================*/
/*************************************************************************
Makes an index insert to the insert buffer, instead of directly to the disk
page, if this is possible. Does not do insert if the index is clustered
or unique. */
Buffer an operation in the insert/delete buffer, instead of doing it
directly to the disk page, if this is possible. Does not do it if the index
is clustered or unique. */
UNIV_INTERN
ibool
ibuf_insert(
/*========*/
/* out: TRUE if success */
ibuf_op_t op, /* in: operation type */
const dtuple_t* entry, /* in: index entry to insert */
dict_index_t* index, /* in: index where to insert */
ulint space, /* in: space id where to insert */
......@@ -205,11 +194,11 @@ ibuf_insert(
que_thr_t* thr); /* in: query thread */
/*************************************************************************
When an index page is read from a disk to the buffer pool, this function
inserts to the page the possible index entries buffered in the insert buffer.
The entries are deleted from the insert buffer. If the page is not read, but
created in the buffer pool, this function deletes its buffered entries from
the insert buffer; there can exist entries for such a page if the page
belonged to an index which subsequently was dropped. */
applies any buffered operations to the page and deletes the entries from the
insert buffer. If the page is not read, but created in the buffer pool, this
function deletes its buffered entries from the insert buffer; there can
exist entries for such a page if the page belonged to an index which
subsequently was dropped. */
UNIV_INTERN
void
ibuf_merge_or_delete_for_page(
......@@ -300,6 +289,16 @@ void
ibuf_print(
/*=======*/
FILE* file); /* in: file where to print */
/********************************************************************
Read the first two bytes from a record's fourth field (counter field in new
records; something else in older records). */
ulint
ibuf_rec_get_fake_counter(
/*======================*/
/* out: "counter" field, or ULINT_UNDEFINED if for
some reason it can't be read*/
rec_t* rec); /* in: ibuf record */
#define IBUF_HEADER_PAGE_NO FSP_IBUF_HEADER_PAGE_NO
#define IBUF_TREE_ROOT_PAGE_NO FSP_IBUF_TREE_ROOT_PAGE_NO
......@@ -309,6 +308,9 @@ for the file segment from which the pages for the ibuf tree are allocated */
#define IBUF_HEADER PAGE_DATA
#define IBUF_TREE_SEG_HEADER 0 /* fseg header for ibuf tree */
/* The insert buffer tree itself is always located in space 0. */
#define IBUF_SPACE_ID 0
#ifndef UNIV_NONINL
#include "ibuf0ibuf.ic"
#endif
......
......@@ -18,36 +18,37 @@ If there is this much of free space, the corresponding bits are set in the
ibuf bitmap. */
#define IBUF_PAGE_SIZE_PER_FREE_SPACE 32
/* Insert buffer data struct for a single tablespace */
struct ibuf_data_struct{
ulint space; /* space id */
ulint seg_size;/* allocated pages if the file segment
containing ibuf header and tree */
ulint size; /* size of the insert buffer tree in pages */
ibool empty; /* after an insert to the ibuf tree is
performed, this is set to FALSE, and if a
contract operation finds the tree empty, this
is set to TRUE */
ulint free_list_len;
/* length of the free list */
ulint height; /* tree height */
dict_index_t* index; /* insert buffer index */
UT_LIST_NODE_T(ibuf_data_t) data_list;
/* list of ibuf data structs */
ulint n_inserts;/* number of inserts made to the insert
buffer */
ulint n_merges;/* number of pages merged */
ulint n_merged_recs;/* number of records merged */
};
/* Insert buffer struct */
struct ibuf_struct{
ulint size; /* current size of the ibuf index
trees in pages */
ulint max_size; /* recommended maximum size in pages
for the ibuf index tree */
UT_LIST_BASE_NODE_T(ibuf_data_t) data_list;
/* list of ibuf data structs for
each tablespace */
tree, in pages */
ulint max_size; /* recommended maximum size of the
ibuf index tree, in pages */
ulint seg_size; /* allocated pages of the file
segment containing ibuf header and
tree */
ibool empty; /* after an insert to the ibuf tree
is performed, this is set to FALSE,
and if a contract operation finds
the tree empty, this is set to
TRUE */
ulint free_list_len; /* length of the free list */
ulint height; /* tree height */
dict_index_t* index; /* insert buffer index */
ulint n_ops[IBUF_OP_COUNT];
/* number of operations of each type
done */
ulint n_merges; /* number of pages merged */
ulint n_merged_ops[IBUF_OP_COUNT];
/* number of operations of each type
merged to index pages */
ulint n_discarded_ops[IBUF_OP_COUNT];
/* number of operations of each type
discarded without merging due to the
tablespace being deleted or the
index being dropped */
};
/****************************************************************************
......
......@@ -9,7 +9,6 @@ Created 7/29/1997 Heikki Tuuri
#ifndef ibuf0types_h
#define ibuf0types_h
typedef struct ibuf_data_struct ibuf_data_t;
typedef struct ibuf_struct ibuf_t;
#endif
......@@ -268,6 +268,9 @@ ibool
row_search_index_entry(
/*===================*/
/* out: TRUE if found */
ibool* was_buffered,
/* out: TRUE if the operation was buffered
in the insert/delete buffer. Can be NULL. */
dict_index_t* index, /* in: index */
const dtuple_t* entry, /* in: index entry */
ulint mode, /* in: BTR_MODIFY_LEAF, ... */
......
......@@ -137,6 +137,9 @@ operations (very slow); also UNIV_DEBUG must be defined */
for compressed pages */
#endif
//#define UNIV_DEBUG
//#define UNIV_SYNC_DEBUG
//#define UNIV_IBUF_DEBUG
#define UNIV_BTR_DEBUG /* check B-tree links */
#define UNIV_LIGHT_MEM_DEBUG /* light memory debugging */
......@@ -316,8 +319,11 @@ it is read. */
/* Minimize cache-miss latency by moving data at addr into a cache before
it is read or written. */
# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
/* Tell the compiler that variable/function is unused. */
# define UNIV_UNUSED __attribute__ ((unused))
#else
/* Dummy versions of the macros */
# define UNIV_UNUSED
# define UNIV_EXPECT(expr,value) (expr)
# define UNIV_LIKELY_NULL(expr) (expr)
# define UNIV_PREFETCH_R(addr) ((void) 0)
......
......@@ -197,11 +197,12 @@ retry:
}
/***************************************************************
Removes a secondary index entry if possible. */
Removes a secondary index entry if possible, without trying to use the
insert/delete buffer. */
static
ibool
row_purge_remove_sec_if_poss_low(
/*=============================*/
row_purge_remove_sec_if_poss_low_nonbuffered(
/*=========================================*/
/* out: TRUE if success or if not found */
purge_node_t* node, /* in: row purge node */
dict_index_t* index, /* in: index */
......@@ -212,7 +213,7 @@ row_purge_remove_sec_if_poss_low(
btr_pcur_t pcur;
btr_cur_t* btr_cur;
ibool success;
ibool old_has = 0; /* remove warning */
ibool old_has = FALSE; /* remove warning */
ibool found;
ulint err;
mtr_t mtr;
......@@ -221,13 +222,13 @@ row_purge_remove_sec_if_poss_low(
log_free_check();
mtr_start(&mtr);
found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
found = row_search_index_entry(NULL, index, entry, mode, &pcur, &mtr);
if (!found) {
/* Not found */
/* fputs("PURGE:........sec entry not found\n", stderr); */
/* dtuple_print(stderr, entry); */
/* dtuple_print(entry); */
btr_pcur_close(&pcur);
mtr_commit(&mtr);
......@@ -266,8 +267,13 @@ row_purge_remove_sec_if_poss_low(
ut_ad(mode == BTR_MODIFY_TREE);
btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
FALSE, &mtr);
success = err == DB_SUCCESS;
ut_a(success || err == DB_OUT_OF_FILE_SPACE);
if (err == DB_SUCCESS) {
success = TRUE;
} else if (err == DB_OUT_OF_FILE_SPACE) {
success = FALSE;
} else {
ut_error;
}
}
}
......@@ -277,6 +283,117 @@ row_purge_remove_sec_if_poss_low(
return(success);
}
/***************************************************************
Removes a secondary index entry if possible. For BTR_MODIFY_LEAF the
function first probes for the entry without reading the leaf page from disk;
if the leaf page is not in the buffer pool, it tries to buffer the delete in
the insert/delete buffer instead. In all other cases (BTR_MODIFY_TREE, leaf
page already in the buffer pool, or buffering not done) the work is delegated
to row_purge_remove_sec_if_poss_low_nonbuffered(). */
static
ibool
row_purge_remove_sec_if_poss_low(
/*=============================*/
/* out: TRUE if success or if not found; TRUE is
also returned when the entry must be kept because
an older row version still needs it, and when the
delete was buffered */
purge_node_t* node, /* in: row purge node */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry */
ulint mode) /* in: latch mode BTR_MODIFY_LEAF or
BTR_MODIFY_TREE */
{
mtr_t mtr;
btr_pcur_t pcur;
btr_cur_t* btr_cur;
ibool found;
ibool success;
ibool was_buffered;
ibool old_has = FALSE;
ibool leaf_in_buf_pool;
ut_a((mode == BTR_MODIFY_TREE) || (mode == BTR_MODIFY_LEAF));
if (mode == BTR_MODIFY_TREE) {
/* Can't use the insert/delete buffer if we potentially
need to split pages. */
return(row_purge_remove_sec_if_poss_low_nonbuffered(
node, index, entry, mode));
}
log_free_check();
/* Step 1: probe for the entry. With BTR_WATCH_LEAF the leaf page is
not read in if it is missing from the buffer pool; instead
leaf_in_buf_pool is set to FALSE in the tree cursor and a buffer pool
watch is set on the page. */
mtr_start(&mtr);
found = row_search_index_entry(
NULL, index, entry,
BTR_SEARCH_LEAF | BTR_WATCH_LEAF, &pcur, &mtr);
btr_cur = btr_pcur_get_btr_cur(&pcur);
leaf_in_buf_pool = btr_cur->leaf_in_buf_pool;
/* row_search_index_entry() reports "not found" whenever the leaf page
was not read in, so a hit implies that the leaf is in the buffer
pool. */
ut_a(!(found && !leaf_in_buf_pool));
btr_pcur_close(&pcur);
mtr_commit(&mtr);
if (leaf_in_buf_pool) {
if (found) {
/* Index entry exists and is in the buffer pool, no
need to use the insert/delete buffer. */
return(row_purge_remove_sec_if_poss_low_nonbuffered(
node, index, entry, BTR_MODIFY_LEAF));
} else {
/* Index entry does not exist, nothing to do. */
return(TRUE);
}
}
/* Step 2: the leaf page is not in the buffer pool. */
/* We should remove the index record if no later version of the row,
which cannot be purged yet, requires its existence. If some
requires, we should do nothing. */
mtr_start(&mtr);
success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr);
if (success) {
old_has = row_vers_old_has_index_entry(
TRUE, btr_pcur_get_rec(&node->pcur),
&mtr, index, entry);
}
btr_pcur_commit_specify_mtr(&node->pcur, &mtr);
if (success && old_has) {
/* Can't remove the index record yet. */
/* The watch set in step 1 is no longer needed. */
buf_pool_remove_watch();
return(TRUE);
}
/* Step 3: repeat the search with BTR_DELETE, which asks for the delete
to be done through the insert/delete buffer. The return value of the
search is deliberately ignored; only was_buffered matters here. */
mtr_start(&mtr);
/* btr_cur was obtained from pcur in step 1; the query thread is stored
in it before the search. NOTE(review): presumably the buffering code
reads thr from the tree cursor - confirm that btr_pcur_open() does
not reset it. */
btr_cur->thr = que_node_get_parent(node);
row_search_index_entry(&was_buffered, index, entry,
BTR_MODIFY_LEAF | BTR_DELETE, &pcur,
&mtr);
btr_pcur_close(&pcur);
mtr_commit(&mtr);
/* Stop watching the page, whatever the outcome was. */
buf_pool_remove_watch();
if (!was_buffered) {
/* Page read into buffer pool or delete-buffering failed. */
return(row_purge_remove_sec_if_poss_low_nonbuffered(
node, index, entry, BTR_MODIFY_LEAF));
}
return(TRUE);
}
/***************************************************************
Removes a secondary index entry if possible. */
UNIV_INLINE
......
......@@ -789,6 +789,9 @@ ibool
row_search_index_entry(
/*===================*/
/* out: TRUE if found */
ibool* was_buffered,
/* out: TRUE if the operation was buffered
in the insert/delete buffer. Can be NULL. */
dict_index_t* index, /* in: index */
const dtuple_t* entry, /* in: index entry */
ulint mode, /* in: BTR_MODIFY_LEAF, ... */
......@@ -799,17 +802,48 @@ row_search_index_entry(
ulint n_fields;
ulint low_match;
rec_t* rec;
ibool ret;
ut_ad(dtuple_check_typed(entry));
btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
ret = btr_pcur_was_buffered(pcur);
if (was_buffered) {
*was_buffered = ret;
}
if (ret) {
/* Operation was buffered in the insert/delete buffer;
pretend that we found the record. */
return(TRUE);
} else if ((mode & BTR_WATCH_LEAF)
&& !btr_pcur_get_btr_cur(pcur)->leaf_in_buf_pool) {
/* We did not read in the leaf page, thus we can't have
found anything. */
return(FALSE);
}
low_match = btr_pcur_get_low_match(pcur);
rec = btr_pcur_get_rec(pcur);
n_fields = dtuple_get_n_fields(entry);
return(!page_rec_is_infimum(rec) && low_match == n_fields);
if (page_rec_is_infimum(rec)) {
return(FALSE);
} else if (low_match != n_fields) {
/* Not found */
return(FALSE);
}
return(TRUE);
}
#ifndef UNIV_HOTBACKUP
......
......@@ -136,7 +136,7 @@ row_undo_ins_remove_sec_low(
log_free_check();
mtr_start(&mtr);
found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
found = row_search_index_entry(NULL, index, entry, mode, &pcur, &mtr);
btr_cur = btr_pcur_get_btr_cur(&pcur);
......
......@@ -307,7 +307,7 @@ row_undo_mod_del_mark_or_remove_sec_low(
log_free_check();
mtr_start(&mtr);
found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
found = row_search_index_entry(NULL, index, entry, mode, &pcur, &mtr);
btr_cur = btr_pcur_get_btr_cur(&pcur);
......@@ -432,7 +432,7 @@ row_undo_mod_del_unmark_sec_and_undo_update(
return(DB_SUCCESS);
}
if (UNIV_UNLIKELY(!row_search_index_entry(index, entry,
if (UNIV_UNLIKELY(!row_search_index_entry(NULL, index, entry,
mode, &pcur, &mtr))) {
fputs("InnoDB: error in sec index entry del undo in\n"
"InnoDB: ", stderr);
......
......@@ -1451,21 +1451,23 @@ row_upd_sec_index_entry(
upd_node_t* node, /* in: row update node */
que_thr_t* thr) /* in: query thread */
{
ibool check_ref;
ibool found;
dict_index_t* index;
dtuple_t* entry;
mtr_t mtr;
rec_t* rec;
btr_pcur_t pcur;
btr_cur_t* btr_cur;
mem_heap_t* heap;
rec_t* rec;
ulint err = DB_SUCCESS;
mtr_t mtr;
trx_t* trx = thr_get_trx(thr);
dtuple_t* entry;
dict_index_t* index;
ibool found;
btr_cur_t* btr_cur;
ibool referenced;
ibool was_buffered;
ulint err = DB_SUCCESS;
trx_t* trx = thr_get_trx(thr);
ulint mode = BTR_MODIFY_LEAF;
index = node->index;
check_ref = row_upd_index_is_referenced(index, trx);
referenced = row_upd_index_is_referenced(index, trx);
heap = mem_heap_create(1024);
......@@ -1476,8 +1478,24 @@ row_upd_sec_index_entry(
log_free_check();
mtr_start(&mtr);
found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
&mtr);
btr_pcur_get_btr_cur(&pcur)->thr = thr;
/* We can only try to use the insert/delete buffer to buffer
delete-mark operations if the index we're modifying has no foreign
key constraints referring to it. */
if (!referenced) {
mode |= BTR_DELETE_MARK;
}
found = row_search_index_entry(
&was_buffered, index, entry, BTR_MODIFY_LEAF, &pcur, &mtr);
if (was_buffered) {
/* Entry was delete marked already. */
goto close_cur;
}
btr_cur = btr_pcur_get_btr_cur(&pcur);
rec = btr_cur_get_rec(btr_cur);
......@@ -1504,15 +1522,20 @@ row_upd_sec_index_entry(
delete marked if we return after a lock wait in
row_ins_index_entry below */
if (!rec_get_deleted_flag(rec,
dict_table_is_comp(index->table))) {
err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
thr, &mtr);
if (err == DB_SUCCESS && check_ref) {
if (!rec_get_deleted_flag(
rec, dict_table_is_comp(index->table))) {
err = btr_cur_del_mark_set_sec_rec(
0, btr_cur, TRUE, thr, &mtr);
if (err == DB_SUCCESS && referenced) {
ulint* offsets;
offsets = rec_get_offsets(
rec, index, NULL, ULINT_UNDEFINED,
&heap);
ulint* offsets = rec_get_offsets(
rec, index, NULL,
ULINT_UNDEFINED, &heap);
/* NOTE that the following call loses
the position of pcur ! */
err = row_upd_check_references_constraints(
......@@ -1522,6 +1545,7 @@ row_upd_sec_index_entry(
}
}
close_cur:
btr_pcur_close(&pcur);
mtr_commit(&mtr);
......@@ -1583,7 +1607,7 @@ row_upd_clust_rec_by_insert(
upd_node_t* node, /* in: row update node */
dict_index_t* index, /* in: clustered index of the record */
que_thr_t* thr, /* in: query thread */
ibool check_ref,/* in: TRUE if index may be referenced in
ibool referenced,/* in: TRUE if index may be referenced in
a foreign key constraint */
mtr_t* mtr) /* in: mtr; gets committed here */
{
......@@ -1629,16 +1653,21 @@ row_upd_clust_rec_by_insert(
btr_cur_mark_extern_inherited_fields(
btr_cur_get_page_zip(btr_cur),
rec, index, offsets, node->update, mtr);
if (check_ref) {
if (referenced) {
/* NOTE that the following call loses
the position of pcur ! */
err = row_upd_check_references_constraints(
node, pcur, table, index, offsets, thr, mtr);
if (err != DB_SUCCESS) {
mtr_commit(mtr);
if (UNIV_LIKELY_NULL(heap)) {
mem_heap_free(heap);
}
return(err);
}
}
......@@ -1794,7 +1823,8 @@ row_upd_del_mark_clust_rec(
ulint* offsets,/* in/out: rec_get_offsets() for the
record under the cursor */
que_thr_t* thr, /* in: query thread */
ibool check_ref,/* in: TRUE if index may be referenced in
ibool referenced,
/* in: TRUE if index may be referenced in
a foreign key constraint */
mtr_t* mtr) /* in: mtr; gets committed here */
{
......@@ -1819,13 +1849,11 @@ row_upd_del_mark_clust_rec(
err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
btr_cur, TRUE, thr, mtr);
if (err == DB_SUCCESS && check_ref) {
if (err == DB_SUCCESS && referenced) {
/* NOTE that the following call loses the position of pcur ! */
err = row_upd_check_references_constraints(node,
pcur, index->table,
index, offsets,
thr, mtr);
err = row_upd_check_references_constraints(
node, pcur, index->table, index, offsets, thr, mtr);
}
mtr_commit(mtr);
......@@ -1848,7 +1876,6 @@ row_upd_clust_step(
dict_index_t* index;
btr_pcur_t* pcur;
ibool success;
ibool check_ref;
ulint err;
mtr_t* mtr;
mtr_t mtr_buf;
......@@ -1856,11 +1883,12 @@ row_upd_clust_step(
mem_heap_t* heap = NULL;
ulint offsets_[REC_OFFS_NORMAL_SIZE];
ulint* offsets;
ibool referenced;
rec_offs_init(offsets_);
index = dict_table_get_first_index(node->table);
check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
referenced = row_upd_index_is_referenced(index, thr_get_trx(thr));
pcur = node->pcur;
......@@ -1930,8 +1958,9 @@ row_upd_clust_step(
/* NOTE: the following function calls will also commit mtr */
if (node->is_delete) {
err = row_upd_del_mark_clust_rec(node, index, offsets,
thr, check_ref, mtr);
err = row_upd_del_mark_clust_rec(
node, index, offsets, thr, referenced, mtr);
if (err == DB_SUCCESS) {
node->state = UPD_NODE_UPDATE_ALL_SEC;
node->index = dict_table_get_next_index(index);
......@@ -1979,8 +2008,9 @@ exit_func:
choosing records to update. MySQL solves now the problem
externally! */
err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
mtr);
err = row_upd_clust_rec_by_insert(
node, index, thr, referenced, mtr);
if (err != DB_SUCCESS) {
return(err);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment