Commit 52949169 authored by unknown's avatar unknown

Many files:

  Merge 3.23.52


innobase/btr/btr0btr.c:
  Merge 3.23.52
innobase/btr/btr0cur.c:
  Merge 3.23.52
innobase/btr/btr0sea.c:
  Merge 3.23.52
innobase/include/btr0btr.h:
  Merge 3.23.52
innobase/include/btr0cur.h:
  Merge 3.23.52
innobase/include/btr0sea.h:
  Merge 3.23.52
innobase/include/buf0buf.h:
  Merge 3.23.52
innobase/include/buf0rea.h:
  Merge 3.23.52
innobase/include/data0data.h:
  Merge 3.23.52
innobase/include/data0data.ic:
  Merge 3.23.52
innobase/include/log0log.h:
  Merge 3.23.52
innobase/include/log0log.ic:
  Merge 3.23.52
innobase/include/os0file.h:
  Merge 3.23.52
innobase/include/page0page.h:
  Merge 3.23.52
innobase/include/page0page.ic:
  Merge 3.23.52
innobase/include/row0mysql.h:
  Merge 3.23.52
innobase/include/trx0roll.h:
  Merge 3.23.52
innobase/include/trx0sys.h:
  Merge 3.23.52
innobase/include/trx0trx.h:
  Merge 3.23.52
innobase/include/ut0ut.h:
  Merge 3.23.52
innobase/include/univ.i:
  Merge 3.23.52
innobase/include/ut0ut.ic:
  Merge 3.23.52
innobase/buf/buf0buf.c:
  Merge 3.23.52
innobase/buf/buf0rea.c:
  Merge 3.23.52
innobase/data/data0data.c:
  Merge 3.23.52
innobase/dict/dict0crea.c:
  Merge 3.23.52
innobase/dict/dict0dict.c:
  Merge 3.23.52
innobase/dict/dict0load.c:
  Merge 3.23.52
innobase/dict/dict0mem.c:
  Merge 3.23.52
innobase/fsp/fsp0fsp.c:
  Merge 3.23.52
innobase/ibuf/ibuf0ibuf.c:
  Merge 3.23.52
innobase/lock/lock0lock.c:
  Merge 3.23.52
innobase/log/log0log.c:
  Merge 3.23.52
innobase/log/log0recv.c:
  Merge 3.23.52
innobase/mtr/mtr0log.c:
  Merge 3.23.52
innobase/mtr/mtr0mtr.c:
  Merge 3.23.52
innobase/os/os0file.c:
  Merge 3.23.52
innobase/page/page0cur.c:
  Merge 3.23.52
innobase/page/page0page.c:
  Merge 3.23.52
innobase/rem/rem0cmp.c:
  Merge 3.23.52
innobase/row/row0ins.c:
  Merge 3.23.52
innobase/row/row0mysql.c:
  Merge 3.23.52
innobase/row/row0purge.c:
  Merge 3.23.52
innobase/row/row0upd.c:
  Merge 3.23.52
innobase/srv/srv0srv.c:
  Merge 3.23.52
innobase/srv/srv0start.c:
  Merge 3.23.52
innobase/trx/trx0roll.c:
  Merge 3.23.52
innobase/trx/trx0sys.c:
  Merge 3.23.52
innobase/trx/trx0trx.c:
  Merge 3.23.52
innobase/trx/trx0undo.c:
  Merge 3.23.52
innobase/ut/ut0mem.c:
  Merge 3.23.52
innobase/ut/ut0ut.c:
  Merge 3.23.52
parent bd4b2812
......@@ -572,6 +572,13 @@ btr_page_get_father_for_rec(
if (btr_node_ptr_get_child_page_no(node_ptr) !=
buf_frame_get_page_no(page)) {
fprintf(stderr,
"InnoDB: Dump of the child page:\n");
buf_page_print(buf_frame_align(page));
fprintf(stderr,
"InnoDB: Dump of the parent page:\n");
buf_page_print(buf_frame_align(node_ptr));
fprintf(stderr,
"InnoDB: Corruption of an index tree: table %s, index %s,\n"
"InnoDB: father ptr page no %lu, child page no %lu\n",
......@@ -581,6 +588,12 @@ btr_page_get_father_for_rec(
buf_frame_get_page_no(page));
page_rec_print(page_rec_get_next(page_get_infimum_rec(page)));
page_rec_print(node_ptr);
fprintf(stderr,
"InnoDB: You should dump + drop + reimport the table to fix the\n"
"InnoDB: corruption. If the crash happens at the database startup, see\n"
"InnoDB: section 6.1 of http://www.innodb.com/ibman.html about forcing\n"
"InnoDB: recovery. Then dump + drop + reimport.\n");
}
ut_a(btr_node_ptr_get_child_page_no(node_ptr) ==
......@@ -780,12 +793,14 @@ btr_free_root(
/*****************************************************************
Reorganizes an index page. */
static
void
btr_page_reorganize_low(
/*====================*/
ibool low, /* in: TRUE if locks should not be updated, i.e.,
there cannot exist locks on the page */
ibool recovery,/* in: TRUE if called in recovery: locks should not
be updated, i.e., there cannot exist locks on the
page, and a hash index should not be dropped: it
cannot exist */
page_t* page, /* in: page to be reorganized */
mtr_t* mtr) /* in: mtr */
{
......@@ -805,7 +820,9 @@ btr_page_reorganize_low(
/* Copy the old page to temporary space */
buf_frame_copy(new_page, page);
btr_search_drop_page_hash_index(page);
if (!recovery) {
btr_search_drop_page_hash_index(page);
}
/* Recreate the page: note that global data on page (possible
segment headers, next page-field, etc.) is preserved intact */
......@@ -820,7 +837,7 @@ btr_page_reorganize_low(
/* Copy max trx id to recreated page */
page_set_max_trx_id(page, page_get_max_trx_id(new_page));
if (!low) {
if (!recovery) {
/* Update the record lock bitmaps */
lock_move_reorganize_page(page, new_page);
}
......
......@@ -36,9 +36,14 @@ Created 10/16/1994 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "lock0lock.h"
/* If the following is set to TRUE, this module prints a lot of
trace information of individual record operations */
ibool btr_cur_print_record_ops = FALSE;
ulint btr_cur_rnd = 0;
ulint btr_cur_n_non_sea = 0;
ulint btr_cur_n_sea = 0;
/* In the optimistic insert, if the insert does not fit, but this much space
can be released by page reorganize, then it is reorganized */
......@@ -187,11 +192,7 @@ btr_cur_search_to_nth_level(
tuple must be set so that it cannot get
compared to the node ptr page number field! */
ulint mode, /* in: PAGE_CUR_L, ...;
NOTE that if the search is made using a unique
prefix of a record, mode should be
PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
may end up on the previous page relative to the
record! Inserts should always be made using
Inserts should always be made using
PAGE_CUR_LE to search the position! */
ulint latch_mode, /* in: BTR_SEARCH_LEAF, ..., ORed with
BTR_INSERT and BTR_ESTIMATE;
......@@ -268,7 +269,7 @@ btr_cur_search_to_nth_level(
#ifdef UNIV_SEARCH_PERF_STAT
info->n_searches++;
#endif
if (btr_search_latch.writer != RW_LOCK_NOT_LOCKED
if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
&& latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
&& !estimate
&& btr_search_guess_on_hash(index, info, tuple, mode,
......@@ -283,14 +284,14 @@ btr_cur_search_to_nth_level(
|| mode != PAGE_CUR_LE);
ut_ad(cursor->low_match != ULINT_UNDEFINED
|| mode != PAGE_CUR_LE);
btr_cur_n_sea++;
return;
}
#endif
#endif
#ifdef UNIV_SEARCH_PERF_STAT
btr_cur_n_non_sea++;
#endif
/* If the hash search did not succeed, do binary search down the
tree */
......@@ -796,15 +797,28 @@ btr_cur_optimistic_insert(
ulint data_size;
ulint extra_size;
ulint type;
ulint err;
ut_ad(dtuple_check_typed(entry));
ulint err;
*big_rec = NULL;
page = btr_cur_get_page(cursor);
index = cursor->index;
if (!dtuple_check_typed_no_assert(entry)) {
	/* The tuple failed the type check: report which table and index
	it was headed for. table_name and name are character strings
	(they are printed with %s elsewhere in this function), so they
	must not be formatted with %lu */

	fprintf(stderr,
"InnoDB: Error in a tuple to insert into table %s index %s\n",
		index->table_name, index->name);
}
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to insert to table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
dtuple_print(entry);
}
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
max_size = page_get_max_insert_size_after_reorganize(page, 1);
......@@ -928,7 +942,7 @@ btr_cur_optimistic_insert(
buf_frame_get_page_no(page), max_size,
rec_size + PAGE_DIR_SLOT_SIZE, type);
*/
if (!(type & (DICT_CLUSTERED | DICT_UNIQUE))) {
if (!(type & DICT_CLUSTERED)) {
/* We have added a record to page: update its free bits */
ibuf_update_free_bits_if_full(cursor->index, page, max_size,
rec_size + PAGE_DIR_SLOT_SIZE);
......@@ -1258,6 +1272,15 @@ btr_cur_update_sec_rec_in_place(
rec = btr_cur_get_rec(cursor);
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to update table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
err = lock_sec_rec_modify_check_and_lock(0, rec, index, thr);
if (err != DB_SUCCESS) {
......@@ -1312,6 +1335,15 @@ btr_cur_update_in_place(
index = cursor->index;
trx = thr_get_trx(thr);
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to update table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
/* Do lock checking and undo logging */
err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
thr, &roll_ptr);
......@@ -1398,6 +1430,15 @@ btr_cur_optimistic_update(
rec = btr_cur_get_rec(cursor);
index = cursor->index;
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to update table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
MTR_MEMO_PAGE_X_FIX));
if (!row_upd_changes_field_size(rec, index, update)) {
......@@ -1973,6 +2014,15 @@ btr_cur_del_mark_set_clust_rec(
rec = btr_cur_get_rec(cursor);
index = cursor->index;
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to del mark table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
index->table_name, index->name);
rec_print(rec);
}
ut_ad(index->type & DICT_CLUSTERED);
ut_ad(rec_get_deleted_flag(rec) == FALSE);
......@@ -2102,6 +2152,15 @@ btr_cur_del_mark_set_sec_rec(
rec = btr_cur_get_rec(cursor);
if (btr_cur_print_record_ops && thr) {
printf(
"Trx with id %lu %lu going to del mark table %s index %s\n",
ut_dulint_get_high(thr_get_trx(thr)->id),
ut_dulint_get_low(thr_get_trx(thr)->id),
cursor->index->table_name, cursor->index->name);
rec_print(rec);
}
err = lock_sec_rec_modify_check_and_lock(flags, rec, cursor->index,
thr);
if (err != DB_SUCCESS) {
......
......@@ -15,6 +15,7 @@ Created 2/17/1996 Heikki Tuuri
#include "page0page.h"
#include "page0cur.h"
#include "btr0cur.h"
#include "btr0pcur.h"
#include "btr0btr.h"
ulint btr_search_n_succ = 0;
......@@ -145,6 +146,8 @@ btr_search_info_create(
info = mem_heap_alloc(heap, sizeof(btr_search_t));
info->magic_n = BTR_SEARCH_MAGIC_N;
info->last_search = NULL;
info->n_direction = 0;
info->root_guess = NULL;
......@@ -159,6 +162,12 @@ btr_search_info_create(
info->n_patt_succ = 0;
info->n_searches = 0;
/* Set some sensible values */
info->n_fields = 1;
info->n_bytes = 0;
info->side = BTR_SEARCH_LEFT_SIDE;
return(info);
}
......@@ -197,7 +206,7 @@ btr_search_info_update_hash(
/* Test if the search would have succeeded using the recommended
hash prefix */
if ((info->n_fields >= n_unique) && (cursor->up_match >= n_unique)) {
if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
info->n_hash_potential++;
......@@ -207,8 +216,8 @@ btr_search_info_update_hash(
cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
cursor->low_match, cursor->low_bytes);
if (((info->side == BTR_SEARCH_LEFT_SIDE) && (cmp <= 0))
|| ((info->side == BTR_SEARCH_RIGHT_SIDE) && (cmp > 0))) {
if ((info->side == BTR_SEARCH_LEFT_SIDE && cmp <= 0)
|| (info->side == BTR_SEARCH_RIGHT_SIDE && cmp > 0)) {
goto set_new_recomm;
}
......@@ -216,8 +225,8 @@ btr_search_info_update_hash(
cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
cursor->up_match, cursor->up_bytes);
if (((info->side == BTR_SEARCH_LEFT_SIDE) && (cmp > 0))
|| ((info->side == BTR_SEARCH_RIGHT_SIDE) && (cmp <= 0))) {
if ((info->side == BTR_SEARCH_LEFT_SIDE && cmp > 0)
|| (info->side == BTR_SEARCH_RIGHT_SIDE && cmp <= 0)) {
goto set_new_recomm;
}
......@@ -233,19 +242,18 @@ btr_search_info_update_hash(
info->hash_analysis = 0;
if ((cursor->up_match >= n_unique)
|| (cursor->low_match >= n_unique)) {
info->n_fields = n_unique;
info->n_bytes = 0;
info->side = BTR_SEARCH_LEFT_SIDE;
}
cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
cursor->low_match, cursor->low_bytes);
if (cmp == 0) {
info->n_hash_potential = 0;
/* For extra safety, we set some sensible values here */
info->n_fields = 1;
info->n_bytes = 0;
info->side = BTR_SEARCH_LEFT_SIDE;
} else if (cmp > 0) {
info->n_hash_potential = 1;
......@@ -305,6 +313,9 @@ btr_search_update_block_hash_info(
info->last_hash_succ = FALSE;
ut_a(block->magic_n == BUF_BLOCK_MAGIC_N);
ut_a(info->magic_n == BTR_SEARCH_MAGIC_N);
if ((block->n_hash_helps > 0)
&& (info->n_hash_potential > 0)
&& (block->n_fields == info->n_fields)
......@@ -622,6 +633,7 @@ btr_search_guess_on_hash(
dulint tree_id;
#ifdef notdefined
btr_cur_t cursor2;
btr_pcur_t pcur;
#endif
ut_ad(index && info && tuple && cursor && mtr);
ut_ad((latch_mode == BTR_SEARCH_LEAF)
......@@ -754,7 +766,26 @@ btr_search_guess_on_hash(
btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
&cursor2, 0, mtr);
ut_a(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
if (mode == PAGE_CUR_GE
&& btr_cur_get_rec(&cursor2) == page_get_supremum_rec(
buf_frame_align(btr_cur_get_rec(&cursor2)))) {
/* If mode is PAGE_CUR_GE, then the binary search
in the index tree may actually take us to the supremum
of the previous page */
info->last_hash_succ = FALSE;
btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode,
&pcur, mtr);
ut_a(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
} else {
ut_a(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
}
/* NOTE that it is theoretically possible that the above assertions
fail if the page of the cursor gets removed from the buffer pool
meanwhile! Thus it might not be a bug. */
info->last_hash_succ = TRUE;
#endif
......@@ -835,6 +866,8 @@ btr_search_drop_page_hash_index(
n_fields = block->curr_n_fields;
n_bytes = block->curr_n_bytes;
ut_a(n_fields + n_bytes > 0);
rw_lock_s_unlock(&btr_search_latch);
n_recs = page_get_n_recs(page);
......@@ -851,6 +884,14 @@ btr_search_drop_page_hash_index(
rec = page_get_infimum_rec(page);
rec = page_rec_get_next(rec);
if (rec != sup) {
ut_a(n_fields <= rec_get_n_fields(rec));
if (n_bytes > 0) {
ut_a(n_fields < rec_get_n_fields(rec));
}
}
tree_id = btr_page_get_index_id(page);
prev_fold = 0;
......@@ -980,6 +1021,8 @@ btr_search_build_page_hash_index(
return;
}
ut_a(n_fields + n_bytes > 0);
/* Calculate and cache fold values and corresponding records into
an array for fast insertion to the hash index */
......@@ -995,6 +1038,14 @@ btr_search_build_page_hash_index(
rec = page_get_infimum_rec(page);
rec = page_rec_get_next(rec);
if (rec != sup) {
ut_a(n_fields <= rec_get_n_fields(rec));
if (n_bytes > 0) {
ut_a(n_fields < rec_get_n_fields(rec));
}
}
/* FIXME: in a mixed tree, all records may not have enough ordering
fields: */
......
......@@ -1125,13 +1125,51 @@ buf_page_get_known_nowait(
return(TRUE);
}
/************************************************************************
Fills in a buffer control block so that it describes the file page
(space, offset). Intended for use in ibbackup --restore. */

void
buf_page_init_for_backup_restore(
/*=============================*/
	ulint		space,	/* in: space id */
	ulint		offset,	/* in: offset of the page within space
				in units of a page */
	buf_block_t*	block)	/* in: block to init */
{
	/* Magic number, block state and the page address */

	block->magic_n = BUF_BLOCK_MAGIC_N;
	block->state = BUF_BLOCK_FILE_PAGE;
	block->offset = offset;
	block->space = space;

	/* Lock-related fields */

	block->lock_mutex = NULL;
	block->lock_hash_val = 0;

	/* Modification lsn's and the freed page clock start from zero */

	block->oldest_modification = ut_dulint_zero;
	block->newest_modification = ut_dulint_zero;
	block->freed_page_clock = 0;

	/* Access, fix and i/o bookkeeping */

	block->io_fix = 0;
	block->buf_fix_count = 0;
	block->accessed = FALSE;

	/* The remaining fields get the same start values as before:
	n_fields 1, n_bytes 0, left side, nothing hashed */

	block->is_hashed = FALSE;
	block->n_hash_helps = 0;
	block->side = BTR_SEARCH_LEFT_SIDE;
	block->n_bytes = 0;
	block->n_fields = 1;

	block->file_page_was_freed = FALSE;
}
/************************************************************************
Inits a page to the buffer buf_pool. */
static
void
buf_page_init(
/*==========*/
/* out: pointer to the block */
ulint space, /* in: space id */
ulint offset, /* in: offset of the page within space
in units of a page */
......@@ -1141,6 +1179,8 @@ buf_page_init(
ut_ad(block->state == BUF_BLOCK_READY_FOR_USE);
/* Set the state of the block */
block->magic_n = BUF_BLOCK_MAGIC_N;
block->state = BUF_BLOCK_FILE_PAGE;
block->space = space;
block->offset = offset;
......
......@@ -100,6 +100,11 @@ buf_read_page_low(
block = buf_page_init_for_read(mode, space, offset);
if (block != NULL) {
if (buf_debug_prints) {
printf("Posting read request for page %lu, sync %lu\n",
offset, sync);
}
fil_io(OS_FILE_READ | wake_later,
sync, space, offset, 0, UNIV_PAGE_SIZE,
(void*)block->frame, (void*)block);
......@@ -467,6 +472,12 @@ buf_read_ahead_linear(
count = 0;
/* Since Windows XP seems to schedule the i/o handler thread
very eagerly, and consequently it does not wait for the
full read batch to be posted, we use special heuristics here */
os_aio_simulated_put_read_threads_to_sleep();
for (i = low; i < high; i++) {
/* It is only sensible to do read-ahead in the non-sync
aio mode: hence FALSE as the first parameter */
......@@ -556,16 +567,34 @@ buf_read_recv_pages(
highest page number the last in the array */
ulint n_stored) /* in: number of page numbers in the array */
{
ulint count;
ulint i;
for (i = 0; i < n_stored; i++) {
count = 0;
os_aio_print_debug = FALSE;
while (buf_pool->n_pend_reads >= RECV_POOL_N_FREE_BLOCKS / 2) {
os_aio_simulated_wake_handler_threads();
os_thread_sleep(500000);
count++;
if (count > 100) {
fprintf(stderr,
"InnoDB: Error: InnoDB has waited for 50 seconds for pending\n"
"InnoDB: reads to the buffer pool to be finished.\n"
"InnoDB: Number of pending reads %lu\n", buf_pool->n_pend_reads);
os_aio_print_debug = TRUE;
}
}
os_aio_print_debug = FALSE;
if ((i + 1 == n_stored) && sync) {
buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space,
page_nos[i]);
......
......@@ -64,6 +64,35 @@ dtuple_get_nth_field_noninline(
return(dtuple_get_nth_field(tuple, n));
}
/*************************************************************************
Compares the length and the bytes of a data field with a given
(len, data) pair. */

ibool
dfield_data_is_binary_equal(
/*========================*/
				/* out: TRUE if equal */
	dfield_t*	field,	/* in: field */
	ulint		len,	/* in: data length or UNIV_SQL_NULL */
	byte*		data)	/* in: data */
{
	ibool	is_equal;

	if (field->len != len) {
		/* Different lengths can never be equal */

		is_equal = FALSE;

	} else if (len == UNIV_SQL_NULL) {
		/* Lengths match and are UNIV_SQL_NULL: the data bytes
		are not looked at */

		is_equal = TRUE;
	} else {
		/* Same length: the bytes decide */

		is_equal = (0 == ut_memcmp(field->data, data, len))
							? TRUE : FALSE;
	}

	return(is_equal);
}
/****************************************************************
Returns TRUE if lengths of two dtuples are equal and respective data fields
in them are equal when compared with collation in char fields (not as binary
......@@ -153,6 +182,69 @@ dtuple_set_n_fields(
tuple->n_fields_cmp = n_fields;
}
/**************************************************************
Checks that the main type of a data field lies in the range
DATA_VARCHAR ... DATA_MYSQL. Prints the type and the length of the
field to stderr if it does not; never asserts. */
static
ibool
dfield_check_typed_no_assert(
/*=========================*/
				/* out: TRUE if ok */
	dfield_t*	field)	/* in: data field */
{
	ulint	mtype = dfield_get_type(field)->mtype;

	if (mtype >= DATA_VARCHAR && mtype <= DATA_MYSQL) {

		return(TRUE);
	}

	/* The type is out of range: report it and let the caller decide */

	fprintf(stderr,
"InnoDB: Error: data field type %lu, len %lu\n",
		mtype, dfield_get_len(field));

	return(FALSE);
}
/**************************************************************
Checks that a data tuple is typed: the number of fields must not exceed
REC_MAX_N_FIELDS and every field must pass dfield_check_typed_no_assert.
On failure the tuple contents are printed to stderr; nothing is
asserted. */

ibool
dtuple_check_typed_no_assert(
/*=========================*/
				/* out: TRUE if ok */
	dtuple_t*	tuple)	/* in: tuple */
{
	char	err_buf[1000];
	ulint	n_fields;
	ulint	i;

	n_fields = dtuple_get_n_fields(tuple);

	if (n_fields > REC_MAX_N_FIELDS) {
		fprintf(stderr,
"InnoDB: Error: index entry has %lu fields\n",
			n_fields);

		goto dump_tuple;
	}

	for (i = 0; i < n_fields; i++) {

		if (!dfield_check_typed_no_assert(
				dtuple_get_nth_field(tuple, i))) {

			/* The field check has already printed its own
			error line */

			goto dump_tuple;
		}
	}

	return(TRUE);

dump_tuple:
	/* Common failure exit: print the tuple, 900 is the length limit
	passed to dtuple_sprintf for the 1000 byte buffer */

	dtuple_sprintf(err_buf, 900, tuple);
	fprintf(stderr,
"InnoDB: Tuple contents: %s\n", err_buf);

	return(FALSE);
}
/**************************************************************
Checks that a data field is typed. Asserts an error if not. */
......@@ -162,8 +254,15 @@ dfield_check_typed(
/* out: TRUE if ok */
dfield_t* field) /* in: data field */
{
ut_a(dfield_get_type(field)->mtype <= DATA_MYSQL);
ut_a(dfield_get_type(field)->mtype >= DATA_VARCHAR);
if (dfield_get_type(field)->mtype > DATA_MYSQL
|| dfield_get_type(field)->mtype < DATA_VARCHAR) {
fprintf(stderr,
"InnoDB: Error: data field type %lu, len %lu\n",
dfield_get_type(field)->mtype, dfield_get_len(field));
ut_a(0);
}
return(TRUE);
}
......@@ -460,9 +559,21 @@ dtuple_convert_big_rec(
ibool is_externally_stored;
ulint i;
ulint j;
char err_buf[1000];
ut_a(dtuple_check_typed_no_assert(entry));
size = rec_get_converted_size(entry);
if (size > 1000000000) {
fprintf(stderr,
"InnoDB: Warning: tuple size very big: %lu\n", size);
dtuple_sprintf(err_buf, 900, entry);
fprintf(stderr,
"InnoDB: Tuple contents: %s\n", err_buf);
}
heap = mem_heap_create(size + dtuple_get_n_fields(entry)
* sizeof(big_rec_field_t) + 1000);
......
......@@ -153,6 +153,7 @@ dict_create_sys_tables_tuple(
if (table->type == DICT_TABLE_CLUSTER_MEMBER) {
dfield_set_data(dfield, table->cluster_name,
ut_strlen(table->cluster_name));
ut_a(0); /* Oracle-style clusters are not supported yet */
} else {
dfield_set_data(dfield, NULL, UNIV_SQL_NULL);
}
......
......@@ -2805,6 +2805,12 @@ dict_update_statistics_low(
index = dict_table_get_first_index(table);
if (index == NULL) {
/* Table definition is corrupt */
return;
}
while (index) {
size = btr_get_size(index, BTR_TOTAL_SIZE);
......@@ -3196,6 +3202,14 @@ dict_print_info_on_foreign_keys(
buf2 += sprintf(buf2, ")");
if (foreign->type == DICT_FOREIGN_ON_DELETE_CASCADE) {
buf2 += sprintf(buf2, " ON DELETE CASCADE");
}
if (foreign->type == DICT_FOREIGN_ON_DELETE_SET_NULL) {
buf2 += sprintf(buf2, " ON DELETE SET NULL");
}
foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
}
......
This diff is collapsed.
......@@ -65,6 +65,9 @@ dict_mem_table_create(
table->cached = FALSE;
table->mix_id = ut_dulint_zero;
table->mix_len = 0;
table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
* sizeof(dict_col_t));
UT_LIST_INIT(table->indexes);
......
......@@ -2608,6 +2608,7 @@ fseg_free_page_low(
ulint not_full_n_used;
ulint state;
ulint i;
char errbuf[200];
ut_ad(seg_inode && mtr);
ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) ==
......@@ -2621,8 +2622,25 @@ fseg_free_page_low(
descr = xdes_get_descriptor(space, page, mtr);
ut_a(descr);
ut_a(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
== FALSE);
if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
		!= FALSE) {
	/* The page to be freed already has its free bit set in the
	extent descriptor: dump the descriptor, print diagnostics and
	stop at ut_a(0) */

	ut_sprintf_buf(errbuf, descr, 40);
	fprintf(stderr,
"InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf);

	/* NOTE: the message below is ONE format string built from
	adjacent string literals. There must be no comma between the
	literals: a comma would end the format after the first line and
	pass the rest of the text as the %lu argument instead of page */

	fprintf(stderr,
"InnoDB: Serious error! InnoDB is trying to free page %lu\n"
"InnoDB: though it is already marked as free in the tablespace!\n"
"InnoDB: The tablespace free space info is corrupt.\n"
"InnoDB: You may need to dump your InnoDB tables and recreate the whole\n"
"InnoDB: database!\n", page);

	fprintf(stderr,
"InnoDB: If the InnoDB recovery crashes here, see section 6.1\n"
"InnoDB: of http://www.innodb.com/ibman.html about forcing recovery.\n");

	ut_a(0);
}
state = xdes_get_state(descr, mtr);
if (state != XDES_FSEG) {
......
......@@ -685,21 +685,21 @@ ibuf_bitmap_get_map_page(
/****************************************************************************
Sets the free bits of the page in the ibuf bitmap. This is done in a separate
mini-transaction, hence this operation does not restrict further work to only
ibuf bitmap operations, which would result if the latch to the bitmap pag
ibuf bitmap operations, which would result if the latch to the bitmap page
were kept. */
UNIV_INLINE
void
ibuf_set_free_bits_low(
/*===================*/
ulint type, /* in: index type */
page_t* page, /* in: index page; free bit is reset if the index is
a non-clustered non-unique, and page level is 0 */
page_t* page, /* in: index page; free bit is set if the index is
non-clustered and page level is 0 */
ulint val, /* in: value to set: < 4 */
mtr_t* mtr) /* in: mtr */
{
page_t* bitmap_page;
if (type & (DICT_CLUSTERED | DICT_UNIQUE)) {
if (type & DICT_CLUSTERED) {
return;
}
......@@ -733,8 +733,8 @@ void
ibuf_set_free_bits(
/*===============*/
ulint type, /* in: index type */
page_t* page, /* in: index page; free bit is reset if the index is
a non-clustered non-unique, and page level is 0 */
page_t* page, /* in: index page; free bit is set if the index is
non-clustered and page level is 0 */
ulint val, /* in: value to set: < 4 */
ulint max_val)/* in: ULINT_UNDEFINED or a maximum value which
the bits must have before setting; this is for
......@@ -743,7 +743,7 @@ ibuf_set_free_bits(
mtr_t mtr;
page_t* bitmap_page;
if (type & (DICT_CLUSTERED | DICT_UNIQUE)) {
if (type & DICT_CLUSTERED) {
return;
}
......@@ -2024,7 +2024,7 @@ ibuf_insert_low(
ulint n_stored;
ulint bits;
ut_a(!(index->type & (DICT_UNIQUE | DICT_CLUSTERED)));
ut_a(!(index->type & DICT_CLUSTERED));
ut_ad(dtuple_check_typed(entry));
do_merge = FALSE;
......@@ -2254,10 +2254,7 @@ ibuf_insert(
ut_ad(dtuple_check_typed(entry));
if (index->type & DICT_CLUSTERED || index->type & DICT_UNIQUE) {
return(FALSE);
}
ut_a(!(index->type & DICT_CLUSTERED));
if (rec_get_converted_size(entry)
>= page_get_free_space_of_empty() / 2) {
......@@ -2302,6 +2299,7 @@ ibuf_insert_to_index_page(
rec_t* rec;
page_t* bitmap_page;
ulint old_bits;
char errbuf[1000];
ut_ad(ibuf_inside());
ut_ad(dtuple_check_typed(entry));
......@@ -2324,11 +2322,24 @@ ibuf_insert_to_index_page(
/* This time the record must fit */
if (!page_cur_tuple_insert(&page_cur, entry, mtr)) {
printf(
"Ibuf insert fails; page free %lu, dtuple size %lu\n",
ut_print_timestamp(stderr);
fprintf(stderr,
"InnoDB: Error: Insert buffer insert fails; page free %lu, dtuple size %lu\n",
page_get_max_insert_size(page, 1),
rec_get_converted_size(entry));
dtuple_sprintf(errbuf, 900, entry);
fprintf(stderr,
"InnoDB: Cannot insert index record %s\n", errbuf);
/* Tell the user which follow-up action to take; the duplicated word
"where where" in the message text is corrected */
fprintf(stderr,
"InnoDB: The table where this index record belongs\n"
"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
"InnoDB: that table.\n");
bitmap_page = ibuf_bitmap_get_map_page(
buf_frame_get_space_id(page),
buf_frame_get_page_no(page),
......@@ -2339,9 +2350,11 @@ ibuf_insert_to_index_page(
buf_frame_get_page_no(page),
IBUF_BITMAP_FREE, mtr);
printf("Bitmap bits %lu\n", old_bits);
ut_error;
fprintf(stderr, "Bitmap bits %lu\n", old_bits);
fprintf(stderr,
"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n");
}
}
}
......
......@@ -204,16 +204,6 @@ btr_page_reorganize(
page_t* page, /* in: page to be reorganized */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
Reorganizes an index page. */
void
btr_page_reorganize_low(
/*====================*/
ibool low, /* in: TRUE if locks should not be updated, i.e.,
there cannot exist locks on the page */
page_t* page, /* in: page to be reorganized */
mtr_t* mtr); /* in: mtr */
/*****************************************************************
Decides if the page should be split at the convergence point of
inserts converging to left. */
......
......@@ -709,6 +709,7 @@ allowed to free an inherited external field. */
#define BTR_EXTERN_INHERITED_FLAG 64
extern ulint btr_cur_n_non_sea;
extern ulint btr_cur_n_sea;
#ifndef UNIV_NONINL
#include "btr0cur.ic"
......
......@@ -176,6 +176,7 @@ btr_search_validate(void);
/* The search info struct in an index */
struct btr_search_struct{
ulint magic_n; /* magic number */
/* The following 4 fields are currently not used: */
rec_t* last_search; /* pointer to the lower limit record of the
previous search; NULL if not known */
......@@ -220,6 +221,8 @@ struct btr_search_struct{
ulint n_searches; /* number of searches */
};
#define BTR_SEARCH_MAGIC_N 1112765
/* The hash index system */
typedef struct btr_search_sys_struct btr_search_sys_t;
......
......@@ -219,6 +219,16 @@ buf_page_create(
a page */
mtr_t* mtr); /* in: mini-transaction handle */
/************************************************************************
Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
void
buf_page_init_for_backup_restore(
/*=============================*/
ulint space, /* in: space id */
ulint offset, /* in: offset of the page within space
in units of a page */
buf_block_t* block); /* in: block to init */
/************************************************************************
Decrements the bufferfix count of a buffer control block and releases
a latch, if specified. */
UNIV_INLINE
......@@ -605,6 +615,7 @@ struct buf_block_struct{
/* 1. General fields */
ulint magic_n; /* magic number to check */
ulint state; /* state of the control block:
BUF_BLOCK_NOT_USED, ... */
byte* frame; /* pointer to buffer frame which
......@@ -729,6 +740,8 @@ struct buf_block_struct{
frees a page in buffer pool */
};
#define BUF_BLOCK_MAGIC_N 41526563
/* The buffer pool structure. NOTE! The definition appears here only for
other modules of this directory (buf) to see it. Do not use from outside! */
......
......@@ -89,7 +89,7 @@ buf_read_recv_pages(
/* The size in pages of the area which the read-ahead algorithms read if
invoked */
#define BUF_READ_AHEAD_AREA ut_min(32, buf_pool->curr_size / 16)
#define BUF_READ_AHEAD_AREA ut_min(64, ut_2_power_up(buf_pool->curr_size / 32))
/* Modes used in read-ahead */
#define BUF_READ_IBUF_PAGES_ONLY 131
......
......@@ -123,7 +123,7 @@ dfield_datas_are_binary_equal(
dfield_t* field2);/* in: field */
/*************************************************************************
Tests if dfield data length and content is equal to the given. */
UNIV_INLINE
ibool
dfield_data_is_binary_equal(
/*========================*/
......@@ -279,6 +279,14 @@ dtuple_check_typed(
/* out: TRUE if ok */
dtuple_t* tuple); /* in: tuple */
/**************************************************************
Checks that a data tuple is typed. */
ibool
dtuple_check_typed_no_assert(
/*=========================*/
/* out: TRUE if ok */
dtuple_t* tuple); /* in: tuple */
/**************************************************************
Validates the consistency of a tuple which must be complete, i.e,
all fields must have been set. */
......
......@@ -153,30 +153,6 @@ dfield_datas_are_binary_equal(
return(TRUE);
}
/*************************************************************************
Compares the length and the bytes of a data field with a given
(len, data) pair. */
UNIV_INLINE
ibool
dfield_data_is_binary_equal(
/*========================*/
				/* out: TRUE if equal */
	dfield_t*	field,	/* in: field */
	ulint		len,	/* in: data length or UNIV_SQL_NULL */
	byte*		data)	/* in: data */
{
	/* Equal when the lengths agree and either the length is
	UNIV_SQL_NULL (bytes are then not compared) or the bytes match */

	if (len == field->len
	    && (len == UNIV_SQL_NULL
		|| 0 == ut_memcmp(field->data, data, len))) {

		return(TRUE);
	}

	return(FALSE);
}
/*************************************************************************
Gets info bits in a data tuple. */
UNIV_INLINE
......
......@@ -157,6 +157,14 @@ log_io_complete(
/*============*/
log_group_t* group); /* in: log group */
/**********************************************************
Flushes the log files to the disk, using, for example, the Unix fsync.
This function does the flush even if the user has set
srv_flush_log_at_trx_commit = FALSE. */
void
log_flush_to_disk(void);
/*===================*/
/**********************************************************
This function is called, e.g., when a transaction wants to commit. It checks
that the log has been flushed to disk up to the last log entry written by the
transaction. If there is a flush running, it waits and checks if the flush
......@@ -260,7 +268,9 @@ log_reset_first_header_and_checkpoint(
/*==================================*/
byte* hdr_buf,/* in: buffer which will be written to the start
of the first log file */
dulint lsn); /* in: lsn of the start of the first log file */
dulint start); /* in: lsn of the start of the first log file;
we pretend that there is a checkpoint at
start + LOG_BLOCK_HDR_SIZE */
/************************************************************************
Starts an archiving operation. */
......@@ -463,6 +473,15 @@ log_block_init(
byte* log_block, /* in: pointer to the log buffer */
dulint lsn); /* in: lsn within the log block */
/****************************************************************
Initializes a log block in the log buffer in the old, < 3.23.52 format, where
there was no checksum yet. */
UNIV_INLINE
void
log_block_init_in_old_format(
/*=========================*/
byte* log_block, /* in: pointer to the log buffer */
dulint lsn); /* in: lsn within the log block */
/****************************************************************
Converts a lsn to a log block number. */
UNIV_INLINE
ulint
......@@ -523,7 +542,10 @@ extern log_t* log_sys;
bytes */
/* Offsets of a log block trailer from the end of the block */
#define LOG_BLOCK_TRL_NO 4 /* log block number */
#define LOG_BLOCK_TRL_CHECKSUM 4 /* 1 byte checksum of the log block
contents */
#define LOG_BLOCK_TRL_NO 3 /* 3 lowest bytes of the log block
number */
#define LOG_BLOCK_TRL_SIZE 4 /* trailer size in bytes */
/* Offsets for a checkpoint field */
......@@ -558,11 +580,22 @@ extern log_t* log_sys;
#define LOG_GROUP_ID 0 /* log group number */
#define LOG_FILE_START_LSN 4 /* lsn of the start of data in this
log file */
#define LOG_FILE_NO 12 /* 4-byte archived log file number */
#define LOG_FILE_NO 12 /* 4-byte archived log file number;
this field is only defined in an
archived log file */
#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
/* a 32-byte field which contains
the string 'ibbackup' and the
creation time if the log file was
created by ibbackup --restore;
when mysqld is started for the first time
on the restored database, it can
print helpful info for the user */
#define LOG_FILE_ARCH_COMPLETED OS_FILE_LOG_BLOCK_SIZE
/* this 4-byte field is TRUE when
the writing of an archived log file
has been completed */
has been completed; this field is
only defined in an archived log file */
#define LOG_FILE_END_LSN (OS_FILE_LOG_BLOCK_SIZE + 4)
/* lsn where the archived log file
at least extends: actually the
......@@ -572,7 +605,14 @@ extern log_t* log_sys;
is defined only when an archived log
file has been completely written */
#define LOG_CHECKPOINT_1 OS_FILE_LOG_BLOCK_SIZE
/* first checkpoint field in the log
header; we write alternately to the
checkpoint fields when we make new
checkpoints; this field is only defined
in the first log file of a log group */
#define LOG_CHECKPOINT_2 (3 * OS_FILE_LOG_BLOCK_SIZE)
/* second checkpoint field in the log
header */
#define LOG_FILE_HDR_SIZE (4 * OS_FILE_LOG_BLOCK_SIZE)
#define LOG_GROUP_OK 301
......@@ -678,7 +718,7 @@ struct log_struct{
write i/o has been completed for all
log groups */
dulint flush_lsn; /* end lsn for the current flush */
ulint flush_end_offset;/* the data in buffer ha been flushed
ulint flush_end_offset;/* the data in buffer has been flushed
up to this offset when the current
flush ends: this field will then
be copied to buf_next_to_write */
......
......@@ -179,7 +179,7 @@ log_block_get_trl_no(
trailer */
byte* log_block) /* in: log block */
{
return(mach_read_from_4(log_block + OS_FILE_LOG_BLOCK_SIZE
return(mach_read_from_3(log_block + OS_FILE_LOG_BLOCK_SIZE
- LOG_BLOCK_TRL_NO));
}
......@@ -192,8 +192,8 @@ log_block_set_trl_no(
byte* log_block, /* in: log block */
ulint n) /* in: log block number */
{
mach_write_to_4(log_block + OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_NO,
n);
mach_write_to_3(log_block + OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_NO,
n & 0xFFFFFF);
}
/****************************************************************
......@@ -237,6 +237,29 @@ log_block_init(
log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
log_block_set_first_rec_group(log_block, 0);
}
/****************************************************************
Initializes a log block in the log buffer in the old format, where there
was no checksum yet: the block number is written to the header and, as a
full 4-byte value, to the trailer starting one byte before the
LOG_BLOCK_TRL_NO field. The data length is set to LOG_BLOCK_HDR_SIZE and
the first record group offset to 0. */
UNIV_INLINE
void
log_block_init_in_old_format(
/*=========================*/
	byte*	log_block,	/* in: pointer to the log buffer */
	dulint	lsn)		/* in: lsn within the log block */
{
	ulint	block_no;
	byte*	old_trailer;

	/* The caller is expected to hold the log system mutex */
	ut_ad(mutex_own(&(log_sys->mutex)));

	block_no = log_block_convert_lsn_to_no(lsn);

	/* Start of the 4-byte trailer number: one byte before the place
	where the LOG_BLOCK_TRL_NO field starts */
	old_trailer = log_block + OS_FILE_LOG_BLOCK_SIZE
					- LOG_BLOCK_TRL_NO - 1;

	log_block_set_hdr_no(log_block, block_no);
	mach_write_to_4(old_trailer, block_no);

	/* A freshly initialized block contains only its header */
	log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
	log_block_set_first_rec_group(log_block, 0);
}
/****************************************************************
Writes to the log the string given. The log must be released with
......
......@@ -16,6 +16,7 @@ Created 10/21/1995 Heikki Tuuri
os_file_write */
extern ibool os_do_not_call_flush_at_each_write;
extern ibool os_has_said_disk_full;
extern ibool os_aio_print_debug;
#ifdef __WIN__
......@@ -33,6 +34,8 @@ extern ibool os_has_said_disk_full;
typedef int os_file_t;
#endif
extern ulint os_innodb_umask;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
......@@ -309,6 +312,15 @@ Wakes up simulated aio i/o-handler threads if they have something to do. */
void
os_aio_simulated_wake_handler_threads(void);
/*=======================================*/
/**************************************************************************
This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. You must
call os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping! */
void
os_aio_simulated_put_read_threads_to_sleep(void);
/*============================================*/
#ifdef WIN_ASYNC_IO
/**************************************************************************
......
......@@ -328,7 +328,7 @@ page_dir_calc_reserved_space(
ulint n_recs); /* in: number of records */
/*******************************************************************
Looks for the directory slot which owns the given record. */
UNIV_INLINE
ulint
page_dir_find_owner_slot(
/*=====================*/
......
......@@ -479,6 +479,8 @@ page_rec_get_next(
offs = rec_get_next_offs(rec);
ut_a(offs < UNIV_PAGE_SIZE);
if (offs == 0) {
return(NULL);
......@@ -487,40 +489,6 @@ page_rec_get_next(
return(page + offs);
}
/*******************************************************************
Looks for the directory slot which owns the given record. The function
first walks forward along the record list until it meets a record whose
n_owned field is nonzero, and then scans the page directory from the last
slot downwards until it finds the slot pointing to that record. */
UNIV_INLINE
ulint
page_dir_find_owner_slot(
/*=====================*/
				/* out: the directory slot number */
	rec_t*	rec)		/* in: the physical record */
{
	ulint	slot_no;
	page_t*	frame;

	ut_ad(page_rec_check(rec));

	/* Advance to the record which has a nonzero owned count */
	for (; rec_get_n_owned(rec) == 0; rec = page_rec_get_next(rec)) {
		/* no body */
	}

	frame = buf_frame_align(rec);

	/* Search the directory backwards, starting from the last slot;
	running below slot 0 without a match is a fatal error */
	for (slot_no = page_dir_get_n_slots(frame) - 1;
	     page_dir_slot_get_rec(page_dir_get_nth_slot(frame, slot_no))
								!= rec;
	     slot_no--) {

		ut_a(slot_no > 0);
	}

	return(slot_no);
}
/****************************************************************
Sets the pointer to the next record on the page. */
UNIV_INLINE
......@@ -534,7 +502,7 @@ page_rec_set_next(
page_t* page;
ut_ad(page_rec_check(rec));
ut_ad((next == NULL)
ut_a((next == NULL)
|| (buf_frame_align(rec) == buf_frame_align(next)));
page = buf_frame_align(rec);
......@@ -573,7 +541,7 @@ page_rec_get_prev(
slot_no = page_dir_find_owner_slot(rec);
ut_ad(slot_no != 0);
ut_a(slot_no != 0);
slot = page_dir_get_nth_slot(page, slot_no - 1);
......@@ -584,7 +552,7 @@ page_rec_get_prev(
rec2 = page_rec_get_next(rec2);
}
ut_ad(prev_rec);
ut_a(prev_rec);
return(prev_rec);
}
......
......@@ -230,6 +230,19 @@ row_update_cascade_for_mysql(
or set null operation */
dict_table_t* table); /* in: table where we do the operation */
/*************************************************************************
Locks the data dictionary exclusively for performing a table create
operation. */
void
row_mysql_lock_data_dictionary(void);
/*================================*/
/*************************************************************************
Unlocks the data dictionary exclusive lock. */
void
row_mysql_unlock_data_dictionary(void);
/*==================================*/
/*************************************************************************
Does a table creation operation for MySQL. If the name of the created
table ends in the characters INNODB_MONITOR, then this also starts
printing of monitor output by the master thread. */
......
......@@ -102,11 +102,13 @@ trx_rollback(
calling function can start running
a new query thread */
/***********************************************************************
Rollback uncommitted transactions which have no user session. */
Rollback or clean up transactions which have no user session. If the
transaction already was committed, then we clean up a possible insert
undo log. If the transaction was not yet committed, then we roll it back. */
void
trx_rollback_all_without_sess(void);
/*===============================*/
trx_rollback_or_clean_all_without_sess(void);
/*========================================*/
/********************************************************************
Finishes a transaction rollback. */
......
......@@ -24,6 +24,14 @@ Created 3/26/1996 Heikki Tuuri
#include "fsp0fsp.h"
#include "read0types.h"
/* In a MySQL replication slave, in crash recovery we store the master log
file name and position here. We have successfully got the updates to InnoDB
up to this position. If .._pos is -1, it means no crash recovery was needed,
or there was no master log position info inside InnoDB. */
extern char trx_sys_mysql_master_log_name[];
extern ib_longlong trx_sys_mysql_master_log_pos;
/* The transaction system */
extern trx_sys_t* trx_sys;
......@@ -229,13 +237,18 @@ trx_in_trx_list(
trx_t* in_trx);/* in: trx */
/*********************************************************************
Updates the offset information about the end of the MySQL binlog entry
which corresponds to the transaction just being committed. */
which corresponds to the transaction just being committed. In a MySQL
replication slave updates the latest master binlog position up to which
replication has proceeded. */
void
trx_sys_update_mysql_binlog_offset(
/*===============================*/
trx_t* trx, /* in: transaction being committed */
mtr_t* mtr); /* in: mtr */
char* file_name,/* in: MySQL log file name */
ib_longlong offset, /* in: position in that log file */
ulint field, /* in: offset of the MySQL log info field in
the trx sys header */
mtr_t* mtr); /* in: mtr */
/*********************************************************************
Prints to stderr the MySQL binlog offset info in the trx system header if
the magic number shows it valid. */
......@@ -243,15 +256,17 @@ the magic number shows it valid. */
void
trx_sys_print_mysql_binlog_offset(void);
/*===================================*/
/*********************************************************************
Prints to stderr the MySQL master log offset info in the trx system header if
the magic number shows it valid. */
void
trx_sys_print_mysql_master_log_pos(void);
/*====================================*/
/* The automatically created system rollback segment has this id */
#define TRX_SYS_SYSTEM_RSEG_ID 0
/* Max number of rollback segments: the number of segment specification slots
in the transaction system array; rollback segment id must fit in one byte,
therefore 256 */
#define TRX_SYS_N_RSEGS 256
/* Space id and page no where the trx system file copy resides */
#define TRX_SYS_SPACE 0 /* the SYSTEM tablespace */
#define TRX_SYS_PAGE_NO FSP_TRX_SYS_PAGE_NO
......@@ -277,22 +292,29 @@ therefore 256 */
segment specification slots */
/*-------------------------------------------------------------*/
#define TRX_SYS_MYSQL_LOG_NAME_LEN 32
/* Max number of rollback segments: the number of segment specification slots
in the transaction system array; rollback segment id must fit in one byte,
therefore 256; each slot is currently 8 bytes in size */
#define TRX_SYS_N_RSEGS 256
#define TRX_SYS_MYSQL_LOG_NAME_LEN 512
#define TRX_SYS_MYSQL_LOG_MAGIC_N 873422344
/* The offset of the MySQL replication info on the trx system header page;
this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
#define TRX_SYS_MYSQL_MASTER_LOG_INFO (UNIV_PAGE_SIZE - 2000)
/* The offset of the MySQL binlog offset info on the trx system header page */
#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 300)
#define TRX_SYS_MYSQL_LOG_INFO (UNIV_PAGE_SIZE - 1000)
#define TRX_SYS_MYSQL_LOG_MAGIC_N_FLD 0 /* magic number which shows
if we have valid data in the
MySQL binlog info; the value
is ..._MAGIC_N if yes */
#define TRX_SYS_MYSQL_LOG_NAME 4 /* MySQL log file name */
#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH (4 + TRX_SYS_MYSQL_LOG_NAME_LEN)
/* high 4 bytes of the offset
#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH 4 /* high 4 bytes of the offset
within that file */
#define TRX_SYS_MYSQL_LOG_OFFSET_LOW (8 + TRX_SYS_MYSQL_LOG_NAME_LEN)
/* low 4 bytes of the offset
#define TRX_SYS_MYSQL_LOG_OFFSET_LOW 8 /* low 4 bytes of the offset
within that file */
#define TRX_SYS_MYSQL_LOG_NAME 12 /* MySQL log file name */
/* The offset of the doublewrite buffer header on the trx system header page */
#define TRX_SYS_DOUBLEWRITE (UNIV_PAGE_SIZE - 200)
......
......@@ -124,6 +124,15 @@ void
trx_commit_off_kernel(
/*==================*/
trx_t* trx); /* in: transaction */
/********************************************************************
Cleans up a transaction at database startup. The cleanup is needed if
the transaction already got to the middle of a commit when the database
crashed, and we cannot roll it back. */
void
trx_cleanup_at_db_startup(
/*======================*/
trx_t* trx); /* in: transaction */
/**************************************************************************
Does the transaction commit for MySQL. */
......@@ -322,13 +331,24 @@ struct trx_struct{
void* mysql_thd; /* MySQL thread handle corresponding
to this trx, or NULL */
char* mysql_log_file_name;
/* If MySQL binlog is used, this field
/* if MySQL binlog is used, this field
contains a pointer to the latest file
name; this is NULL if binlog is not
used */
ib_longlong mysql_log_offset;/* If MySQL binlog is used, this field
ib_longlong mysql_log_offset;/* if MySQL binlog is used, this field
contains the end offset of the binlog
entry */
char* mysql_master_log_file_name;
/* if the database server is a MySQL
replication slave, we have here the
master binlog name up to which
replication has proceeded; otherwise
this is a pointer to a null character */
ib_longlong mysql_master_log_pos;
/* if the database server is a MySQL
replication slave, this is the
position in the log file up to which
replication has proceeded */
os_thread_id_t mysql_thread_id;/* id of the MySQL thread associated
with this transaction object */
/*------------------------------*/
......
......@@ -9,40 +9,26 @@ Created 1/20/1994 Heikki Tuuri
#ifndef univ_i
#define univ_i
#if (defined(_WIN32) || defined(_WIN64)) && !defined(MYSQL_SERVER)
#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER)
#define __WIN__
#include <windows.h>
/* When compiling for Itanium IA64, undefine the flag below to prevent use
of 32-bit assembler */
#ifndef WIN64
#if !defined(WIN64) && !defined(_WIN64)
#define UNIV_CAN_USE_X86_ASSEMBLER
#endif
/* If you want to check for errors with compiler level -W4,
comment out the above include of windows.h and let the following defines
be defined:
#define HANDLE void*
#define CRITICAL_SECTION ulint
*/
#ifdef _NT_
#define __NT__
#endif
#else
/* The Unix version */
/* Most C compilers other than gcc do not know 'extern inline' */
#if !defined(__GNUC__) && !defined(__WIN__)
#define UNIV_MUST_NOT_INLINE
#endif
/* The defines used with MySQL */
/* Include two header files from MySQL to make the Unix flavor used
in compiling more Posix-compatible. We assume that 'innobase' is a
subdirectory of 'mysql'. */
in compiling more Posix-compatible. These headers also define __WIN__
if we are compiling on Windows. */
#include <global.h>
#include <my_pthread.h>
......@@ -59,6 +45,20 @@ subdirectory of 'mysql'. */
#include <sched.h>
#endif
/* When compiling for Itanium IA64, undefine the flag below to prevent use
of the 32-bit x86 assembler in mutex operations. */
#if defined(__WIN__) && !defined(WIN64) && !defined(_WIN64)
#define UNIV_CAN_USE_X86_ASSEMBLER
#endif
/* We only try to do explicit inlining of functions with gcc and
Microsoft Visual C++ */
#if !defined(__GNUC__) && !defined(__WIN__)
#define UNIV_MUST_NOT_INLINE
#endif
#ifdef HAVE_PREAD
#define HAVE_PWRITE
#endif
......
......@@ -114,7 +114,7 @@ ut_2_exp(
ulint n); /* in: number */
/*****************************************************************
Calculates fast the number rounded up to the nearest power of 2. */
UNIV_INLINE
ulint
ut_2_power_up(
/*==========*/
......@@ -155,6 +155,13 @@ ut_print_timestamp(
/*===============*/
FILE* file); /* in: file where to print */
/**************************************************************
Sprintfs a timestamp to a buffer. */
void
ut_sprintf_timestamp(
/*=================*/
char* buf); /* in: buffer where to sprintf */
/**************************************************************
Returns current year, month, day. */
void
......
......@@ -172,25 +172,3 @@ ut_2_exp(
{
return(1 << n);
}
/*****************************************************************
Calculates fast the number rounded up to the nearest power of 2: starts
from 1 and keeps doubling until the value is no longer smaller than n.
NOTE(review): assuming ulint is an unsigned type, an n greater than the
largest power of 2 representable in ulint would make the doubling wrap
around to 0 and the loop would not terminate; confirm that callers never
pass such a value. */
UNIV_INLINE
ulint
ut_2_power_up(
/*==========*/
			/* out: first power of 2 which is >= n */
	ulint	n)	/* in: number != 0 */
{
	ulint	pow2;

	ut_ad(n > 0);

	for (pow2 = 1; pow2 < n; pow2 = pow2 * 2) {
		/* keep doubling */
	}

	return(pow2);
}
......@@ -1541,6 +1541,15 @@ lock_rec_enqueue_waiting(
trx = thr_get_trx(thr);
if (trx->dict_operation) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: a record lock wait happens in a dictionary operation!\n"
"InnoDB: Table name %s. Send a bug report to mysql@lists.mysql.com\n",
index->table_name);
}
/* Enqueue the lock request that will wait to be granted */
lock = lock_rec_create(type_mode | LOCK_WAIT, rec, index, trx);
......@@ -2914,7 +2923,7 @@ lock_table_enqueue_waiting(
trx_t* trx;
ut_ad(mutex_own(&kernel_mutex));
/* Test if there already is some other reason to suspend thread:
we do not enqueue a lock request if the query thread should be
stopped anyway */
......@@ -2926,6 +2935,15 @@ lock_table_enqueue_waiting(
}
trx = thr_get_trx(thr);
if (trx->dict_operation) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: a table lock wait happens in a dictionary operation!\n"
"InnoDB: Table name %s. Send a bug report to mysql@lists.mysql.com\n",
table->name);
}
/* Enqueue the lock request that will wait to be granted */
......
This diff is collapsed.
......@@ -568,6 +568,55 @@ recv_read_cp_info_for_backup(
return(TRUE);
}
/**********************************************************
Compares the 1-byte checksum stored in the trailer checksum field of a log
block to the checksum calculated from the block contents. We also accept a
log block in the old format where the checksum field contained the highest
byte of the log block number. */
static
ibool
log_block_checksum_is_ok_or_old_format(
/*===================================*/
			/* out: TRUE if ok, or if the log block may be in the
			format of InnoDB version < 3.23.52 */
	byte*	block)	/* in: pointer to a log block */
{
	ulint	pos;
	ulint	computed;
	ulint	stored;

	/* The checksum is 1 plus the sum of all the bytes which precede
	the block trailer; only its lowest byte is compared below */
	computed = 1;
	pos = 0;

	while (pos < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
		computed += (ulint)(*(block + pos));
		pos++;
	}

	/* The byte stored in the checksum field of the trailer */
	stored = mach_read_from_1(block + OS_FILE_LOG_BLOCK_SIZE
					- LOG_BLOCK_TRL_CHECKSUM);

/*	printf("Checksum %lu, byte %lu\n", 0xFF & computed, stored);
*/
	if (stored == (0xFF & computed)) {

		return(TRUE);
	}

	if (stored == ((0xFF000000 & log_block_get_hdr_no(block)) >> 24)) {

		/* We assume the log block is in the format of
		InnoDB version < 3.23.52 and the block is ok */
/*
		fprintf(stderr,
"InnoDB: Scanned old format < InnoDB-3.23.52 log block number %lu\n",
			log_block_get_hdr_no(block));
*/
		return(TRUE);
	}

	return(FALSE);
}
/***********************************************************************
Scans the log segment and n_bytes_scanned is set to the length of valid
log scanned. */
......@@ -598,12 +647,13 @@ recv_scan_log_seg_for_backup(
no = log_block_get_hdr_no(log_block);
/* fprintf(stderr, "Log block header no %lu\n", no); */
/* fprintf(stderr, "Log block header no %lu\n", no); */
if (no != log_block_get_trl_no(log_block)
|| no != log_block_convert_lsn_to_no(*scanned_lsn)) {
/* printf(
if ((no & 0xFFFFFF) != log_block_get_trl_no(log_block)
|| no != log_block_convert_lsn_to_no(*scanned_lsn)
|| !log_block_checksum_is_ok_or_old_format(log_block)) {
/*
printf(
"Log block n:o %lu, trailer n:o %lu, scanned lsn n:o %lu\n",
no, log_block_get_trl_no(log_block),
log_block_convert_lsn_to_no(*scanned_lsn));
......@@ -611,8 +661,8 @@ recv_scan_log_seg_for_backup(
/* Garbage or an incompletely written log block */
log_block += OS_FILE_LOG_BLOCK_SIZE;
/* printf(
/*
printf(
"Next log block n:o %lu, trailer n:o %lu\n",
log_block_get_hdr_no(log_block),
log_block_get_trl_no(log_block));
......@@ -629,11 +679,11 @@ recv_scan_log_seg_for_backup(
/* Garbage from a log buffer flush which was made
before the most recent database recovery */
/*
printf("Scanned cp n:o %lu, block cp n:o %lu\n",
*scanned_checkpoint_no,
log_block_get_checkpoint_no(log_block));
*/
break;
}
......@@ -1011,7 +1061,7 @@ recv_recover_page(
page_lsn = page_newest_lsn;
}
} else {
/* In recovery from a backup we do not use the buffer
/* In recovery from a backup we do not really use the buffer
pool */
page_newest_lsn = ut_dulint_zero;
......@@ -1361,6 +1411,14 @@ recv_apply_log_recs_for_backup(
nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT),
UNIV_PAGE_SIZE);
/* We simulate a page read made by the buffer pool,
to make sure recovery works ok. We must init the
block corresponding to buf_pool->frame_zero
(== page) */
buf_page_init_for_backup_restore(0, i,
buf_block_align(page));
recv_recover_page(TRUE, FALSE, page, 0, i);
buf_flush_init_for_writing(page,
......@@ -2037,8 +2095,33 @@ recv_scan_log_recs(
/* fprintf(stderr, "Log block header no %lu\n", no); */
if (no != log_block_get_trl_no(log_block)
|| no != log_block_convert_lsn_to_no(scanned_lsn)) {
if ((no & 0xFFFFFF) != log_block_get_trl_no(log_block)
|| no != log_block_convert_lsn_to_no(scanned_lsn)
|| !log_block_checksum_is_ok_or_old_format(log_block)) {
if ((no & 0xFFFFFF) == log_block_get_trl_no(log_block)
&& no == log_block_convert_lsn_to_no(scanned_lsn)
&& !log_block_checksum_is_ok_or_old_format(
log_block)) {
fprintf(stderr,
"InnoDB: Log block no %lu at lsn %lu %lu has\n"
"InnoDB: ok header and trailer, but checksum field contains %lu\n",
no, ut_dulint_get_high(scanned_lsn),
ut_dulint_get_low(scanned_lsn),
mach_read_from_1(log_block
+ OS_FILE_LOG_BLOCK_SIZE
- LOG_BLOCK_TRL_CHECKSUM));
}
if ((no & 0xFFFFFF)
!= log_block_get_trl_no(log_block)) {
fprintf(stderr,
"InnoDB: Log block with header no %lu at lsn %lu %lu has\n"
"InnoDB: trailer no %lu\n",
no, ut_dulint_get_high(scanned_lsn),
ut_dulint_get_low(scanned_lsn),
log_block_get_trl_no(log_block));
}
/* Garbage or an incompletely written log block */
......@@ -2241,6 +2324,7 @@ recv_recovery_from_checkpoint_start(
dulint archived_lsn;
ulint capacity;
byte* buf;
byte log_hdr_buf[LOG_FILE_HDR_SIZE];
ulint err;
ut_ad((type != LOG_CHECKPOINT)
......@@ -2288,6 +2372,33 @@ recv_recovery_from_checkpoint_start(
checkpoint_no = mach_read_from_8(buf + LOG_CHECKPOINT_NO);
archived_lsn = mach_read_from_8(buf + LOG_CHECKPOINT_ARCHIVED_LSN);
/* Read the first log file header to print a note if this is
a recovery from a restored InnoDB Hot Backup */
fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id,
0, 0, LOG_FILE_HDR_SIZE,
log_hdr_buf, max_cp_group);
if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
"ibbackup", ut_strlen("ibbackup"))) {
/* This log file was created by ibbackup --restore: print
a note to the user about it */
fprintf(stderr,
"InnoDB: The log file was created by ibbackup --restore at\n"
"InnoDB: %s\n", log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
/* Wipe over the label now */
ut_memcpy(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
" ", 4);
/* Write to the log file to wipe over the label */
fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE,
max_cp_group->space_id,
0, 0, OS_FILE_LOG_BLOCK_SIZE,
log_hdr_buf, max_cp_group);
}
group = UT_LIST_GET_FIRST(log_sys->log_groups);
while (group) {
......@@ -2471,7 +2582,7 @@ recv_recovery_from_checkpoint_finish(void)
/* Rollback the uncommitted transactions which have no user session */
if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
trx_rollback_all_without_sess();
trx_rollback_or_clean_all_without_sess();
}
/* Apply the hashed log records to the respective file pages */
......@@ -2487,6 +2598,7 @@ recv_recovery_from_checkpoint_finish(void)
}
if (recv_needed_recovery) {
trx_sys_print_mysql_master_log_pos();
trx_sys_print_mysql_binlog_offset();
}
......@@ -2614,10 +2726,9 @@ recv_reset_log_files_for_backup(
/* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */
log_reset_first_header_and_checkpoint(buf,
ut_dulint_add(lsn, LOG_BLOCK_HDR_SIZE));
log_reset_first_header_and_checkpoint(buf, lsn);
log_block_init(buf + LOG_FILE_HDR_SIZE, lsn);
log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
LOG_BLOCK_HDR_SIZE);
sprintf(name, "%sib_logfile%lu", log_dir, 0);
......@@ -2754,7 +2865,7 @@ log_group_recover_from_archive_file(
if (ut_dulint_cmp(recv_sys->parse_start_lsn, start_lsn) < 0) {
fprintf(stderr,
"InnoDB: Archive log file %s starts from too big a lsn\n",
name);
name);
return(TRUE);
}
......@@ -2765,7 +2876,7 @@ log_group_recover_from_archive_file(
fprintf(stderr,
"InnoDB: Archive log file %s starts from a wrong lsn\n",
name);
name);
return(TRUE);
}
......
......@@ -290,7 +290,7 @@ mlog_write_string(
ut_a(0);
}
ut_ad(ptr && mtr);
ut_ad(len < UNIV_PAGE_SIZE);
ut_a(len < UNIV_PAGE_SIZE);
ut_memcpy(ptr, str, len);
......@@ -338,9 +338,13 @@ mlog_parse_string(
offset = mach_read_from_2(ptr);
ptr += 2;
ut_a(offset < UNIV_PAGE_SIZE);
len = mach_read_from_2(ptr);
ptr += 2;
ut_a(len + offset < UNIV_PAGE_SIZE);
if (end_ptr < ptr + len) {
return(NULL);
......
......@@ -315,7 +315,7 @@ mtr_log_reserve_and_write(
}
data_size = dyn_array_get_data_size(mlog);
/* Open the database log for log_write_low */
mtr->start_lsn = log_reserve_and_open(data_size);
......
......@@ -22,6 +22,16 @@ Created 10/21/1995 Heikki Tuuri
#endif
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */
#ifndef __WIN__
ulint os_innodb_umask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
#else
ulint os_innodb_umask = 0;
#endif
/* If the following is set to TRUE, we do not call os_file_flush in every
os_file_write. We can set this TRUE if the doublewrite buffer is used. */
ibool os_do_not_call_flush_at_each_write = FALSE;
......@@ -32,7 +42,7 @@ OS does not provide an atomic pread or pwrite, or similar */
os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 32
#define OS_AIO_MERGE_N_CONSECUTIVE 64
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
......@@ -40,6 +50,8 @@ use simulated aio we build below with threads */
ibool os_aio_use_native_aio = FALSE;
ibool os_aio_print_debug = FALSE;
/* The aio array slot structure */
typedef struct os_aio_slot_struct os_aio_slot_t;
......@@ -115,7 +127,12 @@ os_aio_array_t* os_aio_sync_array = NULL;
ulint os_aio_n_segments = ULINT_UNDEFINED;
/* If the following is TRUE, read i/o handler threads try to
wait until a batch of new read requests have been posted */
ibool os_aio_recommend_sleep_for_read_threads = FALSE;
ulint os_n_file_reads = 0;
ulint os_bytes_read_since_printout = 0;
ulint os_n_file_writes = 0;
ulint os_n_fsyncs = 0;
ulint os_n_file_reads_old = 0;
......@@ -412,8 +429,8 @@ os_file_create_simple(
}
if (create_mode == OS_FILE_CREATE) {
file = open(name, create_flag, S_IRUSR | S_IWUSR | S_IRGRP
| S_IWGRP | S_IROTH | S_IWOTH);
file = open(name, create_flag, S_IRUSR | S_IWUSR
| S_IRGRP | S_IWGRP);
} else {
file = open(name, create_flag);
}
......@@ -548,8 +565,7 @@ os_file_create(
}
#endif
if (create_mode == OS_FILE_CREATE) {
file = open(name, create_flag, S_IRUSR | S_IWUSR | S_IRGRP
| S_IWGRP | S_IROTH | S_IWOTH);
file = open(name, create_flag, os_innodb_umask);
} else {
file = open(name, create_flag);
}
......@@ -735,6 +751,8 @@ os_file_flush(
ut_a(file);
os_n_fsyncs++;
ret = FlushFileBuffers(file);
if (ret) {
......@@ -957,6 +975,7 @@ os_file_read(
ut_a((offset & 0xFFFFFFFF) == offset);
os_n_file_reads++;
os_bytes_read_since_printout += n;
try_again:
ut_ad(file);
......@@ -1626,13 +1645,40 @@ os_aio_simulated_wake_handler_threads(void)
/* We do not use simulated aio: do nothing */
return;
}
}
os_aio_recommend_sleep_for_read_threads = FALSE;
for (i = 0; i < os_aio_n_segments; i++) {
os_aio_simulated_wake_handler_thread(i);
}
}
/**************************************************************************
This function can be called if one wants to post a batch of reads and
prefers an i/o-handler thread to handle them all at once later. It sets
os_aio_recommend_sleep_for_read_threads to TRUE and resets the wait event
of every i/o segment which belongs to the read array. You must call
os_aio_simulated_wake_handler_threads later to ensure the threads
are not left sleeping! */

void
os_aio_simulated_put_read_threads_to_sleep(void)
/*============================================*/
{
	os_aio_array_t*	seg_array;
	ulint		seg;

	os_aio_recommend_sleep_for_read_threads = TRUE;

	for (seg = 0; seg < os_aio_n_segments; seg++) {

		os_aio_get_array_and_local_segment(&seg_array, seg);

		if (seg_array != os_aio_read_array) {
			/* Only the segments of the read array are
			touched */
			continue;
		}

		os_event_reset(os_aio_segment_wait_events[seg]);
	}
}
/***********************************************************************
Requests an asynchronous i/o operation. */
......@@ -2042,15 +2088,10 @@ os_aio_simulated_handle(
ibool ret;
ulint n;
ulint i;
segment = os_aio_get_array_and_local_segment(&array, global_segment);
restart:
/* Give other threads chance to add several i/os to the array
at once */
os_thread_yield();
/* NOTE! We only access constant fields in os_aio_array. Therefore
we do not have to acquire the protecting mutex yet */
......@@ -2061,6 +2102,15 @@ os_aio_simulated_handle(
/* Look through n slots after the segment * n'th slot */
if (array == os_aio_read_array
&& os_aio_recommend_sleep_for_read_threads) {
/* Give other threads chance to add several i/os to the array
at once. */
goto recommended_sleep;
}
os_mutex_enter(array->mutex);
/* Check if there is a slot for which the i/o has already been
......@@ -2071,6 +2121,11 @@ os_aio_simulated_handle(
if (slot->reserved && slot->io_already_done) {
if (os_aio_print_debug) {
fprintf(stderr,
"InnoDB: i/o for slot %lu already done, returning\n", i);
}
ret = TRUE;
goto slot_io_done;
......@@ -2177,6 +2232,13 @@ os_aio_simulated_handle(
srv_io_thread_op_info[global_segment] = (char*) "doing file i/o";
if (os_aio_print_debug) {
fprintf(stderr,
"InnoDB: doing i/o of type %lu at offset %lu %lu, length %lu\n",
slot->type, slot->offset_high, slot->offset,
total_len);
}
/* Do the i/o with ordinary, synchronous i/o functions: */
if (slot->type == OS_FILE_WRITE) {
ret = os_file_write(slot->name, slot->file, combined_buf,
......@@ -2244,10 +2306,18 @@ os_aio_simulated_handle(
os_mutex_exit(array->mutex);
srv_io_thread_op_info[global_segment] = (char*) "waiting for i/o request";
recommended_sleep:
srv_io_thread_op_info[global_segment] =
(char*)"waiting for i/o request";
os_event_wait(os_aio_segment_wait_events[global_segment]);
if (os_aio_print_debug) {
fprintf(stderr,
"InnoDB: i/o handler thread for i/o segment %lu wakes up\n",
global_segment);
}
goto restart;
}
......@@ -2316,6 +2386,7 @@ os_aio_print(void)
ulint n_reserved;
time_t current_time;
double time_elapsed;
double avg_bytes_read;
ulint i;
for (i = 0; i < srv_n_file_io_threads; i++) {
......@@ -2392,9 +2463,19 @@ os_aio_print(void)
fil_n_pending_log_flushes, fil_n_pending_tablespace_flushes);
printf("%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
os_n_file_reads, os_n_file_writes, os_n_fsyncs);
printf("%.2f reads/s, %.2f writes/s, %.2f fsyncs/s\n",
if (os_n_file_reads == os_n_file_reads_old) {
avg_bytes_read = 0.0;
} else {
avg_bytes_read = os_bytes_read_since_printout /
(os_n_file_reads - os_n_file_reads_old);
}
printf(
"%.2f reads/s, %lu avg bytes/read, %.2f writes/s, %.2f fsyncs/s\n",
(os_n_file_reads - os_n_file_reads_old)
/ time_elapsed,
(ulint)avg_bytes_read,
(os_n_file_writes - os_n_file_writes_old)
/ time_elapsed,
(os_n_fsyncs - os_n_fsyncs_old)
......@@ -2403,6 +2484,7 @@ os_aio_print(void)
os_n_file_reads_old = os_n_file_reads;
os_n_file_writes_old = os_n_file_writes;
os_n_fsyncs_old = os_n_fsyncs;
os_bytes_read_since_printout = 0;
os_last_printout = current_time;
}
......
......@@ -403,6 +403,8 @@ page_cur_insert_rec_write_log(
byte* log_ptr;
ulint i;
ut_a(rec_size < UNIV_PAGE_SIZE);
log_ptr = mlog_open(mtr, 30 + MLOG_BUF_MARGIN);
if (log_ptr == NULL) {
......@@ -491,6 +493,8 @@ page_cur_insert_rec_write_log(
mlog_close(mtr, log_ptr);
ut_a(rec_size - i < UNIV_PAGE_SIZE);
if (rec_size - i >= MLOG_BUF_MARGIN) {
mlog_catenate_string(mtr, ins_ptr, rec_size - i);
}
......@@ -602,6 +606,9 @@ page_cur_parse_insert_rec(
/* Build the inserted record to buf */
ut_a(mismatch_index < UNIV_PAGE_SIZE);
ut_a(end_seg_len < UNIV_PAGE_SIZE);
ut_memcpy(buf, rec_get_start(cursor_rec), mismatch_index);
ut_memcpy(buf + mismatch_index, ptr, end_seg_len);
......@@ -937,6 +944,8 @@ page_copy_rec_list_end_to_created_page(
log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len;
ut_a(log_data_len < 100 * UNIV_PAGE_SIZE);
mach_write_to_4(log_ptr, log_data_len);
rec_set_next_offs(insert_rec, PAGE_SUPREMUM);
......
......@@ -17,6 +17,7 @@ Created 2/2/1994 Heikki Tuuri
#include "lock0lock.h"
#include "fut0lst.h"
#include "btr0sea.h"
#include "buf0buf.h"
/* A cached template page used in page_create */
page_t* page_template = NULL;
......@@ -63,6 +64,65 @@ Assuming a page size of 8 kB, a typical index page of a secondary
index contains 300 index entries, and the size of the page directory
is 50 x 4 bytes = 200 bytes. */
/*******************************************************************
Finds the page directory slot which owns a record. The record list is
followed from the given record until a record with a nonzero n_owned
field is reached; the directory is then scanned from its last slot
towards slot 0 for the slot that points to that record. If no slot
matches, diagnostics are printed to stderr and ut_a(0) is invoked. */

ulint
page_dir_find_owner_slot(
/*=====================*/
				/* out: the directory slot number */
	rec_t*	rec)		/* in: the physical record */
{
	rec_t*			start_rec = rec;
	rec_t*			owner;
	page_t*			page;
	page_dir_slot_t*	slot;
	ulint			n_steps = 0;
	ulint			slot_no;
	char			err_buf[1000];

	ut_ad(page_rec_check(rec));

	/* Walk forward to the record that carries the ownership count */

	for (owner = rec;
	     rec_get_n_owned(owner) == 0;
	     owner = page_rec_get_next(owner)) {

		n_steps++;
	}

	page = buf_frame_align(owner);

	/* Scan the directory backwards, starting from the last slot */

	slot_no = page_dir_get_n_slots(page) - 1;

	for (;;) {
		slot = page_dir_get_nth_slot(page, slot_no);

		if (page_dir_slot_get_rec(slot) == owner) {

			break;
		}

		if (slot_no == 0) {
			/* Slot 0 was the last candidate and it did not
			match either: report and assert */

			fprintf(stderr,
			"InnoDB: Probable data corruption on page %lu\n",
				buf_frame_get_page_no(page));

			rec_sprintf(err_buf, 900, start_rec);

			fprintf(stderr,
			"InnoDB: Original record %s\n"
			"InnoDB: on that page. Steps %lu.\n", err_buf, n_steps);

			rec_sprintf(err_buf, 900, owner);

			fprintf(stderr,
			"InnoDB: Cannot find the dir slot for record %s\n"
			"InnoDB: on that page!\n", err_buf);

			buf_page_print(page);

			ut_a(0);
		}

		slot_no--;
	}

	return(slot_no);
}
/******************************************************************
Used to check the consistency of a directory slot. */
static
......
......@@ -104,7 +104,9 @@ cmp_types_are_equal(
if ((type1->mtype == DATA_VARCHAR && type2->mtype == DATA_CHAR)
|| (type1->mtype == DATA_CHAR && type2->mtype == DATA_VARCHAR)
|| (type1->mtype == DATA_FIXBINARY && type2->mtype == DATA_BINARY)
|| (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY)) {
|| (type1->mtype == DATA_BINARY && type2->mtype == DATA_FIXBINARY)
|| (type1->mtype == DATA_MYSQL && type2->mtype == DATA_VARMYSQL)
|| (type1->mtype == DATA_VARMYSQL && type2->mtype == DATA_MYSQL)) {
return(TRUE);
}
......@@ -124,14 +126,9 @@ cmp_types_are_equal(
return(FALSE);
}
if (type1->mtype == DATA_MYSQL
|| type1->mtype == DATA_VARMYSQL) {
if (type1->mtype == DATA_INT && type1->len != type2->len) {
if ((type1->prtype & ~DATA_NOT_NULL)
!= (type2->prtype & ~DATA_NOT_NULL)) {
return(FALSE);
}
return(FALSE);
}
return(TRUE);
......
......@@ -609,7 +609,7 @@ the caller must have a shared latch on dict_foreign_key_check_lock. */
ulint
row_ins_check_foreign_constraint(
/*=============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT,
/* out: DB_SUCCESS,
DB_NO_REFERENCED_ROW,
or DB_ROW_IS_REFERENCED */
ibool check_ref,/* in: TRUE if we want to check that
......@@ -635,6 +635,7 @@ row_ins_check_foreign_constraint(
ulint i;
mtr_t mtr;
run_again:
ut_ad(rw_lock_own(&dict_foreign_key_check_lock, RW_LOCK_SHARED));
if (thr_get_trx(thr)->check_foreigns == FALSE) {
......@@ -682,7 +683,7 @@ row_ins_check_foreign_constraint(
if (err != DB_SUCCESS) {
return(err);
goto do_possible_lock_wait;
}
}
......@@ -727,6 +728,11 @@ row_ins_check_foreign_constraint(
if (!rec_get_deleted_flag(rec)) {
/* Found a matching record */
/* printf(
"FOREIGN: Found matching record from %s %s\n",
check_index->table_name, check_index->name);
rec_print(rec);
*/
if (check_ref) {
err = DB_SUCCESS;
......@@ -779,6 +785,17 @@ row_ins_check_foreign_constraint(
/* Restore old value */
dtuple_set_n_fields_cmp(entry, n_fields_cmp);
do_possible_lock_wait:
if (err == DB_LOCK_WAIT) {
thr_get_trx(thr)->error_state = err;
que_thr_stop_for_mysql(thr);
row_mysql_handle_errors(&err, thr_get_trx(thr), thr, NULL);
goto run_again;
}
return(err);
}
......@@ -792,8 +809,7 @@ static
ulint
row_ins_check_foreign_constraints(
/*==============================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT, or error
code */
/* out: DB_SUCCESS or error code */
dict_table_t* table, /* in: table */
dict_index_t* index, /* in: index */
dtuple_t* entry, /* in: index entry for index */
......
......@@ -934,6 +934,7 @@ row_update_for_mysql(
ut_ad(!prebuilt->sql_stat_start);
que_thr_move_to_run_state_for_mysql(thr, trx);
run_again:
thr->run_node = node;
thr->prev_node = node;
......@@ -998,7 +999,6 @@ row_update_cascade_for_mysql(
trx_t* trx;
trx = thr_get_trx(thr);
run_again:
thr->run_node = node;
thr->prev_node = node;
......@@ -1130,6 +1130,35 @@ row_mysql_recover_tmp_table(
return(row_rename_table_for_mysql(old_name, table->name, trx));
}
/*************************************************************************
Locks the data dictionary exclusively for performing a table create
operation: takes an x-lock on dict_foreign_key_check_lock and then
enters the dict_sys mutex. */

void
row_mysql_lock_data_dictionary(void)
/*================================*/
{
	/* The rw-lock is acquired first, the dictionary mutex second.
	Data dictionary operations are serialized with the dictionary
	mutex: no deadlocks or lock waits can then occur in them */

	rw_lock_x_lock(&dict_foreign_key_check_lock);

	mutex_enter(&dict_sys->mutex);
}
/*************************************************************************
Releases the exclusive data dictionary lock: exits the dict_sys mutex
and then releases the x-lock on dict_foreign_key_check_lock. */

void
row_mysql_unlock_data_dictionary(void)
/*==================================*/
{
	/* Release in the reverse of the order in which
	row_mysql_lock_data_dictionary() acquires: mutex first, then
	the rw-lock */

	mutex_exit(&dict_sys->mutex);

	rw_lock_x_unlock(&dict_foreign_key_check_lock);
}
/*************************************************************************
Does a table creation operation for MySQL. If the name of the created
table ends in the characters INNODB_MONITOR, then this also starts
......@@ -1150,6 +1179,7 @@ row_create_table_for_mysql(
ulint err;
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
ut_ad(mutex_own(&(dict_sys->mutex)));
if (srv_created_new_raw || srv_force_recovery) {
fprintf(stderr,
......@@ -1263,19 +1293,13 @@ row_create_table_for_mysql(
"to use this feature you must compile InnoDB with\n"
"UNIV_MEM_DEBUG defined in univ.i and the server must be\n"
"quiet because allocation from a mem heap is not protected\n"
"by any semaphore.\n");
"by any semaphore.\n");
ut_a(mem_validate());
printf("Memory validated\n");
}
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
mutex_enter(&(dict_sys->mutex));
heap = mem_heap_create(512);
trx->dict_operation = TRUE;
......@@ -1325,9 +1349,6 @@ row_create_table_for_mysql(
trx->error_state = DB_SUCCESS;
}
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
que_graph_free((que_t*) que_node_get_parent(thr));
trx->op_info = "";
......@@ -1354,6 +1375,7 @@ row_create_index_for_mysql(
ulint keywordlen;
ulint err;
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
trx->op_info = "creating index";
......@@ -1372,12 +1394,6 @@ row_create_index_for_mysql(
return(DB_SUCCESS);
}
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
mutex_enter(&(dict_sys->mutex));
heap = mem_heap_create(512);
trx->dict_operation = TRUE;
......@@ -1405,9 +1421,6 @@ row_create_index_for_mysql(
trx->error_state = DB_SUCCESS;
}
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
que_graph_free((que_t*) que_node_get_parent(thr));
trx->op_info = "";
......@@ -1441,6 +1454,7 @@ row_table_add_foreign_constraints(
ulint keywordlen;
ulint err;
ut_ad(mutex_own(&(dict_sys->mutex)));
ut_a(sql_string);
trx->op_info = "adding foreign keys";
......@@ -1459,12 +1473,6 @@ row_table_add_foreign_constraints(
return(DB_SUCCESS);
}
/* Serialize data dictionary operations with dictionary mutex:
no deadlocks can occur then in these operations */
rw_lock_x_lock(&(dict_foreign_key_check_lock));
mutex_enter(&(dict_sys->mutex));
trx->dict_operation = TRUE;
err = dict_create_foreign_constraints(trx, sql_string, name);
......@@ -1486,9 +1494,6 @@ row_table_add_foreign_constraints(
trx->error_state = DB_SUCCESS;
}
mutex_exit(&(dict_sys->mutex));
rw_lock_x_unlock(&(dict_foreign_key_check_lock));
return((int) err);
}
......@@ -1917,6 +1922,13 @@ row_drop_table_for_mysql(
ut_a(0);
} else {
dict_table_remove_from_cache(table);
if (dict_load_table(name) != NULL) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: dropping of table %s failed!\n", name);
}
}
funct_exit:
rw_lock_s_unlock(&(purge_sys->purge_is_running));
......
......@@ -511,6 +511,14 @@ row_purge_parse_undo_rec(
clust_index = dict_table_get_first_index(node->table);
if (clust_index == NULL) {
/* The table was corrupt in the data dictionary */
rw_lock_x_unlock(&(purge_sys->purge_is_running));
return(FALSE);
}
ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
node->heap);
......
......@@ -129,8 +129,7 @@ static
ulint
row_upd_check_references_constraints(
/*=================================*/
/* out: DB_SUCCESS, DB_LOCK_WAIT, or an error
code */
/* out: DB_SUCCESS or an error code */
btr_pcur_t* pcur, /* in: cursor positioned on a record; NOTE: the
cursor position is lost in this function! */
dict_table_t* table, /* in: table in question */
......@@ -626,7 +625,7 @@ row_upd_index_parse(
/*******************************************************************
Returns TRUE if ext_vec contains i. */
UNIV_INLINE
static
ibool
upd_ext_vec_contains(
/*=================*/
......@@ -738,6 +737,7 @@ row_upd_build_difference_binary(
ulint n_diff;
ulint roll_ptr_pos;
ulint trx_id_pos;
ibool extern_bit;
ulint i;
/* This function is used only for a clustered index */
......@@ -763,9 +763,10 @@ row_upd_build_difference_binary(
goto skip_compare;
}
extern_bit = rec_get_nth_field_extern_bit(rec, i);
if (rec_get_nth_field_extern_bit(rec, i)
!= upd_ext_vec_contains(ext_vec, n_ext_vec, i)
if (extern_bit != upd_ext_vec_contains(ext_vec, n_ext_vec, i)
|| !dfield_data_is_binary_equal(dfield, len, data)) {
upd_field = upd_get_nth_field(update, n_diff);
......@@ -1362,7 +1363,7 @@ ulint
row_upd_del_mark_clust_rec(
/*=======================*/
/* out: DB_SUCCESS if operation successfully
completed, else error code or DB_LOCK_WAIT */
completed, else error code */
upd_node_t* node, /* in: row update node */
dict_index_t* index, /* in: clustered index */
que_thr_t* thr, /* in: query thread */
......@@ -1381,8 +1382,6 @@ row_upd_del_mark_clust_rec(
pcur = node->pcur;
btr_cur = btr_pcur_get_btr_cur(pcur);
ut_ad(FALSE == rec_get_deleted_flag(btr_pcur_get_rec(pcur)));
/* Store row because we have to build also the secondary index
entries */
......@@ -1391,11 +1390,11 @@ row_upd_del_mark_clust_rec(
/* Mark the clustered index record deleted; we do not have to check
locks, because we assume that we have an x-lock on the record */
err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG, btr_cur,
TRUE, thr, mtr);
err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
btr_cur, TRUE, thr, mtr);
if (err == DB_SUCCESS && check_ref) {
/* NOTE that the following call loses
the position of pcur ! */
/* NOTE that the following call loses the position of pcur ! */
err = row_upd_check_references_constraints(pcur, index->table,
index, thr, mtr);
if (err != DB_SUCCESS) {
......
......@@ -639,7 +639,7 @@ srv_release_threads(
slot = srv_table_get_nth_slot(i);
if ((slot->type == type) && slot->suspended) {
if (slot->in_use && slot->type == type && slot->suspended) {
slot->suspended = FALSE;
......@@ -1631,6 +1631,7 @@ srv_init(void)
for (i = 0; i < OS_THREAD_MAX_N; i++) {
slot = srv_mysql_table + i;
slot->in_use = FALSE;
slot->type = 0;
slot->event = os_event_create(NULL);
ut_a(slot->event);
}
......@@ -1890,8 +1891,6 @@ srv_conc_exit_innodb(
trx_t* trx) /* in: transaction object associated with the
thread */
{
srv_conc_slot_t* slot = NULL;
if (srv_thread_concurrency >= 500) {
return;
......@@ -2200,10 +2199,12 @@ srv_lock_timeout_and_monitor_thread(
"FILE I/O\n"
"--------\n");
os_aio_print();
printf("-------------\n"
"INSERT BUFFER\n"
"-------------\n");
printf("-------------------------------------\n"
"INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
"-------------------------------------\n");
ibuf_print();
printf("Successful hash searches %lu, non-hash searches %lu\n",
btr_cur_n_sea, btr_cur_n_non_sea);
printf("---\n"
"LOG\n"
"---\n");
......@@ -2498,18 +2499,19 @@ srv_master_thread(
for (i = 0; i < 10; i++) {
n_ios_old = log_sys->n_log_ios + buf_pool->n_pages_read
+ buf_pool->n_pages_written;
srv_main_thread_op_info = "sleeping";
srv_main_thread_op_info = (char*)"sleeping";
os_thread_sleep(1000000);
/* ALTER TABLE in MySQL requires on Unix that the table handler
can drop tables lazily after there no longer are SELECT
queries to them. */
srv_main_thread_op_info = "doing background drop tables";
srv_main_thread_op_info =
(char*)"doing background drop tables";
row_drop_tables_for_mysql_in_background();
srv_main_thread_op_info = "";
srv_main_thread_op_info = (char*)"";
if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
......@@ -2520,8 +2522,9 @@ srv_master_thread(
is issued or we have specified in my.cnf no flush
at transaction commit */
srv_main_thread_op_info = "flushing log";
srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
/* If there were less than 10 i/os during the
one second sleep, we assume that there is free
......@@ -2533,11 +2536,14 @@ srv_master_thread(
n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ buf_pool->n_pages_written;
if (n_pend_ios < 3 && (n_ios - n_ios_old < 10)) {
srv_main_thread_op_info = "doing insert buffer merge";
srv_main_thread_op_info =
(char*)"doing insert buffer merge";
ibuf_contract_for_n_pages(TRUE, 5);
srv_main_thread_op_info = "flushing log";
srv_main_thread_op_info =
(char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
}
if (srv_fast_shutdown && srv_shutdown_state > 0) {
......@@ -2578,16 +2584,18 @@ srv_master_thread(
srv_main_thread_op_info = "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
}
/* We run a batch of insert buffer merge every 10 seconds,
even if the server were active */
srv_main_thread_op_info = "doing insert buffer merge";
srv_main_thread_op_info = (char*)"doing insert buffer merge";
ibuf_contract_for_n_pages(TRUE, 5);
srv_main_thread_op_info = "flushing log";
srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
/* We run a full purge every 10 seconds, even if the server
were active */
......@@ -2603,7 +2611,7 @@ srv_master_thread(
goto background_loop;
}
srv_main_thread_op_info = "purging";
srv_main_thread_op_info = (char*)"purging";
n_pages_purged = trx_purge();
current_time = time(NULL);
......@@ -2612,6 +2620,7 @@ srv_master_thread(
srv_main_thread_op_info = "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
last_flush_time = current_time;
}
}
......@@ -2620,25 +2629,25 @@ srv_master_thread(
/* In this loop we run background operations when the server
is quiet and we also come here about once in 10 seconds */
srv_main_thread_op_info = "doing background drop tables";
srv_main_thread_op_info = (char*)"doing background drop tables";
n_tables_to_drop = row_drop_tables_for_mysql_in_background();
srv_main_thread_op_info = "";
srv_main_thread_op_info = (char*)"";
srv_main_thread_op_info = "flushing buffer pool pages";
srv_main_thread_op_info = (char*)"flushing buffer pool pages";
/* Flush a few oldest pages to make the checkpoint younger */
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10, ut_dulint_max);
srv_main_thread_op_info = "making checkpoint";
srv_main_thread_op_info = (char*)"making checkpoint";
/* Make a new checkpoint about once in 10 seconds */
log_checkpoint(TRUE, FALSE);
srv_main_thread_op_info = "reserving kernel mutex";
srv_main_thread_op_info = (char*)"reserving kernel mutex";
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
......@@ -2651,11 +2660,11 @@ srv_master_thread(
/* The server has been quiet for a while: start running background
operations */
srv_main_thread_op_info = "purging";
srv_main_thread_op_info = (char*)"purging";
n_pages_purged = trx_purge();
srv_main_thread_op_info = "reserving kernel mutex";
srv_main_thread_op_info = (char*)"reserving kernel mutex";
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
......@@ -2664,10 +2673,10 @@ srv_master_thread(
}
mutex_exit(&kernel_mutex);
srv_main_thread_op_info = "doing insert buffer merge";
srv_main_thread_op_info = (char*)"doing insert buffer merge";
n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
srv_main_thread_op_info = "reserving kernel mutex";
srv_main_thread_op_info = (char*)"reserving kernel mutex";
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
......@@ -2676,10 +2685,10 @@ srv_master_thread(
}
mutex_exit(&kernel_mutex);
srv_main_thread_op_info = "flushing buffer pool pages";
srv_main_thread_op_info = (char*)"flushing buffer pool pages";
n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
srv_main_thread_op_info = "reserving kernel mutex";
srv_main_thread_op_info = (char*)"reserving kernel mutex";
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
......@@ -2691,11 +2700,11 @@ srv_master_thread(
srv_main_thread_op_info = "waiting for buffer pool flush to end";
buf_flush_wait_batch_end(BUF_FLUSH_LIST);
srv_main_thread_op_info = "making checkpoint";
srv_main_thread_op_info = (char*)"making checkpoint";
log_checkpoint(TRUE, FALSE);
srv_main_thread_op_info = "reserving kernel mutex";
srv_main_thread_op_info = (char*)"reserving kernel mutex";
mutex_enter(&kernel_mutex);
if (srv_activity_count != old_activity_count) {
......@@ -2704,7 +2713,8 @@ srv_master_thread(
}
mutex_exit(&kernel_mutex);
srv_main_thread_op_info = "archiving log (if log archive is on)";
srv_main_thread_op_info =
(char*)"archiving log (if log archive is on)";
log_archive_do(FALSE, &n_bytes_archived);
......@@ -2730,7 +2740,7 @@ srv_master_thread(
master thread to wait for more server activity */
suspend_thread:
srv_main_thread_op_info = "suspending";
srv_main_thread_op_info = (char*)"suspending";
mutex_enter(&kernel_mutex);
......@@ -2744,7 +2754,7 @@ srv_master_thread(
mutex_exit(&kernel_mutex);
srv_main_thread_op_info = "waiting for server activity";
srv_main_thread_op_info = (char*)"waiting for server activity";
os_event_wait(event);
......
......@@ -932,6 +932,26 @@ innobase_start_or_create_for_mysql(void)
ulint k;
mtr_t mtr;
#ifdef UNIV_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
#ifdef UNIV_SYNC_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
#ifdef UNIV_SEARCH_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
#ifdef UNIV_MEM_DEBUG
fprintf(stderr,
"InnoDB: !!!!!!!!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!!!!!!!\n");
#endif
log_do_write = TRUE;
/* yydebug = TRUE; */
......@@ -999,7 +1019,7 @@ innobase_start_or_create_for_mysql(void)
os_aio_use_native_aio = FALSE;
if (!os_aio_use_native_aio) {
os_aio_init(4 * SRV_N_PENDING_IOS_PER_THREAD
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
......
......@@ -160,11 +160,13 @@ trx_rollback_last_sql_stat_for_mysql(
}
/***********************************************************************
Rollback uncommitted transactions which have no user session. */
Rollback or clean up transactions which have no user session. If the
transaction already was committed, then we clean up a possible insert
undo log. If the transaction was not yet committed, then we roll it back. */
void
trx_rollback_all_without_sess(void)
/*===============================*/
trx_rollback_or_clean_all_without_sess(void)
/*========================================*/
{
mem_heap_t* heap;
que_fork_t* fork;
......@@ -217,6 +219,19 @@ trx_rollback_all_without_sess(void)
trx->sess = trx_dummy_sess;
if (trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
fprintf(stderr, "InnoDB: Cleaning up trx with id %lu %lu\n",
ut_dulint_get_high(trx->id),
ut_dulint_get_low(trx->id));
trx_cleanup_at_db_startup(trx);
mem_heap_free(heap);
goto loop;
}
fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
fork->trx = trx;
......@@ -264,9 +279,17 @@ trx_rollback_all_without_sess(void)
/* If the transaction was for a dictionary operation, we
drop the relevant table, if it still exists */
fprintf(stderr,
"InnoDB: Dropping table with id %lu %lu in recovery if it exists\n",
ut_dulint_get_high(trx->table_id),
ut_dulint_get_low(trx->table_id));
table = dict_table_get_on_id_low(trx->table_id, trx);
if (table) {
fprintf(stderr,
"InnoDB: Table found: dropping table %s in recovery\n", table->name);
err = row_drop_table_for_mysql(table->name, trx,
TRUE);
ut_a(err == (int) DB_SUCCESS);
......
......@@ -26,6 +26,14 @@ Created 3/26/1996 Heikki Tuuri
trx_sys_t* trx_sys = NULL;
trx_doublewrite_t* trx_doublewrite = NULL;
/* In a MySQL replication slave, in crash recovery we store the master log
file name and position here. We have successfully got the updates to InnoDB
up to this position. If .._pos is -1, it means no crash recovery was needed,
or there was no master log position info inside InnoDB. */
char trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
ib_longlong trx_sys_mysql_master_log_pos = -1;
/********************************************************************
Determines if a page number is located inside the doublewrite buffer. */
......@@ -427,75 +435,62 @@ trx_sys_flush_max_trx_id(void)
/*********************************************************************
Updates the offset information about the end of the MySQL binlog entry
which corresponds to the transaction just being committed. */
which corresponds to the transaction just being committed. In a MySQL
replication slave updates the latest master binlog position up to which
replication has proceeded. */
void
trx_sys_update_mysql_binlog_offset(
/*===============================*/
trx_t* trx, /* in: transaction being committed */
mtr_t* mtr) /* in: mtr */
char* file_name,/* in: MySQL log file name */
ib_longlong offset, /* in: position in that log file */
ulint field, /* in: offset of the MySQL log info field in
the trx sys header */
mtr_t* mtr) /* in: mtr */
{
trx_sysf_t* sys_header;
char namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN];
ut_ad(trx->mysql_log_file_name);
memset(namebuf, ' ', TRX_SYS_MYSQL_LOG_NAME_LEN - 1);
namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN - 1] = '\0';
if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) {
/* Copy the whole MySQL log file name to the buffer, or only the
last characters, if it does not fit */
/* We cannot fit the name to the 512 bytes we have reserved */
if (ut_strlen(trx->mysql_log_file_name)
> TRX_SYS_MYSQL_LOG_NAME_LEN - 1) {
ut_memcpy(namebuf, trx->mysql_log_file_name
+ ut_strlen(trx->mysql_log_file_name)
- (TRX_SYS_MYSQL_LOG_NAME_LEN - 1),
TRX_SYS_MYSQL_LOG_NAME_LEN - 1);
} else {
ut_memcpy(namebuf, trx->mysql_log_file_name,
1 + ut_strlen(trx->mysql_log_file_name));
return;
}
namebuf[TRX_SYS_MYSQL_LOG_NAME_LEN - 1] = '\0';
sys_header = trx_sysf_get(mtr);
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
if (mach_read_from_4(sys_header + field
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
!= TRX_SYS_MYSQL_LOG_MAGIC_N) {
mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
mlog_write_ulint(sys_header + field
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
TRX_SYS_MYSQL_LOG_MAGIC_N,
MLOG_4BYTES, mtr);
}
if (0 != ut_memcmp(sys_header + TRX_SYS_MYSQL_LOG_INFO
+ TRX_SYS_MYSQL_LOG_NAME,
namebuf, TRX_SYS_MYSQL_LOG_NAME_LEN)) {
if (0 != ut_memcmp(sys_header + field + TRX_SYS_MYSQL_LOG_NAME,
file_name, 1 + ut_strlen(file_name))) {
mlog_write_string(sys_header + TRX_SYS_MYSQL_LOG_INFO
mlog_write_string(sys_header + field
+ TRX_SYS_MYSQL_LOG_NAME,
namebuf, TRX_SYS_MYSQL_LOG_NAME_LEN, mtr);
file_name, 1 + ut_strlen(file_name), mtr);
}
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
if (mach_read_from_4(sys_header + field
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
|| (trx->mysql_log_offset >> 32) > 0) {
|| (offset >> 32) > 0) {
mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
mlog_write_ulint(sys_header + field
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
(ulint)(trx->mysql_log_offset >> 32),
(ulint)(offset >> 32),
MLOG_4BYTES, mtr);
}
mlog_write_ulint(sys_header + TRX_SYS_MYSQL_LOG_INFO
mlog_write_ulint(sys_header + field
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW,
(ulint)(trx->mysql_log_offset & 0xFFFFFFFF),
(ulint)(offset & 0xFFFFFFFF),
MLOG_4BYTES, mtr);
trx->mysql_log_file_name = NULL;
}
/*********************************************************************
......@@ -533,6 +528,58 @@ trx_sys_print_mysql_binlog_offset(void)
mtr_commit(&mtr);
}
/*********************************************************************
Prints to stderr the MySQL master log offset info in the trx system header if
the magic number shows it valid. */
void
trx_sys_print_mysql_master_log_pos(void)
/*====================================*/
{
trx_sysf_t* sys_header;
mtr_t mtr;
mtr_start(&mtr);
sys_header = trx_sysf_get(&mtr);
if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
!= TRX_SYS_MYSQL_LOG_MAGIC_N) {
mtr_commit(&mtr);
return;
}
fprintf(stderr,
"InnoDB: In a MySQL replication slave the last master binlog file\n"
"InnoDB: position %lu %lu, file name %s\n",
mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW),
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_NAME);
/* Copy the master log position info to global variables we can
use in ha_innobase.cc to initialize glob_mi to right values */
ut_memcpy(trx_sys_mysql_master_log_name,
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_NAME,
TRX_SYS_MYSQL_LOG_NAME_LEN);
trx_sys_mysql_master_log_pos =
(((ib_longlong)mach_read_from_4(
sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH))
<< 32)
+ (ib_longlong)
mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
mtr_commit(&mtr);
}
/********************************************************************
Looks for a free slot for a rollback segment in the trx system file copy. */
......@@ -660,7 +707,7 @@ trx_sys_init_at_db_start(void)
if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
fprintf(stderr,
"InnoDB: %lu uncommitted transaction(s) which must be rolled back\n",
"InnoDB: %lu transaction(s) which must be rolled back or cleaned up\n",
UT_LIST_GET_LEN(trx_sys->trx_list));
fprintf(stderr, "InnoDB: Trx id counter is %lu %lu\n",
......
......@@ -83,6 +83,8 @@ trx_create(
trx->mysql_log_file_name = NULL;
trx->mysql_log_offset = 0;
trx->mysql_master_log_file_name = "";
trx->mysql_master_log_pos = 0;
trx->ignore_duplicates_in_insert = FALSE;
......@@ -363,16 +365,31 @@ trx_lists_init_at_db_start(void)
trx = trx_create(NULL);
trx->id = undo->trx_id;
trx->insert_undo = undo;
trx->rseg = rseg;
if (undo->state != TRX_UNDO_ACTIVE) {
trx->conc_state = TRX_COMMITTED_IN_MEMORY;
/* We give a dummy value for the trx no;
this should have no relevance since purge
is not interested in committed transaction
numbers, unless they are in the history
list, in which case it looks the number
from the disk based undo log structure */
trx->no = trx->id;
} else {
trx->conc_state = TRX_ACTIVE;
}
trx->id = undo->trx_id;
trx->insert_undo = undo;
trx->rseg = rseg;
/* A running transaction always has the number
field inited to ut_dulint_max */
trx->no = ut_dulint_max;
}
if (undo->dict_operation) {
trx->dict_operation = undo->dict_operation;
......@@ -397,14 +414,25 @@ trx_lists_init_at_db_start(void)
if (NULL == trx) {
trx = trx_create(NULL);
trx->id = undo->trx_id;
if (undo->state != TRX_UNDO_ACTIVE) {
trx->conc_state =
TRX_COMMITTED_IN_MEMORY;
/* We give a dummy value for the trx
number */
trx->no = trx->id;
} else {
trx->conc_state = TRX_ACTIVE;
/* A running transaction always has
the number field inited to
ut_dulint_max */
trx->no = ut_dulint_max;
}
trx->id = undo->trx_id;
trx->rseg = rseg;
trx_list_insert_ordered(trx);
......@@ -583,7 +611,7 @@ trx_commit_off_kernel(
if (undo) {
mutex_enter(&kernel_mutex);
#ifdef notdefined
/* ########## There is a bug here: purge and rollback
/* !!!!!!!!! There is a bug here: purge and rollback
need the whole stack of old record versions even if no
consistent read would need them!! This is because they
decide on the basis of the old versions when we can
......@@ -627,12 +655,25 @@ trx_commit_off_kernel(
mutex_exit(&(rseg->mutex));
/* Update the latest MySQL binlog name and offset info
in trx sys header if MySQL binlogging is on */
in trx sys header if MySQL binlogging is on or the database
server is a MySQL replication slave */
if (trx->mysql_log_file_name) {
trx_sys_update_mysql_binlog_offset(trx, &mtr);
trx_sys_update_mysql_binlog_offset(
trx->mysql_log_file_name,
trx->mysql_log_offset,
TRX_SYS_MYSQL_LOG_INFO, &mtr);
trx->mysql_log_file_name = NULL;
}
if (trx->mysql_master_log_file_name[0] != '\0') {
/* This database server is a MySQL replication slave */
trx_sys_update_mysql_binlog_offset(
trx->mysql_master_log_file_name,
trx->mysql_master_log_pos,
TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
}
/* If we did not take the shortcut, the following call
commits the mini-transaction, making the whole transaction
committed in the file-based world at this log sequence number;
......@@ -707,12 +748,12 @@ trx_commit_off_kernel(
/*-------------------------------------*/
/* Most MySQL users run with srv_flush.. set to FALSE: */
/* Most MySQL users run with srv_flush_.. set to FALSE: */
if (srv_flush_log_at_trx_commit) {
log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP);
}
}
/*-------------------------------------*/
......@@ -730,6 +771,29 @@ trx_commit_off_kernel(
UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
}
/********************************************************************
Cleans up a transaction at database startup. The cleanup is needed if
the transaction had already got to the middle of a commit when the
database crashed, and we cannot roll it back. */

void
trx_cleanup_at_db_startup(
/*======================*/
	trx_t*	trx)	/* in: transaction */
{
	/* Clean up the insert undo log, if the transaction has one; this
	is done before the rollback segment pointer is cleared below */
	if (trx->insert_undo) {
		trx_undo_insert_cleanup(trx);
	}

	/* Reset the undo number bookkeeping of the transaction */
	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
	trx->undo_no = ut_dulint_zero;

	/* Detach the rollback segment and mark the trx as not started */
	trx->rseg = NULL;
	trx->conc_state = TRX_NOT_STARTED;

	/* Finally take the transaction off the trx system's trx list */
	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
}
/************************************************************************
Assigns a read view for a consistent read query. All the consistent reads
within the same transaction will get the same read view, which is created
......
......@@ -1147,7 +1147,7 @@ trx_undo_mem_create_at_db_start(
/* If the log segment is being freed, the page list is inconsistent! */
if (state == TRX_UNDO_TO_FREE) {
return(undo);
goto add_to_list;
}
last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
......@@ -1166,7 +1166,7 @@ trx_undo_mem_create_at_db_start(
undo->top_offset = rec - last_page;
undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
}
add_to_list:
if (type == TRX_UNDO_INSERT) {
if (state != TRX_UNDO_CACHED) {
UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
......
......@@ -38,6 +38,8 @@ os_fast_mutex_t ut_list_mutex; /* this protects the list */
ibool ut_mem_block_list_inited = FALSE;
ulint* ut_mem_null_ptr = NULL;
/**************************************************************************
Initializes the mem block list at database startup. */
static
......@@ -83,12 +85,16 @@ ut_malloc_low(
"InnoDB: Check if you should increase the swap file or\n"
"InnoDB: ulimits of your operating system.\n"
"InnoDB: On FreeBSD check you have compiled the OS with\n"
"InnoDB: a big enough maximum process size.\n",
"InnoDB: a big enough maximum process size.\n"
"InnoDB: We now intentionally generate a seg fault so that\n"
"InnoDB: on Linux we get a stack trace.\n",
n, ut_total_allocated_memory, errno);
os_fast_mutex_unlock(&ut_list_mutex);
exit(1);
/* Make an intentional seg fault so that we get a stack
trace */
printf("%lu\n", *ut_mem_null_ptr);
}
if (set_to_zero) {
......
......@@ -110,6 +110,49 @@ ut_print_timestamp(
#endif
}
/**************************************************************
Writes the current local time to a buffer in the form
"YYMMDD HH:MM:SS", where the hour is space-padded to two characters
and all other fields are zero-padded. */

void
ut_sprintf_timestamp(
/*=================*/
	char*	buf)	/* in: buffer where to sprintf; the formatted
			text takes 15 characters plus the terminating
			NUL */
{
	int		year;
	int		month;
	int		day;
	int		hour;
	int		minute;
	int		second;
#ifdef __WIN__
	SYSTEMTIME	now;

	GetLocalTime(&now);

	year = (int)now.wYear % 100;
	month = (int)now.wMonth;
	day = (int)now.wDay;
	hour = (int)now.wHour;
	minute = (int)now.wMinute;
	second = (int)now.wSecond;
#else
	time_t		secs;
	struct tm*	parts;
#ifdef HAVE_LOCALTIME_R
	struct tm	storage;
#endif

	time(&secs);

#ifdef HAVE_LOCALTIME_R
	/* Use the variant which fills a caller-supplied struct */
	localtime_r(&secs, &storage);
	parts = &storage;
#else
	parts = localtime(&secs);
#endif
	year = parts->tm_year % 100;	/* only two digits are printed */
	month = parts->tm_mon + 1;	/* tm_mon counts from 0 */
	day = parts->tm_mday;
	hour = parts->tm_hour;
	minute = parts->tm_min;
	second = parts->tm_sec;
#endif

	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
		year, month, day, hour, minute, second);
}
/**************************************************************
Returns current year, month, day. */
......@@ -258,3 +301,26 @@ ut_ulint_sort(ulint* arr, ulint* aux_arr, ulint low, ulint high)
UT_SORT_FUNCTION_BODY(ut_ulint_sort, arr, aux_arr, low, high,
ut_ulint_cmp);
}
/*****************************************************************
Rounds a number up to the nearest power of 2 by repeated doubling. */

ulint
ut_2_power_up(
/*==========*/
			/* out: first power of 2 which is >= n */
	ulint	n)	/* in: number != 0 */
{
	ulint	power;

	ut_ad(n > 0);

	/* Start from 1 and keep doubling until n is reached or passed.
	NOTE(review): presumably callers never pass an n bigger than the
	largest power of 2 that fits in ulint; otherwise the doubling
	could wrap around and the loop would not end -- confirm */

	for (power = 1; power < n; power = power * 2) {
		/* nothing to do: the loop header does the work */
	}

	return(power);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment