Commit 9d6d1902 authored by Sunny Bains's avatar Sunny Bains

Bug #11766227: InnoDB purge lag much worse for 5.5.8 versus 5.1

Bug #11766501: Multiple RBS break the get rseg with mininum trx_t::no code during purge
      
Bug# 59291 changes:
      
Main problem is that truncating the UNDO log at the completion of every
trx_purge() call is expensive as the number of rollback segments is increased.
We truncate after a configurable amount of pages. The innodb_purge_batch_size
parameter is used to control when InnoDB does the actual truncate. The truncate
is done once after 128 (or TRX_SYS_N_RSEGS iterations). In other words we
truncate after purge 128 * innodb_purge_batch_size. The smaller the batch
size the quicker we truncate.
      
Introduce a new parameter that allows how many rollback segments to use for
storing REDO information. This is really step 1 in allowing complete control
to the user over rollback space management.
      
New parameters:
    i) innodb_rollback_segments = number of rollback_segments to use
       (default is now 128) dynamic parameter, can be changed anytime.
       Currently there is little benefit in changing it from the default.
      
Optimisations in the patch.
      
    i. Change the O(n) behaviour of trx_rseg_get_on_id() to O(log n)
       Backported from 5.6. Refactor some of the binary heap code.
       Create a new include/ut0bh.ic file.
      
    ii. Avoid truncating the rollback segments after every purge.
      
Related changes that were moved to a separate patch:
      
    i. Purge should not do any flushing, only wait for space to be free so that
       it only does purging of records unless it is held up by a long running
       transaction that is preventing it from progressing.
      
   ii. Give the purge thread preference over transactions when acquiring the
       rseg->mutex during commit. This to avoid purge blocking unnecessarily
       when getting the next rollback segment to purge.
      
Bug #11766501 changes:
      
Add the rseg to the min binary heap under the cover of the kernel mutex and
the binary heap mutex. This ensures the ordering of the min binary heap.
      
The two changes have to be committed together because they share the same
that fixes both issues.
      
rb://567 Approved by: Inaam Rana.
parent 7e6f3517
# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2006, 2011, Oracle and/or its affiliates. All rights reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
......@@ -246,7 +246,7 @@ SET(INNOBASE_SOURCES btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
usr/usr0sess.c
ut/ut0byte.c ut/ut0dbg.c ut/ut0list.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c
ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c)
ut/ut0ut.c ut/ut0vec.c ut/ut0wqueue.c ut/ut0bh.c)
IF(WITH_INNODB)
# Legacy option
......
/*****************************************************************************
Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
Copyright (c) 2000, 2011, MySQL AB & Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
......@@ -262,7 +262,7 @@ static PSI_mutex_info all_innodb_mutexes[] = {
# endif /* UNIV_MEM_DEBUG */
{&mem_pool_mutex_key, "mem_pool_mutex", 0},
{&mutex_list_mutex_key, "mutex_list_mutex", 0},
{&purge_sys_mutex_key, "purge_sys_mutex", 0},
{&purge_sys_bh_mutex_key, "purge_sys_bh_mutex", 0},
{&recv_sys_mutex_key, "recv_sys_mutex", 0},
{&rseg_mutex_key, "rseg_mutex", 0},
# ifdef UNIV_SYNC_DEBUG
......@@ -10971,16 +10971,23 @@ static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
static MYSQL_SYSVAR_ULONG(purge_batch_size, srv_purge_batch_size,
PLUGIN_VAR_OPCMDARG,
"Number of UNDO logs to purge in one batch from the history list. "
"Default is 20",
"Number of UNDO log pages to purge in one batch from the history list.",
NULL, NULL,
20, /* Default setting */
1, /* Minimum value */
5000, 0); /* Maximum value */
10000, 0); /* Maximum value */
static MYSQL_SYSVAR_ULONG(rollback_segments, srv_rollback_segments,
PLUGIN_VAR_OPCMDARG,
"Number of UNDO logs to use.",
NULL, NULL,
128, /* Default setting */
1, /* Minimum value */
TRX_SYS_N_RSEGS, 0); /* Maximum value */
static MYSQL_SYSVAR_ULONG(purge_threads, srv_n_purge_threads,
PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
"Purge threads can be either 0 or 1. Default is 0.",
"Purge threads can be either 0 or 1.",
NULL, NULL,
0, /* Default setting */
0, /* Minimum value */
......@@ -11331,6 +11338,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(io_capacity),
MYSQL_SYSVAR(purge_threads),
MYSQL_SYSVAR(purge_batch_size),
MYSQL_SYSVAR(rollback_segments),
NULL
};
......
......@@ -138,7 +138,18 @@ UNIV_INTERN
void
buf_flush_wait_batch_end(
/*=====================*/
buf_pool_t* buf_pool, /*!< buffer pool instance */
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
enum buf_flush type); /*!< in: BUF_FLUSH_LRU
or BUF_FLUSH_LIST */
/******************************************************************//**
Waits until a flush batch of the given type ends. This is called by
a thread that only wants to wait for a flush to end but doesn't do
any flushing itself. */
UNIV_INTERN
void
buf_flush_wait_batch_end_wait_only(
/*===============================*/
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
enum buf_flush type); /*!< in: BUF_FLUSH_LRU
or BUF_FLUSH_LIST */
/********************************************************************//**
......
/*****************************************************************************
Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
......@@ -294,9 +294,12 @@ extern ulint srv_log_waits;
/* the number of purge threads to use from the worker pool (currently 0 or 1) */
extern ulong srv_n_purge_threads;
/* the number of records to purge in one batch */
/* the number of pages to purge in one batch */
extern ulong srv_purge_batch_size;
/* the number of rollback segments to use */
extern ulong srv_rollback_segments;
/* variable that counts amount of data read in total (in bytes) */
extern ulint srv_data_read;
......
/*****************************************************************************
Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
......@@ -93,7 +93,7 @@ extern mysql_pfs_key_t mem_hash_mutex_key;
# endif /* UNIV_MEM_DEBUG */
extern mysql_pfs_key_t mem_pool_mutex_key;
extern mysql_pfs_key_t mutex_list_mutex_key;
extern mysql_pfs_key_t purge_sys_mutex_key;
extern mysql_pfs_key_t purge_sys_bh_mutex_key;
extern mysql_pfs_key_t recv_sys_mutex_key;
extern mysql_pfs_key_t rseg_mutex_key;
# ifdef UNIV_SYNC_DEBUG
......@@ -637,7 +637,6 @@ or row lock! */
#define SYNC_TREE_NODE_NEW 892
#define SYNC_TREE_NODE_FROM_HASH 891
#define SYNC_TREE_NODE 890
#define SYNC_PURGE_SYS 810
#define SYNC_PURGE_LATCH 800
#define SYNC_TRX_UNDO 700
#define SYNC_RSEG 600
......@@ -659,6 +658,7 @@ or row lock! */
#define SYNC_REC_LOCK 299
#define SYNC_TRX_LOCK_HEAP 298
#define SYNC_TRX_SYS_HEADER 290
#define SYNC_PURGE_QUEUE 200
#define SYNC_LOG 170
#define SYNC_LOG_FLUSH_ORDER 147
#define SYNC_RECV 168
......
/*****************************************************************************
Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -68,8 +68,9 @@ Creates the global purge system control structure and inits the history
mutex. */
UNIV_INTERN
void
trx_purge_sys_create(void);
/*======================*/
trx_purge_sys_create(
/*=================*/
ib_bh_t* ib_bh); /*!< in/own: UNDO log min binary heap*/
/********************************************************************//**
Frees the global purge system control structure. */
UNIV_INTERN
......@@ -128,20 +129,20 @@ struct trx_purge_struct{
ulint state; /*!< Purge system state */
sess_t* sess; /*!< System session running the purge
query */
trx_t* trx; /*!< System transaction running the purge
trx_t* trx; /*!< System transaction running the
purge
query: this trx is not in the trx list
of the trx system and it never ends */
que_t* query; /*!< The query graph which will do the
parallelized purge operation */
rw_lock_t latch; /*!< The latch protecting the purge view.
A purge operation must acquire an
x-latch here for the instant at which
rw_lock_t latch; /*!< The latch protecting the purge
view. A purge operation must acquire
an x-latch here for the instant at which
it changes the purge view: an undo
log operation can prevent this by
obtaining an s-latch here. */
read_view_t* view; /*!< The purge will not remove undo logs
which are >= this view (purge view) */
mutex_t mutex; /*!< Mutex protecting the fields below */
ulint n_pages_handled;/*!< Approximate number of undo log
pages processed in purge */
ulint handle_limit; /*!< Target of how many pages to get
......@@ -179,6 +180,11 @@ struct trx_purge_struct{
mem_heap_t* heap; /*!< Temporary storage used during a
purge: can be emptied after purge
completes */
/*-----------------------------*/
ib_bh_t* ib_bh; /*!< Binary min-heap, ordered on
rseg_queue_t::trx_no. It is protected
by the bh_mutex */
mutex_t bh_mutex; /*!< Mutex protecting ib_bh */
};
#define TRX_PURGE_ON 1 /* purge operation is running */
......
/*****************************************************************************
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -113,7 +113,9 @@ void
trx_rseg_list_and_array_init(
/*=========================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
ib_bh_t* ib_bh, /*!< in: rseg queue */
mtr_t* mtr); /*!< in: mtr */
/***************************************************************************
Free's an instance of the rollback segment in memory. */
UNIV_INTERN
......@@ -180,6 +182,14 @@ struct trx_rseg_struct{
memory objects */
};
/** For prioritising the rollback segments for purge. */
struct rseg_queue_struct {
trx_id_t trx_no; /*!< trx_rseg_t::last_trx_no */
trx_rseg_t* rseg; /*!< Rollback segment */
};
typedef struct rseg_queue_struct rseg_queue_t;
/* Undo log segment slot in a rollback segment header */
/*-------------------------------------------------------------*/
#define TRX_RSEG_SLOT_PAGE_NO 0 /* Page number of the header page of
......
/*****************************************************************************
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -38,6 +38,7 @@ Created 3/26/1996 Heikki Tuuri
#include "mem0mem.h"
#include "sync0sync.h"
#include "ut0lst.h"
#include "ut0bh.h"
#include "read0types.h"
#include "page0types.h"
......@@ -221,13 +222,6 @@ UNIV_INLINE
trx_id_t
trx_sys_get_new_trx_id(void);
/*========================*/
/*****************************************************************//**
Allocates a new transaction number.
@return new, allocated trx number */
UNIV_INLINE
trx_id_t
trx_sys_get_new_trx_no(void);
/*========================*/
#endif /* !UNIV_HOTBACKUP */
/*****************************************************************//**
Writes a trx id to an index page. In case that the id size changes in
......
......@@ -369,16 +369,4 @@ trx_sys_get_new_trx_id(void)
return(id);
}
/*****************************************************************//**
Allocates a new transaction number.
@return new, allocated trx number */
UNIV_INLINE
trx_id_t
trx_sys_get_new_trx_no(void)
/*========================*/
{
ut_ad(mutex_own(&kernel_mutex));
return(trx_sys_get_new_trx_id());
}
#endif /* !UNIV_HOTBACKUP */
/***************************************************************************//**
Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*****************************************************************************/
/******************************************************************//**
@file include/ut0bh.ic
Binary min-heap implementation.
Created 2011-01-15 by Sunny Bains
*******************************************************/
#include "ut0bh.h"
/**********************************************************************//**
Get the number of elements in the binary heap.
@return number of elements */
UNIV_INLINE
ulint
ib_bh_size(
/*=======*/
const ib_bh_t* ib_bh) /*!< in: instance */
{
return(ib_bh->n_elems);
}
/**********************************************************************//**
Test if binary heap is empty.
@return TRUE if empty. */
UNIV_INLINE
ibool
ib_bh_is_empty(
/*===========*/
const ib_bh_t* ib_bh) /*!< in: instance */
{
return(ib_bh_size(ib_bh) == 0);
}
/**********************************************************************//**
Test if binary heap is full.
@return TRUE if full. */
UNIV_INLINE
ibool
ib_bh_is_full(
/*===========*/
const ib_bh_t* ib_bh) /*!< in: instance */
{
return(ib_bh_size(ib_bh) >= ib_bh->max_elems);
}
/**********************************************************************//**
Get a pointer to the element.
@return pointer to element */
UNIV_INLINE
void*
ib_bh_get(
/*=======*/
ib_bh_t* ib_bh, /*!< in: instance */
ulint i) /*!< in: index */
{
byte* ptr = (byte*) (ib_bh + 1);
ut_a(i < ib_bh_size(ib_bh));
return(ptr + (ib_bh->sizeof_elem * i));
}
/**********************************************************************//**
Copy an element to the binary heap.
@return pointer to copied element */
UNIV_INLINE
void*
ib_bh_set(
/*======*/
ib_bh_t* ib_bh, /*!< in/out: instance */
ulint i, /*!< in: index */
const void* elem) /*!< in: element to add */
{
void* ptr = ib_bh_get(ib_bh, i);
ut_memcpy(ptr, elem, ib_bh->sizeof_elem);
return(ptr);
}
/**********************************************************************//**
Return the first element from the binary heap.
@return pointer to first element or NULL if empty. */
UNIV_INLINE
void*
ib_bh_first(
/*========*/
ib_bh_t* ib_bh) /*!< in: instance */
{
return(ib_bh_is_empty(ib_bh) ? NULL : ib_bh_get(ib_bh, 0));
}
/**********************************************************************//**
Return the last element from the binary heap.
@return pointer to last element or NULL if empty. */
UNIV_INLINE
void*
ib_bh_last(
/*========*/
ib_bh_t* ib_bh) /*!< in/out: instance */
{
return(ib_bh_is_empty(ib_bh)
? NULL
: ib_bh_get(ib_bh, ib_bh_size(ib_bh) - 1));
}
/*****************************************************************************
Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1995, 2011, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
......@@ -269,9 +269,12 @@ UNIV_INTERN ulong srv_max_buf_pool_modified_pct = 75;
/* the number of purge threads to use from the worker pool (currently 0 or 1).*/
UNIV_INTERN ulong srv_n_purge_threads = 0;
/* the number of records to purge in one batch */
/* the number of pages to purge in one batch */
UNIV_INTERN ulong srv_purge_batch_size = 20;
/* the number of rollback segments to use */
UNIV_INTERN ulong srv_rollback_segments = TRX_SYS_N_RSEGS;
/* variable counts amount of data read in total (in bytes) */
UNIV_INTERN ulint srv_data_read = 0;
......@@ -3096,6 +3099,7 @@ srv_purge_thread(
required by os_thread_create */
{
srv_slot_t* slot;
ulint retries = 0;
ulint slot_no = ULINT_UNDEFINED;
ulint n_total_purged = ULINT_UNDEFINED;
......@@ -3122,7 +3126,7 @@ srv_purge_thread(
while (srv_shutdown_state != SRV_SHUTDOWN_EXIT_THREADS) {
ulint n_pages_purged;
ulint n_pages_purged = 0;
/* If there are very few records to purge or the last
purge didn't purge any records then wait for activity.
......@@ -3130,7 +3134,8 @@ srv_purge_thread(
because in the worst case we will end up waiting for
the next purge event. */
if (trx_sys->rseg_history_len < srv_purge_batch_size
|| n_total_purged == 0) {
|| (n_total_purged == 0
&& retries >= TRX_SYS_N_RSEGS)) {
os_event_t event;
......@@ -3141,6 +3146,8 @@ srv_purge_thread(
mutex_exit(&kernel_mutex);
os_event_wait(event);
retries = 0;
}
/* Check for shutdown and whether we should do purge at all. */
......@@ -3151,7 +3158,12 @@ srv_purge_thread(
break;
}
n_total_purged = 0;
if (n_total_purged == 0 && retries <= TRX_SYS_N_RSEGS) {
++retries;
} else if (n_total_purged > 0) {
retries = 0;
n_total_purged = 0;
}
/* Purge until there are no more records to purge and there is
no change in configuration or server state. */
......
/*****************************************************************************
Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1995, 2011, Innobase Oy. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Portions of this file contain modifications contributed and copyrighted by
......@@ -1172,7 +1172,7 @@ sync_thread_add_level(
case SYNC_RSEG:
case SYNC_TRX_UNDO:
case SYNC_PURGE_LATCH:
case SYNC_PURGE_SYS:
case SYNC_PURGE_QUEUE:
case SYNC_DICT_AUTOINC_MUTEX:
case SYNC_DICT_OPERATION:
case SYNC_DICT_HEADER:
......@@ -1239,10 +1239,16 @@ sync_thread_add_level(
|| sync_thread_levels_g(array, SYNC_FSP, TRUE));
break;
case SYNC_TRX_UNDO_PAGE:
/* Purge is allowed to read in as many UNDO pages as it likes,
there was a bogus rule here earlier that forced the caller to
acquire the purge_sys_t::mutex. The purge mutex did not really
protect anything because it was only ever acquired by the
single purge thread. The purge thread can read the UNDO pages
without any covering mutex. */
ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
|| sync_thread_levels_contain(array, SYNC_RSEG)
|| sync_thread_levels_contain(array, SYNC_PURGE_SYS)
|| sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE, TRUE));
|| sync_thread_levels_g(array, level - 1, TRUE));
break;
case SYNC_RSEG_HEADER:
ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
......
/*****************************************************************************
Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -57,8 +57,8 @@ UNIV_INTERN mysql_pfs_key_t trx_purge_latch_key;
#endif /* UNIV_PFS_RWLOCK */
#ifdef UNIV_PFS_MUTEX
/* Key to register purge_sys_mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t purge_sys_mutex_key;
/* Key to register purge_sys_bh_mutex with performance schema */
UNIV_INTERN mysql_pfs_key_t purge_sys_bh_mutex_key;
#endif /* UNIV_PFS_MUTEX */
/*****************************************************************//**
......@@ -219,13 +219,16 @@ Creates the global purge system control structure and inits the history
mutex. */
UNIV_INTERN
void
trx_purge_sys_create(void)
/*======================*/
trx_purge_sys_create(
/*=================*/
ib_bh_t* ib_bh) /*!< in, own: UNDO log min binary heap */
{
ut_ad(mutex_own(&kernel_mutex));
purge_sys = mem_alloc(sizeof(trx_purge_t));
purge_sys = mem_zalloc(sizeof(trx_purge_t));
/* Take ownership of ib_bh, we are responsible for freeing it. */
purge_sys->ib_bh = ib_bh;
purge_sys->state = TRX_STOP_PURGE;
purge_sys->n_pages_handled = 0;
......@@ -237,8 +240,9 @@ trx_purge_sys_create(void)
rw_lock_create(trx_purge_latch_key,
&purge_sys->latch, SYNC_PURGE_LATCH);
mutex_create(purge_sys_mutex_key,
&purge_sys->mutex, SYNC_PURGE_SYS);
mutex_create(
purge_sys_bh_mutex_key, &purge_sys->bh_mutex,
SYNC_PURGE_QUEUE);
purge_sys->heap = mem_heap_create(256);
......@@ -288,9 +292,12 @@ trx_purge_sys_close(void)
trx_undo_arr_free(purge_sys->arr);
rw_lock_free(&purge_sys->latch);
mutex_free(&purge_sys->mutex);
mutex_free(&purge_sys->bh_mutex);
mem_heap_free(purge_sys->heap);
ib_bh_free(purge_sys->ib_bh);
mem_free(purge_sys);
purge_sys = NULL;
......@@ -311,34 +318,31 @@ trx_purge_add_update_undo_to_history(
mtr_t* mtr) /*!< in: mtr */
{
trx_undo_t* undo;
trx_rseg_t* rseg;
trx_rsegf_t* rseg_header;
#ifdef UNIV_DEBUG
trx_usegf_t* seg_header;
#endif /* UNIV_DEBUG */
trx_ulogf_t* undo_header;
ulint hist_size;
undo = trx->update_undo;
ut_ad(undo);
rseg = undo->rseg;
ut_ad(mutex_own(&undo->rseg->mutex));
ut_ad(mutex_own(&(rseg->mutex)));
rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size,
rseg->page_no, mtr);
rseg_header = trx_rsegf_get(
undo->rseg->space, undo->rseg->zip_size, undo->rseg->page_no,
mtr);
undo_header = undo_page + undo->hdr_offset;
/* Add the log as the first in the history list */
if (undo->state != TRX_UNDO_CACHED) {
ulint hist_size;
#ifdef UNIV_DEBUG
seg_header = undo_page + TRX_UNDO_SEG_HDR;
trx_usegf_t* seg_header = undo_page + TRX_UNDO_SEG_HDR;
#endif /* UNIV_DEBUG */
if (undo->state != TRX_UNDO_CACHED) {
/* The undo log segment will not be reused */
if (undo->id >= TRX_RSEG_N_SLOTS) {
if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
fprintf(stderr,
"InnoDB: Error: undo->id is %lu\n",
(ulong) undo->id);
......@@ -347,42 +351,50 @@ trx_purge_add_update_undo_to_history(
trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
MLOG_4BYTES, mtr);
hist_size = mtr_read_ulint(
rseg_header + TRX_RSEG_HISTORY_SIZE, MLOG_4BYTES, mtr);
ut_ad(undo->size == flst_get_len(
seg_header + TRX_UNDO_PAGE_LIST, mtr));
mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
hist_size + undo->size, MLOG_4BYTES, mtr);
mlog_write_ulint(
rseg_header + TRX_RSEG_HISTORY_SIZE,
hist_size + undo->size, MLOG_4BYTES, mtr);
}
/* Add the log as the first in the history list */
flst_add_first(rseg_header + TRX_RSEG_HISTORY,
undo_header + TRX_UNDO_HISTORY_NODE, mtr);
mutex_enter(&kernel_mutex);
trx_sys->rseg_history_len++;
mutex_exit(&kernel_mutex);
if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
/* Inform the purge thread that there is work to do. */
srv_wake_purge_thread_if_not_active();
}
flst_add_first(
rseg_header + TRX_RSEG_HISTORY,
undo_header + TRX_UNDO_HISTORY_NODE, mtr);
/* Write the trx number to the undo log header */
mlog_write_ull(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
/* Write information about delete markings to the undo log header */
if (!undo->del_marks) {
mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
MLOG_2BYTES, mtr);
mlog_write_ulint(
undo_header + TRX_UNDO_DEL_MARKS, FALSE,
MLOG_2BYTES, mtr);
}
if (rseg->last_page_no == FIL_NULL) {
if (undo->rseg->last_page_no == FIL_NULL) {
undo->rseg->last_trx_no = trx->no;
undo->rseg->last_offset = undo->hdr_offset;
undo->rseg->last_page_no = undo->hdr_page_no;
undo->rseg->last_del_marks = undo->del_marks;
rseg->last_page_no = undo->hdr_page_no;
rseg->last_offset = undo->hdr_offset;
rseg->last_trx_no = trx->no;
rseg->last_del_marks = undo->del_marks;
/* FIXME: Add a bin heap validate function to check that
the rseg exists. */
}
mutex_enter(&kernel_mutex);
trx_sys->rseg_history_len++;
mutex_exit(&kernel_mutex);
if (!(trx_sys->rseg_history_len % srv_purge_batch_size)) {
/* Inform the purge thread that there is work to do. */
srv_wake_purge_thread_if_not_active();
}
}
......@@ -411,7 +423,6 @@ trx_purge_free_segment(
/* fputs("Freeing an update undo log segment\n", stderr); */
ut_ad(mutex_own(&(purge_sys->mutex)));
loop:
mtr_start(&mtr);
mutex_enter(&(rseg->mutex));
......@@ -515,8 +526,6 @@ trx_purge_truncate_rseg_history(
mtr_t mtr;
trx_id_t undo_trx_no;
ut_ad(mutex_own(&(purge_sys->mutex)));
mtr_start(&mtr);
mutex_enter(&(rseg->mutex));
......@@ -609,10 +618,8 @@ trx_purge_truncate_history(void)
trx_id_t limit_trx_no;
undo_no_t limit_undo_no;
ut_ad(mutex_own(&(purge_sys->mutex)));
trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no,
&limit_undo_no);
trx_purge_arr_get_biggest(
purge_sys->arr, &limit_trx_no, &limit_undo_no);
if (limit_trx_no == 0) {
......@@ -630,34 +637,29 @@ trx_purge_truncate_history(void)
ut_ad(limit_trx_no <= purge_sys->view->low_limit_no);
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
rseg != NULL;
rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
while (rseg) {
trx_purge_truncate_rseg_history(rseg, limit_trx_no,
limit_undo_no);
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
trx_purge_truncate_rseg_history(
rseg, limit_trx_no, limit_undo_no);
}
}
/********************************************************************//**
Does a truncate if the purge array is empty. NOTE that when this function is
called, the caller must not have any latches on undo log pages!
@return TRUE if array empty */
called, the caller must not have any latches on undo log pages! */
UNIV_INLINE
ibool
void
trx_purge_truncate_if_arr_empty(void)
/*=================================*/
{
ut_ad(mutex_own(&(purge_sys->mutex)));
static ulint count;
if (purge_sys->arr->n_used == 0) {
if (!(++count % TRX_SYS_N_RSEGS) && purge_sys->arr->n_used == 0) {
trx_purge_truncate_history();
return(TRUE);
}
return(FALSE);
}
/***********************************************************************//**
......@@ -675,8 +677,8 @@ trx_purge_rseg_get_next_history_log(
trx_id_t trx_no;
ibool del_marks;
mtr_t mtr;
ut_ad(mutex_own(&(purge_sys->mutex)));
rseg_queue_t rseg_queue;
const void* ptr;
mutex_enter(&(rseg->mutex));
......@@ -688,8 +690,9 @@ trx_purge_rseg_get_next_history_log(
mtr_start(&mtr);
undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
rseg->last_page_no, &mtr);
undo_page = trx_undo_page_get_s_latched(
rseg->space, rseg->zip_size, rseg->last_page_no, &mtr);
log_hdr = undo_page + rseg->last_offset;
/* Increase the purge page count by one for every handled log */
......@@ -698,6 +701,7 @@ trx_purge_rseg_get_next_history_log(
prev_log_addr = trx_purge_get_log_from_hist(
flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
if (prev_log_addr.page == FIL_NULL) {
/* No logs left in the history list */
......@@ -712,11 +716,11 @@ trx_purge_rseg_get_next_history_log(
on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
file-based list was corrupt. The prev node pointer was
FIL_NULL, even though the list length was over 8 million nodes!
We assume that purge truncates the history list in moderate
We assume that purge truncates the history list in large
size pieces, and if we here reach the head of the list, the
list cannot be longer than 20 000 undo logs now. */
list cannot be longer than 2000 000 undo logs now. */
if (trx_sys->rseg_history_len > 20000) {
if (trx_sys->rseg_history_len > 2000000) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: purge reached the"
......@@ -756,105 +760,150 @@ trx_purge_rseg_get_next_history_log(
rseg->last_trx_no = trx_no;
rseg->last_del_marks = del_marks;
rseg_queue.rseg = rseg;
rseg_queue.trx_no = rseg->last_trx_no;
/* Purge can also produce events, however these are already ordered
in the rollback segment and any user generated event will be greater
than the events that Purge produces. ie. Purge can never produce
events from an empty rollback segment. */
mutex_enter(&purge_sys->bh_mutex);
ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
ut_a(ptr != NULL);
mutex_exit(&purge_sys->bh_mutex);
mutex_exit(&(rseg->mutex));
}
/***********************************************************************//**
Chooses the next undo log to purge and updates the info in purge_sys. This
function is used to initialize purge_sys when the next record to purge is
not known, and also to update the purge system info on the next record when
purge has handled the whole undo log for a transaction. */
Chooses the rollback segment with the smallest trx_id.
@return zip_size if log is for a compressed table, ULINT_UNDEFINED if
no rollback segments to purge, 0 for non compressed tables. */
static
void
trx_purge_choose_next_log(void)
/*===========================*/
ulint
trx_purge_get_rseg_with_min_trx_id(
/*===============================*/
trx_purge_t* purge_sys) /*!< in/out: purge instance */
{
trx_undo_rec_t* rec;
trx_rseg_t* rseg;
trx_rseg_t* min_rseg;
trx_id_t min_trx_no;
ulint space = 0; /* remove warning (??? bug ???) */
ulint zip_size = 0;
ulint page_no = 0; /* remove warning (??? bug ???) */
ulint offset = 0; /* remove warning (??? bug ???) */
mtr_t mtr;
ut_ad(mutex_own(&(purge_sys->mutex)));
ut_ad(purge_sys->next_stored == FALSE);
mutex_enter(&purge_sys->bh_mutex);
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
/* Only purge consumes events from the binary heap, user
threads only produce the events. */
min_trx_no = IB_ULONGLONG_MAX;
if (!ib_bh_is_empty(purge_sys->ib_bh)) {
trx_rseg_t* rseg;
min_rseg = NULL;
rseg = ((rseg_queue_t*) ib_bh_first(purge_sys->ib_bh))->rseg;
ib_bh_pop(purge_sys->ib_bh);
while (rseg) {
mutex_enter(&(rseg->mutex));
mutex_exit(&purge_sys->bh_mutex);
if (rseg->last_page_no != FIL_NULL) {
purge_sys->rseg = rseg;
} else {
mutex_exit(&purge_sys->bh_mutex);
if (min_rseg == NULL
|| min_trx_no > rseg->last_trx_no) {
purge_sys->rseg = NULL;
min_rseg = rseg;
min_trx_no = rseg->last_trx_no;
space = rseg->space;
zip_size = rseg->zip_size;
ut_a(space == 0); /* We assume in purge of
externally stored fields
that space id == 0 */
page_no = rseg->last_page_no;
offset = rseg->last_offset;
}
}
return(ULINT_UNDEFINED);
}
mutex_exit(&(rseg->mutex));
ut_a(purge_sys->rseg != NULL);
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
}
mutex_enter(&purge_sys->rseg->mutex);
if (min_rseg == NULL) {
ut_a(purge_sys->rseg->last_page_no != FIL_NULL);
return;
}
/* We assume in purge of externally stored fields
that space id == 0 */
ut_a(purge_sys->rseg->space == 0);
mtr_start(&mtr);
zip_size = purge_sys->rseg->zip_size;
if (!min_rseg->last_del_marks) {
/* No need to purge this log */
ut_a(purge_sys->purge_trx_no <= purge_sys->rseg->last_trx_no);
rec = &trx_purge_dummy_rec;
} else {
rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
RW_S_LATCH, &mtr);
if (rec == NULL) {
/* Undo log empty */
purge_sys->purge_trx_no = purge_sys->rseg->last_trx_no;
purge_sys->hdr_offset = purge_sys->rseg->last_offset;
purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
mutex_exit(&purge_sys->rseg->mutex);
return(zip_size);
}
/***********************************************************************//**
Position the purge sys "iterator" on the undo record to use for purging. */
static
void
trx_purge_read_undo_rec(
/*====================*/
trx_purge_t* purge_sys, /*!< in/out: purge instance */
ulint zip_size) /*!< in: block size or 0 */
{
ulint page_no;
ulint offset = 0;
ib_uint64_t undo_no = 0;
purge_sys->hdr_offset = purge_sys->rseg->last_offset;
page_no = purge_sys->hdr_page_no = purge_sys->rseg->last_page_no;
if (purge_sys->rseg->last_del_marks) {
mtr_t mtr;
trx_undo_rec_t* undo_rec;
rec = &trx_purge_dummy_rec;
mtr_start(&mtr);
undo_rec = trx_undo_get_first_rec(
0 /* System space id */, zip_size,
purge_sys->hdr_page_no,
purge_sys->hdr_offset, RW_S_LATCH, &mtr);
if (undo_rec != NULL) {
offset = page_offset(undo_rec);
undo_no = trx_undo_rec_get_undo_no(undo_rec);
page_no = page_get_page_no(page_align(undo_rec));
}
mtr_commit(&mtr);
}
purge_sys->offset = offset;
purge_sys->page_no = page_no;
purge_sys->purge_undo_no = undo_no;
purge_sys->next_stored = TRUE;
purge_sys->rseg = min_rseg;
}
purge_sys->hdr_page_no = page_no;
purge_sys->hdr_offset = offset;
/***********************************************************************//**
Chooses the next undo log to purge and updates the info in purge_sys. This
function is used to initialize purge_sys when the next record to purge is
not known, and also to update the purge system info on the next record when
purge has handled the whole undo log for a transaction. */
static
void
trx_purge_choose_next_log(void)
/*===========================*/
{
ulint zip_size;
purge_sys->purge_trx_no = min_trx_no;
ut_ad(purge_sys->next_stored == FALSE);
if (rec == &trx_purge_dummy_rec) {
zip_size = trx_purge_get_rseg_with_min_trx_id(purge_sys);
purge_sys->purge_undo_no = 0;
purge_sys->page_no = page_no;
purge_sys->offset = 0;
} else {
purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);
if (purge_sys->rseg != NULL) {
purge_sys->page_no = page_get_page_no(page_align(rec));
purge_sys->offset = page_offset(rec);
trx_purge_read_undo_rec(purge_sys, zip_size);
} else {
/* There is nothing to do yet. */
os_thread_yield();
}
mtr_commit(&mtr);
}
/***********************************************************************//**
......@@ -880,7 +929,6 @@ trx_purge_get_next_rec(
ulint cmpl_info;
mtr_t mtr;
ut_ad(mutex_own(&(purge_sys->mutex)));
ut_ad(purge_sys->next_stored);
space = purge_sys->rseg->space;
......@@ -903,8 +951,8 @@ trx_purge_get_next_rec(
mtr_start(&mtr);
undo_page = trx_undo_page_get_s_latched(space, zip_size,
page_no, &mtr);
undo_page = trx_undo_page_get_s_latched(space, zip_size, page_no, &mtr);
rec = undo_page + offset;
rec2 = rec;
......@@ -913,9 +961,9 @@ trx_purge_get_next_rec(
/* Try first to find the next record which requires a purge
operation from the same page of the same undo log */
next_rec = trx_undo_page_get_next_rec(rec2,
purge_sys->hdr_page_no,
purge_sys->hdr_offset);
next_rec = trx_undo_page_get_next_rec(
rec2, purge_sys->hdr_page_no, purge_sys->hdr_offset);
if (next_rec == NULL) {
rec2 = trx_undo_get_next_rec(
rec2, purge_sys->hdr_page_no,
......@@ -995,17 +1043,12 @@ trx_purge_fetch_next_rec(
{
trx_undo_rec_t* undo_rec;
mutex_enter(&(purge_sys->mutex));
if (purge_sys->state == TRX_STOP_PURGE) {
trx_purge_truncate_if_arr_empty();
mutex_exit(&(purge_sys->mutex));
return(NULL);
}
if (!purge_sys->next_stored) {
} else if (!purge_sys->next_stored) {
trx_purge_choose_next_log();
if (!purge_sys->next_stored) {
......@@ -1020,8 +1063,6 @@ trx_purge_fetch_next_rec(
(ulong) purge_sys->n_pages_handled);
}
mutex_exit(&(purge_sys->mutex));
return(NULL);
}
}
......@@ -1032,18 +1073,12 @@ trx_purge_fetch_next_rec(
trx_purge_truncate_if_arr_empty();
mutex_exit(&(purge_sys->mutex));
return(NULL);
}
if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
} else if (purge_sys->purge_trx_no >= purge_sys->view->low_limit_no) {
purge_sys->state = TRX_STOP_PURGE;
trx_purge_truncate_if_arr_empty();
mutex_exit(&(purge_sys->mutex));
return(NULL);
}
......@@ -1052,12 +1087,13 @@ trx_purge_fetch_next_rec(
(ullint) purge_sys->purge_trx_no,
(ullint) purge_sys->purge_undo_no); */
*roll_ptr = trx_undo_build_roll_ptr(FALSE, (purge_sys->rseg)->id,
purge_sys->page_no,
purge_sys->offset);
*cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
purge_sys->purge_undo_no);
*roll_ptr = trx_undo_build_roll_ptr(
FALSE, (purge_sys->rseg)->id, purge_sys->page_no,
purge_sys->offset);
*cell = trx_purge_arr_store_info(
purge_sys->purge_trx_no, purge_sys->purge_undo_no);
ut_ad(purge_sys->purge_trx_no < purge_sys->view->low_limit_no);
......@@ -1066,8 +1102,6 @@ trx_purge_fetch_next_rec(
undo_rec = trx_purge_get_next_rec(heap);
mutex_exit(&(purge_sys->mutex));
return(undo_rec);
}
......@@ -1079,11 +1113,7 @@ trx_purge_rec_release(
/*==================*/
trx_undo_inf_t* cell) /*!< in: storage cell */
{
mutex_enter(&(purge_sys->mutex));
trx_purge_arr_remove_info(cell);
mutex_exit(&(purge_sys->mutex));
}
/*******************************************************************//**
......@@ -1097,23 +1127,11 @@ trx_purge(
purge in one batch */
{
que_thr_t* thr;
/* que_thr_t* thr2; */
ulint old_pages_handled;
mutex_enter(&(purge_sys->mutex));
if (purge_sys->trx->n_active_thrs > 0) {
ut_a(purge_sys->trx->n_active_thrs == 0);
mutex_exit(&(purge_sys->mutex));
/* Should not happen */
ut_error;
return(0);
}
rw_lock_x_lock(&(purge_sys->latch));
rw_lock_x_lock(&purge_sys->latch);
mutex_enter(&kernel_mutex);
......@@ -1147,8 +1165,9 @@ trx_purge(
}
}
purge_sys->view = read_view_oldest_copy_or_open_new(0,
purge_sys->heap);
purge_sys->view = read_view_oldest_copy_or_open_new(
0, purge_sys->heap);
mutex_exit(&kernel_mutex);
rw_lock_x_unlock(&(purge_sys->latch));
......@@ -1159,7 +1178,6 @@ trx_purge(
old_pages_handled = purge_sys->n_pages_handled;
mutex_exit(&(purge_sys->mutex));
mutex_enter(&kernel_mutex);
......@@ -1167,15 +1185,8 @@ trx_purge(
ut_ad(thr);
/* thr2 = que_fork_start_command(purge_sys->query);
ut_ad(thr2); */
mutex_exit(&kernel_mutex);
/* srv_que_task_enqueue(thr2); */
if (srv_print_thread_releases) {
fputs("Starting purge\n", stderr);
......
/*****************************************************************************
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Oracle Corpn. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -50,11 +50,11 @@ trx_rseg_get_on_id(
{
trx_rseg_t* rseg;
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
ut_a(id < TRX_SYS_N_RSEGS);
while (rseg && rseg->id != id) {
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
}
rseg = trx_sys->rseg_array[id];
ut_a(rseg == NULL || id == rseg->id);
return(rseg);
}
......@@ -181,12 +181,15 @@ static
trx_rseg_t*
trx_rseg_mem_create(
/*================*/
ulint id, /*!< in: rollback segment id */
ulint space, /*!< in: space where the segment placed */
ulint zip_size, /*!< in: compressed page size in bytes
or 0 for uncompressed pages */
ulint page_no, /*!< in: page number of the segment header */
mtr_t* mtr) /*!< in: mtr */
ulint id, /*!< in: rollback segment id */
ulint space, /*!< in: space where the segment
placed */
ulint zip_size, /*!< in: compressed page size in bytes
or 0 for uncompressed pages */
ulint page_no, /*!< in: page number of the segment
header */
ib_bh_t* ib_bh, /*!< in/out: rseg queue */
mtr_t* mtr) /*!< in: mtr */
{
ulint len;
trx_rseg_t* rseg;
......@@ -225,6 +228,9 @@ trx_rseg_mem_create(
len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
if (len > 0) {
const void* ptr;
rseg_queue_t rseg_queue;
trx_sys->rseg_history_len += len;
node_addr = trx_purge_get_log_from_hist(
......@@ -240,6 +246,17 @@ trx_rseg_mem_create(
undo_log_hdr + TRX_UNDO_TRX_NO);
rseg->last_del_marks = mtr_read_ulint(
undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
rseg_queue.rseg = rseg;
rseg_queue.trx_no = rseg->last_trx_no;
if (rseg->last_page_no != FIL_NULL) {
/* There is no need to cover this operation by the purge
mutex because we are still bootstrapping. */
ptr = ib_bh_push(ib_bh, &rseg_queue);
ut_a(ptr != NULL);
}
} else {
rseg->last_page_no = FIL_NULL;
}
......@@ -255,6 +272,7 @@ void
trx_rseg_create_instance(
/*=====================*/
trx_sysf_t* sys_header, /*!< in: trx system header */
ib_bh_t* ib_bh, /*!< in/out: rseg queue */
mtr_t* mtr) /*!< in: mtr */
{
ulint i;
......@@ -278,7 +296,7 @@ trx_rseg_create_instance(
zip_size = space ? fil_space_get_zip_size(space) : 0;
rseg = trx_rseg_mem_create(
i, space, zip_size, page_no, mtr);
i, space, zip_size, page_no, ib_bh, mtr);
ut_a(rseg->id == i);
}
......@@ -327,7 +345,8 @@ trx_rseg_create(void)
zip_size = space ? fil_space_get_zip_size(space) : 0;
rseg = trx_rseg_mem_create(
slot_no, space, zip_size, page_no, &mtr);
slot_no, space, zip_size, page_no,
purge_sys->ib_bh, &mtr);
}
mutex_exit(&kernel_mutex);
......@@ -342,13 +361,14 @@ UNIV_INTERN
void
trx_rseg_list_and_array_init(
/*=========================*/
trx_sysf_t* sys_header, /* in: trx system header */
mtr_t* mtr) /* in: mtr */
trx_sysf_t* sys_header, /*!< in: trx system header */
ib_bh_t* ib_bh, /*!< in: rseg queue */
mtr_t* mtr) /*!< in: mtr */
{
UT_LIST_INIT(trx_sys->rseg_list);
trx_sys->rseg_history_len = 0;
trx_rseg_create_instance(sys_header, mtr);
trx_rseg_create_instance(sys_header, ib_bh, mtr);
}
/*****************************************************************************
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -945,6 +945,31 @@ trx_sysf_create(
mutex_exit(&kernel_mutex);
}
/*****************************************************************//**
Compare two trx_rseg_t instances on last_trx_no. */
static
int
trx_rseg_compare_last_trx_no(
/*=========================*/
const void* p1, /*!< in: elem to compare */
const void* p2) /*!< in: elem to compare */
{
ib_int64_t cmp;
const rseg_queue_t* rseg_q1 = (const rseg_queue_t*) p1;
const rseg_queue_t* rseg_q2 = (const rseg_queue_t*) p2;
cmp = rseg_q1->trx_no - rseg_q2->trx_no;
if (cmp < 0) {
return(-1);
} else if (cmp > 0) {
return(1);
}
return(0);
}
/*****************************************************************//**
Creates and initializes the central memory structures for the transaction
system. This is called when the database is started. */
......@@ -958,6 +983,7 @@ trx_sys_init_at_db_start(void)
const char* unit = "";
trx_t* trx;
mtr_t mtr;
ib_bh_t* ib_bh;
mtr_start(&mtr);
......@@ -965,11 +991,19 @@ trx_sys_init_at_db_start(void)
mutex_enter(&kernel_mutex);
trx_sys = mem_alloc(sizeof(trx_sys_t));
/* We create the min binary heap here and pass ownership to
purge when we init the purge sub-system. Purge is responsible
for freeing the binary heap. */
ib_bh = ib_bh_create(
trx_rseg_compare_last_trx_no,
sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);
trx_sys = mem_zalloc(sizeof(*trx_sys));
sys_header = trx_sysf_get(&mtr);
trx_rseg_list_and_array_init(sys_header, &mtr);
trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);
trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
......@@ -1023,7 +1057,8 @@ trx_sys_init_at_db_start(void)
UT_LIST_INIT(trx_sys->view_list);
trx_purge_sys_create();
/* Transfer ownership to purge. */
trx_purge_sys_create(ib_bh);
mutex_exit(&kernel_mutex);
......
/*****************************************************************************
Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 1996, 2011, Innobase Oy. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -42,6 +42,7 @@ Created 3/26/1996 Heikki Tuuri
#include "btr0sea.h"
#include "os0proc.h"
#include "trx0xa.h"
#include "trx0purge.h"
#include "ha_prototypes.h"
/** Dummy session used currently in MySQL interface */
......@@ -604,36 +605,26 @@ trx_lists_init_at_db_start(void)
/******************************************************************//**
Assigns a rollback segment to a transaction in a round-robin fashion.
Skips the SYSTEM rollback segment if another is available.
@return assigned rollback segment id */
@return assigned rollback segment instance */
UNIV_INLINE
ulint
trx_assign_rseg(void)
/*=================*/
trx_rseg_t*
trx_assign_rseg(
/*============*/
ulint max_undo_logs) /*!< in: maximum number of UNDO logs to use */
{
trx_rseg_t* rseg = trx_sys->latest_rseg;
trx_rseg_t* rseg = trx_sys->latest_rseg;
ut_ad(mutex_own(&kernel_mutex));
loop:
/* Get next rseg in a round-robin fashion */
rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
if (rseg == NULL) {
if (rseg == NULL || rseg->id == max_undo_logs - 1) {
rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
}
/* If it is the SYSTEM rollback segment, and there exist others, skip
it */
if ((rseg->id == TRX_SYS_SYSTEM_RSEG_ID)
&& (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1)) {
goto loop;
}
trx_sys->latest_rseg = rseg;
return(rseg->id);
return(rseg);
}
/****************************************************************//**
......@@ -663,12 +654,9 @@ trx_start_low(
ut_ad(trx->conc_state != TRX_ACTIVE);
if (rseg_id == ULINT_UNDEFINED) {
rseg_id = trx_assign_rseg();
}
ut_a(rseg_id == ULINT_UNDEFINED);
rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
rseg = trx_assign_rseg(srv_rollback_segments);
trx->id = trx_sys_get_new_trx_id();
......@@ -719,107 +707,179 @@ trx_start(
}
/****************************************************************//**
Commits a transaction. */
UNIV_INTERN
Set the transaction serialisation number. */
static
void
trx_commit_off_kernel(
/*==================*/
trx_t* trx) /*!< in: transaction */
trx_serialisation_number_get(
/*=========================*/
trx_t* trx) /*!< in: transaction */
{
page_t* update_hdr_page;
ib_uint64_t lsn = 0;
trx_rseg_t* rseg;
trx_undo_t* undo;
mtr_t mtr;
ut_ad(mutex_own(&kernel_mutex));
rseg = trx->rseg;
trx->must_flush_log_later = FALSE;
ut_ad(mutex_own(&rseg->mutex));
rseg = trx->rseg;
mutex_enter(&kernel_mutex);
if (trx->insert_undo != NULL || trx->update_undo != NULL) {
trx->no = trx_sys_get_new_trx_id();
/* If the rollack segment is not empty then the
new trx_t::no can't be less than any trx_t::no
already in the rollback segment. User threads only
produce events when a rollback segment is empty. */
if (rseg->last_page_no == FIL_NULL) {
void* ptr;
rseg_queue_t rseg_queue;
rseg_queue.rseg = rseg;
rseg_queue.trx_no = trx->no;
mutex_enter(&purge_sys->bh_mutex);
/* This is to reduce the pressure on the kernel mutex,
though in reality it should make very little (read no)
difference because this code path is only taken when the
rbs is empty. */
mutex_exit(&kernel_mutex);
mtr_start(&mtr);
ptr = ib_bh_push(purge_sys->ib_bh, &rseg_queue);
ut_a(ptr);
/* Change the undo log segment states from TRX_UNDO_ACTIVE
to some other state: these modifications to the file data
structure define the transaction as committed in the file
based world, at the serialization point of the log sequence
number lsn obtained below. */
mutex_exit(&purge_sys->bh_mutex);
} else {
mutex_exit(&kernel_mutex);
}
}
mutex_enter(&(rseg->mutex));
/****************************************************************//**
Assign the transaction its history serialisation number and write the
update UNDO log record to the assigned rollback segment.
@return the LSN of the UNDO log write. */
static
ib_uint64_t
trx_write_serialisation_history(
/*============================*/
trx_t* trx) /*!< in: transaction */
{
mtr_t mtr;
trx_rseg_t* rseg;
if (trx->insert_undo != NULL) {
trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
}
ut_ad(!mutex_own(&kernel_mutex));
undo = trx->update_undo;
rseg = trx->rseg;
if (undo) {
mutex_enter(&kernel_mutex);
trx->no = trx_sys_get_new_trx_no();
mutex_exit(&kernel_mutex);
mtr_start(&mtr);
/* It is not necessary to obtain trx->undo_mutex here
because only a single OS thread is allowed to do the
transaction commit for this transaction. */
/* Change the undo log segment states from TRX_UNDO_ACTIVE
to some other state: these modifications to the file data
structure define the transaction as committed in the file
based domain, at the serialization point of the log sequence
number lsn obtained below. */
update_hdr_page = trx_undo_set_state_at_finish(
undo, &mtr);
if (trx->update_undo != NULL) {
page_t* undo_hdr_page;
trx_undo_t* undo = trx->update_undo;
/* We have to do the cleanup for the update log while
holding the rseg mutex because update log headers
have to be put to the history list in the order of
the trx number. */
/* We have to hold the rseg mutex because update
log headers have to be put to the history list in the
(serialisation) order of the UNDO trx number. This is
required for the purge in-memory data structures too. */
trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
}
mutex_enter(&rseg->mutex);
mutex_exit(&(rseg->mutex));
/* Assign the transaction serialisation number and also
update the purge min binary heap if this is the first
UNDO log being written to the assigned rollback segment. */
/* Update the latest MySQL binlog name and offset info
in trx sys header if MySQL binlogging is on or the database
server is a MySQL replication slave */
if (trx->mysql_log_file_name
&& trx->mysql_log_file_name[0] != '\0') {
trx_sys_update_mysql_binlog_offset(
trx->mysql_log_file_name,
trx->mysql_log_offset,
TRX_SYS_MYSQL_LOG_INFO, &mtr);
trx->mysql_log_file_name = NULL;
}
trx_serialisation_number_get(trx);
/* The following call commits the mini-transaction, making the
whole transaction committed in the file-based world, at this
log sequence number. The transaction becomes 'durable' when
we write the log to disk, but in the logical sense the commit
in the file-based data structures (undo logs etc.) happens
here.
NOTE that transaction numbers, which are assigned only to
transactions with an update undo log, do not necessarily come
in exactly the same order as commit lsn's, if the transactions
have different rollback segments. To get exactly the same
order we should hold the kernel mutex up to this point,
adding to the contention of the kernel mutex. However, if
a transaction T2 is able to see modifications made by
a transaction T1, T2 will always get a bigger transaction
number and a bigger commit lsn than T1. */
/* It is not necessary to obtain trx->undo_mutex here
because only a single OS thread is allowed to do the
transaction commit for this transaction. */
/*--------------*/
mtr_commit(&mtr);
/*--------------*/
lsn = mtr.end_lsn;
undo_hdr_page = trx_undo_set_state_at_finish(undo, &mtr);
trx_undo_update_cleanup(trx, undo_hdr_page, &mtr);
} else {
mutex_enter(&rseg->mutex);
}
if (trx->insert_undo != NULL) {
trx_undo_set_state_at_finish(trx->insert_undo, &mtr);
}
mutex_exit(&rseg->mutex);
/* Update the latest MySQL binlog name and offset info
in trx sys header if MySQL binlogging is on or the database
server is a MySQL replication slave */
if (trx->mysql_log_file_name
&& trx->mysql_log_file_name[0] != '\0') {
trx_sys_update_mysql_binlog_offset(
trx->mysql_log_file_name,
trx->mysql_log_offset,
TRX_SYS_MYSQL_LOG_INFO, &mtr);
trx->mysql_log_file_name = NULL;
}
/* The following call commits the mini-transaction, making the
whole transaction committed in the file-based world, at this
log sequence number. The transaction becomes 'durable' when
we write the log to disk, but in the logical sense the commit
in the file-based data structures (undo logs etc.) happens
here.
NOTE that transaction numbers, which are assigned only to
transactions with an update undo log, do not necessarily come
in exactly the same order as commit lsn's, if the transactions
have different rollback segments. To get exactly the same
order we should hold the kernel mutex up to this point,
adding to the contention of the kernel mutex. However, if
a transaction T2 is able to see modifications made by
a transaction T1, T2 will always get a bigger transaction
number and a bigger commit lsn than T1. */
/*--------------*/
mtr_commit(&mtr);
/*--------------*/
return(mtr.end_lsn);
}
/****************************************************************//**
Commits a transaction. */
UNIV_INTERN
void
trx_commit_off_kernel(
/*==================*/
trx_t* trx) /*!< in: transaction */
{
ib_uint64_t lsn;
ut_ad(mutex_own(&kernel_mutex));
trx->must_flush_log_later = FALSE;
/* If the transaction made any updates then we need to write the
UNDO logs for the updates to the assigned rollback segment. */
if (trx->insert_undo != NULL || trx->update_undo != NULL) {
mutex_exit(&kernel_mutex);
lsn = trx_write_serialisation_history(trx);
mutex_enter(&kernel_mutex);
} else {
lsn = 0;
}
ut_ad(trx->conc_state == TRX_ACTIVE
|| trx->conc_state == TRX_PREPARED);
ut_ad(trx->conc_state == TRX_ACTIVE || trx->conc_state == TRX_PREPARED);
ut_ad(mutex_own(&kernel_mutex));
/* The following assignment makes the transaction committed in memory
......
/*****************************************************************************
Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
Copyright (c) 2009, Sun Microsystems, Inc.
Portions of this file contain modifications contributed and copyrighted by
Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
are described briefly in the InnoDB documentation. The contributions by
Sun Microsystems are incorporated with their permission, and subject to the
conditions contained in the file COPYING.Sun_Microsystems.
Copyright (c) 2011, Oracle Corpn. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment