Commit 8d8f52e9 authored by unknown's avatar unknown

Many files:

  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released


sql/log.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/handler.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/handler.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/ha_innodb.cc:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
sql/ha_innodb.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/include/log0log.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/include/trx0trx.h:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/os/os0file.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/buf/buf0flu.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/trx/trx0trx.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/log/log0log.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/srv/srv0srv.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
innobase/row/row0mysql.c:
  Eliminate the LOCK_log bottleneck in group commit in binlogging: flush InnoDB log files only after it has been released
parent 87039789
......@@ -398,7 +398,7 @@ buf_flush_write_block_low(
"Warning: cannot force log to disk in the log debug version!\n");
#else
/* Force the log to the disk before writing the modified block */
log_flush_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS);
log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
#endif
buf_flush_init_for_writing(block->frame, block->newest_modification,
block->space, block->offset);
......
......@@ -20,7 +20,7 @@ typedef struct log_group_struct log_group_t;
extern ibool log_do_write;
extern ibool log_debug_writes;
/* Wait modes for log_flush_up_to */
/* Wait modes for log_write_up_to */
#define LOG_NO_WAIT 91
#define LOG_WAIT_ONE_GROUP 92
#define LOG_WAIT_ALL_GROUPS 93
......@@ -157,26 +157,21 @@ log_io_complete(
/*============*/
log_group_t* group); /* in: log group */
/**********************************************************
Flushes the log files to the disk, using, for example, the Unix fsync.
This function does the flush even if the user has set
srv_flush_log_at_trx_commit = FALSE. */
void
log_flush_to_disk(void);
/*===================*/
/**********************************************************
This function is called, e.g., when a transaction wants to commit. It checks
that the log has been flushed to disk up to the last log entry written by the
transaction. If there is a flush running, it waits and checks if the flush
flushed enough. If not, starts a new flush. */
that the log has been written to the log file up to the last log entry written
by the transaction. If there is a flush running, it waits and checks if the
flush flushed enough. If not, starts a new flush. */
void
log_flush_up_to(
log_write_up_to(
/*============*/
dulint lsn, /* in: log sequence number up to which the log should
be flushed, ut_dulint_max if not specified */
ulint wait); /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
be written, ut_dulint_max if not specified */
ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */
ibool flush_to_disk);
/* in: TRUE if we want the written log also to be
flushed to disk */
/********************************************************************
Advances the smallest lsn for which there are unflushed dirty blocks in the
buffer pool and also may make a new checkpoint. NOTE: this function may only
......@@ -741,27 +736,37 @@ struct log_struct{
be advanced, it is enough that the
write i/o has been completed for all
log groups */
dulint flush_lsn; /* end lsn for the current flush */
ulint flush_end_offset;/* the data in buffer has been flushed
dulint write_lsn; /* end lsn for the current running
write */
ulint write_end_offset;/* the data in buffer has been written
up to this offset when the current
flush ends: this field will then
write ends: this field will then
be copied to buf_next_to_write */
ulint n_pending_writes;/* number of currently pending flush
writes */
dulint current_flush_lsn;/* end lsn for the current running
write + flush operation */
dulint flushed_to_disk_lsn;
/* how far we have written the log
AND flushed to disk */
ulint n_pending_writes;/* number of currently pending flushes
or writes */
/* NOTE on the 'flush' in names of the fields below: starting from
4.0.14, we separate the write of the log file and the actual fsync()
or other method to flush it to disk. The names below should really
be 'flush_or_write'! */
os_event_t no_flush_event; /* this event is in the reset state
when a flush is running; a thread
should wait for this without owning
the log mutex, but NOTE that to set or
reset this event, the thread MUST own
the log mutex! */
when a flush or a write is running;
a thread should wait for this without
owning the log mutex, but NOTE that
to set or reset this event, the
thread MUST own the log mutex! */
ibool one_flushed; /* during a flush, this is first FALSE
and becomes TRUE when one log group
has been flushed */
has been written or flushed */
os_event_t one_flushed_event;/* this event is reset when the
flush has not yet completed for any
log group; e.g., this means that a
transaction has been committed when
this is set; a thread should wait
flush or write has not yet completed
for any log group; e.g., this means
that a transaction has been committed
when this is set; a thread should wait
for this without owning the log mutex,
but NOTE that to set or reset this
event, the thread MUST own the log
......
......@@ -157,6 +157,15 @@ trx_commit_for_mysql(
/* out: 0 or error number */
trx_t* trx); /* in: trx handle */
/**************************************************************************
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE. */
ulint
trx_commit_complete_for_mysql(
/*==========================*/
/* out: 0 or error number */
trx_t* trx); /* in: trx handle */
/**************************************************************************
Marks the latest SQL statement ended. */
void
......@@ -343,6 +352,11 @@ struct trx_struct{
dulint no; /* transaction serialization number ==
max trx id when the transaction is
moved to COMMITTED_IN_MEMORY state */
ibool flush_log_later;/* when we commit the transaction
in MySQL's binlog write, we will
flush the log to disk later in
a separate call */
dulint commit_lsn; /* lsn at the time of the commit */
ibool dict_operation; /* TRUE if the trx is used to create
a table, create an index, or drop a
table */
......
......@@ -178,7 +178,7 @@ log_reserve_and_open(
/* Not enough free space, do a synchronous flush of the log
buffer */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS);
log_write_up_to(ut_dulint_max, LOG_WAIT_ALL_GROUPS, TRUE);
count++;
......@@ -675,7 +675,9 @@ log_init(void)
log_sys->buf_next_to_write = 0;
log_sys->flush_lsn = ut_dulint_zero;
log_sys->write_lsn = ut_dulint_zero;
log_sys->current_flush_lsn = ut_dulint_zero;
log_sys->flushed_to_disk_lsn = ut_dulint_zero;
log_sys->written_to_some_lsn = log_sys->lsn;
log_sys->written_to_all_lsn = log_sys->lsn;
......@@ -867,7 +869,7 @@ log_group_check_flush_completion(
printf("Log flushed first to group %lu\n", group->id);
}
log_sys->written_to_some_lsn = log_sys->flush_lsn;
log_sys->written_to_some_lsn = log_sys->write_lsn;
log_sys->one_flushed = TRUE;
return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
......@@ -896,15 +898,15 @@ log_sys_check_flush_completion(void)
if (log_sys->n_pending_writes == 0) {
log_sys->written_to_all_lsn = log_sys->flush_lsn;
log_sys->buf_next_to_write = log_sys->flush_end_offset;
log_sys->written_to_all_lsn = log_sys->write_lsn;
log_sys->buf_next_to_write = log_sys->write_end_offset;
if (log_sys->flush_end_offset > log_sys->max_buf_free / 2) {
if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
/* Move the log buffer content to the start of the
buffer */
move_start = ut_calc_align_down(
log_sys->flush_end_offset,
log_sys->write_end_offset,
OS_FILE_LOG_BLOCK_SIZE);
move_end = ut_calc_align(log_sys->buf_free,
OS_FILE_LOG_BLOCK_SIZE);
......@@ -981,57 +983,6 @@ log_io_complete(
mutex_exit(&(log_sys->mutex));
}
/**********************************************************
Flushes the log files to the disk, using, for example, the Unix fsync.
This function does the flush even if the user has set
srv_flush_log_at_trx_commit = FALSE.
The function first waits until no log write is pending, then registers
itself as the single pending write while it calls fil_flush() on the first
log group in the list. The log mutex is not held during the flush itself. */
void
log_flush_to_disk(void)
/*===================*/
{
log_group_t* group;
loop:
mutex_enter(&(log_sys->mutex));
if (log_sys->n_pending_writes > 0) {
/* A log file write is running */
mutex_exit(&(log_sys->mutex));
/* Wait for the log file write to complete and try again */
os_event_wait(log_sys->no_flush_event);
goto loop;
}
/* No write is pending: count this flush as a pending write both in the
log system and in the first log group, and reset the two events while we
still own the log mutex */
group = UT_LIST_GET_FIRST(log_sys->log_groups);
log_sys->n_pending_writes++;
group->n_pending_writes++;
os_event_reset(log_sys->no_flush_event);
os_event_reset(log_sys->one_flushed_event);
mutex_exit(&(log_sys->mutex));
/* Do the actual flush without owning the log mutex */
fil_flush(group->space_id);
mutex_enter(&(log_sys->mutex));
/* Assert that our flush is still the only pending write before
unregistering it */
ut_a(group->n_pending_writes == 1);
ut_a(log_sys->n_pending_writes == 1);
group->n_pending_writes--;
log_sys->n_pending_writes--;
/* Set the events again, still under the log mutex, so that threads
waiting in os_event_wait() can proceed */
os_event_set(log_sys->no_flush_event);
os_event_set(log_sys->one_flushed_event);
mutex_exit(&(log_sys->mutex));
}
/**********************************************************
Writes a log file header to a log file space. */
static
......@@ -1205,12 +1156,15 @@ by the transaction. If there is a flush running, it waits and checks if the
flush flushed enough. If not, starts a new flush. */
void
log_flush_up_to(
log_write_up_to(
/*============*/
dulint lsn, /* in: log sequence number up to which the log should
be written, ut_dulint_max if not specified */
ulint wait) /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
or LOG_WAIT_ALL_GROUPS */
ibool flush_to_disk)
/* in: TRUE if we want the written log also to be
flushed to disk */
{
log_group_t* group;
ulint start_offset;
......@@ -1239,9 +1193,18 @@ log_flush_up_to(
mutex_enter(&(log_sys->mutex));
if ((ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0)
|| ((ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0)
&& (wait != LOG_WAIT_ALL_GROUPS))) {
if (flush_to_disk
&& ut_dulint_cmp(log_sys->flushed_to_disk_lsn, lsn) >= 0) {
mutex_exit(&(log_sys->mutex));
return;
}
if (!flush_to_disk
&& (ut_dulint_cmp(log_sys->written_to_all_lsn, lsn) >= 0
|| (ut_dulint_cmp(log_sys->written_to_some_lsn, lsn) >= 0
&& wait != LOG_WAIT_ALL_GROUPS))) {
mutex_exit(&(log_sys->mutex));
......@@ -1249,10 +1212,19 @@ log_flush_up_to(
}
if (log_sys->n_pending_writes > 0) {
/* A flush is running */
/* A write (+ possibly flush to disk) is running */
if (flush_to_disk
&& ut_dulint_cmp(log_sys->current_flush_lsn, lsn) >= 0) {
/* The write + flush will write enough: wait for it to
complete */
goto do_waits;
}
if (ut_dulint_cmp(log_sys->flush_lsn, lsn) >= 0) {
/* The flush will flush enough: wait for it to
if (!flush_to_disk
&& ut_dulint_cmp(log_sys->write_lsn, lsn) >= 0) {
/* The write will write enough: wait for it to
complete */
goto do_waits;
......@@ -1260,16 +1232,17 @@ log_flush_up_to(
mutex_exit(&(log_sys->mutex));
/* Wait for the flush to complete and try to start a new
flush */
/* Wait for the write to complete and try to start a new
write */
os_event_wait(log_sys->no_flush_event);
goto loop;
}
if (log_sys->buf_free == log_sys->buf_next_to_write) {
/* Nothing to flush */
if (!flush_to_disk
&& log_sys->buf_free == log_sys->buf_next_to_write) {
/* Nothing to write and no flush to disk requested */
mutex_exit(&(log_sys->mutex));
......@@ -1277,7 +1250,7 @@ log_flush_up_to(
}
if (log_debug_writes) {
printf("Flushing log from %lu %lu up to lsn %lu %lu\n",
printf("Writing log from %lu %lu up to lsn %lu %lu\n",
ut_dulint_get_high(log_sys->written_to_all_lsn),
ut_dulint_get_low(log_sys->written_to_all_lsn),
ut_dulint_get_high(log_sys->lsn),
......@@ -1301,7 +1274,12 @@ log_flush_up_to(
ut_ad(area_end - area_start > 0);
log_sys->flush_lsn = log_sys->lsn;
log_sys->write_lsn = log_sys->lsn;
if (flush_to_disk) {
log_sys->current_flush_lsn = log_sys->lsn;
}
log_sys->one_flushed = FALSE;
log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
......@@ -1318,10 +1296,12 @@ log_flush_up_to(
OS_FILE_LOG_BLOCK_SIZE);
log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
log_sys->flush_end_offset = log_sys->buf_free;
log_sys->write_end_offset = log_sys->buf_free;
group = UT_LIST_GET_FIRST(log_sys->log_groups);
/* Do the write to the log files */
while (group) {
log_group_write_buf(LOG_FLUSH, group,
log_sys->buf + area_start,
......@@ -1330,20 +1310,25 @@ log_flush_up_to(
OS_FILE_LOG_BLOCK_SIZE),
start_offset - area_start);
log_group_set_fields(group, log_sys->flush_lsn);
log_group_set_fields(group, log_sys->write_lsn);
group = UT_LIST_GET_NEXT(log_groups, group);
}
mutex_exit(&(log_sys->mutex));
if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
&& srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& srv_flush_log_at_trx_commit != 2) {
if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
/* O_DSYNC means the OS did not buffer the log file at all:
so we have also flushed to disk what we have written */
log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
} else if (flush_to_disk) {
group = UT_LIST_GET_FIRST(log_sys->log_groups);
fil_flush(group->space_id);
log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
}
mutex_enter(&(log_sys->mutex));
......@@ -1403,7 +1388,7 @@ log_flush_margin(void)
mutex_exit(&(log->mutex));
if (do_flush) {
log_flush_up_to(ut_dulint_max, LOG_NO_WAIT);
log_write_up_to(ut_dulint_max, LOG_NO_WAIT, FALSE);
}
}
......@@ -1555,7 +1540,8 @@ log_group_checkpoint(
buf = group->checkpoint_buf;
mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
mach_write_to_8(buf + LOG_CHECKPOINT_LSN,
log_sys->next_checkpoint_lsn);
mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
log_group_calc_lsn_offset(
......@@ -1664,8 +1650,10 @@ log_reset_first_header_and_checkpoint(
lsn = ut_dulint_add(start, LOG_BLOCK_HDR_SIZE);
/* Write the label of ibbackup --restore */
sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "ibbackup ");
ut_sprintf_timestamp((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
sprintf((char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
"ibbackup ");
ut_sprintf_timestamp(
(char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP
+ strlen("ibbackup "));
buf = hdr_buf + LOG_CHECKPOINT_1;
......@@ -1773,7 +1761,7 @@ log_checkpoint(
write-ahead-logging algorithm ensures that the log has been flushed
up to oldest_lsn. */
log_flush_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS);
log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
mutex_enter(&(log_sys->mutex));
......@@ -2466,7 +2454,7 @@ log_archive_do(
mutex_exit(&(log_sys->mutex));
log_flush_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS);
log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
calc_new_limit = FALSE;
......@@ -3104,8 +3092,8 @@ log_print(
"Last checkpoint at %lu %lu\n",
ut_dulint_get_high(log_sys->lsn),
ut_dulint_get_low(log_sys->lsn),
ut_dulint_get_high(log_sys->written_to_some_lsn),
ut_dulint_get_low(log_sys->written_to_some_lsn),
ut_dulint_get_high(log_sys->flushed_to_disk_lsn),
ut_dulint_get_low(log_sys->flushed_to_disk_lsn),
ut_dulint_get_high(log_sys->last_checkpoint_lsn),
ut_dulint_get_low(log_sys->last_checkpoint_lsn));
......
......@@ -521,10 +521,11 @@ os_file_create(
}
#endif
#ifdef UNIV_NON_BUFFERED_IO
if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
if (type == OS_LOG_FILE) {
/* Do not use unbuffered i/o to log files because
value 2 denotes that we do not flush the log at every
commit, but only once per second */
to allow group commit to work when MySQL binlogging
is used we must separate log file write and log
file flush to disk. */
} else {
if (srv_win_file_flush_method ==
SRV_WIN_IO_UNBUFFERED) {
......
......@@ -1664,7 +1664,7 @@ row_drop_table_for_mysql_in_background(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
trx_commit_for_mysql(trx);
......
......@@ -2812,8 +2812,7 @@ srv_master_thread(
at transaction commit */
srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* If there were less than 10 i/os during the
one second sleep, we assume that there is free
......@@ -2831,8 +2830,8 @@ srv_master_thread(
srv_main_thread_op_info =
(char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
TRUE);
}
if (srv_activity_count == old_activity_count) {
......@@ -2867,8 +2866,7 @@ srv_master_thread(
buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
srv_main_thread_op_info = (char*) "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
}
/* We run a batch of insert buffer merge every 10 seconds,
......@@ -2878,8 +2876,7 @@ srv_master_thread(
ibuf_contract_for_n_pages(TRUE, 5);
srv_main_thread_op_info = (char*)"flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* We run a full purge every 10 seconds, even if the server
were active */
......@@ -2903,8 +2900,8 @@ srv_master_thread(
if (difftime(current_time, last_flush_time) > 1) {
srv_main_thread_op_info = (char*) "flushing log";
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP,
TRUE);
last_flush_time = current_time;
}
}
......
......@@ -89,6 +89,8 @@ trx_create(
trx->check_foreigns = TRUE;
trx->check_unique_secondary = TRUE;
trx->flush_log_later = FALSE;
trx->dict_operation = FALSE;
trx->mysql_thd = NULL;
......@@ -780,13 +782,26 @@ trx_commit_off_kernel(
/*-------------------------------------*/
/* Most MySQL users run with srv_flush_.. set to FALSE: */
/* Most MySQL users run with srv_flush_.. set to 0: */
if (srv_flush_log_at_trx_commit) {
log_flush_up_to(lsn, LOG_WAIT_ONE_GROUP);
if (srv_flush_log_at_trx_commit != 0) {
if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC
&& srv_flush_log_at_trx_commit != 2
&& !trx->flush_log_later) {
/* Write the log to the log files AND flush
them to disk */
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
} else {
/* Write the log but do not flush it to disk */
log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
}
}
trx->commit_lsn = lsn;
/*-------------------------------------*/
mutex_enter(&kernel_mutex);
......@@ -1467,6 +1482,31 @@ trx_commit_for_mysql(
return(0);
}
/**************************************************************************
If required, flushes the log to disk if we called trx_commit_for_mysql()
with trx->flush_log_later == TRUE. The log is written and flushed up to
trx->commit_lsn only when srv_flush_log_at_trx_commit == 1 and the file
flush method is not SRV_UNIX_NOSYNC; with any other setting this function
does nothing. */
ulint
trx_commit_complete_for_mysql(
/*==========================*/
			/* out: 0 or error number; this implementation
			always returns 0 */
	trx_t*	trx)	/* in: trx handle */
{
	ut_a(trx);

	if (srv_flush_log_at_trx_commit == 1
	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {

		trx->op_info = (char *) "flushing log";

		/* Flush the log files to disk */

		log_write_up_to(trx->commit_lsn, LOG_WAIT_ONE_GROUP, TRUE);

		trx->op_info = (char *) "";
	}

	/* The function is declared to return ulint, so every path must
	return a value: without this the caller would receive an
	indeterminate result */

	return(0);
}
/**************************************************************************
Marks the latest SQL statement ended. */
......
......@@ -872,8 +872,7 @@ innobase_flush_logs(void)
DBUG_ENTER("innobase_flush_logs");
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_flush_to_disk();
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
DBUG_RETURN(result);
}
......@@ -920,7 +919,7 @@ Commits a transaction in an InnoDB database. */
int
innobase_commit(
/*============*/
/* out: 0 or error number */
/* out: 0 */
THD* thd, /* in: MySQL thread handle of the user for whom
the transaction should be committed */
void* trx_handle)/* in: InnoDB trx handle or
......@@ -928,7 +927,6 @@ innobase_commit(
that the current SQL statement ended, and we should
mark the start of a new statement with a savepoint */
{
int error = 0;
trx_t* trx;
DBUG_ENTER("innobase_commit");
......@@ -955,29 +953,27 @@ innobase_commit(
innobase_release_stat_resources(trx);
trx_mark_sql_stat_end(trx);
#ifndef DBUG_OFF
if (error) {
DBUG_PRINT("error", ("error: %d", error));
}
#endif
/* Tell InnoDB server that there might be work for
utility threads: */
srv_active_wake_master_thread();
DBUG_RETURN(error);
DBUG_RETURN(0);
}
/*********************************************************************
This is called when MySQL writes the binlog entry for the current
transaction. Writes to the InnoDB tablespace info which tells where the
MySQL binlog entry for the current transaction ended. Also commits the
transaction inside InnoDB. */
transaction inside InnoDB but does NOT flush InnoDB log files to disk.
To flush you have to call innobase_commit_complete(). We have separated
flushing to eliminate the bottleneck of LOCK_log in log.cc which disabled
InnoDB's group commit capability. */
int
innobase_report_binlog_offset_and_commit(
/*=====================================*/
/* out: 0 or error code */
/* out: 0 */
THD* thd, /* in: user thread */
void* trx_handle, /* in: InnoDB trx handle */
char* log_file_name, /* in: latest binlog file name */
......@@ -993,7 +989,39 @@ innobase_report_binlog_offset_and_commit(
trx->mysql_log_file_name = log_file_name;
trx->mysql_log_offset = (ib_longlong)end_offset;
return(innobase_commit(thd, trx_handle));
trx->flush_log_later = TRUE;
innobase_commit(thd, trx_handle);
trx->flush_log_later = FALSE;
return(0);
}
/*********************************************************************
Second phase of a commit that was started while MySQL was writing the
binlog entry of the transaction: asks InnoDB to flush its log files to
disk if the settings require it. When srv_flush_log_at_trx_commit is 0
nothing is done and the trx handle is not examined. */

int
innobase_commit_complete(
/*=====================*/
				/* out: 0 */
	void*	trx_handle)	/* in: InnoDB trx handle */
{
	if (srv_flush_log_at_trx_commit != 0) {
		trx_t*	trx = (trx_t*) trx_handle;

		/* A flush may be needed: the handle must be valid */
		ut_a(trx != NULL);

		trx_commit_complete_for_mysql(trx);
	}

	return(0);
}
/*********************************************************************
......@@ -3202,7 +3230,7 @@ ha_innobase::create(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
innobase_table = dict_table_get(norm_name, NULL);
......@@ -3277,7 +3305,7 @@ ha_innobase::delete_table(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
......@@ -3347,7 +3375,7 @@ innobase_drop_database(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
......@@ -3419,7 +3447,7 @@ ha_innobase::rename_table(
the InnoDB data dictionary get out-of-sync if the user runs
with innodb_flush_log_at_trx_commit = 0 */
log_flush_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP);
log_write_up_to(ut_dulint_max, LOG_WAIT_ONE_GROUP, TRUE);
/* Tell the InnoDB server that there might be work for
utility threads: */
......@@ -3936,7 +3964,7 @@ ha_innobase::extra(
case HA_EXTRA_RESET:
case HA_EXTRA_RESET_STATE:
prebuilt->read_just_key = 0;
break;
break;
case HA_EXTRA_NO_KEYREAD:
prebuilt->read_just_key = 0;
break;
......
......@@ -211,6 +211,8 @@ int innobase_report_binlog_offset_and_commit(
void* trx_handle,
char* log_file_name,
my_off_t end_offset);
int innobase_commit_complete(
void* trx_handle);
int innobase_rollback(THD *thd, void* trx_handle);
int innobase_close_connection(THD *thd);
int innobase_drop_database(char *path);
......
......@@ -243,6 +243,9 @@ int ha_autocommit_or_rollback(THD *thd, int error)
replication. This function also calls the commit of the table
handler, because the order of transactions in the log of the table
handler must be the same as in the binlog.
NOTE that to eliminate the bottleneck of the group commit, we do not
flush the handler log files here, but only later in a call of
ha_commit_complete().
arguments:
thd: the thread handle of the current connection
......@@ -269,12 +272,37 @@ int ha_report_binlog_offset_and_commit(THD *thd,
my_error(ER_ERROR_DURING_COMMIT, MYF(0), error);
error=1;
}
trans->innodb_active_trans=0;
}
#endif
return error;
}
/*
  Completes a commit that was started by
  ha_report_binlog_offset_and_commit(): lets the table handler flush its
  log files (unless the my.cnf settings make that unnecessary). Call this
  only after LOCK_log has been released in log.cc, so that the flush does
  not serialize the group commit.

  arguments:
  thd:           the thread handle of the current connection
  return value:  always 0
*/

int ha_commit_complete(THD *thd)
{
#ifdef HAVE_INNOBASE_DB
  THD_TRANS *all_trans= &thd->transaction.all;

  /* No InnoDB transaction handle: nothing to complete */
  if (!all_trans->innobase_tid)
    return 0;

  innobase_commit_complete(all_trans->innobase_tid);
  all_trans->innodb_active_trans= 0;
#endif
  return 0;
}
/*
This function should be called when MySQL sends rows of a SELECT result set
or the EOF mark to the client. It releases a possible adaptive hash index
......
......@@ -372,6 +372,7 @@ void ha_resize_key_cache(void);
int ha_start_stmt(THD *thd);
int ha_report_binlog_offset_and_commit(THD *thd, char *log_file_name,
my_off_t end_offset);
int ha_commit_complete(THD *thd);
int ha_release_temporary_latches(THD *thd);
int ha_commit_trans(THD *thd, THD_TRANS *trans);
int ha_rollback_trans(THD *thd, THD_TRANS *trans);
......
......@@ -1033,6 +1033,8 @@ bool MYSQL_LOG::write(THD *thd,enum enum_server_command command,
bool MYSQL_LOG::write(Log_event* event_info)
{
THD *thd=event_info->thd;
bool called_handler_commit=0;
bool error=0;
DBUG_ENTER("MYSQL_LOG::write(event)");
......@@ -1047,7 +1049,6 @@ bool MYSQL_LOG::write(Log_event* event_info)
if (is_open())
{
bool should_rotate = 0;
THD *thd=event_info->thd;
const char *local_db = event_info->get_db();
#ifdef USING_TRANSACTIONS
IO_CACHE *file = ((event_info->get_cache_stmt()) ?
......@@ -1147,6 +1148,7 @@ bool MYSQL_LOG::write(Log_event* event_info)
{
error = ha_report_binlog_offset_and_commit(thd, log_file_name,
file->pos_in_file);
called_handler_commit=1;
}
should_rotate= (my_b_tell(file) >= (my_off_t) max_binlog_size);
......@@ -1172,6 +1174,15 @@ bool MYSQL_LOG::write(Log_event* event_info)
}
pthread_mutex_unlock(&LOCK_log);
/* Flush the transactional handler log file now that we have released
LOCK_log; the flush is placed here to eliminate the bottleneck on the
group commit */
if (called_handler_commit) {
ha_commit_complete(thd);
}
DBUG_RETURN(error);
}
......@@ -1277,6 +1288,13 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache)
}
VOID(pthread_mutex_unlock(&LOCK_log));
/* Flush the transactional handler log file now that we have released
LOCK_log; the flush is placed here to eliminate the bottleneck on the
group commit */
ha_commit_complete(thd);
DBUG_RETURN(0);
err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment