Commit 2f9fb41b authored by inaam's avatar inaam

branches/innodb+

Provide support for native AIO on Linux.

rb://46 approved by: Marko
parent 44e408b9
......@@ -4436,11 +4436,14 @@ fil_aio_wait(
ut_ad(fil_validate());
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
srv_set_io_thread_op_info(segment, "native aio handle");
#ifdef WIN_ASYNC_IO
ret = os_aio_windows_handle(segment, 0, &fil_node,
&message, &type);
#elif defined(LINUX_NATIVE_AIO)
ret = os_aio_linux_handle(segment, &fil_node,
&message, &type);
#else
ret = 0; /* Eliminate compiler warning */
ut_error;
......
......@@ -9573,6 +9573,11 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
"InnoDB version", NULL, NULL, INNODB_VERSION_STR);
static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Use native AIO if supported on this platform.",
NULL, NULL, TRUE);
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(autoextend_increment),
......@@ -9619,6 +9624,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(thread_sleep_delay),
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
MYSQL_SYSVAR(use_native_aio),
NULL
};
......
......@@ -51,12 +51,6 @@ typedef int os_file_t;
extern ulint os_innodb_umask;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
extern ibool os_aio_use_native_aio;
#define OS_FILE_SECTOR_SIZE 512
/* The next value should be smaller or equal to the smallest sector size used
......@@ -98,6 +92,7 @@ log. */
to become available again */
#define OS_FILE_SHARING_VIOLATION 76
#define OS_FILE_ERROR_NOT_SPECIFIED 77
#define OS_FILE_AIO_INTERRUPTED 78
/* Types for aio operations */
#define OS_FILE_READ 10
......@@ -556,9 +551,10 @@ in the three first aio arrays is the parameter n_segments given to the
function. The caller must create an i/o handler thread for each segment in
the four first arrays, but not for the sync aio array. */
UNIV_INTERN
void
ibool
os_aio_init(
/*========*/
/* out: TRUE on success. */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
......@@ -737,4 +733,32 @@ innobase_mysql_tmpfile(void);
/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
#if defined(LINUX_NATIVE_AIO)
/**************************************************************************
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait the
for completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing! */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
/* out: TRUE if the IO was successful */
ulint global_seg, /* in: segment number in the aio array
to wait for; segment 0 is the ibuf
i/o thread, segment 1 is log i/o thread,
then follow the non-ibuf read threads,
and the last are the non-ibuf write
threads. */
fil_node_t**message1, /* out: the messages passed with the */
void** message2, /* aio request; note that in case the
aio operation failed, these output
parameters are valid and can be used to
restart the operation. */
ulint* type); /* out: OS_FILE_WRITE or ..._READ */
#endif /* LINUX_NATIVE_AIO */
#endif
......@@ -68,6 +68,11 @@ extern ulint srv_check_file_format_at_startup;
on duplicate key checking and foreign key checking */
extern ibool srv_locks_unsafe_for_binlog;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
extern my_bool srv_use_native_aio;
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
......
......@@ -162,6 +162,9 @@ operations (very slow); also UNIV_DEBUG must be defined */
for compressed pages */
#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
more often */
#define UNIV_AIO_DEBUG /* prints info about
submitted and reaped AIO
requests to the log. */
#endif
#define UNIV_BTR_DEBUG /* check B-tree links */
......
......@@ -22,6 +22,10 @@ Created 10/21/1995 Heikki Tuuri
#include <errno.h>
#endif /* UNIV_HOTBACKUP */
#if defined(LINUX_NATIVE_AIO)
#include <libaio.h>
#endif
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */
......@@ -49,11 +53,59 @@ UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 64
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
/**********************************************************************
InnoDB AIO Implementation:
=========================
We support native AIO for windows and linux. For rest of the platforms
we simulate AIO by special io-threads servicing the IO-requests.
Simulated AIO:
==============
In platforms where we 'simulate' AIO following is a rough explanation
of the high level design.
There are four io-threads (for ibuf, log, read, write).
All synchronous IO requests are serviced by the calling thread using
os_file_write/os_file_read. The Asynchronous requests are queued up
in an array (there are four such arrays) by the calling thread.
Later these requests are picked up by the io-thread and are serviced
synchronously.
Windows native AIO:
==================
If srv_use_native_aio is not set then windows follow the same
code as simulated AIO. If the flag is set then native AIO interface
is used. On windows, one of the limitation is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO. No thread is
required for the sync array.
If a synchronous IO request is made, it is first queued in the sync
array. Then the calling thread itself waits on the request, thus
making the call synchronous.
If an AIO request is made the calling thread not only queues it in the
array but also submits the requests. The helper thread then collects
the completed IO request and calls completion routine on it.
Linux native AIO:
=================
If we have libaio installed on the system and innodb_use_native_aio
is set to TRUE we follow the code path of native AIO, otherwise we
do simulated AIO.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO.
If a synchronous IO request is made, it is handled by calling
os_file_write/os_file_read.
If an AIO request is made the calling thread not only queues it in the
array but also submits the requests. The helper thread then collects
the completed IO request and calls completion routine on it.
**********************************************************************/
UNIV_INTERN ibool os_aio_print_debug = FALSE;
......@@ -90,6 +142,10 @@ struct os_aio_slot_struct{
OVERLAPPED struct */
OVERLAPPED control; /* Windows control block for the
aio request */
#elif defined(LINUX_NATIVE_AIO)
struct iocb control; /* Linux control block for aio */
int n_bytes; /* bytes written/read. */
int ret; /* AIO return code */
#endif
};
......@@ -109,6 +165,10 @@ struct os_aio_array_struct{
ulint n_segments;/* Number of segments in the aio array of
pending aio requests. A thread can wait
separately for any one of the segments. */
ulint cur_seg; /* We reserve IO requests in round robin
to different segments. This points to the
segment that is to be used to service
next IO request. */
ulint n_reserved;/* Number of reserved slots in the
aio array outside the ibuf segment */
os_aio_slot_t* slots; /* Pointer to the slots in the array */
......@@ -120,8 +180,31 @@ struct os_aio_array_struct{
in WaitForMultipleObjects; used only in
Windows */
#endif
#if defined(LINUX_NATIVE_AIO)
io_context_t* aio_ctx;
/* completion queue for IO. There is
one such queue per segment. Each thread
will work on one ctx exclusively. */
struct io_event* aio_events;
/* The array to collect completed IOs.
There is one such event for each
possible pending IO. The size of the
array is equal to n_slots. */
#endif
};
#if defined(LINUX_NATIVE_AIO)
/* timeout for each io_getevents() call = 500ms. */
#define OS_AIO_REAP_TIMEOUT (500000000UL)
/* time to sleep, in microseconds if io_setup() returns EAGAIN. */
#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
/* number of attempts before giving up on io_setup(). */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
#endif
/* Array of events used in simulated aio */
static os_event_t* os_aio_segment_wait_events = NULL;
......@@ -133,6 +216,7 @@ static os_aio_array_t* os_aio_ibuf_array = NULL;
static os_aio_array_t* os_aio_log_array = NULL;
static os_aio_array_t* os_aio_sync_array = NULL;
/* Total number of segments. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;
/* If the following is TRUE, read i/o handler threads try to
......@@ -320,17 +404,29 @@ os_file_get_last_error(
fflush(stderr);
if (err == ENOSPC) {
switch (err) {
case ENOSPC:
return(OS_FILE_DISK_FULL);
} else if (err == ENOENT) {
case ENOENT:
return(OS_FILE_NOT_FOUND);
} else if (err == EEXIST) {
case EEXIST:
return(OS_FILE_ALREADY_EXISTS);
} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
case EXDEV:
case ENOTDIR:
case EISDIR:
return(OS_FILE_PATH_ERROR);
} else {
return(100 + err);
case EAGAIN:
if (srv_use_native_aio) {
return(OS_FILE_AIO_RESOURCES_RESERVED);
}
break;
case EINTR:
if (srv_use_native_aio) {
return(OS_FILE_AIO_INTERRUPTED);
}
break;
}
return(100 + err);
#endif
}
......@@ -380,6 +476,9 @@ os_file_handle_error_cond_exit(
return(FALSE);
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
return(TRUE);
} else if (err == OS_FILE_AIO_INTERRUPTED) {
return(TRUE);
} else if (err == OS_FILE_ALREADY_EXISTS
|| err == OS_FILE_PATH_ERROR) {
......@@ -1188,7 +1287,7 @@ try_again:
buffering of writes in the OS */
attributes = 0;
#ifdef WIN_ASYNC_IO
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
attributes = attributes | FILE_FLAG_OVERLAPPED;
}
#endif
......@@ -2851,13 +2950,103 @@ os_aio_array_get_nth_slot(
return((array->slots) + index);
}
/****************************************************************************
Creates an aio wait array. */
#if defined(LINUX_NATIVE_AIO)
/**********************************************************************
Creates an io_context for native linux AIO. */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
/* out: TRUE on success. */
ulint max_events, /* in: number of events. */
io_context_t* io_ctx) /* out: io_ctx to initialize. */
{
int ret;
ulint retries = 0;
retry:
memset(io_ctx, 0x0, sizeof(*io_ctx));
/* Initialize the io_ctx. Tell it how many pending
IO requests this context will handle. */
ret = io_setup(max_events, io_ctx);
if (ret == 0) {
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"InnoDB: Linux native AIO:"
" initialized io_ctx for segment\n");
#endif
/* Success. Return now. */
return(TRUE);
}
/* If we hit EAGAIN we'll make a few attempts before failing. */
switch (ret) {
case -EAGAIN:
if (retries == 0) {
/* First time around. */
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: io_setup() failed"
" with EAGAIN. Will make %d attempts"
" before giving up.\n",
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
}
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
++retries;
fprintf(stderr,
"InnoDB: Warning: io_setup() attempt"
" %lu failed.\n",
retries);
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
goto retry;
}
/* Have tried enough. Better call it a day. */
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: io_setup() failed"
" with EAGAIN after %d attempts.\n",
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
break;
case -ENOSYS:
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: Linux Native AIO interface"
" is not supported on this platform. Please"
" check your OS documentation and install"
" appropriate binary of InnoDB.\n");
break;
default:
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: Linux Native AIO setup"
" returned following error[%d]\n", -ret);
break;
}
fprintf(stderr,
"InnoDB: You can disable Linux Native AIO by"
" setting innodb_native_aio = off in my.cnf\n");
return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
/**********************************************************************
Creates an aio wait array. Note that we return NULL in case of failure.
We don't care about freeing memory here because we assume that a
failure will result in server refusing to start up. */
static
os_aio_array_t*
os_aio_array_create(
/*================*/
/* out, own: aio array */
/* out, own: aio array, NULL on failure */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments) /* in: number of segments in the aio array */
......@@ -2867,6 +3056,8 @@ os_aio_array_create(
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
OVERLAPPED* over;
#elif defined(LINUX_NATIVE_AIO)
struct io_event* io_event = NULL;
#endif
ut_a(n > 0);
ut_a(n_segments > 0);
......@@ -2882,10 +3073,44 @@ os_aio_array_create(
array->n_slots = n;
array->n_segments = n_segments;
array->n_reserved = 0;
array->cur_seg = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
#endif
#if defined(LINUX_NATIVE_AIO)
/* If we are not using native aio interface then skip this
part of initialization. */
if (!srv_use_native_aio) {
goto skip_native_aio;
}
/* Initialize the io_context array. One io_context
per segment in the array. */
array->aio_ctx = ut_malloc(n_segments *
sizeof(*array->aio_ctx));
for (i = 0; i < n_segments; ++i) {
if (!os_aio_linux_create_io_ctx(n/n_segments,
&array->aio_ctx[i])) {
/* If something bad happened during aio setup
we should call it a day and return right away.
We don't care about any leaks because a failure
to initialize the io subsystem means that the
server (or atleast the innodb storage engine)
is not going to startup. */
return(NULL);
}
}
/* Initialize the event array. One event per slot. */
io_event = ut_malloc(n * sizeof(*io_event));
memset(io_event, 0x0, sizeof(*io_event) * n);
array->aio_events = io_event;
skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
for (i = 0; i < n; i++) {
slot = os_aio_array_get_nth_slot(array, i);
......@@ -2899,6 +3124,12 @@ os_aio_array_create(
over->hEvent = slot->event->handle;
*((array->native_events) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO)
memset(&slot->control, 0x0, sizeof(slot->control));
slot->n_bytes = 0;
slot->ret = 0;
#endif
}
......@@ -2915,9 +3146,10 @@ in the three first aio arrays is the parameter n_segments given to the
function. The caller must create an i/o handler thread for each segment in
the four first arrays, but not for the sync aio array. */
UNIV_INTERN
void
ibool
os_aio_init(
/*========*/
/* out: TRUE on success. */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
......@@ -2945,15 +3177,25 @@ os_aio_init(
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
if (os_aio_ibuf_array == NULL) {
goto err_exit;
}
srv_io_thread_function[0] = "insert buffer thread";
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
if (os_aio_log_array == NULL) {
goto err_exit;
}
srv_io_thread_function[1] = "log thread";
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
n_read_segs);
if (os_aio_read_array == NULL) {
goto err_exit;
}
for (i = 2; i < 2 + n_read_segs; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "read thread";
......@@ -2961,12 +3203,20 @@ os_aio_init(
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
n_write_segs);
if (os_aio_write_array == NULL) {
goto err_exit;
}
for (i = 2 + n_read_segs; i < n_segments; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "write thread";
}
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
if (os_aio_sync_array == NULL) {
goto err_exit;
}
os_aio_n_segments = n_segments;
......@@ -2980,6 +3230,11 @@ os_aio_init(
os_last_printout = time(NULL);
return(TRUE);
err_exit:
return(FALSE);
}
#ifdef WIN_ASYNC_IO
......@@ -3017,6 +3272,19 @@ os_aio_wake_all_threads_at_shutdown(void)
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
#elif defined(LINUX_NATIVE_AIO)
/* When using native AIO interface the io helper threads
wait on io_getevents with a timeout value of 500ms. At
each wake up these threads check the server status.
No need to do anything to wake them up. */
if (srv_use_native_aio) {
return;
}
/* Fall through to simulated AIO handler wakeup if we are
not using native AIO. */
#endif
/* This loop wakes up all simulated ai/o threads */
......@@ -3135,18 +3403,25 @@ os_aio_array_reserve_slot(
offset */
ulint len) /* in: length of the block to read or write */
{
os_aio_slot_t* slot;
os_aio_slot_t* slot = NULL;
#ifdef WIN_ASYNC_IO
OVERLAPPED* control;
#elif defined(LINUX_NATIVE_AIO)
struct iocb* iocb;
off_t aio_offset;
#endif
ulint i;
ulint n;
loop:
os_mutex_enter(array->mutex);
if (array->n_reserved == array->n_slots) {
os_mutex_exit(array->mutex);
if (!os_aio_use_native_aio) {
if (!srv_use_native_aio) {
/* If the handler threads are suspended, wake them
so that we get more slots */
......@@ -3158,14 +3433,38 @@ loop:
goto loop;
}
/* First try to allocate a slot from the next segment in
round robin. */
ut_a(array->cur_seg < array->n_segments);
n = array->n_slots / array->n_segments;
for (i = array->cur_seg * n; i < ((array->cur_seg + 1) * n); i++) {
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
goto found;
}
}
ut_ad(i < array->n_slots);
array->cur_seg = (array->cur_seg + 1) % array->n_segments;
/* If we are unable to find a slot in our desired segment we do
a linear search of entire array. We are guaranteed to find a
slot in linear search. */
for (i = 0;; i++) {
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
break;
goto found;
}
}
/* We MUST always be able to get hold of a reserved slot. */
ut_error;
found:
ut_ad(!slot->reserved);
array->n_reserved++;
if (array->n_reserved == 1) {
......@@ -3194,8 +3493,42 @@ loop:
control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high;
os_event_reset(slot->event);
#endif
#elif defined(LINUX_NATIVE_AIO)
/* If we are not using native AIO skip this part. */
if (!srv_use_native_aio) {
goto skip_native_aio;
}
/* Check if we are dealing with 64 bit arch.
If not then make sure that offset fits in 32 bits. */
if (sizeof(aio_offset) == 8) {
aio_offset = offset_high;
aio_offset <<= 32;
aio_offset += offset;
} else {
ut_a(offset_high == 0);
aio_offset = offset;
}
iocb = &slot->control;
if (type == OS_FILE_READ) {
io_prep_pread(iocb, file, buf, len, aio_offset);
} else {
ut_a(type == OS_FILE_WRITE);
io_prep_pwrite(iocb, file, buf, len, aio_offset);
}
iocb->data = (void*)slot;
slot->n_bytes = 0;
slot->ret = 0;
/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
os_mutex_exit(array->mutex);
return(slot);
......@@ -3230,7 +3563,23 @@ os_aio_array_free_slot(
}
#ifdef WIN_ASYNC_IO
os_event_reset(slot->event);
#elif defined(LINUX_NATIVE_AIO)
if (srv_use_native_aio) {
memset(&slot->control, 0x0, sizeof(slot->control));
slot->n_bytes = 0;
slot->ret = 0;
/*fprintf(stderr, "Freed up Linux native slot.\n");*/
} else {
/* These fields should not be used if we are not
using native AIO. */
ut_ad(slot->n_bytes == 0);
ut_ad(slot->ret == 0);
}
#endif
os_mutex_exit(array->mutex);
}
......@@ -3250,7 +3599,7 @@ os_aio_simulated_wake_handler_thread(
ulint n;
ulint i;
ut_ad(!os_aio_use_native_aio);
ut_ad(!srv_use_native_aio);
segment = os_aio_get_array_and_local_segment(&array, global_segment);
......@@ -3286,7 +3635,7 @@ os_aio_simulated_wake_handler_threads(void)
{
ulint i;
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
/* We do not use simulated aio: do nothing */
return;
......@@ -3324,6 +3673,54 @@ os_aio_simulated_put_read_threads_to_sleep(void)
}
}
#if defined(LINUX_NATIVE_AIO)
/***********************************************************************
Dispatch an AIO request to the kernel. */
static
ibool
os_aio_linux_dispatch(
/*==================*/
/* out: TRUE on success. */
os_aio_array_t* array, /* in: io request array. */
os_aio_slot_t* slot) /* in: an already reserved slot. */
{
int ret;
ulint io_ctx_index;
struct iocb* iocb;
ut_ad(slot != NULL);
ut_ad(array);
ut_a(slot->reserved);
/* Find out what we are going to work with.
The iocb struct is directly in the slot.
The io_context is one per segment. */
iocb = &slot->control;
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
#endif
/* io_submit returns number of successfully
queued requests or -errno. */
if (UNIV_UNLIKELY(ret != 1)) {
errno = -ret;
return(FALSE);
}
return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
/***********************************************************************
Requests an asynchronous i/o operation. */
UNIV_INTERN
......@@ -3372,7 +3769,6 @@ os_aio(
void* dummy_mess2;
ulint dummy_type;
#endif
ulint err = 0;
ibool retry;
ulint wake_later;
......@@ -3388,7 +3784,7 @@ os_aio(
if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
&& !os_aio_use_native_aio
&& !srv_use_native_aio
#endif
) {
/* This is actually an ordinary synchronous read or write:
......@@ -3428,6 +3824,11 @@ try_again:
array = os_aio_log_array;
} else if (mode == OS_AIO_SYNC) {
array = os_aio_sync_array;
#if defined(LINUX_NATIVE_AIO)
/* In Linux native AIO we don't use sync IO array. */
ut_a(!srv_use_native_aio);
#endif
} else {
array = NULL; /* Eliminate compiler warning */
ut_error;
......@@ -3436,13 +3837,17 @@ try_again:
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
name, buf, offset, offset_high, n);
if (type == OS_FILE_READ) {
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
if (srv_use_native_aio) {
os_n_file_reads++;
os_bytes_read_since_printout += len;
os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
if (!os_aio_linux_dispatch(array, slot)) {
goto err_exit;
}
#endif
} else {
if (!wake_later) {
......@@ -3452,11 +3857,16 @@ try_again:
}
}
} else if (type == OS_FILE_WRITE) {
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
if (srv_use_native_aio) {
os_n_file_writes++;
#ifdef WIN_ASYNC_IO
ret = WriteFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
if (!os_aio_linux_dispatch(array, slot)) {
goto err_exit;
}
#endif
} else {
if (!wake_later) {
......@@ -3470,7 +3880,7 @@ try_again:
}
#ifdef WIN_ASYNC_IO
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
if ((ret && len == n)
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
/* aio was queued successfully! */
......@@ -3493,15 +3903,13 @@ try_again:
return(TRUE);
}
err = 1; /* Fall through the next if */
goto err_exit;
}
#endif
if (err == 0) {
/* aio was queued successfully! */
return(TRUE);
}
err_exit:
os_aio_array_free_slot(array, slot);
retry = os_file_handle_error(name,
......@@ -3604,7 +4012,9 @@ os_aio_windows_handle(
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write) {
ut_a(TRUE == os_file_flush(slot->file));
if (!os_file_flush(slot->file)) {
ut_error;
}
}
#endif /* UNIV_DO_FLUSH */
} else {
......@@ -3621,6 +4031,257 @@ os_aio_windows_handle(
}
#endif
#if defined(LINUX_NATIVE_AIO)
/**********************************************************************
This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavy loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents(). */
static
void
os_aio_linux_collect(
/*=================*/
os_aio_array_t* array, /* in/out: slot array. */
ulint segment, /* in: local segment no. */
ulint seg_size) /* in: segment size. */
{
int i;
int ret;
ulint start_pos;
ulint end_pos;
struct timespec timeout;
struct io_event* events;
struct io_context* io_ctx;
/* sanity checks. */
ut_ad(array != NULL);
ut_ad(seg_size > 0);
ut_ad(segment < array->n_segments);
/* Which part of event array we are going to work on. */
events = &array->aio_events[segment * seg_size];
/* Which io_context we are going to use. */
io_ctx = array->aio_ctx[segment];
/* Starting point of the segment we will be working on. */
start_pos = segment * seg_size;
/* End point. */
end_pos = start_pos + seg_size;
retry:
/* Go down if we are in shutdown mode.
In case of srv_fast_shutdown == 2, there may be pending
IO requests but that should be OK as we essentially treat
that as a crash of InnoDB. */
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
os_thread_exit(NULL);
}
/* Initialize the events. The timeout value is arbitrary.
We probably need to experiment with it a little. */
memset(events, 0, sizeof(*events) * seg_size);
timeout.tv_sec = 0;
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
/* This error handling is for any error in collecting the
IO requests. The errors, if any, for any particular IO
request are simply passed on to the calling routine. */
/* Not enough resources! Try again. */
if (ret == -EAGAIN) {
goto retry;
}
/* Interrupted! I have tested the behaviour in case of an
interrupt. If we have some completed IOs available then
the return code will be the number of IOs. We get EINTR only
if there are no completed IOs and we have been interrupted. */
if (ret == -EINTR) {
goto retry;
}
/* No pending request! Go back and check again. */
if (ret == 0) {
goto retry;
}
/* All other errors! should cause a trap for now. */
if (UNIV_UNLIKELY(ret < 0)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: unexpected ret_code[%d] from"
" io_getevents()!\n", ret);
ut_error;
}
ut_a(ret > 0);
for (i = 0; i < ret; i++) {
os_aio_slot_t* slot;
struct iocb* control;
control = (struct iocb *)events[i].obj;
ut_a(control != NULL);
slot = (os_aio_slot_t *) control->data;
/* Some sanity checks. */
ut_a(slot != NULL);
ut_a(slot->reserved);
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"io_getevents[%c]: slot[%p] ctx[%p]"
" seg[%lu]\n",
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
slot, io_ctx, segment);
#endif
/* We are not scribbling previous segment. */
ut_a(slot->pos >= start_pos);
/* We have not overstepped to next segment. */
ut_a(slot->pos < end_pos);
/* Mark this request as completed. The error handling
will be done in the calling function. */
os_mutex_enter(array->mutex);
slot->n_bytes = events[i].res;
slot->ret = events[i].res2;
slot->io_already_done = TRUE;
os_mutex_exit(array->mutex);
}
return;
}
/**************************************************************************
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing! */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
/* out: TRUE if the IO was successful */
ulint global_seg, /* in: segment number in the aio array
to wait for; segment 0 is the ibuf
i/o thread, segment 1 is log i/o thread,
then follow the non-ibuf read threads,
and the last are the non-ibuf write
threads. */
fil_node_t**message1, /* out: the messages passed with the */
void** message2, /* aio request; note that in case the
aio operation failed, these output
parameters are valid and can be used to
restart the operation. */
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
ulint segment;
os_aio_array_t* array;
os_aio_slot_t* slot;
ulint n;
ulint i;
ibool ret = FALSE;
/* Should never be doing Sync IO here. */
ut_a(global_seg != ULINT_UNDEFINED);
/* Find the array and the local segment. */
segment = os_aio_get_array_and_local_segment(&array, global_seg);
n = array->n_slots / array->n_segments;
/* Loop until we have found a completed request. */
for (;;) {
os_mutex_enter(array->mutex);
for (i = 0; i < n; ++i) {
slot = os_aio_array_get_nth_slot(
array, i + segment * n);
if (slot->reserved && slot->io_already_done) {
/* Something for us to work on. */
goto found;
}
}
os_mutex_exit(array->mutex);
/* We don't have any completed request.
Wait for some request. Note that we return
from wait iff we have found a request. */
srv_set_io_thread_op_info(global_seg,
"waiting for completed aio requests");
os_aio_linux_collect(array, segment, n);
}
found:
/* Note that it may be that there are more then one completed
IO requests. We process them one at a time. We may have a case
here to improve the performance slightly by dealing with all
requests in one sweep. */
srv_set_io_thread_op_info(global_seg,
"processing completed aio requests");
/* Ensure that we are scribbling only our segment. */
ut_a(i < n);
ut_ad(slot != NULL);
ut_ad(slot->reserved);
ut_ad(slot->io_already_done);
*message1 = slot->message1;
*message2 = slot->message2;
*type = slot->type;
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
ret = TRUE;
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write)
&& !os_file_flush(slot->file) {
ut_error;
}
#endif /* UNIV_DO_FLUSH */
} else {
errno = -slot->ret;
/* os_file_handle_error does tell us if we should retry
this IO. As it stands now, we don't do this retry when
reaping requests from a different context than
the dispatcher. This non-retry logic is the same for
windows and linux native AIO.
We should probably look into this to transparently
re-submit the IO. */
os_file_handle_error(slot->name, "Linux aio");
ret = FALSE;
}
os_mutex_exit(array->mutex);
os_aio_array_free_slot(array, slot);
return(ret);
}
#endif /* LINUX_NATIVE_AIO */
/**************************************************************************
Does simulated aio. This function should be called by an i/o-handler
thread. */
......@@ -3995,6 +4656,40 @@ os_aio_validate(void)
return(TRUE);
}
/**************************************************************************
Prints pending IO requests per segment of an aio array.
We probably don't need per segment statistics but they can help us
during development phase to see if the IO requests are being
distributed as expected. */
static
void
os_aio_print_segment_info(
/*======================*/
FILE* file, /* in: file where to print */
ulint* n_seg, /* in: pending IO array */
os_aio_array_t* array) /* in: array to process */
{
ulint i;
ut_ad(array);
ut_ad(n_seg);
ut_ad(array->n_segments > 0);
if (array->n_segments == 1) {
return;
}
fprintf(file, " [");
for (i = 0; i < array->n_segments; i++) {
if (i != 0) {
fprintf(file, ", ");
}
fprintf(file, "%lu", n_seg[i]);
}
fprintf(file, "] ");
}
/**************************************************************************
Prints info of the aio arrays. */
UNIV_INTERN
......@@ -4006,6 +4701,7 @@ os_aio_print(
os_aio_array_t* array;
os_aio_slot_t* slot;
ulint n_reserved;
ulint n_res_seg[SRV_MAX_N_IO_THREADS];
time_t current_time;
double time_elapsed;
double avg_bytes_read;
......@@ -4038,11 +4734,15 @@ loop:
n_reserved = 0;
memset(n_res_seg, 0x0, sizeof(n_res_seg));
for (i = 0; i < array->n_slots; i++) {
slot = os_aio_array_get_nth_slot(array, i);
ulint seg_no = (i * array->n_segments) / array->n_slots;
if (slot->reserved) {
n_reserved++;
n_res_seg[seg_no]++;
#if 0
fprintf(stderr, "Reserved slot, messages %p %p\n",
(void*) slot->message1,
......@@ -4056,6 +4756,8 @@ loop:
fprintf(file, " %lu", (ulong) n_reserved);
os_aio_print_segment_info(file, n_res_seg, array);
os_mutex_exit(array->mutex);
if (array == os_aio_read_array) {
......
......@@ -12,6 +12,14 @@ MYSQL_PLUGIN_ACTIONS(innobase, [
AC_C_BIGENDIAN
case "$target_os" in
lin*)
AC_CHECK_HEADER(libaio.h,
AC_CHECK_LIB(aio, io_setup,
LIBS="$LIBS -laio"
AC_DEFINE(LINUX_NATIVE_AIO, [1],
[Linux native async I/O support]),
AC_MSG_WARN([No Linux native async I/O])),
AC_MSG_WARN([No Linux native async I/O]))
CFLAGS="$CFLAGS -DUNIV_LINUX";;
hpux10*)
CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
......
......@@ -102,6 +102,12 @@ UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
on duplicate key checking and foreign key checking */
UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE;
UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */
......
......@@ -969,6 +969,7 @@ innobase_start_or_create_for_mysql(void)
ibool log_file_created;
ibool log_created = FALSE;
ibool log_opened = FALSE;
ibool success;
ib_uint64_t min_flushed_lsn;
ib_uint64_t max_flushed_lsn;
#ifdef UNIV_LOG_ARCHIVE
......@@ -1071,7 +1072,6 @@ innobase_start_or_create_for_mysql(void)
srv_is_being_started = TRUE;
srv_startup_is_before_trx_rollback_phase = TRUE;
os_aio_use_native_aio = FALSE;
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
......@@ -1083,12 +1083,30 @@ innobase_start_or_create_for_mysql(void)
but when run in conjunction with InnoDB Hot Backup, it seemed
to corrupt the data files. */
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else {
/* On Win 2000 and XP use async i/o */
os_aio_use_native_aio = TRUE;
srv_use_native_aio = TRUE;
}
#elif defined(LINUX_NATIVE_AIO)
if (srv_use_native_aio) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Using Linux native AIO\n");
}
#else
/* Currently native AIO is supported only on windows and linux
and that also when the support is compiled in. In all other
cases, we ignore the setting of innodb_use_native_aio. */
/* TODO: comment this out after internal testing. */
fprintf(stderr, "Ignoring innodb_use_native_aio\n");
srv_use_native_aio = FALSE;
#endif
if (srv_file_flush_method_str == NULL) {
/* These are the default options */
......@@ -1113,11 +1131,11 @@ innobase_start_or_create_for_mysql(void)
#else
} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"async_unbuffered")) {
......@@ -1210,19 +1228,38 @@ innobase_start_or_create_for_mysql(void)
srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
}
if (!os_aio_use_native_aio) {
if (!srv_use_native_aio) {
/* In simulated aio we currently have use only for 4 threads */
srv_n_file_io_threads = 4;
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
success = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD *
srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
if (!success) {
return(DB_ERROR);
}
} else {
os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
/* Windows has a pending IO per thread limit.
Linux does not have any such restriction.
The question of what should be segment size
is a trade off. The larger size means longer
linear searches through the array and a smaller
value can lead to array being full, causing
unnecessary delays. The following value
for Linux is fairly arbitrary and needs to be
tested and tuned. */
success = os_aio_init(
#if defined(LINUX_NATIVE_AIO)
8 *
#endif /* LINUX_NATIVE_AIO */
SRV_N_PENDING_IOS_PER_THREAD *
srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
if (!success) {
return(DB_ERROR);
}
}
fil_init(srv_max_n_open_files);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment