Commit 2f9fb41b authored by inaam

branches/innodb+

Provide support for native AIO on Linux.

rb://46 approved by: Marko
parent 44e408b9
......@@ -4436,11 +4436,14 @@ fil_aio_wait(
ut_ad(fil_validate());
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
srv_set_io_thread_op_info(segment, "native aio handle");
#ifdef WIN_ASYNC_IO
ret = os_aio_windows_handle(segment, 0, &fil_node,
&message, &type);
#elif defined(LINUX_NATIVE_AIO)
ret = os_aio_linux_handle(segment, &fil_node,
&message, &type);
#else
ret = 0; /* Eliminate compiler warning */
ut_error;
......
......@@ -9573,6 +9573,11 @@ static MYSQL_SYSVAR_STR(version, innodb_version_str,
PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
"InnoDB version", NULL, NULL, INNODB_VERSION_STR);
static MYSQL_SYSVAR_BOOL(use_native_aio, srv_use_native_aio,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Use native AIO if supported on this platform.",
NULL, NULL, TRUE);
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(autoextend_increment),
......@@ -9619,6 +9624,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(thread_sleep_delay),
MYSQL_SYSVAR(autoinc_lock_mode),
MYSQL_SYSVAR(version),
MYSQL_SYSVAR(use_native_aio),
NULL
};
......
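A note on the new innodb_use_native_aio option registered above: because it is declared with PLUGIN_VAR_READONLY, it can only be set at server startup, not at runtime. A minimal configuration sketch (the option name is assumed from the innodb_ prefix plus the sysvar name use_native_aio):

[mysqld]
# Fall back to simulated AIO; this variable is read-only at runtime.
innodb_use_native_aio = 0

With the default of TRUE, InnoDB attempts native AIO wherever the support was compiled in.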
......@@ -51,12 +51,6 @@ typedef int os_file_t;
extern ulint os_innodb_umask;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
extern ibool os_aio_use_native_aio;
#define OS_FILE_SECTOR_SIZE 512
/* The next value should be smaller or equal to the smallest sector size used
......@@ -98,6 +92,7 @@ log. */
to become available again */
#define OS_FILE_SHARING_VIOLATION 76
#define OS_FILE_ERROR_NOT_SPECIFIED 77
#define OS_FILE_AIO_INTERRUPTED 78
/* Types for aio operations */
#define OS_FILE_READ 10
......@@ -556,9 +551,10 @@ in the three first aio arrays is the parameter n_segments given to the
function. The caller must create an i/o handler thread for each segment in
the four first arrays, but not for the sync aio array. */
UNIV_INTERN
void
ibool
os_aio_init(
/*========*/
/* out: TRUE on success. */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
......@@ -737,4 +733,32 @@ innobase_mysql_tmpfile(void);
/* out: temporary file descriptor, or < 0 on error */
#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
#if defined(LINUX_NATIVE_AIO)
/**************************************************************************
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing! */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
/* out: TRUE if the IO was successful */
ulint global_seg, /* in: segment number in the aio array
to wait for; segment 0 is the ibuf
i/o thread, segment 1 is log i/o thread,
then follow the non-ibuf read threads,
and the last are the non-ibuf write
threads. */
fil_node_t**message1, /* out: the messages passed with the */
void** message2, /* aio request; note that in case the
aio operation failed, these output
parameters are valid and can be used to
restart the operation. */
ulint* type); /* out: OS_FILE_WRITE or ..._READ */
#endif /* LINUX_NATIVE_AIO */
#endif
......@@ -68,6 +68,11 @@ extern ulint srv_check_file_format_at_startup;
on duplicate key checking and foreign key checking */
extern ibool srv_locks_unsafe_for_binlog;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use the simulated aio that we build with threads.
Currently we support native aio on Windows and Linux */
extern my_bool srv_use_native_aio;
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
......
......@@ -162,6 +162,9 @@ operations (very slow); also UNIV_DEBUG must be defined */
for compressed pages */
#define UNIV_ZIP_COPY /* call page_zip_copy_recs()
more often */
#define UNIV_AIO_DEBUG /* prints info about
submitted and reaped AIO
requests to the log. */
#endif
#define UNIV_BTR_DEBUG /* check B-tree links */
......
......@@ -22,6 +22,10 @@ Created 10/21/1995 Heikki Tuuri
#include <errno.h>
#endif /* UNIV_HOTBACKUP */
#if defined(LINUX_NATIVE_AIO)
#include <libaio.h>
#endif
/* This specifies the file permissions InnoDB uses when it creates files in
Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
my_umask */
......@@ -49,11 +53,59 @@ UNIV_INTERN os_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
/* In simulated aio, merge at most this many consecutive i/os */
#define OS_AIO_MERGE_N_CONSECUTIVE 64
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads */
UNIV_INTERN ibool os_aio_use_native_aio = FALSE;
/**********************************************************************
InnoDB AIO Implementation:
=========================
We support native AIO for Windows and Linux. For the rest of the platforms
we simulate AIO with special io-threads servicing the IO-requests.
Simulated AIO:
==============
On platforms where we 'simulate' AIO, the following is a rough explanation
of the high-level design.
There are four io-threads (for ibuf, log, read, write).
All synchronous IO requests are serviced by the calling thread using
os_file_write/os_file_read. The Asynchronous requests are queued up
in an array (there are four such arrays) by the calling thread.
Later these requests are picked up by the io-thread and are serviced
synchronously.
Windows native AIO:
==================
If srv_use_native_aio is not set then Windows follows the same
code path as simulated AIO. If the flag is set then the native AIO
interface is used. On Windows, one limitation is that if a file is opened
for AIO no synchronous IO can be done on it. Therefore we have an
extra fifth array to queue up synchronous IO requests.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO. No thread is
required for the sync array.
If a synchronous IO request is made, it is first queued in the sync
array. Then the calling thread itself waits on the request, thus
making the call synchronous.
If an AIO request is made the calling thread not only queues it in the
array but also submits the request. The helper thread then collects
the completed IO request and calls the completion routine on it.
Linux native AIO:
=================
If we have libaio installed on the system and innodb_use_native_aio
is set to TRUE, we follow the code path of native AIO, otherwise we
do simulated AIO.
There are innodb_file_io_threads helper threads. These threads work
on the four arrays mentioned above in Simulated AIO.
If a synchronous IO request is made, it is handled by calling
os_file_write/os_file_read.
If an AIO request is made the calling thread not only queues it in the
array but also submits the request. The helper thread then collects
the completed IO request and calls the completion routine on it.
**********************************************************************/
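The design notes above map onto a handful of libaio calls. The following is a minimal standalone sketch of the same submit/reap lifecycle (illustrative only, not part of the patch; the file name is made up; error handling is reduced to asserts; compile with gcc aio_demo.c -laio):

#include <libaio.h>
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	io_context_t	ctx;
	struct iocb	cb;
	struct iocb*	cbs[1];
	struct io_event	ev;
	struct timespec	timeout = {0, 500000000UL};	/* 500ms, as in the patch */
	char		buf[512] = "hello, native aio";
	int		fd;
	int		ret;

	memset(&ctx, 0x0, sizeof(ctx));
	ret = io_setup(8, &ctx);	/* cf. os_aio_linux_create_io_ctx() */
	assert(ret == 0);

	fd = open("aio_demo.dat", O_CREAT | O_WRONLY, 0644);
	assert(fd >= 0);

	/* Prepare and dispatch, cf. os_aio_array_reserve_slot()
	and os_aio_linux_dispatch(). */
	io_prep_pwrite(&cb, fd, buf, sizeof(buf), 0);
	cb.data = buf;			/* InnoDB stores the slot pointer here */
	cbs[0] = &cb;
	ret = io_submit(ctx, 1, cbs);	/* returns #queued or -errno */
	assert(ret == 1);

	/* Reap with a timed wait, cf. os_aio_linux_collect(). */
	ret = io_getevents(ctx, 1, 1, &ev, &timeout);
	assert(ret == 1);
	assert(ev.obj == &cb && (long) ev.res == (long) sizeof(buf));

	io_destroy(ctx);
	close(fd);
	return(0);
}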
UNIV_INTERN ibool os_aio_print_debug = FALSE;
......@@ -90,6 +142,10 @@ struct os_aio_slot_struct{
OVERLAPPED struct */
OVERLAPPED control; /* Windows control block for the
aio request */
#elif defined(LINUX_NATIVE_AIO)
struct iocb control; /* Linux control block for aio */
int n_bytes; /* bytes written/read. */
int ret; /* AIO return code */
#endif
};
......@@ -109,6 +165,10 @@ struct os_aio_array_struct{
ulint n_segments;/* Number of segments in the aio array of
pending aio requests. A thread can wait
separately for any one of the segments. */
ulint cur_seg; /* We reserve IO requests in round robin
to different segments. This points to the
segment that is to be used to service
next IO request. */
ulint n_reserved;/* Number of reserved slots in the
aio array outside the ibuf segment */
os_aio_slot_t* slots; /* Pointer to the slots in the array */
......@@ -120,8 +180,31 @@ struct os_aio_array_struct{
in WaitForMultipleObjects; used only in
Windows */
#endif
#if defined(LINUX_NATIVE_AIO)
io_context_t* aio_ctx;
/* completion queue for IO. There is
one such queue per segment. Each thread
will work on one ctx exclusively. */
struct io_event* aio_events;
/* The array to collect completed IOs.
There is one such event for each
possible pending IO. The size of the
array is equal to n_slots. */
#endif
};
#if defined(LINUX_NATIVE_AIO)
/* timeout for each io_getevents() call = 500ms. */
#define OS_AIO_REAP_TIMEOUT (500000000UL)
/* time to sleep, in microseconds if io_setup() returns EAGAIN. */
#define OS_AIO_IO_SETUP_RETRY_SLEEP (500000UL)
/* number of attempts before giving up on io_setup(). */
#define OS_AIO_IO_SETUP_RETRY_ATTEMPTS 5
#endif
/* Array of events used in simulated aio */
static os_event_t* os_aio_segment_wait_events = NULL;
......@@ -133,6 +216,7 @@ static os_aio_array_t* os_aio_ibuf_array = NULL;
static os_aio_array_t* os_aio_log_array = NULL;
static os_aio_array_t* os_aio_sync_array = NULL;
/* Total number of segments. */
static ulint os_aio_n_segments = ULINT_UNDEFINED;
/* If the following is TRUE, read i/o handler threads try to
......@@ -320,17 +404,29 @@ os_file_get_last_error(
fflush(stderr);
if (err == ENOSPC) {
switch (err) {
case ENOSPC:
return(OS_FILE_DISK_FULL);
} else if (err == ENOENT) {
case ENOENT:
return(OS_FILE_NOT_FOUND);
} else if (err == EEXIST) {
case EEXIST:
return(OS_FILE_ALREADY_EXISTS);
} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
case EXDEV:
case ENOTDIR:
case EISDIR:
return(OS_FILE_PATH_ERROR);
} else {
return(100 + err);
case EAGAIN:
if (srv_use_native_aio) {
return(OS_FILE_AIO_RESOURCES_RESERVED);
}
break;
case EINTR:
if (srv_use_native_aio) {
return(OS_FILE_AIO_INTERRUPTED);
}
break;
}
return(100 + err);
#endif
}
......@@ -380,6 +476,9 @@ os_file_handle_error_cond_exit(
return(FALSE);
} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
return(TRUE);
} else if (err == OS_FILE_AIO_INTERRUPTED) {
return(TRUE);
} else if (err == OS_FILE_ALREADY_EXISTS
|| err == OS_FILE_PATH_ERROR) {
......@@ -1188,7 +1287,7 @@ try_again:
buffering of writes in the OS */
attributes = 0;
#ifdef WIN_ASYNC_IO
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
attributes = attributes | FILE_FLAG_OVERLAPPED;
}
#endif
......@@ -2851,13 +2950,103 @@ os_aio_array_get_nth_slot(
return((array->slots) + index);
}
/****************************************************************************
Creates an aio wait array. */
#if defined(LINUX_NATIVE_AIO)
/**********************************************************************
Creates an io_context for native linux AIO. */
static
ibool
os_aio_linux_create_io_ctx(
/*=======================*/
/* out: TRUE on success. */
ulint max_events, /* in: number of events. */
io_context_t* io_ctx) /* out: io_ctx to initialize. */
{
int ret;
ulint retries = 0;
retry:
memset(io_ctx, 0x0, sizeof(*io_ctx));
/* Initialize the io_ctx. Tell it how many pending
IO requests this context will handle. */
ret = io_setup(max_events, io_ctx);
if (ret == 0) {
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"InnoDB: Linux native AIO:"
" initialized io_ctx for segment\n");
#endif
/* Success. Return now. */
return(TRUE);
}
/* If we hit EAGAIN we'll make a few attempts before failing. */
switch (ret) {
case -EAGAIN:
if (retries == 0) {
/* First time around. */
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Warning: io_setup() failed"
" with EAGAIN. Will make %d attempts"
" before giving up.\n",
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
}
if (retries < OS_AIO_IO_SETUP_RETRY_ATTEMPTS) {
++retries;
fprintf(stderr,
"InnoDB: Warning: io_setup() attempt"
" %lu failed.\n",
retries);
os_thread_sleep(OS_AIO_IO_SETUP_RETRY_SLEEP);
goto retry;
}
/* Have tried enough. Better call it a day. */
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: io_setup() failed"
" with EAGAIN after %d attempts.\n",
OS_AIO_IO_SETUP_RETRY_ATTEMPTS);
break;
case -ENOSYS:
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: Linux Native AIO interface"
" is not supported on this platform. Please"
" check your OS documentation and install"
" appropriate binary of InnoDB.\n");
break;
default:
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Error: Linux Native AIO setup"
" returned following error[%d]\n", -ret);
break;
}
fprintf(stderr,
"InnoDB: You can disable Linux Native AIO by"
" setting innodb_native_aio = off in my.cnf\n");
return(FALSE);
}
#endif /* LINUX_NATIVE_AIO */
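A note on the EAGAIN branch above: on Linux, io_setup() fails with EAGAIN when the requested number of events would push the system past the kernel-wide limit fs.aio-max-nr. A small diagnostic sketch (illustrative only, not part of the patch) that reads the current limit:

#include <stdio.h>

/* Illustrative helper: returns the kernel-wide AIO request limit,
or -1 if it cannot be read. Exceeding this limit is what makes
io_setup() return EAGAIN. */
static long
aio_max_nr(void)
{
	FILE*	f = fopen("/proc/sys/fs/aio-max-nr", "r");
	long	limit = -1;

	if (f != NULL) {
		if (fscanf(f, "%ld", &limit) != 1) {
			limit = -1;
		}
		fclose(f);
	}

	return(limit);
}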
/**********************************************************************
Creates an aio wait array. Note that we return NULL in case of failure.
We don't care about freeing memory here because we assume that a
failure will result in the server refusing to start up.
static
os_aio_array_t*
os_aio_array_create(
/*================*/
/* out, own: aio array */
/* out, own: aio array, NULL on failure */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments) /* in: number of segments in the aio array */
......@@ -2867,6 +3056,8 @@ os_aio_array_create(
os_aio_slot_t* slot;
#ifdef WIN_ASYNC_IO
OVERLAPPED* over;
#elif defined(LINUX_NATIVE_AIO)
struct io_event* io_event = NULL;
#endif
ut_a(n > 0);
ut_a(n_segments > 0);
......@@ -2882,10 +3073,44 @@ os_aio_array_create(
array->n_slots = n;
array->n_segments = n_segments;
array->n_reserved = 0;
array->cur_seg = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
#endif
#if defined(LINUX_NATIVE_AIO)
/* If we are not using native aio interface then skip this
part of initialization. */
if (!srv_use_native_aio) {
goto skip_native_aio;
}
/* Initialize the io_context array. One io_context
per segment in the array. */
array->aio_ctx = ut_malloc(n_segments *
sizeof(*array->aio_ctx));
for (i = 0; i < n_segments; ++i) {
if (!os_aio_linux_create_io_ctx(n/n_segments,
&array->aio_ctx[i])) {
/* If something bad happened during aio setup
we should call it a day and return right away.
We don't care about any leaks because a failure
to initialize the io subsystem means that the
server (or at least the innodb storage engine)
is not going to start up. */
return(NULL);
}
}
/* Initialize the event array. One event per slot. */
io_event = ut_malloc(n * sizeof(*io_event));
memset(io_event, 0x0, sizeof(*io_event) * n);
array->aio_events = io_event;
skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
for (i = 0; i < n; i++) {
slot = os_aio_array_get_nth_slot(array, i);
......@@ -2899,6 +3124,12 @@ os_aio_array_create(
over->hEvent = slot->event->handle;
*((array->native_events) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO)
memset(&slot->control, 0x0, sizeof(slot->control));
slot->n_bytes = 0;
slot->ret = 0;
#endif
}
......@@ -2915,9 +3146,10 @@ in the three first aio arrays is the parameter n_segments given to the
function. The caller must create an i/o handler thread for each segment in
the four first arrays, but not for the sync aio array. */
UNIV_INTERN
void
ibool
os_aio_init(
/*========*/
/* out: TRUE on success. */
ulint n, /* in: maximum number of pending aio operations
allowed; n must be divisible by n_segments */
ulint n_segments, /* in: combined number of segments in the four
......@@ -2945,15 +3177,25 @@ os_aio_init(
/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
if (os_aio_ibuf_array == NULL) {
goto err_exit;
}
srv_io_thread_function[0] = "insert buffer thread";
os_aio_log_array = os_aio_array_create(n_per_seg, 1);
if (os_aio_log_array == NULL) {
goto err_exit;
}
srv_io_thread_function[1] = "log thread";
os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
n_read_segs);
if (os_aio_read_array == NULL) {
goto err_exit;
}
for (i = 2; i < 2 + n_read_segs; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "read thread";
......@@ -2961,12 +3203,20 @@ os_aio_init(
os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
n_write_segs);
if (os_aio_write_array == NULL) {
goto err_exit;
}
for (i = 2 + n_read_segs; i < n_segments; i++) {
ut_a(i < SRV_MAX_N_IO_THREADS);
srv_io_thread_function[i] = "write thread";
}
os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
if (os_aio_sync_array == NULL) {
goto err_exit;
}
os_aio_n_segments = n_segments;
......@@ -2980,6 +3230,11 @@ os_aio_init(
os_last_printout = time(NULL);
return(TRUE);
err_exit:
return(FALSE);
}
#ifdef WIN_ASYNC_IO
......@@ -3017,6 +3272,19 @@ os_aio_wake_all_threads_at_shutdown(void)
os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
#elif defined(LINUX_NATIVE_AIO)
/* When using the native AIO interface the io helper threads
wait on io_getevents() with a timeout value of 500ms. At
each wakeup these threads check the server status.
No need to do anything to wake them up. */
if (srv_use_native_aio) {
return;
}
/* Fall through to simulated AIO handler wakeup if we are
not using native AIO. */
#endif
/* This loop wakes up all simulated ai/o threads */
......@@ -3135,18 +3403,25 @@ os_aio_array_reserve_slot(
offset */
ulint len) /* in: length of the block to read or write */
{
os_aio_slot_t* slot;
os_aio_slot_t* slot = NULL;
#ifdef WIN_ASYNC_IO
OVERLAPPED* control;
#elif defined(LINUX_NATIVE_AIO)
struct iocb* iocb;
off_t aio_offset;
#endif
ulint i;
ulint n;
loop:
os_mutex_enter(array->mutex);
if (array->n_reserved == array->n_slots) {
os_mutex_exit(array->mutex);
if (!os_aio_use_native_aio) {
if (!srv_use_native_aio) {
/* If the handler threads are suspended, wake them
so that we get more slots */
......@@ -3158,14 +3433,38 @@ loop:
goto loop;
}
/* First try to allocate a slot from the next segment in
round robin. */
ut_a(array->cur_seg < array->n_segments);
n = array->n_slots / array->n_segments;
for (i = array->cur_seg * n; i < ((array->cur_seg + 1) * n); i++) {
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
goto found;
}
}
ut_ad(i <= array->n_slots);
array->cur_seg = (array->cur_seg + 1) % array->n_segments;
/* If we are unable to find a slot in our desired segment we do
a linear search of the entire array. Because we checked above,
under the mutex, that the array is not completely full, the
linear search is guaranteed to find a slot. */
for (i = 0;; i++) {
slot = os_aio_array_get_nth_slot(array, i);
if (slot->reserved == FALSE) {
break;
goto found;
}
}
/* We MUST always be able to get hold of a reserved slot. */
ut_error;
found:
ut_ad(!slot->reserved);
array->n_reserved++;
if (array->n_reserved == 1) {
......@@ -3194,8 +3493,42 @@ loop:
control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high;
os_event_reset(slot->event);
#endif
#elif defined(LINUX_NATIVE_AIO)
/* If we are not using native AIO skip this part. */
if (!srv_use_native_aio) {
goto skip_native_aio;
}
/* Check if we are dealing with a 64-bit arch.
If not, then make sure that the offset fits in 32 bits. */
if (sizeof(aio_offset) == 8) {
aio_offset = offset_high;
aio_offset <<= 32;
aio_offset += offset;
} else {
ut_a(offset_high == 0);
aio_offset = offset;
}
iocb = &slot->control;
if (type == OS_FILE_READ) {
io_prep_pread(iocb, file, buf, len, aio_offset);
} else {
ut_a(type == OS_FILE_WRITE);
io_prep_pwrite(iocb, file, buf, len, aio_offset);
}
iocb->data = (void*)slot;
slot->n_bytes = 0;
slot->ret = 0;
/*fprintf(stderr, "Filled up Linux native iocb.\n");*/
skip_native_aio:
#endif /* LINUX_NATIVE_AIO */
os_mutex_exit(array->mutex);
return(slot);
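As a worked example of the offset assembly above (the numbers are made up): with offset_high = 1 and offset = 0x200, a 64-bit aio_offset becomes (1 << 32) + 0x200, i.e. 4 GiB + 512 bytes; on a platform where the offset type is 32 bits, the ut_a(offset_high == 0) assertion refuses any IO beyond the first 4 GiB of a file.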
......@@ -3230,7 +3563,23 @@ os_aio_array_free_slot(
}
#ifdef WIN_ASYNC_IO
os_event_reset(slot->event);
#elif defined(LINUX_NATIVE_AIO)
if (srv_use_native_aio) {
memset(&slot->control, 0x0, sizeof(slot->control));
slot->n_bytes = 0;
slot->ret = 0;
/*fprintf(stderr, "Freed up Linux native slot.\n");*/
} else {
/* These fields should not be used if we are not
using native AIO. */
ut_ad(slot->n_bytes == 0);
ut_ad(slot->ret == 0);
}
#endif
os_mutex_exit(array->mutex);
}
......@@ -3250,7 +3599,7 @@ os_aio_simulated_wake_handler_thread(
ulint n;
ulint i;
ut_ad(!os_aio_use_native_aio);
ut_ad(!srv_use_native_aio);
segment = os_aio_get_array_and_local_segment(&array, global_segment);
......@@ -3286,7 +3635,7 @@ os_aio_simulated_wake_handler_threads(void)
{
ulint i;
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
/* We do not use simulated aio: do nothing */
return;
......@@ -3324,6 +3673,54 @@ os_aio_simulated_put_read_threads_to_sleep(void)
}
}
#if defined(LINUX_NATIVE_AIO)
/***********************************************************************
Dispatch an AIO request to the kernel. */
static
ibool
os_aio_linux_dispatch(
/*==================*/
/* out: TRUE on success. */
os_aio_array_t* array, /* in: io request array. */
os_aio_slot_t* slot) /* in: an already reserved slot. */
{
int ret;
ulint io_ctx_index;
struct iocb* iocb;
ut_ad(slot != NULL);
ut_ad(array);
ut_a(slot->reserved);
/* Find out what we are going to work with.
The iocb struct is directly in the slot.
The io_context is one per segment. */
iocb = &slot->control;
io_ctx_index = (slot->pos * array->n_segments) / array->n_slots;
ret = io_submit(array->aio_ctx[io_ctx_index], 1, &iocb);
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"io_submit[%c] ret[%d]: slot[%p] ctx[%p] seg[%lu]\n",
(slot->type == OS_FILE_WRITE) ? 'w' : 'r', ret, slot,
array->aio_ctx[io_ctx_index], (ulong)io_ctx_index);
#endif
/* io_submit returns number of successfully
queued requests or -errno. */
if (UNIV_UNLIKELY(ret != 1)) {
errno = -ret;
return(FALSE);
}
return(TRUE);
}
#endif /* LINUX_NATIVE_AIO */
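Two remarks on the dispatch above. First, the io_ctx_index arithmetic maps a slot to its segment's io_context: for example, with n_slots = 1024 and n_segments = 4, a slot at pos 300 dispatches to context (300 * 4) / 1024 = 1. Second, io_submit() can in general report partial success when submitting batches; the patch always submits a single iocb, so ret != 1 is simply an error, but a more general caller would need something like the following sketch (illustrative only, not part of the patch):

#include <libaio.h>

/* Illustrative only: submit a batch of iocbs, resubmitting the
remainder after a partial success. Returns the number queued,
or the first -errno. */
static int
submit_batch(io_context_t ctx, struct iocb** iocbs, int n)
{
	int	submitted = 0;

	while (submitted < n) {
		int	ret = io_submit(ctx, n - submitted,
					iocbs + submitted);
		if (ret < 0) {
			return(ret);	/* -errno: nothing more queued */
		}
		if (ret == 0) {
			break;		/* defensive: avoid spinning */
		}
		submitted += ret;
	}

	return(submitted);
}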
/***********************************************************************
Requests an asynchronous i/o operation. */
UNIV_INTERN
......@@ -3372,7 +3769,6 @@ os_aio(
void* dummy_mess2;
ulint dummy_type;
#endif
ulint err = 0;
ibool retry;
ulint wake_later;
......@@ -3388,7 +3784,7 @@ os_aio(
if (mode == OS_AIO_SYNC
#ifdef WIN_ASYNC_IO
&& !os_aio_use_native_aio
&& !srv_use_native_aio
#endif
) {
/* This is actually an ordinary synchronous read or write:
......@@ -3428,6 +3824,11 @@ try_again:
array = os_aio_log_array;
} else if (mode == OS_AIO_SYNC) {
array = os_aio_sync_array;
#if defined(LINUX_NATIVE_AIO)
/* In Linux native AIO we don't use sync IO array. */
ut_a(!srv_use_native_aio);
#endif
} else {
array = NULL; /* Eliminate compiler warning */
ut_error;
......@@ -3436,13 +3837,17 @@ try_again:
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
name, buf, offset, offset_high, n);
if (type == OS_FILE_READ) {
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
if (srv_use_native_aio) {
os_n_file_reads++;
os_bytes_read_since_printout += len;
os_bytes_read_since_printout += n;
#ifdef WIN_ASYNC_IO
ret = ReadFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
if (!os_aio_linux_dispatch(array, slot)) {
goto err_exit;
}
#endif
} else {
if (!wake_later) {
......@@ -3452,11 +3857,16 @@ try_again:
}
}
} else if (type == OS_FILE_WRITE) {
if (os_aio_use_native_aio) {
#ifdef WIN_ASYNC_IO
if (srv_use_native_aio) {
os_n_file_writes++;
#ifdef WIN_ASYNC_IO
ret = WriteFile(file, buf, (DWORD)n, &len,
&(slot->control));
#elif defined(LINUX_NATIVE_AIO)
if (!os_aio_linux_dispatch(array, slot)) {
goto err_exit;
}
#endif
} else {
if (!wake_later) {
......@@ -3470,7 +3880,7 @@ try_again:
}
#ifdef WIN_ASYNC_IO
if (os_aio_use_native_aio) {
if (srv_use_native_aio) {
if ((ret && len == n)
|| (!ret && GetLastError() == ERROR_IO_PENDING)) {
/* aio was queued successfully! */
......@@ -3493,15 +3903,13 @@ try_again:
return(TRUE);
}
err = 1; /* Fall through the next if */
goto err_exit;
}
#endif
if (err == 0) {
/* aio was queued successfully! */
return(TRUE);
}
/* aio was queued successfully! */
return(TRUE);
err_exit:
os_aio_array_free_slot(array, slot);
retry = os_file_handle_error(name,
......@@ -3604,7 +4012,9 @@ os_aio_windows_handle(
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write) {
ut_a(TRUE == os_file_flush(slot->file));
if (!os_file_flush(slot->file)) {
ut_error;
}
}
#endif /* UNIV_DO_FLUSH */
} else {
......@@ -3621,6 +4031,257 @@ os_aio_windows_handle(
}
#endif
#if defined(LINUX_NATIVE_AIO)
/**********************************************************************
This function is only used in Linux native asynchronous i/o. This is
called from within the io-thread. If there are no completed IO requests
in the slot array, the thread calls this function to collect more
requests from the kernel.
The io-thread waits on io_getevents(), which is a blocking call, with
a timeout value. Unless the system is very heavily loaded, keeping the
io-thread very busy, the io-thread will spend most of its time waiting
in this function.
The io-thread also exits in this function. It checks server status at
each wakeup and that is why we use timed wait in io_getevents(). */
static
void
os_aio_linux_collect(
/*=================*/
os_aio_array_t* array, /* in/out: slot array. */
ulint segment, /* in: local segment no. */
ulint seg_size) /* in: segment size. */
{
int i;
int ret;
ulint start_pos;
ulint end_pos;
struct timespec timeout;
struct io_event* events;
struct io_context* io_ctx;
/* sanity checks. */
ut_ad(array != NULL);
ut_ad(seg_size > 0);
ut_ad(segment < array->n_segments);
/* Which part of event array we are going to work on. */
events = &array->aio_events[segment * seg_size];
/* Which io_context we are going to use. */
io_ctx = array->aio_ctx[segment];
/* Starting point of the segment we will be working on. */
start_pos = segment * seg_size;
/* End point. */
end_pos = start_pos + seg_size;
retry:
/* Go down if we are in shutdown mode.
In case of srv_fast_shutdown == 2, there may be pending
IO requests but that should be OK as we essentially treat
that as a crash of InnoDB. */
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
os_thread_exit(NULL);
}
/* Initialize the events. The timeout value is arbitrary.
We probably need to experiment with it a little. */
memset(events, 0, sizeof(*events) * seg_size);
timeout.tv_sec = 0;
timeout.tv_nsec = OS_AIO_REAP_TIMEOUT;
ret = io_getevents(io_ctx, 1, seg_size, events, &timeout);
/* This error handling is for any error in collecting the
IO requests. The errors, if any, for any particular IO
request are simply passed on to the calling routine. */
/* Not enough resources! Try again. */
if (ret == -EAGAIN) {
goto retry;
}
/* Interrupted! I have tested the behaviour in case of an
interrupt. If we have some completed IOs available then
the return code will be the number of IOs. We get EINTR only
if there are no completed IOs and we have been interrupted. */
if (ret == -EINTR) {
goto retry;
}
/* No pending request! Go back and check again. */
if (ret == 0) {
goto retry;
}
/* All other errors should cause a trap for now. */
if (UNIV_UNLIKELY(ret < 0)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: unexpected ret_code[%d] from"
" io_getevents()!\n", ret);
ut_error;
}
ut_a(ret > 0);
for (i = 0; i < ret; i++) {
os_aio_slot_t* slot;
struct iocb* control;
control = (struct iocb *)events[i].obj;
ut_a(control != NULL);
slot = (os_aio_slot_t *) control->data;
/* Some sanity checks. */
ut_a(slot != NULL);
ut_a(slot->reserved);
#if defined(UNIV_AIO_DEBUG)
fprintf(stderr,
"io_getevents[%c]: slot[%p] ctx[%p]"
" seg[%lu]\n",
(slot->type == OS_FILE_WRITE) ? 'w' : 'r',
slot, io_ctx, segment);
#endif
/* We are not scribbling on the previous segment. */
ut_a(slot->pos >= start_pos);
/* We have not overstepped into the next segment. */
ut_a(slot->pos < end_pos);
/* Mark this request as completed. The error handling
will be done in the calling function. */
os_mutex_enter(array->mutex);
slot->n_bytes = events[i].res;
slot->ret = events[i].res2;
slot->io_already_done = TRUE;
os_mutex_exit(array->mutex);
}
return;
}
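One clarification on the res/res2 copy above: for read and write iocbs the kernel returns the number of bytes transferred (or a negative errno) in res, and res2 is normally zero on success; the success test in os_aio_linux_handle() below follows directly from that. A minimal checker in the same spirit (illustrative only, not part of the patch):

#include <libaio.h>

/* Illustrative only: classify a reaped io_event the way
os_aio_linux_handle() interprets slot->ret and slot->n_bytes. */
static int
aio_event_ok(const struct io_event* ev, long expected_len)
{
	/* res holds the byte count or a negative errno;
	res2 is expected to be 0 on success. */
	return(ev->res2 == 0 && (long) ev->res == expected_len);
}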
/**************************************************************************
This function is only used in Linux native asynchronous i/o.
Waits for an aio operation to complete. This function is used to wait for
the completed requests. The aio array of pending requests is divided
into segments. The thread specifies which segment or slot it wants to wait
for. NOTE: this function will also take care of freeing the aio slot,
therefore no other thread is allowed to do the freeing! */
UNIV_INTERN
ibool
os_aio_linux_handle(
/*================*/
/* out: TRUE if the IO was successful */
ulint global_seg, /* in: segment number in the aio array
to wait for; segment 0 is the ibuf
i/o thread, segment 1 is log i/o thread,
then follow the non-ibuf read threads,
and the last are the non-ibuf write
threads. */
fil_node_t**message1, /* out: the messages passed with the */
void** message2, /* aio request; note that in case the
aio operation failed, these output
parameters are valid and can be used to
restart the operation. */
ulint* type) /* out: OS_FILE_WRITE or ..._READ */
{
ulint segment;
os_aio_array_t* array;
os_aio_slot_t* slot;
ulint n;
ulint i;
ibool ret = FALSE;
/* Should never be doing Sync IO here. */
ut_a(global_seg != ULINT_UNDEFINED);
/* Find the array and the local segment. */
segment = os_aio_get_array_and_local_segment(&array, global_seg);
n = array->n_slots / array->n_segments;
/* Loop until we have found a completed request. */
for (;;) {
os_mutex_enter(array->mutex);
for (i = 0; i < n; ++i) {
slot = os_aio_array_get_nth_slot(
array, i + segment * n);
if (slot->reserved && slot->io_already_done) {
/* Something for us to work on. */
goto found;
}
}
os_mutex_exit(array->mutex);
/* We don't have any completed request.
Wait for some request. Note that we return
from wait iff we have found a request. */
srv_set_io_thread_op_info(global_seg,
"waiting for completed aio requests");
os_aio_linux_collect(array, segment, n);
}
found:
/* Note that there may be more than one completed
IO request. We process them one at a time. There may be an
opportunity here to improve the performance slightly by dealing
with all requests in one sweep. */
srv_set_io_thread_op_info(global_seg,
"processing completed aio requests");
/* Ensure that we are scribbling only our segment. */
ut_a(i < n);
ut_ad(slot != NULL);
ut_ad(slot->reserved);
ut_ad(slot->io_already_done);
*message1 = slot->message1;
*message2 = slot->message2;
*type = slot->type;
if ((slot->ret == 0) && (slot->n_bytes == (long)slot->len)) {
ret = TRUE;
#ifdef UNIV_DO_FLUSH
if (slot->type == OS_FILE_WRITE
&& !os_do_not_call_flush_at_each_write
&& !os_file_flush(slot->file)) {
ut_error;
}
#endif /* UNIV_DO_FLUSH */
} else {
errno = -slot->ret;
/* os_file_handle_error does tell us if we should retry
this IO. As it stands now, we don't do this retry when
reaping requests from a different context than
the dispatcher. This non-retry logic is the same for
Windows and Linux native AIO.
We should probably look into this to transparently
re-submit the IO. */
os_file_handle_error(slot->name, "Linux aio");
ret = FALSE;
}
os_mutex_exit(array->mutex);
os_aio_array_free_slot(array, slot);
return(ret);
}
#endif /* LINUX_NATIVE_AIO */
/**************************************************************************
Does simulated aio. This function should be called by an i/o-handler
thread. */
......@@ -3995,6 +4656,40 @@ os_aio_validate(void)
return(TRUE);
}
/**************************************************************************
Prints pending IO requests per segment of an aio array.
We probably don't need per-segment statistics, but they can help us
during the development phase to see if the IO requests are being
distributed as expected.
static
void
os_aio_print_segment_info(
/*======================*/
FILE* file, /* in: file where to print */
ulint* n_seg, /* in: pending IO array */
os_aio_array_t* array) /* in: array to process */
{
ulint i;
ut_ad(array);
ut_ad(n_seg);
ut_ad(array->n_segments > 0);
if (array->n_segments == 1) {
return;
}
fprintf(file, " [");
for (i = 0; i < array->n_segments; i++) {
if (i != 0) {
fprintf(file, ", ");
}
fprintf(file, "%lu", n_seg[i]);
}
fprintf(file, "] ");
}
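For illustration, with four read segments and made-up pending counts of 12, 7, 9 and 11, the function above appends the following to the array's line in the SHOW ENGINE INNODB STATUS output:

 [12, 7, 9, 11]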
/**************************************************************************
Prints info of the aio arrays. */
UNIV_INTERN
......@@ -4006,6 +4701,7 @@ os_aio_print(
os_aio_array_t* array;
os_aio_slot_t* slot;
ulint n_reserved;
ulint n_res_seg[SRV_MAX_N_IO_THREADS];
time_t current_time;
double time_elapsed;
double avg_bytes_read;
......@@ -4038,11 +4734,15 @@ loop:
n_reserved = 0;
memset(n_res_seg, 0x0, sizeof(n_res_seg));
for (i = 0; i < array->n_slots; i++) {
slot = os_aio_array_get_nth_slot(array, i);
ulint seg_no = (i * array->n_segments) / array->n_slots;
if (slot->reserved) {
n_reserved++;
n_res_seg[seg_no]++;
#if 0
fprintf(stderr, "Reserved slot, messages %p %p\n",
(void*) slot->message1,
......@@ -4056,6 +4756,8 @@ loop:
fprintf(file, " %lu", (ulong) n_reserved);
os_aio_print_segment_info(file, n_res_seg, array);
os_mutex_exit(array->mutex);
if (array == os_aio_read_array) {
......
......@@ -12,6 +12,14 @@ MYSQL_PLUGIN_ACTIONS(innobase, [
AC_C_BIGENDIAN
case "$target_os" in
lin*)
AC_CHECK_HEADER(libaio.h,
AC_CHECK_LIB(aio, io_setup,
LIBS="$LIBS -laio"
AC_DEFINE(LINUX_NATIVE_AIO, [1],
[Linux native async I/O support]),
AC_MSG_WARN([No Linux native async I/O])),
AC_MSG_WARN([No Linux native async I/O]))
CFLAGS="$CFLAGS -DUNIV_LINUX";;
hpux10*)
CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
......
......@@ -102,6 +102,12 @@ UNIV_INTERN ulint srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
on duplicate key checking and foreign key checking */
UNIV_INTERN ibool srv_locks_unsafe_for_binlog = FALSE;
/* If this flag is TRUE, then we will use the native aio of the
OS (provided we compiled Innobase with it in), otherwise we will
use the simulated aio that we build with threads.
Currently we support native aio on Windows and Linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE;
UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */
......
......@@ -969,6 +969,7 @@ innobase_start_or_create_for_mysql(void)
ibool log_file_created;
ibool log_created = FALSE;
ibool log_opened = FALSE;
ibool success;
ib_uint64_t min_flushed_lsn;
ib_uint64_t max_flushed_lsn;
#ifdef UNIV_LOG_ARCHIVE
......@@ -1071,7 +1072,6 @@ innobase_start_or_create_for_mysql(void)
srv_is_being_started = TRUE;
srv_startup_is_before_trx_rollback_phase = TRUE;
os_aio_use_native_aio = FALSE;
#ifdef __WIN__
if (os_get_os_version() == OS_WIN95
......@@ -1083,12 +1083,30 @@ innobase_start_or_create_for_mysql(void)
but when run in conjunction with InnoDB Hot Backup, it seemed
to corrupt the data files. */
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else {
/* On Win 2000 and XP use async i/o */
os_aio_use_native_aio = TRUE;
srv_use_native_aio = TRUE;
}
#elif defined(LINUX_NATIVE_AIO)
if (srv_use_native_aio) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Using Linux native AIO\n");
}
#else
/* Currently native AIO is supported only on Windows and Linux,
and only when the support is compiled in. In all other
cases, we ignore the setting of innodb_use_native_aio. */
/* TODO: comment this out after internal testing. */
fprintf(stderr, "Ignoring innodb_use_native_aio\n");
srv_use_native_aio = FALSE;
#endif
if (srv_file_flush_method_str == NULL) {
/* These are the default options */
......@@ -1113,11 +1131,11 @@ innobase_start_or_create_for_mysql(void)
#else
} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
os_aio_use_native_aio = FALSE;
srv_use_native_aio = FALSE;
} else if (0 == ut_strcmp(srv_file_flush_method_str,
"async_unbuffered")) {
......@@ -1210,19 +1228,38 @@ innobase_start_or_create_for_mysql(void)
srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
}
if (!os_aio_use_native_aio) {
if (!srv_use_native_aio) {
/* In simulated aio we currently have use only for 4 threads */
srv_n_file_io_threads = 4;
os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
success = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD *
srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
if (!success) {
return(DB_ERROR);
}
} else {
os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
* srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
/* Windows has a limit on pending IOs per thread.
Linux does not have any such restriction.
The question of what the segment size
should be is a trade-off: a larger size means
longer linear searches through the array, while
a smaller value can lead to the array filling up,
causing unnecessary delays. The following value
for Linux is fairly arbitrary and needs to be
tested and tuned. */
success = os_aio_init(
#if defined(LINUX_NATIVE_AIO)
8 *
#endif /* LINUX_NATIVE_AIO */
SRV_N_PENDING_IOS_PER_THREAD *
srv_n_file_io_threads,
srv_n_file_io_threads,
SRV_MAX_N_PENDING_SYNC_IOS);
if (!success) {
return(DB_ERROR);
}
}
fil_init(srv_max_n_open_files);
......
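A worked example of the sizing above, assuming SRV_N_PENDING_IOS_PER_THREAD is 32 (the value we believe this tree uses) and the default of 4 io threads: the simulated path requests 8 * 32 * 4 = 1024 pending slots, the Linux native path the same 1024 (thanks to the extra factor of 8), and Windows native AIO only 32 * 4 = 128, because of the per-thread pending IO limit mentioned in the comment.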