Commit 0802e5da authored by Calvin Sun

Improve InnoDB synchronization primitives on Windows

This patch was originally developed by Vladislav Vaintroub.
The main changes are:

 * Use TryEnterCriticalSection in os_fast_mutex_trylock().
 * Use lightweight condition variables on Vista or later Windows;
   but fall back to events on older Windows, such as XP.

This patch also fixes the following bugs:
  bug# 52102 InnoDB Plugin shows performance drop compared to InnoDB
             on Windows
  bug# 53204 os_fastmutex_trylock is implemented incorrectly on Windows

rb://363 approved by Inaam Rana
parent 17fd8dec
...@@ -188,11 +188,7 @@ IF(SIZEOF_PTHREAD_T) ...@@ -188,11 +188,7 @@ IF(SIZEOF_PTHREAD_T)
ENDIF() ENDIF()
IF(MSVC) IF(MSVC)
# Windows atomics do not perform well. Disable Windows atomics by default. ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
# See bug#52102 for details.
#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
ENDIF() ENDIF()
......
...@@ -177,6 +177,13 @@ log. */ ...@@ -177,6 +177,13 @@ log. */
#define OS_WIN95 2 /*!< Microsoft Windows 95 */ #define OS_WIN95 2 /*!< Microsoft Windows 95 */
#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */ #define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */ #define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
#define OS_WINXP 5 /*!< Microsoft Windows XP
or Windows Server 2003 */
#define OS_WINVISTA 6 /*!< Microsoft Windows Vista
or Windows Server 2008 */
#define OS_WIN7 7 /*!< Microsoft Windows 7
or Windows Server 2008 R2 */
extern ulint os_n_file_reads; extern ulint os_n_file_reads;
extern ulint os_n_file_writes; extern ulint os_n_file_writes;
...@@ -368,7 +375,8 @@ typedef DIR* os_file_dir_t; /*!< directory stream */ ...@@ -368,7 +375,8 @@ typedef DIR* os_file_dir_t; /*!< directory stream */
/***********************************************************************//** /***********************************************************************//**
Gets the operating system version. Currently works only on Windows. Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */ @return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
OS_WIN7. */
UNIV_INTERN UNIV_INTERN
ulint ulint
os_get_os_version(void); os_get_os_version(void);
......
...@@ -38,28 +38,18 @@ Created 9/6/1995 Heikki Tuuri ...@@ -38,28 +38,18 @@ Created 9/6/1995 Heikki Tuuri
#include "ut0lst.h" #include "ut0lst.h"
#ifdef __WIN__ #ifdef __WIN__
/** Native event (slow)*/
/** Native mutex */
#define os_fast_mutex_t CRITICAL_SECTION
/** Native event */
typedef HANDLE os_native_event_t; typedef HANDLE os_native_event_t;
/** Native mutex */
/** Operating system event */ typedef CRITICAL_SECTION os_fast_mutex_t;
typedef struct os_event_struct os_event_struct_t; /** Native condition variable. */
/** Operating system event handle */ typedef CONDITION_VARIABLE os_cond_t;
typedef os_event_struct_t* os_event_t;
/** An asynchronous signal sent between threads */
struct os_event_struct {
os_native_event_t handle;
/*!< Windows event */
UT_LIST_NODE_T(os_event_struct_t) os_event_list;
/*!< list of all created events */
};
#else #else
/** Native mutex */ /** Native mutex */
typedef pthread_mutex_t os_fast_mutex_t; typedef pthread_mutex_t os_fast_mutex_t;
/** Native condition variable */
typedef pthread_cond_t os_cond_t;
#endif
/** Operating system event */ /** Operating system event */
typedef struct os_event_struct os_event_struct_t; typedef struct os_event_struct os_event_struct_t;
...@@ -68,6 +58,10 @@ typedef os_event_struct_t* os_event_t; ...@@ -68,6 +58,10 @@ typedef os_event_struct_t* os_event_t;
/** An asynchronous signal sent between threads */ /** An asynchronous signal sent between threads */
struct os_event_struct { struct os_event_struct {
#ifdef __WIN__
HANDLE handle; /*!< kernel event object, slow,
used on older Windows */
#endif
os_fast_mutex_t os_mutex; /*!< this mutex protects the next os_fast_mutex_t os_mutex; /*!< this mutex protects the next
fields */ fields */
ibool is_set; /*!< this is TRUE when the event is ibool is_set; /*!< this is TRUE when the event is
...@@ -76,24 +70,17 @@ struct os_event_struct { ...@@ -76,24 +70,17 @@ struct os_event_struct {
this event */ this event */
ib_int64_t signal_count; /*!< this is incremented each time ib_int64_t signal_count; /*!< this is incremented each time
the event becomes signaled */ the event becomes signaled */
pthread_cond_t cond_var; /*!< condition variable is used in os_cond_t cond_var; /*!< condition variable is used in
waiting for the event */ waiting for the event */
UT_LIST_NODE_T(os_event_struct_t) os_event_list; UT_LIST_NODE_T(os_event_struct_t) os_event_list;
/*!< list of all created events */ /*!< list of all created events */
}; };
#endif
/** Operating system mutex */ /** Operating system mutex */
typedef struct os_mutex_struct os_mutex_str_t; typedef struct os_mutex_struct os_mutex_str_t;
/** Operating system mutex handle */ /** Operating system mutex handle */
typedef os_mutex_str_t* os_mutex_t; typedef os_mutex_str_t* os_mutex_t;
/** Denotes an infinite delay for os_event_wait_time() */
#define OS_SYNC_INFINITE_TIME ((ulint)(-1))
/** Return value of os_event_wait_time() when the time is exceeded */
#define OS_SYNC_TIME_EXCEEDED 1
/** Mutex protecting counts and the event and OS 'slow' mutex lists */ /** Mutex protecting counts and the event and OS 'slow' mutex lists */
extern os_mutex_t os_sync_mutex; extern os_mutex_t os_sync_mutex;
...@@ -187,42 +174,14 @@ os_event_wait_low( ...@@ -187,42 +174,14 @@ os_event_wait_low(
#define os_event_wait(event) os_event_wait_low(event, 0) #define os_event_wait(event) os_event_wait_low(event, 0)
/**********************************************************//**
Waits for an event object until it is in the signaled state or
a timeout is exceeded. In Unix the timeout is always infinite.
@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
UNIV_INTERN
ulint
os_event_wait_time(
/*===============*/
os_event_t event, /*!< in: event to wait */
ulint time); /*!< in: timeout in microseconds, or
OS_SYNC_INFINITE_TIME */
#ifdef __WIN__
/**********************************************************//**
Waits for any event in an OS native event array. Returns if even a single
one is signaled or becomes signaled.
@return index of the event which was signaled */
UNIV_INTERN
ulint
os_event_wait_multiple(
/*===================*/
ulint n, /*!< in: number of events in the
array */
os_native_event_t* native_event_array);
/*!< in: pointer to an array of event
handles */
#endif
/*********************************************************//** /*********************************************************//**
Creates an operating system mutex semaphore. Because these are slow, the Creates an operating system mutex semaphore. Because these are slow, the
mutex semaphore of InnoDB itself (mutex_t) should be used where possible. mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
@return the mutex handle */ @return the mutex handle */
UNIV_INTERN UNIV_INTERN
os_mutex_t os_mutex_t
os_mutex_create( os_mutex_create(void);
/*============*/ /*=================*/
const char* name); /*!< in: the name of the mutex, if NULL
the mutex is created without a name */
/**********************************************************//** /**********************************************************//**
Acquires ownership of a mutex semaphore. */ Acquires ownership of a mutex semaphore. */
UNIV_INTERN UNIV_INTERN
......
...@@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri ...@@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri
#endif #endif
/**********************************************************//** /**********************************************************//**
Acquires ownership of a fast mutex. Currently in Windows this is the same Acquires ownership of a fast mutex.
as os_fast_mutex_lock!
@return 0 if success, != 0 if was reserved by another thread */ @return 0 if success, != 0 if was reserved by another thread */
UNIV_INLINE UNIV_INLINE
ulint ulint
...@@ -38,9 +37,13 @@ os_fast_mutex_trylock( ...@@ -38,9 +37,13 @@ os_fast_mutex_trylock(
os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
{ {
#ifdef __WIN__ #ifdef __WIN__
EnterCriticalSection(fast_mutex); if (TryEnterCriticalSection(fast_mutex)) {
return(0); return(0);
} else {
return(1);
}
#else #else
/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
so that it returns 0 on success. In the operating system so that it returns 0 on success. In the operating system
......
...@@ -112,6 +112,9 @@ OS (provided we compiled Innobase with it in), otherwise we will ...@@ -112,6 +112,9 @@ OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads. use simulated aio we build below with threads.
Currently we support native aio on windows and linux */ Currently we support native aio on windows and linux */
extern my_bool srv_use_native_aio; extern my_bool srv_use_native_aio;
#ifdef __WIN__
extern ibool srv_use_native_conditions;
#endif
extern ulint srv_n_data_files; extern ulint srv_n_data_files;
extern char** srv_data_file_names; extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes; extern ulint* srv_data_file_sizes;
......
...@@ -183,7 +183,7 @@ struct os_aio_slot_struct{ ...@@ -183,7 +183,7 @@ struct os_aio_slot_struct{
which pending aio operation was which pending aio operation was
completed */ completed */
#ifdef WIN_ASYNC_IO #ifdef WIN_ASYNC_IO
os_event_t event; /*!< event object we need in the HANDLE handle; /*!< handle object we need in the
OVERLAPPED struct */ OVERLAPPED struct */
OVERLAPPED control; /*!< Windows control block for the OVERLAPPED control; /*!< Windows control block for the
aio request */ aio request */
...@@ -225,7 +225,7 @@ struct os_aio_array_struct{ ...@@ -225,7 +225,7 @@ struct os_aio_array_struct{
aio array outside the ibuf segment */ aio array outside the ibuf segment */
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
#ifdef __WIN__ #ifdef __WIN__
os_native_event_t* native_events; HANDLE* handles;
/*!< Pointer to an array of OS native /*!< Pointer to an array of OS native
event handles where we copied the event handles where we copied the
handles from slots, in the same handles from slots, in the same
...@@ -304,7 +304,8 @@ UNIV_INTERN ulint os_n_pending_reads = 0; ...@@ -304,7 +304,8 @@ UNIV_INTERN ulint os_n_pending_reads = 0;
/***********************************************************************//** /***********************************************************************//**
Gets the operating system version. Currently works only on Windows. Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ @return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
OS_WIN7. */
UNIV_INTERN UNIV_INTERN
ulint ulint
os_get_os_version(void) os_get_os_version(void)
...@@ -322,10 +323,18 @@ os_get_os_version(void) ...@@ -322,10 +323,18 @@ os_get_os_version(void)
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
return(OS_WIN95); return(OS_WIN95);
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
if (os_info.dwMajorVersion <= 4) { switch (os_info.dwMajorVersion) {
return(OS_WINNT); case 3:
} else { case 4:
return(OS_WIN2000); return OS_WINNT;
case 5:
return (os_info.dwMinorVersion == 0) ? OS_WIN2000
: OS_WINXP;
case 6:
return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
: OS_WIN7;
default:
return OS_WIN7;
} }
} else { } else {
ut_error; ut_error;
...@@ -673,10 +682,10 @@ os_io_init_simple(void) ...@@ -673,10 +682,10 @@ os_io_init_simple(void)
{ {
ulint i; ulint i;
os_file_count_mutex = os_mutex_create(NULL); os_file_count_mutex = os_mutex_create();
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
os_file_seek_mutexes[i] = os_mutex_create(NULL); os_file_seek_mutexes[i] = os_mutex_create();
} }
} }
...@@ -3217,7 +3226,7 @@ os_aio_array_create( ...@@ -3217,7 +3226,7 @@ os_aio_array_create(
array = ut_malloc(sizeof(os_aio_array_t)); array = ut_malloc(sizeof(os_aio_array_t));
array->mutex = os_mutex_create(NULL); array->mutex = os_mutex_create();
array->not_full = os_event_create(NULL); array->not_full = os_event_create(NULL);
array->is_empty = os_event_create(NULL); array->is_empty = os_event_create(NULL);
...@@ -3229,7 +3238,7 @@ os_aio_array_create( ...@@ -3229,7 +3238,7 @@ os_aio_array_create(
array->cur_seg = 0; array->cur_seg = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__ #ifdef __WIN__
array->native_events = ut_malloc(n * sizeof(os_native_event_t)); array->handles = ut_malloc(n * sizeof(HANDLE));
#endif #endif
#if defined(LINUX_NATIVE_AIO) #if defined(LINUX_NATIVE_AIO)
...@@ -3273,13 +3282,13 @@ os_aio_array_create( ...@@ -3273,13 +3282,13 @@ os_aio_array_create(
slot->pos = i; slot->pos = i;
slot->reserved = FALSE; slot->reserved = FALSE;
#ifdef WIN_ASYNC_IO #ifdef WIN_ASYNC_IO
slot->event = os_event_create(NULL); slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
over = &(slot->control); over = &(slot->control);
over->hEvent = slot->event->handle; over->hEvent = slot->handle;
*((array->native_events) + i) = over->hEvent; *((array->handles) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO) #elif defined(LINUX_NATIVE_AIO)
...@@ -3305,12 +3314,12 @@ os_aio_array_free( ...@@ -3305,12 +3314,12 @@ os_aio_array_free(
for (i = 0; i < array->n_slots; i++) { for (i = 0; i < array->n_slots; i++) {
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
os_event_free(slot->event); CloseHandle(slot->handle);
} }
#endif /* WIN_ASYNC_IO */ #endif /* WIN_ASYNC_IO */
#ifdef __WIN__ #ifdef __WIN__
ut_free(array->native_events); ut_free(array->handles);
#endif /* __WIN__ */ #endif /* __WIN__ */
os_mutex_free(array->mutex); os_mutex_free(array->mutex);
os_event_free(array->not_full); os_event_free(array->not_full);
...@@ -3463,7 +3472,7 @@ os_aio_array_wake_win_aio_at_shutdown( ...@@ -3463,7 +3472,7 @@ os_aio_array_wake_win_aio_at_shutdown(
for (i = 0; i < array->n_slots; i++) { for (i = 0; i < array->n_slots; i++) {
os_event_set((array->slots + i)->event); SetEvent((array->slots + i)->handle);
} }
} }
#endif #endif
...@@ -3702,7 +3711,7 @@ os_aio_array_reserve_slot( ...@@ -3702,7 +3711,7 @@ os_aio_array_reserve_slot(
control = &(slot->control); control = &(slot->control);
control->Offset = (DWORD)offset; control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high; control->OffsetHigh = (DWORD)offset_high;
os_event_reset(slot->event); ResetEvent(slot->handle);
#elif defined(LINUX_NATIVE_AIO) #elif defined(LINUX_NATIVE_AIO)
...@@ -3774,7 +3783,7 @@ os_aio_array_free_slot( ...@@ -3774,7 +3783,7 @@ os_aio_array_free_slot(
#ifdef WIN_ASYNC_IO #ifdef WIN_ASYNC_IO
os_event_reset(slot->event); ResetEvent(slot->handle);
#elif defined(LINUX_NATIVE_AIO) #elif defined(LINUX_NATIVE_AIO)
...@@ -4208,13 +4217,20 @@ os_aio_windows_handle( ...@@ -4208,13 +4217,20 @@ os_aio_windows_handle(
n = array->n_slots / array->n_segments; n = array->n_slots / array->n_segments;
if (array == os_aio_sync_array) { if (array == os_aio_sync_array) {
os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); WaitForSingleObject(
os_aio_array_get_nth_slot(array, pos)->handle,
INFINITE);
i = pos; i = pos;
} else { } else {
srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
i = os_event_wait_multiple(n, i = WaitForMultipleObjects((DWORD) n,
(array->native_events) array->handles + segment * n,
+ segment * n); FALSE,
INFINITE);
}
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
os_thread_exit(NULL);
} }
os_mutex_enter(array->mutex); os_mutex_enter(array->mutex);
......
This diff is collapsed.
...@@ -252,7 +252,7 @@ os_thread_yield(void) ...@@ -252,7 +252,7 @@ os_thread_yield(void)
/*=================*/ /*=================*/
{ {
#if defined(__WIN__) #if defined(__WIN__)
Sleep(0); SwitchToThread();
#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H)) #elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
sched_yield(); sched_yield();
#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG) #elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
......
...@@ -142,6 +142,21 @@ use simulated aio we build below with threads. ...@@ -142,6 +142,21 @@ use simulated aio we build below with threads.
Currently we support native aio on windows and linux */ Currently we support native aio on windows and linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE; UNIV_INTERN my_bool srv_use_native_aio = TRUE;
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
pointers, because they are not available on Windows Server 2003 and
Windows XP/2000.
We use condition variables for events on Windows if possible, even though
os_event resembles the Windows kernel event object well API-wise. The reason
is performance: kernel objects are heavyweight, and WaitForSingleObject() is a
performance killer that forces the calling thread to context switch. Besides,
InnoDB preallocates a large number (often millions) of os_events. With kernel
event objects this takes a big chunk out of the non-paged pool, which is
better used for tasks like IO than for storing idle event objects. */
UNIV_INTERN ibool srv_use_native_conditions = FALSE;
#endif /* __WIN__ */
UNIV_INTERN ulint srv_n_data_files = 0; UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL; UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */ /* size in database pages */
......
...@@ -1160,9 +1160,17 @@ innobase_start_or_create_for_mysql(void) ...@@ -1160,9 +1160,17 @@ innobase_start_or_create_for_mysql(void)
srv_use_native_aio = FALSE; srv_use_native_aio = FALSE;
break; break;
case OS_WIN2000:
case OS_WINXP:
/* On 2000 and XP, async IO is available. */
srv_use_native_aio = TRUE;
break;
default: default:
/* On Win 2000 and XP use async i/o */ /* Vista and later have both async IO and condition variables */
srv_use_native_aio = TRUE; srv_use_native_aio = TRUE;
srv_use_native_conditions = TRUE;
break; break;
} }
......
...@@ -250,7 +250,7 @@ sync_array_create( ...@@ -250,7 +250,7 @@ sync_array_create(
/* Then create the mutex to protect the wait array complex */ /* Then create the mutex to protect the wait array complex */
if (protection == SYNC_ARRAY_OS_MUTEX) { if (protection == SYNC_ARRAY_OS_MUTEX) {
arr->os_mutex = os_mutex_create(NULL); arr->os_mutex = os_mutex_create();
} else if (protection == SYNC_ARRAY_MUTEX) { } else if (protection == SYNC_ARRAY_MUTEX) {
mutex_create(syn_arr_mutex_key, mutex_create(syn_arr_mutex_key,
&arr->mutex, SYNC_NO_ORDER_CHECK); &arr->mutex, SYNC_NO_ORDER_CHECK);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment