Commit d3ab61c2 authored by Calvin Sun's avatar Calvin Sun

Improve InnoDB synchronization primitives on Windows

This patch was originally developed by Vladislav Vaintroub.
The main changes are:

 * Use TryEnterCriticalSection in os_fast_mutex_trylock().
 * Use lightweight condition variables on Vista or later Windows;
   but fall back to events on older Windows, such as XP.

This patch also fixes the following bugs:
  bug# 52102 InnoDB Plugin shows performance drop compared to InnoDB
             on Windows
  bug# 53204 os_fastmutex_trylock is implemented incorrectly on Windows

rb://363 approved by Inaam Rana
parent bf76fcbb
......@@ -188,11 +188,7 @@ IF(SIZEOF_PTHREAD_T)
ENDIF()
IF(MSVC)
# Windows atomics do not perform well. Disable Windows atomics by default.
# See bug#52102 for details.
#ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION)
ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
ENDIF()
......
......@@ -177,6 +177,13 @@ log. */
#define OS_WIN95 2 /*!< Microsoft Windows 95 */
#define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */
#define OS_WIN2000 4 /*!< Microsoft Windows 2000 */
#define OS_WINXP 5 /*!< Microsoft Windows XP
or Windows Server 2003 */
#define OS_WINVISTA 6 /*!< Microsoft Windows Vista
or Windows Server 2008 */
#define OS_WIN7 7 /*!< Microsoft Windows 7
or Windows Server 2008 R2 */
extern ulint os_n_file_reads;
extern ulint os_n_file_writes;
......@@ -368,7 +375,8 @@ typedef DIR* os_file_dir_t; /*!< directory stream */
/***********************************************************************//**
Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
OS_WIN7. */
UNIV_INTERN
ulint
os_get_os_version(void);
......
......@@ -38,28 +38,18 @@ Created 9/6/1995 Heikki Tuuri
#include "ut0lst.h"
#ifdef __WIN__
/** Native event (slow)*/
typedef HANDLE os_native_event_t;
/** Native mutex */
#define os_fast_mutex_t CRITICAL_SECTION
/** Native event */
typedef HANDLE os_native_event_t;
/** Operating system event */
typedef struct os_event_struct os_event_struct_t;
/** Operating system event handle */
typedef os_event_struct_t* os_event_t;
/** An asynchronous signal sent between threads */
struct os_event_struct {
os_native_event_t handle;
/*!< Windows event */
UT_LIST_NODE_T(os_event_struct_t) os_event_list;
/*!< list of all created events */
};
typedef CRITICAL_SECTION os_fast_mutex_t;
/** Native condition variable. */
typedef CONDITION_VARIABLE os_cond_t;
#else
/** Native mutex */
typedef pthread_mutex_t os_fast_mutex_t;
typedef pthread_mutex_t os_fast_mutex_t;
/** Native condition variable */
typedef pthread_cond_t os_cond_t;
#endif
/** Operating system event */
typedef struct os_event_struct os_event_struct_t;
......@@ -68,6 +58,10 @@ typedef os_event_struct_t* os_event_t;
/** An asynchronous signal sent between threads */
struct os_event_struct {
#ifdef __WIN__
HANDLE handle; /*!< kernel event object, slow,
used on older Windows */
#endif
os_fast_mutex_t os_mutex; /*!< this mutex protects the next
fields */
ibool is_set; /*!< this is TRUE when the event is
......@@ -76,24 +70,17 @@ struct os_event_struct {
this event */
ib_int64_t signal_count; /*!< this is incremented each time
the event becomes signaled */
pthread_cond_t cond_var; /*!< condition variable is used in
os_cond_t cond_var; /*!< condition variable is used in
waiting for the event */
UT_LIST_NODE_T(os_event_struct_t) os_event_list;
/*!< list of all created events */
};
#endif
/** Operating system mutex */
typedef struct os_mutex_struct os_mutex_str_t;
/** Operating system mutex handle */
typedef os_mutex_str_t* os_mutex_t;
/** Denotes an infinite delay for os_event_wait_time() */
#define OS_SYNC_INFINITE_TIME ((ulint)(-1))
/** Return value of os_event_wait_time() when the time is exceeded */
#define OS_SYNC_TIME_EXCEEDED 1
/** Mutex protecting counts and the event and OS 'slow' mutex lists */
extern os_mutex_t os_sync_mutex;
......@@ -187,42 +174,14 @@ os_event_wait_low(
#define os_event_wait(event) os_event_wait_low(event, 0)
/**********************************************************//**
Waits for an event object until it is in the signaled state or
a timeout is exceeded. In Unix the timeout is always infinite.
@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
UNIV_INTERN
ulint
os_event_wait_time(
/*===============*/
os_event_t event, /*!< in: event to wait */
ulint time); /*!< in: timeout in microseconds, or
OS_SYNC_INFINITE_TIME */
#ifdef __WIN__
/**********************************************************//**
Waits for any event in an OS native event array. Returns if even a single
one is signaled or becomes signaled.
@return index of the event which was signaled */
UNIV_INTERN
ulint
os_event_wait_multiple(
/*===================*/
ulint n, /*!< in: number of events in the
array */
os_native_event_t* native_event_array);
/*!< in: pointer to an array of event
handles */
#endif
/*********************************************************//**
Creates an operating system mutex semaphore. Because these are slow, the
mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
@return the mutex handle */
UNIV_INTERN
os_mutex_t
os_mutex_create(
/*============*/
const char* name); /*!< in: the name of the mutex, if NULL
the mutex is created without a name */
os_mutex_create(void);
/*=================*/
/**********************************************************//**
Acquires ownership of a mutex semaphore. */
UNIV_INTERN
......
......@@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri
#endif
/**********************************************************//**
Acquires ownership of a fast mutex. Currently in Windows this is the same
as os_fast_mutex_lock!
Acquires ownership of a fast mutex.
@return 0 if success, != 0 if was reserved by another thread */
UNIV_INLINE
ulint
......@@ -38,9 +37,13 @@ os_fast_mutex_trylock(
os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */
{
#ifdef __WIN__
EnterCriticalSection(fast_mutex);
if (TryEnterCriticalSection(fast_mutex)) {
return(0);
return(0);
} else {
return(1);
}
#else
/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
so that it returns 0 on success. In the operating system
......
......@@ -112,6 +112,9 @@ OS (provided we compiled Innobase with it in), otherwise we will
use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
extern my_bool srv_use_native_aio;
#ifdef __WIN__
extern ibool srv_use_native_conditions;
#endif
extern ulint srv_n_data_files;
extern char** srv_data_file_names;
extern ulint* srv_data_file_sizes;
......
......@@ -183,7 +183,7 @@ struct os_aio_slot_struct{
which pending aio operation was
completed */
#ifdef WIN_ASYNC_IO
os_event_t event; /*!< event object we need in the
HANDLE handle; /*!< handle object we need in the
OVERLAPPED struct */
OVERLAPPED control; /*!< Windows control block for the
aio request */
......@@ -225,7 +225,7 @@ struct os_aio_array_struct{
aio array outside the ibuf segment */
os_aio_slot_t* slots; /*!< Pointer to the slots in the array */
#ifdef __WIN__
os_native_event_t* native_events;
HANDLE* handles;
/*!< Pointer to an array of OS native
event handles where we copied the
handles from slots, in the same
......@@ -304,7 +304,8 @@ UNIV_INTERN ulint os_n_pending_reads = 0;
/***********************************************************************//**
Gets the operating system version. Currently works only on Windows.
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA,
OS_WIN7. */
UNIV_INTERN
ulint
os_get_os_version(void)
......@@ -322,10 +323,18 @@ os_get_os_version(void)
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
return(OS_WIN95);
} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
if (os_info.dwMajorVersion <= 4) {
return(OS_WINNT);
} else {
return(OS_WIN2000);
switch (os_info.dwMajorVersion) {
case 3:
case 4:
return OS_WINNT;
case 5:
return (os_info.dwMinorVersion == 0) ? OS_WIN2000
: OS_WINXP;
case 6:
return (os_info.dwMinorVersion == 0) ? OS_WINVISTA
: OS_WIN7;
default:
return OS_WIN7;
}
} else {
ut_error;
......@@ -673,10 +682,10 @@ os_io_init_simple(void)
{
ulint i;
os_file_count_mutex = os_mutex_create(NULL);
os_file_count_mutex = os_mutex_create();
for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
os_file_seek_mutexes[i] = os_mutex_create(NULL);
os_file_seek_mutexes[i] = os_mutex_create();
}
}
......@@ -3217,7 +3226,7 @@ os_aio_array_create(
array = ut_malloc(sizeof(os_aio_array_t));
array->mutex = os_mutex_create(NULL);
array->mutex = os_mutex_create();
array->not_full = os_event_create(NULL);
array->is_empty = os_event_create(NULL);
......@@ -3229,7 +3238,7 @@ os_aio_array_create(
array->cur_seg = 0;
array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
#ifdef __WIN__
array->native_events = ut_malloc(n * sizeof(os_native_event_t));
array->handles = ut_malloc(n * sizeof(HANDLE));
#endif
#if defined(LINUX_NATIVE_AIO)
......@@ -3273,13 +3282,13 @@ os_aio_array_create(
slot->pos = i;
slot->reserved = FALSE;
#ifdef WIN_ASYNC_IO
slot->event = os_event_create(NULL);
slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL);
over = &(slot->control);
over->hEvent = slot->event->handle;
over->hEvent = slot->handle;
*((array->native_events) + i) = over->hEvent;
*((array->handles) + i) = over->hEvent;
#elif defined(LINUX_NATIVE_AIO)
......@@ -3305,12 +3314,12 @@ os_aio_array_free(
for (i = 0; i < array->n_slots; i++) {
os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
os_event_free(slot->event);
CloseHandle(slot->handle);
}
#endif /* WIN_ASYNC_IO */
#ifdef __WIN__
ut_free(array->native_events);
ut_free(array->handles);
#endif /* __WIN__ */
os_mutex_free(array->mutex);
os_event_free(array->not_full);
......@@ -3463,7 +3472,7 @@ os_aio_array_wake_win_aio_at_shutdown(
for (i = 0; i < array->n_slots; i++) {
os_event_set((array->slots + i)->event);
SetEvent((array->slots + i)->handle);
}
}
#endif
......@@ -3702,7 +3711,7 @@ os_aio_array_reserve_slot(
control = &(slot->control);
control->Offset = (DWORD)offset;
control->OffsetHigh = (DWORD)offset_high;
os_event_reset(slot->event);
ResetEvent(slot->handle);
#elif defined(LINUX_NATIVE_AIO)
......@@ -3774,7 +3783,7 @@ os_aio_array_free_slot(
#ifdef WIN_ASYNC_IO
os_event_reset(slot->event);
ResetEvent(slot->handle);
#elif defined(LINUX_NATIVE_AIO)
......@@ -4208,13 +4217,20 @@ os_aio_windows_handle(
n = array->n_slots / array->n_segments;
if (array == os_aio_sync_array) {
os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
WaitForSingleObject(
os_aio_array_get_nth_slot(array, pos)->handle,
INFINITE);
i = pos;
} else {
srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
i = os_event_wait_multiple(n,
(array->native_events)
+ segment * n);
i = WaitForMultipleObjects((DWORD) n,
array->handles + segment * n,
FALSE,
INFINITE);
}
if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
os_thread_exit(NULL);
}
os_mutex_enter(array->mutex);
......
This diff is collapsed.
......@@ -252,7 +252,7 @@ os_thread_yield(void)
/*=================*/
{
#if defined(__WIN__)
Sleep(0);
SwitchToThread();
#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
sched_yield();
#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
......
......@@ -142,6 +142,21 @@ use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE;
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
pointers, because they are not available on Windows Server 2003 and
Windows XP/2000.
We use condition for events on Windows if possible, even if os_event
resembles Windows kernel event object well API-wise. The reason is
performance, kernel objects are heavyweights and WaitForSingleObject() is a
performance killer causing calling thread to context switch. Besides, Innodb
is preallocating large number (often millions) of os_events. With kernel event
objects it takes a big chunk out of non-paged pool, which is better suited
for tasks like IO than for storing idle event objects. */
UNIV_INTERN ibool srv_use_native_conditions = FALSE;
#endif /* __WIN__ */
UNIV_INTERN ulint srv_n_data_files = 0;
UNIV_INTERN char** srv_data_file_names = NULL;
/* size in database pages */
......
......@@ -1160,9 +1160,17 @@ innobase_start_or_create_for_mysql(void)
srv_use_native_aio = FALSE;
break;
case OS_WIN2000:
case OS_WINXP:
/* On 2000 and XP, async IO is available. */
srv_use_native_aio = TRUE;
break;
default:
/* On Win 2000 and XP use async i/o */
/* Vista and later have both async IO and condition variables */
srv_use_native_aio = TRUE;
srv_use_native_conditions = TRUE;
break;
}
......
......@@ -250,7 +250,7 @@ sync_array_create(
/* Then create the mutex to protect the wait array complex */
if (protection == SYNC_ARRAY_OS_MUTEX) {
arr->os_mutex = os_mutex_create(NULL);
arr->os_mutex = os_mutex_create();
} else if (protection == SYNC_ARRAY_MUTEX) {
mutex_create(syn_arr_mutex_key,
&arr->mutex, SYNC_NO_ORDER_CHECK);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment