Commit 021212b5 authored by Jan Lindström, committed by GitHub

Merge pull request #245 from sensssz/10.1-vats

MDEV-11039 - Add new scheduling algorithm for reducing tail latencies
parents 1bfa37a7 183c0283
...@@ -300,6 +300,22 @@ static TYPELIB innodb_checksum_algorithm_typelib = { ...@@ -300,6 +300,22 @@ static TYPELIB innodb_checksum_algorithm_typelib = {
NULL NULL
}; };
/** Possible values of the parameter innodb_lock_schedule_algorithm.
The names are listed in the same order as the enumerators of
innodb_lock_schedule_algorithm_t ("fcfs" first, then "vats"), and the
list is terminated by NullS. */
static const char* innodb_lock_schedule_algorithm_names[] = {
"fcfs",
"vats",
NullS
};
/** Used to define an enumerated type of the system variable
innodb_lock_schedule_algorithm. The element count excludes the
terminating NullS entry of the name array, hence the "- 1". */
static TYPELIB innodb_lock_schedule_algorithm_typelib = {
array_elements(innodb_lock_schedule_algorithm_names) - 1,
"innodb_lock_schedule_algorithm_typelib",
innodb_lock_schedule_algorithm_names,
NULL
};
/* The following counter is used to convey information to InnoDB /* The following counter is used to convey information to InnoDB
about server activity: in case of normal DML ops it is not about server activity: in case of normal DML ops it is not
sensible to call srv_active_wake_master_thread after each sensible to call srv_active_wake_master_thread after each
...@@ -19013,6 +19029,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size, ...@@ -19013,6 +19029,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
NULL, NULL, 120, 1, 127, 0); NULL, NULL, 120, 1, 127, 0);
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */ #endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
/** System variable innodb_lock_schedule_algorithm, backed by the global
innodb_lock_schedule_algorithm. It is an enum variable whose legal names
come from innodb_lock_schedule_algorithm_typelib; no check or update
callback is installed (both are NULL) and the default is FCFS. */
static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
PLUGIN_VAR_RQCMDARG,
"The algorithm Innodb uses for deciding which locks to grant next when"
" a lock is released. Possible values are"
" FCFS"
" grant the locks in First-Come-First-Served order;"
" VATS"
" use the Variance-Aware-Transaction-Scheduling algorithm, which"
" uses an Eldest-Transaction-First heuristic.",
NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
&innodb_lock_schedule_algorithm_typelib);
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability", "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
...@@ -19828,6 +19856,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { ...@@ -19828,6 +19856,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(ft_sort_pll_degree),
MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(large_prefix),
MYSQL_SYSVAR(force_load_corrupted), MYSQL_SYSVAR(force_load_corrupted),
MYSQL_SYSVAR(lock_schedule_algorithm),
MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(locks_unsafe_for_binlog),
MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(lock_wait_timeout),
#ifdef UNIV_LOG_ARCHIVE #ifdef UNIV_LOG_ARCHIVE
......
...@@ -43,6 +43,15 @@ Created 5/7/1996 Heikki Tuuri ...@@ -43,6 +43,15 @@ Created 5/7/1996 Heikki Tuuri
extern ibool lock_print_waits; extern ibool lock_print_waits;
#endif /* UNIV_DEBUG */ #endif /* UNIV_DEBUG */
/** Alternatives for the lock scheduling algorithm, selected through the
system variable innodb_lock_schedule_algorithm. */
enum innodb_lock_schedule_algorithm_t {
INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, /*!< First Come First Served */
INNODB_LOCK_SCHEDULE_ALGORITHM_VATS /*!< Variance-Aware-Transaction-Scheduling */
};
/** Currently selected lock scheduling algorithm; holds one of the
innodb_lock_schedule_algorithm_t values. */
extern ulong innodb_lock_schedule_algorithm;
/*********************************************************************//** /*********************************************************************//**
Gets the size of a lock struct. Gets the size of a lock struct.
@return size in bytes */ @return size in bytes */
......
...@@ -76,6 +76,9 @@ bitmap */ ...@@ -76,6 +76,9 @@ bitmap */
#define LOCK_PAGE_BITMAP_MARGIN 64 #define LOCK_PAGE_BITMAP_MARGIN 64
/** Lock scheduling algorithm in use; holds one of the
innodb_lock_schedule_algorithm_t values and starts out as FCFS. */
ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
/* An explicit record lock affects both the record and the gap before it. /* An explicit record lock affects both the record and the gap before it.
An implicit x-lock does not affect the gap, it only locks the index An implicit x-lock does not affect the gap, it only locks the index
record from read or update. record from read or update.
...@@ -385,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd); ...@@ -385,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd);
extern "C" extern "C"
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
/** Asks the server layer which of two connections it would rather see
chosen as a deadlock victim. As interpreted by the caller in this file:
-1 means thd1 is the preferred victim, 1 means thd2 is the preferred
victim, and any other value is treated as "no preference". */
extern "C"
int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
/** Stack to use during DFS search. Currently only a single stack is required /** Stack to use during DFS search. Currently only a single stack is required
because there is no parallel deadlock check. This stack is protected by because there is no parallel deadlock check. This stack is protected by
the lock_sys_t::mutex. */ the lock_sys_t::mutex. */
...@@ -1982,6 +1988,84 @@ wsrep_print_wait_locks( ...@@ -1982,6 +1988,84 @@ wsrep_print_wait_locks(
} }
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
/*********************************************************************//**
Decides whether lock1 should be ordered ahead of lock2.
The rules are applied in this order:
1. A NULL lock1 never has higher priority; otherwise a NULL lock2 always
loses.
2. If the upper server layer prefers one of the two transactions as a
deadlock victim (used to reduce conflicts during in-order parallel
replication), the other transaction's lock has higher priority.
3. If lock1 is not a waiting lock, lock1 has higher priority.
4. If lock2 is not a waiting lock, lock2 has higher priority.
5. Both are waiting: the lock whose transaction started earlier has
higher priority.
@return true if lock1 has higher priority, false otherwise */
bool
has_higher_priority(
	lock_t	*lock1,
	lock_t	*lock2)
{
	if (lock1 == NULL) {
		return false;
	}

	if (lock2 == NULL) {
		return true;
	}

	/* Ask the upper server layer whether either of the two
	transactions should be preferred as a victim. */
	switch (thd_deadlock_victim_preference(lock1->trx->mysql_thd,
					       lock2->trx->mysql_thd)) {
	case -1:
		/* lock1 is the preferred victim, so lock2 ranks higher. */
		return false;
	case 1:
		/* lock2 is the preferred victim, so lock1 ranks higher. */
		return true;
	default:
		/* No preference expressed. */
		break;
	}

	/* Compare by wait mode first and by transaction age second:
	a non-waiting lock1 wins outright; otherwise lock1 only wins
	when lock2 is waiting as well and lock1's transaction is older. */
	return !lock_get_wait(lock1)
		|| (lock_get_wait(lock2)
		    && lock1->trx->start_time < lock2->trx->start_time);
}
/*********************************************************************//**
Links a record lock into the chain of its hash cell at a position chosen
by has_higher_priority().
The lock becomes the new head of the chain when the chain is empty, when
the lock is not a waiting lock, or when it has higher priority than the
current head. Otherwise the chain is walked past every lock that has
higher priority than in_lock, and in_lock is linked in right after the
last such lock.
The lock must not already be linked into the chain. */
static
void
lock_rec_insert_by_trx_age(
	lock_t	*in_lock,	/*!< in: lock to be inserted */
	bool	wait)		/*!< in: whether it is a waiting lock */
{
	const ulint	fold = lock_rec_fold(
		in_lock->un_member.rec_lock.space,
		in_lock->un_member.rec_lock.page_no);
	hash_cell_t*	bucket = hash_get_nth_cell(
		lock_sys->rec_hash,
		hash_calc_hash(fold, lock_sys->rec_hash));
	lock_t*		head = (lock_t *) bucket->node;

	/* Head insertion: empty chain, non-waiting lock, or a lock that
	outranks the current head. */
	if (head == NULL || !wait || has_higher_priority(in_lock, head)) {
		in_lock->hash = head;
		bucket->node = in_lock;
		return;
	}

	/* Find the last lock that is followed by a lower-priority lock or
	by the end of the chain. has_higher_priority() returns false for a
	NULL first argument, so the walk always stops on a valid lock. */
	lock_t*	pred = head;

	for (;;) {
		lock_t*	succ = (lock_t *) pred->hash;

		if (!has_higher_priority(succ, in_lock)) {
			break;
		}

		pred = succ;
	}

	/* Splice in_lock between pred and its former successor. */
	in_lock->hash = pred->hash;
	pred->hash = in_lock;
}
/*********************************************************************//** /*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility! for deadlocks or lock compatibility!
...@@ -2146,11 +2230,11 @@ lock_rec_create( ...@@ -2146,11 +2230,11 @@ lock_rec_create(
trx_mutex_exit(c_lock->trx); trx_mutex_exit(c_lock->trx);
} else { } else {
HASH_INSERT(lock_t, hash, lock_sys->rec_hash, HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(space, page_no), lock); lock_rec_fold(space, page_no), lock);
} }
#else #else
HASH_INSERT(lock_t, hash, lock_sys->rec_hash, HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(space, page_no), lock); lock_rec_fold(space, page_no), lock);
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
if (!caller_owns_trx_mutex) { if (!caller_owns_trx_mutex) {
...@@ -2281,6 +2365,13 @@ lock_rec_enqueue_waiting( ...@@ -2281,6 +2365,13 @@ lock_rec_enqueue_waiting(
return(DB_SUCCESS_LOCKED_REC); return(DB_SUCCESS_LOCKED_REC);
} }
// Move it only when it does not cause a deadlock.
if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(buf_block_get_space(block), buf_block_get_page_no(block)), lock);
lock_rec_insert_by_trx_age(lock, true);
}
trx->lock.que_state = TRX_QUE_LOCK_WAIT; trx->lock.que_state = TRX_QUE_LOCK_WAIT;
trx->lock.was_chosen_as_deadlock_victim = FALSE; trx->lock.was_chosen_as_deadlock_victim = FALSE;
...@@ -2822,6 +2913,27 @@ lock_rec_cancel( ...@@ -2822,6 +2913,27 @@ lock_rec_cancel(
trx_mutex_exit(lock->trx); trx_mutex_exit(lock->trx);
} }
/*************************************************************//**
Makes a lock the first element of the hash chain selected by rec_fold.
Nothing happens when lock_to_move is NULL or is already the head.
This function only relinks the head: it does not unlink lock_to_move
from its previous position in the chain, so the caller has to do that
beforehand. */
static
void
lock_rec_move_to_front(
	lock_t	*lock_to_move,	/*!< in: lock to be moved */
	ulint	rec_fold)	/*!< in: rec fold of the lock */
{
	if (lock_to_move == NULL) {
		return;
	}

	hash_cell_t*	bucket = hash_get_nth_cell(
		lock_sys->rec_hash,
		hash_calc_hash(rec_fold, lock_sys->rec_hash));

	if (bucket->node == lock_to_move) {
		/* Already first in the chain. */
		return;
	}

	/* The old head becomes the successor of the moved lock. */
	lock_to_move->hash = (lock_t *) bucket->node;
	bucket->node = lock_to_move;
}
/*************************************************************//** /*************************************************************//**
Removes a record lock request, waiting or granted, from the queue and Removes a record lock request, waiting or granted, from the queue and
grants locks to other transactions in the queue if they now are entitled grants locks to other transactions in the queue if they now are entitled
...@@ -2839,7 +2951,9 @@ lock_rec_dequeue_from_page( ...@@ -2839,7 +2951,9 @@ lock_rec_dequeue_from_page(
{ {
ulint space; ulint space;
ulint page_no; ulint page_no;
ulint rec_fold;
lock_t* lock; lock_t* lock;
lock_t* previous = NULL;
trx_lock_t* trx_lock; trx_lock_t* trx_lock;
ut_ad(lock_mutex_own()); ut_ad(lock_mutex_own());
...@@ -2850,6 +2964,7 @@ lock_rec_dequeue_from_page( ...@@ -2850,6 +2964,7 @@ lock_rec_dequeue_from_page(
space = in_lock->un_member.rec_lock.space; space = in_lock->un_member.rec_lock.space;
page_no = in_lock->un_member.rec_lock.page_no; page_no = in_lock->un_member.rec_lock.page_no;
rec_fold = lock_rec_fold(space, page_no);
in_lock->index->table->n_rec_locks--; in_lock->index->table->n_rec_locks--;
...@@ -2861,20 +2976,51 @@ lock_rec_dequeue_from_page( ...@@ -2861,20 +2976,51 @@ lock_rec_dequeue_from_page(
MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK); MONITOR_DEC(MONITOR_NUM_RECLOCK);
/* Check if waiting locks in the queue can now be granted: grant if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
locks if there are no conflicting locks ahead. Stop at the first /* Check if waiting locks in the queue can now be granted: grant
X lock that is waiting or has been granted. */ locks if there are no conflicting locks ahead. Stop at the first
X lock that is waiting or has been granted. */
for (lock = lock_rec_get_first_on_page_addr(space, page_no); for (lock = lock_rec_get_first_on_page_addr(space, page_no);
lock != NULL; lock != NULL;
lock = lock_rec_get_next_on_page(lock)) { lock = lock_rec_get_next_on_page(lock)) {
if (lock_get_wait(lock) if (lock_get_wait(lock)
&& !lock_rec_has_to_wait_in_queue(lock)) { && !lock_rec_has_to_wait_in_queue(lock)) {
/* Grant the lock */ /* Grant the lock */
ut_ad(lock->trx != in_lock->trx); ut_ad(lock->trx != in_lock->trx);
lock_grant(lock); lock_grant(lock);
}
}
} else {
/* Grant locks if there are no conflicting locks ahead.
Move granted locks to the head of the list. */
for (lock = lock_rec_get_first_on_page_addr(space, page_no);
lock != NULL;) {
/* If the lock is a wait lock on this page, and it does not need to wait. */
if ((lock->un_member.rec_lock.space == space)
&& (lock->un_member.rec_lock.page_no == page_no)
&& lock_get_wait(lock)
&& !lock_rec_has_to_wait_in_queue(lock)) {
lock_grant(lock);
if (previous != NULL) {
/* Move the lock to the head of the list. */
HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
lock_rec_move_to_front(lock, rec_fold);
} else {
/* Already at the head of the list. */
previous = lock;
}
/* Move on to the next lock. */
lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
} else {
previous = lock;
lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
}
} }
} }
} }
...@@ -4080,7 +4226,8 @@ lock_get_first_lock( ...@@ -4080,7 +4226,8 @@ lock_get_first_lock(
} }
ut_a(lock != NULL); ut_a(lock != NULL);
ut_a(lock != ctx->wait_lock); ut_a(lock != ctx->wait_lock ||
innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS);
ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock)); ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
return(lock); return(lock);
...@@ -6287,8 +6434,10 @@ lock_rec_queue_validate( ...@@ -6287,8 +6434,10 @@ lock_rec_queue_validate(
mode, 0, 0, block, heap_no, lock->trx)); mode, 0, 0, block, heap_no, lock->trx));
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)
&& innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
// If using VATS, it's possible that a wait lock is inserted to a place in the list
// such that it does not need to wait.
ut_a(lock_rec_has_to_wait_in_queue(lock)); ut_a(lock_rec_has_to_wait_in_queue(lock));
} }
} }
......
...@@ -348,6 +348,23 @@ static TYPELIB innodb_empty_free_list_algorithm_typelib = { ...@@ -348,6 +348,23 @@ static TYPELIB innodb_empty_free_list_algorithm_typelib = {
NULL NULL
}; };
/** Possible values of the parameter innodb_lock_schedule_algorithm.
The names are listed in the same order as the enumerators of
innodb_lock_schedule_algorithm_t ("fcfs" first, then "vats"), and the
list is terminated by NullS. */
static const char* innodb_lock_schedule_algorithm_names[] = {
"fcfs",
"vats",
NullS
};
/** Used to define an enumerated type of the system variable
innodb_lock_schedule_algorithm. The element count excludes the
terminating NullS entry of the name array, hence the "- 1". */
static TYPELIB innodb_lock_schedule_algorithm_typelib = {
array_elements(innodb_lock_schedule_algorithm_names) - 1,
"innodb_lock_schedule_algorithm_typelib",
innodb_lock_schedule_algorithm_names,
NULL
};
/* The following counter is used to convey information to InnoDB /* The following counter is used to convey information to InnoDB
about server activity: in case of normal DML ops it is not about server activity: in case of normal DML ops it is not
sensible to call srv_active_wake_master_thread after each sensible to call srv_active_wake_master_thread after each
...@@ -20473,6 +20490,18 @@ static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm, ...@@ -20473,6 +20490,18 @@ static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm,
innodb_srv_empty_free_list_algorithm_validate, NULL, SRV_EMPTY_FREE_LIST_BACKOFF, innodb_srv_empty_free_list_algorithm_validate, NULL, SRV_EMPTY_FREE_LIST_BACKOFF,
&innodb_empty_free_list_algorithm_typelib); &innodb_empty_free_list_algorithm_typelib);
/** System variable innodb_lock_schedule_algorithm, backed by the global
innodb_lock_schedule_algorithm. It is an enum variable whose legal names
come from innodb_lock_schedule_algorithm_typelib; no check or update
callback is installed (both are NULL) and the default is FCFS. */
static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
PLUGIN_VAR_RQCMDARG,
"The algorithm Innodb uses for deciding which locks to grant next when"
" a lock is released. Possible values are"
" FCFS"
" grant the locks in First-Come-First-Served order;"
" VATS"
" use the Variance-Aware-Transaction-Scheduling algorithm, which"
" uses an Eldest-Transaction-First heuristic.",
NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
&innodb_lock_schedule_algorithm_typelib);
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances, static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability", "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
...@@ -21366,6 +21395,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { ...@@ -21366,6 +21395,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_sort_pll_degree), MYSQL_SYSVAR(ft_sort_pll_degree),
MYSQL_SYSVAR(large_prefix), MYSQL_SYSVAR(large_prefix),
MYSQL_SYSVAR(force_load_corrupted), MYSQL_SYSVAR(force_load_corrupted),
MYSQL_SYSVAR(lock_schedule_algorithm),
MYSQL_SYSVAR(locks_unsafe_for_binlog), MYSQL_SYSVAR(locks_unsafe_for_binlog),
MYSQL_SYSVAR(lock_wait_timeout), MYSQL_SYSVAR(lock_wait_timeout),
#ifdef UNIV_LOG_ARCHIVE #ifdef UNIV_LOG_ARCHIVE
......
...@@ -45,6 +45,15 @@ Created 5/7/1996 Heikki Tuuri ...@@ -45,6 +45,15 @@ Created 5/7/1996 Heikki Tuuri
extern ibool lock_print_waits; extern ibool lock_print_waits;
#endif /* UNIV_DEBUG */ #endif /* UNIV_DEBUG */
/** Alternatives for the lock scheduling algorithm, selected through the
system variable innodb_lock_schedule_algorithm. */
enum innodb_lock_schedule_algorithm_t {
INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS, /*!< First Come First Served */
INNODB_LOCK_SCHEDULE_ALGORITHM_VATS /*!< Variance-Aware-Transaction-Scheduling */
};
/** Currently selected lock scheduling algorithm; holds one of the
innodb_lock_schedule_algorithm_t values. */
extern ulong innodb_lock_schedule_algorithm;
extern ulint srv_n_lock_deadlock_count; extern ulint srv_n_lock_deadlock_count;
/*********************************************************************//** /*********************************************************************//**
......
...@@ -76,6 +76,9 @@ bitmap */ ...@@ -76,6 +76,9 @@ bitmap */
#define LOCK_PAGE_BITMAP_MARGIN 64 #define LOCK_PAGE_BITMAP_MARGIN 64
/** Lock scheduling algorithm in use; holds one of the
innodb_lock_schedule_algorithm_t values and starts out as FCFS. */
ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
/* An explicit record lock affects both the record and the gap before it. /* An explicit record lock affects both the record and the gap before it.
An implicit x-lock does not affect the gap, it only locks the index An implicit x-lock does not affect the gap, it only locks the index
record from read or update. record from read or update.
...@@ -385,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd); ...@@ -385,6 +388,9 @@ extern "C" int thd_need_wait_for(const MYSQL_THD thd);
extern "C" extern "C"
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd); int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
/** Asks the server layer which of two connections it would rather see
chosen as a deadlock victim. As interpreted by the caller in this file:
-1 means thd1 is the preferred victim, 1 means thd2 is the preferred
victim, and any other value is treated as "no preference". */
extern "C"
int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
/** Stack to use during DFS search. Currently only a single stack is required /** Stack to use during DFS search. Currently only a single stack is required
because there is no parallel deadlock check. This stack is protected by because there is no parallel deadlock check. This stack is protected by
the lock_sys_t::mutex. */ the lock_sys_t::mutex. */
...@@ -2005,6 +2011,84 @@ wsrep_print_wait_locks( ...@@ -2005,6 +2011,84 @@ wsrep_print_wait_locks(
} }
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
/*********************************************************************//**
Decides whether lock1 should be ordered ahead of lock2.
The rules are applied in this order:
1. A NULL lock1 never has higher priority; otherwise a NULL lock2 always
loses.
2. If the upper server layer prefers one of the two transactions as a
deadlock victim (used to reduce conflicts during in-order parallel
replication), the other transaction's lock has higher priority.
3. If lock1 is not a waiting lock, lock1 has higher priority.
4. If lock2 is not a waiting lock, lock2 has higher priority.
5. Both are waiting: the lock whose transaction started earlier has
higher priority.
@return true if lock1 has higher priority, false otherwise */
bool
has_higher_priority(
	lock_t	*lock1,
	lock_t	*lock2)
{
	if (lock1 == NULL) {
		return false;
	}

	if (lock2 == NULL) {
		return true;
	}

	/* Ask the upper server layer whether either of the two
	transactions should be preferred as a victim. */
	switch (thd_deadlock_victim_preference(lock1->trx->mysql_thd,
					       lock2->trx->mysql_thd)) {
	case -1:
		/* lock1 is the preferred victim, so lock2 ranks higher. */
		return false;
	case 1:
		/* lock2 is the preferred victim, so lock1 ranks higher. */
		return true;
	default:
		/* No preference expressed. */
		break;
	}

	/* Compare by wait mode first and by transaction age second:
	a non-waiting lock1 wins outright; otherwise lock1 only wins
	when lock2 is waiting as well and lock1's transaction is older. */
	return !lock_get_wait(lock1)
		|| (lock_get_wait(lock2)
		    && lock1->trx->start_time < lock2->trx->start_time);
}
/*********************************************************************//**
Links a record lock into the chain of its hash cell at a position chosen
by has_higher_priority().
The lock becomes the new head of the chain when the chain is empty, when
the lock is not a waiting lock, or when it has higher priority than the
current head. Otherwise the chain is walked past every lock that has
higher priority than in_lock, and in_lock is linked in right after the
last such lock.
The lock must not already be linked into the chain. */
static
void
lock_rec_insert_by_trx_age(
	lock_t	*in_lock,	/*!< in: lock to be inserted */
	bool	wait)		/*!< in: whether it is a waiting lock */
{
	const ulint	fold = lock_rec_fold(
		in_lock->un_member.rec_lock.space,
		in_lock->un_member.rec_lock.page_no);
	hash_cell_t*	bucket = hash_get_nth_cell(
		lock_sys->rec_hash,
		hash_calc_hash(fold, lock_sys->rec_hash));
	lock_t*		head = (lock_t *) bucket->node;

	/* Head insertion: empty chain, non-waiting lock, or a lock that
	outranks the current head. */
	if (head == NULL || !wait || has_higher_priority(in_lock, head)) {
		in_lock->hash = head;
		bucket->node = in_lock;
		return;
	}

	/* Find the last lock that is followed by a lower-priority lock or
	by the end of the chain. has_higher_priority() returns false for a
	NULL first argument, so the walk always stops on a valid lock. */
	lock_t*	pred = head;

	for (;;) {
		lock_t*	succ = (lock_t *) pred->hash;

		if (!has_higher_priority(succ, in_lock)) {
			break;
		}

		pred = succ;
	}

	/* Splice in_lock between pred and its former successor. */
	in_lock->hash = pred->hash;
	pred->hash = in_lock;
}
/*********************************************************************//** /*********************************************************************//**
Creates a new record lock and inserts it to the lock queue. Does NOT check Creates a new record lock and inserts it to the lock queue. Does NOT check
for deadlocks or lock compatibility! for deadlocks or lock compatibility!
...@@ -2169,11 +2253,11 @@ lock_rec_create( ...@@ -2169,11 +2253,11 @@ lock_rec_create(
trx_mutex_exit(c_lock->trx); trx_mutex_exit(c_lock->trx);
} else { } else {
HASH_INSERT(lock_t, hash, lock_sys->rec_hash, HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(space, page_no), lock); lock_rec_fold(space, page_no), lock);
} }
#else #else
HASH_INSERT(lock_t, hash, lock_sys->rec_hash, HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(space, page_no), lock); lock_rec_fold(space, page_no), lock);
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
lock_sys->rec_num++; lock_sys->rec_num++;
...@@ -2309,6 +2393,13 @@ lock_rec_enqueue_waiting( ...@@ -2309,6 +2393,13 @@ lock_rec_enqueue_waiting(
return(DB_SUCCESS_LOCKED_REC); return(DB_SUCCESS_LOCKED_REC);
} }
// Move it only when it does not cause a deadlock.
if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS) {
HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
lock_rec_fold(buf_block_get_space(block), buf_block_get_page_no(block)), lock);
lock_rec_insert_by_trx_age(lock, true);
}
trx->lock.que_state = TRX_QUE_LOCK_WAIT; trx->lock.que_state = TRX_QUE_LOCK_WAIT;
trx->lock.was_chosen_as_deadlock_victim = FALSE; trx->lock.was_chosen_as_deadlock_victim = FALSE;
...@@ -2858,6 +2949,27 @@ lock_rec_cancel( ...@@ -2858,6 +2949,27 @@ lock_rec_cancel(
trx_mutex_exit(lock->trx); trx_mutex_exit(lock->trx);
} }
/*************************************************************//**
Makes a lock the first element of the hash chain selected by rec_fold.
Nothing happens when lock_to_move is NULL or is already the head.
This function only relinks the head: it does not unlink lock_to_move
from its previous position in the chain, so the caller has to do that
beforehand. */
static
void
lock_rec_move_to_front(
	lock_t	*lock_to_move,	/*!< in: lock to be moved */
	ulint	rec_fold)	/*!< in: rec fold of the lock */
{
	if (lock_to_move == NULL) {
		return;
	}

	hash_cell_t*	bucket = hash_get_nth_cell(
		lock_sys->rec_hash,
		hash_calc_hash(rec_fold, lock_sys->rec_hash));

	if (bucket->node == lock_to_move) {
		/* Already first in the chain. */
		return;
	}

	/* The old head becomes the successor of the moved lock. */
	lock_to_move->hash = (lock_t *) bucket->node;
	bucket->node = lock_to_move;
}
/*************************************************************//** /*************************************************************//**
Removes a record lock request, waiting or granted, from the queue and Removes a record lock request, waiting or granted, from the queue and
grants locks to other transactions in the queue if they now are entitled grants locks to other transactions in the queue if they now are entitled
...@@ -2875,7 +2987,9 @@ lock_rec_dequeue_from_page( ...@@ -2875,7 +2987,9 @@ lock_rec_dequeue_from_page(
{ {
ulint space; ulint space;
ulint page_no; ulint page_no;
ulint rec_fold;
lock_t* lock; lock_t* lock;
lock_t* previous = NULL;
trx_lock_t* trx_lock; trx_lock_t* trx_lock;
ut_ad(lock_mutex_own()); ut_ad(lock_mutex_own());
...@@ -2886,6 +3000,7 @@ lock_rec_dequeue_from_page( ...@@ -2886,6 +3000,7 @@ lock_rec_dequeue_from_page(
space = in_lock->un_member.rec_lock.space; space = in_lock->un_member.rec_lock.space;
page_no = in_lock->un_member.rec_lock.page_no; page_no = in_lock->un_member.rec_lock.page_no;
rec_fold = lock_rec_fold(space, page_no);
in_lock->index->table->n_rec_locks--; in_lock->index->table->n_rec_locks--;
...@@ -2898,20 +3013,51 @@ lock_rec_dequeue_from_page( ...@@ -2898,20 +3013,51 @@ lock_rec_dequeue_from_page(
MONITOR_INC(MONITOR_RECLOCK_REMOVED); MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK); MONITOR_DEC(MONITOR_NUM_RECLOCK);
/* Check if waiting locks in the queue can now be granted: grant if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
locks if there are no conflicting locks ahead. Stop at the first /* Check if waiting locks in the queue can now be granted: grant
X lock that is waiting or has been granted. */ locks if there are no conflicting locks ahead. Stop at the first
X lock that is waiting or has been granted. */
for (lock = lock_rec_get_first_on_page_addr(space, page_no); for (lock = lock_rec_get_first_on_page_addr(space, page_no);
lock != NULL; lock != NULL;
lock = lock_rec_get_next_on_page(lock)) { lock = lock_rec_get_next_on_page(lock)) {
if (lock_get_wait(lock) if (lock_get_wait(lock)
&& !lock_rec_has_to_wait_in_queue(lock)) { && !lock_rec_has_to_wait_in_queue(lock)) {
/* Grant the lock */ /* Grant the lock */
ut_ad(lock->trx != in_lock->trx); ut_ad(lock->trx != in_lock->trx);
lock_grant(lock); lock_grant(lock);
}
}
} else {
/* Grant locks if there are no conflicting locks ahead.
Move granted locks to the head of the list. */
for (lock = lock_rec_get_first_on_page_addr(space, page_no);
lock != NULL;) {
/* If the lock is a wait lock on this page, and it does not need to wait. */
if ((lock->un_member.rec_lock.space == space)
&& (lock->un_member.rec_lock.page_no == page_no)
&& lock_get_wait(lock)
&& !lock_rec_has_to_wait_in_queue(lock)) {
lock_grant(lock);
if (previous != NULL) {
/* Move the lock to the head of the list. */
HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
lock_rec_move_to_front(lock, rec_fold);
} else {
/* Already at the head of the list. */
previous = lock;
}
/* Move on to the next lock. */
lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
} else {
previous = lock;
lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
}
} }
} }
} }
...@@ -4118,7 +4264,8 @@ lock_get_first_lock( ...@@ -4118,7 +4264,8 @@ lock_get_first_lock(
} }
ut_a(lock != NULL); ut_a(lock != NULL);
ut_a(lock != ctx->wait_lock); ut_a(lock != ctx->wait_lock ||
innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS);
ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock)); ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
return(lock); return(lock);
...@@ -6350,8 +6497,10 @@ lock_rec_queue_validate( ...@@ -6350,8 +6497,10 @@ lock_rec_queue_validate(
mode, 0, 0, block, heap_no, lock->trx->id)); mode, 0, 0, block, heap_no, lock->trx->id));
#endif /* WITH_WSREP */ #endif /* WITH_WSREP */
} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)
&& innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
// If using VATS, it's possible that a wait lock is inserted to a place in the list
// such that it does not need to wait.
ut_a(lock_rec_has_to_wait_in_queue(lock)); ut_a(lock_rec_has_to_wait_in_queue(lock));
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment