Commit d039f388 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Two regression fixes for reader writer semaphores:

   - Plug a race in the lock handoff which is caused by inconsistency of
     the reader and writer path and can lead to corruption of the
     underlying counter.

   - down_read_trylock() is suboptimal when the lock is contended and
     multiple readers trylock concurrently. That's due to the initial
     value being read non-atomically which results in at least two
     compare exchange loops. Making the initial readout atomic reduces
     this significantly. Whith 40 readers by 11% in a benchmark which
     enforces contention on mmap_sem"

* tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rwsem: Optimize down_read_trylock() under highly contended case
  locking/rwsem: Make handoff bit handling more consistent
parents f8132d62 14c24048
......@@ -105,9 +105,9 @@
* atomic_long_cmpxchg() will be used to obtain writer lock.
*
* There are three places where the lock handoff bit may be set or cleared.
* 1) rwsem_mark_wake() for readers.
* 2) rwsem_try_write_lock() for writers.
* 3) Error path of rwsem_down_write_slowpath().
* 1) rwsem_mark_wake() for readers -- set, clear
* 2) rwsem_try_write_lock() for writers -- set, clear
* 3) rwsem_del_waiter() -- clear
*
* For all the above cases, wait_lock will be held. A writer must also
* be the first one in the wait_list to be eligible for setting the handoff
......@@ -334,6 +334,9 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
/* Writer only, not initialized in reader */
bool handoff_set;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
......@@ -344,12 +347,6 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
enum writer_wait_state {
WRITER_NOT_FIRST, /* Writer is not first in wait list */
WRITER_FIRST, /* Writer is first in wait list */
WRITER_HANDOFF /* Writer is first & handoff needed */
};
/*
* The typical HZ value is either 250 or 1000. So set the minimum waiting
* time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
......@@ -365,6 +362,31 @@ enum writer_wait_state {
*/
#define MAX_READERS_WAKEUP 0x100
static inline void
rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_add_tail(&waiter->list, &sem->wait_list);
/* caller will set RWSEM_FLAG_WAITERS */
}
/*
* Remove a waiter from the wait_list and clear flags.
*
* Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
* this function. Modify with care.
*/
static inline void
rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
{
lockdep_assert_held(&sem->wait_lock);
list_del(&waiter->list);
if (likely(!list_empty(&sem->wait_list)))
return;
atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
}
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
......@@ -376,6 +398,8 @@ enum writer_wait_state {
* preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
* - writers are only marked woken if downgrading is false
*
* Implies rwsem_del_waiter() for all woken readers.
*/
static void rwsem_mark_wake(struct rw_semaphore *sem,
enum rwsem_wake_type wake_type,
......@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
adjustment = woken * RWSEM_READER_BIAS - adjustment;
lockevent_cond_inc(rwsem_wake_reader, woken);
oldcount = atomic_long_read(&sem->count);
if (list_empty(&sem->wait_list)) {
/* hit end of list above */
/*
* Combined with list_move_tail() above, this implies
* rwsem_del_waiter().
*/
adjustment -= RWSEM_FLAG_WAITERS;
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
} else if (woken) {
/*
* When we've woken a reader, we no longer need to force
* writers to give up the lock and we can clear HANDOFF.
*/
if (oldcount & RWSEM_FLAG_HANDOFF)
adjustment -= RWSEM_FLAG_HANDOFF;
}
/*
* When we've woken a reader, we no longer need to force writers
* to give up the lock and we can clear HANDOFF.
*/
if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
adjustment -= RWSEM_FLAG_HANDOFF;
if (adjustment)
atomic_long_add(adjustment, &sem->count);
......@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* race conditions between checking the rwsem wait list and setting the
* sem->count accordingly.
*
* If wstate is WRITER_HANDOFF, it will make sure that either the handoff
* bit is set or the lock is acquired with handoff bit cleared.
* Implies rwsem_del_waiter() on success.
*/
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
enum writer_wait_state wstate)
struct rwsem_waiter *waiter)
{
bool first = rwsem_first_waiter(sem) == waiter;
long count, new;
lockdep_assert_held(&sem->wait_lock);
......@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
do {
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
if (has_handoff && wstate == WRITER_NOT_FIRST)
return false;
if (has_handoff) {
if (!first)
return false;
/* First waiter inherits a previously set handoff bit */
waiter->handoff_set = true;
}
new = count;
if (count & RWSEM_LOCK_MASK) {
if (has_handoff || (wstate != WRITER_HANDOFF))
if (has_handoff || (!rt_task(waiter->task) &&
!time_after(jiffies, waiter->timeout)))
return false;
new |= RWSEM_FLAG_HANDOFF;
......@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
* We have either acquired the lock with handoff bit cleared or
* set the handoff bit.
*/
if (new & RWSEM_FLAG_HANDOFF)
if (new & RWSEM_FLAG_HANDOFF) {
waiter->handoff_set = true;
lockevent_inc(rwsem_wlock_handoff);
return false;
}
/*
* Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
* success.
*/
list_del(&waiter->list);
rwsem_set_owner(sem);
return true;
}
......@@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
}
adjustment += RWSEM_FLAG_WAITERS;
}
list_add_tail(&waiter.list, &sem->wait_list);
rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock, but no longer actively locking */
count = atomic_long_add_return(adjustment, &sem->count);
......@@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
return sem;
out_nolock:
list_del(&waiter.list);
if (list_empty(&sem->wait_list)) {
atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
&sem->count);
}
rwsem_del_waiter(sem, &waiter);
raw_spin_unlock_irq(&sem->wait_lock);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
......@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
enum writer_wait_state wstate;
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
......@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
/* account for this before adding a new element to the list */
wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
list_add_tail(&waiter.list, &sem->wait_list);
rwsem_add_waiter(sem, &waiter);
/* we're now waiting on the lock */
if (wstate == WRITER_NOT_FIRST) {
if (rwsem_first_waiter(sem) != &waiter) {
count = atomic_long_read(&sem->count);
/*
......@@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* wait until we successfully acquire the lock */
set_current_state(state);
for (;;) {
if (rwsem_try_write_lock(sem, wstate)) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
if (signal_pending_state(state, current))
goto out_nolock;
/*
* After setting the handoff bit and failing to acquire
* the lock, attempt to spin on owner to accelerate lock
......@@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
* In this case, we attempt to acquire the lock again
* without sleeping.
*/
if (wstate == WRITER_HANDOFF) {
if (waiter.handoff_set) {
enum owner_state owner_state;
preempt_disable();
......@@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
goto trylock_again;
}
/* Block until there are no active lockers. */
for (;;) {
if (signal_pending_state(state, current))
goto out_nolock;
schedule();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
/*
* If HANDOFF bit is set, unconditionally do
* a trylock.
*/
if (wstate == WRITER_HANDOFF)
break;
if ((wstate == WRITER_NOT_FIRST) &&
(rwsem_first_waiter(sem) == &waiter))
wstate = WRITER_FIRST;
count = atomic_long_read(&sem->count);
if (!(count & RWSEM_LOCK_MASK))
break;
/*
* The setting of the handoff bit is deferred
* until rwsem_try_write_lock() is called.
*/
if ((wstate == WRITER_FIRST) && (rt_task(current) ||
time_after(jiffies, waiter.timeout))) {
wstate = WRITER_HANDOFF;
lockevent_inc(rwsem_wlock_handoff);
break;
}
}
schedule();
lockevent_inc(rwsem_sleep_writer);
set_current_state(state);
trylock_again:
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
return ret;
return sem;
out_nolock:
__set_current_state(TASK_RUNNING);
raw_spin_lock_irq(&sem->wait_lock);
list_del(&waiter.list);
if (unlikely(wstate == WRITER_HANDOFF))
atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
if (list_empty(&sem->wait_list))
atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
else
rwsem_del_waiter(sem, &waiter);
if (!list_empty(&sem->wait_list))
rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
lockevent_inc(rwsem_wlock_fail);
return ERR_PTR(-EINTR);
}
......@@ -1249,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
/*
* Optimize for the case when the rwsem is not locked at all.
*/
tmp = RWSEM_UNLOCKED_VALUE;
do {
tmp = atomic_long_read(&sem->count);
while (!(tmp & RWSEM_READ_FAILED_MASK)) {
if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
tmp + RWSEM_READER_BIAS)) {
tmp + RWSEM_READER_BIAS)) {
rwsem_set_reader_owned(sem);
return 1;
}
} while (!(tmp & RWSEM_READ_FAILED_MASK));
}
return 0;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment