Commit d039f388 authored by Linus Torvalds

Merge tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull locking fixes from Thomas Gleixner:
 "Two regression fixes for reader writer semaphores:

   - Plug a race in the lock handoff which is caused by inconsistent
     handling between the reader and writer paths and can lead to
     corruption of the underlying counter (the centralized waiter
     helpers this introduces are sketched just before the diff below).

   - down_read_trylock() is suboptimal when the lock is contended and
     multiple readers trylock concurrently. That's due to the initial
     value being read non-atomically, which results in at least two
     compare-exchange loops. Making the initial readout atomic reduces
     this significantly: with 40 readers it improves a benchmark which
     enforces contention on mmap_sem by 11%"
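
To see why the non-atomic initial read costs an extra iteration, here is a
minimal userspace sketch of the old and new trylock loops using C11 atomics.
The constants and the single-reader setup are simplified assumptions for
illustration (they are not the kernel's RWSEM_* definitions), but the loop
shapes mirror the last hunk of the diff below:

#include <stdatomic.h>
#include <stdio.h>

/* Toy stand-ins for the kernel's count layout (assumed values for
 * illustration, not the real RWSEM_* constants). */
#define READER_BIAS      (1L << 8)
#define READ_FAILED_MASK 0x7L	/* writer/handoff/waiter bits */

static atomic_long sem_count;

/* Old scheme: seed the cmpxchg with the optimistic "unlocked" value.
 * If readers already hold the lock, the first attempt always fails and
 * only serves to fetch the current count. */
static int trylock_old(int *attempts)
{
	long tmp = 0;	/* the assumed "unlocked" value */
	do {
		(*attempts)++;
		if (atomic_compare_exchange_strong(&sem_count, &tmp,
						   tmp + READER_BIAS))
			return 1;
	} while (!(tmp & READ_FAILED_MASK));
	return 0;
}

/* New scheme: read the current count atomically first, so the cmpxchg
 * usually succeeds on its first attempt even under read contention. */
static int trylock_new(int *attempts)
{
	long tmp = atomic_load(&sem_count);
	while (!(tmp & READ_FAILED_MASK)) {
		(*attempts)++;
		if (atomic_compare_exchange_strong(&sem_count, &tmp,
						   tmp + READER_BIAS))
			return 1;
	}
	return 0;
}

int main(void)
{
	int old_tries = 0, new_tries = 0;

	/* A new reader arrives while three readers already hold the lock. */
	atomic_store(&sem_count, 3 * READER_BIAS);
	trylock_old(&old_tries);

	atomic_store(&sem_count, 3 * READER_BIAS);
	trylock_new(&new_tries);

	/* Prints: old: 2 attempts, new: 1 attempt(s) */
	printf("old: %d attempts, new: %d attempt(s)\n", old_tries, new_tries);
	return 0;
}

With the count already holding three reader biases, the old loop needs two
compare-exchange attempts (the first merely fetches the real value) while the
new loop needs one; under 40 concurrently polling readers that saved round
trip is the effect behind the 11% figure quoted above.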

* tag 'locking-urgent-2021-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  locking/rwsem: Optimize down_read_trylock() under highly contended case
  locking/rwsem: Make handoff bit handling more consistent
parents f8132d62 14c24048
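
A condensed sketch of the helper pair the handoff fix introduces
(rwsem_add_waiter()/rwsem_del_waiter() in the hunks below) may help before
reading the diff. This is a userspace approximation with assumed flag values
and a plain integer standing in for the wait_list, with sem->wait_lock assumed
held as in the kernel. The design point: every exit path now funnels waiter
removal through one place, so RWSEM_FLAG_WAITERS and RWSEM_FLAG_HANDOFF are
always cleared together by whoever removes the last waiter, rather than each
path open-coding its own clearing:

#include <stdatomic.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's count flag bits (assumed values). */
#define RWSEM_FLAG_WAITERS 0x2UL
#define RWSEM_FLAG_HANDOFF 0x4UL

struct rwsem {
	atomic_ulong count;
	int nr_waiters;	/* stands in for sem->wait_list; wait_lock assumed held */
};

/* Mirrors rwsem_add_waiter(): queue the waiter; the caller still sets
 * RWSEM_FLAG_WAITERS in count, as the kernel comment notes. */
static void add_waiter(struct rwsem *sem)
{
	sem->nr_waiters++;
}

/* Mirrors rwsem_del_waiter(): the thread that removes the last waiter
 * clears WAITERS and HANDOFF together in a single atomic andnot, so no
 * path can clear the two bits inconsistently. */
static void del_waiter(struct rwsem *sem)
{
	if (--sem->nr_waiters > 0)
		return;
	atomic_fetch_and(&sem->count, ~(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS));
}

int main(void)
{
	struct rwsem sem = { .nr_waiters = 0 };
	atomic_store(&sem.count, 0);

	add_waiter(&sem);
	atomic_fetch_or(&sem.count, RWSEM_FLAG_WAITERS | RWSEM_FLAG_HANDOFF);

	del_waiter(&sem);	/* last waiter leaves: both flags drop atomically */
	printf("count = %#lx\n", atomic_load(&sem.count));	/* prints count = 0 */
	return 0;
}

Note that rwsem_mark_wake() and rwsem_try_write_lock() still contain what the
new kernel comment calls a full 'copy' of this function, which is why that
comment warns to modify it with care.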
kernel/locking/rwsem.c
@@ -105,9 +105,9 @@
  * atomic_long_cmpxchg() will be used to obtain writer lock.
  *
  * There are three places where the lock handoff bit may be set or cleared.
- * 1) rwsem_mark_wake() for readers.
- * 2) rwsem_try_write_lock() for writers.
- * 3) Error path of rwsem_down_write_slowpath().
+ * 1) rwsem_mark_wake() for readers		-- set, clear
+ * 2) rwsem_try_write_lock() for writers	-- set, clear
+ * 3) rwsem_del_waiter()			-- clear
  *
  * For all the above cases, wait_lock will be held. A writer must also
  * be the first one in the wait_list to be eligible for setting the handoff
@@ -334,6 +334,9 @@ struct rwsem_waiter {
 	struct task_struct *task;
 	enum rwsem_waiter_type type;
 	unsigned long timeout;
+
+	/* Writer only, not initialized in reader */
+	bool handoff_set;
 };
 #define rwsem_first_waiter(sem) \
 	list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -344,12 +347,6 @@ enum rwsem_wake_type {
 	RWSEM_WAKE_READ_OWNED	/* Waker thread holds the read lock */
 };
 
-enum writer_wait_state {
-	WRITER_NOT_FIRST,	/* Writer is not first in wait list */
-	WRITER_FIRST,		/* Writer is first in wait list */
-	WRITER_HANDOFF		/* Writer is first & handoff needed */
-};
-
 /*
  * The typical HZ value is either 250 or 1000. So set the minimum waiting
  * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
@@ -365,6 +362,31 @@ enum writer_wait_state {
  */
 #define MAX_READERS_WAKEUP	0x100
 
+static inline void
+rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_add_tail(&waiter->list, &sem->wait_list);
+	/* caller will set RWSEM_FLAG_WAITERS */
+}
+
+/*
+ * Remove a waiter from the wait_list and clear flags.
+ *
+ * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of
+ * this function. Modify with care.
+ */
+static inline void
+rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter)
+{
+	lockdep_assert_held(&sem->wait_lock);
+	list_del(&waiter->list);
+	if (likely(!list_empty(&sem->wait_list)))
+		return;
+	atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count);
+}
+
 /*
  * handle the lock release when processes blocked on it that can now run
  * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
@@ -376,6 +398,8 @@ enum writer_wait_state {
  *   preferably when the wait_lock is released
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only marked woken if downgrading is false
+ *
+ * Implies rwsem_del_waiter() for all woken readers.
  */
 static void rwsem_mark_wake(struct rw_semaphore *sem,
 			    enum rwsem_wake_type wake_type,
@@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
 	adjustment = woken * RWSEM_READER_BIAS - adjustment;
 	lockevent_cond_inc(rwsem_wake_reader, woken);
+
+	oldcount = atomic_long_read(&sem->count);
 	if (list_empty(&sem->wait_list)) {
-		/* hit end of list above */
+		/*
+		 * Combined with list_move_tail() above, this implies
+		 * rwsem_del_waiter().
+		 */
 		adjustment -= RWSEM_FLAG_WAITERS;
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
+	} else if (woken) {
+		/*
+		 * When we've woken a reader, we no longer need to force
+		 * writers to give up the lock and we can clear HANDOFF.
+		 */
+		if (oldcount & RWSEM_FLAG_HANDOFF)
+			adjustment -= RWSEM_FLAG_HANDOFF;
 	}
 
-	/*
-	 * When we've woken a reader, we no longer need to force writers
-	 * to give up the lock and we can clear HANDOFF.
-	 */
-	if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
-		adjustment -= RWSEM_FLAG_HANDOFF;
-
 	if (adjustment)
 		atomic_long_add(adjustment, &sem->count);
@@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
  * race conditions between checking the rwsem wait list and setting the
  * sem->count accordingly.
  *
- * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
- * bit is set or the lock is acquired with handoff bit cleared.
+ * Implies rwsem_del_waiter() on success.
  */
 static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
-					enum writer_wait_state wstate)
+					struct rwsem_waiter *waiter)
 {
+	bool first = rwsem_first_waiter(sem) == waiter;
 	long count, new;
 
 	lockdep_assert_held(&sem->wait_lock);
@@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	do {
 		bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
 
-		if (has_handoff && wstate == WRITER_NOT_FIRST)
-			return false;
+		if (has_handoff) {
+			if (!first)
+				return false;
+
+			/* First waiter inherits a previously set handoff bit */
+			waiter->handoff_set = true;
+		}
 
 		new = count;
 
 		if (count & RWSEM_LOCK_MASK) {
-			if (has_handoff || (wstate != WRITER_HANDOFF))
+			if (has_handoff || (!rt_task(waiter->task) &&
+					    !time_after(jiffies, waiter->timeout)))
 				return false;
 
 			new |= RWSEM_FLAG_HANDOFF;
@@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
 	 * We have either acquired the lock with handoff bit cleared or
 	 * set the handoff bit.
 	 */
-	if (new & RWSEM_FLAG_HANDOFF)
+	if (new & RWSEM_FLAG_HANDOFF) {
+		waiter->handoff_set = true;
+		lockevent_inc(rwsem_wlock_handoff);
 		return false;
+	}
 
+	/*
+	 * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on
+	 * success.
+	 */
+	list_del(&waiter->list);
 	rwsem_set_owner(sem);
 	return true;
 }
@@ -956,7 +1001,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
 		}
 		adjustment += RWSEM_FLAG_WAITERS;
 	}
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock, but no longer actively locking */
 	count = atomic_long_add_return(adjustment, &sem->count);
@@ -1002,11 +1047,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state)
 	return sem;
 
 out_nolock:
-	list_del(&waiter.list);
-	if (list_empty(&sem->wait_list)) {
-		atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
-				   &sem->count);
-	}
+	rwsem_del_waiter(sem, &waiter);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	__set_current_state(TASK_RUNNING);
 	lockevent_inc(rwsem_rlock_fail);
@@ -1020,9 +1061,7 @@ static struct rw_semaphore *
 rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 {
 	long count;
-	enum writer_wait_state wstate;
 	struct rwsem_waiter waiter;
-	struct rw_semaphore *ret = sem;
 	DEFINE_WAKE_Q(wake_q);
 
 	/* do optimistic spinning and steal lock if possible */
@@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	waiter.task = current;
 	waiter.type = RWSEM_WAITING_FOR_WRITE;
 	waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+	waiter.handoff_set = false;
 
 	raw_spin_lock_irq(&sem->wait_lock);
-
-	/* account for this before adding a new element to the list */
-	wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
-
-	list_add_tail(&waiter.list, &sem->wait_list);
+	rwsem_add_waiter(sem, &waiter);
 
 	/* we're now waiting on the lock */
-	if (wstate == WRITER_NOT_FIRST) {
+	if (rwsem_first_waiter(sem) != &waiter) {
 		count = atomic_long_read(&sem->count);
 
 		/*
@@ -1083,13 +1119,16 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 	/* wait until we successfully acquire the lock */
 	set_current_state(state);
 	for (;;) {
-		if (rwsem_try_write_lock(sem, wstate)) {
+		if (rwsem_try_write_lock(sem, &waiter)) {
 			/* rwsem_try_write_lock() implies ACQUIRE on success */
 			break;
 		}
 
 		raw_spin_unlock_irq(&sem->wait_lock);
 
+		if (signal_pending_state(state, current))
+			goto out_nolock;
+
 		/*
 		 * After setting the handoff bit and failing to acquire
 		 * the lock, attempt to spin on owner to accelerate lock
@@ -1098,7 +1137,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 		 * In this case, we attempt to acquire the lock again
 		 * without sleeping.
 		 */
-		if (wstate == WRITER_HANDOFF) {
+		if (waiter.handoff_set) {
 			enum owner_state owner_state;
 
 			preempt_disable();
@@ -1109,66 +1148,26 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
 			goto trylock_again;
 		}
 
-		/* Block until there are no active lockers. */
-		for (;;) {
-			if (signal_pending_state(state, current))
-				goto out_nolock;
-
-			schedule();
-			lockevent_inc(rwsem_sleep_writer);
-			set_current_state(state);
-			/*
-			 * If HANDOFF bit is set, unconditionally do
-			 * a trylock.
-			 */
-			if (wstate == WRITER_HANDOFF)
-				break;
-
-			if ((wstate == WRITER_NOT_FIRST) &&
-			    (rwsem_first_waiter(sem) == &waiter))
-				wstate = WRITER_FIRST;
-
-			count = atomic_long_read(&sem->count);
-			if (!(count & RWSEM_LOCK_MASK))
-				break;
-
-			/*
-			 * The setting of the handoff bit is deferred
-			 * until rwsem_try_write_lock() is called.
-			 */
-			if ((wstate == WRITER_FIRST) && (rt_task(current) ||
-			    time_after(jiffies, waiter.timeout))) {
-				wstate = WRITER_HANDOFF;
-				lockevent_inc(rwsem_wlock_handoff);
-				break;
-			}
-		}
-
+		schedule();
+		lockevent_inc(rwsem_sleep_writer);
+		set_current_state(state);
 trylock_again:
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
 	__set_current_state(TASK_RUNNING);
-	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	lockevent_inc(rwsem_wlock);
-
-	return ret;
+	return sem;
 
 out_nolock:
 	__set_current_state(TASK_RUNNING);
 	raw_spin_lock_irq(&sem->wait_lock);
-	list_del(&waiter.list);
-
-	if (unlikely(wstate == WRITER_HANDOFF))
-		atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
-
-	if (list_empty(&sem->wait_list))
-		atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
-	else
+	rwsem_del_waiter(sem, &waiter);
+	if (!list_empty(&sem->wait_list))
 		rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
 	raw_spin_unlock_irq(&sem->wait_lock);
 	wake_up_q(&wake_q);
 	lockevent_inc(rwsem_wlock_fail);
-
 	return ERR_PTR(-EINTR);
 }
@@ -1249,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 	DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
 
-	/*
-	 * Optimize for the case when the rwsem is not locked at all.
-	 */
-	tmp = RWSEM_UNLOCKED_VALUE;
-	do {
+	tmp = atomic_long_read(&sem->count);
+	while (!(tmp & RWSEM_READ_FAILED_MASK)) {
 		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
 						    tmp + RWSEM_READER_BIAS)) {
 			rwsem_set_reader_owned(sem);
 			return 1;
 		}
-	} while (!(tmp & RWSEM_READ_FAILED_MASK));
+	}
 	return 0;
 }