Commit 2a9127fc authored by Linus Torvalds

mm: rewrite wait_on_page_bit_common() logic

It turns out that wait_on_page_bit_common() had several problems,
ranging from just unfair behavior due to re-queueing at the end of the
wait queue when re-trying, to an outright bug that could result in
missed wakeups (but probably never happened in practice).

This rewrites the whole logic to avoid both issues, by simply moving the
logic to check (and possibly take) the bit lock into the wakeup path
instead.

That makes everything much more straightforward, and means that we never
need to re-queue the wait entry: if we get woken up, we'll be notified
through WQ_FLAG_WOKEN, and the wait queue entry will have been removed,
and everything will have been done for us.

Link: https://lore.kernel.org/lkml/CAHk-=wjJA2Z3kUFb-5s=6+n0qbTs8ELqKFt9B3pH85a8fGD73w@mail.gmail.com/
Link: https://lore.kernel.org/lkml/alpine.LSU.2.11.2007221359450.1017@eggly.anvils/
Reported-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent bcf87687
...@@ -1002,6 +1002,7 @@ struct wait_page_queue { ...@@ -1002,6 +1002,7 @@ struct wait_page_queue {
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{ {
int ret;
struct wait_page_key *key = arg; struct wait_page_key *key = arg;
struct wait_page_queue *wait_page struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait); = container_of(wait, struct wait_page_queue, wait);
...@@ -1014,17 +1015,40 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, ...@@ -1014,17 +1015,40 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return 0; return 0;
/* /*
* Stop walking if it's locked. * If it's an exclusive wait, we get the bit for it, and
* Is this safe if put_and_wait_on_page_locked() is in use? * stop walking if we can't.
* Yes: the waker must hold a reference to this page, and if PG_locked *
* has now already been set by another task, that task must also hold * If it's a non-exclusive wait, then the fact that this
* a reference to the *same usage* of this page; so there is no need * wake function was called means that the bit already
* to walk on to wake even the put_and_wait_on_page_locked() callers. * was cleared, and we don't care if somebody then
*/ * re-took it.
if (test_bit(key->bit_nr, &key->page->flags)) */
ret = 0;
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
if (test_and_set_bit(key->bit_nr, &key->page->flags))
return -1; return -1;
ret = 1;
}
wait->flags |= WQ_FLAG_WOKEN;
wake_up_state(wait->private, mode);
return autoremove_wake_function(wait, mode, sync, key); /*
* Ok, we have successfully done what we're waiting for,
* and we can unconditionally remove the wait entry.
*
* Note that this has to be the absolute last thing we do,
* since after list_del_init(&wait->entry) the wait entry
* might be de-allocated and the process might even have
* exited.
*
* We _really_ should have a "list_del_init_careful()" to
* properly pair with the unlocked "list_empty_careful()"
* in finish_wait().
*/
smp_mb();
list_del_init(&wait->entry);
return ret;
} }
static void wake_up_page_bit(struct page *page, int bit_nr) static void wake_up_page_bit(struct page *page, int bit_nr)
...@@ -1103,16 +1127,31 @@ enum behavior { ...@@ -1103,16 +1127,31 @@ enum behavior {
*/ */
}; };
/*
 * Try to satisfy a page-bit wait without sleeping.
 *
 * A waiter with WQ_FLAG_EXCLUSIVE set must atomically acquire the bit
 * (test_and_set_bit); any other waiter only needs to observe the bit
 * clear (test_bit).
 *
 * Returns true and marks the wait entry with WQ_FLAG_WOKEN on success.
 * Returns false, leaving wait->flags unmodified, if the bit is still
 * held by someone else.
 */
static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
					struct wait_queue_entry *wait)
{
	bool bit_busy;

	if (wait->flags & WQ_FLAG_EXCLUSIVE)
		bit_busy = test_and_set_bit(bit_nr, &page->flags);
	else
		bit_busy = test_bit(bit_nr, &page->flags);

	if (bit_busy)
		return false;

	wait->flags |= WQ_FLAG_WOKEN;
	return true;
}
static inline int wait_on_page_bit_common(wait_queue_head_t *q, static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior) struct page *page, int bit_nr, int state, enum behavior behavior)
{ {
struct wait_page_queue wait_page; struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait; wait_queue_entry_t *wait = &wait_page.wait;
bool bit_is_set;
bool thrashing = false; bool thrashing = false;
bool delayacct = false; bool delayacct = false;
unsigned long pflags; unsigned long pflags;
int ret = 0;
if (bit_nr == PG_locked && if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) { !PageUptodate(page) && PageWorkingset(page)) {
...@@ -1130,48 +1169,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, ...@@ -1130,48 +1169,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
wait_page.page = page; wait_page.page = page;
wait_page.bit_nr = bit_nr; wait_page.bit_nr = bit_nr;
for (;;) { /*
* Do one last check whether we can get the
* page bit synchronously.
*
* Do the SetPageWaiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
* page queue), and add ourselves to the wait
* queue if we need to sleep.
*
* This part needs to be done under the queue
* lock to avoid races.
*/
spin_lock_irq(&q->lock); spin_lock_irq(&q->lock);
if (likely(list_empty(&wait->entry))) {
__add_wait_queue_entry_tail(q, wait);
SetPageWaiters(page); SetPageWaiters(page);
} if (!trylock_page_bit_common(page, bit_nr, wait))
__add_wait_queue_entry_tail(q, wait);
set_current_state(state);
spin_unlock_irq(&q->lock); spin_unlock_irq(&q->lock);
bit_is_set = test_bit(bit_nr, &page->flags); /*
* From now on, all the logic will be based on
* the WQ_FLAG_WOKEN flag, and the page
* bit testing (and setting) will be - or has
* already been - done by the wake function.
*
* We can drop our reference to the page.
*/
if (behavior == DROP) if (behavior == DROP)
put_page(page); put_page(page);
if (likely(bit_is_set)) for (;;) {
io_schedule(); set_current_state(state);
if (behavior == EXCLUSIVE) { if (signal_pending_state(state, current))
if (!test_and_set_bit_lock(bit_nr, &page->flags))
break;
} else if (behavior == SHARED) {
if (!test_bit(bit_nr, &page->flags))
break; break;
}
if (signal_pending_state(state, current)) { if (wait->flags & WQ_FLAG_WOKEN)
ret = -EINTR;
break; break;
}
if (behavior == DROP) { io_schedule();
/*
* We can no longer safely access page->flags:
* even if CONFIG_MEMORY_HOTREMOVE is not enabled,
* there is a risk of waiting forever on a page reused
* for something that keeps it locked indefinitely.
* But best check for -EINTR above before breaking.
*/
break;
}
} }
finish_wait(q, wait); finish_wait(q, wait);
...@@ -1190,7 +1228,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, ...@@ -1190,7 +1228,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
* bother with signals either. * bother with signals either.
*/ */
return ret; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
} }
void wait_on_page_bit(struct page *page, int bit_nr) void wait_on_page_bit(struct page *page, int bit_nr)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment