Commit af53d3e9 authored by Hugh Dickins, committed by Linus Torvalds

mm: swapoff: shmem_unuse() stop eviction without igrab()

The igrab() in shmem_unuse() looks good, but we forgot that it gives no
protection against concurrent unmounting: a point made by Konstantin
Khlebnikov eight years ago, and then fixed in 2.6.39 by 778dd893
("tmpfs: fix race between umount and swapoff").  The current 5.1-rc
swapoff is liable to hit "VFS: Busy inodes after unmount of tmpfs.
Self-destruct in 5 seconds.  Have a nice day..." followed by GPF.

Once again, give up on using igrab(); but don't go back to making such
heavy-handed use of shmem_swaplist_mutex as last time: that would spoil
the new design, and I expect could deadlock inside shmem_swapin_page().

Instead, have shmem_unuse() just raise a "stop_eviction" count in the
shmem-specific inode, and have shmem_evict_inode() wait for that to go
down to 0.  Call it "stop_eviction" rather than "swapoff_busy" because
it can be put to other uses later (the huge tmpfs patches expect to use it).
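The handshake is easy to see outside the kernel: below is a minimal
userspace sketch of the same pattern, assuming only C11 atomics and
pthreads (the names, and the condvar standing in for the kernel's
wait_var_event()/wake_up_var(), are illustrative, not from the patch).

    /* build: cc -pthread stop_eviction_demo.c */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_int stop_eviction;        /* plays info->stop_eviction */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t var = PTHREAD_COND_INITIALIZER;

    /* plays shmem_evict_inode(): block until no swapoff holds the inode */
    static void *evict(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            while (atomic_load(&stop_eviction))     /* wait_var_event() */
                    pthread_cond_wait(&var, &lock);
            pthread_mutex_unlock(&lock);
            puts("eviction may proceed");
            return NULL;
    }

    int main(void)
    {
            pthread_t evictor;

            /* shmem_unuse(): atomic_inc() before dropping the swaplist mutex */
            atomic_fetch_add(&stop_eviction, 1);
            pthread_create(&evictor, NULL, evict, NULL);
            sleep(1);                               /* scanning the inode for swap */

            /* atomic_dec_and_test() + wake_up_var(); unlike the kernel's
             * var-wait queue, a condvar needs its mutex held for the wakeup */
            pthread_mutex_lock(&lock);
            if (atomic_fetch_sub(&stop_eviction, 1) == 1)
                    pthread_cond_broadcast(&var);
            pthread_mutex_unlock(&lock);

            pthread_join(evictor, NULL);
            return 0;
    }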

That simplifies shmem_unuse(), protecting it from both unlink and
unmount; and in practice lets it locate all the swap on its first try.
But do not rely on that: there's still a theoretical case, when
shmem_writepage() might have been preempted after its get_swap_page(),
before making the swap entry visible to swapoff.
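
Concretely, the interleaving that the retry loop in try_to_unuse()
still has to cover looks like this (schematic, timing hypothetical):

    shmem_writepage()                     swapoff: shmem_unuse()
    -----------------                     ----------------------
    swap = get_swap_page(page);
      ...preempted...                     scans this inode; the new swap
                                          entry is not yet visible, so
                                          the scan finds nothing to unuse
    entry made visible (stored in
    the mapping in place of the page)
                                          too late for this pass:
                                          try_to_unuse() sees
                                          si->inuse_pages != 0 and retries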

[hughd@google.com: remove incorrect list_del()]
  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904091133570.1898@eggly.anvils
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1904081259400.1523@eggly.anvils
Fixes: b56a2d8a ("mm: rid swapoff of quadratic complexity")
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Alex Xu (Hello71)" <alex_y_xu@yahoo.ca>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kelley Nielsen <kelleynnn@gmail.com>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vineeth Pillai <vpillai@digitalocean.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 64165b1a
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -21,6 +21,7 @@ struct shmem_inode_info {
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
+	atomic_t		stop_eviction;	/* hold when working on inode */
 	struct inode		vfs_inode;
 };
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode)
 			}
 			spin_unlock(&sbinfo->shrinklist_lock);
 		}
-		if (!list_empty(&info->swaplist)) {
+		while (!list_empty(&info->swaplist)) {
+			/* Wait while shmem_unuse() is scanning this inode... */
+			wait_var_event(&info->stop_eviction,
+				       !atomic_read(&info->stop_eviction));
 			mutex_lock(&shmem_swaplist_mutex);
-			list_del_init(&info->swaplist);
+			/* ...but beware of the race if we peeked too early */
+			if (!atomic_read(&info->stop_eviction))
+				list_del_init(&info->swaplist);
 			mutex_unlock(&shmem_swaplist_mutex);
 		}
 	}
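
Why the re-check under shmem_swaplist_mutex: shmem_evict_inode() may
have sampled stop_eviction as 0 in wait_var_event() just before
shmem_unuse() raised it again under the mutex. A hypothetical
interleaving:

    shmem_evict_inode()                   shmem_unuse()
    -------------------                   -------------
    wait_var_event() sees 0, returns
                                          mutex_lock(&shmem_swaplist_mutex);
                                          atomic_inc(&info->stop_eviction);
                                          mutex_unlock(&shmem_swaplist_mutex);
    mutex_lock(&shmem_swaplist_mutex);
    atomic_read() now sees 1: skip the
    list_del_init(), loop and wait again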
@@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap,
 		       unsigned long *fs_pages_to_unuse)
 {
 	struct shmem_inode_info *info, *next;
-	struct inode *inode;
-	struct inode *prev_inode = NULL;
 	int error = 0;
 
 	if (list_empty(&shmem_swaplist))
 		return 0;
 
 	mutex_lock(&shmem_swaplist_mutex);
-
-	/*
-	 * The extra refcount on the inode is necessary to safely dereference
-	 * p->next after re-acquiring the lock. New shmem inodes with swap
-	 * get added to the end of the list and we will scan them all.
-	 */
 	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
 		if (!info->swapped) {
 			list_del_init(&info->swaplist);
 			continue;
 		}
-
-		inode = igrab(&info->vfs_inode);
-		if (!inode)
-			continue;
-
+		/*
+		 * Drop the swaplist mutex while searching the inode for swap;
+		 * but before doing so, make sure shmem_evict_inode() will not
+		 * remove placeholder inode from swaplist, nor let it be freed
+		 * (igrab() would protect from unlink, but not from unmount).
+		 */
+		atomic_inc(&info->stop_eviction);
 		mutex_unlock(&shmem_swaplist_mutex);
-		if (prev_inode)
-			iput(prev_inode);
-		prev_inode = inode;
 
-		error = shmem_unuse_inode(inode, type, frontswap,
+		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
 					  fs_pages_to_unuse);
 		cond_resched();
 
@@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap,
 		mutex_lock(&shmem_swaplist_mutex);
 		next = list_next_entry(info, swaplist);
 		if (!info->swapped)
 			list_del_init(&info->swaplist);
+		if (atomic_dec_and_test(&info->stop_eviction))
+			wake_up_var(&info->stop_eviction);
 		if (error)
 			break;
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (prev_inode)
-		iput(prev_inode);
-
 	return error;
 }
@@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 		info = SHMEM_I(inode);
 		memset(info, 0, (char *)inode - (char *)info);
 		spin_lock_init(&info->lock);
+		atomic_set(&info->stop_eviction, 0);
 		info->seals = F_SEAL_SEAL;
 		info->flags = flags & VM_NORESERVE;
 		INIT_LIST_HEAD(&info->shrinklist);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2116,12 +2116,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
 	 * Under global memory pressure, swap entries can be reinserted back
 	 * into process space after the mmlist loop above passes over them.
 	 *
-	 * Limit the number of retries? No: when shmem_unuse()'s igrab() fails,
-	 * a shmem inode using swap is being evicted; and when mmget_not_zero()
-	 * above fails, that mm is likely to be freeing swap from exit_mmap().
-	 * Both proceed at their own independent pace: we could move them to
-	 * separate lists, and wait for those lists to be emptied; but it's
-	 * easier and more robust (though cpu-intensive) just to keep retrying.
+	 * Limit the number of retries? No: when mmget_not_zero() above fails,
+	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+	 * at its own independent pace; and even shmem_writepage() could have
+	 * been preempted after get_swap_page(), temporarily hiding that swap.
+	 * It's easy and robust (though cpu-intensive) just to keep retrying.
 	 */
 	if (si->inuse_pages) {
 		if (!signal_pending(current))