Commit 1235ccd0 authored by Suren Baghdasaryan, committed by Andrew Morton

mm: handle swap page faults under per-VMA lock

When page fault is handled under per-VMA lock protection, all swap page
faults are retried with mmap_lock because folio_lock_or_retry has to drop
and reacquire mmap_lock if folio could not be immediately locked.  Follow
the same pattern as mmap_lock to drop per-VMA lock when waiting for folio
and retrying once folio is available.

With this obstacle removed, enable do_swap_page to operate under per-VMA
lock protection.  Drivers implementing ops->migrate_to_ram might still
rely on mmap_lock, therefore we have to fall back to mmap_lock in that
particular case.

Note that the only time do_swap_page calls synchronous swap_readpage is
when SWP_SYNCHRONOUS_IO is set, which is only set for
QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and pmem).
Therefore we don't sleep in this path, and there's no need to drop the
mmap or per-VMA lock.

Link: https://lkml.kernel.org/r/20230630211957.1341547-6-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hillf Danton <hdanton@sina.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Laurent Dufour <ldufour@linux.ibm.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michel Lespinasse <michel@lespinasse.org>
Cc: Minchan Kim <minchan@google.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Punit Agrawal <punit.agrawal@bytedance.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent fdc724d6
...@@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) ...@@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached)
vma->detached = detached; vma->detached = detached;
} }
/**
 * release_fault_lock - drop the lock the current page fault is running under
 * @vmf: fault descriptor; its flags select which lock is released
 *
 * If FAULT_FLAG_VMA_LOCK is set in @vmf->flags, the per-VMA read lock on
 * @vmf->vma is ended with vma_end_read(); otherwise the mmap_lock of the
 * VMA's mm is read-unlocked.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
	/* Fault was not taken under the per-VMA lock: release mmap_lock. */
	if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
		mmap_read_unlock(vmf->vma->vm_mm);
		return;
	}

	vma_end_read(vmf->vma);
}
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
unsigned long address); unsigned long address);
...@@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ...@@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
return NULL; return NULL;
} }
/*
 * release_fault_lock - drop the lock the current page fault is running under.
 *
 * This variant unconditionally read-unlocks the mmap_lock of the mm that
 * owns vmf->vma; it never inspects vmf->flags.
 * NOTE(review): presumably this is the !CONFIG_PER_VMA_LOCK stub, since it
 * sits just before the closing #endif -- confirm against the full header.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
	struct mm_struct *mm = vmf->vma->vm_mm;

	mmap_read_unlock(mm);
}
#endif /* CONFIG_PER_VMA_LOCK */ #endif /* CONFIG_PER_VMA_LOCK */
extern const struct vm_operations_struct vma_dummy_vm_ops; extern const struct vm_operations_struct vma_dummy_vm_ops;
......
...@@ -1671,27 +1671,26 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) ...@@ -1671,27 +1671,26 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
* Return values: * Return values:
* 0 - folio is locked. * 0 - folio is locked.
* non-zero - folio is not locked. * non-zero - folio is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
* which case mmap_lock is still held. * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
* *
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
* with the folio locked and the mmap_lock unperturbed. * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
*/ */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{ {
struct mm_struct *mm = vmf->vma->vm_mm;
unsigned int flags = vmf->flags; unsigned int flags = vmf->flags;
if (fault_flag_allow_retry_first(flags)) { if (fault_flag_allow_retry_first(flags)) {
/* /*
* CAUTION! In this case, mmap_lock is not released * CAUTION! In this case, mmap_lock/per-VMA lock is not
* even though return VM_FAULT_RETRY. * released even though returning VM_FAULT_RETRY.
*/ */
if (flags & FAULT_FLAG_RETRY_NOWAIT) if (flags & FAULT_FLAG_RETRY_NOWAIT)
return VM_FAULT_RETRY; return VM_FAULT_RETRY;
mmap_read_unlock(mm); release_fault_lock(vmf);
if (flags & FAULT_FLAG_KILLABLE) if (flags & FAULT_FLAG_KILLABLE)
folio_wait_locked_killable(folio); folio_wait_locked_killable(folio);
else else
...@@ -1703,7 +1702,7 @@ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) ...@@ -1703,7 +1702,7 @@ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
ret = __folio_lock_killable(folio); ret = __folio_lock_killable(folio);
if (ret) { if (ret) {
mmap_read_unlock(mm); release_fault_lock(vmf);
return VM_FAULT_RETRY; return VM_FAULT_RETRY;
} }
} else { } else {
......
...@@ -3746,12 +3746,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ...@@ -3746,12 +3746,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!pte_unmap_same(vmf)) if (!pte_unmap_same(vmf))
goto out; goto out;
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
ret = VM_FAULT_RETRY;
vma_end_read(vma);
goto out;
}
entry = pte_to_swp_entry(vmf->orig_pte); entry = pte_to_swp_entry(vmf->orig_pte);
if (unlikely(non_swap_entry(entry))) { if (unlikely(non_swap_entry(entry))) {
if (is_migration_entry(entry)) { if (is_migration_entry(entry)) {
...@@ -3761,6 +3755,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ...@@ -3761,6 +3755,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->page = pfn_swap_entry_to_page(entry); vmf->page = pfn_swap_entry_to_page(entry);
ret = remove_device_exclusive_entry(vmf); ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) { } else if (is_device_private_entry(entry)) {
if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
/*
* migrate_to_ram is not yet ready to operate
* under VMA lock.
*/
vma_end_read(vma);
ret = VM_FAULT_RETRY;
goto out;
}
vmf->page = pfn_swap_entry_to_page(entry); vmf->page = pfn_swap_entry_to_page(entry);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl); vmf->address, &vmf->ptl);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment