Commit 70e806e4, authored by Peter Xu, committed by Linus Torvalds

mm: Do early cow for pinned pages during fork() for ptes

This allows copy_pte_range() to do early cow if the pages were pinned on
the source mm.

Currently we don't have an accurate way to know whether a page is pinned
or not.  The only thing we have is page_maybe_dma_pinned().  However,
that's good enough for now, especially with the newly added
mm->has_pinned flag, which makes sure we won't affect processes that
never pinned any pages.

It would be easier if we could do a GFP_KERNEL allocation within
copy_one_pte().  Unfortunately we can't, because the page table locks
are held for both the parent and child processes at that point.  So the
page allocation needs to be done outside copy_one_pte().

There is some trickery in copy_present_page(), mainly the wrprotect
trick to block concurrent fast-gup.  The comments in the function
explain it in place.

Oleg Nesterov reported a (probably harmless) bug during review: we
didn't reset entry.val properly in copy_pte_range(), so there is
potentially a chance of calling add_swap_count_continuation() multiple
times on the same swp entry.  However, that should be harmless, since
even if it happens, add_swap_count_continuation() will return directly
once it notices that there is already enough space for the swp counter.
So instead of a standalone stable patch, it is touched up in this patch
directly.

Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7a4830c3
...@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 0; return 0;
} }
static inline void /*
* Copy a present and normal page if necessary.
*
* NOTE! The usual case is that this doesn't need to do
* anything, and can just return a positive value. That
* will let the caller know that it can just increase
* the page refcount and re-use the pte the traditional
* way.
*
* But _if_ we need to copy it because it needs to be
* pinned in the parent (and the child should get its own
* copy rather than just a reference to the same page),
* we'll do that here and return zero to let the caller
* know we're done.
*
* And if we need a pre-allocated page but don't yet have
* one, return a negative error to let the preallocation
* code know so that it can do so outside the page table
* lock.
*/
/*
 * copy_present_page() - decide whether a present page must be copied
 * for the child right now, and do the copy if a page is available.
 *
 * @dst_mm, @dst_pte: mm and pte slot that receive the new mapping when
 *                    a copy is made.
 * @src_mm, @src_pte: mm and pte slot of the mapping being duplicated;
 *                    the source pte may be write-protected here.
 * @vma:      VMA whose vm_flags decide whether this is a COW mapping;
 *            also passed to copy_user_highpage().
 * @new:      VMA the copied page is mapped into; its vm_page_prot is
 *            used to build the destination pte.
 * @addr:     virtual address of the pte being handled.
 * @rss:      counter array indexed by mm_counter(); bumped for the new
 *            page when a copy is made.
 * @prealloc: in/out pointer to a pre-allocated page.  Set to NULL when
 *            the page is consumed by this function.
 * @pte:      the pte value the caller read from @src_pte.
 * @page:     the page mapped by @pte.
 *
 * Return: 1 if no copy is needed and the caller should share the page,
 * 0 if the page was copied and the destination pte has been installed,
 * -EAGAIN if a copy is needed but *@prealloc is NULL.
 */
static inline int
copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte,
struct vm_area_struct *vma, struct vm_area_struct *new,
unsigned long addr, int *rss, struct page **prealloc,
pte_t pte, struct page *page)
{
struct page *new_page;
/* Non-COW mappings are never copied here; let the caller share. */
if (!is_cow_mapping(vma->vm_flags))
return 1;
/*
 * The trick starts.
 *
 * What we want to do is to check whether this page may
 * have been pinned by the parent process.  If so,
 * instead of write-protecting the pte on both sides, we
 * copy the page immediately so that we always guarantee
 * the pinned page won't be randomly replaced in the
 * future.
 *
 * To achieve this, we do the following:
 *
 * 1. Write-protect the pte if it's writable.  This is
 *    to protect against concurrent write fast-gup with
 *    FOLL_PIN, so that the fast-gup fails once the
 *    write bit is removed.
 *
 * 2. Check page_maybe_dma_pinned() to see whether this
 *    page may have been pinned.
 *
 * The order of these steps is important to serialize
 * against the fast-gup code (gup_pte_range()) on the
 * pte check and try_grab_compound_head(), so that
 * either we observe that fast-gup and copy the pinned
 * page here, or that fast-gup fails.
 *
 * NOTE! Even if we don't end up copying the page,
 * we won't undo this wrprotect(), because the normal
 * reference copy will need it anyway.
 */
if (pte_write(pte))
ptep_set_wrprotect(src_mm, addr, src_pte);
/*
 * These are the "normally we can just copy by reference"
 * checks.  Both must be done after the write-protect above;
 * the mm-wide has_pinned flag is tested first, then the
 * per-page check.
 */
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
return 1;
/*
 * Uhhuh. It looks like the page might be a pinned page,
 * and we actually need to copy it. Now we can set the
 * source pte back to being writable.
 *
 * Note that this restore happens before the prealloc check
 * below, so the -EAGAIN path also returns with the source
 * pte rewritten to the value the caller originally read.
 */
if (pte_write(pte))
set_pte_at(src_mm, addr, src_pte, pte);
new_page = *prealloc;
/* No page to copy into yet: ask the caller to allocate one and retry. */
if (!new_page)
return -EAGAIN;
/*
 * We have a prealloc page, all good! Take it
 * over and copy the page & arm it.  Clearing *prealloc
 * tells the caller the page has been consumed.
 */
*prealloc = NULL;
copy_user_highpage(new_page, page, addr, vma);
__SetPageUptodate(new_page);
/* The copy is registered against the child VMA (@new), not @vma. */
page_add_new_anon_rmap(new_page, new, addr, false);
lru_cache_add_inactive_or_unevictable(new_page, new);
rss[mm_counter(new_page)]++;
/*
 * All done, just insert the new page copy in the child.
 * The pte is built from the child VMA's protection, marked
 * dirty, and then passed through maybe_mkwrite() with @new.
 */
pte = mk_pte(new_page, new->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), new);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
/*
* Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
* is required to copy this pte.
*/
static inline int
copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss) struct vm_area_struct *new,
unsigned long addr, int *rss, struct page **prealloc)
{ {
unsigned long vm_flags = vma->vm_flags; unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte; pte_t pte = *src_pte;
struct page *page; struct page *page;
page = vm_normal_page(vma, addr, pte);
if (page) {
int retval;
retval = copy_present_page(dst_mm, src_mm,
dst_pte, src_pte,
vma, new,
addr, rss, prealloc,
pte, page);
if (retval <= 0)
return retval;
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
/* /*
* If it's a COW mapping, write protect it both * If it's a COW mapping, write protect it both
* in the parent and the child * in the parent and the child
...@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!(vm_flags & VM_UFFD_WP)) if (!(vm_flags & VM_UFFD_WP))
pte = pte_clear_uffd_wp(pte); pte = pte_clear_uffd_wp(pte);
page = vm_normal_page(vma, addr, pte); set_pte_at(dst_mm, addr, dst_pte, pte);
if (page) { return 0;
get_page(page); }
page_dup_rmap(page, false);
rss[mm_counter(page)]++; static inline struct page *
page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
unsigned long addr)
{
struct page *new_page;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
if (!new_page)
return NULL;
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
put_page(new_page);
return NULL;
} }
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
set_pte_at(dst_mm, addr, dst_pte, pte); return new_page;
} }
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
...@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *orig_src_pte, *orig_dst_pte; pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte; pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl; spinlock_t *src_ptl, *dst_ptl;
int progress = 0; int progress, ret = 0;
int rss[NR_MM_COUNTERS]; int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0}; swp_entry_t entry = (swp_entry_t){0};
struct page *prealloc = NULL;
again: again:
progress = 0;
init_rss_vec(rss); init_rss_vec(rss);
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
if (!dst_pte) if (!dst_pte) {
return -ENOMEM; ret = -ENOMEM;
goto out;
}
src_pte = pte_offset_map(src_pmd, addr); src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd); src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
...@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
progress += 8; progress += 8;
continue; continue;
} }
copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, /* copy_present_pte() will clear `*prealloc' if consumed */
vma, addr, rss); ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
vma, new, addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
*/
if (unlikely(ret == -EAGAIN))
break;
if (unlikely(prealloc)) {
/*
* pre-alloc page cannot be reused by next time so as
* to strictly follow mempolicy (e.g., alloc_page_vma()
* will allocate page according to address). This
* could only happen if one pinned pte changed.
*/
put_page(prealloc);
prealloc = NULL;
}
progress += 8; progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
...@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
cond_resched(); cond_resched();
if (entry.val) { if (entry.val) {
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
} else if (ret) {
WARN_ON_ONCE(ret != -EAGAIN);
prealloc = page_copy_prealloc(src_mm, vma, addr);
if (!prealloc)
return -ENOMEM; return -ENOMEM;
progress = 0; /* We've captured and resolved the error. Reset, try again. */
ret = 0;
} }
if (addr != end) if (addr != end)
goto again; goto again;
return 0; out:
if (unlikely(prealloc))
put_page(prealloc);
return ret;
} }
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment