Commit 72e315f7 authored by Hugh Dickins, committed by Andrew Morton

mempolicy: mmap_lock is not needed while migrating folios

mbind(2) holds down_write of current task's mmap_lock throughout
(exclusive because it needs to set the new mempolicy on the vmas);
migrate_pages(2) holds down_read of pid's mmap_lock throughout.
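
In outline, the locking regime described above is (a simplified sketch of the existing do_mbind() and do_migrate_pages() paths, not the exact code):

	/* mbind(2): exclusive lock held across policy update and migration */
	mmap_write_lock(mm);
	/* ... queue pages, apply the new mempolicy to vmas, migrate_pages() ... */
	mmap_write_unlock(mm);

	/* migrate_pages(2): shared lock held across all node-to-node migrations */
	mmap_read_lock(mm);
	/* ... for each source node: migrate_to_node(mm, source, dest, flags) ... */
	mmap_read_unlock(mm);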

They both hold mmap_lock across the internal migrate_pages(), under which
all new page allocations (huge or small) are made.  I'm nervous about it;
and migrate_pages() certainly does not need mmap_lock itself.  It's done
this way for mbind(2), because its page allocator is vma_alloc_folio() or
alloc_hugetlb_folio_vma(), both of which depend on vma and address.
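
For reference, the vma-based allocators mentioned here have roughly these shapes in the tree before this change (alloc_hugetlb_folio_vma() is the declaration removed from hugetlb.h below):

	/* both need a vma and a user address to resolve the mempolicy */
	struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
				unsigned long addr, bool hugepage);
	struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
				struct vm_area_struct *vma, unsigned long address);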

Now that we have alloc_pages_mpol(), depending on (refcounted) memory
policy and interleave index, mbind(2) can be modified to use that or
alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the
internal migrate_pages() at all: add alloc_migration_target_by_mpol() to
replace mbind's new_page().
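
In sketch form, the difference is that the new callback needs only the (refcounted) policy and an interleave index derived from the folio, not a vma and address (the full alloc_migration_target_by_mpol() is in the mm/mempolicy.c hunk below):

	/* old: locate a vma and address for the folio being migrated */
	folio = vma_alloc_folio(gfp, folio_order(src), vma, address,
				folio_test_large(src));

	/* new: allocate from mempolicy and interleave index alone */
	ilx = src->index >> folio_order(src);
	page = alloc_pages_mpol(gfp, folio_order(src), pol, ilx, numa_node_id());
	folio = page_rmappable_folio(page);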

(After that change, alloc_hugetlb_folio_vma() is used by nothing but a
userfaultfd function: move it out of hugetlb.h and into the #ifdef.)

migrate_pages(2) has chosen its target node before migrating, so can
continue to use the standard alloc_migration_target(); but let it take and
drop mmap_lock just around migrate_to_node()'s queue_pages_range():
neither the node-to-node calculations nor the page migrations need it.
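
That narrows migrate_to_node()'s read-locked region to just the page-queueing walk, roughly (see the corresponding hunks below):

	mmap_read_lock(mm);
	vma = find_vma(mm, 0);
	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
	mmap_read_unlock(mm);
	/* migrate_pages(&pagelist, alloc_migration_target, ...) then runs unlocked */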

It seems unlikely, but it is conceivable that some userspace depends on
the kernel's mmap_lock exclusion here, instead of doing its own locking:
more likely in a testsuite than in real life.  It is also possible, of
course, that some pages on the list will be munmapped by another thread
before they are migrated, or a newer memory policy applied to the range by
that time: but such races could happen before, as soon as mmap_lock was
dropped, so it does not appear to be a concern.

Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent ddc1a5cb
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					struct vm_area_struct *vma,
-					unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-		unsigned long address)
-{
-	struct mempolicy *mpol;
-	nodemask_t *nodemask;
-	struct folio *folio;
-	gfp_t gfp_mask;
-	int node;
-
-	gfp_mask = htlb_alloc_mask(h);
-	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -6559,6 +6541,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_USERFAULTFD
+/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long address)
+{
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	struct folio *folio;
+	gfp_t gfp_mask;
+	int node;
+
+	gfp_mask = htlb_alloc_mask(h);
+	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+	mpol_cond_put(mpol);
+
+	return folio;
+}
+
 /*
  * Used by userfaultfd UFFDIO_* ioctls.  Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 				unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				pgoff_t ilx, int *nid);
 
 static bool strictly_unmovable(unsigned long flags)
 {
@@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	node_set(source, nmask);
 
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+	mmap_read_lock(mm);
 	vma = find_vma(mm, 0);
 
 	/*
@@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	 */
 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	mmap_read_unlock(mm);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 	lru_cache_disable();
 
-	mmap_read_lock(mm);
-
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 		if (err < 0)
 			break;
 	}
-	mmap_read_unlock(mm);
 	lru_cache_enable();
 	if (err < 0)
@@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 }
 
 /*
- * Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
- * Search forward from there, if not.  N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
  */
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
-	struct vm_area_struct *vma;
-	unsigned long address;
-	VMA_ITERATOR(vmi, current->mm, start);
-	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
+	struct mempolicy *pol = (struct mempolicy *)private;
+	pgoff_t ilx = 0;	/* improve on this later */
+	struct page *page;
+	unsigned int order;
+	int nid = numa_node_id();
+	gfp_t gfp;
 
-	for_each_vma(vmi, vma) {
-		address = page_address_in_vma(&src->page, vma);
-		if (address != -EFAULT)
-			break;
-	}
-
-	/*
-	 * __get_vma_policy() now expects a genuine non-NULL vma.  Return NULL
-	 * when the page can no longer be located in a vma: that is not ideal
-	 * (migrate_pages() will give up early, presuming ENOMEM), but good
-	 * enough to avoid a crash by syzkaller or concurrent holepunch.
-	 */
-	if (!vma)
-		return NULL;
+	order = folio_order(src);
+	ilx += src->index >> order;
 
 	if (folio_test_hugetlb(src)) {
-		return alloc_hugetlb_folio_vma(folio_hstate(src),
-				vma, address);
+		nodemask_t *nodemask;
+		struct hstate *h;
+
+		h = folio_hstate(src);
+		gfp = htlb_alloc_mask(h);
+		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
 	}
 
 	if (folio_test_large(src))
 		gfp = GFP_TRANSHUGE;
+	else
+		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
-	return vma_alloc_folio(gfp, folio_order(src), vma, address,
-			folio_test_large(src));
+	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
+	return page_rmappable_folio(page);
 }
 #else
 
@@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
 	return NULL;
 }
@@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (nr_failed < 0) {
 			err = nr_failed;
+			nr_failed = 0;
 		} else {
 			vma_iter_init(&vmi, mm, start);
 			prev = vma_prev(&vmi);
@@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len,
 		}
 	}
 
-	if (!err) {
-		if (!list_empty(&pagelist)) {
-			nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
-				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
+	mmap_write_unlock(mm);
+
+	if (!err && !list_empty(&pagelist)) {
+		/* Convert MPOL_DEFAULT's NULL to task or default policy */
+		if (!new) {
+			new = get_task_policy(current);
+			mpol_get(new);
 		}
-		if (nr_failed && (flags & MPOL_MF_STRICT))
-			err = -EIO;
+		nr_failed |= migrate_pages(&pagelist,
+				alloc_migration_target_by_mpol, NULL,
+				(unsigned long)new, MIGRATE_SYNC,
+				MR_MEMPOLICY_MBIND, NULL);
 	}
 
+	if (nr_failed && (flags & MPOL_MF_STRICT))
+		err = -EIO;
 	if (!list_empty(&pagelist))
 		putback_movable_pages(&pagelist);
-	mmap_write_unlock(mm);
 
 mpol_out:
 	mpol_put(new);
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))