Commit 3800a713 authored by Linus Torvalds

Merge tag 'mm-hotfixes-stable-2022-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull last (?) hotfixes from Andrew Morton:
 "26 hotfixes.

  8 are for issues which were introduced during this -rc cycle, 18 are
  for earlier issues, and are cc:stable"

* tag 'mm-hotfixes-stable-2022-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits)
  x86/uaccess: avoid check_object_size() in copy_from_user_nmi()
  mm/page_isolation: fix isolate_single_pageblock() isolation behavior
  mm,hwpoison: check mm when killing accessing process
  mm/hugetlb: correct demote page offset logic
  mm: prevent page_frag_alloc() from corrupting the memory
  mm: bring back update_mmu_cache() to finish_fault()
  frontswap: don't call ->init if no ops are registered
  mm/huge_memory: use pfn_to_online_page() in split_huge_pages_all()
  mm: fix madivse_pageout mishandling on non-LRU page
  powerpc/64s/radix: don't need to broadcast IPI for radix pmd collapse flush
  mm: gup: fix the fast GUP race against THP collapse
  mm: fix dereferencing possible ERR_PTR
  vmscan: check folio_test_private(), not folio_get_private()
  mm: fix VM_BUG_ON in __delete_from_swap_cache()
  tools: fix compilation after gfp_types.h split
  mm/damon/dbgfs: fix memory leak when using debugfs_lookup()
  mm/migrate_device.c: copy pte dirty bit to page
  mm/migrate_device.c: add missing flush_cache_page()
  mm/migrate_device.c: flush TLB while holding PTL
  x86/mm: disable instrumentations of mm/pgprot.c
  ...
parents 3a710532 59298997
@@ -937,15 +937,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
 	pmd = *pmdp;
 	pmd_clear(pmdp);

-	/*
-	 * pmdp collapse_flush need to ensure that there are no parallel gup
-	 * walk after this call. This is needed so that we can have stable
-	 * page ref count when collapsing a page. We don't allow a collapse page
-	 * if we have gup taken on the page. We can ensure that by sending IPI
-	 * because gup walk happens with IRQ disabled.
-	 */
-	serialize_against_pte_lookup(vma->vm_mm);
-
 	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

 	return pmd;

@@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
 	 * called from other contexts.
 	 */
 	pagefault_disable();
-	ret = __copy_from_user_inatomic(to, from, n);
+	ret = raw_copy_from_user(to, from, n);
 	pagefault_enable();

 	return ret;

@@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o := n
 KCOV_INSTRUMENT_mem_encrypt.o := n
 KCOV_INSTRUMENT_mem_encrypt_amd.o := n
 KCOV_INSTRUMENT_mem_encrypt_identity.o := n
+KCOV_INSTRUMENT_pgprot.o := n

 KASAN_SANITIZE_mem_encrypt.o := n
 KASAN_SANITIZE_mem_encrypt_amd.o := n
 KASAN_SANITIZE_mem_encrypt_identity.o := n
+KASAN_SANITIZE_pgprot.o := n

 # Disable KCSAN entirely, because otherwise we get warnings that some functions
 # reference __initdata sections.
@@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_mem_encrypt.o = -pg
 CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
 CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
+CFLAGS_REMOVE_pgprot.o = -pg
 endif

 obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \

@@ -2092,7 +2092,8 @@ static bool load_system_files(ntfs_volume *vol)
 	// TODO: Initialize security.
 	/* Get the extended system files' directory inode. */
 	vol->extend_ino = ntfs_iget(sb, FILE_Extend);
-	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+	    !S_ISDIR(vol->extend_ino->i_mode)) {
 		if (!IS_ERR(vol->extend_ino))
 			iput(vol->extend_ino);
 		ntfs_error(sb, "Failed to load $Extend.");

@@ -175,13 +175,13 @@ xfs_dax_notify_failure(
 	u64 ddev_start;
 	u64 ddev_end;

-	if (!(mp->m_sb.sb_flags & SB_BORN)) {
+	if (!(mp->m_super->s_flags & SB_BORN)) {
 		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
 		return -EIO;
 	}

 	if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
-		xfs_warn(mp,
+		xfs_debug(mp,
 			 "notify_failure() not supported on realtime device!");
 		return -EOPNOTSUPP;
 	}
@@ -194,7 +194,7 @@ xfs_dax_notify_failure(
 	}

 	if (!xfs_has_rmapbt(mp)) {
-		xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
+		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
 		return -EOPNOTSUPP;
 	}

@@ -139,6 +139,11 @@ struct dev_pagemap {
 	};
 };

+static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
+{
+	return pgmap->ops && pgmap->ops->memory_failure;
+}
+
 static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
 {
 	if (pgmap->flags & PGMAP_ALTMAP_VALID)

@@ -884,6 +884,7 @@ static int dbgfs_rm_context(char *name)
 	struct dentry *root, *dir, **new_dirs;
 	struct damon_ctx **new_ctxs;
 	int i, j;
+	int ret = 0;

 	if (damon_nr_running_ctxs())
 		return -EBUSY;
@@ -898,14 +899,16 @@ static int dbgfs_rm_context(char *name)
 	new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
 			GFP_KERNEL);
-	if (!new_dirs)
-		return -ENOMEM;
+	if (!new_dirs) {
+		ret = -ENOMEM;
+		goto out_dput;
+	}

 	new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
 			GFP_KERNEL);
 	if (!new_ctxs) {
-		kfree(new_dirs);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out_new_dirs;
 	}

 	for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +928,13 @@ static int dbgfs_rm_context(char *name)
 	dbgfs_ctxs = new_ctxs;
 	dbgfs_nr_ctxs--;

-	return 0;
+	goto out_dput;
+
+out_new_dirs:
+	kfree(new_dirs);
+out_dput:
+	dput(dir);
+	return ret;
 }

 static ssize_t dbgfs_rm_context_write(struct file *file,

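
The dbgfs_rm_context() fix above replaces scattered early returns with a single exit path, so the dentry reference taken by debugfs_lookup() is always dropped. Below is a minimal userspace C sketch of the same centralized goto-unwinding pattern; the resources and names (process(), the file and buffers) are hypothetical stand-ins, not kernel code.

/*
 * Sketch: acquire resources in order, and on any failure jump to the label
 * that releases everything acquired so far, in reverse order. The final
 * label is the one place the first resource (here, the FILE handle playing
 * the role of the looked-up dentry) is released.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int process(const char *name)
{
	int ret = 0;
	char *copy = NULL, *buf = NULL;
	FILE *f = fopen(name, "r");	/* stands in for debugfs_lookup() */

	if (!f)
		return -1;

	copy = malloc(strlen(name) + 1);
	if (!copy) {
		ret = -1;
		goto out_close;		/* nothing else to free yet */
	}
	strcpy(copy, name);

	buf = malloc(4096);
	if (!buf) {
		ret = -1;
		goto out_copy;		/* free in reverse order of acquisition */
	}

	/* ... real work would go here ... */

	free(buf);
out_copy:
	free(copy);
out_close:
	fclose(f);			/* the reference is always dropped here */
	return ret;
}

int main(void)
{
	return process("/etc/hostname") ? EXIT_FAILURE : EXIT_SUCCESS;
}
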
@@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map)
 	 * p->frontswap set to something valid to work properly.
 	 */
 	frontswap_map_set(sis, map);
+
+	if (!frontswap_enabled())
+		return;
 	frontswap_ops->init(type);
 }

@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
 }

 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	struct dev_pagemap *pgmap = NULL;
 	int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			goto pte_unmap;
 		}

-		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+		    unlikely(pte_val(pte) != pte_val(*ptep))) {
 			gup_put_folio(folio, 1, flags);
 			goto pte_unmap;
 		}
@@ -2439,8 +2460,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-			 unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+			 unsigned long end, unsigned int flags,
+			 struct page **pages, int *nr)
 {
 	return 0;
 }
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
 			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
 					 PMD_SHIFT, next, flags, pages, nr))
 				return 0;
-		} else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+		} else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
 			return 0;
 	} while (pmdp++, addr = next, addr != end);

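
The comment added to gup_pte_range() above describes a lockless protocol: the fast-GUP side samples the page-table entry, pins the page, then re-checks that neither the pte nor (now) the pmd changed, while the collapse side clears the pmd first and only then looks at the pin count. The sketch below is a userspace analogy with made-up names (try_pin(), version, pincount), assuming C11 atomics and pthreads; it is not the kernel implementation.

/*
 * Reader: (1) sample a version, (2) take a reference, (3) re-check the
 * version and back off if it changed. Writer: (1) bump the version first,
 * (2) wait until no references remain before tearing the object down.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long version;	/* stands in for the pte/pmd value */
static atomic_int  pincount;	/* stands in for the page refcount */

static int try_pin(void)
{
	long snap = atomic_load(&version);		/* (1) sample           */
	atomic_fetch_add(&pincount, 1);			/* (2) take a reference */
	if (atomic_load(&version) != snap) {		/* (3) re-check         */
		atomic_fetch_sub(&pincount, 1);		/* raced: back off      */
		return 0;
	}
	return 1;					/* pinned safely        */
}

static void unpin(void)
{
	atomic_fetch_sub(&pincount, 1);
}

static void *writer(void *arg)
{
	(void)arg;
	atomic_fetch_add(&version, 1);			/* (1) invalidate first */
	while (atomic_load(&pincount) > 0)		/* (2) wait out pinners */
		;
	/* now safe to tear the object down */
	return NULL;
}

int main(void)
{
	pthread_t t;

	if (try_pin()) {
		puts("pinned");
		unpin();
	}
	pthread_create(&t, NULL, writer, NULL);
	pthread_join(&t, NULL);
	printf("final version %ld, pins %d\n",
	       atomic_load(&version), atomic_load(&pincount));
	return 0;
}
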
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
 		max_zone_pfn = zone_end_pfn(zone);
 		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
 			int nr_pages;
-			if (!pfn_valid(pfn))
-				continue;

-			page = pfn_to_page(pfn);
-			if (!get_page_unless_zero(page))
+			page = pfn_to_online_page(pfn);
+			if (!page || !get_page_unless_zero(page))
 				continue;

 			if (zone != page_zone(page))

@@ -3420,6 +3420,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 {
 	int i, nid = page_to_nid(page);
 	struct hstate *target_hstate;
+	struct page *subpage;
 	int rc = 0;

 	target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3453,15 +3454,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
 	mutex_lock(&target_hstate->resize_lock);
 	for (i = 0; i < pages_per_huge_page(h);
 				i += pages_per_huge_page(target_hstate)) {
+		subpage = nth_page(page, i);
 		if (hstate_is_gigantic(target_hstate))
-			prep_compound_gigantic_page_for_demote(page + i,
+			prep_compound_gigantic_page_for_demote(subpage,
 							target_hstate->order);
 		else
-			prep_compound_page(page + i, target_hstate->order);
-		set_page_private(page + i, 0);
-		set_page_refcounted(page + i);
-		prep_new_huge_page(target_hstate, page + i, nid);
-		put_page(page + i);
+			prep_compound_page(subpage, target_hstate->order);
+		set_page_private(subpage, 0);
+		set_page_refcounted(subpage);
+		prep_new_huge_page(target_hstate, subpage, nid);
+		put_page(subpage);
 	}
 	mutex_unlock(&target_hstate->resize_lock);

@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,

 	pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
 	/*
-	 * After this gup_fast can't run anymore. This also removes
-	 * any huge TLB entry from the CPU so we won't allow
-	 * huge and small TLB entries for the same virtual address
-	 * to avoid the risk of CPU bugs in that area.
+	 * This removes any huge TLB entry from the CPU so we won't allow
+	 * huge and small TLB entries for the same virtual address to
+	 * avoid the risk of CPU bugs in that area.
+	 *
+	 * Parallel fast GUP is fine since fast GUP will back off when
+	 * it detects PMD is changed.
 	 */
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);

@@ -451,8 +451,11 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			continue;
 		}

-		/* Do not interfere with other mappings of this page */
-		if (page_mapcount(page) != 1)
+		/*
+		 * Do not interfere with other mappings of this page and
+		 * non-LRU page.
+		 */
+		if (!PageLRU(page) || page_mapcount(page) != 1)
 			continue;

 		VM_BUG_ON_PAGE(PageTransCompound(page), page);

@@ -345,13 +345,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 * not much we can do. We just print a message and ignore otherwise.
 */

+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
 /*
 * Schedule a process for later kill.
 * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
 *
- * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
- * In other cases, such as anonymous and file-backend page, the address to be
- * killed can be caculated by @p itself.
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
 */
 static void add_to_kill(struct task_struct *tsk, struct page *p,
 			pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
@@ -367,11 +371,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,

 	tk->addr = page_address_in_vma(p, vma);
 	if (is_zone_device_page(p)) {
-		/*
-		 * Since page->mapping is not used for fsdax, we need
-		 * calculate the address based on the vma.
-		 */
-		if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
 			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
 		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
 	} else
@@ -523,7 +523,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 			if (!page_mapped_in_vma(page, vma))
 				continue;
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, 0, vma, to_kill);
+				add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+					    to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -559,7 +560,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 			 * to be informed of all such data corruptions.
 			 */
 			if (vma->vm_mm == t->mm)
-				add_to_kill(t, page, 0, vma, to_kill);
+				add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+					    to_kill);
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -743,6 +745,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
 	};
 	priv.tk.tsk = p;

+	if (!p->mm)
+		return -EFAULT;
+
 	mmap_read_lock(p->mm);
 	ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
 			      (void *)&priv);
@@ -1928,7 +1933,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	 * Call driver's implementation to handle the memory failure, otherwise
 	 * fall back to generic handler.
 	 */
-	if (pgmap->ops->memory_failure) {
+	if (pgmap_has_memory_failure(pgmap)) {
 		rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
 		/*
 		 * Fall back to generic handler too if operation is not

@@ -4386,14 +4386,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 				      vmf->address, &vmf->ptl);
-	ret = 0;
+
 	/* Re-check under ptl */
-	if (likely(!vmf_pte_changed(vmf)))
+	if (likely(!vmf_pte_changed(vmf))) {
 		do_set_pte(vmf, page, vmf->address);
-	else
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, vmf->address, vmf->pte);
+
+		ret = 0;
+	} else {
+		update_mmu_tlb(vma, vmf->address, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
+	}

-	update_mmu_tlb(vma, vmf->address, vmf->pte);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }

@@ -7,6 +7,7 @@
 #include <linux/export.h>
 #include <linux/memremap.h>
 #include <linux/migrate.h>
+#include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
 #include <linux/oom.h>
@@ -193,10 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			bool anon_exclusive;
 			pte_t swp_pte;

+			flush_cache_page(vma, addr, pte_pfn(*ptep));
 			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
 			if (anon_exclusive) {
-				flush_cache_page(vma, addr, pte_pfn(*ptep));
-				ptep_clear_flush(vma, addr, ptep);
+				pte = ptep_clear_flush(vma, addr, ptep);

 				if (page_try_share_anon_rmap(page)) {
 					set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 					goto next;
 				}
 			} else {
-				ptep_get_and_clear(mm, addr, ptep);
+				pte = ptep_get_and_clear(mm, addr, ptep);
 			}

 			migrate->cpages++;

+			/* Set the dirty flag on the folio now the pte is gone. */
+			if (pte_dirty(pte))
+				folio_mark_dirty(page_folio(page));
+
 			/* Setup special migration page table entry */
 			if (mpfn & MIGRATE_PFN_WRITE)
 				entry = make_writable_migration_entry(
@@ -254,13 +259,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 		migrate->dst[migrate->npages] = 0;
 		migrate->src[migrate->npages++] = mpfn;
 	}
-	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(ptep - 1, ptl);

 	/* Only flush the TLB if we actually modified any entries */
 	if (unmapped)
 		flush_tlb_range(walk->vma, start, end);

+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(ptep - 1, ptl);
+
 	return 0;
 }

@@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
 EXPORT_SYMBOL_GPL(fs_reclaim_release);
 #endif

+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+		return read_seqbegin(&zonelist_update_seq);
+
+	return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+	if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+		return read_seqretry(&zonelist_update_seq, seq);
+
+	return seq;
+}
+
 /* Perform direct synchronous page reclaim */
 static unsigned long
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int compaction_retries;
 	int no_progress_loops;
 	unsigned int cpuset_mems_cookie;
+	unsigned int zonelist_iter_cookie;
 	int reserve_flags;

 	/*
@@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 			(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
 		gfp_mask &= ~__GFP_ATOMIC;

-retry_cpuset:
+restart:
 	compaction_retries = 0;
 	no_progress_loops = 0;
 	compact_priority = DEF_COMPACT_PRIORITY;
 	cpuset_mems_cookie = read_mems_allowed_begin();
+	zonelist_iter_cookie = zonelist_iter_begin();

 	/*
 	 * The fast path uses conservative alloc_flags to succeed only until
@@ -5187,9 +5213,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto retry;

-	/* Deal with possible cpuset update races before we start OOM killing */
-	if (check_retry_cpuset(cpuset_mems_cookie, ac))
-		goto retry_cpuset;
+	/*
+	 * Deal with possible cpuset update races or zonelist updates to avoid
+	 * a unnecessary OOM kill.
+	 */
+	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+	    check_retry_zonelist(zonelist_iter_cookie))
+		goto restart;

 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5239,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	}

 nopage:
-	/* Deal with possible cpuset update races before we fail */
-	if (check_retry_cpuset(cpuset_mems_cookie, ac))
-		goto retry_cpuset;
+	/*
+	 * Deal with possible cpuset update races or zonelist updates to avoid
+	 * a unnecessary OOM kill.
+	 */
+	if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+	    check_retry_zonelist(zonelist_iter_cookie))
+		goto restart;

 	/*
 	 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5706,6 +5740,18 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		offset = size - fragsz;
+		if (unlikely(offset < 0)) {
+			/*
+			 * The caller is trying to allocate a fragment
+			 * with fragsz > PAGE_SIZE but the cache isn't big
+			 * enough to satisfy the request, this may
+			 * happen in low memory conditions.
+			 * We don't release the cache page because
+			 * it could make memory pressure worse
+			 * so we simply return NULL here.
+			 */
+			return NULL;
+		}
 	}

 	nc->pagecnt_bias--;
@@ -6514,9 +6560,8 @@ static void __build_all_zonelists(void *data)
 	int nid;
 	int __maybe_unused cpu;
 	pg_data_t *self = data;
-	static DEFINE_SPINLOCK(lock);

-	spin_lock(&lock);
+	write_seqlock(&zonelist_update_seq);

 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6598,7 @@ static void __build_all_zonelists(void *data)
 #endif
 	}

-	spin_unlock(&lock);
+	write_sequnlock(&zonelist_update_seq);
 }

 static noinline void __init

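
The page_alloc.c change above replaces a private spinlock with zonelist_update_seq, a seqlock whose readers never block: they record a sequence cookie, attempt the allocation, and jump back to restart: if check_retry_zonelist() reports that the zonelists were rebuilt in the meantime. A small userspace seqcount sketch of that read-retry scheme follows; the names (read_begin(), read_retry(), write_update()) are made up for illustration and the memory ordering is simplified relative to the kernel's seqlock.

/*
 * Sketch: the writer makes the sequence odd while updating and even again
 * when done; readers sample the sequence, copy the data, and retry if the
 * sequence changed, so they always end up with a consistent snapshot.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;		/* even = stable, odd = write in progress */
static int data_a, data_b;	/* the "zonelist" being rebuilt           */

static unsigned read_begin(void)
{
	unsigned s;

	while ((s = atomic_load(&seq)) & 1)	/* writer active, wait */
		;
	return s;
}

static int read_retry(unsigned start)
{
	return atomic_load(&seq) != start;	/* changed => retry */
}

static void write_update(int a, int b)
{
	atomic_fetch_add(&seq, 1);		/* enter write side (odd)  */
	data_a = a;
	data_b = b;
	atomic_fetch_add(&seq, 1);		/* leave write side (even) */
}

int main(void)
{
	unsigned start;
	int a, b;

	write_update(1, 2);
	do {
		start = read_begin();		/* like zonelist_iter_begin()  */
		a = data_a;
		b = data_b;
	} while (read_retry(start));		/* like check_retry_zonelist() */

	printf("consistent snapshot: %d %d\n", a, b);
	return 0;
}
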
@@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 * @isolate_before:	isolate the pageblock before the boundary_pfn
 * @skip_isolation:	the flag to skip the pageblock isolation in second
 *			isolate_single_pageblock()
+ * @migratetype:	migrate type to set in error recovery.
 *
 * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
 * pageblock. When not all pageblocks within a page are isolated at the same
@@ -302,9 +303,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
 * the in-use page then splitting the free page.
 */
 static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
-			gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
+			gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+			int migratetype)
 {
-	unsigned char saved_mt;
 	unsigned long start_pfn;
 	unsigned long isolate_pageblock;
 	unsigned long pfn;
@@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 	start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
 				      zone->zone_start_pfn);

-	saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+	if (skip_isolation) {
+		int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));

-	if (skip_isolation)
-		VM_BUG_ON(!is_migrate_isolate(saved_mt));
-	else {
-		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
-				isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+		VM_BUG_ON(!is_migrate_isolate(mt));
+	} else {
+		ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+				flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);

 		if (ret)
 			return ret;
@@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 failed:
 	/* restore the original migratetype */
 	if (!skip_isolation)
-		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+		unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
 	return -EBUSY;
 }

@@ -537,7 +538,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	bool skip_isolation = false;

 	/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
-	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
+	ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+			skip_isolation, migratetype);
 	if (ret)
 		return ret;
@@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 	skip_isolation = true;

 	/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
-	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
+	ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+			skip_isolation, migratetype);
 	if (ret) {
 		unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
 		return ret;

@@ -285,7 +285,7 @@ static int secretmem_init(void)

 	secretmem_mnt = kern_mount(&secretmem_fs);
 	if (IS_ERR(secretmem_mnt))
-		ret = PTR_ERR(secretmem_mnt);
+		return PTR_ERR(secretmem_mnt);

 	/* prevent secretmem mappings from ever getting PROT_EXEC */
 	secretmem_mnt->mnt_flags |= MNT_NOEXEC;

@@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio,
 	for (i = 0; i < nr; i++) {
 		void *entry = xas_store(&xas, shadow);
-		VM_BUG_ON_FOLIO(entry != folio, folio);
+		VM_BUG_ON_PAGE(entry != folio, entry);
 		set_page_private(folio_page(folio, i), 0);
 		xas_next(&xas);
 	}

@@ -2550,8 +2550,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		}

 		if (unlikely(buffer_heads_over_limit)) {
-			if (folio_get_private(folio) && folio_trylock(folio)) {
-				if (folio_get_private(folio))
+			if (folio_test_private(folio) && folio_trylock(folio)) {
+				if (folio_test_private(folio))
 					filemap_release_folio(folio, 0);
 				folio_unlock(folio);
 			}

@@ -3,26 +3,7 @@
 #define _TOOLS_INCLUDE_LINUX_GFP_H

 #include <linux/types.h>
-
-#define __GFP_BITS_SHIFT 26
-#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-
-#define __GFP_HIGH 0x20u
-#define __GFP_IO 0x40u
-#define __GFP_FS 0x80u
-#define __GFP_NOWARN 0x200u
-#define __GFP_ZERO 0x8000u
-#define __GFP_ATOMIC 0x80000u
-#define __GFP_ACCOUNT 0x100000u
-#define __GFP_DIRECT_RECLAIM 0x400000u
-#define __GFP_KSWAPD_RECLAIM 0x2000000u
-
-#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
-
-#define GFP_ZONEMASK 0x0fu
-#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
-#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
+#include <linux/gfp_types.h>

 static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
 {

+#include "../../../include/linux/gfp_types.h"