Commit 54c23548 authored by Linus Torvalds

Merge tag 'mm-hotfixes-stable-2024-04-18-14-41' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull misc fixes from Andrew Morton:
 "15 hotfixes. 9 are cc:stable and the remainder address post-6.8 issues
  or aren't considered suitable for backporting.

  There are a significant number of fixups for this cycle's page_owner
  changes (series "page_owner: print stacks and their outstanding
  allocations"). Apart from that, singleton changes all over, mainly in
  MM"

* tag 'mm-hotfixes-stable-2024-04-18-14-41' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm:
  nilfs2: fix OOB in nilfs_set_de_type
  MAINTAINERS: update Naoya Horiguchi's email address
  fork: defer linking file vma until vma is fully initialized
  mm/shmem: inline shmem_is_huge() for disabled transparent hugepages
  mm,page_owner: defer enablement of static branch
  Squashfs: check the inode number is not the invalid value of zero
  mm,swapops: update check in is_pfn_swap_entry for hwpoison entries
  mm/memory-failure: fix deadlock when hugetlb_optimize_vmemmap is enabled
  mm/userfaultfd: allow hugetlb change protection upon poison entry
  mm,page_owner: fix printing of stack records
  mm,page_owner: fix accounting of pages when migrating
  mm,page_owner: fix refcount imbalance
  mm,page_owner: update metadata for tail pages
  userfaultfd: change src_folio after ensuring it's unpinned in UFFDIO_MOVE
  mm/madvise: make MADV_POPULATE_(READ|WRITE) handle VM_FAULT_RETRY properly
parents 2668e3ae c4a7dc95
@@ -446,7 +446,8 @@ Mythri P K <mythripk@ti.com>
 Nadav Amit <nadav.amit@gmail.com> <namit@vmware.com>
 Nadav Amit <nadav.amit@gmail.com> <namit@cs.technion.ac.il>
 Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
-Naoya Horiguchi <naoya.horiguchi@nec.com> <n-horiguchi@ah.jp.nec.com>
+Naoya Horiguchi <nao.horiguchi@gmail.com> <n-horiguchi@ah.jp.nec.com>
+Naoya Horiguchi <nao.horiguchi@gmail.com> <naoya.horiguchi@nec.com>
 Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
 Neeraj Upadhyay <quic_neeraju@quicinc.com> <neeraju@codeaurora.org>
 Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
......
@@ -24,10 +24,10 @@ fragmentation statistics can be obtained through gfp flag information of
 each page. It is already implemented and activated if page owner is
 enabled. Other usages are more than welcome.
 
-It can also be used to show all the stacks and their outstanding
-allocations, which gives us a quick overview of where the memory is going
-without the need to screen through all the pages and match the allocation
-and free operation.
+It can also be used to show all the stacks and their current number of
+allocated base pages, which gives us a quick overview of where the memory
+is going without the need to screen through all the pages and match the
+allocation and free operation.
 
 page owner is disabled by default. So, if you'd like to use it, you need
 to add "page_owner=on" to your boot cmdline. If the kernel is built
@@ -75,42 +75,45 @@ Usage
 	cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt
 	cat stacks.txt
-	 prep_new_page+0xa9/0x120
-	 get_page_from_freelist+0x7e6/0x2140
-	 __alloc_pages+0x18a/0x370
-	 new_slab+0xc8/0x580
-	 ___slab_alloc+0x1f2/0xaf0
-	 __slab_alloc.isra.86+0x22/0x40
-	 kmem_cache_alloc+0x31b/0x350
-	 __khugepaged_enter+0x39/0x100
-	 dup_mmap+0x1c7/0x5ce
-	 copy_process+0x1afe/0x1c90
-	 kernel_clone+0x9a/0x3c0
-	 __do_sys_clone+0x66/0x90
-	 do_syscall_64+0x7f/0x160
-	 entry_SYSCALL_64_after_hwframe+0x6c/0x74
-	stack_count: 234
+	 post_alloc_hook+0x177/0x1a0
+	 get_page_from_freelist+0xd01/0xd80
+	 __alloc_pages+0x39e/0x7e0
+	 allocate_slab+0xbc/0x3f0
+	 ___slab_alloc+0x528/0x8a0
+	 kmem_cache_alloc+0x224/0x3b0
+	 sk_prot_alloc+0x58/0x1a0
+	 sk_alloc+0x32/0x4f0
+	 inet_create+0x427/0xb50
+	 __sock_create+0x2e4/0x650
+	 inet_ctl_sock_create+0x30/0x180
+	 igmp_net_init+0xc1/0x130
+	 ops_init+0x167/0x410
+	 setup_net+0x304/0xa60
+	 copy_net_ns+0x29b/0x4a0
+	 create_new_namespaces+0x4a1/0x820
+	nr_base_pages: 16
 	...
 	...
 	echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold
 	cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt
 	cat stacks_7000.txt
-	 prep_new_page+0xa9/0x120
-	 get_page_from_freelist+0x7e6/0x2140
-	 __alloc_pages+0x18a/0x370
-	 alloc_pages_mpol+0xdf/0x1e0
-	 folio_alloc+0x14/0x50
-	 filemap_alloc_folio+0xb0/0x100
-	 page_cache_ra_unbounded+0x97/0x180
-	 filemap_fault+0x4b4/0x1200
-	 __do_fault+0x2d/0x110
-	 do_pte_missing+0x4b0/0xa30
-	 __handle_mm_fault+0x7fa/0xb70
-	 handle_mm_fault+0x125/0x300
-	 do_user_addr_fault+0x3c9/0x840
-	 exc_page_fault+0x68/0x150
-	 asm_exc_page_fault+0x22/0x30
-	stack_count: 8248
+	 post_alloc_hook+0x177/0x1a0
+	 get_page_from_freelist+0xd01/0xd80
+	 __alloc_pages+0x39e/0x7e0
+	 alloc_pages_mpol+0x22e/0x490
+	 folio_alloc+0xd5/0x110
+	 filemap_alloc_folio+0x78/0x230
+	 page_cache_ra_order+0x287/0x6f0
+	 filemap_get_pages+0x517/0x1160
+	 filemap_read+0x304/0x9f0
+	 xfs_file_buffered_read+0xe6/0x1d0 [xfs]
+	 xfs_file_read_iter+0x1f0/0x380 [xfs]
+	 __kernel_read+0x3b9/0x730
+	 kernel_read_file+0x309/0x4d0
+	 __do_sys_finit_module+0x381/0x730
+	 do_syscall_64+0x8d/0x150
+	 entry_SYSCALL_64_after_hwframe+0x62/0x6a
+	nr_base_pages: 20824
 	...
 
 	cat /sys/kernel/debug/page_owner > page_owner_full.txt
......
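Note on the documentation change above: the per-stack figure and the count_threshold are now expressed in base (order-0) pages rather than in allocation calls, so the output converts directly into a memory footprint. A minimal userspace sketch of that arithmetic, not part of the patch; the 20824 figure is taken from the example output above, and the page size is simply whatever the running system reports:

/* Convert an nr_base_pages value from page_owner_stacks into KiB. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        long page_size = sysconf(_SC_PAGESIZE);         /* typically 4096 */
        unsigned long nr_base_pages = 20824;            /* from the example above */

        printf("%lu base pages ~= %lu KiB\n",
               nr_base_pages, nr_base_pages * (unsigned long)page_size / 1024);
        return 0;
}

On a 4 KiB page system the stack shown above therefore accounts for roughly 81 MiB of outstanding allocations.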
@@ -10024,7 +10024,7 @@ F:	drivers/media/platform/st/sti/hva
 HWPOISON MEMORY FAILURE HANDLING
 M:	Miaohe Lin <linmiaohe@huawei.com>
-R:	Naoya Horiguchi <naoya.horiguchi@nec.com>
+R:	Naoya Horiguchi <nao.horiguchi@gmail.com>
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	mm/hwpoison-inject.c
......
@@ -240,7 +240,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = {
 #define S_SHIFT 12
 static unsigned char
-nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]    = NILFS_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]    = NILFS_FT_DIR,
        [S_IFCHR >> S_SHIFT]    = NILFS_FT_CHRDEV,
......
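For context on the nilfs2 fix above: S_IFMT is 0170000, so a mode value shifted right by S_SHIFT (12) can produce an index as large as 15, while the table previously held only S_IFMT >> S_SHIFT = 15 entries (indices 0..14). A standalone sketch of the arithmetic, for illustration only and not nilfs2 code:

#include <stdio.h>
#include <sys/stat.h>   /* S_IFMT = 0170000 */

#define S_SHIFT 12

int main(void)
{
        unsigned int max_index = S_IFMT >> S_SHIFT;     /* 017 octal = 15 */

        printf("old size: %u entries (valid indices 0..%u)\n",
               max_index, max_index - 1);
        printf("new size: %u entries (valid indices 0..%u)\n",
               max_index + 1, max_index);
        return 0;
}

Sizing the array as (S_IFMT >> S_SHIFT) + 1 gives it 16 slots, so even an unexpected or corrupted mode value cannot index past the end.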
@@ -48,6 +48,10 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
        gid_t i_gid;
        int err;
 
+       inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
+       if (inode->i_ino == 0)
+               return -EINVAL;
+
        err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid);
        if (err)
                return err;
@@ -58,7 +62,6 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
-       inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
 
        inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0);
        inode_set_atime(inode, inode_get_mtime_sec(inode), 0);
        inode_set_ctime(inode, inode_get_mtime_sec(inode), 0);
......
@@ -110,8 +110,17 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
 int shmem_unuse(unsigned int type);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
                          struct mm_struct *mm, unsigned long vm_flags);
+#else
+static __always_inline bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
+                                         struct mm_struct *mm, unsigned long vm_flags)
+{
+       return false;
+}
+#endif
+
 #ifdef CONFIG_SHMEM
 extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
 #else
......
@@ -390,6 +390,35 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry)
 }
 #endif	/* CONFIG_MIGRATION */
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/*
+ * Support for hardware poisoned pages
+ */
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+       BUG_ON(!PageLocked(page));
+       return swp_entry(SWP_HWPOISON, page_to_pfn(page));
+}
+
+static inline int is_hwpoison_entry(swp_entry_t entry)
+{
+       return swp_type(entry) == SWP_HWPOISON;
+}
+
+#else
+
+static inline swp_entry_t make_hwpoison_entry(struct page *page)
+{
+       return swp_entry(0, 0);
+}
+
+static inline int is_hwpoison_entry(swp_entry_t swp)
+{
+       return 0;
+}
+#endif
+
 typedef unsigned long pte_marker;
 
 #define PTE_MARKER_UFFD_WP		BIT(0)
@@ -483,8 +512,9 @@ static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
 
 /*
  * A pfn swap entry is a special type of swap entry that always has a pfn stored
- * in the swap offset. They are used to represent unaddressable device memory
- * and to restrict access to a page undergoing migration.
+ * in the swap offset. They can either be used to represent unaddressable device
+ * memory, to restrict access to a page undergoing migration or to represent a
+ * pfn which has been hwpoisoned and unmapped.
 */
 static inline bool is_pfn_swap_entry(swp_entry_t entry)
 {
@@ -492,7 +522,7 @@ static inline bool is_pfn_swap_entry(swp_entry_t entry)
        BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);
 
        return is_migration_entry(entry) || is_device_private_entry(entry) ||
-              is_device_exclusive_entry(entry);
+              is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
 }
 
 struct page_vma_mapped_walk;
@@ -561,35 +591,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd)
 }
 #endif	/* CONFIG_ARCH_ENABLE_THP_MIGRATION */
 
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * Support for hardware poisoned pages
- */
-static inline swp_entry_t make_hwpoison_entry(struct page *page)
-{
-       BUG_ON(!PageLocked(page));
-       return swp_entry(SWP_HWPOISON, page_to_pfn(page));
-}
-
-static inline int is_hwpoison_entry(swp_entry_t entry)
-{
-       return swp_type(entry) == SWP_HWPOISON;
-}
-
-#else
-
-static inline swp_entry_t make_hwpoison_entry(struct page *page)
-{
-       return swp_entry(0, 0);
-}
-
-static inline int is_hwpoison_entry(swp_entry_t swp)
-{
-       return 0;
-}
-#endif
-
 static inline int non_swap_entry(swp_entry_t entry)
 {
        return swp_type(entry) >= MAX_SWAPFILES;
......
@@ -714,6 +714,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                vm_flags_clear(tmp, VM_LOCKED_MASK);
+               /*
+                * Copy/update hugetlb private vma information.
+                */
+               if (is_vm_hugetlb_page(tmp))
+                       hugetlb_dup_vma_private(tmp);
+
+               /*
+                * Link the vma into the MT. After using __mt_dup(), memory
+                * allocation is not necessary here, so it cannot fail.
+                */
+               vma_iter_bulk_store(&vmi, tmp);
+
+               mm->map_count++;
+
+               if (tmp->vm_ops && tmp->vm_ops->open)
+                       tmp->vm_ops->open(tmp);
+
                file = tmp->vm_file;
                if (file) {
                        struct address_space *mapping = file->f_mapping;
@@ -730,25 +747,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                        i_mmap_unlock_write(mapping);
                }
 
-               /*
-                * Copy/update hugetlb private vma information.
-                */
-               if (is_vm_hugetlb_page(tmp))
-                       hugetlb_dup_vma_private(tmp);
-
-               /*
-                * Link the vma into the MT. After using __mt_dup(), memory
-                * allocation is not necessary here, so it cannot fail.
-                */
-               vma_iter_bulk_store(&vmi, tmp);
-
-               mm->map_count++;
                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);
 
-               if (tmp->vm_ops && tmp->vm_ops->open)
-                       tmp->vm_ops->open(tmp);
-
                if (retval) {
                        mpnt = vma_next(&vmi);
                        goto loop_out;
......
@@ -1206,6 +1206,22 @@ static long __get_user_pages(struct mm_struct *mm,
 
                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
+                       /*
+                        * MADV_POPULATE_(READ|WRITE) wants to handle VMA
+                        * lookups+error reporting differently.
+                        */
+                       if (gup_flags & FOLL_MADV_POPULATE) {
+                               vma = vma_lookup(mm, start);
+                               if (!vma) {
+                                       ret = -ENOMEM;
+                                       goto out;
+                               }
+                               if (check_vma_flags(vma, gup_flags)) {
+                                       ret = -EINVAL;
+                                       goto out;
+                               }
+                               goto retry;
+                       }
                        vma = gup_vma_lookup(mm, start);
                        if (!vma && in_gate_area(mm, start)) {
                                ret = get_gate_page(mm, start & PAGE_MASK,
@@ -1685,35 +1701,35 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 }
 
 /*
- * faultin_vma_page_range() - populate (prefault) page tables inside the
- *                            given VMA range readable/writable
+ * faultin_page_range() - populate (prefault) page tables inside the
+ *                        given range readable/writable
  *
  * This takes care of mlocking the pages, too, if VM_LOCKED is set.
  *
- * @vma: target vma
+ * @mm: the mm to populate page tables in
  * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
- * Returns either number of processed pages in the vma, or a negative error
- * code on error (see __get_user_pages()).
+ * Returns either number of processed pages in the MM, or a negative error
+ * code on error (see __get_user_pages()). Note that this function reports
+ * errors related to VMAs, such as incompatible mappings, as expected by
+ * MADV_POPULATE_(READ|WRITE).
 *
- * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
- * covered by the VMA. If it's released, *@locked will be set to 0.
+ * The range must be page-aligned.
+ *
+ * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
 */
-long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
-                           unsigned long end, bool write, int *locked)
+long faultin_page_range(struct mm_struct *mm, unsigned long start,
+                       unsigned long end, bool write, int *locked)
 {
-       struct mm_struct *mm = vma->vm_mm;
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        int gup_flags;
        long ret;
 
        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));
-       VM_BUG_ON_VMA(start < vma->vm_start, vma);
-       VM_BUG_ON_VMA(end > vma->vm_end, vma);
        mmap_assert_locked(mm);
 
        /*
@@ -1725,19 +1741,13 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
         * a poisoned page.
         * !FOLL_FORCE: Require proper access permissions.
         */
-       gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE;
+       gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
+                   FOLL_MADV_POPULATE;
        if (write)
                gup_flags |= FOLL_WRITE;
 
-       /*
-        * We want to report -EINVAL instead of -EFAULT for any permission
-        * problems or incompatible mappings.
-        */
-       if (check_vma_flags(vma, gup_flags))
-               return -EINVAL;
-
-       ret = __get_user_pages(mm, start, nr_pages, gup_flags,
-                              NULL, locked);
+       ret = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
+                                     gup_flags);
        lru_add_drain();
        return ret;
 }
......
@@ -2259,9 +2259,6 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
                        goto unlock_ptls;
                }
 
-               folio_move_anon_rmap(src_folio, dst_vma);
-               WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
-
                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                /* Folio got pinned from under us. Put it back and fail the move. */
                if (folio_maybe_dma_pinned(src_folio)) {
@@ -2270,6 +2267,9 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
                        goto unlock_ptls;
                }
 
+               folio_move_anon_rmap(src_folio, dst_vma);
+               WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+
                _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
                /* Follow mremap() behavior and treat the entry dirty after the move */
                _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
......
@@ -7044,9 +7044,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                        if (!pte_same(pte, newpte))
                                set_huge_pte_at(mm, address, ptep, newpte, psize);
                } else if (unlikely(is_pte_marker(pte))) {
-                       /* No other markers apply for now. */
-                       WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
-                       if (uffd_wp_resolve)
+                       /*
+                        * Do nothing on a poison marker; page is
+                        * corrupted, permissons do not apply. Here
+                        * pte_marker_uffd_wp()==true implies !poison
+                        * because they're mutual exclusive.
+                        */
+                       if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
                                /* Safe to modify directly (non-present->none). */
                                huge_pte_clear(mm, address, ptep, psize);
                } else if (!huge_pte_none(pte)) {
......
@@ -686,9 +686,8 @@ struct anon_vma *folio_anon_vma(struct folio *folio);
 void unmap_mapping_folio(struct folio *folio);
 extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked);
-extern long faultin_vma_page_range(struct vm_area_struct *vma,
-                                  unsigned long start, unsigned long end,
-                                  bool write, int *locked);
+extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
+                              unsigned long end, bool write, int *locked);
 extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                unsigned long bytes);
@@ -1127,10 +1126,13 @@ enum {
        FOLL_FAST_ONLY = 1 << 20,
        /* allow unlocking the mmap lock */
        FOLL_UNLOCKABLE = 1 << 21,
+       /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
+       FOLL_MADV_POPULATE = 1 << 22,
 };
 
 #define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
-                           FOLL_FAST_ONLY | FOLL_UNLOCKABLE)
+                           FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
+                           FOLL_MADV_POPULATE)
 
 /*
  * Indicates for which pages that are write-protected in the page table,
......
@@ -908,27 +908,14 @@ static long madvise_populate(struct vm_area_struct *vma,
 {
        const bool write = behavior == MADV_POPULATE_WRITE;
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long tmp_end;
        int locked = 1;
        long pages;
 
        *prev = vma;
 
        while (start < end) {
-               /*
-                * We might have temporarily dropped the lock. For example,
-                * our VMA might have been split.
-                */
-               if (!vma || start >= vma->vm_end) {
-                       vma = vma_lookup(mm, start);
-                       if (!vma)
-                               return -ENOMEM;
-               }
-
-               tmp_end = min_t(unsigned long, end, vma->vm_end);
                /* Populate (prefault) page tables readable/writable. */
-               pages = faultin_vma_page_range(vma, start, tmp_end, write,
-                                              &locked);
+               pages = faultin_page_range(mm, start, end, write, &locked);
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
@@ -949,7 +936,7 @@ static long madvise_populate(struct vm_area_struct *vma,
                        pr_warn_once("%s: unhandled return value: %ld\n",
                                     __func__, pages);
                        fallthrough;
-               case -ENOMEM:
+               case -ENOMEM: /* No VMA or out of memory. */
                        return -ENOMEM;
                }
        }
......
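The user-facing behaviour of MADV_POPULATE_(READ|WRITE) is unchanged by the refactoring above; only the VMA lookup and error reporting move into the GUP path. A minimal, hedged usage sketch follows; it assumes Linux 5.14+ and headers that define MADV_POPULATE_READ (the fallback define below is the uapi value), and error handling is reduced to the essentials:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ 22   /* from <linux/mman.h> if libc headers lack it */
#endif

int main(void)
{
        size_t len = 16UL << 20;        /* 16 MiB */
        void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (addr == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }
        /* Prefault the page tables for reading without touching the memory
         * from userspace; incompatible mappings are reported as EINVAL. */
        if (madvise(addr, len, MADV_POPULATE_READ))
                perror("madvise(MADV_POPULATE_READ)");
        munmap(addr, len);
        return 0;
}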
@@ -154,11 +154,23 @@ static int __page_handle_poison(struct page *page)
 {
        int ret;
 
-       zone_pcp_disable(page_zone(page));
+       /*
+        * zone_pcp_disable() can't be used here. It will
+        * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold
+        * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
+        * optimization is enabled. This will break current lock dependency
+        * chain and leads to deadlock.
+        * Disabling pcp before dissolving the page was a deterministic
+        * approach because we made sure that those pages cannot end up in any
+        * PCP list. Draining PCP lists expels those pages to the buddy system,
+        * but nothing guarantees that those pages do not get back to a PCP
+        * queue if we need to refill those.
+        */
        ret = dissolve_free_huge_page(page);
-       if (!ret)
+       if (!ret) {
+               drain_all_pages(page_zone(page));
                ret = take_page_off_buddy(page);
-       zone_pcp_enable(page_zone(page));
+       }
 
        return ret;
 }
......
@@ -118,7 +118,6 @@ static __init void init_page_owner(void)
        register_dummy_stack();
        register_failure_stack();
        register_early_stack();
-       static_branch_enable(&page_owner_inited);
        init_early_allocated_pages();
        /* Initialize dummy and failure stacks and link them to stack_list */
        dummy_stack.stack_record = __stack_depot_get_stack_record(dummy_handle);
@@ -129,6 +128,7 @@ static __init void init_page_owner(void)
        refcount_set(&failure_stack.stack_record->count, 1);
        dummy_stack.next = &failure_stack;
        stack_list = &dummy_stack;
+       static_branch_enable(&page_owner_inited);
 }
 
 struct page_ext_operations page_owner_ops = {
@@ -196,7 +196,8 @@ static void add_stack_record_to_list(struct stack_record *stack_record,
        spin_unlock_irqrestore(&stack_list_lock, flags);
 }
 
-static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
+static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask,
+                                  int nr_base_pages)
 {
        struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
 
@@ -217,20 +218,74 @@ static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask)
                        /* Add the new stack_record to our list */
                        add_stack_record_to_list(stack_record, gfp_mask);
        }
-       refcount_inc(&stack_record->count);
+       refcount_add(nr_base_pages, &stack_record->count);
 }
 
-static void dec_stack_record_count(depot_stack_handle_t handle)
+static void dec_stack_record_count(depot_stack_handle_t handle,
+                                  int nr_base_pages)
 {
        struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
 
-       if (stack_record)
-               refcount_dec(&stack_record->count);
+       if (!stack_record)
+               return;
+
+       if (refcount_sub_and_test(nr_base_pages, &stack_record->count))
+               pr_warn("%s: refcount went to 0 for %u handle\n", __func__,
+                       handle);
 }
 
-void __reset_page_owner(struct page *page, unsigned short order)
+static inline void __update_page_owner_handle(struct page_ext *page_ext,
+                                             depot_stack_handle_t handle,
+                                             unsigned short order,
+                                             gfp_t gfp_mask,
+                                             short last_migrate_reason, u64 ts_nsec,
+                                             pid_t pid, pid_t tgid, char *comm)
 {
        int i;
+       struct page_owner *page_owner;
+
+       for (i = 0; i < (1 << order); i++) {
+               page_owner = get_page_owner(page_ext);
+               page_owner->handle = handle;
+               page_owner->order = order;
+               page_owner->gfp_mask = gfp_mask;
+               page_owner->last_migrate_reason = last_migrate_reason;
+               page_owner->pid = pid;
+               page_owner->tgid = tgid;
+               page_owner->ts_nsec = ts_nsec;
+               strscpy(page_owner->comm, comm,
+                       sizeof(page_owner->comm));
+               __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+               __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+               page_ext = page_ext_next(page_ext);
+       }
+}
+
+static inline void __update_page_owner_free_handle(struct page_ext *page_ext,
+                                                  depot_stack_handle_t handle,
+                                                  unsigned short order,
+                                                  pid_t pid, pid_t tgid,
+                                                  u64 free_ts_nsec)
+{
+       int i;
+       struct page_owner *page_owner;
+
+       for (i = 0; i < (1 << order); i++) {
+               page_owner = get_page_owner(page_ext);
+               /* Only __reset_page_owner() wants to clear the bit */
+               if (handle) {
+                       __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+                       page_owner->free_handle = handle;
+               }
+               page_owner->free_ts_nsec = free_ts_nsec;
+               page_owner->free_pid = current->pid;
+               page_owner->free_tgid = current->tgid;
+               page_ext = page_ext_next(page_ext);
+       }
+}
+
+void __reset_page_owner(struct page *page, unsigned short order)
+{
        struct page_ext *page_ext;
        depot_stack_handle_t handle;
        depot_stack_handle_t alloc_handle;
@@ -245,16 +300,10 @@ void __reset_page_owner(struct page *page, unsigned short order)
        alloc_handle = page_owner->handle;
 
        handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
-       for (i = 0; i < (1 << order); i++) {
-               __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-               page_owner->free_handle = handle;
-               page_owner->free_ts_nsec = free_ts_nsec;
-               page_owner->free_pid = current->pid;
-               page_owner->free_tgid = current->tgid;
-               page_ext = page_ext_next(page_ext);
-               page_owner = get_page_owner(page_ext);
-       }
+       __update_page_owner_free_handle(page_ext, handle, order, current->pid,
+                                       current->tgid, free_ts_nsec);
        page_ext_put(page_ext);
+
        if (alloc_handle != early_handle)
                /*
                 * early_handle is being set as a handle for all those
@@ -263,39 +312,14 @@ void __reset_page_owner(struct page *page, unsigned short order)
                 * the machinery is not ready yet, we cannot decrement
                 * their refcount either.
                 */
-               dec_stack_record_count(alloc_handle);
-}
-
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
-                                       depot_stack_handle_t handle,
-                                       unsigned short order, gfp_t gfp_mask)
-{
-       struct page_owner *page_owner;
-       int i;
-       u64 ts_nsec = local_clock();
-
-       for (i = 0; i < (1 << order); i++) {
-               page_owner = get_page_owner(page_ext);
-               page_owner->handle = handle;
-               page_owner->order = order;
-               page_owner->gfp_mask = gfp_mask;
-               page_owner->last_migrate_reason = -1;
-               page_owner->pid = current->pid;
-               page_owner->tgid = current->tgid;
-               page_owner->ts_nsec = ts_nsec;
-               strscpy(page_owner->comm, current->comm,
-                       sizeof(page_owner->comm));
-               __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
-               __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
-               page_ext = page_ext_next(page_ext);
-       }
+               dec_stack_record_count(alloc_handle, 1 << order);
 }
 
 noinline void __set_page_owner(struct page *page, unsigned short order,
                               gfp_t gfp_mask)
 {
        struct page_ext *page_ext;
+       u64 ts_nsec = local_clock();
        depot_stack_handle_t handle;
 
        handle = save_stack(gfp_mask);
@@ -303,9 +327,11 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
        page_ext = page_ext_get(page);
        if (unlikely(!page_ext))
                return;
-       __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+       __update_page_owner_handle(page_ext, handle, order, gfp_mask, -1,
+                                  current->pid, current->tgid, ts_nsec,
+                                  current->comm);
        page_ext_put(page_ext);
-       inc_stack_record_count(handle, gfp_mask);
+       inc_stack_record_count(handle, gfp_mask, 1 << order);
 }
 
 void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -340,9 +366,12 @@ void __split_page_owner(struct page *page, int old_order, int new_order)
 
 void __folio_copy_owner(struct folio *newfolio, struct folio *old)
 {
+       int i;
        struct page_ext *old_ext;
        struct page_ext *new_ext;
-       struct page_owner *old_page_owner, *new_page_owner;
+       struct page_owner *old_page_owner;
+       struct page_owner *new_page_owner;
+       depot_stack_handle_t migrate_handle;
 
        old_ext = page_ext_get(&old->page);
        if (unlikely(!old_ext))
@@ -356,30 +385,32 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
        old_page_owner = get_page_owner(old_ext);
        new_page_owner = get_page_owner(new_ext);
-       new_page_owner->order = old_page_owner->order;
-       new_page_owner->gfp_mask = old_page_owner->gfp_mask;
-       new_page_owner->last_migrate_reason =
-               old_page_owner->last_migrate_reason;
-       new_page_owner->handle = old_page_owner->handle;
-       new_page_owner->pid = old_page_owner->pid;
-       new_page_owner->tgid = old_page_owner->tgid;
-       new_page_owner->free_pid = old_page_owner->free_pid;
-       new_page_owner->free_tgid = old_page_owner->free_tgid;
-       new_page_owner->ts_nsec = old_page_owner->ts_nsec;
-       new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
-       strcpy(new_page_owner->comm, old_page_owner->comm);
-
+       migrate_handle = new_page_owner->handle;
+       __update_page_owner_handle(new_ext, old_page_owner->handle,
+                                  old_page_owner->order, old_page_owner->gfp_mask,
+                                  old_page_owner->last_migrate_reason,
+                                  old_page_owner->ts_nsec, old_page_owner->pid,
+                                  old_page_owner->tgid, old_page_owner->comm);
+       /*
+        * Do not proactively clear PAGE_EXT_OWNER{_ALLOCATED} bits as the folio
+        * will be freed after migration. Keep them until then as they may be
+        * useful.
+        */
+       __update_page_owner_free_handle(new_ext, 0, old_page_owner->order,
+                                       old_page_owner->free_pid,
+                                       old_page_owner->free_tgid,
+                                       old_page_owner->free_ts_nsec);
        /*
-        * We don't clear the bit on the old folio as it's going to be freed
-        * after migration. Until then, the info can be useful in case of
-        * a bug, and the overall stats will be off a bit only temporarily.
-        * Also, migrate_misplaced_transhuge_page() can still fail the
-        * migration and then we want the old folio to retain the info. But
-        * in that case we also don't need to explicitly clear the info from
-        * the new page, which will be freed.
+        * We linked the original stack to the new folio, we need to do the same
+        * for the new one and the old folio otherwise there will be an imbalance
+        * when subtracting those pages from the stack.
         */
-       __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
-       __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+       for (i = 0; i < (1 << new_page_owner->order); i++) {
+               old_page_owner->handle = migrate_handle;
+               old_ext = page_ext_next(old_ext);
+               old_page_owner = get_page_owner(old_ext);
+       }
+
        page_ext_put(new_ext);
        page_ext_put(old_ext);
 }
@@ -787,8 +818,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
                                goto ext_put_continue;
 
                        /* Found early allocated page */
-                       __set_page_owner_handle(page_ext, early_handle,
-                                               0, 0);
+                       __update_page_owner_handle(page_ext, early_handle, 0, 0,
+                                                  -1, local_clock(), current->pid,
+                                                  current->tgid, current->comm);
                        count++;
 ext_put_continue:
                        page_ext_put(page_ext);
@@ -840,13 +872,11 @@ static void *stack_start(struct seq_file *m, loff_t *ppos)
                 * value of stack_list.
                 */
                stack = smp_load_acquire(&stack_list);
-               m->private = stack;
        } else {
                stack = m->private;
-               stack = stack->next;
        }
 
+       m->private = stack;
+
        return stack;
 }
@@ -861,11 +891,11 @@ static void *stack_next(struct seq_file *m, void *v, loff_t *ppos)
        return stack;
 }
 
-static unsigned long page_owner_stack_threshold;
+static unsigned long page_owner_pages_threshold;
 
 static int stack_print(struct seq_file *m, void *v)
 {
-       int i, stack_count;
+       int i, nr_base_pages;
        struct stack *stack = v;
        unsigned long *entries;
        unsigned long nr_entries;
@@ -876,14 +906,14 @@ static int stack_print(struct seq_file *m, void *v)
        nr_entries = stack_record->size;
        entries = stack_record->entries;
-       stack_count = refcount_read(&stack_record->count) - 1;
+       nr_base_pages = refcount_read(&stack_record->count) - 1;
 
-       if (stack_count < 1 || stack_count < page_owner_stack_threshold)
+       if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold)
                return 0;
 
        for (i = 0; i < nr_entries; i++)
                seq_printf(m, " %pS\n", (void *)entries[i]);
-       seq_printf(m, "stack_count: %d\n\n", stack_count);
+       seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages);
 
        return 0;
 }
@@ -913,13 +943,13 @@ static const struct file_operations page_owner_stack_operations = {
 static int page_owner_threshold_get(void *data, u64 *val)
 {
-       *val = READ_ONCE(page_owner_stack_threshold);
+       *val = READ_ONCE(page_owner_pages_threshold);
        return 0;
 }
 
 static int page_owner_threshold_set(void *data, u64 val)
 {
-       WRITE_ONCE(page_owner_stack_threshold, val);
+       WRITE_ONCE(page_owner_pages_threshold, val);
        return 0;
 }
......
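The page_owner changes above switch the per-stack accounting from counting allocation events to counting outstanding base pages: every allocation adds 1 << order to the owning stack record and the matching free subtracts the same amount. A simplified userspace model of that invariant, for illustration only and not kernel code:

#include <stdio.h>

static unsigned long nr_base_pages;     /* stands in for stack_record->count */

/* Each allocation of 2^order contiguous pages adds that many base pages. */
static void track_alloc(unsigned short order) { nr_base_pages += 1UL << order; }
/* The matching free subtracts the same amount, keeping the counter balanced. */
static void track_free(unsigned short order)  { nr_base_pages -= 1UL << order; }

int main(void)
{
        track_alloc(0); /* one order-0 page     -> +1 */
        track_alloc(3); /* one order-3 block    -> +8 */
        track_free(0);  /* free the single page -> -1 */
        printf("outstanding base pages: %lu\n", nr_base_pages); /* prints 8 */
        return 0;
}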
@@ -748,12 +748,6 @@ static long shmem_unused_huge_count(struct super_block *sb,
 
 #define shmem_huge SHMEM_HUGE_DENY
 
-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-                  struct mm_struct *mm, unsigned long vm_flags)
-{
-       return false;
-}
-
 static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_split)
 {
......