Commit 8291eaaf authored by Linus Torvalds

Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull more MM updates from Andrew Morton:

 - Two follow-on fixes for the post-5.19 series "Use pageblock_order for
   cma and alloc_contig_range alignment", from Zi Yan.

 - A series of z3fold cleanups and fixes from Miaohe Lin.

 - Some memcg selftests work from Michal Koutný <mkoutny@suse.com>

 - Some swap fixes and cleanups from Miaohe Lin

 - Several individual minor fixups

* tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits)
  mm/shmem.c: suppress shift warning
  mm: Kconfig: reorganize misplaced mm options
  mm: kasan: fix input of vmalloc_to_page()
  mm: fix is_pinnable_page against a cma page
  mm: filter out swapin error entry in shmem mapping
  mm/shmem: fix infinite loop when swap in shmem error at swapoff time
  mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range
  mm/swapfile: fix lost swap bits in unuse_pte()
  mm/swapfile: unuse_pte can map random data if swap read fails
  selftests: memcg: factor out common parts of memory.{low,min} tests
  selftests: memcg: remove protection from top level memcg
  selftests: memcg: adjust expected reclaim values of protected cgroups
  selftests: memcg: expect no low events in unprotected sibling
  selftests: memcg: fix compilation
  mm/z3fold: fix z3fold_page_migrate races with z3fold_map
  mm/z3fold: fix z3fold_reclaim_page races with z3fold_free
  mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock
  mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails
  revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc"
  mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc
  ...
parents 77fb622d fa020a2b
......@@ -5062,6 +5062,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/memcontrol.c
F: mm/swap_cgroup.c
F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c
......
......@@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
{
return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
is_zero_pfn(page_to_pfn(page));
#ifdef CONFIG_CMA
int mt = get_pageblock_migratetype(page);
if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
return false;
#endif
return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)));
}
#else
static inline bool is_pinnable_page(struct page *page)
......
......@@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
* actions on faults.
*/
#define SWP_SWAPIN_ERROR_NUM 1
#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
SWP_PTE_MARKER_NUM)
/*
* PTE markers are used to persist information onto PTEs that are mapped with
* file-backed memories. As its name "PTE" hints, it should only be applied to
......@@ -120,7 +124,8 @@ static inline int current_is_kswapd(void)
#define MAX_SWAPFILES \
((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM)
/*
* Magic header for a swap area. The first part of the union is
......
......@@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
return xa_mk_value(entry.val);
}
static inline swp_entry_t make_swapin_error_entry(struct page *page)
{
return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page));
}
static inline int is_swapin_error_entry(swp_entry_t entry)
{
return swp_type(entry) == SWP_SWAPIN_ERROR;
}
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
......
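Taken together, the two helpers above let the swap layer poison a PTE when a swapin read fails and recognise that poison at the next fault. A minimal sketch of the intended pattern follows; it mirrors the mm/swapfile.c and mm/memory.c hunks later in this commit, and the helper name mark_swapin_error() is purely illustrative, not part of the commit:

/* Illustrative sketch only -- not code from this commit. */
static void mark_swapin_error(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, struct page *page)
{
        /* Replace the swap PTE with a swapin error entry ... */
        pte_t pteval = swp_entry_to_pte(make_swapin_error_entry(page));

        set_pte_at(vma->vm_mm, addr, pte, pteval);
}

/*
 * ... so that a later fault can test is_swapin_error_entry() on the entry
 * decoded from the PTE and fail with VM_FAULT_SIGBUS instead of mapping
 * stale or random data.
 */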
......@@ -1842,60 +1842,6 @@ config DEBUG_PERF_USE_VMALLOC
endmenu
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EXPERT
help
VM event counters are needed for event counts to be shown.
This option allows the disabling of the VM event counters
on EXPERT systems. /proc/vmstat will only show page counts
if VM event counters are disabled.
config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
depends on SLUB && SYSFS
select STACKDEPOT if STACKTRACE_SUPPORT
help
SLUB has extensive debug support features. Disabling these can
result in significant savings in code size. This also disables
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
config COMPAT_BRK
bool "Disable heap randomization"
default y
help
Randomizing heap placement makes heap exploits harder, but it
also breaks ancient binaries (including anything libc5 based).
This option changes the bootup default to heap randomization
disabled, and can be overridden at runtime by setting
/proc/sys/kernel/randomize_va_space to 2.
On non-ancient distros (post-2000 ones) N is usually a safe choice.
config MMAP_ALLOW_UNINITIALIZED
bool "Allow mmapped anonymous memory to be uninitialized"
depends on EXPERT && !MMU
default n
help
Normally, and according to the Linux spec, anonymous memory obtained
from mmap() has its contents cleared before it is passed to
userspace. Enabling this config option allows you to request that
mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
providing a huge performance boost. If this option is not enabled,
then the flag will be ignored.
This is taken advantage of by uClibc's malloc(), and also by
ELF-FDPIC binfmt's brk and stack allocator.
Because of the obvious security issues, this option should only be
enabled on embedded devices where you control what is run in
userspace. Since that isn't generally a problem on no-MMU systems,
it is normally safe to say Y here.
See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
config SYSTEM_DATA_VERIFICATION
def_bool n
select SYSTEM_TRUSTED_KEYRING
......
......@@ -699,41 +699,6 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
help
Debug objects boot parameter default value
config DEBUG_SLAB
bool "Debug slab memory allocations"
depends on DEBUG_KERNEL && SLAB
help
Say Y here to have the kernel do limited verification on memory
allocation as well as poisoning memory on free to catch use of freed
memory. This can make kmalloc/kfree-intensive workloads much slower.
config SLUB_DEBUG_ON
bool "SLUB debugging on by default"
depends on SLUB && SLUB_DEBUG
select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
default n
help
Boot with debugging on by default. SLUB boots by default with
the runtime debug capabilities switched off. Enabling this is
equivalent to specifying the "slub_debug" parameter on boot.
There is no support for more fine grained debug control like
possible with slub_debug=xxx. SLUB debugging may be switched
off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
"slub_debug=-".
config SLUB_STATS
default n
bool "Enable SLUB performance statistics"
depends on SLUB && SYSFS
help
SLUB statistics are useful to debug SLUBs allocation behavior in
order find ways to optimize the allocator. This should never be
enabled for production use since keeping statistics slows down
the allocator by a few percentage points. The slabinfo command
supports the determination of the most active slabs to figure
out which slabs are relevant to a particular load.
Try running: slabinfo -DA
config HAVE_DEBUG_KMEMLEAK
bool
......
......@@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED
sanity-checking than others. This option is most effective with
CONFIG_SLUB.
config SLUB_STATS
default n
bool "Enable SLUB performance statistics"
depends on SLUB && SYSFS
help
SLUB statistics are useful to debug SLUBs allocation behavior in
order find ways to optimize the allocator. This should never be
enabled for production use since keeping statistics slows down
the allocator by a few percentage points. The slabinfo command
supports the determination of the most active slabs to figure
out which slabs are relevant to a particular load.
Try running: slabinfo -DA
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
......@@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR
Say Y if unsure.
config COMPAT_BRK
bool "Disable heap randomization"
default y
help
Randomizing heap placement makes heap exploits harder, but it
also breaks ancient binaries (including anything libc5 based).
This option changes the bootup default to heap randomization
disabled, and can be overridden at runtime by setting
/proc/sys/kernel/randomize_va_space to 2.
On non-ancient distros (post-2000 ones) N is usually a safe choice.
config MMAP_ALLOW_UNINITIALIZED
bool "Allow mmapped anonymous memory to be uninitialized"
depends on EXPERT && !MMU
default n
help
Normally, and according to the Linux spec, anonymous memory obtained
from mmap() has its contents cleared before it is passed to
userspace. Enabling this config option allows you to request that
mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
providing a huge performance boost. If this option is not enabled,
then the flag will be ignored.
This is taken advantage of by uClibc's malloc(), and also by
ELF-FDPIC binfmt's brk and stack allocator.
Because of the obvious security issues, this option should only be
enabled on embedded devices where you control what is run in
userspace. Since that isn't generally a problem on no-MMU systems,
it is normally safe to say Y here.
See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
......@@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EXPERT
help
VM event counters are needed for event counts to be shown.
This option allows the disabling of the VM event counters
on EXPERT systems. /proc/vmstat will only show page counts
if VM event counters are disabled.
config PERCPU_STATS
bool "Collect percpu memory statistics"
help
......
......@@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
config DEBUG_SLAB
bool "Debug slab memory allocations"
depends on DEBUG_KERNEL && SLAB
help
Say Y here to have the kernel do limited verification on memory
allocation as well as poisoning memory on free to catch use of freed
memory. This can make kmalloc/kfree-intensive workloads much slower.
config SLUB_DEBUG
default y
bool "Enable SLUB debugging support" if EXPERT
depends on SLUB && SYSFS
select STACKDEPOT if STACKTRACE_SUPPORT
help
SLUB has extensive debug support features. Disabling these can
result in significant savings in code size. This also disables
SLUB sysfs support. /sys/slab will not exist and there will be
no support for cache validation etc.
config SLUB_DEBUG_ON
bool "SLUB debugging on by default"
depends on SLUB && SLUB_DEBUG
select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
default n
help
Boot with debugging on by default. SLUB boots by default with
the runtime debug capabilities switched off. Enabling this is
equivalent to specifying the "slub_debug" parameter on boot.
There is no support for more fine grained debug control like
possible with slub_debug=xxx. SLUB debugging may be switched
off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
"slub_debug=-".
config PAGE_OWNER
bool "Track page owner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
......
......@@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
void split_free_page(struct page *free_page,
int order, unsigned long split_pfn_offset);
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
......
......@@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag)
va->addr, va->addr + va->size, va->caller);
pr_err("\n");
page = vmalloc_to_page(page);
page = vmalloc_to_page(addr);
}
}
......
......@@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
if (!xa_is_value(page))
continue;
swap = radix_to_swp_entry(page);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swap))
continue;
xas_pause(&xas);
rcu_read_unlock();
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false, &splug);
if (page)
......@@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
swp_entry_t entry;
entry = pte_to_swp_entry(ptent);
if (non_swap_entry(entry))
continue;
nr_swap--;
free_swap_and_cache(entry);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
if (!non_swap_entry(entry)) {
nr_swap--;
free_swap_and_cache(entry);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_swapin_error_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
}
continue;
}
......
......@@ -1487,7 +1487,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
/* Only drop the uffd-wp marker if explicitly requested */
if (!zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry)) {
} else if (is_hwpoison_entry(entry) ||
is_swapin_error_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
......@@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_swapin_error_entry(entry)) {
ret = VM_FAULT_SIGBUS;
} else if (is_pte_marker_entry(entry)) {
ret = handle_pte_marker(vmf);
} else {
......
......@@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
/*
* This races, without locks, with set_pfnblock_flags_mask(). Ensure
* a consistent read of the memory array, so that results, even though
* racy, are not corrupted.
*/
word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
......@@ -1100,30 +1104,44 @@ static inline void __free_one_page(struct page *page,
* @order: the order of the page
* @split_pfn_offset: split offset within the page
*
* Return -ENOENT if the free page is changed, otherwise 0
*
* It is used when the free page crosses two pageblocks with different migratetypes
* at split_pfn_offset within the page. The split free page will be put into
* separate migratetype lists afterwards. Otherwise, the function achieves
* nothing.
*/
void split_free_page(struct page *free_page,
int order, unsigned long split_pfn_offset)
int split_free_page(struct page *free_page,
unsigned int order, unsigned long split_pfn_offset)
{
struct zone *zone = page_zone(free_page);
unsigned long free_page_pfn = page_to_pfn(free_page);
unsigned long pfn;
unsigned long flags;
int free_page_order;
int mt;
int ret = 0;
if (split_pfn_offset == 0)
return;
return ret;
spin_lock_irqsave(&zone->lock, flags);
if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
ret = -ENOENT;
goto out;
}
mt = get_pageblock_migratetype(free_page);
if (likely(!is_migrate_isolate(mt)))
__mod_zone_freepage_state(zone, -(1UL << order), mt);
del_page_from_free_list(free_page, zone, order);
for (pfn = free_page_pfn;
pfn < free_page_pfn + (1UL << order);) {
int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
free_page_order = min_t(int,
free_page_order = min_t(unsigned int,
pfn ? __ffs(pfn) : order,
__fls(split_pfn_offset));
__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
......@@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page,
if (split_pfn_offset == 0)
split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
}
out:
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
......
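The READ_ONCE() added to __get_pfnblock_flags_mask() above annotates the documented race with set_pfnblock_flags_mask(): the reader takes one untorn load of the bitmap word and extracts bits only from that snapshot, so a concurrent update can make the result stale but never corrupted. Below is a self-contained sketch of the same pattern using C11 relaxed atomics as a userspace stand-in for the kernel's READ_ONCE()/WRITE_ONCE(); it is an illustration, not code from this commit:

#include <stdatomic.h>

static _Atomic unsigned long flags_word;   /* shared, updated without a lock */

/* Writer: publish the whole word with a single atomic store. */
static void set_flags_word(unsigned long new_word)
{
        atomic_store_explicit(&flags_word, new_word, memory_order_relaxed);
}

/* Reader: one untorn snapshot, then extract bits from it. The result may
 * be stale (the race is accepted) but never a mix of two writes. */
static unsigned long get_flags(unsigned int bitidx, unsigned long mask)
{
        unsigned long word = atomic_load_explicit(&flags_word,
                                                  memory_order_relaxed);

        return (word >> bitidx) & mask;
}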
......@@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
gfp_t gfp_flags, bool isolate_before)
gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
{
unsigned char saved_mt;
unsigned long start_pfn;
......@@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
if (skip_isolation)
VM_BUG_ON(!is_migrate_isolate(saved_mt));
else {
ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
}
/*
* Bail out early when the to-be-isolated pageblock does not form
......@@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
if (pfn + (1UL << order) > boundary_pfn)
split_free_page(page, order, boundary_pfn - pfn);
pfn += (1UL << order);
if (pfn + (1UL << order) > boundary_pfn) {
/* free page changed before split, check it again */
if (split_free_page(page, order, boundary_pfn - pfn))
continue;
}
pfn += 1UL << order;
continue;
}
/*
......@@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
return 0;
failed:
/* restore the original migratetype */
unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
if (!skip_isolation)
unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
return -EBUSY;
}
......@@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
int ret;
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
if (ret)
return ret;
if (isolate_start == isolate_end - pageblock_nr_pages)
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
......
......@@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
continue;
entry = radix_to_swp_entry(folio);
/*
* swapin error entries can be found in the mapping. But they're
* deliberately ignored here as we've done everything we can do.
*/
if (swp_type(entry) != type)
continue;
......@@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
return error;
}
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
struct folio *folio, swp_entry_t swap)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
swp_entry_t swapin_error;
void *old;
swapin_error = make_swapin_error_entry(&folio->page);
old = xa_cmpxchg_irq(&mapping->i_pages, index,
swp_to_radix_entry(swap),
swp_to_radix_entry(swapin_error), 0);
if (old != swp_to_radix_entry(swap))
return;
folio_wait_writeback(folio);
delete_from_swap_cache(&folio->page);
spin_lock_irq(&info->lock);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
* be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
* shmem_evict_inode.
*/
info->alloced--;
info->swapped--;
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
swap_free(swap);
}
/*
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
......@@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*foliop);
*foliop = NULL;
if (is_swapin_error_entry(swap))
return -EIO;
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
......@@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
if (error == -EIO)
shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
if (folio) {
folio_unlock(folio);
......@@ -1906,7 +1945,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
spin_lock_irq(&info->lock);
info->alloced += folio_nr_pages(folio);
inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio);
inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
......
......@@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
return NULL;
swp = radix_to_swp_entry(page);
/* There might be swapin error entries in shmem mapping. */
if (non_swap_entry(swp))
return NULL;
/* Prevent swapoff from happening to us */
si = get_swap_device(swp);
if (!si)
......
......@@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
{
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte;
pte_t *pte, new_pte;
int ret = 1;
swapcache = page;
......@@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
if (unlikely(!PageUptodate(page))) {
pte_t pteval;
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
pteval = swp_entry_to_pte(make_swapin_error_entry(page));
set_pte_at(vma->vm_mm, addr, pte, pteval);
swap_free(entry);
ret = 0;
goto out;
}
/* See do_swap_page() */
BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
BUG_ON(PageAnon(page) && PageAnonExclusive(page));
......@@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
if (pte_swp_soft_dirty(*pte))
new_pte = pte_mksoft_dirty(new_pte);
if (pte_swp_uffd_wp(*pte))
new_pte = pte_mkuffd_wp(new_pte);
set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
pte_unmap_unlock(pte, ptl);
......
......@@ -181,6 +181,7 @@ enum z3fold_page_flags {
NEEDS_COMPACTING,
PAGE_STALE,
PAGE_CLAIMED, /* by either reclaim or free */
PAGE_MIGRATED, /* page is migrated and soon to be released */
};
/*
......@@ -212,10 +213,8 @@ static int size_to_chunks(size_t size)
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
struct z3fold_buddy_slots *slots;
slots = kmem_cache_zalloc(pool->c_handle,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
gfp);
if (slots) {
/* It will be freed separately in free_handle(). */
......@@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
if (locked)
break;
if (locked) {
struct page *page = virt_to_page(zhdr);
if (!test_bit(PAGE_MIGRATED, &page->private))
break;
z3fold_page_unlock(zhdr);
}
cpu_relax();
} while (true);
} else {
......@@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
clear_bit(PAGE_MIGRATED, &page->private);
if (headless)
return zhdr;
......@@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
atomic64_dec(&pool->pages_nr);
}
static void release_z3fold_page(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
__release_z3fold_page(zhdr, false);
}
static void release_z3fold_page_locked(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
......@@ -940,10 +938,19 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
}
}
if (zhdr && !zhdr->slots)
zhdr->slots = alloc_slots(pool,
can_sleep ? GFP_NOIO : GFP_ATOMIC);
if (zhdr && !zhdr->slots) {
zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
if (!zhdr->slots)
goto out_fail;
}
return zhdr;
out_fail:
if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
return NULL;
}
/*
......@@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
if (!size)
if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
if (size > PAGE_SIZE)
......@@ -1093,28 +1100,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
bud = FIRST;
}
page = NULL;
if (can_sleep) {
spin_lock(&pool->stale_lock);
zhdr = list_first_entry_or_null(&pool->stale,
struct z3fold_header, buddy);
/*
* Before allocating a page, let's see if we can take one from
* the stale pages list. cancel_work_sync() can sleep so we
* limit this case to the contexts where we can sleep
*/
if (zhdr) {
list_del(&zhdr->buddy);
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
page = virt_to_page(zhdr);
} else {
spin_unlock(&pool->stale_lock);
}
}
if (!page)
page = alloc_page(gfp);
page = alloc_page(gfp);
if (!page)
return -ENOMEM;
......@@ -1134,10 +1120,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
} else {
if (trylock_page(page)) {
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
}
WARN_ON(!trylock_page(page));
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
}
z3fold_page_lock(zhdr);
......@@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
put_z3fold_header(zhdr);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
......@@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
break;
}
if (kref_get_unless_zero(&zhdr->refcount) == 0) {
zhdr = NULL;
break;
}
if (!z3fold_page_trylock(zhdr)) {
kref_put(&zhdr->refcount, release_z3fold_page);
zhdr = NULL;
continue; /* can't evict at this point */
}
......@@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
*/
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
if (!kref_put(&zhdr->refcount,
release_z3fold_page_locked))
z3fold_page_unlock(zhdr);
z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
}
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
/* See comment in __z3fold_alloc. */
kref_get(&zhdr->refcount);
break;
}
......@@ -1437,8 +1417,10 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
/* We started off locked to we need to lock the pool back */
......@@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
z3fold_page_unlock(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
return -EBUSY;
}
if (work_pending(&zhdr->work)) {
......@@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
page->private = 0;
set_bit(PAGE_MIGRATED, &page->private);
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
INIT_WORK(&new_zhdr->work, compact_page_work);
......@@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
clear_bit(PAGE_CLAIMED, &page->private);
/* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
page->private = 0;
put_page(page);
return 0;
}
......@@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
if (list_empty(&zhdr->buddy))
add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
......
% SPDX-License-Identifier: GPL-2.0
%
% run as: octave-cli memcg_protection.m
%
% This script simulates reclaim protection behavior on a single level of memcg
% hierarchy to illustrate how overcommitted protection spreads among siblings
% (as it depends also on their current consumption).
%
% Simulation assumes siblings consumed the initial amount of memory (w/out
% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated
% same. It simulates only non-low reclaim and assumes all memory.min = 0.
%
% Input configurations
% --------------------
% E number parent effective protection
% n vector nominal protection of siblings set at the given level (memory.low)
% c vector current consumption -,,- (memory.current)
% example from testcase (values in GB)
E = 50 / 1024;
n = [75 25 0 500 ] / 1024;
c = [50 50 50 0] / 1024;
% Reclaim parameters
% ------------------
% Minimal reclaim amount (GB)
cluster = 32*4 / 2**20;
% Reclaim coefficient (think as 0.5^sc->priority)
alpha = .1
% Simulation parameters
% ---------------------
epsilon = 1e-7;
timeout = 1000;
% Simulation loop
% ---------------
ch = [];
eh = [];
rh = [];
for t = 1:timeout
% low_usage
u = min(c, n);
siblings = sum(u);
% effective_protection()
protected = min(n, c); % start with nominal
e = protected * min(1, E / siblings); % normalize overcommit
% recursive protection
unclaimed = max(0, E - siblings);
parent_overuse = sum(c) - siblings;
if (unclaimed > 0 && parent_overuse > 0)
overuse = max(0, c - protected);
e += unclaimed * (overuse / parent_overuse);
endif
% get_scan_count()
r = alpha * c; % assume all memory is in a single LRU list
% commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection")
sz = max(e, c);
r .*= (1 - (e+epsilon) ./ (sz+epsilon));
% uncomment to debug prints
% e, c, r
% nothing to reclaim, reached equilibrium
if max(r) < epsilon
break;
endif
% SWAP_CLUSTER_MAX roundup
r = max(r, (r > epsilon) .* cluster);
% XXX here I do parallel reclaim of all siblings
% in reality reclaim is serialized and each sibling recalculates own residual
c = max(c - r, 0);
ch = [ch ; c];
eh = [eh ; e];
rh = [rh ; r];
endfor
t
c, e
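% Approximate equilibrium for the example configuration above (these match
% the expectations documented in test_memcontrol.c for the updated
% test_memcg_protection() test; exact values depend on the reclaim
% parameters chosen):
%   c      ~= [29 21 0 0] / 1024   % A/B/C, A/B/D, A/B/E, A/B/F (GB)
%   sum(c) ~= 50 / 1024            % parent A/B stays near its 50M protection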
......@@ -190,13 +190,6 @@ static int test_memcg_current(const char *root)
return ret;
}
static int alloc_pagecache_50M(const char *cgroup, void *arg)
{
int fd = (long)arg;
return alloc_pagecache(fd, MB(50));
}
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
int fd = (long)arg;
......@@ -247,33 +240,39 @@ static int cg_test_proc_killed(const char *cgroup)
/*
* First, this test creates the following hierarchy:
* A memory.min = 50M, memory.max = 200M
* A/B memory.min = 50M, memory.current = 50M
* A memory.min = 0, memory.max = 200M
* A/B memory.min = 50M
* A/B/C memory.min = 75M, memory.current = 50M
* A/B/D memory.min = 25M, memory.current = 50M
* A/B/E memory.min = 0, memory.current = 50M
* A/B/F memory.min = 500M, memory.current = 0
*
* Usages are pagecache, but the test keeps a running
* (or memory.low if we test soft protection)
*
* Usages are pagecache and the test keeps a running
* process in every leaf cgroup.
* Then it creates A/G and creates a significant
* memory pressure in it.
* memory pressure in A.
*
* Then it checks actual memory usages and expects that:
* A/B memory.current ~= 50M
* A/B/C memory.current ~= 33M
* A/B/D memory.current ~= 17M
* A/B/F memory.current ~= 0
* A/B/C memory.current ~= 29M
* A/B/D memory.current ~= 21M
* A/B/E memory.current ~= 0
* A/B/F memory.current = 0
* (for origin of the numbers, see model in memcg_protection.m.)
*
* After that it tries to allocate more than there is
* unprotected memory in A available, and checks
* checks that memory.min protects pagecache even
* in this case.
* unprotected memory in A available, and checks that:
* a) memory.min protects pagecache even in this case,
* b) memory.low allows reclaiming page cache with low events.
*/
static int test_memcg_min(const char *root)
static int test_memcg_protection(const char *root, bool min)
{
int ret = KSFT_FAIL;
int ret = KSFT_FAIL, rc;
char *parent[3] = {NULL};
char *children[4] = {NULL};
const char *attribute = min ? "memory.min" : "memory.low";
long c[4];
int i, attempts;
int fd;
......@@ -297,8 +296,10 @@ static int test_memcg_min(const char *root)
if (cg_create(parent[0]))
goto cleanup;
if (cg_read_long(parent[0], "memory.min")) {
ret = KSFT_SKIP;
if (cg_read_long(parent[0], attribute)) {
/* No memory.min on older kernels is fine */
if (min)
ret = KSFT_SKIP;
goto cleanup;
}
......@@ -335,17 +336,15 @@ static int test_memcg_min(const char *root)
(void *)(long)fd);
}
if (cg_write(parent[0], "memory.min", "50M"))
if (cg_write(parent[1], attribute, "50M"))
goto cleanup;
if (cg_write(parent[1], "memory.min", "50M"))
if (cg_write(children[0], attribute, "75M"))
goto cleanup;
if (cg_write(children[0], "memory.min", "75M"))
if (cg_write(children[1], attribute, "25M"))
goto cleanup;
if (cg_write(children[1], "memory.min", "25M"))
if (cg_write(children[2], attribute, "0"))
goto cleanup;
if (cg_write(children[2], "memory.min", "0"))
goto cleanup;
if (cg_write(children[3], "memory.min", "500M"))
if (cg_write(children[3], attribute, "500M"))
goto cleanup;
attempts = 0;
......@@ -365,170 +364,35 @@ static int test_memcg_min(const char *root)
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
if (!values_close(c[0], MB(33), 10))
if (!values_close(c[0], MB(29), 10))
goto cleanup;
if (!values_close(c[1], MB(17), 10))
if (!values_close(c[1], MB(21), 10))
goto cleanup;
if (c[3] != 0)
goto cleanup;
if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
goto cleanup;
if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
goto cleanup;
ret = KSFT_PASS;
cleanup:
for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
if (!children[i])
continue;
cg_destroy(children[i]);
free(children[i]);
}
for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
if (!parent[i])
continue;
cg_destroy(parent[i]);
free(parent[i]);
}
close(fd);
return ret;
}
/*
* First, this test creates the following hierarchy:
* A memory.low = 50M, memory.max = 200M
* A/B memory.low = 50M, memory.current = 50M
* A/B/C memory.low = 75M, memory.current = 50M
* A/B/D memory.low = 25M, memory.current = 50M
* A/B/E memory.low = 0, memory.current = 50M
* A/B/F memory.low = 500M, memory.current = 0
*
* Usages are pagecache.
* Then it creates A/G an creates a significant
* memory pressure in it.
*
* Then it checks actual memory usages and expects that:
* A/B memory.current ~= 50M
* A/B/ memory.current ~= 33M
* A/B/D memory.current ~= 17M
* A/B/F memory.current ~= 0
*
* After that it tries to allocate more than there is
* unprotected memory in A available,
* and checks low and oom events in memory.events.
*/
static int test_memcg_low(const char *root)
{
int ret = KSFT_FAIL;
char *parent[3] = {NULL};
char *children[4] = {NULL};
long low, oom;
long c[4];
int i;
int fd;
fd = get_temp_fd();
if (fd < 0)
goto cleanup;
parent[0] = cg_name(root, "memcg_test_0");
if (!parent[0])
goto cleanup;
parent[1] = cg_name(parent[0], "memcg_test_1");
if (!parent[1])
goto cleanup;
parent[2] = cg_name(parent[0], "memcg_test_2");
if (!parent[2])
goto cleanup;
if (cg_create(parent[0]))
goto cleanup;
if (cg_read_long(parent[0], "memory.low"))
goto cleanup;
if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
goto cleanup;
if (cg_write(parent[0], "memory.max", "200M"))
goto cleanup;
if (cg_write(parent[0], "memory.swap.max", "0"))
goto cleanup;
if (cg_create(parent[1]))
goto cleanup;
if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
if (min && !rc)
goto cleanup;
if (cg_create(parent[2]))
else if (!min && rc) {
fprintf(stderr,
"memory.low prevents from allocating anon memory\n");
goto cleanup;
for (i = 0; i < ARRAY_SIZE(children); i++) {
children[i] = cg_name_indexed(parent[1], "child_memcg", i);
if (!children[i])
goto cleanup;
if (cg_create(children[i]))
goto cleanup;
if (i > 2)
continue;
if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
goto cleanup;
}
if (cg_write(parent[0], "memory.low", "50M"))
goto cleanup;
if (cg_write(parent[1], "memory.low", "50M"))
goto cleanup;
if (cg_write(children[0], "memory.low", "75M"))
goto cleanup;
if (cg_write(children[1], "memory.low", "25M"))
goto cleanup;
if (cg_write(children[2], "memory.low", "0"))
goto cleanup;
if (cg_write(children[3], "memory.low", "500M"))
goto cleanup;
if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
goto cleanup;
if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
goto cleanup;
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
if (!values_close(c[0], MB(33), 10))
goto cleanup;
if (!values_close(c[1], MB(17), 10))
goto cleanup;
if (c[3] != 0)
goto cleanup;
if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
fprintf(stderr,
"memory.low prevents from allocating anon memory\n");
if (min) {
ret = KSFT_PASS;
goto cleanup;
}
for (i = 0; i < ARRAY_SIZE(children); i++) {
int no_low_events_index = has_recursiveprot ? 2 : 1;
int no_low_events_index = 1;
long low, oom;
oom = cg_read_key_long(children[i], "memory.events", "oom ");
low = cg_read_key_long(children[i], "memory.events", "low ");
......@@ -564,6 +428,16 @@ static int test_memcg_low(const char *root)
return ret;
}
static int test_memcg_min(const char *root)
{
return test_memcg_protection(root, true);
}
static int test_memcg_low(const char *root)
{
return test_memcg_protection(root, false);
}
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
size_t size = MB(50);
......@@ -1241,7 +1115,16 @@ static int test_memcg_oom_group_leaf_events(const char *root)
if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
goto cleanup;
if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0)
parent_oom_events = cg_read_key_long(
parent, "memory.events", "oom_kill ");
/*
* If memory_localevents is not enabled (the default), the parent should
* count OOM events in its children groups. Otherwise, it should not
* have observed any events.
*/
if (has_localevents && parent_oom_events != 0)
goto cleanup;
else if (!has_localevents && parent_oom_events <= 0)
goto cleanup;
ret = KSFT_PASS;
......@@ -1349,20 +1232,14 @@ static int test_memcg_oom_group_score_events(const char *root)
if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
goto cleanup;
parent_oom_events = cg_read_key_long(
parent, "memory.events", "oom_kill ");
/*
* If memory_localevents is not enabled (the default), the parent should
* count OOM events in its children groups. Otherwise, it should not
* have observed any events.
*/
if ((has_localevents && parent_oom_events == 0) ||
parent_oom_events > 0)
ret = KSFT_PASS;
if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
goto cleanup;
if (kill(safe_pid, SIGKILL))
goto cleanup;
ret = KSFT_PASS;
cleanup:
if (memcg)
cg_destroy(memcg);
......