Commit 9651fced authored by Jason A. Donenfeld

mm: add MAP_DROPPABLE for designating always lazily freeable mappings

The vDSO getrandom() implementation works with a buffer allocated with a
new system call that has certain requirements:

- It shouldn't be written to core dumps.
  * Easy: VM_DONTDUMP.
- It should be zeroed on fork.
  * Easy: VM_WIPEONFORK.

- It shouldn't be written to swap.
  * Uh-oh: mlock is rlimited.
  * Uh-oh: mlock isn't inherited by forks.

- It shouldn't reserve actual memory, but it also shouldn't crash when
  page faulting in memory if none is available.
  * Uh-oh: VM_NORESERVE means segfaults.

It turns out that the vDSO getrandom() function has three really nice
characteristics that we can exploit to solve this problem:

1) Due to being wiped during fork(), the vDSO code is already robust to
   having the contents of the pages it reads zeroed out midway through
   the function's execution.

2) In the absolute worst case of whatever contingency we're coding for,
   we have the option to fall back to the getrandom() syscall, and
   everything is fine.

3) The buffers the function uses are only ever useful for a maximum of
   60 seconds -- a sort of cache, rather than a long term allocation.

These characteristics mean that we can introduce VM_DROPPABLE, which
has the following semantics:

a) It never is written out to swap.
b) Under memory pressure, mm can just drop the pages (so that they're
   zero when read back again).
c) It is inherited by fork.
d) It doesn't count against the mlock budget, since nothing is locked.
e) If there's not enough memory to service a page fault, it's not fatal,
   and no signal is sent.

This way, allocations used by vDSO getrandom() can use:

    VM_DROPPABLE | VM_DONTDUMP | VM_WIPEONFORK | VM_NORESERVE

And there will be no problem with OOMing, crashing on overcommitment,
using memory when not in use, not wiping on fork(), coredumps, or
writing out to swap.

In order to let vDSO getrandom() use this, expose these via mmap(2) as
MAP_DROPPABLE.

Note that this involves removing the MADV_FREE special case from
sort_folio(), which according to Yu Zhao is unnecessary and will simply
result in an extra call to shrink_folio_list() in the worst case. The
chunk removed reenables the swapbacked flag, which we don't want for
VM_DROPPABLE, and we can't conditionalize it here because there isn't a
vma reference available.

Finally, the provided self test ensures that this is working as desired.

Cc: linux-mm@kvack.org
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
parent 8a18fda0
...@@ -708,6 +708,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) ...@@ -708,6 +708,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_SHADOW_STACK)] = "ss", [ilog2(VM_SHADOW_STACK)] = "ss",
#endif #endif
#ifdef CONFIG_64BIT #ifdef CONFIG_64BIT
[ilog2(VM_DROPPABLE)] = "dp",
[ilog2(VM_SEALED)] = "sl", [ilog2(VM_SEALED)] = "sl",
#endif #endif
}; };
......
...@@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp); ...@@ -406,6 +406,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ALLOW_ANY_UNCACHED VM_NONE #define VM_ALLOW_ANY_UNCACHED VM_NONE
#endif #endif
/*
 * VM_DROPPABLE is only given a real bit (bit 40) when CONFIG_64BIT is set.
 * Otherwise it is defined as VM_NONE, so code can compare it against
 * VM_NONE to tell whether the flag is available in this build.
 */
#ifdef CONFIG_64BIT
#define VM_DROPPABLE_BIT 40
#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT)
#else
#define VM_DROPPABLE VM_NONE
#endif
#ifdef CONFIG_64BIT #ifdef CONFIG_64BIT
/* VM is sealed, in vm_flags */ /* VM is sealed, in vm_flags */
#define VM_SEALED _BITUL(63) #define VM_SEALED _BITUL(63)
......
...@@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, ...@@ -218,6 +218,9 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
{ {
vm_flags &= __VM_UFFD_FLAGS; vm_flags &= __VM_UFFD_FLAGS;
if (vm_flags & VM_DROPPABLE)
return false;
if ((vm_flags & VM_UFFD_MINOR) && if ((vm_flags & VM_UFFD_MINOR) &&
(!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
return false; return false;
......
...@@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3) ...@@ -165,6 +165,12 @@ IF_HAVE_PG_ARCH_X(arch_3)
# define IF_HAVE_UFFD_MINOR(flag, name) # define IF_HAVE_UFFD_MINOR(flag, name)
#endif #endif
/*
 * Expands to a "{flag, name}," table entry (trailing comma included) when
 * CONFIG_64BIT is set, and to nothing otherwise.
 */
#ifdef CONFIG_64BIT
# define IF_HAVE_VM_DROPPABLE(flag, name) {flag, name},
#else
# define IF_HAVE_VM_DROPPABLE(flag, name)
#endif
#define __def_vmaflag_names \ #define __def_vmaflag_names \
{VM_READ, "read" }, \ {VM_READ, "read" }, \
{VM_WRITE, "write" }, \ {VM_WRITE, "write" }, \
...@@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ ...@@ -197,6 +203,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \ {VM_MIXEDMAP, "mixedmap" }, \
{VM_HUGEPAGE, "hugepage" }, \ {VM_HUGEPAGE, "hugepage" }, \
{VM_NOHUGEPAGE, "nohugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \
IF_HAVE_VM_DROPPABLE(VM_DROPPABLE, "droppable" ) \
{VM_MERGEABLE, "mergeable" } \ {VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \ #define show_vma_flags(flags) \
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */ #define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/* /*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
......
...@@ -717,7 +717,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma) ...@@ -717,7 +717,7 @@ static bool vma_ksm_compatible(struct vm_area_struct *vma)
{ {
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP | if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
VM_IO | VM_DONTEXPAND | VM_HUGETLB | VM_IO | VM_DONTEXPAND | VM_HUGETLB |
VM_MIXEDMAP)) VM_MIXEDMAP| VM_DROPPABLE))
return false; /* just ignore the advice */ return false; /* just ignore the advice */
if (vma_is_dax(vma)) if (vma_is_dax(vma))
......
...@@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, ...@@ -1068,13 +1068,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_WIPEONFORK; new_flags |= VM_WIPEONFORK;
break; break;
case MADV_KEEPONFORK: case MADV_KEEPONFORK:
if (vma->vm_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK; new_flags &= ~VM_WIPEONFORK;
break; break;
case MADV_DONTDUMP: case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP; new_flags |= VM_DONTDUMP;
break; break;
case MADV_DODUMP: case MADV_DODUMP:
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
(vma->vm_flags & VM_DROPPABLE))
return -EINVAL; return -EINVAL;
new_flags &= ~VM_DONTDUMP; new_flags &= ~VM_DONTDUMP;
break; break;
......
...@@ -5660,6 +5660,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -5660,6 +5660,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* If the fault handler drops the mmap_lock, vma may be freed */ /* If the fault handler drops the mmap_lock, vma may be freed */
struct mm_struct *mm = vma->vm_mm; struct mm_struct *mm = vma->vm_mm;
vm_fault_t ret; vm_fault_t ret;
bool is_droppable;
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
...@@ -5674,6 +5675,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -5674,6 +5675,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
goto out; goto out;
} }
is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
/* /*
* Enable the memcg OOM handling for faults triggered in user * Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully. * space. Kernel faults are handled more gracefully.
...@@ -5688,8 +5691,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -5688,8 +5691,18 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
else else
ret = __handle_mm_fault(vma, address, flags); ret = __handle_mm_fault(vma, address, flags);
/*
* Warning: It is no longer safe to dereference vma-> after this point,
* because mmap_lock might have been dropped by __handle_mm_fault(), so
* vma might be destroyed from underneath us.
*/
lru_gen_exit_fault(); lru_gen_exit_fault();
/* If the mapping is droppable, then errors due to OOM aren't fatal. */
if (is_droppable)
ret &= ~VM_FAULT_OOM;
if (flags & FAULT_FLAG_USER) { if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault(); mem_cgroup_exit_user_fault();
/* /*
......
...@@ -2300,6 +2300,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct ...@@ -2300,6 +2300,9 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
pgoff_t ilx; pgoff_t ilx;
struct page *page; struct page *page;
if (vma->vm_flags & VM_DROPPABLE)
gfp |= __GFP_NOWARN;
pol = get_vma_policy(vma, addr, order, &ilx); pol = get_vma_policy(vma, addr, order, &ilx);
page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order, page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
pol, ilx, numa_node_id()); pol, ilx, numa_node_id());
......
...@@ -485,7 +485,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, ...@@ -485,7 +485,7 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (newflags == oldflags || (oldflags & VM_SPECIAL) || if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
vma_is_dax(vma) || vma_is_secretmem(vma)) vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out; goto out;
......
...@@ -1369,6 +1369,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr, ...@@ -1369,6 +1369,36 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
pgoff = 0; pgoff = 0;
vm_flags |= VM_SHARED | VM_MAYSHARE; vm_flags |= VM_SHARED | VM_MAYSHARE;
break; break;
case MAP_DROPPABLE:
if (VM_DROPPABLE == VM_NONE)
return -ENOTSUPP;
/*
* A locked or stack area makes no sense to be droppable.
*
* Also, since droppable pages can just go away at any time
* it makes no sense to copy them on fork or dump them.
*
* And don't attempt to combine with hugetlb for now.
*/
if (flags & (MAP_LOCKED | MAP_HUGETLB))
return -EINVAL;
if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
return -EINVAL;
vm_flags |= VM_DROPPABLE;
/*
* If the pages can be dropped, then it doesn't make
* sense to reserve them.
*/
vm_flags |= VM_NORESERVE;
/*
* Likewise, they're volatile enough that they
* shouldn't survive forks or coredumps.
*/
vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
fallthrough;
case MAP_PRIVATE: case MAP_PRIVATE:
/* /*
* Set pgoff according to addr for anon_vma. * Set pgoff according to addr for anon_vma.
......
...@@ -1397,7 +1397,12 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, ...@@ -1397,7 +1397,12 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
VM_BUG_ON_VMA(address < vma->vm_start || VM_BUG_ON_VMA(address < vma->vm_start ||
address + (nr << PAGE_SHIFT) > vma->vm_end, vma); address + (nr << PAGE_SHIFT) > vma->vm_end, vma);
__folio_set_swapbacked(folio); /*
* VM_DROPPABLE mappings don't swap; instead they're just dropped when
* under memory pressure.
*/
if (!(vma->vm_flags & VM_DROPPABLE))
__folio_set_swapbacked(folio);
__folio_set_anon(folio, vma, address, true); __folio_set_anon(folio, vma, address, true);
if (likely(!folio_test_large(folio))) { if (likely(!folio_test_large(folio))) {
...@@ -1841,7 +1846,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, ...@@ -1841,7 +1846,13 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* plus the rmap(s) (dropped by discard:). * plus the rmap(s) (dropped by discard:).
*/ */
if (ref_count == 1 + map_count && if (ref_count == 1 + map_count &&
!folio_test_dirty(folio)) { (!folio_test_dirty(folio) ||
/*
* Unlike MADV_FREE mappings, VM_DROPPABLE
* ones can be dropped even if they've
* been dirtied.
*/
(vma->vm_flags & VM_DROPPABLE))) {
dec_mm_counter(mm, MM_ANONPAGES); dec_mm_counter(mm, MM_ANONPAGES);
goto discard; goto discard;
} }
...@@ -1851,7 +1862,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, ...@@ -1851,7 +1862,12 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* discarded. Remap the page to page table. * discarded. Remap the page to page table.
*/ */
set_pte_at(mm, address, pvmw.pte, pteval); set_pte_at(mm, address, pvmw.pte, pteval);
folio_set_swapbacked(folio); /*
* Unlike MADV_FREE mappings, VM_DROPPABLE ones
* never get swap backed on failure to drop.
*/
if (!(vma->vm_flags & VM_DROPPABLE))
folio_set_swapbacked(folio);
ret = false; ret = false;
page_vma_mapped_walk_done(&pvmw); page_vma_mapped_walk_done(&pvmw);
break; break;
......
...@@ -4265,15 +4265,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c ...@@ -4265,15 +4265,6 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c
return true; return true;
} }
/* dirty lazyfree */
if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
success = lru_gen_del_folio(lruvec, folio, true);
VM_WARN_ON_ONCE_FOLIO(!success, folio);
folio_set_swapbacked(folio);
lruvec_add_folio_tail(lruvec, folio);
return true;
}
/* promoted */ /* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
list_move(&folio->lru, &lrugen->folios[gen][type][zone]); list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */ #define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ #define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_DROPPABLE 0x08 /* Zero memory under memory pressure. */
/* /*
* Huge page size encoding when MAP_HUGETLB is specified, and a huge page * Huge page size encoding when MAP_HUGETLB is specified, and a huge page
......
...@@ -49,3 +49,4 @@ hugetlb_fault_after_madv ...@@ -49,3 +49,4 @@ hugetlb_fault_after_madv
hugetlb_madv_vs_map hugetlb_madv_vs_map
mseal_test mseal_test
seal_elf seal_elf
droppable
...@@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests ...@@ -73,6 +73,7 @@ TEST_GEN_FILES += ksm_functional_tests
TEST_GEN_FILES += mdwe_test TEST_GEN_FILES += mdwe_test
TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_fault_after_madv
TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_madv_vs_map
TEST_GEN_FILES += droppable
ifneq ($(ARCH),arm64) ifneq ($(ARCH),arm64)
TEST_GEN_FILES += soft-dirty TEST_GEN_FILES += soft-dirty
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <assert.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <linux/mman.h>
#include "../kselftest.h"
/*
 * MAP_DROPPABLE self test.
 *
 * Maps a 128 MiB anonymous MAP_DROPPABLE region and fills every byte with
 * 'A'. A forked child then allocates and touches memory without end, while
 * the parent polls the first byte of each page until one reads back as zero,
 * i.e. until at least one page of the mapping has been dropped. The child is
 * then terminated and reaped, and the single planned test is reported as
 * passed.
 *
 * argc and argv are unused. The process exits with KSFT_PASS on success;
 * setup failures (mmap, fork) trip an assert.
 */
int main(int argc, char *argv[])
{
	size_t alloc_size = 134217728;
	size_t page_size = getpagesize();
	/*
	 * The mapping is read through a volatile pointer: its contents can
	 * change behind the program's back, so every poll must really load
	 * from memory instead of being hoisted out of the loop.
	 */
	const volatile uint8_t *bytes;
	void *alloc;
	pid_t child;

	ksft_print_header();
	ksft_set_plan(1);

	alloc = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_DROPPABLE, -1, 0);
	assert(alloc != MAP_FAILED);
	memset(alloc, 'A', alloc_size);

	/* Check that the fill pattern is visible in every page. */
	bytes = alloc;
	for (size_t i = 0; i < alloc_size; i += page_size)
		assert(bytes[i]);

	child = fork();
	assert(child >= 0);
	if (!child) {
		/*
		 * Child: consume memory forever, one page-sized allocation at
		 * a time. The volatile store keeps the allocation from being
		 * optimized away, and a failed malloc() is simply retried
		 * rather than dereferenced.
		 */
		for (;;) {
			volatile char *hog = malloc(page_size);

			if (hog)
				*hog = 'B';
		}
	}

	/* Parent: spin until some page of the mapping reads back as zero. */
	for (bool done = false; !done;) {
		for (size_t i = 0; i < alloc_size; i += page_size) {
			if (!bytes[i]) {
				done = true;
				break;
			}
		}
	}

	/* Stop the memory hog and reap it so no zombie is left behind. */
	kill(child, SIGTERM);
	waitpid(child, NULL, 0);

	ksft_test_result_pass("MAP_DROPPABLE: PASS\n");
	exit(KSFT_PASS);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment