Commit d4af56c5 authored by Liam R. Howlett, committed by Andrew Morton

mm: start tracking VMAs with maple tree

Start tracking the VMAs with the new maple tree structure in parallel with
the rb_tree.  Add debug and trace events for maple tree operations and
duplicate the rb_tree that is created on forks into the maple tree.

The maple tree is added to the mm_struct (including the init_mm
initializer), support is added to the required mm/mmap functions,
tracking is added in kernel/fork for process forking, and the tree is
used to find the unmapped area, with the result checked against what
the rb_tree finds.
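
As a rough illustration of that cross-checking, the lookup side can query
both structures and assert that they agree.  This is only a sketch — the
helper name find_vma_checked is made up here, and the real validation
lives in the collapsed mm/mmap.c diff — but it shows the intent:

    /* Sketch: both trees are maintained in parallel, so lookups must
     * agree.  Caller must hold the mmap_lock. */
    struct vm_area_struct *find_vma_checked(struct mm_struct *mm,
                                            unsigned long addr)
    {
            unsigned long index = addr;
            struct vm_area_struct *mt_vma, *rb_vma;

            /* The maple tree indexes each VMA over [vm_start, vm_end - 1]. */
            mt_vma = mt_find(&mm->mm_mt, &index, ULONG_MAX);
            rb_vma = find_vma(mm, addr);    /* existing rb_tree lookup */

            VM_BUG_ON_MM(mt_vma != rb_vma, mm);
            return rb_vma;
    }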

This also moves where exit_mmap() takes the mmap_lock(), since the oom
reaper does walk the VMAs; otherwise lockdep complains when an oom
occurs.

When splitting a VMA fails due to allocation of maple tree nodes, the
error path in __split_vma() calls new->vm_ops->close(new).  The hugetlb
page accounting actually lives in the close() operation, so close()
accounts for the removal of half of the VMA even though the VMA was
never adjusted, leaving the accounting negative at exit.  To avoid the
negative charge, set vm_start = vm_end and vm_pgoff = 0 so that close()
has nothing to un-account.
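
Condensed, the fix in the __split_vma() error path looks roughly like
this (a sketch of the change, not the full context, which is in the
collapsed mm/mmap.c diff):

    /* Error path of __split_vma() after maple tree node allocation
     * fails.  With an empty range (vm_start == vm_end) and vm_pgoff
     * == 0, hugetlb's accounting in ->close() finds nothing to
     * un-account, so the charge stays balanced even though the split
     * was never applied. */
    new->vm_start = new->vm_end;
    new->vm_pgoff = 0;
    if (new->vm_ops && new->vm_ops->close)
            new->vm_ops->close(new);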

There is also a potential accounting issue in special mappings when an
allocation in insert_vm_struct() fails, so reverse the charge in that
failure path.
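
The shape of that fix is roughly the following (hedged sketch; the
charged variable and the int-returning vma_link() reflect this patch's
mm/mmap.c changes, which are collapsed below):

    /* In insert_vm_struct(): once linking can fail on maple tree node
     * allocation, undo the earlier memory charge on failure. */
    if (vma_link(mm, vma, prev, rb_link, rb_parent)) {
            if (vma->vm_flags & VM_ACCOUNT)
                    vm_unacct_memory(charged);
            return -ENOMEM;
    }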

Link: https://lkml.kernel.org/r/20220906194824.2110408-9-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent e15e06a8
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -96,6 +96,7 @@ void __init tboot_probe(void)
 static pgd_t *tboot_pg_dir;
 static struct mm_struct tboot_mm = {
 	.mm_rb = RB_ROOT,
+	.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock),
 	.pgd = swapper_pg_dir,
 	.mm_users = ATOMIC_INIT(2),
 	.mm_count = ATOMIC_INIT(1),
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c
@@ -58,6 +58,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR;
 struct mm_struct efi_mm = {
 	.mm_rb = RB_ROOT,
+	.mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock),
 	.mm_users = ATOMIC_INIT(2),
 	.mm_count = ATOMIC_INIT(1),
 	.write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2567,6 +2567,8 @@ extern bool arch_has_descending_max_zone_pfns(void);
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
+/* mmap.c */
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
 
 /* interval_tree.c */
 void vma_interval_tree_insert(struct vm_area_struct *node,
@@ -2630,6 +2632,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	bool *need_rmap_locks);
 extern void exit_mmap(struct mm_struct *);
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas);
+
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
 				    unsigned long start,
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
+#include <linux/maple_tree.h>
 #include <linux/rwsem.h>
 #include <linux/completion.h>
 #include <linux/cpumask.h>
@@ -486,6 +487,7 @@ struct kioctx_table;
 struct mm_struct {
 	struct {
 		struct vm_area_struct *mmap;		/* list of VMAs */
+		struct maple_tree mm_mt;
 		struct rb_root mm_rb;
 		u64 vmacache_seqnum;		/* per-thread vmacache */
 #ifdef CONFIG_MMU
@@ -697,6 +699,7 @@ struct mm_struct {
 	unsigned long cpu_bitmap[];
 };
 
+#define MM_MT_FLAGS	(MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
 extern struct mm_struct init_mm;
 
 /* Pointer magic because the dynamic array size confuses some compilers. */
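
MM_MT_FLAGS pairs MT_FLAGS_ALLOC_RANGE (the tree tracks gaps, which is
what get_unmapped_area searches) with MT_FLAGS_LOCK_EXTERN (writers are
serialized by an external lock — here the mmap_lock — instead of the
tree's internal spinlock).  That is why every static mm initializer in
this patch passes the owning mm's mmap_lock to MTREE_INIT_EXT.  A
minimal sketch of the pattern, using a made-up example_mm:

    #include <linux/mm_types.h>

    /* Hypothetical static mm following the init_mm/efi_mm/tboot_mm
     * pattern: the maple tree is initialized with MM_MT_FLAGS and told
     * that example_mm.mmap_lock is its external lock. */
    static struct mm_struct example_mm = {
            .mm_rb = RB_ROOT,
            .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, example_mm.mmap_lock),
            .mm_users = ATOMIC_INIT(2),
            .mm_count = ATOMIC_INIT(1),
            MMAP_LOCK_INITIALIZER(example_mm)
    };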
--- a/include/trace/events/mmap.h
+++ b/include/trace/events/mmap.h
@@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area,
 		__entry->low_limit, __entry->high_limit, __entry->align_mask,
 		__entry->align_offset)
 );
+
+TRACE_EVENT(vma_mas_szero,
+	TP_PROTO(struct maple_tree *mt, unsigned long start,
+		 unsigned long end),
+
+	TP_ARGS(mt, start, end),
+
+	TP_STRUCT__entry(
+			__field(struct maple_tree *, mt)
+			__field(unsigned long, start)
+			__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+			__entry->mt	= mt;
+			__entry->start	= start;
+			__entry->end	= end;
+	),
+
+	TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,",
+		  __entry->mt,
+		  (unsigned long) __entry->start,
+		  (unsigned long) __entry->end
+	)
+);
+
+TRACE_EVENT(vma_store,
+	TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma),
+
+	TP_ARGS(mt, vma),
+
+	TP_STRUCT__entry(
+			__field(struct maple_tree *, mt)
+			__field(struct vm_area_struct *, vma)
+			__field(unsigned long, vm_start)
+			__field(unsigned long, vm_end)
+	),
+
+	TP_fast_assign(
+			__entry->mt		= mt;
+			__entry->vma		= vma;
+			__entry->vm_start	= vma->vm_start;
+			__entry->vm_end		= vma->vm_end - 1;
+	),
+
+	TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,",
+		  __entry->mt, __entry->vma,
+		  (unsigned long) __entry->vm_start,
+		  (unsigned long) __entry->vm_end
+	)
+);
+
+TRACE_EVENT(exit_mmap,
+	TP_PROTO(struct mm_struct *mm),
+
+	TP_ARGS(mm),
+
+	TP_STRUCT__entry(
+			__field(struct mm_struct *, mm)
+			__field(struct maple_tree *, mt)
+	),
+
+	TP_fast_assign(
+			__entry->mm	= mm;
+			__entry->mt	= &mm->mm_mt;
+	),
+
+	TP_printk("mt_mod %p, DESTROY\n",
+		  __entry->mt
+	)
+);
+
 #endif
 
 /* This part must be outside protection */
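
The events follow the tree modifications.  For example, the mm/mmap.c
version of vma_mas_store() (collapsed below) emits vma_store before
writing the VMA's range; a condensed sketch:

    void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
    {
            trace_vma_store(mas->tree, vma);
            mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
            mas_store_prealloc(mas, vma);
    }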
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -585,6 +585,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	int retval;
 	unsigned long charge;
 	LIST_HEAD(uf);
+	MA_STATE(mas, &mm->mm_mt, 0, 0);
 
 	uprobe_start_dup_mmap();
 	if (mmap_write_lock_killable(oldmm)) {
@@ -614,6 +615,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		goto out;
 	khugepaged_fork(mm, oldmm);
 
+	retval = mas_expected_entries(&mas, oldmm->map_count);
+	if (retval)
+		goto out;
+
 	prev = NULL;
 	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
@@ -629,7 +634,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		 */
 		if (fatal_signal_pending(current)) {
 			retval = -EINTR;
-			goto out;
+			goto loop_out;
 		}
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len = vma_pages(mpnt);
@@ -694,6 +699,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		rb_link = &tmp->vm_rb.rb_right;
 		rb_parent = &tmp->vm_rb;
 
+		/* Link the vma into the MT */
+		mas.index = tmp->vm_start;
+		mas.last = tmp->vm_end - 1;
+		mas_store(&mas, tmp);
+
 		mm->map_count++;
 		if (!(tmp->vm_flags & VM_WIPEONFORK))
 			retval = copy_page_range(tmp, mpnt);
@@ -702,10 +712,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 			tmp->vm_ops->open(tmp);
 
 		if (retval)
-			goto out;
+			goto loop_out;
 	}
 	/* a new mm has just been created */
 	retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+	mas_destroy(&mas);
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
@@ -721,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 fail_nomem:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
-	goto out;
+	goto loop_out;
 }
 
 static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -1111,6 +1123,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 {
 	mm->mmap = NULL;
 	mm->mm_rb = RB_ROOT;
+	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	mm->vmacache_seqnum = 0;
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
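
The fork-side pattern is worth calling out: mas_expected_entries()
reserves enough maple tree nodes for a known number of stores up front,
each per-VMA mas_store() then consumes those reservations (and so cannot
fail mid-loop), and mas_destroy() releases whatever is left over.  A
condensed sketch of the flow, with a made-up helper name and the
vm_area_dup()/error handling elided:

    static int copy_vmas_sketch(struct mm_struct *mm, struct mm_struct *oldmm)
    {
            struct vm_area_struct *mpnt;
            MA_STATE(mas, &mm->mm_mt, 0, 0);
            int ret;

            /* Reserve nodes for oldmm->map_count stores. */
            ret = mas_expected_entries(&mas, oldmm->map_count);
            if (ret)
                    return ret;

            for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
                    /* dup_mmap() stores tmp = vm_area_dup(mpnt); the
                     * duplication is elided in this sketch. */
                    mas.index = mpnt->vm_start;
                    mas.last = mpnt->vm_end - 1;    /* inclusive last index */
                    mas_store(&mas, mpnt);          /* uses the reservations */
            }

            mas_destroy(&mas);      /* free unused reservations */
            return 0;
    }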
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/mm_types.h>
 #include <linux/rbtree.h>
+#include <linux/maple_tree.h>
 #include <linux/rwsem.h>
 #include <linux/spinlock.h>
 #include <linux/list.h>
@@ -29,6 +30,7 @@
  */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
+	.mm_mt		= MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
 	.pgd		= swapper_pg_dir,
 	.mm_users	= ATOMIC_INIT(2),
 	.mm_count	= ATOMIC_INIT(1),
[The mm/mmap.c diff is collapsed in this view.]
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -545,6 +545,19 @@ static void put_nommu_region(struct vm_region *region)
 	__put_nommu_region(region);
 }
 
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
+{
+	mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+	mas_store_prealloc(mas, vma);
+}
+
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+{
+	mas->index = vma->vm_start;
+	mas->last = vma->vm_end - 1;
+	mas_store_prealloc(mas, NULL);
+}
+
 /*
  * add a VMA into a process's mm_struct in the appropriate place in the list
  * and tree and add to the address space's page tree also if not an anonymous
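
Both helpers end in mas_store_prealloc(), so the caller is expected to
have preallocated nodes first; the allocation is the only step that can
fail, and it fails before the tree is touched.  A hedged usage sketch,
assuming the caller holds the mmap_lock for writing:

    MA_STATE(mas, &mm->mm_mt, 0, 0);

    /* Worst-case node allocation up front; only this step can fail. */
    if (mas_preallocate(&mas, vma, GFP_KERNEL))
            return -ENOMEM;

    vma_mas_store(vma, &mas);       /* cannot fail: nodes preallocated */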