Commit 50eb842f authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "28 patches.

  Subsystems affected by this series: mm (memblock, pagealloc, hugetlb,
  highmem, kfence, oom-kill, madvise, kasan, userfaultfd, memcg, and
  zram), core-kernel, kconfig, fork, binfmt, MAINTAINERS, kbuild, and
  ia64"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (28 commits)
  zram: fix broken page writeback
  zram: fix return value on writeback_store
  mm/memcg: set memcg when splitting page
  mm/memcg: rename mem_cgroup_split_huge_fixup to split_page_memcg and add nr_pages argument
  ia64: fix ptrace(PTRACE_SYSCALL_INFO_EXIT) sign
  ia64: fix ia64_syscall_get_set_arguments() for break-based syscalls
  mm/userfaultfd: fix memory corruption due to writeprotect
  kasan: fix KASAN_STACK dependency for HW_TAGS
  kasan, mm: fix crash with HW_TAGS and DEBUG_PAGEALLOC
  mm/madvise: replace ptrace attach requirement for process_madvise
  include/linux/sched/mm.h: use rcu_dereference in in_vfork()
  kfence: fix reports if constant function prefixes exist
  kfence, slab: fix cache_alloc_debugcheck_after() for bulk allocations
  kfence: fix printk format for ptrdiff_t
  linux/compiler-clang.h: define HAVE_BUILTIN_BSWAP*
  MAINTAINERS: exclude uapi directories in API/ABI section
  binfmt_misc: fix possible deadlock in bm_register_write
  mm/highmem.c: fix zero_user_segments() with start > end
  hugetlb: do early cow when page pinned on src mm
  mm: use is_cow_mapping() across tree where proper
  ...
parents 88fe4924 2766f182
......@@ -261,8 +261,8 @@ ABI/API
L: linux-api@vger.kernel.org
F: include/linux/syscalls.h
F: kernel/sys_ni.c
F: include/uapi/
F: arch/*/include/uapi/
X: include/uapi/
X: arch/*/include/uapi/
ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
M: Hans de Goede <hdegoede@redhat.com>
......
......@@ -32,7 +32,7 @@ static inline void syscall_rollback(struct task_struct *task,
static inline long syscall_get_error(struct task_struct *task,
struct pt_regs *regs)
{
return regs->r10 == -1 ? regs->r8:0;
return regs->r10 == -1 ? -regs->r8:0;
}
static inline long syscall_get_return_value(struct task_struct *task,
......
......@@ -2013,27 +2013,39 @@ static void syscall_get_set_args_cb(struct unw_frame_info *info, void *data)
{
struct syscall_get_set_args *args = data;
struct pt_regs *pt = args->regs;
unsigned long *krbs, cfm, ndirty;
unsigned long *krbs, cfm, ndirty, nlocals, nouts;
int i, count;
if (unw_unwind_to_user(info) < 0)
return;
/*
* We get here via a few paths:
* - break instruction: cfm is shared with caller.
* syscall args are in out= regs, locals are non-empty.
* - epsinstruction: cfm is set by br.call
* locals don't exist.
*
* For both cases argguments are reachable in cfm.sof - cfm.sol.
* CFM: [ ... | sor: 17..14 | sol : 13..7 | sof : 6..0 ]
*/
cfm = pt->cr_ifs;
nlocals = (cfm >> 7) & 0x7f; /* aka sol */
nouts = (cfm & 0x7f) - nlocals; /* aka sof - sol */
krbs = (unsigned long *)info->task + IA64_RBS_OFFSET/8;
ndirty = ia64_rse_num_regs(krbs, krbs + (pt->loadrs >> 19));
count = 0;
if (in_syscall(pt))
count = min_t(int, args->n, cfm & 0x7f);
count = min_t(int, args->n, nouts);
/* Iterate over outs. */
for (i = 0; i < count; i++) {
int j = ndirty + nlocals + i + args->i;
if (args->rw)
*ia64_rse_skip_regs(krbs, ndirty + i + args->i) =
args->args[i];
*ia64_rse_skip_regs(krbs, j) = args->args[i];
else
args->args[i] = *ia64_rse_skip_regs(krbs,
ndirty + i + args->i);
args->args[i] = *ia64_rse_skip_regs(krbs, j);
}
if (!args->rw) {
......
......@@ -627,7 +627,7 @@ static ssize_t writeback_store(struct device *dev,
struct bio_vec bio_vec;
struct page *page;
ssize_t ret = len;
int mode;
int mode, err;
unsigned long blk_idx = 0;
if (sysfs_streq(buf, "idle"))
......@@ -638,8 +638,8 @@ static ssize_t writeback_store(struct device *dev,
if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
return -EINVAL;
ret = kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index);
if (ret || index >= nr_pages)
if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
index >= nr_pages)
return -EINVAL;
nr_pages = 1;
......@@ -663,7 +663,7 @@ static ssize_t writeback_store(struct device *dev,
goto release_init_lock;
}
while (nr_pages--) {
for (; nr_pages != 0; index++, nr_pages--) {
struct bio_vec bvec;
bvec.bv_page = page;
......@@ -728,12 +728,17 @@ static ssize_t writeback_store(struct device *dev,
* XXX: A single page IO would be inefficient for write
* but it would be not bad as starter.
*/
ret = submit_bio_wait(&bio);
if (ret) {
err = submit_bio_wait(&bio);
if (err) {
zram_slot_lock(zram, index);
zram_clear_flag(zram, index, ZRAM_UNDER_WB);
zram_clear_flag(zram, index, ZRAM_IDLE);
zram_slot_unlock(zram, index);
/*
* Return last IO error unless every IO were
* not suceeded.
*/
ret = err;
continue;
}
......
......@@ -500,8 +500,6 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
vm_fault_t ret;
pgoff_t fault_page_size;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool is_cow_mapping =
(vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
switch (pe_size) {
case PE_SIZE_PMD:
......@@ -518,7 +516,7 @@ vm_fault_t vmw_bo_vm_huge_fault(struct vm_fault *vmf,
}
/* Always do write dirty-tracking and COW on PTE level. */
if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping))
if (write && (READ_ONCE(vbo->dirty) || is_cow_mapping(vma->vm_flags)))
return VM_FAULT_FALLBACK;
ret = ttm_bo_vm_reserve(bo, vmf);
......
......@@ -49,7 +49,7 @@ int vmw_mmap(struct file *filp, struct vm_area_struct *vma)
vma->vm_ops = &vmw_vm_ops;
/* Use VM_PFNMAP rather than VM_MIXEDMAP if not a COW mapping */
if ((vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) != VM_MAYWRITE)
if (!is_cow_mapping(vma->vm_flags))
vma->vm_flags = (vma->vm_flags & ~VM_MIXEDMAP) | VM_PFNMAP;
return 0;
......
......@@ -649,12 +649,24 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
int err = 0;
struct file *f = NULL;
e = create_entry(buffer, count);
if (IS_ERR(e))
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
f = open_exec(e->interpreter);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
kfree(e);
return PTR_ERR(f);
}
e->interp_file = f;
}
inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
......@@ -678,21 +690,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
goto out2;
}
if (e->flags & MISC_FMT_OPEN_FILE) {
struct file *f;
f = open_exec(e->interpreter);
if (IS_ERR(f)) {
err = PTR_ERR(f);
pr_notice("register: failed to install interpreter file %s\n", e->interpreter);
simple_release_fs(&bm_mnt, &entry_count);
iput(inode);
inode = NULL;
goto out2;
}
e->interp_file = f;
}
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
......@@ -709,6 +706,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
inode_unlock(d_inode(root));
if (err) {
if (f)
filp_close(f, NULL);
kfree(e);
return err;
}
......
......@@ -1036,8 +1036,6 @@ struct clear_refs_private {
#ifdef CONFIG_MEM_SOFT_DIRTY
#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE)
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
struct page *page;
......
......@@ -31,6 +31,12 @@
#define __no_sanitize_thread
#endif
#if defined(CONFIG_ARCH_USE_BUILTIN_BSWAP)
#define __HAVE_BUILTIN_BSWAP32__
#define __HAVE_BUILTIN_BSWAP64__
#define __HAVE_BUILTIN_BSWAP16__
#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
#if __has_feature(undefined_behavior_sanitizer)
/* GCC does not have __SANITIZE_UNDEFINED__ */
#define __no_sanitize_undefined \
......
......@@ -460,7 +460,7 @@ static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
/*
* Set the allocation direction to bottom-up or top-down.
*/
static inline void memblock_set_bottom_up(bool enable)
static inline __init void memblock_set_bottom_up(bool enable)
{
memblock.bottom_up = enable;
}
......@@ -470,7 +470,7 @@ static inline void memblock_set_bottom_up(bool enable)
* if this is true, that said, memblock will allocate memory
* in bottom-up direction.
*/
static inline bool memblock_bottom_up(void)
static inline __init bool memblock_bottom_up(void)
{
return memblock.bottom_up;
}
......
......@@ -1061,9 +1061,7 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm,
rcu_read_unlock();
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void mem_cgroup_split_huge_fixup(struct page *head);
#endif
void split_page_memcg(struct page *head, unsigned int nr);
#else /* CONFIG_MEMCG */
......@@ -1400,7 +1398,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
return 0;
}
static inline void mem_cgroup_split_huge_fixup(struct page *head)
static inline void split_page_memcg(struct page *head, unsigned int nr)
{
}
......
......@@ -1300,6 +1300,27 @@ static inline bool page_maybe_dma_pinned(struct page *page)
GUP_PIN_COUNTING_BIAS;
}
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
* This should most likely only be called during fork() to see whether we
* should break the cow immediately for a page on the src mm.
*/
static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
struct page *page)
{
if (!is_cow_mapping(vma->vm_flags))
return false;
if (!atomic_read(&vma->vm_mm->has_pinned))
return false;
return page_maybe_dma_pinned(page);
}
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
......
......@@ -23,6 +23,7 @@
#endif
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
#define INIT_PASID 0
struct address_space;
struct mem_cgroup;
......
......@@ -140,7 +140,8 @@ static inline bool in_vfork(struct task_struct *tsk)
* another oom-unkillable task does this it should blame itself.
*/
rcu_read_lock();
ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
ret = tsk->vfork_done &&
rcu_dereference(tsk->real_parent)->mm == tsk->mm;
rcu_read_unlock();
return ret;
......
......@@ -128,7 +128,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
unsigned long flags;
......@@ -139,14 +139,15 @@ static inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
return ret;
}
static inline int stop_machine(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
static __always_inline int
stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
{
return stop_machine_cpuslocked(fn, data, cpus);
}
static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
static __always_inline int
stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
return stop_machine(fn, data, cpus);
}
......
......@@ -119,8 +119,7 @@ config INIT_ENV_ARG_LIMIT
config COMPILE_TEST
bool "Compile also drivers which will not load"
depends on !UML && !S390
default n
depends on HAS_IOMEM
help
Some drivers can be compiled on a different platform than they are
intended to be run on. Despite they cannot be loaded there (or even
......
......@@ -994,6 +994,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
static void mm_init_pasid(struct mm_struct *mm)
{
#ifdef CONFIG_IOMMU_SUPPORT
mm->pasid = INIT_PASID;
#endif
}
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
......@@ -1024,6 +1031,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
......
......@@ -156,6 +156,7 @@ config KASAN_STACK_ENABLE
config KASAN_STACK
int
depends on KASAN_GENERIC || KASAN_SW_TAGS
default 1 if KASAN_STACK_ENABLE || CC_IS_GCC
default 0
......
......@@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
if (start1 >= end1)
start1 = end1 = 0;
if (start2 >= end2)
start2 = end2 = 0;
for (i = 0; i < compound_nr(page); i++) {
void *kaddr = NULL;
if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
kaddr = kmap_atomic(page + i);
if (start1 >= PAGE_SIZE) {
start1 -= PAGE_SIZE;
end1 -= PAGE_SIZE;
} else {
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
if (end1 > start1)
if (end1 > start1) {
kaddr = kmap_atomic(page + i);
memset(kaddr + start1, 0, this_end - start1);
}
end1 -= this_end;
start1 = 0;
}
......@@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
} else {
unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
if (end2 > start2)
if (end2 > start2) {
if (!kaddr)
kaddr = kmap_atomic(page + i);
memset(kaddr + start2, 0, this_end - start2);
}
end2 -= this_end;
start2 = 0;
}
......
......@@ -1100,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
if (unlikely(is_cow_mapping(vma->vm_flags) &&
atomic_read(&src_mm->has_pinned) &&
page_maybe_dma_pinned(src_page))) {
if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
......@@ -1214,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/* Please refer to comments in copy_huge_pmd() */
if (unlikely(is_cow_mapping(vma->vm_flags) &&
atomic_read(&src_mm->has_pinned) &&
page_maybe_dma_pinned(pud_page(pud)))) {
if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pud(vma, src_pud, addr);
......@@ -2471,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int i;
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
......
......@@ -331,6 +331,24 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
}
}
static inline long
hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
long to, struct hstate *h, struct hugetlb_cgroup *cg,
long *regions_needed)
{
struct file_region *nrg;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(map, from, to);
record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(map, nrg);
} else
*regions_needed += 1;
return to - from;
}
/*
* Must be called with resv->lock held.
*
......@@ -346,7 +364,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
struct file_region *rg = NULL, *trg = NULL;
if (regions_needed)
*regions_needed = 0;
......@@ -369,24 +387,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* When we find a region that starts beyond our range, we've
* finished.
*/
if (rg->from > t)
if (rg->from >= t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
if (rg->from > last_accounted_offset)
add += hugetlb_resv_map_add(resv, rg,
last_accounted_offset,
rg->from, h, h_cg,
regions_needed);
last_accounted_offset = rg->to;
}
......@@ -394,17 +405,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
} else
*regions_needed += 1;
}
if (last_accounted_offset < t)
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
VM_BUG_ON(add < 0);
return add;
......@@ -3725,21 +3728,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
return false;
}
static void
hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
struct page *new_page)
{
__SetPageUptodate(new_page);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugepage_add_new_anon_rmap(new_page, vma, addr);
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
int cow;
bool cow = is_cow_mapping(vma->vm_flags);
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
......@@ -3784,6 +3798,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
again:
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
......@@ -3807,6 +3822,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
/*
* This is a rare case where we see pinned hugetlb
* pages while they're prone to COW. We need to do the
* COW earlier during fork.
*
* When pre-allocating the page or copying data, we
* need to be without the pgtable locks since we could
* sleep during the process.
*/
if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
pte_t src_pte_old = entry;
struct page *new;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
new = alloc_huge_page(vma, addr, 1);
if (IS_ERR(new)) {
put_page(ptepage);
ret = PTR_ERR(new);
break;
}
copy_user_huge_page(new, ptepage, addr, vma,
npages);
put_page(ptepage);
/* Install the new huge page if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
put_page(new);
/* dst_entry won't change as in child */
goto again;
}
hugetlb_install_page(vma, dst_pte, addr, new);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
}
if (cow) {
/*
* No need to notify as we are downgrading page
......@@ -3817,12 +3878,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
......
......@@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page)
*/
#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
/*
* These three helpers classifies VMAs for virtual memory accounting.
*/
......
......@@ -20,6 +20,11 @@
#include "kfence.h"
/* May be overridden by <asm/kfence.h>. */
#ifndef ARCH_FUNC_PREFIX
#define ARCH_FUNC_PREFIX ""
#endif
extern bool no_hash_pointers;
/* Helper function to either print to a seq_file or to console. */
......@@ -67,8 +72,9 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
for (skipnr = 0; skipnr < num_entries; skipnr++) {
int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
!strncmp(buf, "__slab_free", len)) {
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
!strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
/*
* In case of tail calls from any of the below
* to any of the above.
......@@ -77,10 +83,10 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
}
/* Also the *_bulk() variants by only checking prefixes. */
if (str_has_prefix(buf, "kfree") ||
str_has_prefix(buf, "kmem_cache_free") ||
str_has_prefix(buf, "__kmalloc") ||
str_has_prefix(buf, "kmem_cache_alloc"))
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
}
if (fallback < num_entries)
......@@ -116,12 +122,12 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met
lockdep_assert_held(&meta->lock);
if (meta->state == KFENCE_OBJECT_UNUSED) {
seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
return;
}
seq_con_printf(seq,
"kfence-#%zd [0x%p-0x%p"
"kfence-#%td [0x%p-0x%p"
", size=%d, cache=%s] allocated by task %d:\n",
meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
(cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
......@@ -204,7 +210,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%zd):\n",
pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
get_access_type(is_write), (void *)address,
left_of_object ? meta->addr - address : address - meta->addr,
left_of_object ? "left" : "right", object_index);
......@@ -213,14 +219,14 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
case KFENCE_ERROR_UAF:
pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
(void *)stack_entries[skipnr]);
pr_err("Use-after-free %s at 0x%p (in kfence-#%zd):\n",
pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
get_access_type(is_write), (void *)address, object_index);
break;
case KFENCE_ERROR_CORRUPTION:
pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
pr_err("Corrupted memory at 0x%p ", (void *)address);
print_diff_canary(address, 16, meta);
pr_cont(" (in kfence-#%zd):\n", object_index);
pr_cont(" (in kfence-#%td):\n", object_index);
break;
case KFENCE_ERROR_INVALID:
pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
......@@ -230,7 +236,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r
break;
case KFENCE_ERROR_INVALID_FREE:
pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
pr_err("Invalid free of 0x%p (in kfence-#%zd):\n", (void *)address,
pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
object_index);
break;
}
......
......@@ -1198,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
/*
* Require CAP_SYS_NICE for influencing process performance. Note that
* only non-destructive hints are currently supported.
*/
if (!capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
......@@ -1218,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
release_mm:
mmput(mm);
release_task:
put_task_struct(task);
......
......@@ -3287,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Because page_memcg(head) is not set on compound tails, set it now.
* Because page_memcg(head) is not set on tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
void split_page_memcg(struct page *head, unsigned int nr)
{
struct mem_cgroup *memcg = page_memcg(head);
int i;
if (mem_cgroup_disabled())
if (mem_cgroup_disabled() || !memcg)
return;
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
head[i].memcg_data = (unsigned long)memcg;
}
for (i = 1; i < nr; i++)
head[i].memcg_data = head->memcg_data;
css_get_many(&memcg->css, nr - 1);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
/**
......
......@@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
......@@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
if (likely(!page_needs_cow_for_dma(src_vma, page)))
return 1;
new_page = *prealloc;
......@@ -3103,6 +3097,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_WP);
}
/*
* Userfaultfd write-protect can defer flushes. Ensure the TLB
* is flushed in this case before copying.
*/
if (unlikely(userfaultfd_wp(vmf->vma) &&
mm_tlb_flush_pending(vmf->vma->vm_mm)))
flush_tlb_page(vmf->vma, vmf->address);
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
......
......@@ -1281,6 +1281,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
kernel_poison_pages(page, 1 << order);
/*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
kasan_free_nondeferred_pages(page, order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
......@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_pagealloc_unmap_pages(page, 1 << order);
kasan_free_nondeferred_pages(page, order);
return true;
}
......@@ -3310,6 +3314,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, 1 << order);
split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
......@@ -6259,12 +6264,65 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Only struct pages that correspond to ranges defined by memblock.memory
* are zeroed and initialized by going through __init_single_page() during
* memmap_init_zone().
*
* But, there could be struct pages that correspond to holes in
* memblock.memory. This can happen because of the following reasons:
* - physical memory bank size is not necessarily the exact multiple of the
* arbitrary section size
* - early reserved memory may not be listed in memblock.memory
* - memory layouts defined with memmap= kernel parameter may not align
* nicely with memmap sections
*
* Explicitly initialize those struct pages so that:
* - PG_Reserved is set
* - zone and node links point to zone and node that span the page if the
* hole is in the middle of a zone
* - zone and node links point to adjacent zone/node if the hole falls on
* the zone boundary; the pages in such holes will be prepended to the
* zone/node above the hole except for the trailing pages in the last
* section that will be appended to the zone/node below.
*/
static u64 __meminit init_unavailable_range(unsigned long spfn,
unsigned long epfn,
int zone, int node)
{
unsigned long pfn;
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ pageblock_nr_pages - 1;
continue;
}
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
return pgcnt;
}
#else
static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
int zone, int node)
{
return 0;
}
#endif
void __meminit __weak memmap_init_zone(struct zone *zone)
{
unsigned long zone_start_pfn = zone->zone_start_pfn;
unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
static unsigned long hole_pfn;
unsigned long start_pfn, end_pfn;
u64 pgcnt = 0;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
......@@ -6274,7 +6332,29 @@ void __meminit __weak memmap_init_zone(struct zone *zone)
memmap_init_range(end_pfn - start_pfn, nid,
zone_id, start_pfn, zone_end_pfn,
MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
if (hole_pfn < start_pfn)
pgcnt += init_unavailable_range(hole_pfn, start_pfn,
zone_id, nid);
hole_pfn = end_pfn;
}
#ifdef CONFIG_SPARSEMEM
/*
* Initialize the hole in the range [zone_end_pfn, section_end].
* If zone boundary falls in the middle of a section, this hole
* will be re-initialized during the call to this function for the
* higher zone.
*/
end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
if (hole_pfn < end_pfn)
pgcnt += init_unavailable_range(hole_pfn, end_pfn,
zone_id, nid);
#endif
if (pgcnt)
pr_info(" %s zone: %llu pages in unavailable ranges\n",
zone->name, pgcnt);
}
static int zone_batchsize(struct zone *zone)
......@@ -7071,88 +7151,6 @@ void __init free_area_init_memoryless_node(int nid)
free_area_init_node(nid);
}
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Initialize all valid struct pages in the range [spfn, epfn) and mark them
* PageReserved(). Return the number of struct pages that were initialized.
*/
static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ pageblock_nr_pages - 1;
continue;
}
/*
* Use a fake node/zone (0) for now. Some of these pages
* (in memblock.reserved but not in memblock.memory) will
* get re-initialized via reserve_bootmem_region() later.
*/
__init_single_page(pfn_to_page(pfn), pfn, 0, 0);
__SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
return pgcnt;
}
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
* layout is manually configured via memmap=, or when the highest physical
* address (max_pfn) does not end on a section boundary.
*/
static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
phys_addr_t next = 0;
/*
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
next = end;
}
/*
* Early sections always have a fully populated memmap for the whole
* section - see pfn_valid(). If the last section has holes at the
* end and that section is marked "online", the memmap will be
* considered initialized. Make sure that memmap has a well defined
* state.
*/
pgcnt += init_unavailable_range(PFN_DOWN(next),
round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
*/
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
#else
static inline void __init init_unavailable_mem(void)
{
}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
......@@ -7576,7 +7574,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);
......
......@@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
if (!objp)
if (!objp || is_kfence_address(objp))
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment