Commit 995d03ae authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "15 fixes"

[ This does not merge the "fortify: use WARN instead of BUG for now"
  patch, which needs a bit of extra work to build cleanly with all
  configurations. Arnd is on it.   - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  ocfs2: don't clear SGID when inheriting ACLs
  mm: allow page_cache_get_speculative in interrupt context
  userfaultfd: non-cooperative: flush event_wqh at release time
  ipc: add missing container_of()s for randstruct
  cpuset: fix a deadlock due to incomplete patching of cpusets_enabled()
  userfaultfd_zeropage: return -ENOSPC in case mm has gone
  mm: take memory hotplug lock within numa_zonelist_order_handler()
  mm/page_io.c: fix oops during block io poll in swapin path
  zram: do not free pool->size_class
  kthread: fix documentation build warning
  kasan: avoid -Wmaybe-uninitialized warning
  userfaultfd: non-cooperative: notify about unmap of destination during mremap
  mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries
  pid: kill pidhash_size in pidhash_init()
  mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors
parents 8d3fe85f 19ec8e48
......@@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle,
switch (type) {
case ACL_TYPE_ACCESS:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl) {
umode_t mode;
ret = posix_acl_update_mode(inode, &mode, &acl);
if (ret)
return ret;
ret = ocfs2_acl_set_mode(inode, di_bh,
handle, mode);
if (ret)
return ret;
}
break;
case ACL_TYPE_DEFAULT:
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
......@@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
if (had_lock < 0)
return had_lock;
if (type == ACL_TYPE_ACCESS && acl) {
umode_t mode;
status = posix_acl_update_mode(inode, &mode, &acl);
if (status)
goto unlock;
status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
if (status)
goto unlock;
}
status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
unlock:
ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
brelse(bh);
return status;
......
......@@ -854,6 +854,9 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
/* Flush pending events that may still wait on event_wqh */
wake_up_all(&ctx->event_wqh);
wake_up_poll(&ctx->fd_wqh, POLLHUP);
userfaultfd_ctx_put(ctx);
return 0;
......@@ -1643,6 +1646,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ENOSPC;
}
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
return -EFAULT;
......
......@@ -18,6 +18,19 @@
#ifdef CONFIG_CPUSETS
/*
* Static branch rewrites can happen in an arbitrary order for a given
* key. In code paths where we need to loop with read_mems_allowed_begin() and
* read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
* to ensure that begin() always gets rewritten before retry() in the
* disabled -> enabled transition. If not, then if local irqs are disabled
* around the loop, we can deadlock since retry() would always be
* comparing the latest value of the mems_allowed seqcount against 0 as
* begin() still would see cpusets_enabled() as false. The enabled -> disabled
* transition should happen in reverse order for the same reasons (want to stop
* looking at real value of mems_allowed.sequence in retry() first).
*/
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
static inline bool cpusets_enabled(void)
{
......@@ -32,12 +45,14 @@ static inline int nr_cpusets(void)
static inline void cpuset_inc(void)
{
static_branch_inc(&cpusets_pre_enable_key);
static_branch_inc(&cpusets_enabled_key);
}
static inline void cpuset_dec(void)
{
static_branch_dec(&cpusets_enabled_key);
static_branch_dec(&cpusets_pre_enable_key);
}
extern int cpuset_init(void);
......@@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void);
*/
static inline unsigned int read_mems_allowed_begin(void)
{
if (!cpusets_enabled())
if (!static_branch_unlikely(&cpusets_pre_enable_key))
return 0;
return read_seqcount_begin(&current->mems_allowed_seq);
......@@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void)
*/
static inline bool read_mems_allowed_retry(unsigned int seq)
{
if (!cpusets_enabled())
if (!static_branch_unlikely(&cpusets_enabled_key))
return false;
return read_seqcount_retry(&current->mems_allowed_seq, seq);
......
......@@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
* @threadfn: the function to run in the thread
* @data: data pointer for @threadfn()
* @namefmt: printf-style format string for the thread name
* @...: arguments for @namefmt.
* @arg...: arguments for @namefmt.
*
* This macro will create a kthread on the current node, leaving it in
* the stopped state. This is just a helper for kthread_create_on_node();
......
......@@ -494,6 +494,10 @@ struct mm_struct {
* PROT_NONE or PROT_NUMA mapped page.
*/
bool tlb_flush_pending;
#endif
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/* See flush_tlb_batched_pending() */
bool tlb_flush_batched;
#endif
struct uprobes_state uprobes_state;
#ifdef CONFIG_HUGETLB_PAGE
......
......@@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold);
*/
static inline int page_cache_get_speculative(struct page *page)
{
VM_BUG_ON(in_interrupt());
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic() && !irqs_disabled());
......
......@@ -1034,7 +1034,8 @@ void msg_exit_ns(struct ipc_namespace *ns)
static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
struct msg_queue *msq = it;
struct kern_ipc_perm *ipcp = it;
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
seq_printf(s,
"%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
......
......@@ -2179,7 +2179,8 @@ void exit_sem(struct task_struct *tsk)
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
struct sem_array *sma = it;
struct kern_ipc_perm *ipcp = it;
struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
time_t sem_otime;
/*
......
......@@ -1380,9 +1380,11 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
{
struct user_namespace *user_ns = seq_user_ns(s);
struct shmid_kernel *shp = it;
struct kern_ipc_perm *ipcp = it;
struct shmid_kernel *shp;
unsigned long rss = 0, swp = 0;
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
shm_add_rss_swap(shp, &rss, &swp);
#if BITS_PER_LONG <= 32
......
......@@ -63,6 +63,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
......
......@@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
unsigned int pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
pidhash_size = 1U << pidhash_shift;
}
void __init pidmap_init(void)
......
......@@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
int err = -EFAULT;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
......@@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, flags);
if (err)
return err;
err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
}
......@@ -4213,7 +4210,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
*position = vaddr;
return i ? i : -EFAULT;
return i ? i : err;
}
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
......
......@@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
static inline void try_to_unmap_flush(void)
{
......@@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
static inline void try_to_unmap_flush_dirty(void)
{
}
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
......
......@@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
disable_trace_on_warning();
info.access_addr = (void *)addr;
info.first_bad_addr = (void *)addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
......
......@@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
for (; addr != end; pte++, addr += PAGE_SIZE) {
ptent = *pte;
......
......@@ -1197,6 +1197,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
......
......@@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
atomic_read(&vma->vm_mm->mm_users) == 1)
target_node = numa_node_id();
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
......
......@@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_ptl = pte_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
flush_tlb_batched_pending(vma->vm_mm);
arch_enter_lazy_mmu_mode();
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
......@@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
struct mm_struct *mm = current->mm;
......@@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
ret = do_munmap(mm, new_addr, new_len, NULL);
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
......@@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long charged = 0;
bool locked = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
......@@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
&locked, &uf, &uf_unmap);
&locked, &uf, &uf_unmap_early, &uf_unmap);
goto out;
}
......@@ -621,6 +624,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
up_write(&current->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
......
......@@ -4891,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mem_hotplug_begin();
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
mem_hotplug_done();
}
}
out:
......
......@@ -22,6 +22,7 @@
#include <linux/frontswap.h>
#include <linux/blkdev.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
......@@ -136,6 +137,7 @@ static void end_swap_bio_read(struct bio *bio)
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
wake_up_process(waiter);
put_task_struct(waiter);
}
int generic_swapfile_activate(struct swap_info_struct *sis,
......@@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
goto out;
}
bdev = bio->bi_bdev;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
count_vm_event(PSWPIN);
......
......@@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
/*
* Ensure compiler does not re-order the setting of tlb_flush_batched
* before the PTE is cleared.
*/
barrier();
mm->tlb_flush_batched = true;
/*
* If the PTE was dirty then it's best to assume it's writable. The
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
......@@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
return should_defer;
}
/*
* Reclaim unmaps pages under the PTL but do not flush the TLB prior to
* releasing the PTL if TLB flushes are batched. It's possible for a parallel
* operation such as mprotect or munmap to race between reclaim unmapping
* the page and flushing the page. If this race occurs, it potentially allows
* access to data via a stale TLB entry. Tracking all mm's that have TLB
* batching in flight would be expensive during reclaim so instead track
* whether TLB batching occurred in the past and if so then do a flush here
* if required. This will cost one additional flush per reclaim cycle paid
* by the first operation at risk such as mprotect and mumap.
*
* This must be called under the PTL so that an access to tlb_flush_batched
* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
* via the PTL.
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
if (mm->tlb_flush_batched) {
flush_tlb_mm(mm);
/*
* Do not allow the compiler to re-order the clearing of
* tlb_flush_batched before the tlb is flushed.
*/
barrier();
mm->tlb_flush_batched = false;
}
}
#else
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
......
......@@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
}
destroy_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment