Commit a602285a authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'per_signal_struct_coredumps-for-v5.16' of...

Merge branch 'per_signal_struct_coredumps-for-v5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull per signal_struct coredumps from Eric Biederman:
 "Current coredumps are mixed up with the exit code, the signal handling
  code, and the ptrace code making coredumps much more complicated than
  necessary and difficult to follow.

  This series of changes starts with ptrace_stop and cleans it up,
  making it easier to follow what is happening in ptrace_stop. Then
  cleans up the exec interactions with coredumps. Then cleans up the
  coredump interactions with exit. Finally the coredump interactions
  with the signal handling code is cleaned up.

  The first and last changes are bug fixes for minor bugs.

  I believe the fact that vfork followed by execve can kill the process
  the called vfork if exec fails is sufficient justification to change
  the userspace visible behavior.

  In previous discussions some of these changes were organized
  differently and individually appeared to make the code base worse. As
  currently written I believe they all stand on their own as cleanups
  and bug fixes.

  Which means that even if the worst should happen and the last change
  needs to be reverted for some unimaginable reason, the code base will
  still be improved.

  If the worst does not happen there are a more cleanups that can be
  made. Signals that generate coredumps can easily become eligible for
  short circuit delivery in complete_signal. The entire rendezvous for
  generating a coredump can move into get_signal. The function
  force_sig_info_to_task be written in a way that does not modify the
  signal handling state of the target task (because coredumps are
  eligible for short circuit delivery). Many of these future cleanups
  can be done another way but nothing so cleanly as if coredumps become
  per signal_struct"

* 'per_signal_struct_coredumps-for-v5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  coredump: Limit coredumps to a single thread group
  coredump:  Don't perform any cleanups before dumping core
  exit: Factor coredump_exit_mm out of exit_mm
  exec: Check for a pending fatal signal instead of core_state
  ptrace: Remove the unnecessary arguments from arch_ptrace_stop
  signal: Remove the bogus sigkill_pending in ptrace_stop
parents 5c4e0a21 3f66f86b
......@@ -134,9 +134,9 @@ static inline long regs_return_value(struct pt_regs *regs)
extern void ia64_decrement_ip (struct pt_regs *pt);
extern void ia64_ptrace_stop(void);
#define arch_ptrace_stop(code, info) \
#define arch_ptrace_stop() \
ia64_ptrace_stop()
#define arch_ptrace_stop_needed(code, info) \
#define arch_ptrace_stop_needed() \
(!test_thread_flag(TIF_RESTORE_RSE))
extern void ptrace_attach_sync_user_rbs (struct task_struct *);
......
......@@ -26,12 +26,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
return (regs->tstate &= ~TSTATE_SYSCALL);
}
#define arch_ptrace_stop_needed(exit_code, info) \
#define arch_ptrace_stop_needed() \
({ flush_user_windows(); \
get_thread_wsaved() != 0; \
})
#define arch_ptrace_stop(exit_code, info) \
#define arch_ptrace_stop() \
synchronize_user_stack()
#define current_pt_regs() \
......@@ -129,12 +129,12 @@ static inline bool pt_regs_clear_syscall(struct pt_regs *regs)
return (regs->psr &= ~PSR_SYSCALL);
}
#define arch_ptrace_stop_needed(exit_code, info) \
#define arch_ptrace_stop_needed() \
({ flush_user_windows(); \
current_thread_info()->w_saved != 0; \
})
#define arch_ptrace_stop(exit_code, info) \
#define arch_ptrace_stop() \
synchronize_user_stack()
#define current_pt_regs() \
......
......@@ -1834,7 +1834,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
......@@ -2024,7 +2024,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
if (!elf_note_info_init(info))
return 0;
for (ct = current->mm->core_state->dumper.next;
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
ets = kzalloc(sizeof(*ets), GFP_KERNEL);
if (!ets)
......
......@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
goto end_coredump;
for (ct = current->mm->core_state->dumper.next;
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
ct->task, &thread_status_size);
......
......@@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
if (t != current && !(t->flags & PF_POSTCOREDUMP)) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
......@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
return nr;
}
static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
static int zap_threads(struct task_struct *tsk,
struct core_state *core_state, int exit_code)
{
struct task_struct *g, *p;
unsigned long flags;
int nr = -EAGAIN;
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
tsk->signal->core_state = core_state;
tsk->signal->group_exit_task = tsk;
nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
tsk->flags |= PF_DUMPCORE;
atomic_set(&core_state->nr_threads, nr);
}
spin_unlock_irq(&tsk->sighand->siglock);
if (unlikely(nr < 0))
return nr;
tsk->flags |= PF_DUMPCORE;
if (atomic_read(&mm->mm_users) == nr + 1)
goto done;
/*
* We should find and kill all tasks which use this mm, and we should
* count them correctly into ->nr_threads. We don't take tasklist
* lock, but this is safe wrt:
*
* fork:
* None of sub-threads can fork after zap_process(leader). All
* processes which were created before this point should be
* visible to zap_threads() because copy_process() adds the new
* process to the tail of init_task.tasks list, and lock/unlock
* of ->siglock provides a memory barrier.
*
* do_exit:
* The caller holds mm->mmap_lock. This means that the task which
* uses this mm can't pass exit_mm(), so it can't exit or clear
* its ->mm.
*
* de_thread:
* It does list_replace_rcu(&leader->tasks, &current->tasks),
* we must see either old or new leader, this does not matter.
* However, it can change p->sighand, so lock_task_sighand(p)
* must be used. Since p->mm != NULL and we hold ->mmap_lock
* it can't fail.
*
* Note also that "g" can be the old leader with ->mm == NULL
* and already unhashed and thus removed from ->thread_group.
* This is OK, __unhash_process()->list_del_rcu() does not
* clear the ->next pointer, we will find the new leader via
* next_thread().
*/
rcu_read_lock();
for_each_process(g) {
if (g == tsk->group_leader)
continue;
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, p) {
if (unlikely(!p->mm))
continue;
if (unlikely(p->mm == mm)) {
lock_task_sighand(p, &flags);
nr += zap_process(p, exit_code,
SIGNAL_GROUP_EXIT);
unlock_task_sighand(p, &flags);
}
break;
}
}
rcu_read_unlock();
done:
atomic_set(&core_state->nr_threads, nr);
return nr;
}
static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
int core_waiters = -EBUSY;
init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;
if (mmap_write_lock_killable(mm))
return -EINTR;
if (!mm->core_state)
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
mmap_write_unlock(mm);
core_waiters = zap_threads(tsk, core_state, exit_code);
if (core_waiters > 0) {
struct core_thread *ptr;
......@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}
static void coredump_finish(struct mm_struct *mm, bool core_dumped)
static void coredump_finish(bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
......@@ -493,22 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
current->signal->group_exit_code |= 0x80;
current->signal->group_exit_task = NULL;
current->signal->flags = SIGNAL_GROUP_EXIT;
next = current->signal->core_state->dumper.next;
current->signal->core_state = NULL;
spin_unlock_irq(&current->sighand->siglock);
next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
task = curr->task;
/*
* see exit_mm(), curr->task must not see
* see coredump_task_exit(), curr->task must not see
* ->task == NULL before we read ->next.
*/
smp_mb();
curr->task = NULL;
wake_up_process(task);
}
mm->core_state = NULL;
}
static bool dump_interrupted(void)
......@@ -839,7 +773,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
fail_unlock:
kfree(argv);
kfree(cn.corename);
coredump_finish(mm, core_dumped);
coredump_finish(core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
......
......@@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm)
if (old_mm) {
/*
* Make sure that if there is a core dump in progress
* for the old mm, we get out and die instead of going
* through with the exec. We must hold mmap_lock around
* checking core_state and changing tsk->mm.
* If there is a pending fatal signal perhaps a signal
* whose default action is to create a coredump get
* out and die instead of going through with the exec.
*/
mmap_read_lock(old_mm);
if (unlikely(old_mm->core_state)) {
mmap_read_unlock(old_mm);
ret = mmap_read_lock_killable(old_mm);
if (ret) {
up_write(&tsk->signal->exec_update_lock);
return -EINTR;
return ret;
}
}
......
......@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_mask));
}
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}
......@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
if (mm) {
task_mem(m, mm);
task_core_dumping(m, mm);
task_core_dumping(m, task);
task_thp_status(m, mm);
mmput(mm);
}
......
......@@ -454,17 +454,6 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
struct core_thread {
struct task_struct *task;
struct core_thread *next;
};
struct core_state {
atomic_t nr_threads;
struct core_thread dumper;
struct completion startup;
};
struct kioctx_table;
struct mm_struct {
struct {
......@@ -585,8 +574,6 @@ struct mm_struct {
unsigned long flags; /* Must use atomic bitops to access */
struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
......
......@@ -362,29 +362,25 @@ static inline void user_single_step_report(struct pt_regs *regs)
#ifndef arch_ptrace_stop_needed
/**
* arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called
* @code: current->exit_code value ptrace will stop with
* @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with
*
* This is called with the siglock held, to decide whether or not it's
* necessary to release the siglock and call arch_ptrace_stop() with the
* same @code and @info arguments. It can be defined to a constant if
* arch_ptrace_stop() is never required, or always is. On machines where
* this makes sense, it should be defined to a quick test to optimize out
* calling arch_ptrace_stop() when it would be superfluous. For example,
* if the thread has not been back to user mode since the last stop, the
* thread state might indicate that nothing needs to be done.
* necessary to release the siglock and call arch_ptrace_stop(). It can be
* defined to a constant if arch_ptrace_stop() is never required, or always
* is. On machines where this makes sense, it should be defined to a quick
* test to optimize out calling arch_ptrace_stop() when it would be
* superfluous. For example, if the thread has not been back to user mode
* since the last stop, the thread state might indicate that nothing needs
* to be done.
*
* This is guaranteed to be invoked once before a task stops for ptrace and
* may include arch-specific operations necessary prior to a ptrace stop.
*/
#define arch_ptrace_stop_needed(code, info) (0)
#define arch_ptrace_stop_needed() (0)
#endif
#ifndef arch_ptrace_stop
/**
* arch_ptrace_stop - Do machine-specific work before stopping for ptrace
* @code: current->exit_code value ptrace will stop with
* @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with
*
* This is called with no locks held when arch_ptrace_stop_needed() has
* just returned nonzero. It is allowed to block, e.g. for user memory
......@@ -394,7 +390,7 @@ static inline void user_single_step_report(struct pt_regs *regs)
* we only do it when the arch requires it for this particular stop, as
* indicated by arch_ptrace_stop_needed().
*/
#define arch_ptrace_stop(code, info) do { } while (0)
#define arch_ptrace_stop() do { } while (0)
#endif
#ifndef current_pt_regs
......
......@@ -1661,6 +1661,7 @@ extern struct pid *cad_pid;
#define PF_VCPU 0x00000001 /* I'm a virtual CPU */
#define PF_IDLE 0x00000002 /* I am an IDLE thread */
#define PF_EXITING 0x00000004 /* Getting shut down */
#define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */
#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */
......
......@@ -72,6 +72,17 @@ struct multiprocess_signals {
struct hlist_node node;
};
struct core_thread {
struct task_struct *task;
struct core_thread *next;
};
struct core_state {
atomic_t nr_threads;
struct core_thread dumper;
struct completion startup;
};
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
......@@ -110,6 +121,8 @@ struct signal_struct {
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
struct core_state *core_state; /* coredumping support */
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
......
......@@ -340,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
static void coredump_task_exit(struct task_struct *tsk)
{
struct core_state *core_state;
/*
* Serialize with any possible pending coredump.
* We must hold siglock around checking core_state
* and setting PF_POSTCOREDUMP. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group without PF_POSTCOREDUMP set.
*/
spin_lock_irq(&tsk->sighand->siglock);
tsk->flags |= PF_POSTCOREDUMP;
core_state = tsk->signal->core_state;
spin_unlock_irq(&tsk->sighand->siglock);
if (core_state) {
struct core_thread self;
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
}
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
......@@ -435,47 +475,12 @@ void mm_update_next_owner(struct mm_struct *mm)
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_lock around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
mmap_read_lock(mm);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
mmap_read_unlock(mm);
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
......@@ -763,6 +768,7 @@ void __noreturn do_exit(long code)
profile_task_exit(tsk);
kcov_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
......
......@@ -1043,7 +1043,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
......@@ -1391,8 +1390,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
......
......@@ -2145,40 +2145,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
static inline bool may_ptrace_stop(void)
{
if (!likely(current->ptrace))
return false;
/*
* Are we in the middle of do_coredump?
* If so and our tracer is also part of the coredump stopping
* is a deadlock situation, and pointless because our tracer
* is dead so don't allow us to stop.
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
*
* This is almost outdated, a task with the pending SIGKILL can't
* block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
* after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
return false;
return true;
}
/*
* Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
static bool sigkill_pending(struct task_struct *tsk)
{
return sigismember(&tsk->pending.signal, SIGKILL) ||
sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
}
/*
* This must be called with current->sighand->siglock held.
*
......@@ -2196,7 +2162,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
{
bool gstop_done = false;
if (arch_ptrace_stop_needed(exit_code, info)) {
if (arch_ptrace_stop_needed()) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
......@@ -2204,17 +2170,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
* Meanwhile, a SIGKILL could come in before we retake the
* siglock. That must prevent us from sleeping in TASK_TRACED.
* So after regaining the lock, we must check for SIGKILL.
*/
spin_unlock_irq(&current->sighand->siglock);
arch_ptrace_stop(exit_code, info);
arch_ptrace_stop();
spin_lock_irq(&current->sighand->siglock);
if (sigkill_pending(current))
return;
}
/*
* schedule() will not sleep if there is a pending signal that
* can awaken the task.
*/
set_special_state(TASK_TRACED);
/*
......@@ -2260,7 +2225,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
if (likely(current->ptrace)) {
/*
* Notify parents of the stop.
*
......
......@@ -216,7 +216,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
"binfmt %px flags %lx core_state %px\n"
"binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
......@@ -248,7 +248,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm->binfmt, mm->flags, mm->core_state,
mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
......
......@@ -787,9 +787,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
struct signal_struct *sig = task->signal;
/*
* A coredumping process may sleep for an extended period in exit_mm(),
* so the oom killer cannot assume that the process will promptly exit
* and release memory.
* A coredumping process may sleep for an extended period in
* coredump_task_exit(), so the oom killer cannot assume that
* the process will promptly exit and release memory.
*/
if (sig->flags & SIGNAL_GROUP_COREDUMP)
return false;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment