Commit 4b871ce2 authored by Eric W. Biederman's avatar Eric W. Biederman

Merged 'Infrastructure to allow fixing exec deadlocks' from Bernd Edlinger

This is an infrastructure change that makes way for fixing this issue.
Each patch was already posted previously so this is just a cleanup of
the original mailing list thread(s) which got out of control by now.

Everything started here:
https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/

I added reviewed-by tags from the mailing list threads, except when
withdrawn.

It took a lot longer than expected to collect everything from the
mailinglist threads, since several commit messages have been infected
with typos, and they got fixed without a new patch version.

- Correct the point of no return.
- Add two new mutexes to replace cred_guard_mutex.
- Fix each use of cred_guard_mutex.
- Update documentation.
- Add a test case.

-- EWB Removed the last 2 patches they need more work

Bernd Edlinger (9):
      exec: Fix a deadlock in strace
      selftests/ptrace: add test cases for dead-locks
      mm: docs: Fix a comment in process_vm_rw_core
      kernel: doc: remove outdated comment cred.c
      kernel/kcmp.c: Use new infrastructure to fix deadlocks in execve
      proc: Use new infrastructure to fix deadlocks in execve
      proc: io_accounting: Use new infrastructure to fix deadlocks in execve
      perf: Use new infrastructure to fix deadlocks in execve
      pidfd: Use new infrastructure to fix deadlocks in execve

Eric W. Biederman (5):
      exec: Only compute current once in flush_old_exec
      exec: Factor unshare_sighand out of de_thread and call it separately
      exec: Move cleanup of posix timers on exec out of de_thread
      exec: Move exec_mmap right after de_thread in flush_old_exec
      exec: Add exec_update_mutex to replace cred_guard_mutex

 fs/exec.c                                 | 78 +++++++++++++++++++---------
 fs/proc/base.c                            | 10 ++--
 include/linux/binfmts.h                   |  8 ++-
 include/linux/sched/signal.h              |  9 +++-
 init/init_task.c                          |  1 +
 kernel/cred.c                             |  2 -
 kernel/events/core.c                      | 12 ++---
 kernel/fork.c                             |  5 +-
 kernel/kcmp.c                             |  8 +--
 kernel/pid.c                              |  4 +-
 mm/process_vm_access.c                    |  2 +-
 tools/testing/selftests/ptrace/Makefile   |  4 +-
 tools/testing/selftests/ptrace/vmaccess.c | 86 +++++++++++++++++++++++++++++++
 13 files changed, 179 insertions(+), 50 deletions(-)
Signed-off-by: default avatarBernd Edlinger <bernd.edlinger@hotmail.de>
Signed-off-by: default avatar"Eric W. Biederman" <ebiederm@xmission.com>
parents a0d4a141 501f9328
......@@ -1010,16 +1010,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
}
EXPORT_SYMBOL(read_code);
/*
* Maps the mm_struct mm into the current task struct.
* On success, this function returns with the mutex
* exec_update_mutex locked.
*/
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
struct mm_struct *old_mm, *active_mm;
int ret;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
old_mm = current->mm;
exec_mm_release(tsk, old_mm);
ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
if (ret)
return ret;
if (old_mm) {
sync_mm_rss(old_mm);
/*
......@@ -1031,9 +1041,11 @@ static int exec_mmap(struct mm_struct *mm)
down_read(&old_mm->mmap_sem);
if (unlikely(old_mm->core_state)) {
up_read(&old_mm->mmap_sem);
mutex_unlock(&tsk->signal->exec_update_mutex);
return -EINTR;
}
}
task_lock(tsk);
active_mm = tsk->active_mm;
membarrier_exec_mmap(mm);
......@@ -1189,10 +1201,22 @@ static int de_thread(struct task_struct *tsk)
/* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
#ifdef CONFIG_POSIX_TIMERS
exit_itimers(sig);
flush_itimer_signals();
#endif
BUG_ON(!thread_group_leader(tsk));
return 0;
killed:
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exit_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock);
return -EAGAIN;
}
static int unshare_sighand(struct task_struct *me)
{
struct sighand_struct *oldsighand = me->sighand;
if (refcount_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
......@@ -1210,23 +1234,13 @@ static int de_thread(struct task_struct *tsk)
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
rcu_assign_pointer(tsk->sighand, newsighand);
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
__cleanup_sighand(oldsighand);
}
BUG_ON(!thread_group_leader(tsk));
return 0;
killed:
/* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exit_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock);
return -EAGAIN;
}
char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk)
......@@ -1260,13 +1274,13 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
*/
int flush_old_exec(struct linux_binprm * bprm)
{
struct task_struct *me = current;
int retval;
/*
* Make sure we have a private signal table and that
* we are unassociated from the previous thread group.
* Make this the only thread in the thread group.
*/
retval = de_thread(current);
retval = de_thread(me);
if (retval)
goto out;
......@@ -1286,18 +1300,31 @@ int flush_old_exec(struct linux_binprm * bprm)
goto out;
/*
* After clearing bprm->mm (to mark that current is using the
* prepared mm now), we have nothing left of the original
* After setting bprm->called_exec_mmap (to mark that current is
* using the prepared mm now), we have nothing left of the original
* process. If anything from here on returns an error, the check
* in search_binary_handler() will SEGV current.
*/
bprm->called_exec_mmap = 1;
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
exit_itimers(me->signal);
flush_itimer_signals();
#endif
/*
* Make the signal table private.
*/
retval = unshare_sighand(me);
if (retval)
goto out;
set_fs(USER_DS);
current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
current->personality &= ~bprm->per_clear;
me->personality &= ~bprm->per_clear;
/*
* We have to apply CLOEXEC before we change whether the process is
......@@ -1305,7 +1332,7 @@ int flush_old_exec(struct linux_binprm * bprm)
* trying to access the should-be-closed file descriptors of a process
* undergoing exec(2).
*/
do_close_on_exec(current->files);
do_close_on_exec(me->files);
return 0;
out:
......@@ -1424,6 +1451,8 @@ static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
if (bprm->called_exec_mmap)
mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
......@@ -1473,6 +1502,7 @@ void install_exec_creds(struct linux_binprm *bprm)
* credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
mutex_unlock(&current->signal->exec_update_mutex);
mutex_unlock(&current->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(install_exec_creds);
......@@ -1664,7 +1694,7 @@ int search_binary_handler(struct linux_binprm *bprm)
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (retval < 0 && !bprm->mm) {
if (retval < 0 && bprm->called_exec_mmap) {
/* we got to flush_old_exec() and failed after it */
read_unlock(&binfmt_lock);
force_sigsegv(SIGSEGV);
......
......@@ -405,11 +405,11 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
static int lock_trace(struct task_struct *task)
{
int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
int err = mutex_lock_killable(&task->signal->exec_update_mutex);
if (err)
return err;
if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) {
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
return -EPERM;
}
return 0;
......@@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task)
static void unlock_trace(struct task_struct *task)
{
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
}
#ifdef CONFIG_STACKTRACE
......@@ -2883,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
unsigned long flags;
int result;
result = mutex_lock_killable(&task->signal->cred_guard_mutex);
result = mutex_lock_killable(&task->signal->exec_update_mutex);
if (result)
return result;
......@@ -2919,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh
result = 0;
out_unlock:
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
return result;
}
......
......@@ -44,7 +44,13 @@ struct linux_binprm {
* exec has happened. Used to sanitize execution environment
* and to set AT_SECURE auxv for glibc.
*/
secureexec:1;
secureexec:1,
/*
* Set by flush_old_exec, when exec_mmap has been called.
* This is past the point of no return, when the
* exec_update_mutex has been taken.
*/
called_exec_mmap:1;
#ifdef __alpha__
unsigned int taso:1;
#endif
......
......@@ -224,7 +224,14 @@ struct signal_struct {
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace) */
* (notably. ptrace)
* Deprecated do not use in new code.
* Use exec_update_mutex instead.
*/
struct mutex exec_update_mutex; /* Held while task_struct is being
* updated during exec, and may have
* inconsistent permissions.
*/
} __randomize_layout;
/*
......
......@@ -26,6 +26,7 @@ static struct signal_struct init_signals = {
.multiprocess = HLIST_HEAD_INIT,
.rlim = INIT_RLIMITS,
.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
.exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
#ifdef CONFIG_POSIX_TIMERS
.posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
.cputimer = {
......
......@@ -675,8 +675,6 @@ void __init cred_init(void)
* The caller may change these controls afterwards if desired.
*
* Returns the new credentials or NULL if out of memory.
*
* Does not take, and does not return holding current->cred_replace_mutex.
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
......
......@@ -1249,7 +1249,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
* cred_guard_mutex
* exec_update_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
......@@ -11263,14 +11263,14 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (task) {
err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
if (err)
goto err_task;
/*
* Reuse ptrace permission checks for now.
*
* We must hold cred_guard_mutex across this and any potential
* We must hold exec_update_mutex across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
......@@ -11559,7 +11559,7 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&ctx->mutex);
if (task) {
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
put_task_struct(task);
}
......@@ -11595,7 +11595,7 @@ SYSCALL_DEFINE5(perf_event_open,
free_event(event);
err_cred:
if (task)
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
err_task:
if (task)
put_task_struct(task);
......@@ -11900,7 +11900,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
*
* Can be called with cred_guard_mutex held when called from
* Can be called with exec_update_mutex held when called from
* install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
......
......@@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
struct mm_struct *mm;
int err;
err = mutex_lock_killable(&task->signal->cred_guard_mutex);
err = mutex_lock_killable(&task->signal->exec_update_mutex);
if (err)
return ERR_PTR(err);
......@@ -1234,7 +1234,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mmput(mm);
mm = ERR_PTR(-EACCES);
}
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
return mm;
}
......@@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
mutex_init(&sig->cred_guard_mutex);
mutex_init(&sig->exec_update_mutex);
return 0;
}
......
......@@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
/*
* One should have enough rights to inspect task details.
*/
ret = kcmp_lock(&task1->signal->cred_guard_mutex,
&task2->signal->cred_guard_mutex);
ret = kcmp_lock(&task1->signal->exec_update_mutex,
&task2->signal->exec_update_mutex);
if (ret)
goto err;
if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
......@@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
}
err_unlock:
kcmp_unlock(&task1->signal->cred_guard_mutex,
&task2->signal->cred_guard_mutex);
kcmp_unlock(&task1->signal->exec_update_mutex,
&task2->signal->exec_update_mutex);
err:
put_task_struct(task1);
put_task_struct(task2);
......
......@@ -577,7 +577,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
struct file *file;
int ret;
ret = mutex_lock_killable(&task->signal->cred_guard_mutex);
ret = mutex_lock_killable(&task->signal->exec_update_mutex);
if (ret)
return ERR_PTR(ret);
......@@ -586,7 +586,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
else
file = ERR_PTR(-EPERM);
mutex_unlock(&task->signal->cred_guard_mutex);
mutex_unlock(&task->signal->exec_update_mutex);
return file ?: ERR_PTR(-EBADF);
}
......
......@@ -206,7 +206,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
* Explicitly map EACCES to EPERM as EPERM is a more a
* Explicitly map EACCES to EPERM as EPERM is a more
* appropriate error code for process_vw_readv/writev
*/
if (rc == -EACCES)
......
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -iquote../../../../include/uapi -Wall
CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
TEST_GEN_PROGS := get_syscall_info peeksiginfo
TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
include ../lib.mk
// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
* All rights reserved.
*
* Check whether /proc/$pid/mem can be accessed without causing deadlocks
* when de_thread is blocked with ->cred_guard_mutex held.
*/
#include "../kselftest_harness.h"
#include <stdio.h>
#include <fcntl.h>
#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
static void *thread(void *arg)
{
ptrace(PTRACE_TRACEME, 0, 0L, 0L);
return NULL;
}
TEST(vmaccess)
{
int f, pid = fork();
char mm[64];
if (!pid) {
pthread_t pt;
pthread_create(&pt, NULL, thread, NULL);
pthread_join(pt, NULL);
execlp("true", "true", NULL);
}
sleep(1);
sprintf(mm, "/proc/%d/mem", pid);
f = open(mm, O_RDONLY);
ASSERT_GE(f, 0);
close(f);
f = kill(pid, SIGCONT);
ASSERT_EQ(f, 0);
}
TEST(attach)
{
int s, k, pid = fork();
if (!pid) {
pthread_t pt;
pthread_create(&pt, NULL, thread, NULL);
pthread_join(pt, NULL);
execlp("sleep", "sleep", "2", NULL);
}
sleep(1);
k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
ASSERT_EQ(errno, EAGAIN);
ASSERT_EQ(k, -1);
k = waitpid(-1, &s, WNOHANG);
ASSERT_NE(k, -1);
ASSERT_NE(k, 0);
ASSERT_NE(k, pid);
ASSERT_EQ(WIFEXITED(s), 1);
ASSERT_EQ(WEXITSTATUS(s), 0);
sleep(1);
k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
ASSERT_EQ(WIFSTOPPED(s), 1);
ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
ASSERT_EQ(k, 0);
k = waitpid(-1, &s, 0);
ASSERT_EQ(k, pid);
ASSERT_EQ(WIFEXITED(s), 1);
ASSERT_EQ(WEXITSTATUS(s), 0);
k = waitpid(-1, NULL, 0);
ASSERT_EQ(k, -1);
ASSERT_EQ(errno, ECHILD);
}
TEST_HARNESS_MAIN
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment