Commit 0a72761b authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull thread updates from Christian Brauner:
 "This contains the changes to add the missing support for attaching to
  time namespaces via pidfds.

  Last cycle setns() was changed to support attaching to multiple
  namespaces atomically. This requires all namespaces to have a point of
  no return where they can't fail anymore.

  Specifically, <namespace-type>_install() is allowed to perform
  permission checks and install the namespace into the new struct nsset
  that it has been given but it is not allowed to make visible changes
  to the affected task. Once <namespace-type>_install() returns,
  anything that the given namespace type additionally requires to be
  setup needs to ideally be done in a function that can't fail or if it
  fails the failure must be non-fatal.

  For time namespaces the relevant functions that fell into this
  category were timens_set_vvar_page() and vdso_join_timens(). The
  latter could still fail although it didn't need to. This function is
  only implemented for vdso_join_timens() in current mainline. As
  discussed on-list (cf. [1]), in order to make setns() support time
  namespaces when attaching to multiple namespaces at once properly we
  changed vdso_join_timens() to always succeed. So vdso_join_timens()
  replaces the mmap_write_lock_killable() with mmap_read_lock().

  Please note that arm is about to grow vdso support for time namespaces
  (possibly this merge window). We've synced on this change and arm64
  also uses mmap_read_lock(), i.e. makes vdso_join_timens() a function
  that can't fail. Once the changes here and the arm64 changes have
  landed, vdso_join_timens() should be turned into a void function so
  it's obvious to callers and implementers on other architectures that
  the expectation is that it can't fail.

  We didn't do this right away because it would've introduced
  unnecessary merge conflicts between the two trees for no major gain.

  As always, tests included"

[1]: https://lore.kernel.org/lkml/20200611110221.pgd3r5qkjrjmfqa2@wittgenstein

* tag 'threads-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add CLONE_NEWTIME setns tests
  nsproxy: support CLONE_NEWTIME with setns()
  timens: add timens_commit() helper
  timens: make vdso_join_timens() always succeed
parents 3950e975 55d9ad97
......@@ -144,8 +144,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
if (mmap_write_lock_killable(mm))
return -EINTR;
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long size = vma->vm_end - vma->vm_start;
......@@ -154,7 +153,7 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
zap_page_range(vma, vma->vm_start, size);
}
mmap_write_unlock(mm);
mmap_read_unlock(mm);
return 0;
}
#else
......
......@@ -33,6 +33,7 @@ extern struct time_namespace init_time_ns;
#ifdef CONFIG_TIME_NS
extern int vdso_join_timens(struct task_struct *task,
struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
......@@ -96,6 +97,11 @@ static inline int vdso_join_timens(struct task_struct *task,
return 0;
}
static inline void timens_commit(struct task_struct *tsk,
struct time_namespace *ns)
{
}
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
return NULL;
......
......@@ -262,8 +262,8 @@ void exit_task_namespaces(struct task_struct *p)
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
CLONE_NEWCGROUP)))
CLONE_NEWNET | CLONE_NEWTIME | CLONE_NEWUSER |
CLONE_NEWPID | CLONE_NEWCGROUP)))
return -EINVAL;
#ifndef CONFIG_USER_NS
......@@ -290,6 +290,10 @@ static int check_setns_flags(unsigned long flags)
if (flags & CLONE_NEWNET)
return -EINVAL;
#endif
#ifndef CONFIG_TIME_NS
if (flags & CLONE_NEWTIME)
return -EINVAL;
#endif
return 0;
}
......@@ -464,6 +468,14 @@ static int validate_nsset(struct nsset *nsset, struct pid *pid)
}
#endif
#ifdef CONFIG_TIME_NS
if (flags & CLONE_NEWTIME) {
ret = validate_ns(nsset, &nsp->time_ns->ns);
if (ret)
goto out;
}
#endif
out:
if (pid_ns)
put_pid_ns(pid_ns);
......@@ -507,6 +519,11 @@ static void commit_nsset(struct nsset *nsset)
exit_sem(me);
#endif
#ifdef CONFIG_TIME_NS
if (flags & CLONE_NEWTIME)
timens_commit(me, nsset->nsproxy->time_ns);
#endif
/* transfer ownership */
switch_task_namespaces(me, nsset->nsproxy);
nsset->nsproxy = NULL;
......
......@@ -280,11 +280,16 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
{
timens_set_vvar_page(tsk, ns);
vdso_join_timens(tsk, ns);
}
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct time_namespace *ns = to_time_ns(new);
int err;
if (!current_is_single_threaded())
return -EUSERS;
......@@ -293,12 +298,6 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
timens_set_vvar_page(current, ns);
err = vdso_join_timens(current, ns);
if (err)
return err;
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
......@@ -313,22 +312,17 @@ int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
int err;
/* create_new_namespaces() already incremented the ref counter */
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
return 0;
timens_set_vvar_page(tsk, ns);
err = vdso_join_timens(tsk, ns);
if (err)
return err;
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
timens_commit(tsk, ns);
return 0;
}
......
......@@ -22,6 +22,10 @@
#define P_PIDFD 3
#endif
#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080
#endif
#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000
#endif
......
......@@ -32,6 +32,7 @@ enum {
PIDFD_NS_NET,
PIDFD_NS_CGROUP,
PIDFD_NS_PIDCLD,
PIDFD_NS_TIME,
PIDFD_NS_MAX
};
......@@ -47,6 +48,7 @@ const struct ns_info {
[PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
[PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
[PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
[PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, },
};
FIXTURE(current_nsset)
......@@ -83,9 +85,49 @@ pid_t create_child(int *pidfd, unsigned flags)
return sys_clone3(&args, sizeof(struct clone_args));
}
static bool switch_timens(void)
{
int fd, ret;
if (unshare(CLONE_NEWTIME))
return false;
fd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
if (fd < 0)
return false;
ret = setns(fd, CLONE_NEWTIME);
close(fd);
return ret == 0;
}
static ssize_t read_nointr(int fd, void *buf, size_t count)
{
ssize_t ret;
do {
ret = read(fd, buf, count);
} while (ret < 0 && errno == EINTR);
return ret;
}
static ssize_t write_nointr(int fd, const void *buf, size_t count)
{
ssize_t ret;
do {
ret = write(fd, buf, count);
} while (ret < 0 && errno == EINTR);
return ret;
}
FIXTURE_SETUP(current_nsset)
{
int i, proc_fd, ret;
int ipc_sockets[2];
char c;
for (i = 0; i < PIDFD_NS_MAX; i++) {
self->nsfds[i] = -EBADF;
......@@ -130,6 +172,9 @@ FIXTURE_SETUP(current_nsset)
TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
}
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
/* Create tasks that will be stopped. */
self->child_pid1 = create_child(&self->child_pidfd1,
CLONE_NEWUSER | CLONE_NEWNS |
......@@ -139,10 +184,27 @@ FIXTURE_SETUP(current_nsset)
EXPECT_GE(self->child_pid1, 0);
if (self->child_pid1 == 0) {
close(ipc_sockets[0]);
if (!switch_timens())
_exit(EXIT_FAILURE);
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
close(ipc_sockets[1]);
pause();
_exit(EXIT_SUCCESS);
}
close(ipc_sockets[1]);
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
close(ipc_sockets[0]);
ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
EXPECT_EQ(ret, 0);
self->child_pid2 = create_child(&self->child_pidfd2,
CLONE_NEWUSER | CLONE_NEWNS |
CLONE_NEWCGROUP | CLONE_NEWIPC |
......@@ -151,10 +213,24 @@ FIXTURE_SETUP(current_nsset)
EXPECT_GE(self->child_pid2, 0);
if (self->child_pid2 == 0) {
close(ipc_sockets[0]);
if (!switch_timens())
_exit(EXIT_FAILURE);
if (write_nointr(ipc_sockets[1], "1", 1) < 0)
_exit(EXIT_FAILURE);
close(ipc_sockets[1]);
pause();
_exit(EXIT_SUCCESS);
}
close(ipc_sockets[1]);
ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
close(ipc_sockets[0]);
for (i = 0; i < PIDFD_NS_MAX; i++) {
char p[100];
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment