Commit 78725596 authored by Eric W. Biederman's avatar Eric W. Biederman

Merge branch 'nsfs-ioctls' into HEAD

From: Andrey Vagin <avagin@openvz.org>

Each namespace has an owning user namespace, and currently there is no way
to discover these relationships.

Pid and user namespaces are hierarchical. There is no way to discover
parent-child relationships either.

Why might we want to know the relationships between namespaces?

One use would be visualization, in order to understand the running
system.  Another would be to answer the question: what capability does
process X have to perform operations on a resource governed by namespace
Y?

One more use-case (which is usually called abnormal) is checkpoint/restart.
In CRIU we are going to dump and restore nested namespaces.

There [1] was a discussion about which interface to choose to determine
relationships between namespaces.

Eric suggested to add two ioctl-s [2]:
> Grumble, Grumble.  I think this may actually a case for creating ioctls
> for these two cases.  Now that random nsfs file descriptors are bind
> mountable the original reason for using proc files is not as pressing.
>
> One ioctl for the user namespace that owns a file descriptor.
> One ioctl for the parent namespace of a namespace file descriptor.

Here is an implementation of these ioctls.

$ man man7/namespaces.7
...
Since  Linux  4.X,  the  following  ioctl(2)  calls are supported for
namespace file descriptors.  The correct syntax is:

      fd = ioctl(ns_fd, ioctl_type);

where ioctl_type is one of the following:

NS_GET_USERNS
      Returns a file descriptor that refers to an owning user names‐
      pace.

NS_GET_PARENT
      Returns  a  file descriptor that refers to a parent namespace.
      This ioctl(2) can be used for pid  and  user  namespaces.  For
      user namespaces, NS_GET_PARENT and NS_GET_USERNS have the same
      meaning.

In addition to generic ioctl(2) errors, the following  specific  ones
can occur:

EINVAL NS_GET_PARENT was called for a nonhierarchical namespace.

EPERM  The  requested  namespace  is outside of the current namespace
      scope.

[1] https://lkml.org/lkml/2016/7/6/158
[2] https://lkml.org/lkml/2016/7/9/101

Changes for v2:
* don't return ENOENT for init_user_ns and init_pid_ns. There is nothing
  outside of the init namespace, so we can return EPERM in this case too.
  > The fewer special cases the easier the code is to get
  > correct, and the easier it is to read. // Eric

Changes for v3:
* rename ns->get_owner() to ns->owner(). get_* usually means that it
  grabs a reference.

Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: "W. Trevor King" <wking@tremily.us>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Serge Hallyn <serge.hallyn@canonical.com>
parents 93f0a88b 6ad92bf6
...@@ -3368,10 +3368,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) ...@@ -3368,10 +3368,16 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0; return 0;
} }
/*
 * ->owner() callback for mount namespaces: hand back the user namespace
 * recorded in the mount namespace that embeds @ns. The pointer is returned
 * as-is; this function does not take a reference on it.
 */
static struct user_namespace *mntns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_mnt_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations mntns_operations = { const struct proc_ns_operations mntns_operations = {
.name = "mnt", .name = "mnt",
.type = CLONE_NEWNS, .type = CLONE_NEWNS,
.get = mntns_get, .get = mntns_get,
.put = mntns_put, .put = mntns_put,
.install = mntns_install, .install = mntns_install,
.owner = mntns_owner,
}; };
...@@ -5,11 +5,16 @@ ...@@ -5,11 +5,16 @@
#include <linux/magic.h> #include <linux/magic.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
static struct vfsmount *nsfs_mnt; static struct vfsmount *nsfs_mnt;
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg);
static const struct file_operations ns_file_operations = { static const struct file_operations ns_file_operations = {
.llseek = no_llseek, .llseek = no_llseek,
.unlocked_ioctl = ns_ioctl,
}; };
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
...@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode) ...@@ -44,22 +49,14 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns); ns->ops->put(ns);
} }
void *ns_get_path(struct path *path, struct task_struct *task, static void *__ns_get_path(struct path *path, struct ns_common *ns)
const struct proc_ns_operations *ns_ops)
{ {
struct vfsmount *mnt = mntget(nsfs_mnt); struct vfsmount *mnt = mntget(nsfs_mnt);
struct qstr qname = { .name = "", }; struct qstr qname = { .name = "", };
struct dentry *dentry; struct dentry *dentry;
struct inode *inode; struct inode *inode;
struct ns_common *ns;
unsigned long d; unsigned long d;
again:
ns = ns_ops->get(task);
if (!ns) {
mntput(mnt);
return ERR_PTR(-ENOENT);
}
rcu_read_lock(); rcu_read_lock();
d = atomic_long_read(&ns->stashed); d = atomic_long_read(&ns->stashed);
if (!d) if (!d)
...@@ -68,7 +65,7 @@ void *ns_get_path(struct path *path, struct task_struct *task, ...@@ -68,7 +65,7 @@ void *ns_get_path(struct path *path, struct task_struct *task,
if (!lockref_get_not_dead(&dentry->d_lockref)) if (!lockref_get_not_dead(&dentry->d_lockref))
goto slow; goto slow;
rcu_read_unlock(); rcu_read_unlock();
ns_ops->put(ns); ns->ops->put(ns);
got_it: got_it:
path->mnt = mnt; path->mnt = mnt;
path->dentry = dentry; path->dentry = dentry;
...@@ -77,7 +74,7 @@ void *ns_get_path(struct path *path, struct task_struct *task, ...@@ -77,7 +74,7 @@ void *ns_get_path(struct path *path, struct task_struct *task,
rcu_read_unlock(); rcu_read_unlock();
inode = new_inode_pseudo(mnt->mnt_sb); inode = new_inode_pseudo(mnt->mnt_sb);
if (!inode) { if (!inode) {
ns_ops->put(ns); ns->ops->put(ns);
mntput(mnt); mntput(mnt);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
...@@ -95,17 +92,94 @@ void *ns_get_path(struct path *path, struct task_struct *task, ...@@ -95,17 +92,94 @@ void *ns_get_path(struct path *path, struct task_struct *task,
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
d_instantiate(dentry, inode); d_instantiate(dentry, inode);
dentry->d_fsdata = (void *)ns_ops; dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) { if (d) {
d_delete(dentry); /* make sure ->d_prune() does nothing */ d_delete(dentry); /* make sure ->d_prune() does nothing */
dput(dentry); dput(dentry);
mntput(mnt);
cpu_relax(); cpu_relax();
goto again; return ERR_PTR(-EAGAIN);
} }
goto got_it; goto got_it;
} }
/*
 * Fill @path with the nsfs location of the namespace that @ns_ops->get()
 * reports for @task.
 *
 * Returns ERR_PTR(-ENOENT) when ->get() yields no namespace; otherwise the
 * result of __ns_get_path(). An -EAGAIN result from the helper is never
 * propagated: the namespace is fetched again and the lookup retried.
 */
void *ns_get_path(struct path *path, struct task_struct *task,
		  const struct proc_ns_operations *ns_ops)
{
	void *res;

	do {
		struct ns_common *ns = ns_ops->get(task);

		if (!ns)
			return ERR_PTR(-ENOENT);

		res = __ns_get_path(path, ns);
	} while (IS_ERR(res) && PTR_ERR(res) == -EAGAIN);

	return res;
}
/*
 * Open a namespace related to @ns and return it as a new file descriptor.
 *
 * @ns:     namespace the caller's file refers to
 * @get_ns: callback that maps @ns to the related namespace (or an ERR_PTR)
 *
 * A close-on-exec descriptor is reserved first; it is released again on
 * every failure path. Returns the installed descriptor on success or a
 * negative errno taken from @get_ns, __ns_get_path() or dentry_open().
 */
static int open_related_ns(struct ns_common *ns,
		   struct ns_common *(*get_ns)(struct ns_common *ns))
{
	struct path path = {};
	struct file *file;
	void *res;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* Look the relative up afresh for as long as the path helper says -EAGAIN. */
	do {
		struct ns_common *relative = get_ns(ns);

		if (IS_ERR(relative)) {
			put_unused_fd(fd);
			return PTR_ERR(relative);
		}

		res = __ns_get_path(&path, relative);
	} while (IS_ERR(res) && PTR_ERR(res) == -EAGAIN);

	if (IS_ERR(res)) {
		put_unused_fd(fd);
		return PTR_ERR(res);
	}

	file = dentry_open(&path, O_RDONLY, current_cred());
	/* Our path reference is dropped whether or not the open succeeded. */
	path_put(&path);

	if (!IS_ERR(file)) {
		fd_install(fd, file);
		return fd;
	}

	put_unused_fd(fd);
	return PTR_ERR(file);
}
/*
 * ioctl handler for namespace files.
 *
 * NS_GET_USERNS opens the owning user namespace; NS_GET_PARENT opens the
 * parent namespace and fails with -EINVAL when the namespace type supplies
 * no ->get_parent() callback. Any other command yields -ENOTTY. @arg is
 * not used by either command.
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
		     unsigned long arg)
{
	struct ns_common *ns = get_proc_ns(file_inode(filp));

	if (ioctl == NS_GET_USERNS)
		return open_related_ns(ns, ns_get_owner);

	if (ioctl != NS_GET_PARENT)
		return -ENOTTY;

	if (!ns->ops->get_parent)
		return -EINVAL;

	return open_related_ns(ns, ns->ops->get_parent);
}
int ns_get_name(char *buf, size_t size, struct task_struct *task, int ns_get_name(char *buf, size_t size, struct task_struct *task,
const struct proc_ns_operations *ns_ops) const struct proc_ns_operations *ns_ops)
{ {
......
...@@ -18,6 +18,8 @@ struct proc_ns_operations { ...@@ -18,6 +18,8 @@ struct proc_ns_operations {
struct ns_common *(*get)(struct task_struct *task); struct ns_common *(*get)(struct task_struct *task);
void (*put)(struct ns_common *ns); void (*put)(struct ns_common *ns);
int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); int (*install)(struct nsproxy *nsproxy, struct ns_common *ns);
struct user_namespace *(*owner)(struct ns_common *ns);
struct ns_common *(*get_parent)(struct ns_common *ns);
}; };
extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations netns_operations;
......
...@@ -106,6 +106,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, ...@@ -106,6 +106,8 @@ extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t,
extern int proc_setgroups_show(struct seq_file *m, void *v); extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool current_in_userns(const struct user_namespace *target_ns); extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else #else
static inline struct user_namespace *get_user_ns(struct user_namespace *ns) static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
...@@ -139,6 +141,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns) ...@@ -139,6 +141,11 @@ static inline bool current_in_userns(const struct user_namespace *target_ns)
{ {
return true; return true;
} }
/*
 * Stub for the #else branch of this header: no owner lookup is performed
 * and every call fails with ERR_PTR(-EPERM).
 * NOTE(review): presumably this is the !CONFIG_USER_NS build - confirm
 * against the matching #if, which is not visible here.
 */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
return ERR_PTR(-EPERM);
}
#endif #endif
#endif /* _LINUX_USER_H */ #endif /* _LINUX_USER_H */
/* ioctl command numbers accepted on namespace file descriptors. */
#ifndef __LINUX_NSFS_H
#define __LINUX_NSFS_H
#include <linux/ioctl.h>
/* ioctl type byte shared by the nsfs commands defined below */
#define NSIO 0xb7
/* Returns a file descriptor that refers to an owning user namespace */
#define NS_GET_USERNS _IO(NSIO, 0x1)
/* Returns a file descriptor that refers to a parent namespace */
#define NS_GET_PARENT _IO(NSIO, 0x2)
#endif /* __LINUX_NSFS_H */
...@@ -188,10 +188,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new) ...@@ -188,10 +188,16 @@ static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new)
return 0; return 0;
} }
/*
 * ->owner() callback for IPC namespaces: hand back the user namespace
 * recorded in the IPC namespace that embeds @ns, without taking a
 * reference on it.
 */
static struct user_namespace *ipcns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_ipc_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations ipcns_operations = { const struct proc_ns_operations ipcns_operations = {
.name = "ipc", .name = "ipc",
.type = CLONE_NEWIPC, .type = CLONE_NEWIPC,
.get = ipcns_get, .get = ipcns_get,
.put = ipcns_put, .put = ipcns_put,
.install = ipcns_install, .install = ipcns_install,
.owner = ipcns_owner,
}; };
...@@ -6421,12 +6421,18 @@ static void cgroupns_put(struct ns_common *ns) ...@@ -6421,12 +6421,18 @@ static void cgroupns_put(struct ns_common *ns)
put_cgroup_ns(to_cg_ns(ns)); put_cgroup_ns(to_cg_ns(ns));
} }
/*
 * ->owner() callback for cgroup namespaces: hand back the user namespace
 * recorded in the cgroup namespace that embeds @ns, without taking a
 * reference on it.
 */
static struct user_namespace *cgroupns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_cg_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations cgroupns_operations = { const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup", .name = "cgroup",
.type = CLONE_NEWCGROUP, .type = CLONE_NEWCGROUP,
.get = cgroupns_get, .get = cgroupns_get,
.put = cgroupns_put, .put = cgroupns_put,
.install = cgroupns_install, .install = cgroupns_install,
.owner = cgroupns_owner,
}; };
static __init int cgroup_namespaces_init(void) static __init int cgroup_namespaces_init(void)
......
...@@ -405,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) ...@@ -405,12 +405,37 @@ static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0; return 0;
} }
/*
 * ->get_parent() callback for pid namespaces.
 *
 * The parent of @ns is only handed out when the caller's active pid
 * namespace is that parent or one of its ancestors. On success a reference
 * is taken on the parent via get_pid_ns() and its ns_common is returned;
 * otherwise (including when @ns has no parent) ERR_PTR(-EPERM) is returned.
 */
static struct ns_common *pidns_get_parent(struct ns_common *ns)
{
	struct pid_namespace *active = task_active_pid_ns(current);
	struct pid_namespace *parent = to_pid_ns(ns)->parent;
	struct pid_namespace *cursor = parent;

	/* Climb from the parent towards the root looking for the caller's namespace. */
	while (cursor) {
		if (cursor == active)
			return &get_pid_ns(parent)->ns;
		cursor = cursor->parent;
	}

	return ERR_PTR(-EPERM);
}
/*
 * ->owner() callback for pid namespaces: hand back the user namespace
 * recorded in the pid namespace that embeds @ns, without taking a
 * reference on it.
 */
static struct user_namespace *pidns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_pid_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_operations = {
.name = "pid", .name = "pid",
.type = CLONE_NEWPID, .type = CLONE_NEWPID,
.get = pidns_get, .get = pidns_get,
.put = pidns_put, .put = pidns_put,
.install = pidns_install, .install = pidns_install,
.owner = pidns_owner,
.get_parent = pidns_get_parent,
}; };
static __init int pid_namespaces_init(void) static __init int pid_namespaces_init(void)
......
...@@ -1050,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) ...@@ -1050,12 +1050,37 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return commit_creds(cred); return commit_creds(cred);
} }
/*
 * Return the user namespace that owns @ns, as reported by its ->owner()
 * callback, provided the caller is allowed to see it.
 *
 * The owner is only handed out when the caller's user namespace is the
 * owner itself or one of its ancestors. On success a reference is taken
 * with get_user_ns() and the owner's ns_common is returned; otherwise
 * (including when ->owner() reports no owner) ERR_PTR(-EPERM) is returned.
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
	struct user_namespace *my_user_ns = current_user_ns();
	struct user_namespace *owner = ns->ops->owner(ns);
	struct user_namespace *cursor = owner;

	/* Climb from the owner towards the root looking for the caller's namespace. */
	while (cursor) {
		if (cursor == my_user_ns)
			return &get_user_ns(owner)->ns;
		cursor = cursor->parent;
	}

	return ERR_PTR(-EPERM);
}
/*
 * ->owner() callback for user namespaces: the owner of a user namespace is
 * its ->parent field, returned here without taking a reference on it.
 */
static struct user_namespace *userns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_user_ns(ns)->parent;

	return owner;
}
const struct proc_ns_operations userns_operations = { const struct proc_ns_operations userns_operations = {
.name = "user", .name = "user",
.type = CLONE_NEWUSER, .type = CLONE_NEWUSER,
.get = userns_get, .get = userns_get,
.put = userns_put, .put = userns_put,
.install = userns_install, .install = userns_install,
.owner = userns_owner,
.get_parent = ns_get_owner,
}; };
static __init int user_namespaces_init(void) static __init int user_namespaces_init(void)
......
...@@ -154,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) ...@@ -154,10 +154,16 @@ static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new)
return 0; return 0;
} }
/*
 * ->owner() callback for UTS namespaces: hand back the user namespace
 * recorded in the UTS namespace that embeds @ns, without taking a
 * reference on it.
 */
static struct user_namespace *utsns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_uts_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations utsns_operations = { const struct proc_ns_operations utsns_operations = {
.name = "uts", .name = "uts",
.type = CLONE_NEWUTS, .type = CLONE_NEWUTS,
.get = utsns_get, .get = utsns_get,
.put = utsns_put, .put = utsns_put,
.install = utsns_install, .install = utsns_install,
.owner = utsns_owner,
}; };
...@@ -1016,11 +1016,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) ...@@ -1016,11 +1016,17 @@ static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
return 0; return 0;
} }
/*
 * ->owner() callback for network namespaces: hand back the user namespace
 * recorded in the network namespace that embeds @ns, without taking a
 * reference on it.
 */
static struct user_namespace *netns_owner(struct ns_common *ns)
{
	struct user_namespace *owner = to_net_ns(ns)->user_ns;

	return owner;
}
const struct proc_ns_operations netns_operations = { const struct proc_ns_operations netns_operations = {
.name = "net", .name = "net",
.type = CLONE_NEWNET, .type = CLONE_NEWNET,
.get = netns_get, .get = netns_get,
.put = netns_put, .put = netns_put,
.install = netns_install, .install = netns_install,
.owner = netns_owner,
}; };
#endif #endif
...@@ -15,6 +15,7 @@ TARGETS += memory-hotplug ...@@ -15,6 +15,7 @@ TARGETS += memory-hotplug
TARGETS += mount TARGETS += mount
TARGETS += mqueue TARGETS += mqueue
TARGETS += net TARGETS += net
TARGETS += nsfs
TARGETS += powerpc TARGETS += powerpc
TARGETS += pstore TARGETS += pstore
TARGETS += ptrace TARGETS += ptrace
......
# Build rules for the nsfs selftests: two programs, owner and pidns.
# NOTE(review): TEST_PROGS is presumably consumed by ../lib.mk to decide
# what gets run - confirm against lib.mk.
TEST_PROGS := owner pidns
# Any compiler warning fails the build.
CFLAGS := -Wall -Werror
all: owner pidns
# No recipes given: each binary is built from its single source file by
# make's built-in rules.
owner: owner.c
pidns: pidns.c
clean:
$(RM) owner pidns
include ../lib.mk
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#define NSIO 0xb7
#define NS_GET_USERNS _IO(NSIO, 0x1)
/*
 * Print "<function>:<line>:<message>: <errno text>" to stderr; the errno
 * text comes from the "%m" conversion in the format string. The statement
 * expression evaluates to 1 so callers can write `return pr_err(...)` to
 * report a failure and exit with a non-zero status in one step.
 */
#define pr_err(fmt, ...) \
({ \
fprintf(stderr, "%s:%d:" fmt ": %m\n", \
__func__, __LINE__, ##__VA_ARGS__); \
1; \
})
/*
 * Selftest for the NS_GET_USERNS ioctl.
 *
 * A child is forked that unshares its UTS and user namespaces and then
 * sleeps forever. The parent then checks that:
 *   - NS_GET_USERNS on the child's UTS namespace returns the child's user
 *     namespace (compared by inode number);
 *   - NS_GET_USERNS on that user namespace succeeds, and asking for the
 *     owner of the result fails with EPERM;
 *   - after the parent unshares its own user namespace, the same requests
 *     fail with EPERM.
 *
 * Returns 0 on success and 1 on the first failed step.
 */
int main(int argc, char *argv[])
{
	int pfd[2], ns, uns, init_uns;
	struct stat st1, st2;
	char path[128];
	pid_t pid;
	char c;

	if (pipe(pfd))
		return 1;

	pid = fork();
	if (pid < 0)
		return pr_err("fork");

	if (pid == 0) {
		/* Make sure the child does not outlive a failing parent. */
		prctl(PR_SET_PDEATHSIG, SIGKILL);
		if (unshare(CLONE_NEWUTS | CLONE_NEWUSER))
			return pr_err("unshare");
		/*
		 * Closing the child's copies of the pipe ends the parent's
		 * read() with EOF, signalling that the unshare is done.
		 */
		close(pfd[0]);
		close(pfd[1]);
		while (1)
			sleep(1);
		return 0;
	}

	/* Wait for EOF on the pipe: the child never writes, it only closes. */
	close(pfd[1]);
	if (read(pfd[0], &c, 1) != 0)
		return pr_err("Unable to read from pipe");
	close(pfd[0]);

	snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
	ns = open(path, O_RDONLY);
	if (ns < 0)
		return pr_err("Unable to open %s", path);

	uns = ioctl(ns, NS_GET_USERNS);
	if (uns < 0)
		return pr_err("Unable to get an owning user namespace");

	if (fstat(uns, &st1))
		return pr_err("fstat");

	snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
	if (stat(path, &st2))
		return pr_err("stat");

	if (st1.st_ino != st2.st_ino)
		return pr_err("NS_GET_USERNS returned a wrong namespace");

	init_uns = ioctl(uns, NS_GET_USERNS);
	/* Check the descriptor just obtained, not the earlier one. */
	if (init_uns < 0)
		return pr_err("Unable to get an owning user namespace");

	if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
		return pr_err("Don't get EPERM");

	/* From inside a fresh user namespace the owners are out of scope. */
	if (unshare(CLONE_NEWUSER))
		return pr_err("unshare");

	if (ioctl(ns, NS_GET_USERNS) >= 0 || errno != EPERM)
		return pr_err("Don't get EPERM");
	if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
		return pr_err("Don't get EPERM");

	kill(pid, SIGKILL);
	wait(NULL);
	return 0;
}
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/wait.h>
/*
 * Print "<function>:<line>:<message>: <errno text>" to stderr; the errno
 * text comes from the "%m" conversion in the format string. The statement
 * expression evaluates to 1 so callers can write `return pr_err(...)` to
 * report a failure and exit with a non-zero status in one step.
 */
#define pr_err(fmt, ...) \
({ \
fprintf(stderr, "%s:%d:" fmt ": %m\n", \
__func__, __LINE__, ##__VA_ARGS__); \
1; \
})
#define NSIO 0xb7
#define NS_GET_USERNS _IO(NSIO, 0x1)
#define NS_GET_PARENT _IO(NSIO, 0x2)
/* Alignment applied to the buffer used as the clone() child's stack. */
#define __stack_aligned__ __attribute__((aligned(16)))
/*
 * Stack storage for the clone() child. stack_ptr is a zero-length member
 * placed directly after stack[], so its address marks the end of the
 * buffer; that address is what main() passes to clone() as the child
 * stack.
 * NOTE(review): this presumably relies on the stack growing downwards on
 * the target architecture - confirm.
 */
struct cr_clone_arg {
char stack[128] __stack_aligned__;
char stack_ptr[0];
};
/*
 * Entry point of the cloned child: request SIGKILL on parent death, then
 * idle in one-second sleeps until killed. @args is unused.
 */
static int child(void *args)
{
	prctl(PR_SET_PDEATHSIG, SIGKILL);

	for (;;)
		sleep(1);

	/* Not reached: the loop above never terminates. */
	exit(0);
}
/*
 * Selftest for the NS_GET_PARENT ioctl.
 *
 * A child is cloned into new user and pid namespaces. For each of the two
 * namespace types the parent then checks that:
 *   - NS_GET_PARENT on the child's namespace returns the parent's own
 *     namespace (compared by inode number against /proc/self/ns/<type>);
 *   - NS_GET_PARENT on that result fails with EPERM.
 *
 * Returns 0 on success and 1 on the first failed step.
 */
int main(int argc, char *argv[])
{
	char *ns_strs[] = {"pid", "user"};
	/* Roomy enough for "/proc/<pid>/ns/user" with any pid value. */
	char path[64];
	struct cr_clone_arg ca;
	struct stat st1, st2;
	int ns, pns, i;
	pid_t pid;

	pid = clone(child, ca.stack_ptr, CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
	if (pid < 0)
		return pr_err("clone");

	for (i = 0; i < 2; i++) {
		snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns_strs[i]);
		ns = open(path, O_RDONLY);
		if (ns < 0)
			return pr_err("Unable to open %s", path);

		pns = ioctl(ns, NS_GET_PARENT);
		if (pns < 0)
			return pr_err("Unable to get a parent pidns");

		snprintf(path, sizeof(path), "/proc/self/ns/%s", ns_strs[i]);
		if (stat(path, &st2))
			return pr_err("Unable to stat %s", path);
		if (fstat(pns, &st1))
			return pr_err("Unable to stat the parent pidns");
		if (st1.st_ino != st2.st_ino)
			return pr_err("NS_GET_PARENT returned a wrong namespace");

		/* The parent of our own namespace is out of scope for us. */
		if (ioctl(pns, NS_GET_PARENT) >= 0 || errno != EPERM)
			return pr_err("Don't get EPERM");

		/* Release this iteration's descriptors before the next one. */
		close(pns);
		close(ns);
	}

	kill(pid, SIGKILL);
	wait(NULL);
	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment