Commit 8f502d5b authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull usernamespace mount fixes from Eric Biederman:
 "Way back in October Andrey Vagin reported that umount(MNT_DETACH)
  could be used to defeat MNT_LOCKED.  As I worked to fix this I
  discovered that combined with mount propagation and an appropriate
  selection of shared subtrees a reference to a directory on an
  unmounted filesystem is not necessary.

  That MNT_DETACH is allowed in user namespace in a form that can break
  MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does.

  To avoid breaking existing userspace the conflict between MNT_DETACH
  and MNT_LOCKED is fixed by leaving mounts that are locked to their
  parents in the mount hash table until the last reference goes away.

  While investigating this issue I also found an issue with
  __detach_mounts.  The code was unnecessarily and incorrectly
  triggering mount propagation.  Resulting in too many mounts going away
  when a directory is deleted, and too many cpu cycles are burned while
  doing that.

  Looking some more I realized that __detach_mounts by only keeping
  mounts connected that were MNT_LOCKED it had the potential to still
  leak information so I tweaked the code to keep everything locked
  together that possibly could be.

  This code was almost ready last cycle but Al invented fs_pin which
  slightly simplifies this code but required rewrites and retesting, and
  I have not been in top form for a while so it took me a while to get
  all of that done.  Similiarly this pull request is late because I have
  been feeling absolutely miserable all week.

  The issue of being able to escape a bind mount has not yet been
  addressed, as the fixes are not yet mature"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  mnt: Update detach_mounts to leave mounts connected
  mnt: Fix the error check in __detach_mounts
  mnt: Honor MNT_LOCKED when detaching mounts
  fs_pin: Allow for the possibility that m_list or s_list go unused.
  mnt: Factor umount_mnt from umount_tree
  mnt: Factor out unhash_mnt from detach_mnt and umount_tree
  mnt: Fail collect_mounts when applied to unmounted mounts
  mnt: Don't propagate unmounts to locked mounts
  mnt: On an unmount propagate clearing of MNT_LOCKED
  mnt: Delay removal from the mount hash.
  mnt: Add MNT_UMOUNT flag
  mnt: In umount_tree reuse mnt_list instead of mnt_hash
  mnt: Don't propagate umounts in __detach_mounts
  mnt: Improve the umount_tree flags
  mnt: Use hlist_move_list in namespace_unlock
parents 06a60dec e0c9c0af
......@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
void pin_remove(struct fs_pin *pin)
{
spin_lock(&pin_lock);
hlist_del(&pin->m_list);
hlist_del(&pin->s_list);
hlist_del_init(&pin->m_list);
hlist_del_init(&pin->s_list);
spin_unlock(&pin_lock);
spin_lock_irq(&pin->wait.lock);
pin->done = 1;
......
......@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
*/
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
struct mount *p, *res;
res = p = __lookup_mnt(mnt, dentry);
struct mount *p, *res = NULL;
p = __lookup_mnt(mnt, dentry);
if (!p)
goto out;
if (!(p->mnt.mnt_flags & MNT_UMOUNT))
res = p;
hlist_for_each_entry_continue(p, mnt_hash) {
if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
break;
res = p;
if (!(p->mnt.mnt_flags & MNT_UMOUNT))
res = p;
}
out:
return res;
......@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
/*
* vfsmount lock must be held for write
*/
static void detach_mnt(struct mount *mnt, struct path *old_path)
static void unhash_mnt(struct mount *mnt)
{
old_path->dentry = mnt->mnt_mountpoint;
old_path->mnt = &mnt->mnt_parent->mnt;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
......@@ -808,6 +809,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
mnt->mnt_mp = NULL;
}
/*
* vfsmount lock must be held for write
*/
static void detach_mnt(struct mount *mnt, struct path *old_path)
{
old_path->dentry = mnt->mnt_mountpoint;
old_path->mnt = &mnt->mnt_parent->mnt;
unhash_mnt(mnt);
}
/*
* vfsmount lock must be held for write
*/
static void umount_mnt(struct mount *mnt)
{
/* old mountpoint will be dropped when we can do that */
mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
unhash_mnt(mnt);
}
/*
* vfsmount lock must be held for write
*/
......@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
rcu_read_unlock();
list_del(&mnt->mnt_instance);
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
umount_mnt(p);
}
}
unlock_mount_hash();
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
......@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static void namespace_unlock(void)
{
struct hlist_head head = unmounted;
struct hlist_head head;
if (likely(hlist_empty(&head))) {
up_write(&namespace_sem);
return;
}
hlist_move_list(&unmounted, &head);
head.first->pprev = &head.first;
INIT_HLIST_HEAD(&unmounted);
up_write(&namespace_sem);
if (likely(hlist_empty(&head)))
return;
synchronize_rcu();
group_pin_kill(&head);
......@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
UMOUNT_CONNECTED = 4,
};
/*
* mount_lock must be held
* namespace_sem must be held for write
* how = 0 => just this tree, don't propagate
* how = 1 => propagate; we know that nobody else has reference to any victims
* how = 2 => lazy umount
*/
void umount_tree(struct mount *mnt, int how)
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
HLIST_HEAD(tmp_list);
LIST_HEAD(tmp_list);
struct mount *p;
if (how & UMOUNT_PROPAGATE)
propagate_mount_unlock(mnt);
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
hlist_del_init_rcu(&p->mnt_hash);
hlist_add_head(&p->mnt_hash, &tmp_list);
p->mnt.mnt_flags |= MNT_UMOUNT;
list_move(&p->mnt_list, &tmp_list);
}
hlist_for_each_entry(p, &tmp_list, mnt_hash)
/* Hide the mounts from mnt_mounts */
list_for_each_entry(p, &tmp_list, mnt_list) {
list_del_init(&p->mnt_child);
}
if (how)
/* Add propogated mounts to the tmp_list */
if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
while (!hlist_empty(&tmp_list)) {
p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
hlist_del_init_rcu(&p->mnt_hash);
while (!list_empty(&tmp_list)) {
bool disconnect;
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
if (how < 2)
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
disconnect = !(((how & UMOUNT_CONNECTED) &&
mnt_has_parent(p) &&
(p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
IS_MNT_LOCKED_AND_LAZY(p));
pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
disconnect ? &unmounted : NULL);
if (mnt_has_parent(p)) {
hlist_del_init(&p->mnt_mp_list);
put_mountpoint(p->mnt_mp);
mnt_add_count(p->mnt_parent, -1);
/* old mountpoint will be dropped when we can do that */
p->mnt_ex_mountpoint = p->mnt_mountpoint;
p->mnt_mountpoint = p->mnt.mnt_root;
p->mnt_parent = p;
p->mnt_mp = NULL;
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
} else {
umount_mnt(p);
}
}
change_mnt_propagation(p, MS_PRIVATE);
}
......@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, 2);
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, 1);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
......@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)
namespace_lock();
mp = lookup_mountpoint(dentry);
if (!mp)
if (IS_ERR_OR_NULL(mp))
goto out_unlock;
lock_mount_hash();
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
umount_tree(mnt, 2);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
hlist_add_head(&p->mnt_umount.s_list, &unmounted);
umount_mnt(p);
}
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
unlock_mount_hash();
put_mountpoint(mp);
......@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
out:
if (res) {
lock_mount_hash();
umount_tree(res, 0);
umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
return q;
......@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
{
struct mount *tree;
namespace_lock();
tree = copy_tree(real_mount(path->mnt), path->dentry,
CL_COPY_ALL | CL_PRIVATE);
if (!check_mnt(real_mount(path->mnt)))
tree = ERR_PTR(-EINVAL);
else
tree = copy_tree(real_mount(path->mnt), path->dentry,
CL_COPY_ALL | CL_PRIVATE);
namespace_unlock();
if (IS_ERR(tree))
return ERR_CAST(tree);
......@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
lock_mount_hash();
umount_tree(real_mount(mnt), 0);
umount_tree(real_mount(mnt), UMOUNT_SYNC);
unlock_mount_hash();
namespace_unlock();
}
......@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
umount_tree(child, 0);
umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
......@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, 0);
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
......@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1);
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
unlock_mount_hash();
namespace_unlock();
......@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
umount_tree(m, 1);
umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
}
}
......
......@@ -361,6 +361,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
return ret;
}
/*
* Clear MNT_LOCKED when it can be shown to be safe.
*
* mount_lock lock must be held for write
*/
void propagate_mount_unlock(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m, *child;
BUG_ON(parent == mnt);
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
if (child)
child->mnt.mnt_flags &= ~MNT_LOCKED;
}
}
/*
* Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
*/
static void mark_umount_candidates(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m;
BUG_ON(parent == mnt);
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
struct mount *child = __lookup_mnt_last(&m->mnt,
mnt->mnt_mountpoint);
if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
SET_MNT_MARK(child);
}
}
}
/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
......@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
struct mount *child = __lookup_mnt_last(&m->mnt,
mnt->mnt_mountpoint);
/*
* umount the child only if the child has no
* other children
* umount the child only if the child has no children
* and the child is marked safe to unmount.
*/
if (child && list_empty(&child->mnt_mounts)) {
if (!child || !IS_MNT_MARKED(child))
continue;
CLEAR_MNT_MARK(child);
if (list_empty(&child->mnt_mounts)) {
list_del_init(&child->mnt_child);
hlist_del_init_rcu(&child->mnt_hash);
hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
child->mnt.mnt_flags |= MNT_UMOUNT;
list_move_tail(&child->mnt_list, &mnt->mnt_list);
}
}
}
......@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
*
* vfsmount lock must be held for write
*/
int propagate_umount(struct hlist_head *list)
int propagate_umount(struct list_head *list)
{
struct mount *mnt;
hlist_for_each_entry(mnt, list, mnt_hash)
list_for_each_entry_reverse(mnt, list, mnt_list)
mark_umount_candidates(mnt);
list_for_each_entry(mnt, list, mnt_list)
__propagate_umount(mnt);
return 0;
}
......@@ -19,6 +19,9 @@
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
#define IS_MNT_LOCKED_AND_LAZY(m) \
(((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
......@@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt)
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
int propagate_umount(struct hlist_head *);
int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
void mnt_set_mountpoint(struct mount *, struct mountpoint *,
struct mount *);
void umount_tree(struct mount *, int);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
......
......@@ -13,6 +13,8 @@ struct vfsmount;
static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
{
init_waitqueue_head(&p->wait);
INIT_HLIST_NODE(&p->s_list);
INIT_HLIST_NODE(&p->m_list);
p->kill = kill;
}
......
......@@ -61,6 +61,7 @@ struct mnt_namespace;
#define MNT_DOOMED 0x1000000
#define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment