Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull usernamespace mount fixes from Eric Biederman: "Way back in October Andrey Vagin reported that umount(MNT_DETACH) could be used to defeat MNT_LOCKED. As I worked to fix this I discovered that combined with mount propagation and an appropriate selection of shared subtrees a reference to a directory on an unmounted filesystem is not necessary. That MNT_DETACH is allowed in user namespace in a form that can break MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does. To avoid breaking existing userspace the conflict between MNT_DETACH and MNT_LOCKED is fixed by leaving mounts that are locked to their parents in the mount hash table until the last reference goes away. While investigating this issue I also found an issue with __detach_mounts. The code was unnecessarily and incorrectly triggering mount propagation. Resulting in too many mounts going away when a directory is deleted, and too many cpu cycles are burned while doing that. Looking some more I realized that __detach_mounts by only keeping mounts connected that were MNT_LOCKED it had the potential to still leak information so I tweaked the code to keep everything locked together that possibly could be. This code was almost ready last cycle but Al invented fs_pin which slightly simplifies this code but required rewrites and retesting, and I have not been in top form for a while so it took me a while to get all of that done. Similiarly this pull request is late because I have been feeling absolutely miserable all week. The issue of being able to escape a bind mount has not yet been addressed, as the fixes are not yet mature" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: mnt: Update detach_mounts to leave mounts connected mnt: Fix the error check in __detach_mounts mnt: Honor MNT_LOCKED when detaching mounts fs_pin: Allow for the possibility that m_list or s_list go unused. mnt: Factor umount_mnt from umount_tree mnt: Factor out unhash_mnt from detach_mnt and umount_tree mnt: Fail collect_mounts when applied to unmounted mounts mnt: Don't propagate unmounts to locked mounts mnt: On an unmount propagate clearing of MNT_LOCKED mnt: Delay removal from the mount hash. mnt: Add MNT_UMOUNT flag mnt: In umount_tree reuse mnt_list instead of mnt_hash mnt: Don't propagate umounts in __detach_mounts mnt: Improve the umount_tree flags mnt: Use hlist_move_list in namespace_unlock

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull usernamespace mount fixes from Eric Biederman: "Way back in October Andrey Vagin reported that umount(MNT_DETACH) could be used to defeat MNT_LOCKED. As I worked to fix this I discovered that combined with mount propagation and an appropriate selection of shared subtrees a reference to a directory on an unmounted filesystem is not necessary. That MNT_DETACH is allowed in user namespace in a form that can break MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does. To avoid breaking existing userspace the conflict between MNT_DETACH and MNT_LOCKED is fixed by leaving mounts that are locked to their parents in the mount hash table until the last reference goes away. While investigating this issue I also found an issue with __detach_mounts. The code was unnecessarily and incorrectly triggering mount propagation. Resulting in too many mounts going away when a directory is deleted, and too many cpu cycles are burned while doing that. Looking some more I realized that __detach_mounts by only keeping mounts connected that were MNT_LOCKED it had the potential to still leak information so I tweaked the code to keep everything locked together that possibly could be. This code was almost ready last cycle but Al invented fs_pin which slightly simplifies this code but required rewrites and retesting, and I have not been in top form for a while so it took me a while to get all of that done. Similiarly this pull request is late because I have been feeling absolutely miserable all week. The issue of being able to escape a bind mount has not yet been addressed, as the fixes are not yet mature" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: mnt: Update detach_mounts to leave mounts connected mnt: Fix the error check in __detach_mounts mnt: Honor MNT_LOCKED when detaching mounts fs_pin: Allow for the possibility that m_list or s_list go unused. mnt: Factor umount_mnt from umount_tree mnt: Factor out unhash_mnt from detach_mnt and umount_tree mnt: Fail collect_mounts when applied to unmounted mounts mnt: Don't propagate unmounts to locked mounts mnt: On an unmount propagate clearing of MNT_LOCKED mnt: Delay removal from the mount hash. mnt: Add MNT_UMOUNT flag mnt: In umount_tree reuse mnt_list instead of mnt_hash mnt: Don't propagate umounts in __detach_mounts mnt: Improve the umount_tree flags mnt: Use hlist_move_list in namespace_unlock
8f502d5b · Linus Torvalds · 06a60dec · e0c9c0af · 8f502d5b · 8f502d5b
Commit 8f502d5b authored Apr 18, 2015 by Linus Torvalds
6 changed files
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
 void pin_remove(struct fs_pin *pin)
 {
 	spin_lock(&pin_lock);
-	hlist_del(&pin->m_list);
-	hlist_del(&pin->s_list);
+	hlist_del_init(&pin->m_list);
+	hlist_del_init(&pin->s_list);
 	spin_unlock(&pin_lock);
 	spin_lock_irq(&pin->wait.lock);
 	pin->done = 1;

--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 */
 struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
 {
-	struct mount *p, *res;
-	res = p = __lookup_mnt(mnt, dentry);
+	struct mount *p, *res = NULL;
+	p = __lookup_mnt(mnt, dentry);
 	if (!p)
 		goto out;
+	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+		res = p;
 	hlist_for_each_entry_continue(p, mnt_hash) {
 		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
 			break;
-		res = p;
+		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+			res = p;
 	}
 out:
 	return res;
@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 /*
 * vfsmount lock must be held for write
 */
-static void detach_mnt(struct mount *mnt, struct path *old_path)
+static void unhash_mnt(struct mount *mnt)
 {
-	old_path->dentry = mnt->mnt_mountpoint;
-	old_path->mnt = &mnt->mnt_parent->mnt;
 	mnt->mnt_parent = mnt;
 	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 	list_del_init(&mnt->mnt_child);
@@ -808,6 +809,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
 	mnt->mnt_mp = NULL;
 }

+/*
+ * vfsmount lock must be held for write
+ */
+static void detach_mnt(struct mount *mnt, struct path *old_path)
+{
+	old_path->dentry = mnt->mnt_mountpoint;
+	old_path->mnt = &mnt->mnt_parent->mnt;
+	unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void umount_mnt(struct mount *mnt)
+{
+	/* old mountpoint will be dropped when we can do that */
+	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
+	unhash_mnt(mnt);
+}
+
 /*
 * vfsmount lock must be held for write
 */
@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
 	rcu_read_unlock();

 	list_del(&mnt->mnt_instance);
+
+	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
+		struct mount *p, *tmp;
+		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
+			umount_mnt(p);
+		}
+	}
 	unlock_mount_hash();

 	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted);	/* protected by namespace_sem */

 static void namespace_unlock(void)
 {
-	struct hlist_head head = unmounted;
+	struct hlist_head head;

-	if (likely(hlist_empty(&head))) {
-		up_write(&namespace_sem);
-		return;
-	}
+	hlist_move_list(&unmounted, &head);

-	head.first->pprev = &head.first;
-	INIT_HLIST_HEAD(&unmounted);
 	up_write(&namespace_sem);

+	if (likely(hlist_empty(&head)))
+		return;
+
 	synchronize_rcu();

 	group_pin_kill(&head);
@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
 	down_write(&namespace_sem);
 }

+enum umount_tree_flags {
+	UMOUNT_SYNC = 1,
+	UMOUNT_PROPAGATE = 2,
+	UMOUNT_CONNECTED = 4,
+};
 /*
 * mount_lock must be held
 * namespace_sem must be held for write
- * how = 0 => just this tree, don't propagate
- * how = 1 => propagate; we know that nobody else has reference to any victims
- * how = 2 => lazy umount
 */
-void umount_tree(struct mount *mnt, int how)
+static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 {
-	HLIST_HEAD(tmp_list);
+	LIST_HEAD(tmp_list);
 	struct mount *p;

+	if (how & UMOUNT_PROPAGATE)
+		propagate_mount_unlock(mnt);
+
+	/* Gather the mounts to umount */
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
-		hlist_del_init_rcu(&p->mnt_hash);
-		hlist_add_head(&p->mnt_hash, &tmp_list);
+		p->mnt.mnt_flags |= MNT_UMOUNT;
+		list_move(&p->mnt_list, &tmp_list);
 	}

-	hlist_for_each_entry(p, &tmp_list, mnt_hash)
+	/* Hide the mounts from mnt_mounts */
+	list_for_each_entry(p, &tmp_list, mnt_list) {
 		list_del_init(&p->mnt_child);
+	}

-	if (how)
+	/* Add propogated mounts to the tmp_list */
+	if (how & UMOUNT_PROPAGATE)
 		propagate_umount(&tmp_list);

-	while (!hlist_empty(&tmp_list)) {
-		p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
-		hlist_del_init_rcu(&p->mnt_hash);
+	while (!list_empty(&tmp_list)) {
+		bool disconnect;
+		p = list_first_entry(&tmp_list, struct mount, mnt_list);
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
 		__touch_mnt_namespace(p->mnt_ns);
 		p->mnt_ns = NULL;
-		if (how < 2)
+		if (how & UMOUNT_SYNC)
 			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

-		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
+		disconnect = !(((how & UMOUNT_CONNECTED) &&
+				mnt_has_parent(p) &&
+				(p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
+			       IS_MNT_LOCKED_AND_LAZY(p));
+
+		pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+				 disconnect ? &unmounted : NULL);
 		if (mnt_has_parent(p)) {
-			hlist_del_init(&p->mnt_mp_list);
-			put_mountpoint(p->mnt_mp);
 			mnt_add_count(p->mnt_parent, -1);
-			/* old mountpoint will be dropped when we can do that */
-			p->mnt_ex_mountpoint = p->mnt_mountpoint;
-			p->mnt_mountpoint = p->mnt.mnt_root;
-			p->mnt_parent = p;
-			p->mnt_mp = NULL;
+			if (!disconnect) {
+				/* Don't forget about p */
+				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+			} else {
+				umount_mnt(p);
+			}
 		}
 		change_mnt_propagation(p, MS_PRIVATE);
 	}
@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)

 	if (flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
-			umount_tree(mnt, 2);
+			umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
 			if (!list_empty(&mnt->mnt_list))
-				umount_tree(mnt, 1);
+				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
 		}
 	}
@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)

 	namespace_lock();
 	mp = lookup_mountpoint(dentry);
-	if (!mp)
+	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;

 	lock_mount_hash();
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
-		umount_tree(mnt, 2);
+		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
+			struct mount *p, *tmp;
+			list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
+				hlist_add_head(&p->mnt_umount.s_list, &unmounted);
+				umount_mnt(p);
+			}
+		}
+		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
 	unlock_mount_hash();
 	put_mountpoint(mp);
@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 out:
 	if (res) {
 		lock_mount_hash();
-		umount_tree(res, 0);
+		umount_tree(res, UMOUNT_SYNC);
 		unlock_mount_hash();
 	}
 	return q;
@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
 {
 	struct mount *tree;
 	namespace_lock();
-	tree = copy_tree(real_mount(path->mnt), path->dentry,
-			 CL_COPY_ALL | CL_PRIVATE);
+	if (!check_mnt(real_mount(path->mnt)))
+		tree = ERR_PTR(-EINVAL);
+	else
+		tree = copy_tree(real_mount(path->mnt), path->dentry,
+				 CL_COPY_ALL | CL_PRIVATE);
 	namespace_unlock();
 	if (IS_ERR(tree))
 		return ERR_CAST(tree);
@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
 	namespace_lock();
 	lock_mount_hash();
-	umount_tree(real_mount(mnt), 0);
+	umount_tree(real_mount(mnt), UMOUNT_SYNC);
 	unlock_mount_hash();
 	namespace_unlock();
 }
@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 out_cleanup_ids:
 	while (!hlist_empty(&tree_list)) {
 		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
-		umount_tree(child, 0);
+		umount_tree(child, UMOUNT_SYNC);
 	}
 	unlock_mount_hash();
 	cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
 	err = graft_tree(mnt, parent, mp);
 	if (err) {
 		lock_mount_hash();
-		umount_tree(mnt, 0);
+		umount_tree(mnt, UMOUNT_SYNC);
 		unlock_mount_hash();
 	}
 out2:
@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	while (!list_empty(&graveyard)) {
 		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
 		touch_mnt_namespace(mnt->mnt_ns);
-		umount_tree(mnt, 1);
+		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 	}
 	unlock_mount_hash();
 	namespace_unlock();
@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
 			m = list_first_entry(&graveyard, struct mount,
 						mnt_expire);
 			touch_mnt_namespace(m->mnt_ns);
-			umount_tree(m, 1);
+			umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 		}
 	}
 }

--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -361,6 +361,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 	return ret;
 }

+/*
+ * Clear MNT_LOCKED when it can be shown to be safe.
+ *
+ * mount_lock lock must be held for write
+ */
+void propagate_mount_unlock(struct mount *mnt)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m, *child;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		if (child)
+			child->mnt.mnt_flags &= ~MNT_LOCKED;
+	}
+}
+
+/*
+ * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
+ */
+static void mark_umount_candidates(struct mount *mnt)
+{
+	struct mount *parent = mnt->mnt_parent;
+	struct mount *m;
+
+	BUG_ON(parent == mnt);
+
+	for (m = propagation_next(parent, parent); m;
+			m = propagation_next(m, parent)) {
+		struct mount *child = __lookup_mnt_last(&m->mnt,
+						mnt->mnt_mountpoint);
+		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+			SET_MNT_MARK(child);
+		}
+	}
+}
+
 /*
 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
 * parent propagates to.
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
 		struct mount *child = __lookup_mnt_last(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
-		 * umount the child only if the child has no
-		 * other children
+		 * umount the child only if the child has no children
+		 * and the child is marked safe to unmount.
 		 */
-		if (child && list_empty(&child->mnt_mounts)) {
+		if (!child || !IS_MNT_MARKED(child))
+			continue;
+		CLEAR_MNT_MARK(child);
+		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
-			hlist_del_init_rcu(&child->mnt_hash);
-			hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
+			child->mnt.mnt_flags |= MNT_UMOUNT;
+			list_move_tail(&child->mnt_list, &mnt->mnt_list);
 		}
 	}
 }
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
 *
 * vfsmount lock must be held for write
 */
-int propagate_umount(struct hlist_head *list)
+int propagate_umount(struct list_head *list)
 {
 	struct mount *mnt;

-	hlist_for_each_entry(mnt, list, mnt_hash)
+	list_for_each_entry_reverse(mnt, list, mnt_list)
+		mark_umount_candidates(mnt);
+
+	list_for_each_entry(mnt, list, mnt_list)
 		__propagate_umount(mnt);
 	return 0;
 }
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,6 +19,9 @@
 #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
 #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
 #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
+#define IS_MNT_LOCKED_AND_LAZY(m) \
+	(((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)

 #define CL_EXPIRE    		0x01
 #define CL_SLAVE     		0x02
@@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct hlist_head *);
+int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
+void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
-void umount_tree(struct mount *, int);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);

--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -13,6 +13,8 @@ struct vfsmount;
 static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
 {
 	init_waitqueue_head(&p->wait);
+	INIT_HLIST_NODE(&p->s_list);
+	INIT_HLIST_NODE(&p->m_list);
 	p->kill = kill;
 }


--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -61,6 +61,7 @@ struct mnt_namespace;
 #define MNT_DOOMED		0x1000000
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
+#define MNT_UMOUNT		0x8000000

 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */