Commit b028161f authored by Linus Torvalds

Merge branch 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup changes from Tejun Heo:
 "This pull request contains the following changes.

   - cgroup_subsys_state (css) reference counting has been converted to
     percpu-ref.  A css is what each resource controller embeds into its
     own control structure and performs reference counting against.  It
     may be used in hot paths of various subsystems and is similar to
     module refcnt in that respect.  For example, block-cgroup's css
     refcnting was showing up a lot in Mikulas's device-mapper
     scalability work, and this conversion should alleviate it.

   - The cgroup subtree iterator has been updated so that the RCU read
     lock can be released after grabbing a reference.  This simplifies
     users that need to block, which previously had to build an
     iteration list under the RCU read lock and then traverse it
     outside.  This pull request contains the simplification of cgroup
     core and device-cgroup; a separate pull request will update cpuset.

   - Fixes for various bugs including corner-case race conditions and
     RCU usage bugs.

   - A lot of cleanups and some preparatory work for the planned unified
     hierarchy support."
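
For context on the first item: percpu-ref keeps get/put on per-cpu counters until the owner "kills" the ref, at which point it falls back to an atomic count and invokes a release callback on the final put. A minimal lifecycle sketch, with all object and function names hypothetical:

	#include <linux/percpu-refcount.h>

	static struct percpu_ref my_ref;	/* hypothetical object */

	/* called once, after percpu_ref_kill(), when the count reaches zero */
	static void my_ref_release(struct percpu_ref *ref)
	{
		/* free the structure embedding @ref */
	}

	static int my_setup(void)
	{
		/* starts with one reference; get/put stay percpu until kill */
		return percpu_ref_init(&my_ref, my_ref_release);
	}

	static void my_teardown(void)
	{
		/* switch to atomic mode and drop the initial reference */
		percpu_ref_kill(&my_ref);
	}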

* 'for-3.11' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (48 commits)
  cgroup: CGRP_ROOT_SUBSYS_BOUND should also be ignored when mounting an existing hierarchy
  cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount options
  cgroup: fix deadlock on cgroup_mutex via drop_parsed_module_refcounts()
  cgroup: always use RCU accessors for protected accesses
  cgroup: fix RCU accesses around task->cgroups
  cgroup: fix RCU accesses to task->cgroups
  cgroup: grab cgroup_mutex in drop_parsed_module_refcounts()
  cgroup: fix cgroupfs_root early destruction path
  cgroup: reserve ID 0 for dummy_root and 1 for unified hierarchy
  cgroup: implement for_each_[builtin_]subsys()
  cgroup: move init_css_set initialization inside cgroup_mutex
  cgroup: s/for_each_subsys()/for_each_root_subsys()/
  cgroup: clean up find_css_set() and friends
  cgroup: remove cgroup->actual_subsys_mask
  cgroup: prefix global variables with "cgroup_"
  cgroup: convert CFTYPE_* flags to enums
  cgroup: rename cont to cgrp
  cgroup: clean up cgroup_serial_nr_cursor
  cgroup: convert cgroup_cft_commit() to use cgroup_for_each_descendant_pre()
  cgroup: make serial_nr_cursor available throughout cgroup.c
  ...
parents f317ff9e c7ba8287
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUPS
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -94,56 +90,52 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
-/*
- * Call css_get() to hold a reference on the css; it can be used
- * for a reference obtained via:
- * - an existing ref-counted reference to the css
- * - task->cgroups for a locked task
+/**
+ * css_get - obtain a reference on the specified css
+ * @css: target css
+ *
+ * The caller must already have a reference.
  */
 static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		percpu_ref_get(&css->refcnt);
 }
 
-/*
- * Call css_tryget() to take a reference on a css if your existing
- * (known-valid) reference isn't already ref-counted. Returns false if
- * the css has been destroyed.
+/**
+ * css_tryget - try to obtain a reference on the specified css
+ * @css: target css
+ *
+ * Obtain a reference on @css if it's alive.  The caller naturally needs
+ * to ensure that @css is accessible but doesn't have to be holding a
+ * reference on it - IOW, RCU protected access is good enough for this
+ * function.  Returns %true if a reference count was successfully
+ * obtained; %false otherwise.
  */
-extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
-/*
- * css_put() should be called to release a reference taken by
- * css_get() or css_tryget()
+/**
+ * css_put - put a css reference
+ * @css: target css
+ *
+ * Put a reference obtained via css_get() and css_tryget().
  */
-extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
 enum {
 	/* Control Group is dead */
-	CGRP_REMOVED,
+	CGRP_DEAD,
 	/*
 	 * Control Group has previously had a child cgroup or a task,
 	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
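
The kernel-doc on css_tryget() above describes the lookup discipline these helpers enable: RCU makes the css pointer safe to dereference, and css_tryget() upgrades that into a real reference. A minimal sketch of the pattern (the lookup function is hypothetical):

	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = my_rcu_protected_lookup();	/* hypothetical */
	if (css && !css_tryget(css))
		css = NULL;			/* lost the race with destruction */
	rcu_read_unlock();

	if (css) {
		/* the reference keeps css alive outside the RCU section */
		css_put(css);
	}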
@@ -169,12 +161,6 @@ struct cgroup_name {
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
-	/*
-	 * count users of this cgroup. >0 means busy, but doesn't
-	 * necessarily indicate the number of tasks in the cgroup
-	 */
-	atomic_t count;
-
 	int id;				/* ida allocated in-hierarchy ID */
 
 	/*
@@ -188,6 +174,14 @@ struct cgroup {
 	struct cgroup *parent;		/* my parent */
 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
+	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all cgroups.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr.
+	 * It's used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
 	/*
 	 * This is a copy of dentry->d_name, and it's needed because
 	 * we can't use dentry->d_name in cgroup_path().
@@ -207,13 +201,10 @@ struct cgroup {
 	struct cgroupfs_root *root;
 
 	/*
-	 * List of cg_cgroup_links pointing at css_sets with
-	 * tasks in this cgroup. Protected by css_set_lock
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
 	 */
-	struct list_head css_sets;
-
-	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
-	struct list_head cft_q_node;	/* used during cftype add/rm */
+	struct list_head cset_links;
 
 	/*
 	 * Linked list running through all cgroups that can
@@ -229,9 +220,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
@@ -269,18 +261,26 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
-	 * - memcg: use_hierarchy is on by default and the cgroup file for
-	 *   the flag is not created.
+	 * - "tasks" is removed.  Everything should be at process
+	 *   granularity.  Use "cgroup.procs" instead.
 	 *
-	 * The followings are planned changes.
+	 * - "release_agent" and "notify_on_release" are removed.
+	 *   Replacement notification mechanism will be implemented.
 	 *
-	 * - release_agent will be disallowed once replacement notification
-	 *   mechanism is implemented.
+	 * - rename(2) is disallowed.
+	 *
+	 * - memcg: use_hierarchy is on by default and the cgroup file for
+	 *   the flag is not created.
 	 */
 	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+
+	/* mount options live below bit 16 */
+	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
+
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
@@ -291,18 +291,12 @@ enum {
 struct cgroupfs_root {
 	struct super_block *sb;
 
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
+	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
 	/* A list running through the attached subsystems */
 	struct list_head subsys_list;
@@ -315,9 +309,6 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
 	/* Hierarchy-specific flags */
 	unsigned long flags;
@@ -357,11 +348,10 @@ struct css_set {
 	struct list_head tasks;
 
 	/*
-	 * List of cg_cgroup_link objects on link chains from
-	 * cgroups referenced from this css_set. Protected by
-	 * css_set_lock
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct list_head cg_links;
+	struct list_head cgrp_links;
 
 	/*
 	 * Set of subsystem states, one for each subsystem. This array
@@ -394,9 +384,11 @@ struct cgroup_map_cb {
  */
 
 /* cftype->flags */
-#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
-#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cg */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cg */
+	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
+};
 
 #define MAX_CFTYPE_NAME		64
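
Converting the CFTYPE_* flags from #defines to an enum doesn't change how control files are declared. A hedged sketch of a cftype table using one of the flags (file name and handler are hypothetical):

	static struct cftype my_files[] = {
		{
			.name = "my.stat",
			.flags = CFTYPE_NOT_ON_ROOT,	/* skip the root cgroup */
			.read_seq_string = my_read_seq,	/* hypothetical handler */
		},
		{ }	/* zero entry terminates the array */
	};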
@@ -442,13 +434,13 @@ struct cftype {
 	 * entry. The key/value pairs (and their ordering) should not
 	 * change between reboots.
 	 */
-	int (*read_map)(struct cgroup *cont, struct cftype *cft,
+	int (*read_map)(struct cgroup *cgrp, struct cftype *cft,
 			struct cgroup_map_cb *cb);
 	/*
 	 * read_seq_string() is used for outputting a simple sequence
 	 * using seqfile.
 	 */
-	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
+	int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *m);
 
 	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
@@ -538,10 +530,11 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
-int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+				    char *buf, size_t buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
@@ -646,22 +639,60 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
 	return cgrp->subsys[subsys_id];
 }
 
-/*
- * function to get the cgroup_subsys_state which allows for extra
- * rcu_dereference_check() conditions, such as locks used during the
- * cgroup_subsys::attach() methods.
+/**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * A task's css_set is RCU protected, initialized and exited while holding
+ * task_lock(), and can only be modified while holding both cgroup_mutex
+ * and task_lock() while the task is alive.  This macro verifies that the
+ * caller is inside proper critical section and returns @task's css_set.
+ *
+ * The caller can also specify additional allowed conditions via @__c, such
+ * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
+#define task_css_set_check(task, __c)					\
+	rcu_dereference_check((task)->cgroups,				\
 			      lockdep_is_held(&(task)->alloc_lock) ||	\
 			      lockdep_is_held(&cgroup_mutex) || (__c))
 #else
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#define task_css_set_check(task, __c)					\
+	rcu_dereference((task)->cgroups)
 #endif
 
+/**
+ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
+ * synchronization rules are the same as task_css_set_check().
+ */
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
+/**
+ * task_css_set - obtain a task's css_set
+ * @task: the task to obtain css_set for
+ *
+ * See task_css_set_check().
+ */
+static inline struct css_set *task_css_set(struct task_struct *task)
+{
+	return task_css_set_check(task, false);
+}
+
+/**
+ * task_subsys_state - obtain css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * See task_subsys_state_check().
+ */
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
 {
@@ -674,12 +705,14 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+struct cgroup *cgroup_next_sibling(struct cgroup *pos);
+
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose children to walk
+ * @cgrp: cgroup whose children to walk
  *
- * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
+ * Walk @cgrp's children.  Must be called under rcu_read_lock().  A child
  * cgroup which hasn't finished ->css_online() or already has finished
  * ->css_offline() may show up during traversal and it's each subsystem's
  * responsibility to verify that each @pos is alive.
@@ -687,9 +720,15 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, a cgroup which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_child(pos, cgroup)				\
-	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+#define cgroup_for_each_child(pos, cgrp)				\
+	for ((pos) = list_first_or_null_rcu(&(cgrp)->children,		\
+					    struct cgroup, sibling);	\
+	     (pos); (pos) = cgroup_next_sibling((pos)))
 
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 					  struct cgroup *cgroup);
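
The new comment above permits blocking inside the loop as long as @pos stays pinned across the RCU gap. A sketch of how a user might do that, assuming a css reference is what keeps @pos accessible (the subsystem id and work function are hypothetical):

	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_child(pos, parent) {
		if (!css_tryget(pos->subsys[my_subsys_id]))
			continue;	/* child already dying, skip it */
		rcu_read_unlock();

		my_blocking_work(pos);	/* may sleep */

		rcu_read_lock();
		css_put(pos->subsys[my_subsys_id]);
	}
	rcu_read_unlock();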
@@ -748,6 +787,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * Alternatively, a subsystem may choose to use a single global lock to
  * synchronize ->css_online() and ->css_offline() against tree-walking
  * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
 #define cgroup_for_each_descendant_pre(pos, cgroup)			\
 	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
@@ -771,7 +814,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
-	struct list_head *cg_link;
+	struct list_head *cset_link;
 	struct list_head *task;
 };
@@ -827,7 +870,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
@@ -838,8 +880,6 @@ static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
-
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
 {
...
@@ -63,9 +63,6 @@
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -99,16 +96,19 @@ static DEFINE_MUTEX(cgroup_root_mutex);
  */
 #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
 #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
-static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
 #include <linux/cgroup_subsys.h>
 };
 
 /*
- * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
- * subsystems that are otherwise unattached - it never has more than a
- * single cgroup, and all tasks are part of that cgroup.
+ * The dummy hierarchy, reserved for the subsystems that are otherwise
+ * unattached - it never has more than a single cgroup, and all tasks are
+ * part of that cgroup.
  */
-static struct cgroupfs_root rootnode;
+static struct cgroupfs_root cgroup_dummy_root;
+
+/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
+static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
 
 /*
  * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
@@ -186,18 +186,28 @@ struct cgroup_event {
 
 /* The list of hierarchy roots */
 
-static LIST_HEAD(roots);
-static int root_count;
+static LIST_HEAD(cgroup_roots);
+static int cgroup_root_count;
 
-static DEFINE_IDA(hierarchy_ida);
-static int next_hierarchy_id;
-static DEFINE_SPINLOCK(hierarchy_id_lock);
-
-/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
-#define dummytop (&rootnode.top_cgroup)
+/*
+ * Hierarchy ID allocation and mapping.  It follows the same exclusion
+ * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
+ * writes, either for reads.
+ */
+static DEFINE_IDR(cgroup_hierarchy_idr);
 
 static struct cgroup_name root_cgroup_name = { .name = "/" };
 
+/*
+ * Assign a monotonically increasing serial number to cgroups.  It
+ * guarantees cgroups with bigger numbers are newer than those with smaller
+ * numbers.  Also, as cgroups are always appended to the parent's
+ * ->children list, it guarantees that sibling cgroups are always sorted in
+ * the ascending serial number order on the list.  Protected by
+ * cgroup_mutex.
+ */
+static u64 cgroup_serial_nr_next = 1;
+
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
@@ -205,27 +215,15 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -261,16 +259,38 @@ static int notify_on_release(const struct cgroup *cgrp)
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 
-/*
- * for_each_subsys() allows you to iterate on each subsystem attached to
- * an active hierarchy
+/**
+ * for_each_subsys - iterate all loaded cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ *
+ * Should be called under cgroup_mutex.
  */
-#define for_each_subsys(_root, _ss) \
-list_for_each_entry(_ss, &_root->subsys_list, sibling)
+#define for_each_subsys(ss, i)						\
+	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
+		if (({ lockdep_assert_held(&cgroup_mutex);		\
+		       !((ss) = cgroup_subsys[i]); })) { }		\
+		else
+
+/**
+ * for_each_builtin_subsys - iterate all built-in cgroup subsystems
+ * @ss: the iteration cursor
+ * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
+ *
+ * Built-in subsystems are always present and iteration itself doesn't
+ * require any synchronization.
+ */
+#define for_each_builtin_subsys(ss, i)					\
+	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
+	     (((ss) = cgroup_subsys[i]) || true); (i)++)
 
-/* for_each_active_root() allows you to iterate across the active hierarchies */
-#define for_each_active_root(_root) \
-list_for_each_entry(_root, &roots, root_list)
+/* iterate each subsystem attached to a hierarchy */
+#define for_each_root_subsys(root, ss)					\
+	list_for_each_entry((ss), &(root)->subsys_list, sibling)
+
+/* iterate across the active hierarchies */
+#define for_each_active_root(root)					\
+	list_for_each_entry((root), &cgroup_roots, root_list)
 
 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 {
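
The `if (({ ... })) { } else` construction in the new for_each_subsys() deserves a note: the GCC statement expression asserts that cgroup_mutex is held and assigns the cursor; when the slot is NULL (a modular subsystem that isn't loaded) the condition is true and the empty `if` body skips that index, otherwise the `else` binds the caller's loop body. A minimal usage sketch:

	struct cgroup_subsys *ss;
	int i;

	/* cgroup_mutex must be held; NULL slots are skipped */
	for_each_subsys(ss, i)
		pr_info("subsys %d: %s\n", i, ss->name);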
@@ -297,7 +317,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -312,20 +332,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -336,7 +360,7 @@ struct cgrp_cset_link {
  */
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
@@ -357,10 +381,11 @@ static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
-	int i;
 	unsigned long key = 0UL;
+	struct cgroup_subsys *ss;
+	int i;
 
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
@@ -373,90 +398,83 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
  * compiled into their kernel but not actually in use */
 static int use_task_css_set_links __read_mostly;
 
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cg->refcount, -1, 1))
+	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 	write_lock(&css_set_lock);
-	if (!atomic_dec_and_test(&cg->refcount)) {
+	if (!atomic_dec_and_test(&cset->refcount)) {
 		write_unlock(&css_set_lock);
 		return;
 	}
 
 	/* This css_set is dead. unlink it and release cgroup refcounts */
-	hash_del(&cg->hlist);
+	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
 
-		/*
-		 * We may not be holding cgroup_mutex, and if cgrp->count is
-		 * dropped to 0 the cgroup can be destroyed at any time, hence
-		 * rcu_read_lock is used to keep it alive.
-		 */
-		rcu_read_lock();
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
+
+		/* @cgrp can't go away while we're holding css_set_lock */
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
-		rcu_read_unlock();
 
 		kfree(link);
 	}
 
 	write_unlock(&css_set_lock);
-	kfree_rcu(cg, rcu_head);
+	kfree_rcu(cset, rcu_head);
 }
 
 /*
  * refcounted get/put for css_set objects
  */
-static inline void get_css_set(struct css_set *cg)
+static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cg->refcount);
+	atomic_inc(&cset->refcount);
 }
 
-static inline void put_css_set(struct css_set *cg)
+static inline void put_css_set(struct css_set *cset)
 {
-	__put_css_set(cg, 0);
+	__put_css_set(cset, 0);
 }
 
-static inline void put_css_set_taskexit(struct css_set *cg)
+static inline void put_css_set_taskexit(struct css_set *cset)
 {
-	__put_css_set(cg, 1);
+	__put_css_set(cset, 1);
 }
 
-/*
+/**
  * compare_css_sets - helper function for find_existing_css_set().
- * @cg: candidate css_set being tested
- * @old_cg: existing css_set for a task
+ * @cset: candidate css_set being tested
+ * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
  * Returns true if "cg" matches "old_cg" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
-static bool compare_css_sets(struct css_set *cg,
-			     struct css_set *old_cg,
+static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *l1, *l2;
 
-	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
 		/* Not all subsystems matched */
 		return false;
 	}
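
__put_css_set() above opens with a pattern worth calling out: atomic_add_unless() handles every put except the one that might hit zero, and only that final candidate takes css_set_lock and re-checks with atomic_dec_and_test(), the rwlock analogue of atomic_dec_and_lock(). Reduced to its skeleton, with the object and lock names hypothetical:

	if (atomic_add_unless(&obj->refcount, -1, 1))
		return;			/* fast path: count was > 1 */

	write_lock(&obj_lock);
	if (!atomic_dec_and_test(&obj->refcount)) {
		write_unlock(&obj_lock);
		return;			/* raced with a concurrent get */
	}
	/* count is zero and the lock excludes new readers: safe to tear down */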
@@ -470,28 +488,28 @@ static bool compare_css_sets(struct css_set *cset,
 	 * candidates.
 	 */
-	l1 = &cg->cg_links;
-	l2 = &old_cg->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
-		struct cgroup *cg1, *cg2;
+		struct cgrp_cset_link *link1, *link2;
+		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cg->cg_links) {
-			BUG_ON(l2 != &old_cg->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cg->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cg1 = cgl1->cgrp;
-		cg2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
-		BUG_ON(cg1->root != cg2->root);
+		BUG_ON(cgrp1->root != cgrp2->root);
 
 		/*
 		 * If this hierarchy is the hierarchy of the cgroup
* hierarchy, then this css_set should point to the * hierarchy, then this css_set should point to the
* same cgroup as the old css_set. * same cgroup as the old css_set.
*/ */
if (cg1->root == new_cgrp->root) { if (cgrp1->root == new_cgrp->root) {
if (cg1 != new_cgrp) if (cgrp1 != new_cgrp)
return false; return false;
} else { } else {
if (cg1 != cg2) if (cgrp1 != cgrp2)
return false; return false;
} }
} }
return true; return true;
} }
/* /**
* find_existing_css_set() is a helper for * find_existing_css_set - init css array and find the matching css_set
* find_css_set(), and checks to see whether an existing * @old_cset: the css_set that we're using before the cgroup transition
* css_set is suitable. * @cgrp: the cgroup that we're moving into
* * @template: out param for the new set of csses, should be clear on entry
* oldcg: the cgroup group that we're using before the cgroup
* transition
*
* cgrp: the cgroup that we're moving into
*
* template: location in which to build the desired set of subsystem
* state objects for the new cgroup group
*/ */
static struct css_set *find_existing_css_set( static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct css_set *oldcg, struct cgroup *cgrp,
struct cgroup *cgrp, struct cgroup_subsys_state *template[])
struct cgroup_subsys_state *template[])
{ {
int i;
struct cgroupfs_root *root = cgrp->root; struct cgroupfs_root *root = cgrp->root;
struct css_set *cg; struct cgroup_subsys *ss;
struct css_set *cset;
unsigned long key; unsigned long key;
int i;
/* /*
* Build the set of subsystem state objects that we want to see in the * Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here * new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking. * won't change, so no need for locking.
*/ */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { for_each_subsys(ss, i) {
if (root->subsys_mask & (1UL << i)) { if (root->subsys_mask & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want /* Subsystem is in this hierarchy. So we want
* the subsystem state from the new * the subsystem state from the new
@@ -548,148 +559,152 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 		} else {
 			/* Subsystem is not in this hierarchy, so we
 			 * don't want to change the subsystem state */
-			template[i] = oldcg->subsys[i];
+			template[i] = old_cset->subsys[i];
 		}
 	}
 
 	key = css_set_hash(template);
-	hash_for_each_possible(css_set_table, cg, hlist, key) {
-		if (!compare_css_sets(cg, oldcg, cgrp, template))
+	hash_for_each_possible(css_set_table, cset, hlist, key) {
+		if (!compare_css_sets(cset, old_cset, cgrp, template))
 			continue;
 
 		/* This css_set matches what we need */
-		return cg;
+		return cset;
 	}
 
 	/* No existing cgroup group matched */
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
-		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
- * @cg: the css_set to be linked
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
+ * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cg, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cg;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cg->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
-/*
- * find_css_set() takes an existing cgroup group and a
- * cgroup object, and returns a css_set object that's
- * equivalent to the old group, but with the given cgroup
- * substituted into the appropriate hierarchy. Must be called with
- * cgroup_mutex held
+/**
+ * find_css_set - return a new css_set with one cgroup updated
+ * @old_cset: the baseline css_set
+ * @cgrp: the cgroup to be updated
+ *
+ * Return a new css_set that's equivalent to @old_cset, but with @cgrp
+ * substituted into the appropriate hierarchy.
  */
-static struct css_set *find_css_set(
-	struct css_set *oldcg, struct cgroup *cgrp)
+static struct css_set *find_css_set(struct css_set *old_cset,
+				    struct cgroup *cgrp)
 {
-	struct css_set *res;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
+	struct css_set *cset;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
+	lockdep_assert_held(&cgroup_mutex);
+
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	read_lock(&css_set_lock);
-	res = find_existing_css_set(oldcg, cgrp, template);
-	if (res)
-		get_css_set(res);
+	cset = find_existing_css_set(old_cset, cgrp, template);
+	if (cset)
+		get_css_set(cset);
 	read_unlock(&css_set_lock);
 
-	if (res)
-		return res;
+	if (cset)
+		return cset;
 
-	res = kmalloc(sizeof(*res), GFP_KERNEL);
-	if (!res)
+	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
+	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
-		kfree(res);
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
+		kfree(cset);
 		return NULL;
 	}
 
-	atomic_set(&res->refcount, 1);
-	INIT_LIST_HEAD(&res->cg_links);
-	INIT_LIST_HEAD(&res->tasks);
-	INIT_HLIST_NODE(&res->hlist);
+	atomic_set(&cset->refcount, 1);
+	INIT_LIST_HEAD(&cset->cgrp_links);
+	INIT_LIST_HEAD(&cset->tasks);
+	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
-	memcpy(res->subsys, template, sizeof(res->subsys));
+	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, res, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
 	/* Add this cgroup group to the hash table */
-	key = css_set_hash(res->subsys);
-	hash_add(css_set_table, &res->hlist, key);
+	key = css_set_hash(cset->subsys);
+	hash_add(css_set_table, &cset->hlist, key);
 
 	write_unlock(&css_set_lock);
 
-	return res;
+	return cset;
 }
 
 /*
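
find_css_set() above shows the intended calling convention for the two new helpers: preallocate one link per hierarchy root outside css_set_lock, consume them under the lock, and expect none left over. Condensed to the essentials, with error handling trimmed and variables as in the function:

	struct list_head tmp_links;

	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0)
		return NULL;			/* nothing to unwind yet */

	write_lock(&css_set_lock);
	link_css_set(&tmp_links, cset, cgrp);	/* consumes one preallocated link */
	write_unlock(&css_set_lock);

	BUG_ON(!list_empty(&tmp_links));	/* one link per hierarchy root */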
@@ -699,7 +714,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroupfs_root *root)
 {
-	struct css_set *css;
+	struct css_set *cset;
 	struct cgroup *res = NULL;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -709,13 +724,15 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	 * task can't change groups, so the only thing that can happen
 	 * is that it exits and its css is set back to init_css_set.
 	 */
-	css = task->cgroups;
-	if (css == &init_css_set) {
+	cset = task_css_set(task);
+	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &css->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -828,14 +845,14 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Release the subsystem state objects.
 	 */
-	for_each_subsys(cgrp->root, ss)
+	for_each_root_subsys(cgrp->root, ss)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
@@ -873,7 +890,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -882,7 +900,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -950,7 +968,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files,
 	struct cgroup *cgrp = __d_cgrp(dir);
 	struct cgroup_subsys *ss;
 
-	for_each_subsys(cgrp->root, ss) {
+	for_each_root_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 		if (!test_bit(ss->subsys_id, &subsys_mask))
 			continue;
@@ -988,30 +1006,23 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			     unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgroup_subsys *ss;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
-		struct cgroup_subsys *ss = subsys[i];
+
 		if (!(bit & added_mask))
 			continue;
-		/*
-		 * Nobody should tell us to do a subsys that doesn't exist:
-		 * parse_cgroupfs_options should catch that case and refcounts
-		 * ensure that subsystems won't disappear once selected.
-		 */
-		BUG_ON(ss == NULL);
-		if (ss->root != &rootnode) {
+
+		if (ss->root != &cgroup_dummy_root) {
 			/* Subsystem isn't free */
 			return -EBUSY;
 		}
@@ -1025,38 +1036,41 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		return -EBUSY;
 
 	/* Process each subsystem */
-	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-		struct cgroup_subsys *ss = subsys[i];
+	for_each_subsys(ss, i) {
 		unsigned long bit = 1UL << i;
+
 		if (bit & added_mask) {
 			/* We're binding this subsystem to this hierarchy */
-			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i]);
-			BUG_ON(!dummytop->subsys[i]);
-			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
-			cgrp->subsys[i] = dummytop->subsys[i];
+			BUG_ON(!cgroup_dummy_top->subsys[i]);
+			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
+			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
-			BUG_ON(ss == NULL);
-			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
-				ss->bind(dummytop);
-			dummytop->subsys[i]->cgroup = dummytop;
+				ss->bind(cgroup_dummy_top);
+			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
-			subsys[i]->root = &rootnode;
-			list_move(&ss->sibling, &rootnode.subsys_list);
+			cgroup_subsys[i]->root = &cgroup_dummy_root;
+			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
-			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
 			/*
 			 * a refcount was taken, but we already had one, so
@@ -1071,7 +1085,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
+
+	/*
+	 * Mark @root has finished binding subsystems.  @root->subsys_mask
+	 * now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
 
 	return 0;
 }
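
With rebind_subsystems() now taking the add/remove deltas directly instead of a final mask, computing those deltas moves to the callers. A sketch of what a caller such as remount would do, assuming opts was produced by parse_cgroupfs_options():

	unsigned long added_mask = opts.subsys_mask & ~root->subsys_mask;
	unsigned long removed_mask = root->subsys_mask & ~opts.subsys_mask;

	ret = rebind_subsystems(root, added_mask, removed_mask);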
@@ -1082,7 +1101,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
-	for_each_subsys(root, ss)
+	for_each_root_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
@@ -1114,18 +1133,19 @@ struct cgroup_sb_opts {
};
/*
-* Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
-* with cgroup_mutex held to protect the subsys[] array. This function takes
-* refcounts on subsystems to be used, unless it returns error, in which case
-* no refcounts are taken.
* Convert a hierarchy specifier into a bitmask of subsystems and
* flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
* array. This function takes refcounts on subsystems to be used, unless it
* returns error, in which case no refcounts are taken.
*/
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
char *token, *o = data;
bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
-int i;
bool module_pin_failed = false;
struct cgroup_subsys *ss;
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
@@ -1202,10 +1222,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
continue;
}
-for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-struct cgroup_subsys *ss = subsys[i];
-if (ss == NULL)
-continue;
for_each_subsys(ss, i) {
if (strcmp(token, ss->name))
continue;
if (ss->disabled)
@@ -1228,16 +1245,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
* otherwise if 'none', 'name=' and a subsystem name options
* were not specified, let's default to 'all'
*/
-if (all_ss || (!one_ss && !opts->none && !opts->name)) {
-for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-struct cgroup_subsys *ss = subsys[i];
-if (ss == NULL)
-continue;
-if (ss->disabled)
-continue;
-set_bit(i, &opts->subsys_mask);
-}
-}
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (!ss->disabled)
set_bit(i, &opts->subsys_mask);
/* Consistency checks */
@@ -1281,12 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
* take duplicate reference counts on a subsystem that's already used,
* but rebind_subsystems handles this case.
*/
-for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-unsigned long bit = 1UL << i;
-if (!(bit & opts->subsys_mask))
for_each_subsys(ss, i) {
if (!(opts->subsys_mask & (1UL << i)))
continue;
-if (!try_module_get(subsys[i]->module)) {
if (!try_module_get(cgroup_subsys[i]->module)) {
module_pin_failed = true;
break;
}
@@ -1303,7 +1312,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if (!(bit & opts->subsys_mask))
continue;
-module_put(subsys[i]->module);
module_put(cgroup_subsys[i]->module);
}
return -ENOENT;
}
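[Editor's note] The loop above pins each selected subsystem's module with try_module_get() and, on failure, the error path drops only the references already taken. A minimal userspace model of that pin/rollback idiom; all names (pin, pin_mask, NSUBSYS) are invented, and a plain counter array stands in for module refcounts:

/* model_pin.c - rollback on partial failure, modeled on the
 * try_module_get()/module_put() loop above. All names are invented. */
#include <stdbool.h>
#include <stdio.h>

#define NSUBSYS 8

static int refcnt[NSUBSYS];        /* stand-in for per-module refcounts */

static bool pin(int i)
{
	if (i == 5)                /* pretend subsystem 5 is going away */
		return false;
	refcnt[i]++;
	return true;
}

static int pin_mask(unsigned long mask)
{
	int i;

	for (i = 0; i < NSUBSYS; i++) {
		if (!(mask & (1UL << i)))
			continue;
		if (!pin(i))
			break;     /* like module_pin_failed = true */
	}
	if (i == NSUBSYS)
		return 0;
	/* unwind: drop only the refs taken before the failing slot */
	for (i--; i >= 0; i--)
		if (mask & (1UL << i))
			refcnt[i]--;
	return -1;                 /* the kernel code returns -ENOENT here */
}

int main(void)
{
	printf("pin 0x07 -> %d\n", pin_mask(0x07)); /* succeeds */
	printf("pin 0xff -> %d\n", pin_mask(0xff)); /* fails at bit 5, rolls back */
	for (int i = 0; i < NSUBSYS; i++)
		printf("refcnt[%d]=%d\n", i, refcnt[i]);
	return 0;
}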
@@ -1313,14 +1322,14 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
static void drop_parsed_module_refcounts(unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i;
-for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-unsigned long bit = 1UL << i;
-if (!(bit & subsys_mask))
-continue;
-module_put(subsys[i]->module);
-}
mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
if (subsys_mask & (1UL << i))
module_put(cgroup_subsys[i]->module);
mutex_unlock(&cgroup_mutex);
}
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
@@ -1345,7 +1354,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto out_unlock;
-if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
@@ -1353,10 +1362,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
-if (opts.flags != root->flags ||
if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
root->flags & CGRP_ROOT_OPTION_MASK, root->name);
ret = -EINVAL;
-drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
@@ -1367,11 +1378,10 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
*/
cgroup_clear_directory(cgrp->dentry, false, removed_mask);
-ret = rebind_subsystems(root, opts.subsys_mask);
ret = rebind_subsystems(root, added_mask, removed_mask);
if (ret) {
/* rebind_subsystems failed, re-populate the removed files */
cgroup_populate_dir(cgrp, false, removed_mask);
-drop_parsed_module_refcounts(opts.subsys_mask);
goto out_unlock;
}
@@ -1386,6 +1396,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
if (ret)
drop_parsed_module_refcounts(opts.subsys_mask);
return ret;
}
@@ -1401,11 +1413,9 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->sibling);
INIT_LIST_HEAD(&cgrp->children);
INIT_LIST_HEAD(&cgrp->files);
-INIT_LIST_HEAD(&cgrp->css_sets);
INIT_LIST_HEAD(&cgrp->cset_links);
-INIT_LIST_HEAD(&cgrp->allcg_node);
INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
-INIT_WORK(&cgrp->free_work, cgroup_free_fn);
mutex_init(&cgrp->pidlist_mutex);
INIT_LIST_HEAD(&cgrp->event_list);
spin_lock_init(&cgrp->event_list_lock);
@@ -1418,37 +1428,37 @@ static void init_cgroup_root(struct cgroupfs_root *root)
INIT_LIST_HEAD(&root->subsys_list);
INIT_LIST_HEAD(&root->root_list);
-INIT_LIST_HEAD(&root->allcg_list);
root->number_of_cgroups = 1;
cgrp->root = root;
-cgrp->name = &root_cgroup_name;
RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
init_cgroup_housekeeping(cgrp);
-list_add_tail(&cgrp->allcg_node, &root->allcg_list);
}
-static bool init_root_id(struct cgroupfs_root *root)
static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
{
-int ret = 0;
-do {
-if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
-return false;
-spin_lock(&hierarchy_id_lock);
-/* Try to allocate the next unused ID */
-ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
-&root->hierarchy_id);
-if (ret == -ENOSPC)
-/* Try again starting from 0 */
-ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
-if (!ret) {
-next_hierarchy_id = root->hierarchy_id + 1;
-} else if (ret != -EAGAIN) {
-/* Can only get here if the 31-bit IDR is full ... */
-BUG_ON(ret);
-}
-spin_unlock(&hierarchy_id_lock);
-} while (ret);
-return true;
int id;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&cgroup_root_mutex);
id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
GFP_KERNEL);
if (id < 0)
return id;
root->hierarchy_id = id;
return 0;
}
static void cgroup_exit_root_id(struct cgroupfs_root *root)
{
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&cgroup_root_mutex);
if (root->hierarchy_id) {
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
root->hierarchy_id = 0;
}
}
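[Editor's note] idr_alloc_cyclic() hands out the lowest free ID at or after the last allocation, wrapping back to @start when it runs out, and @end == 0 means "no upper bound"; cgroup_mount() below calls it with start == 2 so IDs 0 and 1 stay reserved. A toy userspace model of the cyclic behaviour, with invented names and a fixed-size table in place of the real radix-tree IDR:

/* model_cyclic.c - toy model of cyclic ID allocation a la idr_alloc_cyclic().
 * All names are invented; the real IDR is a radix tree, not an array. */
#include <stdio.h>

#define MAX_ID 8

static void *slot[MAX_ID];
static int next_hint;              /* where the next search starts */

static int alloc_cyclic(void *ptr, int start)
{
	int base = next_hint > start ? next_hint : start;

	/* scan [base, MAX_ID), then wrap around to [start, base) */
	for (int n = 0; n < MAX_ID - start; n++) {
		int id = start + (base - start + n) % (MAX_ID - start);
		if (!slot[id]) {
			slot[id] = ptr;
			next_hint = id + 1;
			return id;
		}
	}
	return -1;                 /* table full (kernel: -ENOSPC) */
}

int main(void)
{
	int dummy;

	for (int i = 0; i < 3; i++)
		printf("got id %d\n", alloc_cyclic(&dummy, 2));  /* 2, 3, 4 */
	slot[2] = NULL;            /* free ID 2 */
	/* cyclic: the freed ID 2 is not reused until the space wraps */
	printf("got id %d\n", alloc_cyclic(&dummy, 2));          /* 5 */
	return 0;
}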
static int cgroup_test_super(struct super_block *sb, void *data)
@@ -1482,12 +1492,16 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
if (!root)
return ERR_PTR(-ENOMEM);
-if (!init_root_id(root)) {
-kfree(root);
-return ERR_PTR(-ENOMEM);
-}
init_cgroup_root(root);
/*
* We need to set @root->subsys_mask now so that @root can be
* matched by cgroup_test_super() before it finishes
* initialization; otherwise, competing mounts with the same
* options may try to bind the same subsystems instead of waiting
* for the first one leading to unexpected mount errors.
* SUBSYS_BOUND will be set once actual binding is complete.
*/
root->subsys_mask = opts->subsys_mask;
root->flags = opts->flags;
ida_init(&root->cgroup_ida);
@@ -1500,17 +1514,15 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
return root;
}
-static void cgroup_drop_root(struct cgroupfs_root *root)
static void cgroup_free_root(struct cgroupfs_root *root)
{
-if (!root)
-return;
-BUG_ON(!root->hierarchy_id);
-spin_lock(&hierarchy_id_lock);
-ida_remove(&hierarchy_ida, root->hierarchy_id);
-spin_unlock(&hierarchy_id_lock);
-ida_destroy(&root->cgroup_ida);
-kfree(root);
if (root) {
/* hierarchy ID should already have been released */
WARN_ON_ONCE(root->hierarchy_id);
ida_destroy(&root->cgroup_ida);
kfree(root);
}
}
static int cgroup_set_super(struct super_block *sb, void *data)
@@ -1597,7 +1609,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
-cgroup_drop_root(opts.new_root);
cgroup_free_root(opts.new_root);
goto drop_modules;
}
@@ -1605,12 +1617,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
BUG_ON(!root);
if (root == opts.new_root) {
/* We used the new root structure, so this is a new hierarchy */
-struct list_head tmp_cg_links;
struct list_head tmp_links;
struct cgroup *root_cgrp = &root->top_cgroup;
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
-struct css_set *cg;
struct css_set *cset;
BUG_ON(sb->s_root != NULL);
@@ -1637,13 +1649,18 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* that's us. The worst that can happen is that we
* have some link structures left over
*/
-ret = allocate_cg_links(css_set_count, &tmp_cg_links);
ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
goto unlock_drop;
-ret = rebind_subsystems(root, root->subsys_mask);
/* ID 0 is reserved for dummy root, 1 for unified hierarchy */
ret = cgroup_init_root_id(root, 2, 0);
if (ret)
goto unlock_drop;
ret = rebind_subsystems(root, root->subsys_mask, 0);
if (ret == -EBUSY) {
-free_cg_links(&tmp_cg_links);
free_cgrp_cset_links(&tmp_links);
goto unlock_drop;
}
/*
@@ -1655,8 +1672,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* EBUSY should be the only error here */
BUG_ON(ret);
-list_add(&root->root_list, &roots);
list_add(&root->root_list, &cgroup_roots);
-root_count++;
cgroup_root_count++;
sb->s_root->d_fsdata = root_cgrp;
root->top_cgroup.dentry = sb->s_root;
@@ -1664,11 +1681,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* Link the top cgroup in this hierarchy into all
* the css_set objects */
write_lock(&css_set_lock);
-hash_for_each(css_set_table, i, cg, hlist)
hash_for_each(css_set_table, i, cset, hlist)
-link_css_set(&tmp_cg_links, cg, root_cgrp);
link_css_set(&tmp_links, cset, root_cgrp);
write_unlock(&css_set_lock);
-free_cg_links(&tmp_cg_links);
free_cgrp_cset_links(&tmp_links);
BUG_ON(!list_empty(&root_cgrp->children));
BUG_ON(root->number_of_cgroups != 1);
@@ -1684,9 +1701,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* We re-used an existing hierarchy - the new root (if
* any) is not needed
*/
-cgroup_drop_root(opts.new_root);
cgroup_free_root(opts.new_root);
-if (root->flags != opts.flags) {
if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
ret = -EINVAL;
@@ -1705,6 +1722,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
return dget(sb->s_root);
unlock_drop:
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
@@ -1721,9 +1739,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
static void cgroup_kill_sb(struct super_block *sb) {
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
struct cgrp_cset_link *link, *tmp_link;
int ret;
-struct cg_cgroup_link *link;
-struct cg_cgroup_link *saved_link;
BUG_ON(!root);
@@ -1734,36 +1751,39 @@ static void cgroup_kill_sb(struct super_block *sb) {
mutex_lock(&cgroup_root_mutex);
/* Rebind all subsystems back to the default hierarchy */
-ret = rebind_subsystems(root, 0);
-/* Shouldn't be able to fail ... */
-BUG_ON(ret);
if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
ret = rebind_subsystems(root, 0, root->subsys_mask);
/* Shouldn't be able to fail ... */
BUG_ON(ret);
}
/*
-* Release all the links from css_sets to this hierarchy's
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
write_lock(&css_set_lock);
-list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-cgrp_link_list) {
-list_del(&link->cg_link_list);
-list_del(&link->cgrp_link_list);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
write_unlock(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
-root_count--;
cgroup_root_count--;
}
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
simple_xattrs_free(&cgrp->xattrs);
kill_litter_super(sb);
-cgroup_drop_root(root);
cgroup_free_root(root);
}
static struct file_system_type cgroup_fs_type = {
@@ -1825,6 +1845,38 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
/**
* task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
* @task: target task
* @hierarchy_id: the hierarchy to look up @task's cgroup from
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
* Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
* copy its path into @buf. This function grabs cgroup_mutex and shouldn't
* be used inside locks used by cgroup controller callbacks.
*/
int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
char *buf, size_t buflen)
{
struct cgroupfs_root *root;
struct cgroup *cgrp = NULL;
int ret = -ENOENT;
mutex_lock(&cgroup_mutex);
root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
ret = cgroup_path(cgrp, buf, buflen);
}
mutex_unlock(&cgroup_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
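[Editor's note] A hypothetical caller sketch for the new export, in kernel style (not a standalone program; report_task_cgroup and its pr_info text are invented). The one real constraint, per the comment above, is that the function takes cgroup_mutex itself, so it must not be called with that mutex or controller-callback locks held:

/* Hypothetical caller - kernel context only, names invented. */
static void report_task_cgroup(struct task_struct *task, int hierarchy_id)
{
	char buf[256];
	int ret;

	/* grabs cgroup_mutex internally: must not be called under it */
	ret = task_cgroup_path_from_hierarchy(task, hierarchy_id,
					      buf, sizeof(buf));
	if (!ret)
		pr_info("pid %d is in %s\n", task_pid_nr(task), buf);
	else
		pr_info("hierarchy %d not found (%d)\n", hierarchy_id, ret);
}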
/*
* Control Group taskset
*/
@@ -1910,10 +1962,11 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
*
* Must be called with cgroup_mutex and threadgroup locked.
*/
-static void cgroup_task_migrate(struct cgroup *oldcgrp,
-struct task_struct *tsk, struct css_set *newcg)
static void cgroup_task_migrate(struct cgroup *old_cgrp,
struct task_struct *tsk,
struct css_set *new_cset)
{
-struct css_set *oldcg;
struct css_set *old_cset;
/*
* We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1921,25 +1974,25 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp,
* css_set to init_css_set and dropping the old one.
*/
WARN_ON_ONCE(tsk->flags & PF_EXITING);
-oldcg = tsk->cgroups;
old_cset = task_css_set(tsk);
task_lock(tsk);
-rcu_assign_pointer(tsk->cgroups, newcg);
rcu_assign_pointer(tsk->cgroups, new_cset);
task_unlock(tsk);
/* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
-list_move(&tsk->cg_list, &newcg->tasks);
list_move(&tsk->cg_list, &new_cset->tasks);
write_unlock(&css_set_lock);
/*
-* We just gained a reference on oldcg by taking it from the task. As
-* trading it for newcg is protected by cgroup_mutex, we're safe to drop
-* it here; it will be freed under RCU.
* We just gained a reference on old_cset by taking it from the
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
-set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
-put_css_set(oldcg);
put_css_set(old_cset);
}
/**
@@ -2029,7 +2082,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(cgrp, &tset);
if (retval) {
@@ -2044,8 +2097,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
* we use find_css_set, which allocates a new one if necessary.
*/
for (i = 0; i < group_size; i++) {
struct css_set *old_cset;
tc = flex_array_get(group, i);
-tc->cg = find_css_set(tc->task->cgroups, cgrp);
old_cset = task_css_set(tc->task);
tc->cg = find_css_set(old_cset, cgrp);
if (!tc->cg) {
retval = -ENOMEM;
goto out_put_css_set_refs;
@@ -2066,7 +2122,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/*
* step 4: do subsystem attach callbacks.
*/
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
if (ss->attach)
ss->attach(cgrp, &tset);
}
@@ -2086,7 +2142,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
}
out_cancel_attach:
if (retval) {
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
if (ss == failed_ss)
break;
if (ss->cancel_attach)
@@ -2323,7 +2379,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-if (cgroup_is_removed(cgrp))
if (cgroup_is_dead(cgrp))
return -ENODEV;
if (cft->write)
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2368,7 +2424,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
struct cftype *cft = __d_cft(file->f_dentry);
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-if (cgroup_is_removed(cgrp))
if (cgroup_is_dead(cgrp))
return -ENODEV;
if (cft->read)
@@ -2435,10 +2491,12 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
cft = __d_cft(file->f_dentry);
if (cft->read_map || cft->read_seq_string) {
-struct cgroup_seqfile_state *state =
-kzalloc(sizeof(*state), GFP_USER);
struct cgroup_seqfile_state *state;
state = kzalloc(sizeof(*state), GFP_USER);
if (!state)
return -ENOMEM;
state->cft = cft;
state->cgroup = __d_cgrp(file->f_dentry->d_parent);
file->f_op = &cgroup_seqfile_operations;
@@ -2486,6 +2544,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
cgrp = __d_cgrp(old_dentry);
/*
* This isn't a proper migration and its usefulness is very
* limited. Disallow if sane_behavior.
*/
if (cgroup_sane_behavior(cgrp))
return -EPERM;
name = cgroup_alloc_name(new_dentry);
if (!name)
return -ENOMEM;
@@ -2496,7 +2561,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
return ret;
}
-old_name = cgrp->name;
old_name = rcu_dereference_protected(cgrp->name, true);
rcu_assign_pointer(cgrp->name, name);
kfree_rcu(old_name, rcu_head);
@@ -2747,58 +2812,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
return ret;
}
-static DEFINE_MUTEX(cgroup_cft_mutex);
static void cgroup_cfts_prepare(void)
-__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
__acquires(&cgroup_mutex)
{
/*
* Thanks to the entanglement with vfs inode locking, we can't walk
* the existing cgroups under cgroup_mutex and create files.
-* Instead, we increment reference on all cgroups and build list of
-* them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure
-* exclusive access to the field.
* Instead, we use cgroup_for_each_descendant_pre() and drop RCU
* read lock before calling cgroup_addrm_files().
*/
-mutex_lock(&cgroup_cft_mutex);
mutex_lock(&cgroup_mutex);
}
static void cgroup_cfts_commit(struct cgroup_subsys *ss,
struct cftype *cfts, bool is_add)
-__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
__releases(&cgroup_mutex)
{
LIST_HEAD(pending);
-struct cgroup *cgrp, *n;
struct cgroup *cgrp, *root = &ss->root->top_cgroup;
struct super_block *sb = ss->root->sb;
struct dentry *prev = NULL;
struct inode *inode;
u64 update_before;
/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
-if (cfts && ss->root != &rootnode) {
-list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
-dget(cgrp->dentry);
-list_add_tail(&cgrp->cft_q_node, &pending);
-}
if (!cfts || ss->root == &cgroup_dummy_root ||
!atomic_inc_not_zero(&sb->s_active)) {
mutex_unlock(&cgroup_mutex);
return;
}
-mutex_unlock(&cgroup_mutex);
/*
-* All new cgroups will see @cfts update on @ss->cftsets. Add/rm
-* files for all cgroups which were created before.
* All cgroups which are created after we drop cgroup_mutex will
* have the updated set of files, so we only need to update the
* cgroups created before the current @cgroup_serial_nr_next.
*/
-list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
-struct inode *inode = cgrp->dentry->d_inode;
update_before = cgroup_serial_nr_next;
mutex_unlock(&cgroup_mutex);
/* @root always needs to be updated */
inode = root->dentry->d_inode;
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
cgroup_addrm_files(root, ss, cfts, is_add);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
/* add/rm files for all cgroups created before */
rcu_read_lock();
cgroup_for_each_descendant_pre(cgrp, root) {
if (cgroup_is_dead(cgrp))
continue;
inode = cgrp->dentry->d_inode;
dget(cgrp->dentry);
rcu_read_unlock();
dput(prev);
prev = cgrp->dentry;
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
-if (!cgroup_is_removed(cgrp))
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
cgroup_addrm_files(cgrp, ss, cfts, is_add);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
-list_del_init(&cgrp->cft_q_node);
-dput(cgrp->dentry);
rcu_read_lock();
}
rcu_read_unlock();
-mutex_unlock(&cgroup_cft_mutex);
dput(prev);
deactivate_super(sb);
}
/**
@@ -2853,7 +2938,8 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
list_for_each_entry(set, &ss->cftsets, node) {
if (set->cfts == cfts) {
-list_del_init(&set->node);
list_del(&set->node);
kfree(set);
cgroup_cfts_commit(ss, cfts, false);
return 0;
}
@@ -2872,12 +2958,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
-struct cg_cgroup_link *link;
struct cgrp_cset_link *link;
read_lock(&css_set_lock);
-list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-count += atomic_read(&link->cg->refcount);
-}
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
read_unlock(&css_set_lock);
return count;
}
@@ -2886,25 +2971,24 @@ int cgroup_task_count(const struct cgroup *cgrp)
* Advance a list_head iterator. The iterator should be positioned at
* the start of a css_set
*/
-static void cgroup_advance_iter(struct cgroup *cgrp,
-struct cgroup_iter *it)
static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
{
-struct list_head *l = it->cg_link;
struct list_head *l = it->cset_link;
-struct cg_cgroup_link *link;
struct cgrp_cset_link *link;
-struct css_set *cg;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
-if (l == &cgrp->css_sets) {
if (l == &cgrp->cset_links) {
-it->cg_link = NULL;
it->cset_link = NULL;
return;
}
-link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
link = list_entry(l, struct cgrp_cset_link, cset_link);
-cg = link->cg;
cset = link->cset;
-} while (list_empty(&cg->tasks));
} while (list_empty(&cset->tasks));
-it->cg_link = l;
it->cset_link = l;
-it->task = cg->tasks.next;
it->task = cset->tasks.next;
}
/*
@@ -2934,13 +3018,63 @@ static void cgroup_enable_task_cg_lists(void)
* entry won't be deleted though the process has exited.
*/
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
-list_add(&p->cg_list, &p->cgroups->tasks);
list_add(&p->cg_list, &task_css_set(p)->tasks);
task_unlock(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
write_unlock(&css_set_lock);
}
/**
* cgroup_next_sibling - find the next sibling of a given cgroup
* @pos: the current cgroup
*
* This function returns the next sibling of @pos and should be called
* under RCU read lock. The only requirement is that @pos is accessible.
* The next sibling is guaranteed to be returned regardless of @pos's
* state.
*/
struct cgroup *cgroup_next_sibling(struct cgroup *pos)
{
struct cgroup *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/*
* @pos could already have been removed. Once a cgroup is removed,
* its ->sibling.next is no longer updated when its next sibling
* changes. As CGRP_DEAD assertion is serialized and happens
* before the cgroup is taken off the ->sibling list, if we see it
* unasserted, it's guaranteed that the next sibling hasn't
* finished its grace period even if it's already removed, and thus
* safe to dereference from this RCU critical section. If
* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
* to be visible as %true here.
*/
if (likely(!cgroup_is_dead(pos))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
if (&next->sibling != &pos->parent->children)
return next;
return NULL;
}
/*
* Can't dereference the next pointer. Each cgroup is given a
* monotonically increasing unique serial number and always
* appended to the sibling list, so the next one can be found by
* walking the parent's children until we see a cgroup with higher
* serial number than @pos's.
*
* While this path can be slow, it's taken only when either the
* current cgroup is removed or iteration and removal race.
*/
list_for_each_entry_rcu(next, &pos->parent->children, sibling)
if (next->serial_nr > pos->serial_nr)
return next;
return NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_sibling);
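[Editor's note] The slow path above leans on two invariants: serial numbers increase monotonically and new cgroups are always appended to the sibling list, so scanning the parent's children for the first serial number greater than @pos's finds the successor even after @pos has been unlinked. A minimal userspace model of that resume-after-removal trick, with invented names and plain lists in place of RCU:

/* model_resume.c - resuming a sibling walk after the cursor was removed,
 * modeled on cgroup_next_sibling()'s serial-number fallback. Names invented. */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int serial;             /* monotonically increasing, append-only */
	int dead;               /* set before unlinking, like CGRP_DEAD */
	struct node *next;      /* sibling list (singly linked for brevity) */
};

static struct node *head;
static int next_serial = 1;

static struct node *add_node(void)
{
	struct node *n = calloc(1, sizeof(*n)), **p = &head;

	n->serial = next_serial++;
	while (*p)
		p = &(*p)->next;        /* always append at the tail */
	*p = n;
	return n;
}

static struct node *next_sibling(struct node *pos)
{
	struct node *n;

	if (!pos->dead)
		return pos->next;       /* fast path: cursor still linked */
	/* slow path: first node with a higher serial number */
	for (n = head; n; n = n->next)
		if (n->serial > pos->serial)
			return n;
	return NULL;
}

int main(void)
{
	struct node *a = add_node(), *b = add_node(), *c = add_node();

	(void)a;                        /* serials 1, 2, 3 */
	/* remove b mid-walk: mark it dead first, then unlink (order matters) */
	b->dead = 1;
	head->next = c;                 /* unlink node 2 from the list */

	/* a cursor still pointing at the removed b finds its successor */
	struct node *n = next_sibling(b);
	printf("resumed at serial %d\n", n ? n->serial : -1);  /* prints 3 */
	return 0;
}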
/**
* cgroup_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
@@ -2948,6 +3082,11 @@ static void cgroup_enable_task_cg_lists(void)
*
* To be used by cgroup_for_each_descendant_pre(). Find the next
* descendant to visit for pre-order traversal of @cgroup's descendants.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
*/
struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
struct cgroup *cgroup)
@@ -2967,11 +3106,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
/* no child, visit my or the closest ancestor's next sibling */
while (pos != cgroup) {
-next = list_entry_rcu(pos->sibling.next, struct cgroup,
-sibling);
-if (&next->sibling != &pos->parent->children)
next = cgroup_next_sibling(pos);
if (next)
return next;
pos = pos->parent;
}
@@ -2986,6 +3123,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
* Return the rightmost descendant of @pos. If there's no descendant,
* @pos is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct rightmost descendant as long as @pos is
* accessible.
*/
struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
{
@@ -3025,6 +3167,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
*
* To be used by cgroup_for_each_descendant_post(). Find the next
* descendant to visit for post-order traversal of @cgroup's descendants.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
*/
struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
struct cgroup *cgroup)
@@ -3040,8 +3187,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
}
/* if there's an unvisited sibling, visit its leftmost descendant */
-next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
-if (&next->sibling != &pos->parent->children)
next = cgroup_next_sibling(pos);
if (next)
return cgroup_leftmost_descendant(next);
/* no sibling left, visit parent */
@@ -3062,7 +3209,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
cgroup_enable_task_cg_lists();
read_lock(&css_set_lock);
-it->cg_link = &cgrp->css_sets;
it->cset_link = &cgrp->cset_links;
cgroup_advance_iter(cgrp, it);
}
@@ -3071,16 +3218,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
{
struct task_struct *res;
struct list_head *l = it->task;
-struct cg_cgroup_link *link;
struct cgrp_cset_link *link;
/* If the iterator cg is NULL, we have no tasks */
-if (!it->cg_link)
if (!it->cset_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
-link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
-if (l == &link->cg->tasks) {
if (l == &link->cset->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
@@ -3411,7 +3558,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
}
}
/* entry not found; create a new one */
-l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
return l;
@@ -3420,8 +3567,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
down_write(&l->mutex);
l->key.type = type;
l->key.ns = get_pid_ns(ns);
-l->use_count = 0; /* don't increment here */
-l->list = NULL;
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
@@ -3726,6 +3871,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
return 0;
}
/*
* When dput() is called asynchronously, if umount has been done and
* then deactivate_super() in cgroup_free_fn() kills the superblock,
* there's a small window that vfs will see the root dentry with non-zero
* refcnt and trigger BUG().
*
* That's why we hold a reference before dput() and drop it right after.
*/
static void cgroup_dput(struct cgroup *cgrp)
{
struct super_block *sb = cgrp->root->sb;
atomic_inc(&sb->s_active);
dput(cgrp->dentry);
deactivate_super(sb);
}
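[Editor's note] The pattern in cgroup_dput() generalizes: before dropping a reference that may indirectly drop the last reference to an enclosing object, take a temporary reference on the enclosing object so its teardown runs from a controlled point. A toy userspace model of that holder dance (all names invented; plain counters stand in for dentry refs and s_active):

/* model_holder.c - hold the container alive across a member put,
 * modeled on cgroup_dput()'s s_active dance. All names invented. */
#include <stdio.h>

struct super { int active; };
struct dentry { int count; struct super *sb; };

static void super_put(struct super *sb)
{
	if (--sb->active == 0)
		printf("superblock torn down\n");
}

static void dentry_put(struct dentry *d)
{
	if (--d->count == 0) {
		printf("dentry freed\n");
		super_put(d->sb);       /* may be the last s_active ref */
	}
}

/* safe put: teardown, if it happens, runs at the super_put() below,
 * not inside dentry_put() at some arbitrary asynchronous moment */
static void safe_dput(struct dentry *d)
{
	struct super *sb = d->sb;

	sb->active++;                   /* pin the container first */
	dentry_put(d);
	super_put(sb);                  /* controlled teardown point */
}

int main(void)
{
	struct super sb = { .active = 1 };
	struct dentry d = { .count = 1, .sb = &sb };

	safe_dput(&d);
	return 0;
}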
/*
* Unregister event and free resources.
*
@@ -3746,7 +3908,7 @@ static void cgroup_event_remove(struct work_struct *work)
eventfd_ctx_put(event->eventfd);
kfree(event);
-dput(cgrp->dentry);
cgroup_dput(cgrp);
}
/*
@@ -3933,33 +4095,16 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
return 0;
}
-/*
-* for the common functions, 'private' gives the type of file
-*/
-/* for hysterical raisins, we can't put this on the older files */
-#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
-static struct cftype files[] = {
-{
-.name = "tasks",
-.open = cgroup_tasks_open,
-.write_u64 = cgroup_tasks_write,
-.release = cgroup_pidlist_release,
-.mode = S_IRUGO | S_IWUSR,
-},
static struct cftype cgroup_base_files[] = {
{
-.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.name = "cgroup.procs",
.open = cgroup_procs_open,
.write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
-.name = "notify_on_release",
-.read_u64 = cgroup_read_notify_on_release,
-.write_u64 = cgroup_write_notify_on_release,
-},
-{
-.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
.name = "cgroup.event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
@@ -3974,9 +4119,29 @@ static struct cftype files[] = {
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
},
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
-.flags = CFTYPE_ONLY_ON_ROOT,
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
@@ -3997,13 +4162,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
struct cgroup_subsys *ss;
if (base_files) {
-err = cgroup_addrm_files(cgrp, NULL, files, true);
err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true);
if (err < 0)
return err;
}
/* process cftsets of each subsystem */
-for_each_subsys(cgrp->root, ss) {
for_each_root_subsys(cgrp->root, ss) {
struct cftype_set *set;
if (!test_bit(ss->subsys_id, &subsys_mask))
continue;
@@ -4013,15 +4178,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
}
/* This cgroup is ready now */
-for_each_subsys(cgrp->root, ss) {
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
struct css_id *id = rcu_dereference_protected(css->id, true);
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
-if (css->id)
if (id)
-rcu_assign_pointer(css->id->css, css);
rcu_assign_pointer(id->css, css);
}
return 0;
@@ -4031,12 +4198,16 @@ static void css_dput_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
-struct dentry *dentry = css->cgroup->dentry;
-struct super_block *sb = dentry->d_sb;
-atomic_inc(&sb->s_active);
-dput(dentry);
-deactivate_super(sb);
cgroup_dput(css->cgroup);
}
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
schedule_work(&css->dput_work);
}
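[Editor's note] css_release() is the callback percpu_ref_init() installs below: it fires on the final put and defers the actual dput to a workqueue, since the last put can happen in atomic context. A userspace model of release-callback refcounting (invented names; plain atomics stand in for the per-CPU fast path that makes percpu_ref cheap):

/* model_ref.c - release-callback refcounting, modeled on the
 * percpu_ref/css_release pairing above. All names invented. */
#include <stdatomic.h>
#include <stdio.h>

struct ref {
	atomic_int count;
	void (*release)(struct ref *ref);   /* runs on the final put */
};

static void ref_init(struct ref *r, void (*release)(struct ref *))
{
	atomic_init(&r->count, 1);          /* base reference */
	r->release = release;
}

static void ref_get(struct ref *r) { atomic_fetch_add(&r->count, 1); }

static void ref_put(struct ref *r)
{
	if (atomic_fetch_sub(&r->count, 1) == 1)
		r->release(r);  /* kernel: schedule_work() instead, since
				 * the last put may run in atomic context */
}

static void my_release(struct ref *r) { printf("released\n"); }

int main(void)
{
	struct ref r;

	ref_init(&r, my_release);
	ref_get(&r);
	ref_put(&r);                        /* back down to the base ref */
	ref_put(&r);                        /* final put -> my_release() */
	return 0;
}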
static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4044,10 +4215,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
-atomic_set(&css->refcnt, 1);
css->flags = 0;
css->id = NULL;
-if (cgrp == dummytop)
if (cgrp == cgroup_dummy_top)
css->flags |= CSS_ROOT;
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
@@ -4157,7 +4327,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css;
css = ss->css_alloc(cgrp);
@@ -4165,7 +4335,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err = PTR_ERR(css);
goto err_free_all;
}
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_all;
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
@@ -4183,20 +4359,21 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_free_all;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
-list_add_tail(&cgrp->allcg_node, &root->allcg_list);
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
/* each css holds a ref to the cgroup's dentry */
-for_each_subsys(root, ss)
for_each_root_subsys(root, ss)
dget(dentry);
/* hold a ref to the parent's dentry */
dget(parent->dentry);
/* creation succeeded, notify subsystems */
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
err = online_css(ss, cgrp);
if (err)
goto err_destroy;
@@ -4221,9 +4398,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return 0;
err_free_all:
-for_each_subsys(root, ss) {
for_each_root_subsys(root, ss) {
-if (cgrp->subsys[ss->subsys_id])
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(cgrp);
}
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
@@ -4251,63 +4432,120 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
static void cgroup_css_killed(struct cgroup *cgrp)
{
if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
return;
/* percpu ref's of all css's are killed, kick off the next step */
INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
schedule_work(&cgrp->destroy_work);
}
static void css_ref_killed_fn(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
cgroup_css_killed(css->cgroup);
}
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
* guarantee that css_tryget() won't succeed by the time ->css_offline() is
* invoked. To satisfy all the requirements, destruction is implemented in
* the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
* css's. Set up so that the next stage will be kicked off once all
* the percpu refcnts are confirmed to be killed.
*
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
* rest of destruction. Once all cgroup references are gone, the
* cgroup is RCU-freed.
*
* This function implements s1. After this step, @cgrp is gone as far as
* the userland is concerned and a new cgroup with the same name may be
* created. As cgroup doesn't care about the names internally, this
* doesn't cause any problem.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
-struct cgroup *parent = cgrp->parent;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
bool empty;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
-if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
/*
* css_set_lock synchronizes access to ->cset_links and prevents
* @cgrp from being removed while __put_css_set() is in progress.
*/
read_lock(&css_set_lock);
empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
read_unlock(&css_set_lock);
if (!empty)
return -EBUSY;
/*
-* Block new css_tryget() by deactivating refcnt and mark @cgrp
-* removed. This makes future css_tryget() and child creation
-* attempts fail thus maintaining the removal conditions verified
-* above.
* Block new css_tryget() by killing css refcnts. cgroup core
* guarantees that, by the time ->css_offline() is invoked, no new
* css reference will be given out via css_tryget(). We can't
* simply call percpu_ref_kill() and proceed to offlining css's
* because percpu_ref_kill() doesn't guarantee that the ref is seen
* as killed on all CPUs on return.
*
* Use percpu_ref_kill_and_confirm() to get notifications as each
* css is confirmed to be seen as killed on all CPUs. The
* notification callback keeps track of the number of css's to be
* killed and schedules cgroup_offline_fn() to perform the rest of
* destruction once the percpu refs of all css's are confirmed to
* be killed.
*/
-for_each_subsys(cgrp->root, ss) {
atomic_set(&cgrp->css_kill_cnt, 1);
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-WARN_ON(atomic_read(&css->refcnt) < 0);
-atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-}
-set_bit(CGRP_REMOVED, &cgrp->flags);
/*
* Killing would put the base ref, but we need to keep it
* alive until after ->css_offline.
*/
percpu_ref_get(&css->refcnt);
-/* tell subsystems to initiate destruction */
-for_each_subsys(cgrp->root, ss)
-offline_css(ss, cgrp);
atomic_inc(&cgrp->css_kill_cnt);
percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
}
cgroup_css_killed(cgrp);
/*
-* Put all the base refs. Each css holds an extra reference to the
-* cgroup's dentry and cgroup removal proceeds regardless of css
-* refs. On the last put of each css, whenever that may be, the
-* extra dentry ref is put so that dentry destruction happens only
-* after all css's are released.
* Mark @cgrp dead. This prevents further task migration and child
* creation by disabling cgroup_lock_live_group(). Note that
* CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
* resume iteration after dropping RCU read lock. See
* cgroup_next_sibling() for details.
*/
-for_each_subsys(cgrp->root, ss)
-css_put(cgrp->subsys[ss->subsys_id]);
set_bit(CGRP_DEAD, &cgrp->flags);
/* CGRP_DEAD is set, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock); raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list)) if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list); list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock); raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */ /*
list_del_rcu(&cgrp->sibling); * Remove @cgrp directory. The removal puts the base ref but we
list_del_init(&cgrp->allcg_node); * aren't quite done with @cgrp yet, so hold onto it.
*/
dget(d); dget(d);
cgroup_d_remove_dir(d); cgroup_d_remove_dir(d);
dput(d);
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
/* /*
* Unregister events and notify userspace. * Unregister events and notify userspace.
...@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) ...@@ -4322,6 +4560,53 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
spin_unlock(&cgrp->event_list_lock); spin_unlock(&cgrp->event_list_lock);
return 0; return 0;
};
+
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
+static void cgroup_offline_fn(struct work_struct *work)
+{
+        struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+        struct cgroup *parent = cgrp->parent;
+        struct dentry *d = cgrp->dentry;
+        struct cgroup_subsys *ss;
+
+        mutex_lock(&cgroup_mutex);
+
+        /*
+         * css_tryget() is guaranteed to fail now.  Tell subsystems to
+         * initiate destruction.
+         */
+        for_each_root_subsys(cgrp->root, ss)
+                offline_css(ss, cgrp);
+
+        /*
+         * Put the css refs from cgroup_destroy_locked().  Each css holds
+         * an extra reference to the cgroup's dentry and cgroup removal
+         * proceeds regardless of css refs.  On the last put of each css,
+         * whenever that may be, the extra dentry ref is put so that dentry
+         * destruction happens only after all css's are released.
+         */
+        for_each_root_subsys(cgrp->root, ss)
+                css_put(cgrp->subsys[ss->subsys_id]);
+
+        /* delete this cgroup from parent->children */
+        list_del_rcu(&cgrp->sibling);
+
+        dput(d);
+
+        set_bit(CGRP_RELEASABLE, &parent->flags);
+        check_for_release(parent);
+
+        mutex_unlock(&cgroup_mutex);
+}
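For reference, the kill-confirmation helpers that cgroup_destroy_locked()
relies on are added elsewhere in this series and don't appear in these
hunks. Reconstructed as a sketch (consult the series for the authoritative
version): css_kill_cnt starts at one, is bumped once per css being killed,
and the last confirmation schedules cgroup_offline_fn():

    /* sketch - the confirm side of the two-step destruction */
    static void cgroup_css_killed(struct cgroup *cgrp)
    {
            if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
                    return;

            /* percpu refs of all css's confirmed killed; kick off step 2 */
            INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
            schedule_work(&cgrp->destroy_work);
    }

    static void css_ref_killed_fn(struct percpu_ref *ref)
    {
            struct cgroup_subsys_state *css =
                    container_of(ref, struct cgroup_subsys_state, refcnt);

            cgroup_css_killed(css->cgroup);
    }

The extra count set by atomic_set(&cgrp->css_kill_cnt, 1) is dropped by the
cgroup_css_killed() call at the end of the kill loop, so the work item
cannot fire before every percpu_ref_kill_and_confirm() has been issued.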
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)

@@ -4361,12 +4646,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         cgroup_init_cftsets(ss);
 
         /* Create the top cgroup state for this subsystem */
-        list_add(&ss->sibling, &rootnode.subsys_list);
-        ss->root = &rootnode;
-        css = ss->css_alloc(dummytop);
+        list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
+        ss->root = &cgroup_dummy_root;
+        css = ss->css_alloc(cgroup_dummy_top);
         /* We don't handle early failures gracefully */
         BUG_ON(IS_ERR(css));
-        init_cgroup_css(css, ss, dummytop);
+        init_cgroup_css(css, ss, cgroup_dummy_top);
 
         /* Update the init_css_set to contain a subsys
          * pointer to this state - since the subsystem is

@@ -4381,7 +4666,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
          * need to invoke fork callbacks here. */
         BUG_ON(!list_empty(&init_task.tasks));
 
-        BUG_ON(online_css(ss, dummytop));
+        BUG_ON(online_css(ss, cgroup_dummy_top));
 
         mutex_unlock(&cgroup_mutex);
@@ -4404,7 +4689,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         struct cgroup_subsys_state *css;
         int i, ret;
         struct hlist_node *tmp;
-        struct css_set *cg;
+        struct css_set *cset;
         unsigned long key;
 
         /* check name and function validity */
@@ -4427,7 +4712,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          */
         if (ss->module == NULL) {
                 /* a sanity check */
-                BUG_ON(subsys[ss->subsys_id] != ss);
+                BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
                 return 0;
         }
@@ -4435,26 +4720,26 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
         cgroup_init_cftsets(ss);
 
         mutex_lock(&cgroup_mutex);
-        subsys[ss->subsys_id] = ss;
+        cgroup_subsys[ss->subsys_id] = ss;
 
         /*
          * no ss->css_alloc seems to need anything important in the ss
-         * struct, so this can happen first (i.e. before the rootnode
+         * struct, so this can happen first (i.e. before the dummy root
          * attachment).
          */
-        css = ss->css_alloc(dummytop);
+        css = ss->css_alloc(cgroup_dummy_top);
         if (IS_ERR(css)) {
-                /* failure case - need to deassign the subsys[] slot. */
-                subsys[ss->subsys_id] = NULL;
+                /* failure case - need to deassign the cgroup_subsys[] slot. */
+                cgroup_subsys[ss->subsys_id] = NULL;
                 mutex_unlock(&cgroup_mutex);
                 return PTR_ERR(css);
         }
 
-        list_add(&ss->sibling, &rootnode.subsys_list);
-        ss->root = &rootnode;
+        list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
+        ss->root = &cgroup_dummy_root;
 
         /* our new subsystem will be attached to the dummy hierarchy. */
-        init_cgroup_css(css, ss, dummytop);
+        init_cgroup_css(css, ss, cgroup_dummy_top);
         /* init_idr must be after init_cgroup_css because it sets css->id. */
         if (ss->use_id) {
                 ret = cgroup_init_idr(ss, css);
@@ -4471,21 +4756,21 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
          * this is all done under the css_set_lock.
          */
         write_lock(&css_set_lock);
-        hash_for_each_safe(css_set_table, i, tmp, cg, hlist) {
+        hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
                 /* skip entries that we already rehashed */
-                if (cg->subsys[ss->subsys_id])
+                if (cset->subsys[ss->subsys_id])
                         continue;
                 /* remove existing entry */
-                hash_del(&cg->hlist);
+                hash_del(&cset->hlist);
                 /* set new value */
-                cg->subsys[ss->subsys_id] = css;
+                cset->subsys[ss->subsys_id] = css;
                 /* recompute hash and restore entry */
-                key = css_set_hash(cg->subsys);
-                hash_add(css_set_table, &cg->hlist, key);
+                key = css_set_hash(cset->subsys);
+                hash_add(css_set_table, &cset->hlist, key);
         }
         write_unlock(&css_set_lock);
 
-        ret = online_css(ss, dummytop);
+        ret = online_css(ss, cgroup_dummy_top);
         if (ret)
                 goto err_unload;
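The rehash loop above follows the usual linux/hashtable.h discipline: an
entry whose key source is about to change must be unlinked first, mutated,
and only then re-added under the new hash. A minimal sketch of the same
pattern with toy names (not from this commit):

    #include <linux/hashtable.h>

    struct item {
            int key;
            struct hlist_node hlist;
    };

    static DEFINE_HASHTABLE(item_table, 4);         /* 2^4 buckets */

    static void item_change_key(struct item *it, int new_key)
    {
            hash_del(&it->hlist);   /* unlink while hashed under old key */
            it->key = new_key;      /* safe to mutate once unlinked */
            hash_add(item_table, &it->hlist, it->key);
    }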
@@ -4511,7 +4796,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  */
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
-        struct cg_cgroup_link *link;
+        struct cgrp_cset_link *link;
 
         BUG_ON(ss->module == NULL);

@@ -4520,45 +4805,46 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
          * try_module_get in parse_cgroupfs_options should ensure that it
          * doesn't start being used while we're killing it off.
          */
-        BUG_ON(ss->root != &rootnode);
+        BUG_ON(ss->root != &cgroup_dummy_root);
 
         mutex_lock(&cgroup_mutex);
 
-        offline_css(ss, dummytop);
+        offline_css(ss, cgroup_dummy_top);
 
         if (ss->use_id)
                 idr_destroy(&ss->idr);
 
         /* deassign the subsys_id */
-        subsys[ss->subsys_id] = NULL;
+        cgroup_subsys[ss->subsys_id] = NULL;
 
-        /* remove subsystem from rootnode's list of subsystems */
+        /* remove subsystem from the dummy root's list of subsystems */
         list_del_init(&ss->sibling);
 
         /*
-         * disentangle the css from all css_sets attached to the dummytop. as
-         * in loading, we need to pay our respects to the hashtable gods.
+         * disentangle the css from all css_sets attached to the dummy
+         * top. as in loading, we need to pay our respects to the hashtable
+         * gods.
          */
         write_lock(&css_set_lock);
-        list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
-                struct css_set *cg = link->cg;
+        list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
+                struct css_set *cset = link->cset;
                 unsigned long key;
 
-                hash_del(&cg->hlist);
-                cg->subsys[ss->subsys_id] = NULL;
-                key = css_set_hash(cg->subsys);
-                hash_add(css_set_table, &cg->hlist, key);
+                hash_del(&cset->hlist);
+                cset->subsys[ss->subsys_id] = NULL;
+                key = css_set_hash(cset->subsys);
+                hash_add(css_set_table, &cset->hlist, key);
         }
         write_unlock(&css_set_lock);
 
         /*
-         * remove subsystem's css from the dummytop and free it - need to
-         * free before marking as null because ss->css_free needs the
-         * cgrp->subsys pointer to find their state. note that this also
-         * takes care of freeing the css_id.
+         * remove subsystem's css from the cgroup_dummy_top and free it -
+         * need to free before marking as null because ss->css_free needs
+         * the cgrp->subsys pointer to find their state. note that this
+         * also takes care of freeing the css_id.
          */
-        ss->css_free(dummytop);
-        dummytop->subsys[ss->subsys_id] = NULL;
+        ss->css_free(cgroup_dummy_top);
+        cgroup_dummy_top->subsys[ss->subsys_id] = NULL;
 
         mutex_unlock(&cgroup_mutex);
 }
@@ -4572,30 +4858,25 @@ EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
  */
 int __init cgroup_init_early(void)
 {
+        struct cgroup_subsys *ss;
         int i;
 
         atomic_set(&init_css_set.refcount, 1);
-        INIT_LIST_HEAD(&init_css_set.cg_links);
+        INIT_LIST_HEAD(&init_css_set.cgrp_links);
         INIT_LIST_HEAD(&init_css_set.tasks);
         INIT_HLIST_NODE(&init_css_set.hlist);
         css_set_count = 1;
-        init_cgroup_root(&rootnode);
-        root_count = 1;
-        init_task.cgroups = &init_css_set;
+        init_cgroup_root(&cgroup_dummy_root);
+        cgroup_root_count = 1;
+        RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
 
-        init_css_set_link.cg = &init_css_set;
-        init_css_set_link.cgrp = dummytop;
-        list_add(&init_css_set_link.cgrp_link_list,
-                 &rootnode.top_cgroup.css_sets);
-        list_add(&init_css_set_link.cg_link_list,
-                 &init_css_set.cg_links);
+        init_cgrp_cset_link.cset = &init_css_set;
+        init_cgrp_cset_link.cgrp = cgroup_dummy_top;
+        list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
+        list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
 
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup_subsys *ss = subsys[i];
-
-                /* at bootup time, we don't worry about modular subsystems */
-                if (!ss || ss->module)
-                        continue;
-
+        /* at bootup time, we don't worry about modular subsystems */
+        for_each_builtin_subsys(ss, i) {
                 BUG_ON(!ss->name);
                 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
                 BUG_ON(!ss->css_alloc);
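The for_each_*_subsys() iterators used throughout these hunks come from the
"cgroup: implement for_each_[builtin_]subsys()" patch in this series.
Paraphrased (the real definitions add lockdep annotations), they look
roughly like:

    /* iterate subsystems attached to a hierarchy */
    #define for_each_root_subsys(root, ss)                              \
            list_for_each_entry((ss), &(root)->subsys_list, sibling)

    /* iterate all loaded subsystems, skipping NULL (unloaded) slots */
    #define for_each_subsys(ss, i)                                      \
            for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)             \
                    if (!((ss) = cgroup_subsys[(i)])) { } else

    /* built-in subsystems are always present; no NULL check needed */
    #define for_each_builtin_subsys(ss, i)                              \
            for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&          \
                 (((ss) = cgroup_subsys[i]) || true); (i)++)

This is what lets the open-coded "if (!ss || ss->module) continue;" checks
disappear from the loops above and below.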
@@ -4620,30 +4901,33 @@ int __init cgroup_init_early(void)
  */
 int __init cgroup_init(void)
 {
-        int err;
-        int i;
+        struct cgroup_subsys *ss;
         unsigned long key;
+        int i, err;
 
         err = bdi_init(&cgroup_backing_dev_info);
         if (err)
                 return err;
 
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup_subsys *ss = subsys[i];
-
-                /* at bootup time, we don't worry about modular subsystems */
-                if (!ss || ss->module)
-                        continue;
-
+        for_each_builtin_subsys(ss, i) {
                 if (!ss->early_init)
                         cgroup_init_subsys(ss);
                 if (ss->use_id)
                         cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
         }
 
+        /* allocate id for the dummy hierarchy */
+        mutex_lock(&cgroup_mutex);
+        mutex_lock(&cgroup_root_mutex);
+
         /* Add init_css_set to the hash table */
         key = css_set_hash(init_css_set.subsys);
         hash_add(css_set_table, &init_css_set.hlist, key);
-        BUG_ON(!init_root_id(&rootnode));
+
+        BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));
+
+        mutex_unlock(&cgroup_root_mutex);
+        mutex_unlock(&cgroup_mutex);
 
         cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
         if (!cgroup_kobj) {
@@ -4708,7 +4992,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
                 int count = 0;
 
                 seq_printf(m, "%d:", root->hierarchy_id);
-                for_each_subsys(root, ss)
+                for_each_root_subsys(root, ss)
                         seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
                 if (strlen(root->name))
                         seq_printf(m, "%sname=%s", count ? "," : "",
@@ -4734,6 +5018,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
+        struct cgroup_subsys *ss;
         int i;
 
         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");

@@ -4743,14 +5028,12 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
          * subsys/hierarchy state.
          */
         mutex_lock(&cgroup_mutex);
-        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                struct cgroup_subsys *ss = subsys[i];
-                if (ss == NULL)
-                        continue;
+
+        for_each_subsys(ss, i)
                 seq_printf(m, "%s\t%d\t%d\t%d\n",
                            ss->name, ss->root->hierarchy_id,
                            ss->root->number_of_cgroups, !ss->disabled);
-        }
+
         mutex_unlock(&cgroup_mutex);
         return 0;
 }
@@ -4786,8 +5069,8 @@ static const struct file_operations proc_cgroupstats_operations = {
 void cgroup_fork(struct task_struct *child)
 {
         task_lock(current);
+        get_css_set(task_css_set(current));
         child->cgroups = current->cgroups;
-        get_css_set(child->cgroups);
         task_unlock(current);
         INIT_LIST_HEAD(&child->cg_list);
 }
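task_css_set() is the RCU-aware accessor introduced by the "fix RCU
accesses around task->cgroups" patches in this series. A sketch of its
shape (the exact lockdep conditions are elided here):

    /* sketch - checked dereference of task->cgroups */
    #define task_css_set_check(task, __c)                               \
            rcu_dereference_check((task)->cgroups,                      \
                    lockdep_is_held(&cgroup_mutex) || (__c))

    static inline struct css_set *task_css_set(struct task_struct *task)
    {
            return task_css_set_check(task, false);
    }

The reordering in cgroup_fork() above is behavior-neutral: current->cgroups
cannot change while task_lock(current) is held, so taking the reference
through the accessor before the copy is equivalent to taking it after.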
@@ -4804,6 +5087,7 @@ void cgroup_fork(struct task_struct *child)
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+        struct cgroup_subsys *ss;
         int i;
 
         /*

@@ -4821,7 +5105,7 @@ void cgroup_post_fork(struct task_struct *child)
                 write_lock(&css_set_lock);
                 task_lock(child);
                 if (list_empty(&child->cg_list))
-                        list_add(&child->cg_list, &child->cgroups->tasks);
+                        list_add(&child->cg_list, &task_css_set(child)->tasks);
                 task_unlock(child);
                 write_unlock(&css_set_lock);
         }

@@ -4840,12 +5124,9 @@ void cgroup_post_fork(struct task_struct *child)
                  * of the array can be freed at module unload, so we
                  * can't touch that.
                  */
-                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-                        struct cgroup_subsys *ss = subsys[i];
-
+                for_each_builtin_subsys(ss, i)
                         if (ss->fork)
                                 ss->fork(child);
-                }
         }
 }
@@ -4886,7 +5167,8 @@ void cgroup_post_fork(struct task_struct *child)
  */
 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 {
-        struct css_set *cg;
+        struct cgroup_subsys *ss;
+        struct css_set *cset;
         int i;
 
         /*

@@ -4903,36 +5185,32 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         /* Reassign the task to the init_css_set. */
         task_lock(tsk);
-        cg = tsk->cgroups;
-        tsk->cgroups = &init_css_set;
+        cset = task_css_set(tsk);
+        RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
         if (run_callbacks && need_forkexit_callback) {
                 /*
                  * fork/exit callbacks are supported only for builtin
                  * subsystems, see cgroup_post_fork() for details.
                  */
-                for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
-                        struct cgroup_subsys *ss = subsys[i];
-
+                for_each_builtin_subsys(ss, i) {
                         if (ss->exit) {
-                                struct cgroup *old_cgrp =
-                                        rcu_dereference_raw(cg->subsys[i])->cgroup;
+                                struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
                                 struct cgroup *cgrp = task_cgroup(tsk, i);
+
                                 ss->exit(cgrp, old_cgrp, tsk);
                         }
                 }
         }
         task_unlock(tsk);
 
-        put_css_set_taskexit(cg);
+        put_css_set_taskexit(cset);
 }
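RCU_INIT_POINTER() is chosen over rcu_assign_pointer() here because no
publication barrier is needed: init_css_set is a global that has been fully
visible since boot. A general sketch with hypothetical types:

    struct foo {
            int val;
    };

    struct foo __rcu *global_foo;

    static void publish_new(struct foo *f)
    {
            f->val = 42;                       /* initialize first ... */
            rcu_assign_pointer(global_foo, f); /* ... then publish */
    }

    static void point_at_existing(struct foo *already_published)
    {
            /* no barrier needed - the pointee is already visible */
            RCU_INIT_POINTER(global_foo, already_published);
    }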
 static void check_for_release(struct cgroup *cgrp)
 {
-        /* All of these checks rely on RCU to keep the cgroup
-         * structure alive */
         if (cgroup_is_releasable(cgrp) &&
-            !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+            list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
                 /*
                  * Control Group is currently removeable. If it's not
                  * already queued for a userspace notification, queue

@@ -4941,7 +5219,7 @@ static void check_for_release(struct cgroup *cgrp)
                 int need_schedule_work = 0;
 
                 raw_spin_lock(&release_list_lock);
-                if (!cgroup_is_removed(cgrp) &&
+                if (!cgroup_is_dead(cgrp) &&
                     list_empty(&cgrp->release_list)) {
                         list_add(&cgrp->release_list, &release_list);
                         need_schedule_work = 1;

@@ -4952,34 +5230,6 @@ static void check_for_release(struct cgroup *cgrp)
         }
 }
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-        while (true) {
-                int t, v;
-
-                v = css_refcnt(css);
-                t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-                if (likely(t == v))
-                        return true;
-                else if (t < 0)
-                        return false;
-                cpu_relax();
-        }
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-        int v;
-
-        v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-        if (v == 0)
-                schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
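These open-coded atomic helpers are superseded by percpu-ref based inlines
in include/linux/cgroup.h as part of this pull's refcnt conversion.
Sketched here (see the header for the authoritative version):

    /* sketch - css_tryget()/css_put() on top of percpu_ref */
    static inline bool css_tryget(struct cgroup_subsys_state *css)
    {
            if (css->flags & CSS_ROOT)
                    return true;    /* the root css is never killed */
            return percpu_ref_tryget(&css->refcnt);
    }

    static inline void css_put(struct cgroup_subsys_state *css)
    {
            if (!(css->flags & CSS_ROOT))
                    percpu_ref_put(&css->refcnt);
    }

The cmpxchg loop and the CSS_DEACT_BIAS trick become unnecessary:
percpu_ref_tryget() fails by construction once the ref has been killed.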
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path

@@ -5054,23 +5304,19 @@ static void cgroup_release_agent(struct work_struct *work)
 static int __init cgroup_disable(char *str)
 {
-        int i;
+        struct cgroup_subsys *ss;
         char *token;
+        int i;
 
         while ((token = strsep(&str, ",")) != NULL) {
                 if (!*token)
                         continue;
-                for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                        struct cgroup_subsys *ss = subsys[i];
-
-                        /*
-                         * cgroup_disable, being at boot time, can't
-                         * know about module subsystems, so we don't
-                         * worry about them.
-                         */
-                        if (!ss || ss->module)
-                                continue;
 
+                /*
+                 * cgroup_disable, being at boot time, can't know about
+                 * module subsystems, so we don't worry about them.
+                 */
+                for_each_builtin_subsys(ss, i) {
                         if (!strcmp(token, ss->name)) {
                                 ss->disabled = 1;
                                 printk(KERN_INFO "Disabling %s control group"
@@ -5087,9 +5333,7 @@ __setup("cgroup_disable=", cgroup_disable);
  * Functions for CSS ID.
  */
 
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
+/* to get ID other than 0, this should be called when !cgroup_is_dead() */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
         struct css_id *cssid;

@@ -5099,7 +5343,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
          * on this or this is under rcu_read_lock(). Once css->id is allocated,
          * it's unchanged until freed.
          */
-        cssid = rcu_dereference_check(css->id, css_refcnt(css));
+        cssid = rcu_dereference_raw(css->id);
 
         if (cssid)
                 return cssid->id;

@@ -5107,18 +5351,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_id);
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
-        struct css_id *cssid;
-
-        cssid = rcu_dereference_check(css->id, css_refcnt(css));
-
-        if (cssid)
-                return cssid->depth;
-        return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
 /**
  * css_is_ancestor - test "root" css is an ancestor of "child"
  * @child: the css to be tested.

@@ -5153,7 +5385,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
 void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 {
-        struct css_id *id = css->id;
+        struct css_id *id = rcu_dereference_protected(css->id, true);
+
         /* When this is called before css_id initialization, id can be NULL */
         if (!id)
                 return;
@@ -5219,8 +5452,8 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
                 return PTR_ERR(newid);
 
         newid->stack[0] = newid->id;
-        newid->css = rootcss;
-        rootcss->id = newid;
+        RCU_INIT_POINTER(newid->css, rootcss);
+        RCU_INIT_POINTER(rootcss->id, newid);
         return 0;
 }
@@ -5234,7 +5467,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
         subsys_id = ss->subsys_id;
         parent_css = parent->subsys[subsys_id];
         child_css = child->subsys[subsys_id];
-        parent_id = parent_css->id;
+        parent_id = rcu_dereference_protected(parent_css->id, true);
         depth = parent_id->depth + 1;
 
         child_id = get_new_cssid(ss, depth);
@@ -5299,7 +5532,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
 {
         struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

@@ -5309,48 +5542,43 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
         return css;
 }
 
-static void debug_css_free(struct cgroup *cont)
-{
-        kfree(cont->subsys[debug_subsys_id]);
-}
-
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
+static void debug_css_free(struct cgroup *cgrp)
 {
-        return atomic_read(&cont->count);
+        kfree(cgrp->subsys[debug_subsys_id]);
 }
 
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
 {
-        return cgroup_task_count(cont);
+        return cgroup_task_count(cgrp);
 }
 
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
 {
         return (u64)(unsigned long)current->cgroups;
 }
 
-static u64 current_css_set_refcount_read(struct cgroup *cont,
+static u64 current_css_set_refcount_read(struct cgroup *cgrp,
                                          struct cftype *cft)
 {
         u64 count;
 
         rcu_read_lock();
-        count = atomic_read(&current->cgroups->refcount);
+        count = atomic_read(&task_css_set(current)->refcount);
         rcu_read_unlock();
         return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup *cont,
+static int current_css_set_cg_links_read(struct cgroup *cgrp,
                                          struct cftype *cft,
                                          struct seq_file *seq)
 {
-        struct cg_cgroup_link *link;
-        struct css_set *cg;
+        struct cgrp_cset_link *link;
+        struct css_set *cset;
 
         read_lock(&css_set_lock);
         rcu_read_lock();
-        cg = rcu_dereference(current->cgroups);
-        list_for_each_entry(link, &cg->cg_links, cg_link_list) {
+        cset = rcu_dereference(current->cgroups);
+        list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
                 struct cgroup *c = link->cgrp;
                 const char *name;

@@ -5367,19 +5595,19 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp,
 }
 
 #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
+static int cgroup_css_links_read(struct cgroup *cgrp,
                                  struct cftype *cft,
                                  struct seq_file *seq)
 {
-        struct cg_cgroup_link *link;
+        struct cgrp_cset_link *link;
 
         read_lock(&css_set_lock);
-        list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
-                struct css_set *cg = link->cg;
+        list_for_each_entry(link, &cgrp->cset_links, cset_link) {
+                struct css_set *cset = link->cset;
                 struct task_struct *task;
                 int count = 0;
 
-                seq_printf(seq, "css_set %p\n", cg);
-                list_for_each_entry(task, &cg->tasks, cg_list) {
+                seq_printf(seq, "css_set %p\n", cset);
+                list_for_each_entry(task, &cset->tasks, cg_list) {
                         if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
                                 seq_puts(seq, " ...\n");
                                 break;

@@ -5399,10 +5627,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
 }
 
 static struct cftype debug_files[] = {
-        {
-                .name = "cgroup_refcount",
-                .read_u64 = cgroup_refcount_read,
-        },
         {
                 .name = "taskcount",
                 .read_u64 = debug_taskcount_read,
...

security/device_cgroup.c
@@ -49,8 +49,6 @@ struct dev_cgroup {
         struct cgroup_subsys_state css;
         struct list_head exceptions;
         enum devcg_behavior behavior;
-        /* temporary list for pending propagation operations */
-        struct list_head propagate_pending;
 };
 
 static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
@@ -241,7 +239,6 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup)
         if (!dev_cgroup)
                 return ERR_PTR(-ENOMEM);
         INIT_LIST_HEAD(&dev_cgroup->exceptions);
-        INIT_LIST_HEAD(&dev_cgroup->propagate_pending);
         dev_cgroup->behavior = DEVCG_DEFAULT_NONE;
 
         return &dev_cgroup->css;
@@ -444,34 +441,6 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg)
         }
 }
 
-/**
- * get_online_devcg - walks the cgroup tree and fills a list with the online
- *                    groups
- * @root: cgroup used as starting point
- * @online: list that will be filled with online groups
- *
- * Must be called with devcgroup_mutex held.  Grabs RCU lock.
- * Because devcgroup_mutex is held, no devcg will become online or offline
- * during the tree walk (see devcgroup_online, devcgroup_offline)
- * A separated list is needed because propagate_behavior() and
- * propagate_exception() need to allocate memory and can block.
- */
-static void get_online_devcg(struct cgroup *root, struct list_head *online)
-{
-        struct cgroup *pos;
-        struct dev_cgroup *devcg;
-
-        lockdep_assert_held(&devcgroup_mutex);
-
-        rcu_read_lock();
-        cgroup_for_each_descendant_pre(pos, root) {
-                devcg = cgroup_to_devcgroup(pos);
-                if (is_devcg_online(devcg))
-                        list_add_tail(&devcg->propagate_pending, online);
-        }
-        rcu_read_unlock();
-}
 /**
  * propagate_exception - propagates a new exception to the children
  * @devcg_root: device cgroup that added a new exception

@@ -482,15 +451,24 @@ static void get_online_devcg(struct cgroup *root, struct list_head *online)
 static int propagate_exception(struct dev_cgroup *devcg_root,
                                struct dev_exception_item *ex)
 {
-        struct cgroup *root = devcg_root->css.cgroup;
-        struct dev_cgroup *devcg, *parent, *tmp;
+        struct cgroup *root = devcg_root->css.cgroup, *pos;
         int rc = 0;
-        LIST_HEAD(pending);
 
-        get_online_devcg(root, &pending);
+        rcu_read_lock();
 
-        list_for_each_entry_safe(devcg, tmp, &pending, propagate_pending) {
-                parent = cgroup_to_devcgroup(devcg->css.cgroup->parent);
+        cgroup_for_each_descendant_pre(pos, root) {
+                struct dev_cgroup *devcg = cgroup_to_devcgroup(pos);
+
+                /*
+                 * Because devcgroup_mutex is held, no devcg will become
+                 * online or offline during the tree walk (see on/offline
+                 * methods), and online ones are safe to access outside RCU
+                 * read lock without bumping refcnt.
+                 */
+                if (!is_devcg_online(devcg))
+                        continue;
+
+                rcu_read_unlock();
 
                 /*
                  * in case both root's behavior and devcg is allow, a new

@@ -512,8 +490,10 @@ static int propagate_exception(struct dev_cgroup *devcg_root,
                 }
                 revalidate_active_exceptions(devcg);
 
-                list_del_init(&devcg->propagate_pending);
+                rcu_read_lock();
         }
 
+        rcu_read_unlock();
         return rc;
 }
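The rework above leans on the updated subtree iterator mentioned in the
pull message: a cgroup marked CGRP_DEAD remains safe to resume iteration
from, so a walker may drop the RCU read lock around blocking work instead
of first copying the subtree to a private list. The skeleton of that
pattern, with hypothetical helpers:

    static void walk_and_block(struct cgroup *root)
    {
            struct cgroup *pos;

            rcu_read_lock();
            cgroup_for_each_descendant_pre(pos, root) {
                    if (!pos_is_safe_unlocked(pos)) /* hypothetical check */
                            continue;

                    rcu_read_unlock();
                    do_blocking_work(pos);          /* hypothetical; may sleep */
                    rcu_read_lock();
            }
            rcu_read_unlock();
    }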
...