Commit 40f61237 authored by Linus Torvalds

Merge branch 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
 "Mostly fixes for the fallouts from the recent cgroup core changes.

  The decoupled nature of cgroup dynamic hierarchy management
  (hierarchies are created dynamically on mount but may or may not be
  reused once unmounted depending on remaining usages) led to more
  ugliness being added to kernfs.

  Hopefully, this is the last of it"

* 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: break kernfs active protection in cpuset_write_resmask()
  cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()
  kernfs: introduce kernfs_pin_sb()
  cgroup: fix mount failure in a corner case
  cpuset,mempolicy: fix sleeping function called from invalid context
  cgroup: fix broken css_has_online_children()
Parents: a805cbf4, 76bb5ab8
@@ -211,6 +211,36 @@ void kernfs_kill_sb(struct super_block *sb)
 	kernfs_put(root_kn);
 }
 
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations.  This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+	struct kernfs_super_info *info;
+	struct super_block *sb = NULL;
+
+	mutex_lock(&kernfs_mutex);
+	list_for_each_entry(info, &root->supers, node) {
+		if (info->ns == ns) {
+			sb = info->sb;
+			if (!atomic_inc_not_zero(&info->sb->s_active))
+				sb = ERR_PTR(-EINVAL);
+			break;
+		}
+	}
+	mutex_unlock(&kernfs_mutex);
+	return sb;
+}
+
 void __init kernfs_init(void)
 {
 	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
...
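For orientation, this is the calling pattern kernfs_pin_sb() is meant for, modelled on how cgroup_mount() uses it further down. It is an illustrative sketch, not code from this commit; the function name example_reuse_root and the -ENOENT choice are assumptions.

static int example_reuse_root(struct kernfs_root *root, const void *ns)
{
	struct super_block *sb;

	/* Take an extra s_active reference so ->kill_sb() can't complete. */
	sb = kernfs_pin_sb(root, ns);
	if (IS_ERR(sb))
		return PTR_ERR(sb);	/* superblock is being freed */
	if (!sb)
		return -ENOENT;		/* no superblock for this ns (illustrative) */

	/* ... work that relies on the superblock staying alive ... */

	/* Drop the reference taken by kernfs_pin_sb(). */
	deactivate_super(sb);
	return 0;
}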
@@ -305,6 +305,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 			       struct kernfs_root *root, unsigned long magic,
 			       bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 void kernfs_init(void);
...
@@ -1648,10 +1648,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data)
 {
+	struct super_block *pinned_sb = NULL;
+	struct cgroup_subsys *ss;
 	struct cgroup_root *root;
 	struct cgroup_sb_opts opts;
 	struct dentry *dentry;
 	int ret;
+	int i;
 	bool new_sb;
 
 	/*
@@ -1677,6 +1680,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		goto out_unlock;
 	}
 
+	/*
+	 * Destruction of cgroup root is asynchronous, so subsystems may
+	 * still be dying after the previous unmount.  Let's drain the
+	 * dying subsystems.  We just need to ensure that the ones
+	 * unmounted previously finish dying and don't care about new ones
+	 * starting.  Testing ref liveliness is good enough.
+	 */
+	for_each_subsys(ss, i) {
+		if (!(opts.subsys_mask & (1 << i)) ||
+		    ss->root == &cgrp_dfl_root)
+			continue;
+
+		if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
+			mutex_unlock(&cgroup_mutex);
+			msleep(10);
+			ret = restart_syscall();
+			goto out_free;
+		}
+		cgroup_put(&ss->root->cgrp);
+	}
+
 	for_each_root(root) {
 		bool name_match = false;
@@ -1717,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	}
 
 	/*
-	 * A root's lifetime is governed by its root cgroup.
-	 * tryget_live failure indicate that the root is being
-	 * destroyed.  Wait for destruction to complete so that the
-	 * subsystems are free.  We can use wait_queue for the wait
-	 * but this path is super cold.  Let's just sleep for a bit
-	 * and retry.
-	 */
-	if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+	 * We want to reuse @root whose lifetime is governed by its
+	 * ->cgrp.  Let's check whether @root is alive and keep it
+	 * that way.  As cgroup_kill_sb() can happen anytime, we
+	 * want to block it by pinning the sb so that @root doesn't
+	 * get killed before mount is complete.
+	 *
+	 * With the sb pinned, tryget_live can reliably indicate
+	 * whether @root can be reused.  If it's being killed,
+	 * drain it.  We can use wait_queue for the wait but this
+	 * path is super cold.  Let's just sleep a bit and retry.
+	 */
+	pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+	if (IS_ERR(pinned_sb) ||
+	    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
 		mutex_unlock(&cgroup_mutex);
+		if (!IS_ERR_OR_NULL(pinned_sb))
+			deactivate_super(pinned_sb);
 		msleep(10);
 		ret = restart_syscall();
 		goto out_free;
@@ -1770,6 +1802,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			      CGROUP_SUPER_MAGIC, &new_sb);
 	if (IS_ERR(dentry) || !new_sb)
 		cgroup_put(&root->cgrp);
+
+	/*
+	 * If @pinned_sb, we're reusing an existing root and holding an
+	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 */
+	if (pinned_sb) {
+		WARN_ON(new_sb);
+		deactivate_super(pinned_sb);
+	}
+
 	return dentry;
 }
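Taken together, the cgroup_mount() hunks above implement a pin-then-check reuse path. The following is a condensed, schematic view of that flow; error labels, locking context, and the surrounding function body are elided, so treat it as a sketch rather than a drop-in excerpt.

	/* Pin the existing sb first so cgroup_kill_sb() can't race in. */
	pinned_sb = kernfs_pin_sb(root->kf_root, NULL);

	/* Only with the sb pinned does tryget_live reliably mean "reusable". */
	if (IS_ERR(pinned_sb) ||
	    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
		mutex_unlock(&cgroup_mutex);
		if (!IS_ERR_OR_NULL(pinned_sb))
			deactivate_super(pinned_sb);	/* undo the pin before retrying */
		msleep(10);
		return ERR_PTR(restart_syscall());
	}

	/* ... mount proceeds and reuses @root; the mount path holds its own sb ref ... */

	/* Mount complete: drop the extra reference taken by kernfs_pin_sb(). */
	if (pinned_sb) {
		WARN_ON(new_sb);
		deactivate_super(pinned_sb);
	}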
@@ -3328,7 +3370,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 	rcu_read_lock();
 	css_for_each_child(child, css) {
-		if (css->flags & CSS_ONLINE) {
+		if (child->flags & CSS_ONLINE) {
 			ret = true;
 			break;
 		}
...
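The css_has_online_children() hunk only shows the loop body, which makes the one-character bug easy to miss: the old code tested the parent's own flags for every child, so the loop reported the parent's online state (whenever it had any child at all) instead of the children's. Reconstructed in full, with the lines outside the hunk inferred from the surrounding kernel source, the fixed helper reads roughly:

bool css_has_online_children(struct cgroup_subsys_state *css)
{
	struct cgroup_subsys_state *child;
	bool ret = false;

	rcu_read_lock();
	css_for_each_child(child, css) {
		/* must test the child being iterated, not the parent @css */
		if (child->flags & CSS_ONLINE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}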
@@ -1181,7 +1181,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 int current_cpuset_is_being_rebound(void)
 {
-	return task_cs(current) == cpuset_being_rebound;
+	int ret;
+
+	rcu_read_lock();
+	ret = task_cs(current) == cpuset_being_rebound;
+	rcu_read_unlock();
+
+	return ret;
 }
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
@@ -1617,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 * resources, wait for the previously scheduled operations before
 	 * proceeding, so that we don't end up keep removing tasks added
 	 * after execution capability is restored.
+	 *
+	 * cpuset_hotplug_work calls back into cgroup core via
+	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+	 * operation like this one can lead to a deadlock through kernfs
+	 * active_ref protection.  Let's break the protection.  Losing the
+	 * protection is okay as we check whether @cs is online after
+	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
+	 * hierarchies.
 	 */
+	css_get(&cs->css);
+	kernfs_break_active_protection(of->kn);
 	flush_work(&cpuset_hotplug_work);
 
 	mutex_lock(&cpuset_mutex);
@@ -1645,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
+	kernfs_unbreak_active_protection(of->kn);
+	css_put(&cs->css);
 	return retval ?: nbytes;
 }
...
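The css_get()/kernfs_break_active_protection() and kernfs_unbreak_active_protection()/css_put() pairs added above bracket the whole write handler. As a schematic, with the handler body and the online check condensed from cpuset_write_resmask() rather than copied verbatim, the shape of the fix is:

static ssize_t cpuset_write_resmask_schematic(struct kernfs_open_file *of,
					      char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	int retval = -ENODEV;

	/* Keep @cs alive while kernfs active protection is dropped. */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);

	/* The flushed work may itself take kernfs active refs without deadlocking now. */
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))	/* re-check; the active ref no longer guarantees it */
		goto out_unlock;

	retval = 0;			/* ... actual mask update elided ... */
out_unlock:
	mutex_unlock(&cpuset_mutex);
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	return retval ?: nbytes;
}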
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	} else
 		*new = *old;
 
-	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 		else
 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
-	rcu_read_unlock();
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
...
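The two mempolicy hunks are the other half of the "sleeping function called from invalid context" fix: cpuset_mems_allowed() can sleep (it takes a mutex internally), which was illegal inside the rcu_read_lock() section that __mpol_dup() used to open only so that task_cs() in current_cpuset_is_being_rebound() was safe. With the RCU read section moved into that helper (see the cpuset hunk above), the caller side reduces to plain calls, roughly as sketched here (condensed, not a verbatim excerpt of __mpol_dup()):

	if (current_cpuset_is_being_rebound()) {	/* takes rcu_read_lock() internally now */
		nodemask_t mems = cpuset_mems_allowed(current);	/* may sleep; legal outside RCU */

		/* ... rebind @new against @mems as in the hunks above ... */
		mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
	}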