Commit 06b285bd authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe

blkcg: fix blkcg_policy_data allocation bug

e48453c3 ("block, cgroup: implement policy-specific per-blkcg
data") updated per-blkcg policy data to be dynamically allocated.
When a policy is registered, its policy data aren't created.  Instead,
when the policy is activated on a queue, the policy data are allocated
if there are blkg's (blkcg_gq's) which are attached to a given blkcg.
This is buggy.  Consider the following scenario.

1. A blkcg is created.  No blkg's attached yet.

2. The policy is registered.  No policy data is allocated.

3. The policy is activated on a queue.  As the above blkcg doesn't
   have any blkg's, it won't allocate the matching blkcg_policy_data.

4. An IO is issued from the blkcg and blkg is created and the blkcg
   still doesn't have the matching policy data allocated.

With cfq-iosched, this leads to an oops.

It also doesn't free policy data on policy unregistration assuming
that freeing of all policy data on blkcg destruction should take care
of it; however, this also is incorrect.

1. A blkcg has policy data.

2. The policy gets unregistered but the policy data remains.

3. Another policy gets registered on the same slot.

4. Later, the new policy tries to allocate policy data on the previous
   blkcg but the slot is already occupied and gets skipped.  The
   policy ends up operating on the policy data of the previous policy.

There's no reason to manage blkcg_policy_data lazily.  The reason we
do lazy allocation of blkg's is that the number of all possible blkg's
is the product of cgroups and block devices which can reach a
surprising level.  blkcg_policy_data is contrained by the number of
cgroups and shouldn't be a problem.

This patch makes blkcg_policy_data to be allocated for all existing
blkcg's on policy registration and freed on unregistration and removes
blkcg_policy_data handling from policy [de]activation paths.  This
makes that blkcg_policy_data are created and removed with the policy
they belong to and fixes the above described problems.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Fixes: e48453c3 ("block, cgroup: implement policy-specific per-blkcg data")
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent 7876f930
...@@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q, ...@@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol) const struct blkcg_policy *pol)
{ {
LIST_HEAD(pds); LIST_HEAD(pds);
LIST_HEAD(cpds);
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
struct blkg_policy_data *pd, *nd; struct blkg_policy_data *pd, *nd;
struct blkcg_policy_data *cpd, *cnd;
int cnt = 0, ret; int cnt = 0, ret;
if (blkcg_policy_enabled(q, pol)) if (blkcg_policy_enabled(q, pol))
...@@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q, ...@@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q,
cnt++; cnt++;
spin_unlock_irq(q->queue_lock); spin_unlock_irq(q->queue_lock);
/* /* allocate per-blkg policy data for all existing blkgs */
* Allocate per-blkg and per-blkcg policy data
* for all existing blkgs.
*/
while (cnt--) { while (cnt--) {
pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
if (!pd) { if (!pd) {
...@@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q, ...@@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q,
goto out_free; goto out_free;
} }
list_add_tail(&pd->alloc_node, &pds); list_add_tail(&pd->alloc_node, &pds);
if (!pol->cpd_size)
continue;
cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
if (!cpd) {
ret = -ENOMEM;
goto out_free;
}
list_add_tail(&cpd->alloc_node, &cpds);
} }
/* /*
...@@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q, ...@@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q,
spin_lock_irq(q->queue_lock); spin_lock_irq(q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) { list_for_each_entry(blkg, &q->blkg_list, q_node) {
if (WARN_ON(list_empty(&pds)) || if (WARN_ON(list_empty(&pds))) {
WARN_ON(pol->cpd_size && list_empty(&cpds))) {
/* umm... this shouldn't happen, just abort */ /* umm... this shouldn't happen, just abort */
ret = -ENOMEM; ret = -ENOMEM;
goto out_unlock; goto out_unlock;
} }
cpd = list_first_entry(&cpds, struct blkcg_policy_data,
alloc_node);
list_del_init(&cpd->alloc_node);
pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
list_del_init(&pd->alloc_node); list_del_init(&pd->alloc_node);
/* grab blkcg lock too while installing @pd on @blkg */ /* grab blkcg lock too while installing @pd on @blkg */
spin_lock(&blkg->blkcg->lock); spin_lock(&blkg->blkcg->lock);
if (!pol->cpd_size)
goto no_cpd;
if (!blkg->blkcg->pd[pol->plid]) {
/* Per-policy per-blkcg data */
blkg->blkcg->pd[pol->plid] = cpd;
cpd->plid = pol->plid;
pol->cpd_init_fn(blkg->blkcg);
} else { /* must free it as it has already been extracted */
kfree(cpd);
}
no_cpd:
blkg->pd[pol->plid] = pd; blkg->pd[pol->plid] = pd;
pd->blkg = blkg; pd->blkg = blkg;
pd->plid = pol->plid; pd->plid = pol->plid;
...@@ -1135,8 +1106,6 @@ int blkcg_activate_policy(struct request_queue *q, ...@@ -1135,8 +1106,6 @@ int blkcg_activate_policy(struct request_queue *q,
blk_queue_bypass_end(q); blk_queue_bypass_end(q);
list_for_each_entry_safe(pd, nd, &pds, alloc_node) list_for_each_entry_safe(pd, nd, &pds, alloc_node)
kfree(pd); kfree(pd);
list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
kfree(cpd);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(blkcg_activate_policy); EXPORT_SYMBOL_GPL(blkcg_activate_policy);
...@@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); ...@@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
*/ */
int blkcg_policy_register(struct blkcg_policy *pol) int blkcg_policy_register(struct blkcg_policy *pol)
{ {
struct blkcg *blkcg;
int i, ret; int i, ret;
if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
...@@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) ...@@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
if (i >= BLKCG_MAX_POLS) if (i >= BLKCG_MAX_POLS)
goto err_unlock; goto err_unlock;
/* register and update blkgs */ /* register @pol */
pol->plid = i; pol->plid = i;
blkcg_policy[i] = pol; blkcg_policy[pol->plid] = pol;
/* allocate and install cpd's */
if (pol->cpd_size) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
struct blkcg_policy_data *cpd;
cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
if (!cpd) {
mutex_unlock(&blkcg_pol_mutex);
goto err_free_cpds;
}
blkcg->pd[pol->plid] = cpd;
cpd->plid = pol->plid;
pol->cpd_init_fn(blkcg);
}
}
mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_mutex);
/* everything is in place, add intf files for the new policy */ /* everything is in place, add intf files for the new policy */
...@@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol) ...@@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol)
mutex_unlock(&blkcg_pol_register_mutex); mutex_unlock(&blkcg_pol_register_mutex);
return 0; return 0;
err_free_cpds:
if (pol->cpd_size) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
kfree(blkcg->pd[pol->plid]);
blkcg->pd[pol->plid] = NULL;
}
}
blkcg_policy[pol->plid] = NULL;
err_unlock: err_unlock:
mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_mutex);
mutex_unlock(&blkcg_pol_register_mutex); mutex_unlock(&blkcg_pol_register_mutex);
...@@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); ...@@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
*/ */
void blkcg_policy_unregister(struct blkcg_policy *pol) void blkcg_policy_unregister(struct blkcg_policy *pol)
{ {
struct blkcg *blkcg;
mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_register_mutex);
if (WARN_ON(blkcg_policy[pol->plid] != pol)) if (WARN_ON(blkcg_policy[pol->plid] != pol))
...@@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) ...@@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
if (pol->cftypes) if (pol->cftypes)
cgroup_rm_cftypes(pol->cftypes); cgroup_rm_cftypes(pol->cftypes);
/* unregister and update blkgs */ /* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex); mutex_lock(&blkcg_pol_mutex);
if (pol->cpd_size) {
list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
kfree(blkcg->pd[pol->plid]);
blkcg->pd[pol->plid] = NULL;
}
}
blkcg_policy[pol->plid] = NULL; blkcg_policy[pol->plid] = NULL;
mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_mutex);
out_unlock: out_unlock:
mutex_unlock(&blkcg_pol_register_mutex); mutex_unlock(&blkcg_pol_register_mutex);
......
...@@ -89,18 +89,12 @@ struct blkg_policy_data { ...@@ -89,18 +89,12 @@ struct blkg_policy_data {
* Policies that need to keep per-blkcg data which is independent * Policies that need to keep per-blkcg data which is independent
* from any request_queue associated to it must specify its size * from any request_queue associated to it must specify its size
* with the cpd_size field of the blkcg_policy structure and * with the cpd_size field of the blkcg_policy structure and
* embed a blkcg_policy_data in it. blkcg core allocates * embed a blkcg_policy_data in it. cpd_init() is invoked to let
* policy-specific per-blkcg structures lazily the first time * each policy handle per-blkcg data.
* they are actually needed, so it handles them together with
* blkgs. cpd_init() is invoked to let each policy handle
* per-blkcg data.
*/ */
struct blkcg_policy_data { struct blkcg_policy_data {
/* the policy id this per-policy data belongs to */ /* the policy id this per-policy data belongs to */
int plid; int plid;
/* used during policy activation */
struct list_head alloc_node;
}; };
/* association between a blk cgroup and a request queue */ /* association between a blk cgroup and a request queue */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment