Commit 0381411e authored by Tejun Heo, committed by Jens Axboe

blkcg: let blkcg core handle policy private data allocation

Currently, blkg's are embedded in the blkcg policies' private data
structures and are thus allocated and freed by the policies.  This
leads to duplicated code across policies, hinders implementing common
parts in the blkcg core with strong semantics, and forces duplicate
blkg's for the same cgroup-q association.

This patch introduces struct blkg_policy_data, a separate data
structure chained from the blkg.  Each policy specifies the amount of
private data it needs in its blkio_policy_type->pdata_size, and the
blkcg core takes care of allocating it along with the blkg.  The
private data can be accessed using blkg_to_pdata(), and the owning
blkg can be determined from the pdata using pdata_to_blkg().  The
blkio_alloc_group_fn() method is accordingly replaced by
blkio_init_group_fn().
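
As a rough sketch of what a policy now provides (the foo_* names are
made up for illustration; only pdata_size, blkio_init_group_fn() and
blkg_to_pdata() come from this patch, and the plid is a placeholder):

	/* hypothetical policy-private data; blkcg core allocates it with each blkg */
	struct foo_grp {
		u64 some_state;
	};

	static struct blkio_policy_type blkio_policy_foo;

	/* called by blkcg core once blkg + pdata have been allocated */
	static void foo_init_blkio_group(struct blkio_group *blkg)
	{
		struct foo_grp *fg = blkg_to_pdata(blkg, &blkio_policy_foo);

		fg->some_state = 0;
	}

	static struct blkio_policy_type blkio_policy_foo = {
		.ops = {
			.blkio_init_group_fn = foo_init_blkio_group,
			/* link/unlink/... ops as before */
		},
		.plid = BLKIO_POLICY_THROTL,	/* placeholder plid for the sketch */
		.pdata_size = sizeof(struct foo_grp),
	};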

For consistency, tg_of_blkg() and cfqg_of_blkg() are replaced with
blkg_to_tg() and blkg_to_cfqg() respectively, and functions to map in
the reverse direction are added.
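
Concretely, each policy wraps the generic helpers in thin, typed
converters; a sketch for the hypothetical foo policy above (the real
throttle versions, blkg_to_tg() and tg_to_blkg(), appear in the diff
below):

	static inline struct foo_grp *blkg_to_foo(struct blkio_group *blkg)
	{
		return blkg_to_pdata(blkg, &blkio_policy_foo);
	}

	static inline struct blkio_group *foo_to_blkg(struct foo_grp *fg)
	{
		return pdata_to_blkg(fg, &blkio_policy_foo);
	}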

Except that policy-specific data now lives in a separate data
structure from the blkg, this patch doesn't introduce any functional
difference.

This will be used to unify blkg's for different policies.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 923adde1
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -422,6 +422,70 @@ void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
 }
 EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
 
+/**
+ * blkg_free - free a blkg
+ * @blkg: blkg to free
+ *
+ * Free @blkg which may be partially allocated.
+ */
+static void blkg_free(struct blkio_group *blkg)
+{
+	if (blkg) {
+		free_percpu(blkg->stats_cpu);
+		kfree(blkg->pd);
+		kfree(blkg);
+	}
+}
+
+/**
+ * blkg_alloc - allocate a blkg
+ * @blkcg: block cgroup the new blkg is associated with
+ * @q: request_queue the new blkg is associated with
+ * @pol: policy the new blkg is associated with
+ *
+ * Allocate a new blkg associating @blkcg and @q for @pol.
+ *
+ * FIXME: Should be called with queue locked but currently isn't due to
+ * percpu stat breakage.
+ */
+static struct blkio_group *blkg_alloc(struct blkio_cgroup *blkcg,
+				      struct request_queue *q,
+				      struct blkio_policy_type *pol)
+{
+	struct blkio_group *blkg;
+
+	/* alloc and init base part */
+	blkg = kzalloc_node(sizeof(*blkg), GFP_ATOMIC, q->node);
+	if (!blkg)
+		return NULL;
+
+	spin_lock_init(&blkg->stats_lock);
+	rcu_assign_pointer(blkg->q, q);
+	blkg->blkcg = blkcg;
+	blkg->plid = pol->plid;
+	cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
+
+	/* alloc per-policy data */
+	blkg->pd = kzalloc_node(sizeof(*blkg->pd) + pol->pdata_size, GFP_ATOMIC,
+				q->node);
+	if (!blkg->pd) {
+		blkg_free(blkg);
+		return NULL;
+	}
+
+	/* broken, read comment in the callsite */
+	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
+	if (!blkg->stats_cpu) {
+		blkg_free(blkg);
+		return NULL;
+	}
+
+	/* attach pd to blkg and invoke per-policy init */
+	blkg->pd->blkg = blkg;
+	pol->ops.blkio_init_group_fn(blkg);
+	return blkg;
+}
+
 struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 				       struct request_queue *q,
 				       enum blkio_policy_id plid,
@@ -463,19 +527,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
-	new_blkg = pol->ops.blkio_alloc_group_fn(q, blkcg);
-	if (new_blkg) {
-		new_blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
-
-		spin_lock_init(&new_blkg->stats_lock);
-		rcu_assign_pointer(new_blkg->q, q);
-		new_blkg->blkcg = blkcg;
-		new_blkg->plid = plid;
-		cgroup_path(blkcg->css.cgroup, new_blkg->path,
-			    sizeof(new_blkg->path));
-	} else {
-		css_put(&blkcg->css);
-	}
+	new_blkg = blkg_alloc(blkcg, q, pol);
 
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
@@ -492,7 +544,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 		goto out;
 
 	/* did alloc fail? */
-	if (unlikely(!new_blkg || !new_blkg->stats_cpu)) {
+	if (unlikely(!new_blkg)) {
 		blkg = ERR_PTR(-ENOMEM);
 		goto out;
 	}
@@ -504,11 +556,7 @@ struct blkio_group *blkg_lookup_create(struct blkio_cgroup *blkcg,
 	pol->ops.blkio_link_group_fn(q, blkg);
 	spin_unlock(&blkcg->lock);
 out:
-	if (new_blkg) {
-		free_percpu(new_blkg->stats_cpu);
-		kfree(new_blkg);
-		css_put(&blkcg->css);
-	}
+	blkg_free(new_blkg);
 	return blkg;
 }
 EXPORT_SYMBOL_GPL(blkg_lookup_create);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -159,6 +159,15 @@ struct blkio_group_conf {
 	u64 bps[2];
 };
 
+/* per-blkg per-policy data */
+struct blkg_policy_data {
+	/* the blkg this per-policy data belongs to */
+	struct blkio_group *blkg;
+
+	/* pol->pdata_size bytes of private data used by policy impl */
+	char pdata[] __aligned(__alignof__(unsigned long long));
+};
+
 struct blkio_group {
 	/* Pointer to the associated request_queue, RCU protected */
 	struct request_queue __rcu *q;
@@ -177,10 +186,11 @@ struct blkio_group {
 	struct blkio_group_stats stats;
 	/* Per cpu stats pointer */
 	struct blkio_group_stats_cpu __percpu *stats_cpu;
+	struct blkg_policy_data *pd;
 };
 
-typedef struct blkio_group *(blkio_alloc_group_fn)(struct request_queue *q,
-						   struct blkio_cgroup *blkcg);
+typedef void (blkio_init_group_fn)(struct blkio_group *blkg);
 typedef void (blkio_link_group_fn)(struct request_queue *q,
 				   struct blkio_group *blkg);
 typedef void (blkio_unlink_group_fn)(struct request_queue *q,
@@ -198,7 +208,7 @@ typedef void (blkio_update_group_write_iops_fn)(struct request_queue *q,
 			struct blkio_group *blkg, unsigned int write_iops);
 
 struct blkio_policy_ops {
-	blkio_alloc_group_fn *blkio_alloc_group_fn;
+	blkio_init_group_fn *blkio_init_group_fn;
 	blkio_link_group_fn *blkio_link_group_fn;
 	blkio_unlink_group_fn *blkio_unlink_group_fn;
 	blkio_clear_queue_fn *blkio_clear_queue_fn;
@@ -213,6 +223,7 @@ struct blkio_policy_type {
 	struct list_head list;
 	struct blkio_policy_ops ops;
 	enum blkio_policy_id plid;
+	size_t pdata_size;	/* policy specific private data size */
 };
 
 extern int blkcg_init_queue(struct request_queue *q);
@@ -224,6 +235,38 @@ extern void blkio_policy_register(struct blkio_policy_type *);
 extern void blkio_policy_unregister(struct blkio_policy_type *);
 extern void blkg_destroy_all(struct request_queue *q);
 
+/**
+ * blkg_to_pdata - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline void *blkg_to_pdata(struct blkio_group *blkg,
+				  struct blkio_policy_type *pol)
+{
+	return blkg ? blkg->pd->pdata : NULL;
+}
+
+/**
+ * pdata_to_blkg - get blkg associated with policy private data
+ * @pdata: policy private data of interest
+ * @pol: policy @pdata is for
+ *
+ * @pdata is policy private data for @pol.  Determine the blkg it's
+ * associated with.
+ */
+static inline struct blkio_group *pdata_to_blkg(void *pdata,
+						struct blkio_policy_type *pol)
+{
+	if (pdata) {
+		struct blkg_policy_data *pd =
+			container_of(pdata, struct blkg_policy_data, pdata);
+		return pd->blkg;
+	}
+	return NULL;
+}
+
 static inline char *blkg_path(struct blkio_group *blkg)
 {
 	return blkg->path;
@@ -244,6 +287,10 @@ static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
 static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
 static inline void blkg_destroy_all(struct request_queue *q) { }
 
+static inline void *blkg_to_pdata(struct blkio_group *blkg,
+				  struct blkio_policy_type *pol) { return NULL; }
+static inline struct blkio_group *pdata_to_blkg(void *pdata,
+				  struct blkio_policy_type *pol) { return NULL; }
 static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
 
 #endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -21,6 +21,8 @@ static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;	/* 100 ms */
 
+static struct blkio_policy_type blkio_policy_throtl;
+
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 static void throtl_schedule_delayed_work(struct throtl_data *td,
@@ -52,7 +54,6 @@ struct throtl_grp {
 	 */
 	unsigned long disptime;
 
-	struct blkio_group blkg;
 	atomic_t ref;
 	unsigned int flags;
 
@@ -108,6 +109,16 @@ struct throtl_data
 	int limits_changed;
 };
 
+static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
+{
+	return blkg_to_pdata(blkg, &blkio_policy_throtl);
+}
+
+static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
+{
+	return pdata_to_blkg(tg, &blkio_policy_throtl);
+}
+
 enum tg_state_flags {
 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
 };
@@ -130,19 +141,11 @@ THROTL_TG_FNS(on_rr);
 
 #define throtl_log_tg(td, tg, fmt, args...)				\
 	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
-				blkg_path(&(tg)->blkg), ##args);	\
+				blkg_path(tg_to_blkg(tg)), ##args);	\
 
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 
-static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
-{
-	if (blkg)
-		return container_of(blkg, struct throtl_grp, blkg);
-
-	return NULL;
-}
-
 static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
 	return td->nr_queued[0] + td->nr_queued[1];
@@ -156,21 +159,24 @@ static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
 
 static void throtl_free_tg(struct rcu_head *head)
 {
-	struct throtl_grp *tg;
+	struct throtl_grp *tg = container_of(head, struct throtl_grp, rcu_head);
+	struct blkio_group *blkg = tg_to_blkg(tg);
 
-	tg = container_of(head, struct throtl_grp, rcu_head);
-	free_percpu(tg->blkg.stats_cpu);
-	kfree(tg);
+	free_percpu(blkg->stats_cpu);
+	kfree(blkg->pd);
+	kfree(blkg);
 }
 
 static void throtl_put_tg(struct throtl_grp *tg)
 {
+	struct blkio_group *blkg = tg_to_blkg(tg);
+
 	BUG_ON(atomic_read(&tg->ref) <= 0);
 	if (!atomic_dec_and_test(&tg->ref))
 		return;
 
 	/* release the extra blkcg reference this blkg has been holding */
-	css_put(&tg->blkg.blkcg->css);
+	css_put(&blkg->blkcg->css);
 
 	/*
 	 * A group is freed in rcu manner. But having an rcu lock does not
@@ -184,14 +190,9 @@ static void throtl_put_tg(struct throtl_grp *tg)
 	call_rcu(&tg->rcu_head, throtl_free_tg);
 }
 
-static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
-						    struct blkio_cgroup *blkcg)
+static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
-	struct throtl_grp *tg;
-
-	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, q->node);
-	if (!tg)
-		return NULL;
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	INIT_HLIST_NODE(&tg->tg_node);
 	RB_CLEAR_NODE(&tg->rb_node);
@@ -211,15 +212,13 @@ static struct blkio_group *throtl_alloc_blkio_group(struct request_queue *q,
 	 * exit or cgroup deletion path depending on who is exiting first.
 	 */
 	atomic_set(&tg->ref, 1);
-
-	return &tg->blkg;
 }
 
 static void throtl_link_blkio_group(struct request_queue *q,
 				    struct blkio_group *blkg)
 {
 	struct throtl_data *td = q->td;
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	hlist_add_head(&tg->tg_node, &td->tg_list);
 	td->nr_undestroyed_grps++;
@@ -235,7 +234,7 @@ throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 	if (blkcg == &blkio_root_cgroup)
 		return td->root_tg;
 
-	return tg_of_blkg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
+	return blkg_to_tg(blkg_lookup(blkcg, td->queue, BLKIO_POLICY_THROTL));
 }
 
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
@@ -257,7 +256,7 @@ static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 
 		/* if %NULL and @q is alive, fall back to root_tg */
 		if (!IS_ERR(blkg))
-			tg = tg_of_blkg(blkg);
+			tg = blkg_to_tg(blkg);
 		else if (!blk_queue_dead(q))
 			tg = td->root_tg;
 	}
@@ -639,7 +638,7 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 	tg->bytes_disp[rw] += bio->bi_size;
 	tg->io_disp[rw]++;
 
-	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
+	blkiocg_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, rw, sync);
 }
 
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
@@ -901,7 +900,7 @@ static bool throtl_release_tgs(struct throtl_data *td, bool release_root)
 		 * it from cgroup list, then it will take care of destroying
 		 * cfqg also.
 		 */
-		if (!blkiocg_del_blkio_group(&tg->blkg))
+		if (!blkiocg_del_blkio_group(tg_to_blkg(tg)))
 			throtl_destroy_tg(td, tg);
 		else
 			empty = false;
@@ -929,7 +928,7 @@ void throtl_unlink_blkio_group(struct request_queue *q,
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	throtl_destroy_tg(q->td, tg_of_blkg(blkg));
+	throtl_destroy_tg(q->td, blkg_to_tg(blkg));
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -968,7 +967,7 @@ static void throtl_update_blkio_group_common(struct throtl_data *td,
 static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 read_bps)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[READ] = read_bps;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -977,7 +976,7 @@ static void throtl_update_blkio_group_read_bps(struct request_queue *q,
 static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 				struct blkio_group *blkg, u64 write_bps)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->bps[WRITE] = write_bps;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -986,7 +985,7 @@ static void throtl_update_blkio_group_write_bps(struct request_queue *q,
 static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 				struct blkio_group *blkg, unsigned int read_iops)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[READ] = read_iops;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -995,7 +994,7 @@ static void throtl_update_blkio_group_read_iops(struct request_queue *q,
 static void throtl_update_blkio_group_write_iops(struct request_queue *q,
 				struct blkio_group *blkg, unsigned int write_iops)
 {
-	struct throtl_grp *tg = tg_of_blkg(blkg);
+	struct throtl_grp *tg = blkg_to_tg(blkg);
 
 	tg->iops[WRITE] = write_iops;
 	throtl_update_blkio_group_common(q->td, tg);
@@ -1010,7 +1009,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
 
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
-		.blkio_alloc_group_fn = throtl_alloc_blkio_group,
+		.blkio_init_group_fn = throtl_init_blkio_group,
 		.blkio_link_group_fn = throtl_link_blkio_group,
 		.blkio_unlink_group_fn = throtl_unlink_blkio_group,
 		.blkio_clear_queue_fn = throtl_clear_queue,
@@ -1024,6 +1023,7 @@ static struct blkio_policy_type blkio_policy_throtl = {
 			throtl_update_blkio_group_write_iops,
 	},
 	.plid = BLKIO_POLICY_THROTL,
+	.pdata_size = sizeof(struct throtl_grp),
 };
 
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
@@ -1049,8 +1049,9 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
-			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-					rw, rw_is_sync(bio->bi_rw));
+			blkiocg_update_dispatch_stats(tg_to_blkg(tg),
+						      bio->bi_size, rw,
+						      rw_is_sync(bio->bi_rw));
 			goto out_unlock_rcu;
 		}
 	}
@@ -1176,7 +1177,7 @@ int blk_throtl_init(struct request_queue *q)
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, BLKIO_POLICY_THROTL,
 				  true);
 	if (!IS_ERR(blkg))
-		td->root_tg = tg_of_blkg(blkg);
+		td->root_tg = blkg_to_tg(blkg);
 
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
@@ -1207,7 +1208,7 @@ void blk_throtl_exit(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	/*
-	 * Wait for tg->blkg->q accessors to exit their grace periods.
+	 * Wait for tg_to_blkg(tg)->q accessors to exit their grace periods.
 	 * Do this wait only if there are other undestroyed groups out
 	 * there (other than root group). This can happen if cgroup deletion
 	 * path claimed the responsibility of cleaning up a group before
(Remainder of the diff, including the corresponding cfq-iosched.c changes, is collapsed.)
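
As an aside, the pdata round-trip above relies only on standard
container_of() arithmetic over a flexible array member; the following
self-contained userspace sketch (illustrative names, not kernel API)
demonstrates the same pattern:

	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* mirrors struct blkg_policy_data: a small header plus policy-sized tail */
	struct policy_data {
		void *owner;		/* plays the role of pd->blkg */
		char pdata[] __attribute__((aligned(__alignof__(unsigned long long))));
	};

	int main(void)
	{
		size_t pdata_size = 64;	/* stand-in for pol->pdata_size */
		struct policy_data *pd = calloc(1, sizeof(*pd) + pdata_size);

		if (!pd)
			return 1;
		pd->owner = &pd;	/* any distinctive pointer will do */

		/* blkg_to_pdata() direction: header to private data */
		void *pdata = pd->pdata;

		/* pdata_to_blkg() direction: private data back to the header */
		struct policy_data *back =
			container_of(pdata, struct policy_data, pdata);

		printf("round-trip ok: %d\n", back == pd && back->owner == &pd);
		free(pd);
		return 0;
	}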