Commit 1de0c4cd authored by Arianna Avanzini, committed by Jens Axboe

block, bfq: reduce idling only in symmetric scenarios

A seeky queue (i.e., a queue containing random requests) is assigned a
very small device-idling slice, for throughput reasons. Unfortunately,
given the process associated with a seeky queue, this behavior causes
the following problem: if the process, say P, performs sync I/O and
has a higher weight than some other processes doing I/O and associated
with non-seeky queues, then BFQ may fail to guarantee to P its
reserved share of the throughput. The reason is that idling is key
for providing service guarantees to processes doing sync I/O [1].

This commit addresses this issue by allowing the device-idling slice
to be reduced for a seeky queue only if the scenario happens to be
symmetric, i.e., if all the queues are to receive the same share of
the throughput.

[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
    Scheduler", Proceedings of the First Workshop on Mobile System
    Technologies (MST-2015), May 2015.
    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Riccardo Pizzetti <riccardo.pizzetti@gmail.com>
Signed-off-by: Samuele Zecchini <samuele.zecchini92@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent 36eca894
@@ -182,6 +182,20 @@ struct bfq_sched_data {
};
/**
* struct bfq_weight_counter - counter of the number of all active entities
* with a given weight.
*/
struct bfq_weight_counter {
unsigned int weight; /* weight of the entities this counter refers to */
unsigned int num_active; /* nr of active entities with this weight */
/*
* Weights tree member (see bfq_data's @queue_weights_tree and
* @group_weights_tree)
*/
struct rb_node weights_node;
};
/**
* struct bfq_entity - schedulable entity.
*
@@ -212,6 +226,8 @@ struct bfq_sched_data {
struct bfq_entity {
/* service_tree member */
struct rb_node rb_node;
/* pointer to the weight counter associated with this entity */
struct bfq_weight_counter *weight_counter;
/*
* Flag, true if the entity is on a tree (either the active or
@@ -455,6 +471,25 @@ struct bfq_data {
/* root bfq_group for the device */
struct bfq_group *root_group;
/*
* rbtree of weight counters of @bfq_queues, sorted by
* weight. Used to keep track of whether all @bfq_queues have
* the same weight. The tree contains one counter for each
* distinct weight associated to some active and not
* weight-raised @bfq_queue (see the comments to the functions
* bfq_weights_tree_[add|remove] for further details).
*/
struct rb_root queue_weights_tree;
/*
* rbtree of non-queue @bfq_entity weight counters, sorted by
* weight. Used to keep track of whether all @bfq_groups have
* the same weight. The tree contains one counter for each
* distinct weight associated to some active @bfq_group (see
* the comments to the functions bfq_weights_tree_[add|remove]
* for further details).
*/
struct rb_root group_weights_tree;
/*
* Number of bfq_queues containing requests (including the
* queue in service, even if it is idling).
@@ -791,6 +826,11 @@ struct bfq_group_data {
* to avoid too many special cases during group creation/
* migration.
* @stats: stats for this bfqg.
* @active_entities: number of active entities belonging to the group;
* unused for the root group. Used to know whether there
* are groups with more than one active @bfq_entity
* (see the comments to the function
* bfq_bfqq_may_idle()).
* @rq_pos_tree: rbtree sorted by next_request position, used when
* determining if two or more queues have interleaving
* requests (see bfq_find_close_cooperator()).
@@ -818,6 +858,8 @@ struct bfq_group {
struct bfq_entity *my_entity;
int active_entities;
struct rb_root rq_pos_tree;
struct bfqg_stats stats;
@@ -1254,12 +1296,27 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
* a candidate for next service (i.e., a candidate entity to serve
* after the in-service entity is expired). The function then returns
* true.
*
* In contrast, the entity could still be a candidate for next service
* if it is not a queue, and has more than one child. In fact, even if
* one of its children is about to be set in service, other children
* may still be the next to serve. As a consequence, a non-queue
* entity is not a candidate for next-service only if it has only one
* child. Only if this condition holds does the function return true
* for a non-queue entity.
*/
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
struct bfq_group *bfqg;
if (bfq_entity_to_bfqq(entity))
return true;
bfqg = container_of(entity, struct bfq_group, entity);
if (bfqg->active_entities == 1)
return true;
return false;
}
@@ -1498,6 +1555,15 @@ static void bfq_update_active_tree(struct rb_node *node)
goto up;
}
static void bfq_weights_tree_add(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root);
static void bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root);
/**
* bfq_active_insert - insert an entity in the active tree of its
* group/device.
@@ -1536,6 +1602,13 @@ static void bfq_active_insert(struct bfq_service_tree *st,
#endif
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
else /* bfq_group */
bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
if (bfqg != bfqd->root_group)
bfqg->active_entities++;
#endif
}
/**
@@ -1631,6 +1704,14 @@ static void bfq_active_extract(struct bfq_service_tree *st,
#endif
if (bfqq)
list_del(&bfqq->bfqq_list);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
else /* bfq_group */
bfq_weights_tree_remove(bfqd, entity,
&bfqd->group_weights_tree);
if (bfqg != bfqd->root_group)
bfqg->active_entities--;
#endif
}
/**
@@ -1731,6 +1812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned int prev_weight, new_weight;
struct bfq_data *bfqd = NULL;
struct rb_root *root;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_sched_data *sd;
struct bfq_group *bfqg;
@@ -1780,7 +1862,26 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
prev_weight = entity->weight;
new_weight = entity->orig_weight *
(bfqq ? bfqq->wr_coeff : 1);
/*
* If the weight of the entity changes, remove the entity
* from its old weight counter (if there is a counter
* associated with the entity), and add it to the counter
* associated with its new weight.
*/
if (prev_weight != new_weight) {
root = bfqq ? &bfqd->queue_weights_tree :
&bfqd->group_weights_tree;
bfq_weights_tree_remove(bfqd, entity, root);
}
entity->weight = new_weight;
/*
* Add the entity to its weights tree only if it is
* not associated with a weight-raised queue.
*/
if (prev_weight != new_weight &&
(bfqq ? bfqq->wr_coeff == 1 : 1))
/* If we get here, root has been initialized. */
bfq_weights_tree_add(bfqd, entity, root);
new_st->wsum += entity->weight;
@@ -2606,6 +2707,10 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->busy_queues--;
if (!bfqq->dispatched)
bfq_weights_tree_remove(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues--;
@@ -2626,6 +2731,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues++;
if (!bfqq->dispatched)
if (bfqq->wr_coeff == 1)
bfq_weights_tree_add(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
}
@@ -3028,6 +3138,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
* in bfq_init_queue()
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
bfqg->rq_pos_tree = RB_ROOT;
}
@@ -3915,6 +4026,158 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
}
/*
* Tell whether there are active queues or groups with differentiated weights.
*/
static bool bfq_differentiated_weights(struct bfq_data *bfqd)
{
/*
* For weights to differ, at least one of the trees must contain
* at least two nodes.
*/
return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
(bfqd->queue_weights_tree.rb_node->rb_left ||
bfqd->queue_weights_tree.rb_node->rb_right)
#ifdef CONFIG_BFQ_GROUP_IOSCHED
) ||
(!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
(bfqd->group_weights_tree.rb_node->rb_left ||
bfqd->group_weights_tree.rb_node->rb_right)
#endif
);
}
/*
* The following function returns true if every queue must receive the
* same share of the throughput (this condition is used when deciding
* whether idling may be disabled, see the comments in the function
* bfq_bfqq_may_idle()).
*
* Such a scenario occurs when:
* 1) all active queues have the same weight,
* 2) all active groups at the same level in the groups tree have the same
* weight,
* 3) all active groups at the same level in the groups tree have the same
* number of children.
*
* Unfortunately, keeping the necessary state for evaluating exactly the
* above symmetry conditions would be quite complex and time-consuming.
* Therefore this function evaluates, instead, the following stronger
* sub-conditions, for which it is much easier to maintain the needed
* state:
* 1) all active queues have the same weight,
* 2) all active groups have the same weight,
* 3) all active groups have at most one active child each.
* In particular, the last two conditions are always true if hierarchical
* support and the cgroups interface are not enabled, thus no state needs
* to be maintained in this case.
*/
static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
{
return !bfq_differentiated_weights(bfqd);
}
/*
* If the weight-counter tree passed as input contains no counter for
* the weight of the input entity, then add that counter; otherwise just
* increment the existing counter.
*
* Note that weight-counter trees contain few nodes in mostly symmetric
* scenarios. For example, if all queues have the same weight, then the
* weight-counter tree for the queues may contain at most one node.
* This holds even if low_latency is on, because weight-raised queues
* are not inserted in the tree.
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
static void bfq_weights_tree_add(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root)
{
struct rb_node **new = &(root->rb_node), *parent = NULL;
/*
* Do not insert if the entity is already associated with a
* counter, which happens if:
* 1) the entity is associated with a queue,
* 2) a request arrival has caused the queue to become both
* non-weight-raised, and hence change its weight, and
* backlogged; in this respect, each of the two events
* causes an invocation of this function,
* 3) this is the invocation of this function caused by the
* second event. This second invocation is actually useless,
* and we handle this fact by exiting immediately. More
* efficient or clearer solutions might possibly be adopted.
*/
if (entity->weight_counter)
return;
while (*new) {
struct bfq_weight_counter *__counter = container_of(*new,
struct bfq_weight_counter,
weights_node);
parent = *new;
if (entity->weight == __counter->weight) {
entity->weight_counter = __counter;
goto inc_counter;
}
if (entity->weight < __counter->weight)
new = &((*new)->rb_left);
else
new = &((*new)->rb_right);
}
entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
GFP_ATOMIC);
/*
* In the unlucky event of an allocation failure, we just
* exit. This will cause the weight of entity to not be
* considered in bfq_differentiated_weights, which, in its
* turn, causes the scenario to be deemed wrongly symmetric in
* case entity's weight would have been the only weight making
* the scenario asymmetric. On the bright side, no unbalance
* will however occur when entity becomes inactive again (the
* invocation of this function is triggered by an activation
* of entity). In fact, bfq_weights_tree_remove does nothing
* if !entity->weight_counter.
*/
if (unlikely(!entity->weight_counter))
return;
entity->weight_counter->weight = entity->weight;
rb_link_node(&entity->weight_counter->weights_node, parent, new);
rb_insert_color(&entity->weight_counter->weights_node, root);
inc_counter:
entity->weight_counter->num_active++;
}
/*
* Decrement the weight counter associated with the entity, and, if the
* counter reaches 0, remove the counter from the tree.
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
static void bfq_weights_tree_remove(struct bfq_data *bfqd,
struct bfq_entity *entity,
struct rb_root *root)
{
if (!entity->weight_counter)
return;
entity->weight_counter->num_active--;
if (entity->weight_counter->num_active > 0)
goto reset_entity_pointer;
rb_erase(&entity->weight_counter->weights_node, root);
kfree(entity->weight_counter);
reset_entity_pointer:
entity->weight_counter = NULL;
}
/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
@@ -5293,13 +5556,17 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
*/
sl = bfqd->bfq_slice_idle;
/*
* Unless the queue is being weight-raised or the scenario is
* asymmetric, grant only minimum idle time if the queue
* is seeky. A long idling is preserved for a weight-raised
* queue, or, more in general, in an asymmetric scenario,
* because a long idling is needed for guaranteeing to a queue
* its reserved share of the throughput (in particular, it is
* needed if the queue has a higher weight than some other
* queue).
*/
if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
bfq_symmetric_scenario(bfqd))
sl = min_t(u64, sl, BFQ_MIN_TT);
bfqd->last_idling_start = ktime_get();
@@ -7197,6 +7464,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* mechanism).
*/
bfqq->budget_timeout = jiffies;
bfq_weights_tree_remove(bfqd, &bfqq->entity,
&bfqd->queue_weights_tree);
}
now_ns = ktime_get_ns();
@@ -7627,6 +7897,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
HRTIMER_MODE_REL);
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
bfqd->queue_weights_tree = RB_ROOT;
bfqd->group_weights_tree = RB_ROOT;
INIT_LIST_HEAD(&bfqd->active_list);
INIT_LIST_HEAD(&bfqd->idle_list);
......