Commit dc6ae6d8 authored by Ilya Dryomov

crush: add chooseleaf_stable tunable

Add a tunable to fix a bug where chooseleaf may cause unnecessary PG
migrations when a device fails.

Reflects ceph.git commit fdb3f664448e80d984470f32f04e2e6f03ab52ec.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
parent 56a4f309
...@@ -59,7 +59,8 @@ enum { ...@@ -59,7 +59,8 @@ enum {
CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
}; };
/* /*
...@@ -205,6 +206,11 @@ struct crush_map { ...@@ -205,6 +206,11 @@ struct crush_map {
* mappings line up a bit better with previous mappings. */ * mappings line up a bit better with previous mappings. */
__u8 chooseleaf_vary_r; __u8 chooseleaf_vary_r;
/* if true, it makes chooseleaf firstn to return stable results (if
* no local retry) so that data migrations would be optimal when some
* device fails. */
__u8 chooseleaf_stable;
#ifndef __KERNEL__ #ifndef __KERNEL__
/* /*
* version 0 (original) of straw_calc has various flaws. version 1 * version 0 (original) of straw_calc has various flaws. version 1
......
...@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map, ...@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries * @local_retries: localized retries
* @local_fallback_retries: localized fallback retries * @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
* @stable: stable mode starts rep=0 in the recursive call for all replicas
* @vary_r: pass r to recursive calls * @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf) * @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent * @parent_r: r value passed from the parent
...@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_fallback_retries, unsigned int local_fallback_retries,
int recurse_to_leaf, int recurse_to_leaf,
unsigned int vary_r, unsigned int vary_r,
unsigned int stable,
int *out2, int *out2,
int parent_r) int parent_r)
{ {
...@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
int collide, reject; int collide, reject;
int count = out_size; int count = out_size;
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
recurse_to_leaf ? "_LEAF" : "", recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep, bucket->id, x, outpos, numrep,
tries, recurse_tries, local_retries, local_fallback_retries, tries, recurse_tries, local_retries, local_fallback_retries,
parent_r); parent_r, stable);
for (rep = outpos; rep < numrep && count > 0 ; rep++) { for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
/* keep trying until we get a non-out, non-colliding item */ /* keep trying until we get a non-out, non-colliding item */
ftotal = 0; ftotal = 0;
skip_rep = 0; skip_rep = 0;
...@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
if (crush_choose_firstn(map, if (crush_choose_firstn(map,
map->buckets[-1-item], map->buckets[-1-item],
weight, weight_max, weight, weight_max,
x, outpos+1, 0, x, stable ? 1 : outpos+1, 0,
out2, outpos, count, out2, outpos, count,
recurse_tries, 0, recurse_tries, 0,
local_retries, local_retries,
local_fallback_retries, local_fallback_retries,
0, 0,
vary_r, vary_r,
stable,
NULL, NULL,
sub_r) <= outpos) sub_r) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
...@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_fallback_retries = map->choose_local_fallback_tries; int choose_local_fallback_retries = map->choose_local_fallback_tries;
int vary_r = map->chooseleaf_vary_r; int vary_r = map->chooseleaf_vary_r;
int stable = map->chooseleaf_stable;
if ((__u32)ruleno >= map->max_rules) { if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno); dprintk(" bad ruleno %d\n", ruleno);
...@@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map, ...@@ -870,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
vary_r = curstep->arg1; vary_r = curstep->arg1;
break; break;
case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
if (curstep->arg1 >= 0)
stable = curstep->arg1;
break;
case CRUSH_RULE_CHOOSELEAF_FIRSTN: case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN: case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1; firstn = 1;
...@@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -932,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
choose_local_fallback_retries, choose_local_fallback_retries,
recurse_to_leaf, recurse_to_leaf,
vary_r, vary_r,
stable,
c+osize, c+osize,
0); 0);
} else { } else {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment