Commit 19f57256 authored by Eric Dumazet, committed by David S. Miller

fib_hash: RCU conversion phase 2

Get rid of fib_hash_lock rwlock.

The fn_zone hash table resize is the noticeable part of this patch.

I added a seqlock per fn_zone, so that readers can restart their lookup
in the (very rare) case where a writer has expanded the hash table.

Add rcu heads in fib_alias and fib_node, use call_rcu() to defer their
freeing, and use appropriate _rcu list manipulations.

Stress test (160.000.000 UDP frames sent, IP route cache disabled to
mimic a DDoS attack, FIB_HASH)

Before:
real	0m41.191s
user	0m13.137s
sys	8m55.241s

After:
real	0m38.091s
user	0m13.189s
sys	7m53.018s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 117a8cde
...@@ -58,7 +58,8 @@ struct fib_node { ...@@ -58,7 +58,8 @@ struct fib_node {
struct fn_zone { struct fn_zone {
struct fn_zone __rcu *fz_next; /* Next not empty zone */ struct fn_zone __rcu *fz_next; /* Next not empty zone */
struct hlist_head *fz_hash; /* Hash table pointer */ struct hlist_head __rcu *fz_hash; /* Hash table pointer */
seqlock_t fz_lock;
u32 fz_hashmask; /* (fz_divisor - 1) */ u32 fz_hashmask; /* (fz_divisor - 1) */
u8 fz_order; /* Zone order (0..32) */ u8 fz_order; /* Zone order (0..32) */
...@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) ...@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
return dst & FZ_MASK(fz); return dst & FZ_MASK(fz);
} }
static DEFINE_RWLOCK(fib_hash_lock);
static unsigned int fib_hash_genid; static unsigned int fib_hash_genid;
#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head)) #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
...@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor) ...@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
{ {
unsigned long size = divisor * sizeof(struct hlist_head); unsigned long size = divisor * sizeof(struct hlist_head);
if (size <= PAGE_SIZE) { if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL); return kzalloc(size, GFP_KERNEL);
} else {
return (struct hlist_head *) return (struct hlist_head *)
__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size)); __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
}
} }
/* The fib hash lock must be held when this is called. */ /* The fib hash lock must be held when this is called. */
...@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz, ...@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
struct fib_node *f; struct fib_node *f;
hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) { hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
struct hlist_head *new_head; struct hlist_head __rcu *new_head;
hlist_del(&f->fn_hash); hlist_del_rcu(&f->fn_hash);
new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
hlist_add_head(&f->fn_hash, new_head); hlist_add_head_rcu(&f->fn_hash, new_head);
} }
} }
} }
...@@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz) ...@@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
ht = fz_hash_alloc(new_divisor); ht = fz_hash_alloc(new_divisor);
if (ht) { if (ht) {
write_lock_bh(&fib_hash_lock); struct fn_zone nfz;
memcpy(&nfz, fz, sizeof(nfz));
write_seqlock_bh(&fz->fz_lock);
old_ht = fz->fz_hash; old_ht = fz->fz_hash;
fz->fz_hash = ht; nfz.fz_hash = ht;
nfz.fz_hashmask = new_hashmask;
nfz.fz_divisor = new_divisor;
fn_rebuild_zone(&nfz, old_ht, old_divisor);
fib_hash_genid++;
rcu_assign_pointer(fz->fz_hash, ht);
fz->fz_hashmask = new_hashmask; fz->fz_hashmask = new_hashmask;
fz->fz_divisor = new_divisor; fz->fz_divisor = new_divisor;
fn_rebuild_zone(fz, old_ht, old_divisor); write_sequnlock_bh(&fz->fz_lock);
fib_hash_genid++;
write_unlock_bh(&fib_hash_lock);
if (old_ht != fz->fz_embedded_hash) if (old_ht != fz->fz_embedded_hash) {
synchronize_rcu();
fz_hash_free(old_ht, old_divisor); fz_hash_free(old_ht, old_divisor);
} }
}
} }
static inline void fn_free_node(struct fib_node * f) static void fn_free_node_rcu(struct rcu_head *head)
{ {
struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
kmem_cache_free(fn_hash_kmem, f); kmem_cache_free(fn_hash_kmem, f);
} }
static inline void fn_free_node(struct fib_node *f)
{
call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
}
static void fn_free_alias_rcu(struct rcu_head *head)
{
struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
kmem_cache_free(fn_alias_kmem, fa);
}
static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
{ {
fib_release_info(fa->fa_info); fib_release_info(fa->fa_info);
if (fa == &f->fn_embedded_alias) if (fa == &f->fn_embedded_alias)
fa->fa_info = NULL; fa->fa_info = NULL;
else else
kmem_cache_free(fn_alias_kmem, fa); call_rcu(&fa->rcu, fn_free_alias_rcu);
} }
static struct fn_zone * static struct fn_zone *
...@@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z) ...@@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z)
if (!fz) if (!fz)
return NULL; return NULL;
seqlock_init(&fz->fz_lock);
fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
fz->fz_hashmask = fz->fz_divisor - 1; fz->fz_hashmask = fz->fz_divisor - 1;
fz->fz_hash = fz->fz_embedded_hash; fz->fz_hash = fz->fz_embedded_hash;
...@@ -246,17 +269,21 @@ int fib_table_lookup(struct fib_table *tb, ...@@ -246,17 +269,21 @@ int fib_table_lookup(struct fib_table *tb,
struct fn_hash *t = (struct fn_hash *)tb->tb_data; struct fn_hash *t = (struct fn_hash *)tb->tb_data;
rcu_read_lock(); rcu_read_lock();
read_lock(&fib_hash_lock);
for (fz = rcu_dereference(t->fn_zone_list); for (fz = rcu_dereference(t->fn_zone_list);
fz != NULL; fz != NULL;
fz = rcu_dereference(fz->fz_next)) { fz = rcu_dereference(fz->fz_next)) {
struct hlist_head *head; struct hlist_head __rcu *head;
struct hlist_node *node; struct hlist_node *node;
struct fib_node *f; struct fib_node *f;
__be32 k = fz_key(flp->fl4_dst, fz); __be32 k;
unsigned int seq;
do {
seq = read_seqbegin(&fz->fz_lock);
k = fz_key(flp->fl4_dst, fz);
head = &fz->fz_hash[fn_hash(k, fz)]; head = &fz->fz_hash[fn_hash(k, fz)];
hlist_for_each_entry(f, node, head, fn_hash) { hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key != k) if (f->fn_key != k)
continue; continue;
...@@ -266,10 +293,10 @@ int fib_table_lookup(struct fib_table *tb, ...@@ -266,10 +293,10 @@ int fib_table_lookup(struct fib_table *tb,
if (err <= 0) if (err <= 0)
goto out; goto out;
} }
} while (read_seqretry(&fz->fz_lock, seq));
} }
err = 1; err = 1;
out: out:
read_unlock(&fib_hash_lock);
rcu_read_unlock(); rcu_read_unlock();
return err; return err;
} }
...@@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb, ...@@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
last_resort = NULL; last_resort = NULL;
order = -1; order = -1;
read_lock(&fib_hash_lock); rcu_read_lock();
hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
struct fib_alias *fa; struct fib_alias *fa;
list_for_each_entry(fa, &f->fn_alias, fa_list) { list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
struct fib_info *next_fi = fa->fa_info; struct fib_info *next_fi = fa->fa_info;
if (fa->fa_scope != res->scope || if (fa->fa_scope != res->scope ||
...@@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb, ...@@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb,
fib_result_assign(res, last_resort); fib_result_assign(res, last_resort);
tb->tb_default = last_idx; tb->tb_default = last_idx;
out: out:
read_unlock(&fib_hash_lock); rcu_read_unlock();
} }
/* Insert node F to FZ. */ /* Insert node F to FZ. */
...@@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) ...@@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
{ {
struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
hlist_add_head(&f->fn_hash, head); hlist_add_head_rcu(&f->fn_hash, head);
} }
/* Return the node in FZ matching KEY. */ /* Return the node in FZ matching KEY. */
...@@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) ...@@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
struct hlist_node *node; struct hlist_node *node;
struct fib_node *f; struct fib_node *f;
hlist_for_each_entry(f, node, head, fn_hash) { hlist_for_each_entry_rcu(f, node, head, fn_hash) {
if (f->fn_key == key) if (f->fn_key == key)
return f; return f;
} }
...@@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) ...@@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
return NULL; return NULL;
} }
static struct fib_alias *fib_fast_alloc(struct fib_node *f)
{
struct fib_alias *fa = &f->fn_embedded_alias;
if (fa->fa_info != NULL)
fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
return fa;
}
/* Caller must hold RTNL. */ /* Caller must hold RTNL. */
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{ {
...@@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
} }
if (cfg->fc_nlflags & NLM_F_REPLACE) { if (cfg->fc_nlflags & NLM_F_REPLACE) {
struct fib_info *fi_drop;
u8 state; u8 state;
fa = fa_first; fa = fa_first;
...@@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
err = 0; err = 0;
goto out; goto out;
} }
write_lock_bh(&fib_hash_lock); err = -ENOBUFS;
fi_drop = fa->fa_info; new_fa = fib_fast_alloc(f);
fa->fa_info = fi; if (new_fa == NULL)
fa->fa_type = cfg->fc_type; goto out;
fa->fa_scope = cfg->fc_scope;
new_fa->fa_tos = fa->fa_tos;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_scope = cfg->fc_scope;
state = fa->fa_state; state = fa->fa_state;
fa->fa_state &= ~FA_S_ACCESSED; new_fa->fa_state = state & ~FA_S_ACCESSED;
fib_hash_genid++; fib_hash_genid++;
write_unlock_bh(&fib_hash_lock); list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
fib_release_info(fi_drop); fn_free_alias(fa, f);
if (state & FA_S_ACCESSED) if (state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
&cfg->fc_nlinfo, NLM_F_REPLACE); tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
return 0; return 0;
} }
...@@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
f = new_f; f = new_f;
} }
new_fa = &f->fn_embedded_alias; new_fa = fib_fast_alloc(f);
if (new_fa->fa_info != NULL) {
new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
if (new_fa == NULL) if (new_fa == NULL)
goto out; goto out;
}
new_fa->fa_info = fi; new_fa->fa_info = fi;
new_fa->fa_tos = tos; new_fa->fa_tos = tos;
new_fa->fa_type = cfg->fc_type; new_fa->fa_type = cfg->fc_type;
...@@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
* Insert new entry to the list. * Insert new entry to the list.
*/ */
write_lock_bh(&fib_hash_lock);
if (new_f) if (new_f)
fib_insert_node(fz, new_f); fib_insert_node(fz, new_f);
list_add_tail(&new_fa->fa_list, list_add_tail_rcu(&new_fa->fa_list,
(fa ? &fa->fa_list : &f->fn_alias)); (fa ? &fa->fa_list : &f->fn_alias));
fib_hash_genid++; fib_hash_genid++;
write_unlock_bh(&fib_hash_lock);
if (new_f) if (new_f)
fz->fz_nent++; fz->fz_nent++;
...@@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) ...@@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
tb->tb_id, &cfg->fc_nlinfo, 0); tb->tb_id, &cfg->fc_nlinfo, 0);
kill_fn = 0; kill_fn = 0;
write_lock_bh(&fib_hash_lock); list_del_rcu(&fa->fa_list);
list_del(&fa->fa_list);
if (list_empty(&f->fn_alias)) { if (list_empty(&f->fn_alias)) {
hlist_del(&f->fn_hash); hlist_del_rcu(&f->fn_hash);
kill_fn = 1; kill_fn = 1;
} }
fib_hash_genid++; fib_hash_genid++;
write_unlock_bh(&fib_hash_lock);
if (fa->fa_state & FA_S_ACCESSED) if (fa->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
...@@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx) ...@@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
struct fib_info *fi = fa->fa_info; struct fib_info *fi = fa->fa_info;
if (fi && (fi->fib_flags&RTNH_F_DEAD)) { if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
write_lock_bh(&fib_hash_lock); list_del_rcu(&fa->fa_list);
list_del(&fa->fa_list);
if (list_empty(&f->fn_alias)) { if (list_empty(&f->fn_alias)) {
hlist_del(&f->fn_hash); hlist_del_rcu(&f->fn_hash);
kill_f = 1; kill_f = 1;
} }
fib_hash_genid++; fib_hash_genid++;
write_unlock_bh(&fib_hash_lock);
fn_free_alias(fa, f); fn_free_alias(fa, f);
found++; found++;
...@@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, ...@@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
s_i = cb->args[4]; s_i = cb->args[4];
i = 0; i = 0;
hlist_for_each_entry(f, node, head, fn_hash) { hlist_for_each_entry_rcu(f, node, head, fn_hash) {
struct fib_alias *fa; struct fib_alias *fa;
list_for_each_entry(fa, &f->fn_alias, fa_list) { list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
if (i < s_i) if (i < s_i)
goto next; goto next;
...@@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, ...@@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
cb->args[4] = i; cb->args[4] = i;
return -1; return -1;
} }
next: next:
i++; i++;
} }
} }
...@@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, ...@@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
s_m = cb->args[2]; s_m = cb->args[2];
rcu_read_lock(); rcu_read_lock();
read_lock(&fib_hash_lock);
for (fz = rcu_dereference(table->fn_zone_list); for (fz = rcu_dereference(table->fn_zone_list);
fz != NULL; fz != NULL;
fz = rcu_dereference(fz->fz_next), m++) { fz = rcu_dereference(fz->fz_next), m++) {
...@@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, ...@@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
continue; continue;
if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
cb->args[2] = m; cb->args[2] = m;
read_unlock(&fib_hash_lock);
rcu_read_unlock(); rcu_read_unlock();
return -1; return -1;
} }
memset(&cb->args[3], 0, memset(&cb->args[3], 0,
sizeof(cb->args) - 3*sizeof(cb->args[0])); sizeof(cb->args) - 3*sizeof(cb->args[0]));
} }
read_unlock(&fib_hash_lock);
rcu_read_unlock(); rcu_read_unlock();
cb->args[2] = m; cb->args[2] = m;
return skb->len; return skb->len;
...@@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) ...@@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
} }
static void *fib_seq_start(struct seq_file *seq, loff_t *pos) static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(fib_hash_lock)
__acquires(RCU) __acquires(RCU)
{ {
void *v = NULL; void *v = NULL;
rcu_read_lock(); rcu_read_lock();
read_lock(&fib_hash_lock);
if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
return v; return v;
...@@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) ...@@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
} }
static void fib_seq_stop(struct seq_file *seq, void *v) static void fib_seq_stop(struct seq_file *seq, void *v)
__releases(fib_hash_lock)
__releases(RCU) __releases(RCU)
{ {
read_unlock(&fib_hash_lock);
rcu_read_unlock(); rcu_read_unlock();
} }
static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{ {
static const unsigned type2flags[RTN_MAX + 1] = { static const unsigned type2flags[RTN_MAX + 1] = {
[7] = RTF_REJECT, [8] = RTF_REJECT, [7] = RTF_REJECT,
[8] = RTF_REJECT,
}; };
unsigned flags = type2flags[type]; unsigned flags = type2flags[type];
......
...@@ -12,9 +12,7 @@ struct fib_alias { ...@@ -12,9 +12,7 @@ struct fib_alias {
u8 fa_type; u8 fa_type;
u8 fa_scope; u8 fa_scope;
u8 fa_state; u8 fa_state;
#ifdef CONFIG_IP_FIB_TRIE
struct rcu_head rcu; struct rcu_head rcu;
#endif
}; };
#define FA_S_ACCESSED 0x01 #define FA_S_ACCESSED 0x01
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment