Commit f93eb4ba authored by David S. Miller

Merge branch 'fib_trie-next'

Alexander Duyck says:

====================
ipv4/fib_trie: Cleanups to prepare for introduction of key vector

This patch series is meant to mostly just clean up the fib_trie to prepare
it for the introduction of the key_vector.  As such there are a number of
minor clean-ups such as reformatting the tnode to match the format once the
key vector is introduced, some optimizations to drop the need for a leaf
parent pointer, and some changes to remove duplication of effort such as
the 2 look-ups that were essentially being done per node insertion.

v2: Added code to cleanup idx >> n->bits and explain unsigned long logic
    Added code to prevent allocation when tnode size is larger than size_t
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3a65f63f 1de3d87b
...@@ -185,6 +185,7 @@ struct fib_table { ...@@ -185,6 +185,7 @@ struct fib_table {
u32 tb_id; u32 tb_id;
int tb_default; int tb_default;
int tb_num_default; int tb_num_default;
struct rcu_head rcu;
unsigned long tb_data[0]; unsigned long tb_data[0];
}; };
...@@ -206,12 +207,16 @@ void fib_free_table(struct fib_table *tb); ...@@ -206,12 +207,16 @@ void fib_free_table(struct fib_table *tb);
static inline struct fib_table *fib_get_table(struct net *net, u32 id) static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{ {
struct hlist_node *tb_hlist;
struct hlist_head *ptr; struct hlist_head *ptr;
ptr = id == RT_TABLE_LOCAL ? ptr = id == RT_TABLE_LOCAL ?
&net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] :
&net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX];
return hlist_entry(ptr->first, struct fib_table, tb_hlist);
tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr));
return hlist_entry(tb_hlist, struct fib_table, tb_hlist);
} }
static inline struct fib_table *fib_new_table(struct net *net, u32 id) static inline struct fib_table *fib_new_table(struct net *net, u32 id)
...@@ -222,15 +227,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id) ...@@ -222,15 +227,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id)
static inline int fib_lookup(struct net *net, const struct flowi4 *flp, static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res) struct fib_result *res)
{ {
int err = -ENETUNREACH; struct fib_table *tb;
int err;
rcu_read_lock(); rcu_read_lock();
if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res, for (err = 0; !err; err = -ENETUNREACH) {
FIB_LOOKUP_NOREF) || tb = fib_get_table(net, RT_TABLE_LOCAL);
!fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res, if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
FIB_LOOKUP_NOREF)) break;
err = 0; tb = fib_get_table(net, RT_TABLE_MAIN);
if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
break;
}
rcu_read_unlock(); rcu_read_unlock();
...@@ -249,28 +258,33 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res); ...@@ -249,28 +258,33 @@ int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res);
static inline int fib_lookup(struct net *net, struct flowi4 *flp, static inline int fib_lookup(struct net *net, struct flowi4 *flp,
struct fib_result *res) struct fib_result *res)
{ {
if (!net->ipv4.fib_has_custom_rules) { struct fib_table *tb;
int err = -ENETUNREACH; int err;
if (net->ipv4.fib_has_custom_rules)
return __fib_lookup(net, flp, res);
rcu_read_lock(); rcu_read_lock();
res->tclassid = 0; res->tclassid = 0;
if ((net->ipv4.fib_local &&
!fib_table_lookup(net->ipv4.fib_local, flp, res, for (err = 0; !err; err = -ENETUNREACH) {
FIB_LOOKUP_NOREF)) || tb = rcu_dereference_rtnl(net->ipv4.fib_local);
(net->ipv4.fib_main && if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
!fib_table_lookup(net->ipv4.fib_main, flp, res, break;
FIB_LOOKUP_NOREF)) ||
(net->ipv4.fib_default && tb = rcu_dereference_rtnl(net->ipv4.fib_main);
!fib_table_lookup(net->ipv4.fib_default, flp, res, if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
FIB_LOOKUP_NOREF))) break;
err = 0;
tb = rcu_dereference_rtnl(net->ipv4.fib_default);
if (tb && !fib_table_lookup(tb, flp, res, FIB_LOOKUP_NOREF))
break;
}
rcu_read_unlock(); rcu_read_unlock();
return err; return err;
}
return __fib_lookup(net, flp, res);
} }
#endif /* CONFIG_IP_MULTIPLE_TABLES */ #endif /* CONFIG_IP_MULTIPLE_TABLES */
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
#include <linux/uidgid.h> #include <linux/uidgid.h>
#include <net/inet_frag.h> #include <net/inet_frag.h>
#include <linux/rcupdate.h>
struct tcpm_hash_bucket; struct tcpm_hash_bucket;
struct ctl_table_header; struct ctl_table_header;
...@@ -38,9 +39,9 @@ struct netns_ipv4 { ...@@ -38,9 +39,9 @@ struct netns_ipv4 {
#ifdef CONFIG_IP_MULTIPLE_TABLES #ifdef CONFIG_IP_MULTIPLE_TABLES
struct fib_rules_ops *rules_ops; struct fib_rules_ops *rules_ops;
bool fib_has_custom_rules; bool fib_has_custom_rules;
struct fib_table *fib_local; struct fib_table __rcu *fib_local;
struct fib_table *fib_main; struct fib_table __rcu *fib_main;
struct fib_table *fib_default; struct fib_table __rcu *fib_default;
#endif #endif
#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_ROUTE_CLASSID
int fib_num_tclassid_users; int fib_num_tclassid_users;
......
...@@ -89,17 +89,14 @@ struct fib_table *fib_new_table(struct net *net, u32 id) ...@@ -89,17 +89,14 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
switch (id) { switch (id) {
case RT_TABLE_LOCAL: case RT_TABLE_LOCAL:
net->ipv4.fib_local = tb; rcu_assign_pointer(net->ipv4.fib_local, tb);
break; break;
case RT_TABLE_MAIN: case RT_TABLE_MAIN:
net->ipv4.fib_main = tb; rcu_assign_pointer(net->ipv4.fib_main, tb);
break; break;
case RT_TABLE_DEFAULT: case RT_TABLE_DEFAULT:
net->ipv4.fib_default = tb; rcu_assign_pointer(net->ipv4.fib_default, tb);
break; break;
default: default:
break; break;
} }
...@@ -132,13 +129,14 @@ struct fib_table *fib_get_table(struct net *net, u32 id) ...@@ -132,13 +129,14 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
static void fib_flush(struct net *net) static void fib_flush(struct net *net)
{ {
int flushed = 0; int flushed = 0;
struct fib_table *tb;
struct hlist_head *head;
unsigned int h; unsigned int h;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) { for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
head = &net->ipv4.fib_table_hash[h]; struct hlist_head *head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry(tb, head, tb_hlist) struct hlist_node *tmp;
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
flushed += fib_table_flush(tb); flushed += fib_table_flush(tb);
} }
...@@ -665,10 +663,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -665,10 +663,12 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
s_h = cb->args[0]; s_h = cb->args[0];
s_e = cb->args[1]; s_e = cb->args[1];
rcu_read_lock();
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0; e = 0;
head = &net->ipv4.fib_table_hash[h]; head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry(tb, head, tb_hlist) { hlist_for_each_entry_rcu(tb, head, tb_hlist) {
if (e < s_e) if (e < s_e)
goto next; goto next;
if (dumped) if (dumped)
...@@ -682,6 +682,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -682,6 +682,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
} }
} }
out: out:
rcu_read_unlock();
cb->args[1] = e; cb->args[1] = e;
cb->args[0] = h; cb->args[0] = h;
...@@ -1117,14 +1119,34 @@ static void ip_fib_net_exit(struct net *net) ...@@ -1117,14 +1119,34 @@ static void ip_fib_net_exit(struct net *net)
rtnl_lock(); rtnl_lock();
for (i = 0; i < FIB_TABLE_HASHSZ; i++) { for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
struct fib_table *tb; struct hlist_head *head = &net->ipv4.fib_table_hash[i];
struct hlist_head *head;
struct hlist_node *tmp; struct hlist_node *tmp;
struct fib_table *tb;
/* this is done in two passes as flushing the table could
* cause it to be reallocated in order to accommodate new
* tnodes at the root as the table shrinks.
*/
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
fib_table_flush(tb);
head = &net->ipv4.fib_table_hash[i];
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
#ifdef CONFIG_IP_MULTIPLE_TABLES
switch (tb->tb_id) {
case RT_TABLE_LOCAL:
RCU_INIT_POINTER(net->ipv4.fib_local, NULL);
break;
case RT_TABLE_MAIN:
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
break;
case RT_TABLE_DEFAULT:
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
break;
default:
break;
}
#endif
hlist_del(&tb->tb_hlist); hlist_del(&tb->tb_hlist);
fib_table_flush(tb);
fib_free_table(tb); fib_free_table(tb);
} }
} }
......
...@@ -94,24 +94,27 @@ typedef unsigned int t_key; ...@@ -94,24 +94,27 @@ typedef unsigned int t_key;
#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) #define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
struct tnode { struct tnode {
struct rcu_head rcu;
t_key empty_children; /* KEYLENGTH bits needed */
t_key full_children; /* KEYLENGTH bits needed */
struct tnode __rcu *parent;
t_key key; t_key key;
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char slen; unsigned char slen;
struct tnode __rcu *parent;
struct rcu_head rcu;
union { union {
/* The fields in this struct are valid if bits > 0 (TNODE) */ /* This list pointer if valid if (pos | bits) == 0 (LEAF) */
struct {
t_key empty_children; /* KEYLENGTH bits needed */
t_key full_children; /* KEYLENGTH bits needed */
struct tnode __rcu *child[0];
};
/* This list pointer if valid if bits == 0 (LEAF) */
struct hlist_head leaf; struct hlist_head leaf;
/* This array is valid if (pos | bits) > 0 (TNODE) */
struct tnode __rcu *tnode[0];
}; };
}; };
#define TNODE_SIZE(n) offsetof(struct tnode, tnode[n])
#define LEAF_SIZE TNODE_SIZE(1)
#ifdef CONFIG_IP_FIB_TRIE_STATS #ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats { struct trie_use_stats {
unsigned int gets; unsigned int gets;
...@@ -180,14 +183,21 @@ static inline unsigned long tnode_child_length(const struct tnode *tn) ...@@ -180,14 +183,21 @@ static inline unsigned long tnode_child_length(const struct tnode *tn)
static inline struct tnode *tnode_get_child(const struct tnode *tn, static inline struct tnode *tnode_get_child(const struct tnode *tn,
unsigned long i) unsigned long i)
{ {
return rtnl_dereference(tn->child[i]); return rtnl_dereference(tn->tnode[i]);
} }
/* caller must hold RCU read lock or RTNL */ /* caller must hold RCU read lock or RTNL */
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
unsigned long i) unsigned long i)
{ {
return rcu_dereference_rtnl(tn->child[i]); return rcu_dereference_rtnl(tn->tnode[i]);
}
static inline struct fib_table *trie_get_table(struct trie *t)
{
unsigned long *tb_data = (unsigned long *)t;
return container_of(tb_data, struct fib_table, tb_data[0]);
} }
/* To understand this stuff, an understanding of keys and all their bits is /* To understand this stuff, an understanding of keys and all their bits is
...@@ -266,7 +276,9 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) ...@@ -266,7 +276,9 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa)
} }
#define TNODE_KMALLOC_MAX \ #define TNODE_KMALLOC_MAX \
ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct tnode *))
#define TNODE_VMALLOC_MAX \
ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct tnode *))
static void __node_free_rcu(struct rcu_head *head) static void __node_free_rcu(struct rcu_head *head)
{ {
...@@ -282,8 +294,17 @@ static void __node_free_rcu(struct rcu_head *head) ...@@ -282,8 +294,17 @@ static void __node_free_rcu(struct rcu_head *head)
#define node_free(n) call_rcu(&n->rcu, __node_free_rcu) #define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
static struct tnode *tnode_alloc(size_t size) static struct tnode *tnode_alloc(int bits)
{ {
size_t size;
/* verify bits is within bounds */
if (bits > TNODE_VMALLOC_MAX)
return NULL;
/* determine size and verify it is non-zero and didn't overflow */
size = TNODE_SIZE(1ul << bits);
if (size <= PAGE_SIZE) if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL); return kzalloc(size, GFP_KERNEL);
else else
...@@ -300,7 +321,7 @@ static inline void empty_child_dec(struct tnode *n) ...@@ -300,7 +321,7 @@ static inline void empty_child_dec(struct tnode *n)
n->empty_children-- ? : n->full_children--; n->empty_children-- ? : n->full_children--;
} }
static struct tnode *leaf_new(t_key key) static struct tnode *leaf_new(t_key key, struct fib_alias *fa)
{ {
struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) { if (l) {
...@@ -310,20 +331,21 @@ static struct tnode *leaf_new(t_key key) ...@@ -310,20 +331,21 @@ static struct tnode *leaf_new(t_key key)
* as the nodes are searched * as the nodes are searched
*/ */
l->key = key; l->key = key;
l->slen = 0; l->slen = fa->fa_slen;
l->pos = 0; l->pos = 0;
/* set bits to 0 indicating we are not a tnode */ /* set bits to 0 indicating we are not a tnode */
l->bits = 0; l->bits = 0;
/* link leaf to fib alias */
INIT_HLIST_HEAD(&l->leaf); INIT_HLIST_HEAD(&l->leaf);
hlist_add_head(&fa->fa_list, &l->leaf);
} }
return l; return l;
} }
static struct tnode *tnode_new(t_key key, int pos, int bits) static struct tnode *tnode_new(t_key key, int pos, int bits)
{ {
size_t sz = offsetof(struct tnode, child[1ul << bits]); struct tnode *tn = tnode_alloc(bits);
struct tnode *tn = tnode_alloc(sz);
unsigned int shift = pos + bits; unsigned int shift = pos + bits;
/* verify bits and pos their msb bits clear and values are valid */ /* verify bits and pos their msb bits clear and values are valid */
...@@ -341,7 +363,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) ...@@ -341,7 +363,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
tn->empty_children = 1ul << bits; tn->empty_children = 1ul << bits;
} }
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), pr_debug("AT %p s=%zu %zu\n", tn, TNODE_SIZE(0),
sizeof(struct tnode *) << bits); sizeof(struct tnode *) << bits);
return tn; return tn;
} }
...@@ -382,7 +404,7 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) ...@@ -382,7 +404,7 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
if (n && (tn->slen < n->slen)) if (n && (tn->slen < n->slen))
tn->slen = n->slen; tn->slen = n->slen;
rcu_assign_pointer(tn->child[i], n); rcu_assign_pointer(tn->tnode[i], n);
} }
static void update_children(struct tnode *tn) static void update_children(struct tnode *tn)
...@@ -433,7 +455,7 @@ static void tnode_free(struct tnode *tn) ...@@ -433,7 +455,7 @@ static void tnode_free(struct tnode *tn)
while (head) { while (head) {
head = head->next; head = head->next;
tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); tnode_free_size += TNODE_SIZE(1ul << tn->bits);
node_free(tn); node_free(tn);
tn = container_of(head, struct tnode, rcu); tn = container_of(head, struct tnode, rcu);
...@@ -786,7 +808,7 @@ static void resize(struct trie *t, struct tnode *tn) ...@@ -786,7 +808,7 @@ static void resize(struct trie *t, struct tnode *tn)
* doing it ourselves. This way we can let RCU fully do its * doing it ourselves. This way we can let RCU fully do its
* thing without us interfering * thing without us interfering
*/ */
cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; cptr = tp ? &tp->tnode[get_index(tn->key, tp)] : &t->trie;
BUG_ON(tn != rtnl_dereference(*cptr)); BUG_ON(tn != rtnl_dereference(*cptr));
/* Double as long as the resulting node has a number of /* Double as long as the resulting node has a number of
...@@ -842,10 +864,8 @@ static void resize(struct trie *t, struct tnode *tn) ...@@ -842,10 +864,8 @@ static void resize(struct trie *t, struct tnode *tn)
} }
} }
static void leaf_pull_suffix(struct tnode *l) static void leaf_pull_suffix(struct tnode *tp, struct tnode *l)
{ {
struct tnode *tp = node_parent(l);
while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
if (update_suffix(tp) > l->slen) if (update_suffix(tp) > l->slen)
break; break;
...@@ -853,10 +873,8 @@ static void leaf_pull_suffix(struct tnode *l) ...@@ -853,10 +873,8 @@ static void leaf_pull_suffix(struct tnode *l)
} }
} }
static void leaf_push_suffix(struct tnode *l) static void leaf_push_suffix(struct tnode *tn, struct tnode *l)
{ {
struct tnode *tn = node_parent(l);
/* if this is a new leaf then tn will be NULL and we can sort /* if this is a new leaf then tn will be NULL and we can sort
* out parent suffix lengths as a part of trie_rebalance * out parent suffix lengths as a part of trie_rebalance
*/ */
...@@ -866,55 +884,10 @@ static void leaf_push_suffix(struct tnode *l) ...@@ -866,55 +884,10 @@ static void leaf_push_suffix(struct tnode *l)
} }
} }
static void fib_remove_alias(struct tnode *l, struct fib_alias *old)
{
/* record the location of the previous list_info entry */
struct hlist_node **pprev = old->fa_list.pprev;
struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
/* remove the fib_alias from the list */
hlist_del_rcu(&old->fa_list);
/* only access fa if it is pointing at the last valid hlist_node */
if (hlist_empty(&l->leaf) || (*pprev))
return;
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
leaf_pull_suffix(l);
}
static void fib_insert_alias(struct tnode *l, struct fib_alias *fa,
struct fib_alias *new)
{
if (fa) {
hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else {
struct fib_alias *last;
hlist_for_each_entry(last, &l->leaf, fa_list) {
if (new->fa_slen < last->fa_slen)
break;
fa = last;
}
if (fa)
hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
else
hlist_add_head_rcu(&new->fa_list, &l->leaf);
}
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
leaf_push_suffix(l);
}
}
/* rcu_read_lock needs to be hold by caller from readside */ /* rcu_read_lock needs to be hold by caller from readside */
static struct tnode *fib_find_node(struct trie *t, u32 key) static struct tnode *fib_find_node(struct trie *t, struct tnode **tn, u32 key)
{ {
struct tnode *n = rcu_dereference_rtnl(t->trie); struct tnode *pn = NULL, *n = rcu_dereference_rtnl(t->trie);
while (n) { while (n) {
unsigned long index = get_index(key, n); unsigned long index = get_index(key, n);
...@@ -924,21 +897,30 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) ...@@ -924,21 +897,30 @@ static struct tnode *fib_find_node(struct trie *t, u32 key)
* prefix plus zeros for the bits in the cindex. The index * prefix plus zeros for the bits in the cindex. The index
* is the difference between the key and this value. From * is the difference between the key and this value. From
* this we can actually derive several pieces of data. * this we can actually derive several pieces of data.
* if (index & (~0ul << bits)) * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed * we have a mismatch in skip bits and failed
* else * else
* we know the value is cindex * we know the value is cindex
*
* This check is safe even if bits == KEYLENGTH due to the
* fact that we can only allocate a node with 32 bits if a
* long is greater than 32 bits.
*/ */
if (index & (~0ul << n->bits)) if (index >= (1ul << n->bits)) {
return NULL; n = NULL;
break;
}
/* we have found a leaf. Prefixes have already been compared */ /* we have found a leaf. Prefixes have already been compared */
if (IS_LEAF(n)) if (IS_LEAF(n))
break; break;
pn = n;
n = tnode_get_child_rcu(n, index); n = tnode_get_child_rcu(n, index);
} }
*tn = pn;
return n; return n;
} }
...@@ -971,61 +953,28 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) ...@@ -971,61 +953,28 @@ static void trie_rebalance(struct trie *t, struct tnode *tn)
{ {
struct tnode *tp; struct tnode *tp;
while ((tp = node_parent(tn)) != NULL) { while (tn) {
tp = node_parent(tn);
resize(t, tn); resize(t, tn);
tn = tp; tn = tp;
} }
/* Handle last (top) tnode */
if (IS_TNODE(tn))
resize(t, tn);
} }
/* only used from updater-side */ /* only used from updater-side */
static int fib_insert_node(struct trie *t, struct tnode *tp,
static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) struct fib_alias *new, t_key key)
{ {
struct tnode *l, *n, *tp = NULL; struct tnode *n, *l;
n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
* and we should just put our new leaf in that.
*
* If we hit a node with a key that does't match then we should stop
* and create a new tnode to replace that node and insert ourselves
* and the other node into the new tnode.
*/
while (n) {
unsigned long index = get_index(key, n);
/* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From
* this we can actually derive several pieces of data.
* if !(index >> bits)
* we know the value is child index
* else
* we have a mismatch in skip bits and failed
*/
if (index >> n->bits)
break;
/* we have found a leaf. Prefixes have already been compared */
if (IS_LEAF(n)) {
/* Case 1: n is a leaf, and prefixes match*/
return n;
}
tp = n; l = leaf_new(key, new);
n = tnode_get_child_rcu(n, index);
}
l = leaf_new(key);
if (!l) if (!l)
return NULL; return -ENOMEM;
/* retrieve child from parent node */
if (tp)
n = tnode_get_child(tp, get_index(key, tp));
else
n = rcu_dereference_rtnl(t->trie);
/* Case 2: n is a LEAF or a TNODE and the key doesn't match. /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
* *
...@@ -1039,7 +988,7 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) ...@@ -1039,7 +988,7 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
tn = tnode_new(key, __fls(key ^ n->key), 1); tn = tnode_new(key, __fls(key ^ n->key), 1);
if (!tn) { if (!tn) {
node_free(l); node_free(l);
return NULL; return -ENOMEM;
} }
/* initialize routes out of node */ /* initialize routes out of node */
...@@ -1055,31 +1004,58 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen) ...@@ -1055,31 +1004,58 @@ static struct tnode *fib_insert_node(struct trie *t, u32 key, int plen)
} }
/* Case 3: n is NULL, and will just insert a new leaf */ /* Case 3: n is NULL, and will just insert a new leaf */
if (tp) {
NODE_INIT_PARENT(l, tp); NODE_INIT_PARENT(l, tp);
put_child(tp, get_index(key, tp), l); put_child_root(tp, t, key, l);
trie_rebalance(t, tp); trie_rebalance(t, tp);
return 0;
}
static int fib_insert_alias(struct trie *t, struct tnode *tp,
struct tnode *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
{
if (!l)
return fib_insert_node(t, tp, new, key);
if (fa) {
hlist_add_before_rcu(&new->fa_list, &fa->fa_list);
} else { } else {
rcu_assign_pointer(t->trie, l); struct fib_alias *last;
hlist_for_each_entry(last, &l->leaf, fa_list) {
if (new->fa_slen < last->fa_slen)
break;
fa = last;
} }
return l; if (fa)
hlist_add_behind_rcu(&new->fa_list, &fa->fa_list);
else
hlist_add_head_rcu(&new->fa_list, &l->leaf);
}
/* if we added to the tail node then we need to update slen */
if (l->slen < new->fa_slen) {
l->slen = new->fa_slen;
leaf_push_suffix(tp, l);
}
return 0;
} }
/* /* Caller must hold RTNL. */
* Caller must hold RTNL.
*/
int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{ {
struct trie *t = (struct trie *) tb->tb_data; struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa; struct fib_alias *fa, *new_fa;
struct tnode *l, *tp;
struct fib_info *fi; struct fib_info *fi;
u8 plen = cfg->fc_dst_len; u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen; u8 slen = KEYLENGTH - plen;
u8 tos = cfg->fc_tos; u8 tos = cfg->fc_tos;
u32 key, mask; u32 key;
int err; int err;
struct tnode *l;
if (plen > KEYLENGTH) if (plen > KEYLENGTH)
return -EINVAL; return -EINVAL;
...@@ -1088,9 +1064,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1088,9 +1064,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
mask = ntohl(inet_make_mask(plen)); if ((plen < KEYLENGTH) && (key << plen))
if (key & ~mask)
return -EINVAL; return -EINVAL;
fi = fib_create_info(cfg); fi = fib_create_info(cfg);
...@@ -1099,7 +1073,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1099,7 +1073,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
goto err; goto err;
} }
l = fib_find_node(t, key); l = fib_find_node(t, &tp, key);
fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority) : NULL; fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority) : NULL;
/* Now fa, if non-NULL, points to the first fib alias /* Now fa, if non-NULL, points to the first fib alias
...@@ -1198,19 +1172,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1198,19 +1172,13 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_slen = slen; new_fa->fa_slen = slen;
/* Insert new entry to the list. */ /* Insert new entry to the list. */
if (!l) { err = fib_insert_alias(t, tp, l, new_fa, fa, key);
l = fib_insert_node(t, key, plen); if (err)
if (unlikely(!l)) {
err = -ENOMEM;
goto out_free_new_fa; goto out_free_new_fa;
}
}
if (!plen) if (!plen)
tb->tb_num_default++; tb->tb_num_default++;
fib_insert_alias(l, fa, new_fa);
rt_cache_flush(cfg->fc_nlinfo.nl_net); rt_cache_flush(cfg->fc_nlinfo.nl_net);
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0); &cfg->fc_nlinfo, 0);
...@@ -1243,6 +1211,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1243,6 +1211,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
const t_key key = ntohl(flp->daddr); const t_key key = ntohl(flp->daddr);
struct tnode *n, *pn; struct tnode *n, *pn;
struct fib_alias *fa; struct fib_alias *fa;
unsigned long index;
t_key cindex; t_key cindex;
n = rcu_dereference(t->trie); n = rcu_dereference(t->trie);
...@@ -1258,19 +1227,23 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1258,19 +1227,23 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 1: Travel to the longest prefix match in the trie */ /* Step 1: Travel to the longest prefix match in the trie */
for (;;) { for (;;) {
unsigned long index = get_index(key, n); index = get_index(key, n);
/* This bit of code is a bit tricky but it combines multiple /* This bit of code is a bit tricky but it combines multiple
* checks into a single check. The prefix consists of the * checks into a single check. The prefix consists of the
* prefix plus zeros for the "bits" in the prefix. The index * prefix plus zeros for the "bits" in the prefix. The index
* is the difference between the key and this value. From * is the difference between the key and this value. From
* this we can actually derive several pieces of data. * this we can actually derive several pieces of data.
* if (index & (~0ul << bits)) * if (index >= (1ul << bits))
* we have a mismatch in skip bits and failed * we have a mismatch in skip bits and failed
* else * else
* we know the value is cindex * we know the value is cindex
*
* This check is safe even if bits == KEYLENGTH due to the
* fact that we can only allocate a node with 32 bits if a
* long is greater than 32 bits.
*/ */
if (index & (~0ul << n->bits)) if (index >= (1ul << n->bits))
break; break;
/* we have found a leaf. Prefixes have already been compared */ /* we have found a leaf. Prefixes have already been compared */
...@@ -1293,7 +1266,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1293,7 +1266,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
/* Step 2: Sort out leaves and begin backtracing for longest prefix */ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
for (;;) { for (;;) {
/* record the pointer where our next node pointer is stored */ /* record the pointer where our next node pointer is stored */
struct tnode __rcu **cptr = n->child; struct tnode __rcu **cptr = n->tnode;
/* This test verifies that none of the bits that differ /* This test verifies that none of the bits that differ
* between the key and the prefix exist in the region of * between the key and the prefix exist in the region of
...@@ -1339,17 +1312,20 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1339,17 +1312,20 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
cindex &= cindex - 1; cindex &= cindex - 1;
/* grab pointer for next child node */ /* grab pointer for next child node */
cptr = &pn->child[cindex]; cptr = &pn->tnode[cindex];
} }
} }
found: found:
/* this line carries forward the xor from earlier in the function */
index = key ^ n->key;
/* Step 3: Process the leaf, if that fails fall back to backtracing */ /* Step 3: Process the leaf, if that fails fall back to backtracing */
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info; struct fib_info *fi = fa->fa_info;
int nhsel, err; int nhsel, err;
if (((key ^ n->key) >= (1ul << fa->fa_slen)) && if ((index >= (1ul << fa->fa_slen)) &&
((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH))) ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
continue; continue;
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
...@@ -1399,53 +1375,59 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, ...@@ -1399,53 +1375,59 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
} }
EXPORT_SYMBOL_GPL(fib_table_lookup); EXPORT_SYMBOL_GPL(fib_table_lookup);
/* static void fib_remove_alias(struct trie *t, struct tnode *tp,
* Remove the leaf and return parent. struct tnode *l, struct fib_alias *old)
*/
static void trie_leaf_remove(struct trie *t, struct tnode *l)
{ {
struct tnode *tp = node_parent(l); /* record the location of the previous list_info entry */
struct hlist_node **pprev = old->fa_list.pprev;
struct fib_alias *fa = hlist_entry(pprev, typeof(*fa), fa_list.next);
pr_debug("entering trie_leaf_remove(%p)\n", l); /* remove the fib_alias from the list */
hlist_del_rcu(&old->fa_list);
if (tp) { /* if we emptied the list this leaf will be freed and we can sort
put_child(tp, get_index(l->key, tp), NULL); * out parent suffix lengths as a part of trie_rebalance
*/
if (hlist_empty(&l->leaf)) {
put_child_root(tp, t, l->key, NULL);
node_free(l);
trie_rebalance(t, tp); trie_rebalance(t, tp);
} else { return;
RCU_INIT_POINTER(t->trie, NULL);
} }
node_free(l); /* only access fa if it is pointing at the last valid hlist_node */
if (*pprev)
return;
/* update the trie with the latest suffix length */
l->slen = fa->fa_slen;
leaf_pull_suffix(tp, l);
} }
/* /* Caller must hold RTNL. */
* Caller must hold RTNL.
*/
int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{ {
struct trie *t = (struct trie *) tb->tb_data; struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *fa_to_delete; struct fib_alias *fa, *fa_to_delete;
struct tnode *l, *tp;
u8 plen = cfg->fc_dst_len; u8 plen = cfg->fc_dst_len;
u8 tos = cfg->fc_tos;
u8 slen = KEYLENGTH - plen; u8 slen = KEYLENGTH - plen;
struct tnode *l; u8 tos = cfg->fc_tos;
u32 key, mask; u32 key;
if (plen > KEYLENGTH) if (plen > KEYLENGTH)
return -EINVAL; return -EINVAL;
key = ntohl(cfg->fc_dst); key = ntohl(cfg->fc_dst);
mask = ntohl(inet_make_mask(plen));
if (key & ~mask) if ((plen < KEYLENGTH) && (key << plen))
return -EINVAL; return -EINVAL;
l = fib_find_node(t, key); l = fib_find_node(t, &tp, key);
if (!l) if (!l)
return -ESRCH; return -ESRCH;
fa = fib_find_alias(&l->leaf, slen, tos, 0); fa = fib_find_alias(&l->leaf, slen, tos, 0);
if (!fa) if (!fa)
return -ESRCH; return -ESRCH;
...@@ -1474,150 +1456,171 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) ...@@ -1474,150 +1456,171 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete) if (!fa_to_delete)
return -ESRCH; return -ESRCH;
fa = fa_to_delete; rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
&cfg->fc_nlinfo, 0); &cfg->fc_nlinfo, 0);
fib_remove_alias(l, fa);
if (!plen) if (!plen)
tb->tb_num_default--; tb->tb_num_default--;
if (hlist_empty(&l->leaf)) fib_remove_alias(t, tp, l, fa_to_delete);
trie_leaf_remove(t, l);
if (fa->fa_state & FA_S_ACCESSED) if (fa_to_delete->fa_state & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net); rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa->fa_info); fib_release_info(fa_to_delete->fa_info);
alias_free_mem_rcu(fa); alias_free_mem_rcu(fa_to_delete);
return 0; return 0;
} }
static int trie_flush_leaf(struct tnode *l) /* Scan for the next leaf starting at the provided key value */
static struct tnode *leaf_walk_rcu(struct tnode **tn, t_key key)
{ {
struct hlist_node *tmp; struct tnode *pn, *n = *tn;
unsigned char slen = 0; unsigned long cindex;
struct fib_alias *fa;
int found = 0;
hlist_for_each_entry_safe(fa, tmp, &l->leaf, fa_list) { /* record parent node for backtracing */
struct fib_info *fi = fa->fa_info; pn = n;
cindex = n ? get_index(key, n) : 0;
if (fi && (fi->fib_flags & RTNH_F_DEAD)) { /* this loop is meant to try and find the key in the trie */
hlist_del_rcu(&fa->fa_list); while (n) {
fib_release_info(fa->fa_info); unsigned long idx = get_index(key, n);
alias_free_mem_rcu(fa);
found++;
continue; /* guarantee forward progress on the keys */
} if (IS_LEAF(n) && (n->key >= key))
goto found;
if (idx >= (1ul << n->bits))
break;
slen = fa->fa_slen; /* record parent and next child index */
pn = n;
cindex = idx;
/* descend into the next child */
n = tnode_get_child_rcu(pn, cindex++);
} }
l->slen = slen; /* this loop will search for the next leaf with a greater key */
while (pn) {
/* if we exhausted the parent node we will need to climb */
if (cindex >= (1ul << pn->bits)) {
t_key pkey = pn->key;
return found; pn = node_parent_rcu(pn);
} if (!pn)
break;
/* Scan for the next right leaf starting at node p->child[idx] cindex = get_index(pkey, pn) + 1;
* Since we have back pointer, no recursion necessary. continue;
*/ }
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
while (idx < tnode_child_length(p)) { /* grab the next available node */
c = tnode_get_child_rcu(p, idx++); n = tnode_get_child_rcu(pn, cindex++);
if (!c) if (!n)
continue; continue;
if (IS_LEAF(c)) /* no need to compare keys since we bumped the index */
return c; if (IS_LEAF(n))
goto found;
/* Restart the scan in the new node */ /* Restart the scan in the new node */
p = c; pn = n;
idx = 0; cindex = 0;
} }
/* Node empty, walk back up to parent */ *tn = pn;
c = p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */ return NULL; /* Root of trie */
found:
/* if we are at the limit for keys just return NULL for the tnode */
*tn = (n->key == KEY_MAX) ? NULL : pn;
return n;
} }
static struct tnode *trie_firstleaf(struct trie *t) /* Caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb)
{ {
struct tnode *n = rcu_dereference_rtnl(t->trie); struct trie *t = (struct trie *)tb->tb_data;
struct hlist_node *tmp;
struct fib_alias *fa;
struct tnode *n, *pn;
unsigned long cindex;
unsigned char slen;
int found = 0;
n = rcu_dereference(t->trie);
if (!n) if (!n)
return NULL; goto flush_complete;
if (IS_LEAF(n)) /* trie is just a leaf */
return n;
return leaf_walk_rcu(n, NULL); pn = NULL;
} cindex = 0;
static struct tnode *trie_nextleaf(struct tnode *l) while (IS_TNODE(n)) {
{ /* record pn and cindex for leaf walking */
struct tnode *p = node_parent_rcu(l); pn = n;
cindex = 1ul << n->bits;
backtrace:
/* walk trie in reverse order */
do {
while (!(cindex--)) {
t_key pkey = pn->key;
if (!p) n = pn;
return NULL; /* trie with just one leaf */ pn = node_parent(n);
return leaf_walk_rcu(p, l); /* resize completed node */
} resize(t, n);
static struct tnode *trie_leafindex(struct trie *t, int index) /* if we got the root we are done */
{ if (!pn)
struct tnode *l = trie_firstleaf(t); goto flush_complete;
while (l && index-- > 0) cindex = get_index(pkey, pn);
l = trie_nextleaf(l); }
return l; /* grab the next available node */
} n = tnode_get_child(pn, cindex);
} while (!n);
}
/* track slen in case any prefixes survive */
slen = 0;
/* hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
* Caller must hold RTNL. struct fib_info *fi = fa->fa_info;
*/
int fib_table_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
struct tnode *l, *ll = NULL;
int found = 0;
for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
found += trie_flush_leaf(l); hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
found++;
if (ll) { continue;
if (hlist_empty(&ll->leaf))
trie_leaf_remove(t, ll);
else
leaf_pull_suffix(ll);
} }
ll = l; slen = fa->fa_slen;
} }
if (ll) { /* update leaf slen */
if (hlist_empty(&ll->leaf)) n->slen = slen;
trie_leaf_remove(t, ll);
else if (hlist_empty(&n->leaf)) {
leaf_pull_suffix(ll); put_child_root(pn, t, n->key, NULL);
node_free(n);
} else {
leaf_pull_suffix(pn, n);
} }
/* if trie is leaf only loop is completed */
if (pn)
goto backtrace;
flush_complete:
pr_debug("trie_flush found=%d\n", found); pr_debug("trie_flush found=%d\n", found);
return found; return found;
} }
void fib_free_table(struct fib_table *tb) static void __trie_free_rcu(struct rcu_head *head)
{ {
struct fib_table *tb = container_of(head, struct fib_table, rcu);
#ifdef CONFIG_IP_FIB_TRIE_STATS #ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie *t = (struct trie *)tb->tb_data; struct trie *t = (struct trie *)tb->tb_data;
...@@ -1626,6 +1629,11 @@ void fib_free_table(struct fib_table *tb) ...@@ -1626,6 +1629,11 @@ void fib_free_table(struct fib_table *tb)
kfree(tb); kfree(tb);
} }
/*
 * Release a FIB table.
 *
 * The table is not freed synchronously: call_rcu() queues
 * __trie_free_rcu() on the rcu_head embedded in the table, and that
 * callback recovers the fib_table with container_of() and kfree()s it.
 * NOTE(review): presumably this is so lockless readers that still hold a
 * pointer to the table can finish first - confirm against the callers.
 */
void fib_free_table(struct fib_table *tb)
{
/* hand the table to RCU; the actual kfree happens in the callback */
call_rcu(&tb->rcu, __trie_free_rcu);
}
static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *skb, struct netlink_callback *cb)
{ {
...@@ -1662,44 +1670,40 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, ...@@ -1662,44 +1670,40 @@ static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
return skb->len; return skb->len;
} }
/* rcu_read_lock needs to be held by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb) struct netlink_callback *cb)
{ {
struct tnode *l; struct trie *t = (struct trie *)tb->tb_data;
struct trie *t = (struct trie *) tb->tb_data; struct tnode *l, *tp;
t_key key = cb->args[2];
int count = cb->args[3];
rcu_read_lock();
/* Dump starting at last key. /* Dump starting at last key.
* Note: 0.0.0.0/0 (ie default) is first key. * Note: 0.0.0.0/0 (ie default) is first key.
*/ */
if (count == 0) int count = cb->args[2];
l = trie_firstleaf(t); t_key key = cb->args[3];
else {
/* Normally, continue from last key, but if that is missing
* fallback to using slow rescan
*/
l = fib_find_node(t, key);
if (!l)
l = trie_leafindex(t, count);
}
while (l) { tp = rcu_dereference_rtnl(t->trie);
cb->args[2] = l->key;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) { if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
cb->args[3] = count; cb->args[3] = key;
rcu_read_unlock(); cb->args[2] = count;
return -1; return -1;
} }
++count; ++count;
l = trie_nextleaf(l); key = l->key + 1;
memset(&cb->args[4], 0, memset(&cb->args[4], 0,
sizeof(cb->args) - 4*sizeof(cb->args[0])); sizeof(cb->args) - 4*sizeof(cb->args[0]));
/* stop loop if key wrapped back to 0 */
if (key < l->key)
break;
} }
cb->args[3] = count;
rcu_read_unlock(); cb->args[3] = key;
cb->args[2] = count;
return skb->len; return skb->len;
} }
...@@ -1711,7 +1715,7 @@ void __init fib_trie_init(void) ...@@ -1711,7 +1715,7 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL); 0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie", trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
sizeof(struct tnode), LEAF_SIZE,
0, SLAB_PANIC, NULL); 0, SLAB_PANIC, NULL);
} }
...@@ -1869,13 +1873,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) ...@@ -1869,13 +1873,13 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves); seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
bytes = sizeof(struct tnode) * stat->leaves; bytes = LEAF_SIZE * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
bytes += sizeof(struct fib_alias) * stat->prefixes; bytes += sizeof(struct fib_alias) * stat->prefixes;
seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes); seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
bytes += sizeof(struct tnode) * stat->tnodes; bytes += TNODE_SIZE(0) * stat->tnodes;
max = MAX_STAT_DEPTH; max = MAX_STAT_DEPTH;
while (max > 0 && stat->nodesizes[max-1] == 0) while (max > 0 && stat->nodesizes[max-1] == 0)
...@@ -1944,7 +1948,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) ...@@ -1944,7 +1948,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, seq_printf(seq,
"Basic info: size of leaf:" "Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n", " %Zd bytes, size of tnode: %Zd bytes.\n",
sizeof(struct tnode), sizeof(struct tnode)); LEAF_SIZE, TNODE_SIZE(0));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) { for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct hlist_head *head = &net->ipv4.fib_table_hash[h];
...@@ -2171,31 +2175,46 @@ static const struct file_operations fib_trie_fops = { ...@@ -2171,31 +2175,46 @@ static const struct file_operations fib_trie_fops = {
struct fib_route_iter { struct fib_route_iter {
struct seq_net_private p; struct seq_net_private p;
struct trie *main_trie; struct fib_table *main_tb;
struct tnode *tnode;
loff_t pos; loff_t pos;
t_key key; t_key key;
}; };
static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{ {
struct tnode *l = NULL; struct fib_table *tb = iter->main_tb;
struct trie *t = iter->main_trie; struct tnode *l, **tp = &iter->tnode;
struct trie *t;
t_key key;
/* use cache location of last found key */ /* use cache location of next-to-find key */
if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key))) if (iter->pos > 0 && pos >= iter->pos) {
pos -= iter->pos; pos -= iter->pos;
else { key = iter->key;
} else {
t = (struct trie *)tb->tb_data;
iter->tnode = rcu_dereference_rtnl(t->trie);
iter->pos = 0; iter->pos = 0;
l = trie_firstleaf(t); key = 0;
} }
while (l && pos-- > 0) { while ((l = leaf_walk_rcu(tp, key)) != NULL) {
key = l->key + 1;
iter->pos++; iter->pos++;
l = trie_nextleaf(l);
if (pos-- <= 0)
break;
l = NULL;
/* handle unlikely case of a key wrap */
if (!key)
break;
} }
if (l) if (l)
iter->key = pos; /* remember it */ iter->key = key; /* remember it */
else else
iter->pos = 0; /* forget it */ iter->pos = 0; /* forget it */
...@@ -2207,37 +2226,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) ...@@ -2207,37 +2226,46 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
{ {
struct fib_route_iter *iter = seq->private; struct fib_route_iter *iter = seq->private;
struct fib_table *tb; struct fib_table *tb;
struct trie *t;
rcu_read_lock(); rcu_read_lock();
tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN); tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
if (!tb) if (!tb)
return NULL; return NULL;
iter->main_trie = (struct trie *) tb->tb_data; iter->main_tb = tb;
if (*pos == 0)
if (*pos != 0)
return fib_route_get_idx(iter, *pos);
t = (struct trie *)tb->tb_data;
iter->tnode = rcu_dereference_rtnl(t->trie);
iter->pos = 0;
iter->key = 0;
return SEQ_START_TOKEN; return SEQ_START_TOKEN;
else
return fib_route_get_idx(iter, *pos - 1);
} }
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{ {
struct fib_route_iter *iter = seq->private; struct fib_route_iter *iter = seq->private;
struct tnode *l = v; struct tnode *l = NULL;
t_key key = iter->key;
++*pos; ++*pos;
if (v == SEQ_START_TOKEN) {
iter->pos = 0; /* only allow key of 0 for start of sequence */
l = trie_firstleaf(iter->main_trie); if ((v == SEQ_START_TOKEN) || key)
} else { l = leaf_walk_rcu(&iter->tnode, key);
if (l) {
iter->key = l->key + 1;
iter->pos++; iter->pos++;
l = trie_nextleaf(l); } else {
iter->pos = 0;
} }
if (l)
iter->key = l->key;
else
iter->pos = 0;
return l; return l;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment