Commit 6aaf47fa, authored by Eric Dumazet, committed by David S. Miller

[PATCH] INET : IPV4 UDP lookups converted to a 2 pass algo

Some people want to have many UDP sockets bound to a single port but
to many different addresses. We currently hash all those sockets into a
single chain. Processing of incoming packets is very expensive,
because the whole chain must be examined to find the best match.

In this patch I chose to hash UDP sockets with a hash function that
takes into account both their port number and their address. This has a
drawback: we now need two lookups, one with the given address and one
with the wildcard (null) address.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 65def812
...@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); ...@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock);
static int udp_port_rover; static int udp_port_rover;
static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) /*
* Note about this hash function:
* In the typical case daddr is probably 0, so only dport varies the hash.
*/
/*
* Combine a port number and an IPv4 address into one hash value.
* With addr == 0 the result is simply the port. Callers in this file
* store the full value in sk->sk_hash and mask it with
* (UDP_HTABLE_SIZE - 1) to select a bucket in the UDP hash table.
*/
static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr)
{
/* XOR the upper 16 bits of the address into its lower 16 bits. */
addr ^= addr >> 16;
/* XOR each byte with the byte above it, so the low byte mixes in the rest. */
addr ^= addr >> 8;
/* Mix the folded address with the port; no masking is done here. */
return port ^ addr;
}
static inline int __udp_lib_port_inuse(unsigned int hash, int port,
__be32 daddr, struct hlist_head udptable[])
{ {
struct sock *sk; struct sock *sk;
struct hlist_node *node; struct hlist_node *node;
struct inet_sock *inet;
sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
if (sk->sk_hash == num) if (sk->sk_hash != hash)
continue;
inet = inet_sk(sk);
if (inet->num != port)
continue;
if (inet->rcv_saddr == daddr)
return 1; return 1;
}
return 0; return 0;
} }
...@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ...@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
struct hlist_node *node; struct hlist_node *node;
struct hlist_head *head; struct hlist_head *head;
struct sock *sk2; struct sock *sk2;
unsigned int hash;
int error = 1; int error = 1;
write_lock_bh(&udp_hash_lock); write_lock_bh(&udp_hash_lock);
...@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ...@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
int size; int size;
head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; hash = hash_port_and_addr(result,
inet_sk(sk)->rcv_saddr);
head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
if (hlist_empty(head)) { if (hlist_empty(head)) {
if (result > sysctl_local_port_range[1]) if (result > sysctl_local_port_range[1])
result = sysctl_local_port_range[0] + result = sysctl_local_port_range[0] +
...@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ...@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
result = sysctl_local_port_range[0] result = sysctl_local_port_range[0]
+ ((result - sysctl_local_port_range[0]) & + ((result - sysctl_local_port_range[0]) &
(UDP_HTABLE_SIZE - 1)); (UDP_HTABLE_SIZE - 1));
if (! __udp_lib_lport_inuse(result, udptable)) hash = hash_port_and_addr(result,
inet_sk(sk)->rcv_saddr);
if (! __udp_lib_port_inuse(hash, result,
inet_sk(sk)->rcv_saddr, udptable))
break; break;
} }
if (i >= (1 << 16) / UDP_HTABLE_SIZE) if (i >= (1 << 16) / UDP_HTABLE_SIZE)
...@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ...@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
gotit: gotit:
*port_rover = snum = result; *port_rover = snum = result;
} else { } else {
head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr);
head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
sk_for_each(sk2, node, head) sk_for_each(sk2, node, head)
if (sk2->sk_hash == snum && if (sk2->sk_hash == hash &&
sk2 != sk && sk2 != sk &&
inet_sk(sk2)->num == snum &&
(!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
...@@ -201,9 +228,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, ...@@ -201,9 +228,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
goto fail; goto fail;
} }
inet_sk(sk)->num = snum; inet_sk(sk)->num = snum;
sk->sk_hash = snum; sk->sk_hash = hash;
if (sk_unhashed(sk)) { if (sk_unhashed(sk)) {
head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
sk_add_node(sk, head); sk_add_node(sk, head);
sock_prot_inc_use(sk->sk_prot); sock_prot_inc_use(sk->sk_prot);
} }
...@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, ...@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
{ {
struct sock *sk, *result = NULL; struct sock *sk, *result = NULL;
struct hlist_node *node; struct hlist_node *node;
unsigned short hnum = ntohs(dport); unsigned int hash, hashwild;
int badness = -1; int score, best = -1;
hash = hash_port_and_addr(ntohs(dport), daddr);
hashwild = hash_port_and_addr(ntohs(dport), 0);
read_lock(&udp_hash_lock); read_lock(&udp_hash_lock);
sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
lookup:
sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { if (sk->sk_hash != hash || ipv6_only_sock(sk) ||
int score = (sk->sk_family == PF_INET ? 1 : 0); inet->num != dport)
if (inet->rcv_saddr) { continue;
if (inet->rcv_saddr != daddr)
continue; score = (sk->sk_family == PF_INET ? 1 : 0);
score+=2; if (inet->rcv_saddr) {
} if (inet->rcv_saddr != daddr)
if (inet->daddr) { continue;
if (inet->daddr != saddr) score+=2;
continue; }
score+=2; if (inet->daddr) {
} if (inet->daddr != saddr)
if (inet->dport) { continue;
if (inet->dport != sport) score+=2;
continue; }
score+=2; if (inet->dport) {
} if (inet->dport != sport)
if (sk->sk_bound_dev_if) { continue;
if (sk->sk_bound_dev_if != dif) score+=2;
continue; }
score+=2; if (sk->sk_bound_dev_if) {
} if (sk->sk_bound_dev_if != dif)
if (score == 9) { continue;
result = sk; score+=2;
break; }
} else if (score > badness) { if (score == 9) {
result = sk; result = sk;
badness = score; goto found;
} } else if (score > best) {
result = sk;
best = score;
} }
} }
if (hash != hashwild) {
hash = hashwild;
goto lookup;
}
found:
if (result) if (result)
sock_hold(result); sock_hold(result);
read_unlock(&udp_hash_lock); read_unlock(&udp_hash_lock);
return result; return result;
} }
static inline struct sock *udp_v4_mcast_next(struct sock *sk, static inline struct sock *udp_v4_mcast_next(
__be16 loc_port, __be32 loc_addr, struct sock *sk,
__be16 rmt_port, __be32 rmt_addr, unsigned int hnum, __be16 loc_port, __be32 loc_addr,
int dif) __be16 rmt_port, __be32 rmt_addr,
int dif)
{ {
struct hlist_node *node; struct hlist_node *node;
struct sock *s = sk; struct sock *s = sk;
unsigned short hnum = ntohs(loc_port);
sk_for_each_from(s, node) { sk_for_each_from(s, node) {
struct inet_sock *inet = inet_sk(s); struct inet_sock *inet = inet_sk(s);
if (s->sk_hash != hnum || if (s->sk_hash != hnum ||
inet->num != loc_port ||
(inet->daddr && inet->daddr != rmt_addr) || (inet->daddr && inet->daddr != rmt_addr) ||
(inet->dport != rmt_port && inet->dport) || (inet->dport != rmt_port && inet->dport) ||
(inet->rcv_saddr && inet->rcv_saddr != loc_addr) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
...@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, ...@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
__be32 saddr, __be32 daddr, __be32 saddr, __be32 daddr,
struct hlist_head udptable[]) struct hlist_head udptable[])
{ {
struct sock *sk; struct sock *sk, *skw, *sknext;
int dif; int dif;
unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr);
unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0);
read_lock(&udp_hash_lock);
sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
dif = skb->dev->ifindex; dif = skb->dev->ifindex;
sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
if (sk) {
struct sock *sknext = NULL;
read_lock(&udp_hash_lock);
sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]);
skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]);
sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif);
if (!sk) {
hash = hashwild;
sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source,
saddr, dif);
}
if (sk) {
do { do {
struct sk_buff *skb1 = skb; struct sk_buff *skb1 = skb;
sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest,
sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, daddr, uh->source, saddr, dif);
uh->source, saddr, dif); if (!sknext && hash != hashwild) {
hash = hashwild;
sknext = udp_v4_mcast_next(skw, hash, uh->dest,
daddr, uh->source, saddr, dif);
}
if (sknext) if (sknext)
skb1 = skb_clone(skb, GFP_ATOMIC); skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1) { if (skb1) {
int ret = udp_queue_rcv_skb(sk, skb1); int ret = udp_queue_rcv_skb(sk, skb1);
if (ret > 0) if (ret > 0)
/* we should probably re-process instead /*
* of dropping packets here. */ * we should probably re-process
* instead of dropping packets here.
*/
kfree_skb(skb1); kfree_skb(skb1);
} }
sk = sknext; sk = sknext;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment