Commit 567e4b79 authored by Eric Dumazet, committed by David S. Miller

net: rfs: add hash collision detection

Receive Flow Steering is a nice solution but suffers from
hash collisions when a mix of connected and unconnected traffic
is received on the host and the flow hash table is populated.

Also, clearing the flow in inet_release() makes RFS not very good
for short-lived flows, as many packets can follow close()
(FIN, ACK packets, ...).

This patch extends the information stored in the global hash table
to include not only the cpu number but also the upper part of the hash value.

I use a 32bit value, and dynamically split it into two parts.

For a host with up to 64 possible cpus, this gives 6 bits for the
cpu number and 26 (32 - 6) bits for the upper part of the hash.

Since hash bucket selection uses the low-order bits of the hash, we get
a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big
enough.
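
To make the split concrete, here is a minimal userspace sketch (illustration
only, not the kernel code; the 64-cpu configuration and the example hash/cpu
values are assumptions) of how one 32bit table entry packs the upper hash
bits together with the cpu number, mirroring the rps_record_sock_flow()
change in the diff below:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* 64 possible cpus: roundup_pow_of_two(64) - 1 = 0x3f, i.e. 6 cpu bits */
	uint32_t rps_cpu_mask = 0x3f;
	uint32_t hash = 0xdeadbeef;	/* example flow hash */
	uint32_t cpu = 3;		/* cpu that last ran recvmsg() */

	/* upper 26 bits keep the hash, lower 6 bits keep the cpu number */
	uint32_t ent = (hash & ~rps_cpu_mask) | cpu;

	printf("entry = 0x%08x\n", ent);	/* prints entry = 0xdeadbec3 */
	return 0;
}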

If the hash found in the flow table does not match, we fall back to RPS (if
it is enabled for the rxqueue).

This means that a packet for a non-connected flow can avoid the
IPI to an unrelated/victim CPU.
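
For the lookup side, a matching sketch (again illustration only;
rfs_entry_matches() is a hypothetical helper, not part of the patch) of the
collision check performed in get_rps_cpu() below: if the upper bits of the
stored entry and of the packet hash differ, the bucket belongs to another
flow and the caller falls back to plain RPS; otherwise the low bits give the
target cpu.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the check added to get_rps_cpu() */
static bool rfs_entry_matches(uint32_t ent, uint32_t hash,
			      uint32_t rps_cpu_mask, uint32_t *next_cpu)
{
	/* upper hash bits differ: hash collision or stale entry */
	if ((ent ^ hash) & ~rps_cpu_mask)
		return false;	/* caller should fall back to plain RPS */

	*next_cpu = ent & rps_cpu_mask;	/* cpu recorded at recvmsg() time */
	return true;
}

int main(void)
{
	uint32_t next_cpu;

	/* same flow (upper bits match): entry 0xdeadbec3 vs hash 0xdeadbeef */
	printf("%d\n", rfs_entry_matches(0xdeadbec3, 0xdeadbeef, 0x3f, &next_cpu));
	/* colliding flow (upper bits differ): caller would fall back to RPS */
	printf("%d\n", rfs_entry_matches(0xdeadbec3, 0x12345678, 0x3f, &next_cpu));
	return 0;
}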

This also means we no longer have to clear the table at socket
close time, which helps the performance of short-lived flows.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 096a4cfa
@@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
 {
 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
 		  e->rxhash, e->queue_index);
-	sock_rps_reset_flow_hash(e->rps_rxhash);
 	hlist_del_rcu(&e->hash_link);
 	kfree_rcu(e, rcu);
 	--tun->flow_count;
@@ -373,10 +372,8 @@ static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
  */
 static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
 {
-	if (unlikely(e->rps_rxhash != hash)) {
-		sock_rps_reset_flow_hash(e->rps_rxhash);
+	if (unlikely(e->rps_rxhash != hash))
 		e->rps_rxhash = hash;
-	}
 }
 
 /* We try to identify a flow through its rxhash first. The reason that
......
@@ -644,39 +644,39 @@ struct rps_dev_flow_table {
 /*
  * The rps_sock_flow_table contains mappings of flows to the last CPU
  * on which they were processed by the application (set in recvmsg).
+ * Each entry is a 32bit value. Upper part is the high order bits
+ * of flow hash, lower part is cpu number.
+ * rps_cpu_mask is used to partition the space, depending on number of
+ * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1
+ * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f,
+ * meaning we use 32-6=26 bits for the hash.
  */
 struct rps_sock_flow_table {
-	unsigned int mask;
-	u16 ents[0];
+	u32	mask;
+	u32	ents[0];
 };
-#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \
-    ((_num) * sizeof(u16)))
+#define	RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num]))
 
 #define RPS_NO_CPU 0xffff
 
+extern u32 rps_cpu_mask;
+extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+
 static inline void rps_record_sock_flow(struct rps_sock_flow_table *table,
 					u32 hash)
 {
 	if (table && hash) {
-		unsigned int cpu, index = hash & table->mask;
+		unsigned int index = hash & table->mask;
+		u32 val = hash & ~rps_cpu_mask;
 
 		/* We only give a hint, preemption can change cpu under us */
-		cpu = raw_smp_processor_id();
+		val |= raw_smp_processor_id();
 
-		if (table->ents[index] != cpu)
-			table->ents[index] = cpu;
+		if (table->ents[index] != val)
+			table->ents[index] = val;
 	}
 }
 
-static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
-				       u32 hash)
-{
-	if (table && hash)
-		table->ents[hash & table->mask] = RPS_NO_CPU;
-}
-
-extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
-
 #ifdef CONFIG_RFS_ACCEL
 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
 			 u16 filter_id);
......
@@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
 #endif
 }
 
-static inline void sock_rps_reset_flow_hash(__u32 hash)
-{
-#ifdef CONFIG_RPS
-	struct rps_sock_flow_table *sock_flow_table;
-
-	rcu_read_lock();
-	sock_flow_table = rcu_dereference(rps_sock_flow_table);
-	rps_reset_sock_flow(sock_flow_table, hash);
-	rcu_read_unlock();
-#endif
-}
-
 static inline void sock_rps_record_flow(const struct sock *sk)
 {
 #ifdef CONFIG_RPS
@@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
-static inline void sock_rps_reset_flow(const struct sock *sk)
-{
-#ifdef CONFIG_RPS
-	sock_rps_reset_flow_hash(sk->sk_rxhash);
-#endif
-}
-
 static inline void sock_rps_save_rxhash(struct sock *sk,
 					const struct sk_buff *skb)
 {
 #ifdef CONFIG_RPS
-	if (unlikely(sk->sk_rxhash != skb->hash)) {
-		sock_rps_reset_flow(sk);
+	if (unlikely(sk->sk_rxhash != skb->hash))
 		sk->sk_rxhash = skb->hash;
-	}
 #endif
 }
 
 static inline void sock_rps_reset_rxhash(struct sock *sk)
 {
 #ifdef CONFIG_RPS
-	sock_rps_reset_flow(sk);
 	sk->sk_rxhash = 0;
 #endif
 }
......
@@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 /* One global table that all flow-based protocols share. */
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
 
 struct static_key rps_needed __read_mostly;
@@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		       struct rps_dev_flow **rflowp)
 {
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
+	const struct rps_sock_flow_table *sock_flow_table;
+	struct netdev_rx_queue *rxqueue = dev->_rx;
 	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
+	struct rps_map *map;
 	int cpu = -1;
-	u16 tcpu;
+	u32 tcpu;
 	u32 hash;
 
 	if (skb_rx_queue_recorded(skb)) {
 		u16 index = skb_get_rx_queue(skb);
+
 		if (unlikely(index >= dev->real_num_rx_queues)) {
 			WARN_ONCE(dev->real_num_rx_queues > 1,
 				  "%s received packet on queue %u, but number "
@@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 				  dev->name, index, dev->real_num_rx_queues);
 			goto done;
 		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
+		rxqueue += index;
+	}
 
+	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	map = rcu_dereference(rxqueue->rps_map);
-	if (map) {
-		if (map->len == 1 &&
-		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
-			tcpu = map->cpus[0];
-			if (cpu_online(tcpu))
-				cpu = tcpu;
-			goto done;
-		}
-	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+	if (!flow_table && !map)
 		goto done;
-	}
 
 	skb_reset_network_header(skb);
 	hash = skb_get_hash(skb);
 	if (!hash)
 		goto done;
 
-	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
-		u16 next_cpu;
 		struct rps_dev_flow *rflow;
+		u32 next_cpu;
+		u32 ident;
 
+		/* First check into global flow table if there is a match */
+		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+		if ((ident ^ hash) & ~rps_cpu_mask)
+			goto try_rps;
+
+		next_cpu = ident & rps_cpu_mask;
+
+		/* OK, now we know there is a match,
+		 * we can look at the local (per receive queue) flow table
+		 */
 		rflow = &flow_table->flows[hash & flow_table->mask];
 		tcpu = rflow->cpu;
 
-		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
 		/*
 		 * If the desired CPU (where last recvmsg was done) is
 		 * different from current CPU (one in the rx-queue flow
@@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		}
 	}
 
+try_rps:
+
 	if (map) {
 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
 		if (cpu_online(tcpu)) {
......
@@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
 					mutex_unlock(&sock_flow_mutex);
 					return -ENOMEM;
 				}
-
+				rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
 				sock_table->mask = size - 1;
 			} else
 				sock_table = orig_sock_table;
......
@@ -395,8 +395,6 @@ int inet_release(struct socket *sock)
 	if (sk) {
 		long timeout;
 
-		sock_rps_reset_flow(sk);
-
 		/* Applications forget to leave groups before exiting */
 		ip_mc_drop_socket(sk);
......