Commit 2c8c56e1 authored by Eric Dumazet, committed by David S. Miller

net: introduce SO_INCOMING_CPU

An alternative to RPS/RFS is to use hardware support for multiple
queues.

Then split a set of millions of sockets across worker threads, each
one using epoll() to manage events on its own socket pool.

Ideally, we want one thread per RX/TX queue/cpu, but we have no way to
know, after accept() or connect(), on which queue/cpu a socket is managed.

We normally use one cpu per RX queue (IRQ smp_affinity being properly
set), so remembering in the socket structure which cpu delivered the last
packet is enough to solve the problem.
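
The threading model described above is not part of this patch; the following is a minimal user-space sketch of it, assuming IRQ smp_affinity already maps RX queue i to cpu i. The struct worker, worker_main() and start_worker() names are purely illustrative.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <sys/epoll.h>

struct worker {
	int		epfd;	/* per-worker epoll instance */
	pthread_t	tid;
};

/* Each worker services only the sockets registered on its own epoll fd. */
static void *worker_main(void *arg)
{
	struct worker *w = arg;
	struct epoll_event events[64];

	for (;;) {
		int n = epoll_wait(w->epfd, events, 64, -1);

		for (int i = 0; i < n; i++) {
			/* read()/write() on events[i].data.fd ... */
		}
	}
	return NULL;
}

/* One worker per RX queue/cpu, pinned on the cpu that services the queue. */
static int start_worker(struct worker *w, int cpu)
{
	cpu_set_t set;

	w->epfd = epoll_create1(0);
	if (w->epfd < 0)
		return -1;
	if (pthread_create(&w->tid, NULL, worker_main, w))
		return -1;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	return pthread_setaffinity_np(w->tid, sizeof(set), &set);
}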

After accept(), connect(), or even passing a file descriptor between
processes, applications can use:

 int cpu;
 socklen_t len = sizeof(cpu);

 getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len);

And use this information to put the socket into the right silo
for optimal performance, as all of the networking stack processing
then runs on the appropriate cpu, without the need to send IPIs (RPS/RFS).
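
For instance, a dispatcher thread running the accept() loop could use the value to register each new socket on the epoll instance of the matching worker. This continues the hypothetical sketch above: workers[] and its epfd field are assumptions of that sketch, and falling back to worker 0 is an arbitrary choice.

#include <sys/epoll.h>
#include <sys/socket.h>

#ifndef SO_INCOMING_CPU
#define SO_INCOMING_CPU 49	/* asm-generic value added by this patch */
#endif

/* Hand a freshly accepted socket to the worker owning its RX cpu. */
static void dispatch_one(int listen_fd, struct worker *workers, int nr_workers)
{
	int fd = accept(listen_fd, NULL, NULL);
	int cpu = 0;
	socklen_t len = sizeof(cpu);
	struct epoll_event ev;

	if (fd < 0)
		return;

	if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, &len) < 0 ||
	    cpu < 0 || cpu >= nr_workers)
		cpu = 0;	/* arbitrary fallback worker */

	ev.events = EPOLLIN;
	ev.data.fd = fd;
	epoll_ctl(workers[cpu].epfd, EPOLL_CTL_ADD, fd, &ev);
}

A file descriptor passed to another process can be queried the same way, since the option simply reads the cpu recorded by the receive path for the last delivered packet.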
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 3d97379a
@@ -87,4 +87,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -80,4 +80,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _UAPI__ASM_AVR32_SOCKET_H */
@@ -82,6 +82,8 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_SOCKET_H */
@@ -80,5 +80,7 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_SOCKET_H */
@@ -89,4 +89,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_IA64_SOCKET_H */
@@ -80,4 +80,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_M32R_SOCKET_H */
@@ -98,4 +98,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -80,4 +80,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_SOCKET_H */
@@ -79,4 +79,6 @@
 #define SO_BPF_EXTENSIONS	0x4029
+#define SO_INCOMING_CPU		0x402A
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -87,4 +87,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_POWERPC_SOCKET_H */
@@ -86,4 +86,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _ASM_SOCKET_H */
@@ -76,6 +76,8 @@
 #define SO_BPF_EXTENSIONS	0x0032
+#define SO_INCOMING_CPU		0x0033
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION	0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
@@ -91,4 +91,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* _XTENSA_SOCKET_H */
@@ -273,6 +273,7 @@ struct cg_proto;
  *	@sk_rcvtimeo: %SO_RCVTIMEO setting
  *	@sk_sndtimeo: %SO_SNDTIMEO setting
  *	@sk_rxhash: flow hash received from netif layer
+ *	@sk_incoming_cpu: record cpu processing incoming packets
  *	@sk_txhash: computed flow hash for use on transmit
  *	@sk_filter: socket filtering instructions
  *	@sk_protinfo: private area, net family specific, when not using slab
@@ -350,6 +351,12 @@ struct sock {
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
 #endif
+	u16			sk_incoming_cpu;
+	/* 16bit hole
+	 * Warned : sk_incoming_cpu can be set from softirq,
+	 * Do not use this hole without fully understanding possible issues.
+	 */
 	__u32			sk_txhash;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	unsigned int		sk_napi_id;
@@ -833,6 +840,11 @@ static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 	return sk->sk_backlog_rcv(sk, skb);
 }
+static inline void sk_incoming_cpu_update(struct sock *sk)
+{
+	sk->sk_incoming_cpu = raw_smp_processor_id();
+}
 static inline void sock_rps_record_flow_hash(__u32 hash)
 {
 #ifdef CONFIG_RPS
@@ -82,4 +82,6 @@
 #define SO_BPF_EXTENSIONS	48
+#define SO_INCOMING_CPU		49
 #endif /* __ASM_GENERIC_SOCKET_H */
@@ -1213,6 +1213,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sk->sk_max_pacing_rate;
 		break;
+	case SO_INCOMING_CPU:
+		v.val = sk->sk_incoming_cpu;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -1517,6 +1521,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		newsk->sk_err = 0;
 		newsk->sk_priority = 0;
+		newsk->sk_incoming_cpu = raw_smp_processor_id();
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
 		 * (Documentation/RCU/rculist_nulls.txt for details)
@@ -1663,6 +1663,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 	bh_lock_sock_nested(sk);
@@ -1445,6 +1445,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (inet_sk(sk)->inet_daddr) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 	rc = sock_queue_rcv_skb(sk, skb);
@@ -1456,6 +1456,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
+	sk_incoming_cpu_update(sk);
 	skb->dev = NULL;
 	bh_lock_sock_nested(sk);
@@ -577,6 +577,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
 		sock_rps_save_rxhash(sk, skb);
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
 	}
 	rc = sock_queue_rcv_skb(sk, skb);
@@ -205,9 +205,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
 	if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN))
 		goto out_free;
-	if (!sctp_ulpevent_is_notification(event))
+	if (!sctp_ulpevent_is_notification(event)) {
 		sk_mark_napi_id(sk, skb);
+		sk_incoming_cpu_update(sk);
+	}
 	/* Check if the user wishes to receive this event. */
 	if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe))
 		goto out_free;