Commit dfa2f048 — authored by Eric Dumazet, committed by Jakub Kicinski

tcp: get rid of sysctl_tcp_adv_win_scale

With modern NIC drivers shifting to full page allocations per
received frame, we face the following issue:

TCP has one per-netns sysctl used to tweak how to translate
a memory use into an expected payload (RWIN), in RX path.

tcp_win_from_space() implementation is limited to few cases.

For hosts dealing with various MSS, we either under estimate
or over estimate the RWIN we send to the remote peers.

For instance with the default sysctl_tcp_adv_win_scale value,
we expect to store 50% of payload per allocated chunk of memory.

For the typical use of MTU=1500 traffic, and order-0 pages allocations
by NIC drivers, we are sending too big RWIN, leading to potential
tcp collapse operations, which are extremely expensive and source
of latency spikes.

This patch makes sysctl_tcp_adv_win_scale obsolete, and instead
uses a per socket scaling factor, so that we can precisely
adjust the RWIN based on effective skb->len/skb->truesize ratio.

This patch alone can double TCP receive performance when receivers
are too slow to drain their receive queue, or by allowing
a bigger RWIN when MSS is close to PAGE_SIZE.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Link: https://lore.kernel.org/r/20230717152917.751987-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 63c8778d
@@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN
 	option can harm clients of your server.
 
 tcp_adv_win_scale - INTEGER
+	Obsolete since linux-6.6
 	Count buffering overhead as bytes/2^tcp_adv_win_scale
 	(if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale),
 	if it is <= 0.
@@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
 	return (struct tcp_request_sock *)req;
 }
 
+#define TCP_RMEM_TO_WIN_SCALE 8
+
 struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
@@ -238,7 +240,7 @@ struct tcp_sock {
 	u32	window_clamp;	/* Maximal window to advertise		*/
 	u32	rcv_ssthresh;	/* Current window clamp			*/
-
+	u8	scaling_ratio;	/* see tcp_win_from_space() */
 	/* Information of the most recently (s)acked skb */
 	struct tcp_rack {
 		u64 mstamp; /* (Re)sent time of the skb */
@@ -152,7 +152,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_abort_on_overflow;
 	u8 sysctl_tcp_fack; /* obsolete */
 	int sysctl_tcp_max_reordering;
-	int sysctl_tcp_adv_win_scale;
+	int sysctl_tcp_adv_win_scale; /* obsolete */
 	u8 sysctl_tcp_dsack;
 	u8 sysctl_tcp_app_win;
 	u8 sysctl_tcp_frto;
@@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
 
 static inline int tcp_win_from_space(const struct sock *sk, int space)
 {
-	int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
-
-	return tcp_adv_win_scale <= 0 ?
-		(space>>(-tcp_adv_win_scale)) :
-		space - (space>>tcp_adv_win_scale);
+	s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio;
+
+	return scaled_space >> TCP_RMEM_TO_WIN_SCALE;
+}
+
+/* inverse of tcp_win_from_space() */
+static inline int tcp_space_from_win(const struct sock *sk, int win)
+{
+	u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE;
+
+	do_div(val, tcp_sk(sk)->scaling_ratio);
+	return val;
+}
+
+static inline void tcp_scaling_ratio_init(struct sock *sk)
+{
+	/* Assume a conservative default of 1200 bytes of payload per 4K page.
+	 * This may be adjusted later in tcp_measure_rcv_mss().
+	 */
+	tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) /
+				    SKB_TRUESIZE(4096);
 }
 
 /* Note: caller must be prepared to deal with negative returns */
@@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk)
 
 	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
 	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+	tcp_scaling_ratio_init(sk);
 
 	set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
 	sk_sockets_allocated_inc(sk);
@@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len);
 /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
 int tcp_set_rcvlowat(struct sock *sk, int val)
 {
-	int cap;
+	int space, cap;
 
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		cap = sk->sk_rcvbuf >> 1;
@@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 		return 0;
 
-	val <<= 1;
-	if (val > sk->sk_rcvbuf) {
-		WRITE_ONCE(sk->sk_rcvbuf, val);
-		tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+	space = tcp_space_from_win(sk, val);
+	if (space > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, space);
+		tcp_sk(sk)->window_clamp = val;
 	}
 	return 0;
 }
@@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 	 */
 	len = skb_shinfo(skb)->gso_size ? : skb->len;
 	if (len >= icsk->icsk_ack.rcv_mss) {
+		/* Note: divides are still a bit expensive.
+		 * For the moment, only adjust scaling_ratio
+		 * when we update icsk_ack.rcv_mss.
+		 */
+		if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+			u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+
+			do_div(val, skb->truesize);
+			tcp_sk(sk)->scaling_ratio = val ? val : 1;
+		}
 		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 					       tcp_sk(sk)->advmss);
 		/* Account for possibly-removed options */
@@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		int rcvmem, rcvbuf;
 		u64 rcvwin, grow;
+		int rcvbuf;
 
 		/* minimal window to cope with packet losses, assuming
 		 * steady state. Add some cushion because of small variations.
@@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
 		do_div(grow, tp->rcvq_space.space);
 		rcvwin += (grow << 1);
 
-		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
-		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
-			rcvmem += 128;
-
-		do_div(rcvwin, tp->advmss);
-		rcvbuf = min_t(u64, rcvwin * rcvmem,
+		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
 			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		if (rcvbuf > sk->sk_rcvbuf) {
 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment