Commit 8fc8911b authored by Paolo Abeni

Merge branch 'tcp-backlog-processing-optims'

Eric Dumazet says:

====================
tcp: backlog processing optims

The first patches mostly prepare the ground for the last one.

The last patch of the series implements a form of ACK reduction,
applied only when a TCP receiver is under high stress, which
happens for high-throughput flows.

This gives us a ~20% throughput increase for a single TCP flow (100Gbit -> 120Gbit).
====================

Link: https://lore.kernel.org/r/20230911170531.828100-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
parents cd8bae85 133c4c0d
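For orientation before the diffs: the series lets a user thread that is draining the socket backlog defer pure ACKs instead of emitting one per processed segment, then sends a single ACK once the backlog has been handled. Below is a condensed sketch of that flow, pieced together from the __tcp_ack_snd_check() and tcp_release_cb() hunks further down. It is illustrative only, not literal kernel code; ack_is_needed_now() is a hypothetical stand-in for the real trigger conditions.

/* Backlog side: instead of tcp_send_ack(), remember that an ACK is owed. */
static void ack_snd_check_sketch(struct sock *sk)
{
	if (ack_is_needed_now(sk)) {	/* hypothetical stand-in for the checks in __tcp_ack_snd_check() */
		if (sock_owned_by_user_nocheck(sk) &&
		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
			return;
		}
		tcp_send_ack(sk);	/* normal path: immediate ACK */
	}
}

/* Release side: one ACK for the whole backlog, sent from tcp_release_cb(). */
static void release_cb_ack_sketch(struct sock *sk, unsigned long flags)
{
	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
		tcp_send_ack(sk);
}

Fewer ACKs under load is what buys the ~20% single-flow gain quoted above; softirq receive processing is unaffected, because sock_owned_by_user_nocheck() is false there.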
@@ -745,6 +745,13 @@ tcp_comp_sack_nr - INTEGER
 	Default : 44
 
+tcp_backlog_ack_defer - BOOLEAN
+	If set, user thread processing socket backlog tries sending
+	one ACK for the whole queue. This helps to avoid potential
+	long latencies at end of a TCP socket syscall.
+
+	Default : true
+
 tcp_slow_start_after_idle - BOOLEAN
 	If set, provide RFC2861 behavior and time out the congestion
 	window after an idle period. An idle period is defined at
...
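The new knob is per network namespace and defaults to enabled (see tcp_sk_init() further down). A minimal userspace check of the runtime value, assuming the usual procfs layout implied by the documentation entry above (this small program is illustrative, not part of the patch):

#include <stdio.h>

int main(void)
{
	/* Standard net.ipv4.* sysctl path for the knob documented above. */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_backlog_ack_defer", "r");
	int val;

	if (!f) {
		perror("tcp_backlog_ack_defer");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("tcp_backlog_ack_defer = %d\n", val);
	fclose(f);
	return 0;
}

Writing 0 or 1 to the same path (or via sysctl -w net.ipv4.tcp_backlog_ack_defer=...) toggles the behavior at runtime; the ctl_table entry below clamps the value to that range.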
@@ -463,15 +463,17 @@ enum tsq_enum {
 	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
				    * tcp_v{4|6}_mtu_reduced()
				    */
+	TCP_ACK_DEFERRED,	   /* TX pure ack is deferred */
 };
 
 enum tsq_flags {
-	TSQF_THROTTLED			= (1UL << TSQ_THROTTLED),
-	TSQF_QUEUED			= (1UL << TSQ_QUEUED),
-	TCPF_TSQ_DEFERRED		= (1UL << TCP_TSQ_DEFERRED),
-	TCPF_WRITE_TIMER_DEFERRED	= (1UL << TCP_WRITE_TIMER_DEFERRED),
-	TCPF_DELACK_TIMER_DEFERRED	= (1UL << TCP_DELACK_TIMER_DEFERRED),
-	TCPF_MTU_REDUCED_DEFERRED	= (1UL << TCP_MTU_REDUCED_DEFERRED),
+	TSQF_THROTTLED			= BIT(TSQ_THROTTLED),
+	TSQF_QUEUED			= BIT(TSQ_QUEUED),
+	TCPF_TSQ_DEFERRED		= BIT(TCP_TSQ_DEFERRED),
+	TCPF_WRITE_TIMER_DEFERRED	= BIT(TCP_WRITE_TIMER_DEFERRED),
+	TCPF_DELACK_TIMER_DEFERRED	= BIT(TCP_DELACK_TIMER_DEFERRED),
+	TCPF_MTU_REDUCED_DEFERRED	= BIT(TCP_MTU_REDUCED_DEFERRED),
+	TCPF_ACK_DEFERRED		= BIT(TCP_ACK_DEFERRED),
 };
 
 #define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk)
...
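Note that the switch from open-coded shifts to BIT() in this hunk is purely cosmetic: BIT(nr), provided via include/linux/bits.h, expands to roughly the following, so every existing flag keeps its value and TCPF_ACK_DEFERRED simply takes the next free bit.

#define BIT(nr)	(1UL << (nr))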
@@ -132,6 +132,7 @@ struct netns_ipv4 {
 	u8 sysctl_tcp_syncookies;
 	u8 sysctl_tcp_migrate_req;
 	u8 sysctl_tcp_comp_sack_nr;
+	u8 sysctl_tcp_backlog_ack_defer;
 	int sysctl_tcp_reordering;
 	u8 sysctl_tcp_retries1;
 	u8 sysctl_tcp_retries2;
...
@@ -1823,12 +1823,11 @@ static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
 
 static inline void sock_release_ownership(struct sock *sk)
 {
-	if (sock_owned_by_user_nocheck(sk)) {
-		sk->sk_lock.owned = 0;
+	DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
+	sk->sk_lock.owned = 0;
 
-		/* The sk_lock has mutex_unlock() semantics: */
-		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
-	}
+	/* The sk_lock has mutex_unlock() semantics: */
+	mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 }
 
 /* no reclassification while locks are held */
...
@@ -3001,6 +3001,9 @@ void __sk_flush_backlog(struct sock *sk)
 {
 	spin_lock_bh(&sk->sk_lock.slock);
 	__release_sock(sk);
+	if (sk->sk_prot->release_cb)
+		sk->sk_prot->release_cb(sk);
 	spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
@@ -3519,9 +3522,6 @@ void release_sock(struct sock *sk)
 	if (sk->sk_backlog.tail)
 		__release_sock(sk);
 
-	/* Warning : release_cb() might need to release sk ownership,
-	 * ie call sock_release_ownership(sk) before us.
-	 */
 	if (sk->sk_prot->release_cb)
 		sk->sk_prot->release_cb(sk);
 
...
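One subtlety in the two sock.c hunks above: release_cb() is now invoked while the caller still owns the socket, both from __sk_flush_backlog() and from release_sock(); ownership is dropped exactly once afterwards, and sock_release_ownership() now warns (under DEBUG_NET) if that invariant is violated instead of silently skipping the unlock bookkeeping. Roughly, the resulting release_sock() flow looks like the sketch below, simplified from net/core/sock.c as of this series (not a verbatim copy):

void release_sock_sketch(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);		/* may set TCP_ACK_DEFERRED while socket is owned */

	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);	/* flush deferred work, still owning the socket */

	sock_release_ownership(sk);		/* single, unconditional ownership release */
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}

This ordering is what allows tcp_release_cb() (further down) to drop its old early sock_release_ownership() call and the long comment that explained it.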
@@ -1366,6 +1366,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dou8vec_minmax,
 		.extra1		= SYSCTL_ZERO,
 	},
+	{
+		.procname	= "tcp_backlog_ack_defer",
+		.data		= &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+		.maxlen		= sizeof(u8),
+		.mode		= 0644,
+		.proc_handler	= proc_dou8vec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname	= "tcp_reflect_tos",
 		.data		= &init_net.ipv4.sysctl_tcp_reflect_tos,
...
@@ -5553,6 +5553,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
 	    inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+		/* If we are running from __release_sock() in user context,
+		 * Defer the ack until tcp_release_cb().
+		 */
+		if (sock_owned_by_user_nocheck(sk) &&
+		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
+			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+			return;
+		}
 send_now:
 		tcp_send_ack(sk);
 		return;
...
@@ -3263,6 +3263,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
 	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
 	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
 	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
 	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
 	atomic_set(&net->ipv4.tfo_active_disable_times, 0);
...
@@ -1077,7 +1077,8 @@ static void tcp_tasklet_func(struct tasklet_struct *t)
 #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			   TCPF_WRITE_TIMER_DEFERRED |	\
			   TCPF_DELACK_TIMER_DEFERRED |	\
-			   TCPF_MTU_REDUCED_DEFERRED)
+			   TCPF_MTU_REDUCED_DEFERRED |	\
+			   TCPF_ACK_DEFERRED)
 
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket

@@ -1101,16 +1102,6 @@ void tcp_release_cb(struct sock *sk)
 		tcp_tsq_write(sk);
 		__sock_put(sk);
 	}
-	/* Here begins the tricky part :
-	 * We are called from release_sock() with :
-	 * 1) BH disabled
-	 * 2) sk_lock.slock spinlock held
-	 * 3) socket owned by us (sk->sk_lock.owned == 1)
-	 *
-	 * But following code is meant to be called from BH handlers,
-	 * so we should keep BH disabled, but early release socket ownership
-	 */
-	sock_release_ownership(sk);
 
 	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
 		tcp_write_timer_handler(sk);

@@ -1124,6 +1115,8 @@ void tcp_release_cb(struct sock *sk)
 		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
 		__sock_put(sk);
 	}
+	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+		tcp_send_ack(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
...