Commit 12ff91c8 authored by David S. Miller's avatar David S. Miller

Merge branch 'improving-TCP-behavior-on-host-congestion'

Yuchung Cheng says:

====================
improving TCP behavior on host congestion

This patch set aims to improve how TCP handle local qdisc congestion
by simplifying the previous implementation.  Previously when an
skb fails to (re)transmit due to local qdisc congestion or other
resource issue, TCP refrains from setting the skb timestamp or the
recovery starting time.

This design makes determining when to abort a stalling socket more
complicated, as the timestamps of these tranmission attempts were
missing. The stack needs to sort of infer when the original attempt
happens. A by-product is a socket may disregard the system timeout
limit (i.e. sysctl net.ipv4.tcp_retries2 or USER_TIMEOUT option),
and continue to retry until the transmission is successful.

In data-center environment when TCP RTO is small, this could cause
the socket to retry frequently for long during qdisc congestion.

The solution is to first unconditionally timestamp skb and recovery
attempt. Then retry more conservatively (twice a second) on local
qdisc congestion but abort the sockets according to the system limit.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 9b420eff c1d5674f
...@@ -980,7 +980,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, ...@@ -980,7 +980,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (sk->sk_pacing_status != SK_PACING_NONE) { if (sk->sk_pacing_status != SK_PACING_NONE) {
unsigned long rate = sk->sk_pacing_rate; unsigned long rate = sk->sk_pacing_rate;
...@@ -1028,7 +1027,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, ...@@ -1028,7 +1027,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
BUG_ON(!skb || !tcp_skb_pcount(skb)); BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk); tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
if (clone_it) { if (clone_it) {
TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- tp->snd_una; - tp->snd_una;
...@@ -1045,11 +1046,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, ...@@ -1045,11 +1046,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
return -ENOBUFS; return -ENOBUFS;
} }
prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
inet = inet_sk(sk); inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb); tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts)); memset(&opts, 0, sizeof(opts));
...@@ -2937,12 +2933,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) ...@@ -2937,12 +2933,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
} }
/* To avoid taking spuriously low RTT samples based on a timestamp
* for a transmit that never happened, always mark EVER_RETRANS
*/
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err); TCP_SKB_CB(skb)->seq, segs, err);
if (likely(!err)) { if (likely(!err)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
trace_tcp_retransmit_skb(sk, skb); trace_tcp_retransmit_skb(sk, skb);
} else if (err != -EBUSY) { } else if (err != -EBUSY) {
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
...@@ -2963,13 +2963,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) ...@@ -2963,13 +2963,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
#endif #endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb); tp->retrans_out += tcp_skb_pcount(skb);
/* Save stamp of the first retransmit. */
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
} }
/* Save stamp of the first (attempted) retransmit. */
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_skb_timestamp(skb);
if (tp->undo_retrans < 0) if (tp->undo_retrans < 0)
tp->undo_retrans = 0; tp->undo_retrans = 0;
tp->undo_retrans += tcp_skb_pcount(skb); tp->undo_retrans += tcp_skb_pcount(skb);
...@@ -3750,7 +3749,7 @@ void tcp_send_probe0(struct sock *sk) ...@@ -3750,7 +3749,7 @@ void tcp_send_probe0(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
unsigned long probe_max; unsigned long timeout;
int err; int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
...@@ -3762,26 +3761,18 @@ void tcp_send_probe0(struct sock *sk) ...@@ -3762,26 +3761,18 @@ void tcp_send_probe0(struct sock *sk)
return; return;
} }
icsk->icsk_probes_out++;
if (err <= 0) { if (err <= 0) {
if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++; icsk->icsk_backoff++;
icsk->icsk_probes_out++; timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
probe_max = TCP_RTO_MAX;
} else { } else {
/* If packet was not sent due to local congestion, /* If packet was not sent due to local congestion,
* do not backoff and do not remember icsk_probes_out. * Let senders fight for local resources conservatively.
* Let local senders to fight for local resources.
*
* Use accumulated backoff yet.
*/ */
if (!icsk->icsk_probes_out) timeout = TCP_RESOURCE_PROBE_INTERVAL;
icsk->icsk_probes_out = 1; }
probe_max = TCP_RESOURCE_PROBE_INTERVAL; tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
}
tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
tcp_probe0_when(sk, probe_max),
TCP_RTO_MAX,
NULL);
} }
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
......
...@@ -22,28 +22,14 @@ ...@@ -22,28 +22,14 @@
#include <linux/gfp.h> #include <linux/gfp.h>
#include <net/tcp.h> #include <net/tcp.h>
static u32 tcp_retransmit_stamp(const struct sock *sk)
{
u32 start_ts = tcp_sk(sk)->retrans_stamp;
if (unlikely(!start_ts)) {
struct sk_buff *head = tcp_rtx_queue_head(sk);
if (!head)
return 0;
start_ts = tcp_skb_timestamp(head);
}
return start_ts;
}
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts; u32 elapsed, start_ts;
s32 remaining; s32 remaining;
start_ts = tcp_retransmit_stamp(sk); start_ts = tcp_sk(sk)->retrans_stamp;
if (!icsk->icsk_user_timeout || !start_ts) if (!icsk->icsk_user_timeout)
return icsk->icsk_rto; return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
remaining = icsk->icsk_user_timeout - elapsed; remaining = icsk->icsk_user_timeout - elapsed;
...@@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) ...@@ -173,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} }
static unsigned int tcp_model_timeout(struct sock *sk,
unsigned int boundary,
unsigned int rto_base)
{
unsigned int linear_backoff_thresh, timeout;
linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base);
if (boundary <= linear_backoff_thresh)
timeout = ((2 << boundary) - 1) * rto_base;
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
return jiffies_to_msecs(timeout);
}
/** /**
* retransmits_timed_out() - returns true if this connection has timed out * retransmits_timed_out() - returns true if this connection has timed out
* @sk: The current socket * @sk: The current socket
...@@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk, ...@@ -191,26 +190,15 @@ static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary, unsigned int boundary,
unsigned int timeout) unsigned int timeout)
{ {
const unsigned int rto_base = TCP_RTO_MIN; unsigned int start_ts;
unsigned int linear_backoff_thresh, start_ts;
if (!inet_csk(sk)->icsk_retransmits) if (!inet_csk(sk)->icsk_retransmits)
return false; return false;
start_ts = tcp_retransmit_stamp(sk); start_ts = tcp_sk(sk)->retrans_stamp;
if (!start_ts) if (likely(timeout == 0))
return false; timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN);
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
if (boundary <= linear_backoff_thresh)
timeout = ((2 << boundary) - 1) * rto_base;
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
timeout = jiffies_to_msecs(timeout);
}
return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
} }
...@@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk) ...@@ -345,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk); struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
int max_probes; int max_probes;
u32 start_ts;
if (tp->packets_out || !skb) { if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0; icsk->icsk_probes_out = 0;
...@@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk) ...@@ -360,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when * corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer(). * we use RTO to probe window in tcp_retransmit_timer().
*/ */
start_ts = tcp_skb_timestamp(skb); if (icsk->icsk_user_timeout) {
if (!start_ts) u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
skb->skb_mstamp_ns = tp->tcp_clock_cache; tcp_probe0_base(sk));
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) if (elapsed >= icsk->icsk_user_timeout)
goto abort; goto abort;
}
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) { if (sock_flag(sk, SOCK_DEAD)) {
...@@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) ...@@ -395,6 +383,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? : int max_retries = icsk->icsk_syn_retries ? :
sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct tcp_sock *tp = tcp_sk(sk);
struct request_sock *req; struct request_sock *req;
req = tcp_sk(sk)->fastopen_rsk; req = tcp_sk(sk)->fastopen_rsk;
...@@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk) ...@@ -412,6 +401,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
inet_rtx_syn_ack(sk, req); inet_rtx_syn_ack(sk, req);
req->num_timeout++; req->num_timeout++;
icsk->icsk_retransmits++; icsk->icsk_retransmits++;
if (!tp->retrans_stamp)
tp->retrans_stamp = tcp_time_stamp(tp);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
} }
...@@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -443,10 +434,8 @@ void tcp_retransmit_timer(struct sock *sk)
*/ */
return; return;
} }
if (!tp->packets_out) if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk)))
goto out; return;
WARN_ON(tcp_rtx_queue_empty(sk));
tp->tlp_high_seq = 0; tp->tlp_high_seq = 0;
...@@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -511,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk); tcp_enter_loss(sk);
icsk->icsk_retransmits++;
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion, /* Retransmission failed because of local congestion,
* do not backoff. * Let senders fight for local resources conservatively.
*/ */
if (!icsk->icsk_retransmits)
icsk->icsk_retransmits = 1;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), TCP_RESOURCE_PROBE_INTERVAL,
TCP_RTO_MAX); TCP_RTO_MAX);
goto out; goto out;
} }
...@@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk) ...@@ -539,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk)
* the 120 second clamps though! * the 120 second clamps though!
*/ */
icsk->icsk_backoff++; icsk->icsk_backoff++;
icsk->icsk_retransmits++;
out_reset_timer: out_reset_timer:
/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment