Merge branch 'tcp-delayed-completions'

Eric Dumazet says:

====================
tcp: better deal with delayed TX completions

Jakub and Neil reported an increase in RTO timers whenever TX completions are delayed a bit more (by increasing NIC TX coalescing parameters).

While the problems have been there forever, the second patch might introduce some regressions, so I prefer not to backport them to stable releases before things settle.

Many thanks to the FB team for their help and tests.

A few packetdrill tests need to be changed to reflect the improvements brought by this series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'tcp-delayed-completions'
Eric Dumazet says:

====================
tcp: better deal with delayed TX completions

Jakub and Neil reported an increase in RTO timers whenever TX completions are delayed a bit more (by increasing NIC TX coalescing parameters).

While the problems have been there forever, the second patch might introduce some regressions, so I prefer not to backport them to stable releases before things settle.

Many thanks to the FB team for their help and tests.

A few packetdrill tests need to be changed to reflect the improvements brought by this series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
5215206d · David S. Miller · 8176f8c0 · ac3959fd · 5215206d · 5215206d
Commit 5215206d authored Mar 11, 2021 by David S. Miller
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 19 deletions

include/linux/skbuff.h include/linux/skbuff.h +1 -1

net/ipv4/tcp_input.c net/ipv4/tcp_input.c +4 -6

net/ipv4/tcp_output.c net/ipv4/tcp_output.c +8 -12

No files found.
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1140,7 +1140,7 @@ static inline bool skb_fclone_busy(const struct sock *sk,

 	return skb->fclone == SKB_FCLONE_ORIG &&
 	       refcount_read(&fclones->fclone_ref) > 1 &&
-	       fclones->skb2.sk == sk;
+	       READ_ONCE(fclones->skb2.sk) == sk;
 }

 /**

--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2914,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 	/* D. Check state exit conditions. State can be terminated
 	 *    when high_seq is ACKed. */
 	if (icsk->icsk_ca_state == TCP_CA_Open) {
-		WARN_ON(tp->retrans_out != 0);
+		WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
 		tp->retrans_stamp = 0;
 	} else if (!before(tp->snd_una, tp->high_seq)) {
 		switch (icsk->icsk_ca_state) {
@@ -5994,11 +5994,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 			tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
 		else
 			tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
-		skb_rbtree_walk_from(data) {
-			if (__tcp_retransmit_skb(sk, data, 1))
-				break;
-		}
-		tcp_rearm_rto(sk);
+		skb_rbtree_walk_from(data)
+			 tcp_mark_skb_lost(sk, data);
+		tcp_xmit_retransmit_queue(sk);
 		NET_INC_STATS(sock_net(sk),
 				LINUX_MIB_TCPFASTOPENACTIVEFAIL);
 		return true;

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2775,13 +2775,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 * a packet is still in a qdisc or driver queue.
 * In this case, there is very little point doing a retransmit !
 */
-static bool skb_still_in_host_queue(const struct sock *sk,
+static bool skb_still_in_host_queue(struct sock *sk,
 				    const struct sk_buff *skb)
 {
 	if (unlikely(skb_fclone_busy(sk, skb))) {
-		NET_INC_STATS(sock_net(sk),
-			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
-		return true;
+		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+		smp_mb__after_atomic();
+		if (skb_fclone_busy(sk, skb)) {
+			NET_INC_STATS(sock_net(sk),
+				      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+			return true;
+		}
 	}
 	return false;
 }
@@ -3147,14 +3151,6 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;

-	/* Do not sent more than we queued. 1/4 is reserved for possible
-	 * copying overhead: fragmentation, tunneling, mangling etc.
-	 */
-	if (refcount_read(&sk->sk_wmem_alloc) >
-	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
-		  sk->sk_sndbuf))
-		return -EAGAIN;
-
 	if (skb_still_in_host_queue(sk, skb))
 		return -EBUSY;