Commit cd1263b6 authored by Jakub Kicinski

Merge branch 'tcp-take-care-of-empty-skbs-in-write-queue'

Eric Dumazet says:
====================
tcp: take care of empty skbs in write queue

We understood recently that TCP sockets could have an empty
skb at the tail of the write queue, leading to various problems.

This patch series:

1) Make sure we do not send an empty packet, since this
   was unintended and caused crashes in old kernels.

2) Change tcp_write_queue_empty() to not be fooled by
   the presence of an empty skb.

3) Fix a bug that could trigger suboptimal epoll()
   application behavior under memory pressure.
====================
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
parents 5c9934b6 216808c6
@@ -1766,9 +1766,18 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
 	return skb_queue_is_last(&sk->sk_write_queue, skb);
 }
 
+/**
+ * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue
+ * @sk: socket
+ *
+ * Since the write queue can have a temporary empty skb in it,
+ * we must not use "return skb_queue_empty(&sk->sk_write_queue)"
+ */
 static inline bool tcp_write_queue_empty(const struct sock *sk)
 {
-	return skb_queue_empty(&sk->sk_write_queue);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->write_seq == tp->snd_nxt;
 }
 
 static inline bool tcp_rtx_queue_empty(const struct sock *sk)
...
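The rationale for the new test, sketched as a standalone userspace model (the struct and function names below are hypothetical stand-ins, not kernel APIs): an skb that a blocked sender left at the tail of the queue occupies a queue slot but consumes no sequence space, so counting skbs overstates pending payload, while comparing write_seq against snd_nxt does not.

/* Minimal userspace model of the emptiness test (not kernel code).
 * "Empty skb" = an allocated buffer that carries no payload yet.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tcp_queue_model {            /* hypothetical stand-in for tcp_sock */
        unsigned int skbs_in_queue; /* what skb_queue_empty() effectively counts */
        uint32_t write_seq;         /* one past the last byte the app queued */
        uint32_t snd_nxt;           /* next byte the stack will transmit */
};

/* Old-style test: fooled by an empty skb sitting in the queue. */
static bool write_queue_empty_old(const struct tcp_queue_model *q)
{
        return q->skbs_in_queue == 0;
}

/* New-style test: "empty" means nothing left to send, whatever the skb count. */
static bool write_queue_empty_new(const struct tcp_queue_model *q)
{
        return q->write_seq == q->snd_nxt;
}

int main(void)
{
        /* A sender allocated an skb, then blocked before copying any data:
         * one skb is queued, but no new sequence space was consumed.
         */
        struct tcp_queue_model q = { .skbs_in_queue = 1,
                                     .write_seq = 1000, .snd_nxt = 1000 };

        assert(!write_queue_empty_old(&q)); /* old test: looks non-empty */
        assert(write_queue_empty_new(&q));  /* new test: correctly empty */
        printf("empty-skb case: old=%d new=%d\n",
               write_queue_empty_old(&q), write_queue_empty_new(&q));
        return 0;
}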
@@ -1087,8 +1087,7 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 	goto out;
 out_err:
 	/* make sure we wake any epoll edge trigger waiter */
-	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
-		     err == -EAGAIN)) {
+	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
@@ -1419,8 +1418,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	sock_zerocopy_put_abort(uarg, true);
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
-	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
-		     err == -EAGAIN)) {
+	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
...
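Both do_tcp_sendpages() and tcp_sendmsg_locked() now key this edge-trigger wakeup on tcp_rtx_and_write_queues_empty(). The wakeup matters because an edge-triggered writer that got EAGAIN only makes progress once the kernel raises EPOLLOUT again. The self-contained demo below illustrates that contract with generic EPOLLET semantics; a pipe stands in for a TCP socket, so it is a sketch of the userspace side of the contract, not of this code path.

/* Edge-triggered writers only wake on a writability *edge*: after EAGAIN,
 * no new EPOLLOUT arrives until the kernel frees space and signals it.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        if (pipe2(pfd, O_NONBLOCK))
                return 1;

        int ep = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLOUT | EPOLLET,
                                  .data.fd = pfd[1] };
        epoll_ctl(ep, EPOLL_CTL_ADD, pfd[1], &ev);

        /* Fill the pipe until the kernel says EAGAIN. */
        char buf[4096];
        memset(buf, 'x', sizeof(buf));
        while (write(pfd[1], buf, sizeof(buf)) > 0)
                ;
        if (errno != EAGAIN)
                return 1;

        /* Edge-triggered: nothing to report until space is freed again. */
        int n = epoll_wait(ep, &ev, 1, 100);
        printf("before draining: %d event(s)\n", n);    /* expect 0 */

        if (read(pfd[0], buf, sizeof(buf)) < 0)          /* free some space */
                return 1;
        n = epoll_wait(ep, &ev, 1, 100);
        printf("after draining:  %d event(s)\n", n);     /* expect 1 */

        close(pfd[0]);
        close(pfd[1]);
        close(ep);
        return 0;
}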
@@ -2438,6 +2438,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		if (tcp_small_queue_check(sk, skb, 0))
 			break;
 
+		/* Argh, we hit an empty skb(), presumably a thread
+		 * is sleeping in sendmsg()/sk_stream_wait_memory().
+		 * We do not want to send a pure-ack packet and have
+		 * a strange looking rtx queue with empty packet(s).
+		 */
+		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
+			break;
+
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
 			break;
 
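A toy model of the guard above (hypothetical struct and function names, not kernel code): the transmit loop stops at an skb whose end_seq equals seq, because such an skb carries no payload and no sequence-consuming flag, so sending it would only put a pure-ACK, zero-length packet on the rtx queue.

#include <stdint.h>
#include <stdio.h>

struct skb_model {                  /* hypothetical stand-in for an skb */
        uint32_t seq, end_seq;      /* sequence space the skb consumes  */
};

/* Transmit queued segments in order, stopping at the first empty one. */
static unsigned int transmit_all(const struct skb_model *q, unsigned int n)
{
        unsigned int sent = 0;

        for (unsigned int i = 0; i < n; i++) {
                /* Empty skb: a sender is still filling it; skip it for now. */
                if (q[i].end_seq == q[i].seq)
                        break;
                sent++;
        }
        return sent;
}

int main(void)
{
        const struct skb_model q[] = {
                { 1000, 2448 },     /* full-sized segment             */
                { 2448, 2448 },     /* empty skb left by a sendmsg()  */
        };

        printf("segments sent: %u of 2\n", transmit_all(q, 2));  /* 1 */
        return 0;
}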
@@ -3121,7 +3129,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
  */
 void tcp_send_fin(struct sock *sk)
 {
-	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+	struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Optimization, tack on the FIN if we have one skb in write queue and
@@ -3129,6 +3137,7 @@ void tcp_send_fin(struct sock *sk)
 	 * Note: in the latter case, FIN packet will be sent after a timeout,
 	 * as TCP stack thinks it has already been transmitted.
 	 */
+	tskb = tail;
 	if (!tskb && tcp_under_memory_pressure(sk))
 		tskb = skb_rb_last(&sk->tcp_rtx_queue);
 
@@ -3136,7 +3145,7 @@ void tcp_send_fin(struct sock *sk)
 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
 		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
-		if (tcp_write_queue_empty(sk)) {
+		if (!tail) {
 			/* This means tskb was already sent.
 			 * Pretend we included the FIN on previous transmit.
 			 * We need to set tp->snd_nxt to the value it would have
...
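Why the test changed from tcp_write_queue_empty(sk) to !tail: tp->write_seq was incremented just above to account for the FIN, so with the new sequence-based helper the queue would never look empty at this point; caching the write-queue tail before the increment preserves the original meaning ("tskb came from the rtx queue, so the FIN must be treated as already sent"). A toy model of that ordering issue, with hypothetical names:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tp_model {                        /* hypothetical stand-in for tcp_sock */
        uint32_t write_seq, snd_nxt;
};

static bool write_queue_empty(const struct tp_model *tp)
{
        return tp->write_seq == tp->snd_nxt;    /* the new-style test */
}

int main(void)
{
        struct tp_model tp = { .write_seq = 5000, .snd_nxt = 5000 };
        bool tail_was_null = true;       /* write queue had no skb; the FIN
                                          * rides on the last rtx-queue skb */

        assert(write_queue_empty(&tp));  /* true before accounting for FIN  */
        tp.write_seq++;                  /* FIN consumes one sequence number */
        assert(!write_queue_empty(&tp)); /* helper now says "not empty"      */

        /* Only the cached answer can still tell us the FIN piggybacked on an
         * already-sent skb, so snd_nxt must advance as if it had been sent.
         */
        if (tail_was_null)
                tp.snd_nxt = tp.write_seq;
        printf("write_seq=%u snd_nxt=%u\n", tp.write_seq, tp.snd_nxt);
        return 0;
}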