Commit c4fefd5a authored by David S. Miller

Merge branch 'tcp-dsack-multi-seg'

Priyaranjan Jha says:

====================
tcp: improve handling of DSACK covering multiple segments

Currently, while processing a DSACK, we assume the DSACK covers only one
segment. This leads to a significant underestimation of the number of
duplicate segments when LRO/GRO is in use. Also, the existing SNMP counters,
TCPDSACKRecv and TCPDSACKOfoRecv, make the same single-segment assumption,
which makes them unusable for estimating spurious retransmit rates.

This patch series fixes the segment accounting for DSACK by estimating the
number of duplicate segments as (DSACKed sequence range) / MSS (see the
sketch below the commit metadata). It also introduces a new SNMP counter,
TCPDSACKRecvSegs, which tracks the estimated number of duplicate segments.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents dcc82bb0 e3a5a1e8
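The estimate described in the cover letter boils down to ceil(DSACKed range / MSS), clamped to at least one segment, as implemented in tcp_dsack_seen() in the tcp_input.c hunk below. The following minimal userspace sketch reproduces that calculation for illustration only: DIV_ROUND_UP() and before() here are small stand-ins for the kernel helpers of the same names, and the MSS and sequence numbers in main() are made-up example values, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's DIV_ROUND_UP(): integer ceiling division. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Stand-in for the kernel's before(): wrap-safe TCP sequence comparison. */
static int before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

/* Estimate how many segments a DSACK block [start_seq, end_seq) covers. */
static uint32_t estimate_dup_segs(uint32_t start_seq, uint32_t end_seq,
				  uint32_t mss)
{
	uint32_t dup_segs = 1;	/* a valid DSACK reports at least one duplicate */

	if (before(start_seq, end_seq)) {
		uint32_t seq_len = end_seq - start_seq;

		if (seq_len > mss)
			dup_segs = DIV_ROUND_UP(seq_len, mss);
	}
	return dup_segs;
}

int main(void)
{
	/* A GRO-coalesced DSACK covering four 1448-byte segments -> prints 4. */
	printf("dup_segs = %u\n",
	       estimate_dup_segs(1000, 1000 + 4 * 1448, 1448));
	return 0;
}

With the pre-series accounting, the same DSACK would have been counted as a single duplicate segment. The new TCPDSACKRecvSegs counter added in the proc.c hunk below is exported, like the other TcpExt MIBs, via /proc/net/netstat.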
include/uapi/linux/snmp.h
@@ -287,6 +287,7 @@ enum
 	LINUX_MIB_TCPFASTOPENPASSIVEALTKEY,	/* TCPFastOpenPassiveAltKey */
 	LINUX_MIB_TCPTIMEOUTREHASH,		/* TCPTimeoutRehash */
 	LINUX_MIB_TCPDUPLICATEDATAREHASH,	/* TCPDuplicateDataRehash */
+	LINUX_MIB_TCPDSACKRECVSEGS,		/* TCPDSACKRecvSegs */
 	__LINUX_MIB_MAX
 };
net/ipv4/proc.c
@@ -292,6 +292,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
 	SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
 	SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
+	SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS),
 	SNMP_MIB_SENTINEL
 };
net/ipv4/tcp_input.c
@@ -871,12 +871,41 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
+struct tcp_sacktag_state {
+	/* Timestamps for earliest and latest never-retransmitted segment
+	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
+	 * but congestion control should still get an accurate delay signal.
+	 */
+	u64	first_sackt;
+	u64	last_sackt;
+	u32	reord;
+	u32	sack_delivered;
+	int	flag;
+	unsigned int	mss_now;
+	struct rate_sample *rate;
+};
+
 /* Take a notice that peer is sending D-SACKs */
-static void tcp_dsack_seen(struct tcp_sock *tp)
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+			  u32 end_seq, struct tcp_sacktag_state *state)
 {
+	u32 seq_len, dup_segs = 1;
+
+	if (before(start_seq, end_seq)) {
+		seq_len = end_seq - start_seq;
+		if (seq_len > tp->mss_cache)
+			dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+	}
+
 	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
 	tp->rack.dsack_seen = 1;
-	tp->dsack_dups++;
+	tp->dsack_dups += dup_segs;
+
+	state->flag |= FLAG_DSACKING_ACK;
+	/* A spurious retransmission is delivered */
+	state->sack_delivered += dup_segs;
+
+	return dup_segs;
 }
 
 /* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -1103,53 +1132,38 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
 
 static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
 			    struct tcp_sack_block_wire *sp, int num_sacks,
-			    u32 prior_snd_una)
+			    u32 prior_snd_una, struct tcp_sacktag_state *state)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
 	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
-	bool dup_sack = false;
+	u32 dup_segs;
 
 	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
-		dup_sack = true;
-		tcp_dsack_seen(tp);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
 	} else if (num_sacks > 1) {
 		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
 		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
 
-		if (!after(end_seq_0, end_seq_1) &&
-		    !before(start_seq_0, start_seq_1)) {
-			dup_sack = true;
-			tcp_dsack_seen(tp);
-			NET_INC_STATS(sock_net(sk),
-					LINUX_MIB_TCPDSACKOFORECV);
-		}
+		if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+			return false;
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+	} else {
+		return false;
 	}
 
+	dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
 	/* D-SACK for already forgotten data... Do dumb counting. */
-	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+	if (tp->undo_marker && tp->undo_retrans > 0 &&
 	    !after(end_seq_0, prior_snd_una) &&
 	    after(end_seq_0, tp->undo_marker))
-		tp->undo_retrans--;
+		tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
 
-	return dup_sack;
+	return true;
 }
 
-struct tcp_sacktag_state {
-	u32	reord;
-	/* Timestamps for earliest and latest never-retransmitted segment
-	 * that was SACKed. RTO needs the earliest RTT to stay conservative,
-	 * but congestion control should still get an accurate delay signal.
-	 */
-	u64	first_sackt;
-	u64	last_sackt;
-	struct rate_sample *rate;
-	int	flag;
-	unsigned int	mss_now;
-	u32	sack_delivered;
-};
-
 /* Check if skb is fully within the SACK block. In presence of GSO skbs,
  * the incoming SACK may not exactly match but we can find smaller MSS
  * aligned portion of it that matches. Therefore we might need to fragment
@@ -1692,12 +1706,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 		tcp_highest_sack_reset(sk);
 
 	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
-					 num_sacks, prior_snd_una);
-	if (found_dup_sack) {
-		state->flag |= FLAG_DSACKING_ACK;
-		/* A spurious retransmission is delivered */
-		state->sack_delivered++;
-	}
+					 num_sacks, prior_snd_una, state);
 
 	/* Eliminate too old ACKs, but take into
 	 * account more or less fresh ones, they can