Commit 14a1f445 authored by David S. Miller's avatar David S. Miller Committed by Patrick McHardy

[TCP]: Make TSO play nice with congestion window.

Previously TSO would not abide by the congestion
window properly.  Essentially, each TSO packet would
be treated just like 1 normal packet, even though a TSO
packet generates more than 1 normal packet.  This
violates congestion window rules entirely.

So now we record the TSO factor, a count of how many
real packets a TSO packet will generate, and include
this in all the packet counting routines.

This initial version has a bug in that skb_entail() is
not the correct time to figure out the TSO factor for
the SKB, and tp->mss_tso_factor is not necessarily the
right value for a given SKB.  Will fix this up next.
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 10bc9563
......@@ -201,6 +201,10 @@ struct tcp_sack_block {
__u32 end_seq;
};
/* Packet count wrapped in a struct: with TSO a single SKB can stand
 * for several real packets, so all counting is funneled through the
 * tcp_*_pcount() accessors instead of raw integer arithmetic.
 */
typedef struct tcp_pcount {
__u32 val;	/* number of real (wire-level) packets */
} tcp_pcount_t;
struct tcp_opt {
int tcp_header_len; /* Bytes of tcp header to send */
......@@ -250,6 +254,7 @@ struct tcp_opt {
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u32 mss_tso_factor; /* Real packets per TSO packet */
__u16 mss_cache_std; /* Like mss_cache, but without TSO */
__u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
......@@ -274,9 +279,9 @@ struct tcp_opt {
__u32 rtt_seq; /* sequence number to update rttvar */
__u32 rto; /* retransmit timeout */
__u32 packets_out; /* Packets which are "in flight" */
__u32 left_out; /* Packets which leaved network */
__u32 retrans_out; /* Retransmitted packets out */
tcp_pcount_t packets_out; /* Packets which are "in flight" */
tcp_pcount_t left_out; /* Packets which leaved network */
tcp_pcount_t retrans_out; /* Retransmitted packets out */
/*
......@@ -337,9 +342,9 @@ struct tcp_opt {
__u8 syn_retries; /* num of allowed syn retries */
__u8 ecn_flags; /* ECN status bits. */
__u16 prior_ssthresh; /* ssthresh saved at recovery start */
__u32 lost_out; /* Lost packets */
__u32 sacked_out; /* SACK'd packets */
__u32 fackets_out; /* FACK'd packets */
tcp_pcount_t lost_out; /* Lost packets */
tcp_pcount_t sacked_out;/* SACK'd packets */
tcp_pcount_t fackets_out;/* FACK'd packets */
__u32 high_seq; /* snd_nxt at onset of congestion */
__u32 retrans_stamp; /* Timestamp of the last retransmit,
......
......@@ -1047,13 +1047,18 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
* is not a big flaw.
*/
static __inline__ unsigned int tcp_current_mss(struct sock *sk, int large)
static inline unsigned int tcp_current_mss(struct sock *sk, int large, int *factor)
{
struct tcp_opt *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
int mss_now = large && (sk->sk_route_caps & NETIF_F_TSO) &&
!tp->urg_mode ?
tp->mss_cache : tp->mss_cache_std;
int do_large, mss_now;
do_large = (large &&
(sk->sk_route_caps & NETIF_F_TSO) &&
!tp->urg_mode);
mss_now = do_large ? tp->mss_cache : tp->mss_cache_std;
if (factor)
*factor = do_large ? tp->mss_tso_factor : 1;
if (dst) {
u32 mtu = dst_pmtu(dst);
......@@ -1181,12 +1186,76 @@ struct tcp_skb_cb {
__u16 urg_ptr; /* Valid w/URG flags is set. */
__u32 ack_seq; /* Sequence number ACK'd */
__u32 tso_factor;
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
#include <net/tcp_ecn.h>
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
static inline int tcp_skb_pcount(struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->tso_factor;
}
/* Credit the SKB's worth of real packets to @count. */
static inline void tcp_inc_pcount(tcp_pcount_t *count, struct sk_buff *skb)
{
	int pkts = tcp_skb_pcount(skb);

	count->val += pkts;
}
/* Add an already-computed packet count to @count. */
static inline void tcp_inc_pcount_explicit(tcp_pcount_t *count, int amt)
{
	count->val = count->val + amt;
}
/* Subtract an already-computed packet count from @count. */
static inline void tcp_dec_pcount_explicit(tcp_pcount_t *count, int amt)
{
	count->val = count->val - amt;
}
/* Remove the SKB's worth of real packets from @count. */
static inline void tcp_dec_pcount(tcp_pcount_t *count, struct sk_buff *skb)
{
	int pkts = tcp_skb_pcount(skb);

	count->val -= pkts;
}
/* Like tcp_dec_pcount(), but tolerant of an inexact counter: an
 * already-zero counter is left alone, and a subtraction that would
 * go negative is clamped back to zero.
 */
static inline void tcp_dec_pcount_approx(tcp_pcount_t *count,
					 struct sk_buff *skb)
{
	if (!count->val)
		return;

	count->val -= tcp_skb_pcount(skb);
	if ((int)count->val < 0)
		count->val = 0;
}
/* Read the current value of a packet counter. */
static inline __u32 tcp_get_pcount(tcp_pcount_t *count)
{
	__u32 val = count->val;

	return val;
}
/* Overwrite a packet counter with an explicit value (e.g. reset to 0). */
static inline void tcp_set_pcount(tcp_pcount_t *count, __u32 val)
{
count->val = val;
}
/* Account a newly sent SKB as in flight.  When the pipe transitions
 * from empty to non-empty, arm the retransmit timer for it.
 */
static inline void tcp_packets_out_inc(struct sock *sk, struct tcp_opt *tp,
				       struct sk_buff *skb)
{
	int was_empty = (tcp_get_pcount(&tp->packets_out) == 0);

	tcp_inc_pcount(&tp->packets_out, skb);
	if (was_empty)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}
/* Remove an SKB's real-packet count from packets_out. */
static inline void tcp_packets_out_dec(struct tcp_opt *tp, struct sk_buff *skb)
{
	tcp_dec_pcount_explicit(&tp->packets_out, tcp_skb_pcount(skb));
}
/* This determines how many packets are "in the network" to the best
* of our knowledge. In many cases it is conservative, but where
* detailed information is available from the receiver (via SACK
......@@ -1203,7 +1272,9 @@ struct tcp_skb_cb {
*/
/* Estimate of packets currently in the network: everything sent,
 * minus those known to have left it (left_out), plus outstanding
 * retransmissions.
 *
 * NOTE: the scraped diff fused the pre-patch raw-field return with
 * the post-patch accessor-based one, leaving two return statements;
 * only the post-patch form is kept here.
 */
static __inline__ unsigned int tcp_packets_in_flight(struct tcp_opt *tp)
{
	return (tcp_get_pcount(&tp->packets_out) -
		tcp_get_pcount(&tp->left_out) +
		tcp_get_pcount(&tp->retrans_out));
}
/* Recalculate snd_ssthresh, we want to set it to:
......@@ -1304,9 +1375,15 @@ static inline __u32 tcp_current_ssthresh(struct tcp_opt *tp)
static inline void tcp_sync_left_out(struct tcp_opt *tp)
{
if (tp->sack_ok && tp->sacked_out >= tp->packets_out - tp->lost_out)
tp->sacked_out = tp->packets_out - tp->lost_out;
tp->left_out = tp->sacked_out + tp->lost_out;
if (tp->sack_ok &&
(tcp_get_pcount(&tp->sacked_out) >=
tcp_get_pcount(&tp->packets_out) - tcp_get_pcount(&tp->lost_out)))
tcp_set_pcount(&tp->sacked_out,
(tcp_get_pcount(&tp->packets_out) -
tcp_get_pcount(&tp->lost_out)));
tcp_set_pcount(&tp->left_out,
(tcp_get_pcount(&tp->sacked_out) +
tcp_get_pcount(&tp->lost_out)));
}
extern void tcp_cwnd_application_limited(struct sock *sk);
......@@ -1315,14 +1392,16 @@ extern void tcp_cwnd_application_limited(struct sock *sk);
static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_opt *tp)
{
if (tp->packets_out >= tp->snd_cwnd) {
__u32 packets_out = tcp_get_pcount(&tp->packets_out);
if (packets_out >= tp->snd_cwnd) {
/* Network is feed fully. */
tp->snd_cwnd_used = 0;
tp->snd_cwnd_stamp = tcp_time_stamp;
} else {
/* Network starves. */
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
if (tcp_get_pcount(&tp->packets_out) > tp->snd_cwnd_used)
tp->snd_cwnd_used = tcp_get_pcount(&tp->packets_out);
if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
tcp_cwnd_application_limited(sk);
......@@ -1388,7 +1467,7 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n
!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
((nonagle&TCP_NAGLE_CORK) ||
(!nonagle &&
tp->packets_out &&
tcp_get_pcount(&tp->packets_out) &&
tcp_minshall_check(tp))));
}
......@@ -1398,6 +1477,8 @@ tcp_nagle_check(struct tcp_opt *tp, struct sk_buff *skb, unsigned mss_now, int n
static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
unsigned cur_mss, int nonagle)
{
int pkts = TCP_SKB_CB(skb)->tso_factor;
/* RFC 1122 - section 4.2.3.4
*
* We must queue if
......@@ -1424,14 +1505,14 @@ static __inline__ int tcp_snd_test(struct tcp_opt *tp, struct sk_buff *skb,
*/
return (((nonagle&TCP_NAGLE_PUSH) || tp->urg_mode
|| !tcp_nagle_check(tp, skb, cur_mss, nonagle)) &&
((tcp_packets_in_flight(tp) < tp->snd_cwnd) ||
(((tcp_packets_in_flight(tp) + (pkts-1)) < tp->snd_cwnd) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) &&
!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd));
}
/* Nothing in flight and no timer pending: arm the zero-window probe
 * timer so a closed peer window cannot stall the connection forever.
 *
 * The scrape kept both the old raw-field test and the new accessor
 * test; only the post-patch condition is retained.
 */
static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_opt *tp)
{
	if (!tcp_get_pcount(&tp->packets_out) && !tp->pending)
		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, tp->rto);
}
......@@ -1464,7 +1545,7 @@ static __inline__ void __tcp_push_pending_frames(struct sock *sk,
/* Push any queued data at the current (possibly TSO-sized) mss.
 * The TSO factor out-parameter of tcp_current_mss() is not needed
 * here, hence NULL.
 *
 * The scrape kept both the 2-arg and 3-arg tcp_current_mss() calls;
 * only the post-patch 3-arg form is retained.
 */
static __inline__ void tcp_push_pending_frames(struct sock *sk,
					       struct tcp_opt *tp)
{
	__tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1, NULL), tp->nonagle);
}
static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
......@@ -1472,7 +1553,7 @@ static __inline__ int tcp_may_send_now(struct sock *sk, struct tcp_opt *tp)
struct sk_buff *skb = sk->sk_send_head;
return (skb &&
tcp_snd_test(tp, skb, tcp_current_mss(sk, 1),
tcp_snd_test(tp, skb, tcp_current_mss(sk, 1, NULL),
tcp_skb_is_last(sk, skb) ? TCP_NAGLE_PUSH : tp->nonagle));
}
......@@ -1964,7 +2045,7 @@ static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
/* Westwood: bandwidth-delay product (bw_est * RTTmin) expressed in
 * segments, floored at 2.  Uses the non-TSO mss_cache_std so a large
 * TSO mss_cache does not deflate the segment count.
 *
 * The scrape kept both the old mss_cache divisor and the new
 * mss_cache_std one; only the post-patch divisor is retained.
 */
static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_opt *tp)
{
	return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
		   (__u32) (tp->mss_cache_std),
		   2U);
}
......
......@@ -590,13 +590,14 @@ static inline int forced_push(struct tcp_opt *tp)
}
static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
struct sk_buff *skb)
struct sk_buff *skb, int tso_factor)
{
skb->csum = 0;
TCP_SKB_CB(skb)->seq = tp->write_seq;
TCP_SKB_CB(skb)->end_seq = tp->write_seq;
TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->tso_factor = tso_factor;
__skb_queue_tail(&sk->sk_write_queue, skb);
sk_charge_skb(sk, skb);
if (!sk->sk_send_head)
......@@ -632,7 +633,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
size_t psize, int flags)
{
struct tcp_opt *tp = tcp_sk(sk);
int mss_now;
int mss_now, mss_factor_now;
int err;
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
......@@ -644,7 +645,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now);
copied = 0;
err = -EPIPE;
......@@ -668,7 +669,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
if (!skb)
goto wait_for_memory;
skb_entail(sk, tp, skb);
skb_entail(sk, tp, skb, mss_factor_now);
copy = mss_now;
}
......@@ -719,7 +720,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB),
&mss_factor_now);
}
out:
......@@ -780,7 +782,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_opt *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now;
int mss_now, mss_factor_now;
int err, copied;
long timeo;
......@@ -798,7 +800,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* This should be in poll */
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB), &mss_factor_now);
/* Ok commence sending. */
iovlen = msg->msg_iovlen;
......@@ -843,7 +845,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
NETIF_F_HW_CSUM))
skb->ip_summed = CHECKSUM_HW;
skb_entail(sk, tp, skb);
skb_entail(sk, tp, skb, mss_factor_now);
copy = mss_now;
}
......@@ -962,7 +964,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
goto do_error;
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
mss_now = tcp_current_mss(sk, !(flags&MSG_OOB),
&mss_factor_now);
}
}
......@@ -1818,7 +1821,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->backoff = 0;
tp->snd_cwnd = 2;
tp->probes_out = 0;
tp->packets_out = 0;
tcp_set_pcount(&tp->packets_out, 0);
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_cnt = 0;
tcp_set_ca_state(tp, TCP_CA_Open);
......
......@@ -70,14 +70,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rto = (1000000*tp->rto)/HZ;
info->tcpi_ato = (1000000*tp->ack.ato)/HZ;
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_snd_mss = tp->mss_cache_std;
info->tcpi_rcv_mss = tp->ack.rcv_mss;
info->tcpi_unacked = tp->packets_out;
info->tcpi_sacked = tp->sacked_out;
info->tcpi_lost = tp->lost_out;
info->tcpi_retrans = tp->retrans_out;
info->tcpi_fackets = tp->fackets_out;
info->tcpi_unacked = tcp_get_pcount(&tp->packets_out);
info->tcpi_sacked = tcp_get_pcount(&tp->sacked_out);
info->tcpi_lost = tcp_get_pcount(&tp->lost_out);
info->tcpi_retrans = tcp_get_pcount(&tp->retrans_out);
info->tcpi_fackets = tcp_get_pcount(&tp->fackets_out);
info->tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
info->tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
......
This diff is collapsed.
......@@ -2075,7 +2075,8 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_tso_factor = 1;
tp->reordering = sysctl_tcp_reordering;
......
......@@ -752,11 +752,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req,
newtp->mdev = TCP_TIMEOUT_INIT;
newtp->rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0;
newtp->left_out = 0;
newtp->retrans_out = 0;
newtp->sacked_out = 0;
newtp->fackets_out = 0;
tcp_set_pcount(&newtp->packets_out, 0);
tcp_set_pcount(&newtp->left_out, 0);
tcp_set_pcount(&newtp->retrans_out, 0);
tcp_set_pcount(&newtp->sacked_out, 0);
tcp_set_pcount(&newtp->fackets_out, 0);
newtp->snd_ssthresh = 0x7fffffff;
/* So many TCP implementations out there (incorrectly) count the
......
This diff is collapsed.
......@@ -121,7 +121,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
* 1. Last segment was sent recently. */
if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
/* 2. Window is closed. */
(!tp->snd_wnd && !tp->packets_out))
(!tp->snd_wnd && !tcp_get_pcount(&tp->packets_out)))
do_reset = 1;
if (do_reset)
tcp_send_active_reset(sk, GFP_ATOMIC);
......@@ -269,7 +269,7 @@ static void tcp_probe_timer(struct sock *sk)
struct tcp_opt *tp = tcp_sk(sk);
int max_probes;
if (tp->packets_out || !sk->sk_send_head) {
if (tcp_get_pcount(&tp->packets_out) || !sk->sk_send_head) {
tp->probes_out = 0;
return;
}
......@@ -316,7 +316,7 @@ static void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_opt *tp = tcp_sk(sk);
if (tp->packets_out == 0)
if (!tcp_get_pcount(&tp->packets_out))
goto out;
BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
......@@ -606,7 +606,7 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
if (tp->packets_out || sk->sk_send_head)
if (tcp_get_pcount(&tp->packets_out) || sk->sk_send_head)
goto resched;
elapsed = tcp_time_stamp - tp->rcv_tstamp;
......
......@@ -1929,7 +1929,8 @@ static int tcp_v6_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff;
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = 536;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_tso_factor = 1;
tp->reordering = sysctl_tcp_reordering;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment