Commit 740b0f18 authored by Eric Dumazet, committed by David S. Miller

tcp: switch rtt estimations to usec resolution

Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also require this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility:
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.

In this example ss command displays a srtt of 32 usecs (10Gbit link)

lpk51:~# ./ss -i dst lpk52
Netid  State      Recv-Q Send-Q   Local Address:Port       Peer
Address:Port
tcp    ESTAB      0      1         10.246.11.51:42959
10.246.11.52:64614
         cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

Updated iproute2 ip command displays :

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51

Old binary displays :

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng
Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 363ec392
...@@ -201,10 +201,10 @@ struct tcp_sock { ...@@ -201,10 +201,10 @@ struct tcp_sock {
u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */
/* RTT measurement */ /* RTT measurement */
u32 srtt; /* smoothed round trip time << 3 */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */
u32 mdev; /* medium deviation */ u32 mdev_us; /* medium deviation */
u32 mdev_max; /* maximal mdev for the last rtt period */ u32 mdev_max_us; /* maximal mdev for the last rtt period */
u32 rttvar; /* smoothed mdev_max */ u32 rttvar_us; /* smoothed mdev_max */
u32 rtt_seq; /* sequence number to update rttvar */ u32 rtt_seq; /* sequence number to update rttvar */
u32 packets_out; /* Packets which are "in flight" */ u32 packets_out; /* Packets which are "in flight" */
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <linux/crypto.h> #include <linux/crypto.h>
#include <linux/cryptohash.h> #include <linux/cryptohash.h>
#include <linux/kref.h> #include <linux/kref.h>
#include <linux/ktime.h>
#include <net/inet_connection_sock.h> #include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h> #include <net/inet_timewait_sock.h>
...@@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, ...@@ -478,7 +479,6 @@ int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt); struct ip_options *opt);
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
#include <linux/ktime.h>
/* Syncookies use a monotonic timer which increments every 64 seconds. /* Syncookies use a monotonic timer which increments every 64 seconds.
* This counter is used both as a hash input and partially encoded into * This counter is used both as a hash input and partially encoded into
...@@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk) ...@@ -619,7 +619,7 @@ static inline void tcp_bound_rto(const struct sock *sk)
static inline u32 __tcp_set_rto(const struct tcp_sock *tp) static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{ {
return (tp->srtt >> 3) + tp->rttvar; return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
} }
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
...@@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk) ...@@ -656,6 +656,11 @@ static inline u32 tcp_rto_min(struct sock *sk)
return rto_min; return rto_min;
} }
static inline u32 tcp_rto_min_us(struct sock *sk)
{
return jiffies_to_usecs(tcp_rto_min(sk));
}
/* Compute the actual receive window we are currently advertising. /* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data * Rcv_nxt can be after the window if our peer push more data
* than the offered window. * than the offered window.
...@@ -778,7 +783,6 @@ enum tcp_ca_event { ...@@ -778,7 +783,6 @@ enum tcp_ca_event {
#define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
#define TCP_CONG_NON_RESTRICTED 0x1 #define TCP_CONG_NON_RESTRICTED 0x1
#define TCP_CONG_RTT_STAMP 0x2
struct tcp_congestion_ops { struct tcp_congestion_ops {
struct list_head list; struct list_head list;
......
...@@ -11,12 +11,15 @@ ...@@ -11,12 +11,15 @@
#define TCP_METRICS_GENL_VERSION 0x1 #define TCP_METRICS_GENL_VERSION 0x1
enum tcp_metric_index { enum tcp_metric_index {
TCP_METRIC_RTT, TCP_METRIC_RTT, /* in ms units */
TCP_METRIC_RTTVAR, TCP_METRIC_RTTVAR, /* in ms units */
TCP_METRIC_SSTHRESH, TCP_METRIC_SSTHRESH,
TCP_METRIC_CWND, TCP_METRIC_CWND,
TCP_METRIC_REORDERING, TCP_METRIC_REORDERING,
TCP_METRIC_RTT_US, /* in usec units */
TCP_METRIC_RTTVAR_US, /* in usec units */
/* Always last. */ /* Always last. */
__TCP_METRIC_MAX, __TCP_METRIC_MAX,
}; };
......
...@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk) ...@@ -387,7 +387,7 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsq_node);
icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto = TCP_TIMEOUT_INIT;
tp->mdev = TCP_TIMEOUT_INIT; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
/* So many TCP implementations out there (incorrectly) count the /* So many TCP implementations out there (incorrectly) count the
* initial SYN frame in their delayed-ACK and congestion control * initial SYN frame in their delayed-ACK and congestion control
...@@ -2339,7 +2339,7 @@ int tcp_disconnect(struct sock *sk, int flags) ...@@ -2339,7 +2339,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0; sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE); sock_reset_flag(sk, SOCK_DONE);
tp->srtt = 0; tp->srtt_us = 0;
if ((tp->write_seq += tp->max_window + 2) == 0) if ((tp->write_seq += tp->max_window + 2) == 0)
tp->write_seq = 1; tp->write_seq = 1;
icsk->icsk_backoff = 0; icsk->icsk_backoff = 0;
...@@ -2783,8 +2783,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info) ...@@ -2783,8 +2783,8 @@ void tcp_get_info(const struct sock *sk, struct tcp_info *info)
info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rtt = tp->srtt_us >> 3;
info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; info->tcpi_rttvar = tp->mdev_us >> 2;
info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_snd_ssthresh = tp->snd_ssthresh;
info->tcpi_snd_cwnd = tp->snd_cwnd; info->tcpi_snd_cwnd = tp->snd_cwnd;
info->tcpi_advmss = tp->advmss; info->tcpi_advmss = tp->advmss;
......
...@@ -476,10 +476,6 @@ static int __init cubictcp_register(void) ...@@ -476,10 +476,6 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */ /* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10); do_div(cube_factor, bic_scale * 10);
/* hystart needs ms clock resolution */
if (hystart && HZ < 1000)
cubictcp.flags |= TCP_CONG_RTT_STAMP;
return tcp_register_congestion_control(&cubictcp); return tcp_register_congestion_control(&cubictcp);
} }
......
...@@ -21,7 +21,7 @@ struct hybla { ...@@ -21,7 +21,7 @@ struct hybla {
u32 rho2; /* Rho * Rho, integer part */ u32 rho2; /* Rho * Rho, integer part */
u32 rho_3ls; /* Rho parameter, <<3 */ u32 rho_3ls; /* Rho parameter, <<3 */
u32 rho2_7ls; /* Rho^2, <<7 */ u32 rho2_7ls; /* Rho^2, <<7 */
u32 minrtt; /* Minimum smoothed round trip time value seen */ u32 minrtt_us; /* Minimum smoothed round trip time value seen */
}; };
/* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */ /* Hybla reference round trip time (default= 1/40 sec = 25 ms), in ms */
...@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk) ...@@ -35,7 +35,9 @@ static inline void hybla_recalc_param (struct sock *sk)
{ {
struct hybla *ca = inet_csk_ca(sk); struct hybla *ca = inet_csk_ca(sk);
ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); ca->rho_3ls = max_t(u32,
tcp_sk(sk)->srtt_us / (rtt0 * USEC_PER_MSEC),
8U);
ca->rho = ca->rho_3ls >> 3; ca->rho = ca->rho_3ls >> 3;
ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1;
ca->rho2 = ca->rho2_7ls >> 7; ca->rho2 = ca->rho2_7ls >> 7;
...@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk) ...@@ -59,7 +61,7 @@ static void hybla_init(struct sock *sk)
hybla_recalc_param(sk); hybla_recalc_param(sk);
/* set minimum rtt as this is the 1st ever seen */ /* set minimum rtt as this is the 1st ever seen */
ca->minrtt = tp->srtt; ca->minrtt_us = tp->srtt_us;
tp->snd_cwnd = ca->rho; tp->snd_cwnd = ca->rho;
} }
...@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked, ...@@ -94,9 +96,9 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked,
int is_slowstart = 0; int is_slowstart = 0;
/* Recalculate rho only if this srtt is the lowest */ /* Recalculate rho only if this srtt is the lowest */
if (tp->srtt < ca->minrtt){ if (tp->srtt_us < ca->minrtt_us) {
hybla_recalc_param(sk); hybla_recalc_param(sk);
ca->minrtt = tp->srtt; ca->minrtt_us = tp->srtt_us;
} }
if (!tcp_is_cwnd_limited(sk, in_flight)) if (!tcp_is_cwnd_limited(sk, in_flight))
......
...@@ -325,7 +325,6 @@ static void tcp_illinois_info(struct sock *sk, u32 ext, ...@@ -325,7 +325,6 @@ static void tcp_illinois_info(struct sock *sk, u32 ext,
} }
static struct tcp_congestion_ops tcp_illinois __read_mostly = { static struct tcp_congestion_ops tcp_illinois __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_illinois_init, .init = tcp_illinois_init,
.ssthresh = tcp_illinois_ssthresh, .ssthresh = tcp_illinois_ssthresh,
.cong_avoid = tcp_illinois_cong_avoid, .cong_avoid = tcp_illinois_cong_avoid,
......
This diff is collapsed.
...@@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) ...@@ -435,7 +435,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
break; break;
icsk->icsk_backoff--; icsk->icsk_backoff--;
inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) : inet_csk(sk)->icsk_rto = (tp->srtt_us ? __tcp_set_rto(tp) :
TCP_TIMEOUT_INIT) << icsk->icsk_backoff; TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
tcp_bound_rto(sk); tcp_bound_rto(sk);
......
...@@ -315,7 +315,6 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) ...@@ -315,7 +315,6 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
} }
static struct tcp_congestion_ops tcp_lp __read_mostly = { static struct tcp_congestion_ops tcp_lp __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_lp_init, .init = tcp_lp_init,
.ssthresh = tcp_reno_ssthresh, .ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_lp_cong_avoid, .cong_avoid = tcp_lp_cong_avoid,
......
...@@ -33,6 +33,11 @@ struct tcp_fastopen_metrics { ...@@ -33,6 +33,11 @@ struct tcp_fastopen_metrics {
struct tcp_fastopen_cookie cookie; struct tcp_fastopen_cookie cookie;
}; };
/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
* Kernel only stores RTT and RTTVAR in usec resolution
*/
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
struct tcp_metrics_block { struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next; struct tcp_metrics_block __rcu *tcpm_next;
struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_saddr;
...@@ -41,7 +46,7 @@ struct tcp_metrics_block { ...@@ -41,7 +46,7 @@ struct tcp_metrics_block {
u32 tcpm_ts; u32 tcpm_ts;
u32 tcpm_ts_stamp; u32 tcpm_ts_stamp;
u32 tcpm_lock; u32 tcpm_lock;
u32 tcpm_vals[TCP_METRIC_MAX + 1]; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
struct tcp_fastopen_metrics tcpm_fastopen; struct tcp_fastopen_metrics tcpm_fastopen;
struct rcu_head rcu_head; struct rcu_head rcu_head;
...@@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm, ...@@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
return tm->tcpm_vals[idx]; return tm->tcpm_vals[idx];
} }
static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
return msecs_to_jiffies(tm->tcpm_vals[idx]);
}
static void tcp_metric_set(struct tcp_metrics_block *tm, static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx, enum tcp_metric_index idx,
u32 val) u32 val)
...@@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, ...@@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
tm->tcpm_vals[idx] = val; tm->tcpm_vals[idx] = val;
} }
static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
{
tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}
static bool addr_same(const struct inetpeer_addr *a, static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b) const struct inetpeer_addr *b)
{ {
...@@ -101,9 +93,11 @@ struct tcpm_hash_bucket { ...@@ -101,9 +93,11 @@ struct tcpm_hash_bucket {
static DEFINE_SPINLOCK(tcp_metrics_lock); static DEFINE_SPINLOCK(tcp_metrics_lock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, static void tcpm_suck_dst(struct tcp_metrics_block *tm,
const struct dst_entry *dst,
bool fastopen_clear) bool fastopen_clear)
{ {
u32 msval;
u32 val; u32 val;
tm->tcpm_stamp = jiffies; tm->tcpm_stamp = jiffies;
...@@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, ...@@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
val |= 1 << TCP_METRIC_REORDERING; val |= 1 << TCP_METRIC_REORDERING;
tm->tcpm_lock = val; tm->tcpm_lock = val;
tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT); msval = dst_metric_raw(dst, RTAX_RTT);
tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR); tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
msval = dst_metric_raw(dst, RTAX_RTTVAR);
tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
...@@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk) ...@@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst); dst_confirm(dst);
rcu_read_lock(); rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt) { if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why? /* This session failed to estimate rtt. Why?
* Probably, no packets returned in time. Reset our * Probably, no packets returned in time. Reset our
* results. * results.
...@@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk) ...@@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
if (!tm) if (!tm)
goto out_unlock; goto out_unlock;
rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
m = rtt - tp->srtt; m = rtt - tp->srtt_us;
/* If newly calculated rtt larger than stored one, store new /* If newly calculated rtt larger than stored one, store new
* one. Otherwise, use EWMA. Remember, rtt overestimation is * one. Otherwise, use EWMA. Remember, rtt overestimation is
...@@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk) ...@@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
*/ */
if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
if (m <= 0) if (m <= 0)
rtt = tp->srtt; rtt = tp->srtt_us;
else else
rtt -= (m >> 3); rtt -= (m >> 3);
tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt); tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
} }
if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
...@@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk) ...@@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
/* Scale deviation to rttvar fixed point */ /* Scale deviation to rttvar fixed point */
m >>= 1; m >>= 1;
if (m < tp->mdev) if (m < tp->mdev_us)
m = tp->mdev; m = tp->mdev_us;
var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR); var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
if (m >= var) if (m >= var)
var = m; var = m;
else else
var -= (var - m) >> 2; var -= (var - m) >> 2;
tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var); tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
} }
if (tcp_in_initial_slowstart(tp)) { if (tcp_in_initial_slowstart(tp)) {
...@@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk) ...@@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
tp->reordering = val; tp->reordering = val;
} }
crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT); crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock(); rcu_read_unlock();
reset: reset:
/* The initial RTT measurement from the SYN/SYN-ACK is not ideal /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
...@@ -551,18 +548,20 @@ void tcp_init_metrics(struct sock *sk) ...@@ -551,18 +548,20 @@ void tcp_init_metrics(struct sock *sk)
* to low value, and then abruptly stops to do it and starts to delay * to low value, and then abruptly stops to do it and starts to delay
* ACKs, wait for troubles. * ACKs, wait for troubles.
*/ */
if (crtt > tp->srtt) { if (crtt > tp->srtt_us) {
/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
crtt >>= 3; crtt /= 8 * USEC_PER_MSEC;
inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
} else if (tp->srtt == 0) { } else if (tp->srtt_us == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from /* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission, * 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs * including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious * from the more aggressive 1sec to avoid more spurious
* retransmission. * retransmission.
*/ */
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK; tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
} }
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
...@@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, ...@@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS); nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
if (!nest) if (!nest)
goto nla_put_failure; goto nla_put_failure;
for (i = 0; i < TCP_METRIC_MAX + 1; i++) { for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
if (!tm->tcpm_vals[i]) u32 val = tm->tcpm_vals[i];
if (!val)
continue; continue;
if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0) if (i == TCP_METRIC_RTT) {
if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
val) < 0)
goto nla_put_failure;
n++;
val = max(val / 1000, 1U);
}
if (i == TCP_METRIC_RTTVAR) {
if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
val) < 0)
goto nla_put_failure;
n++;
val = max(val / 1000, 1U);
}
if (nla_put_u32(msg, i + 1, val) < 0)
goto nla_put_failure; goto nla_put_failure;
n++; n++;
} }
......
...@@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, ...@@ -398,8 +398,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_init_wl(newtp, treq->rcv_isn); tcp_init_wl(newtp, treq->rcv_isn);
newtp->srtt = 0; newtp->srtt_us = 0;
newtp->mdev = TCP_TIMEOUT_INIT; newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
newicsk->icsk_rto = TCP_TIMEOUT_INIT; newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newtp->packets_out = 0; newtp->packets_out = 0;
......
...@@ -866,11 +866,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ...@@ -866,11 +866,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (clone_it) { if (clone_it) {
const struct sk_buff *fclone = skb + 1; const struct sk_buff *fclone = skb + 1;
/* If congestion control is doing timestamping, we must skb_mstamp_get(&skb->skb_mstamp);
* take such a timestamp before we potentially clone/copy.
*/
if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
__net_timestamp(skb);
if (unlikely(skb->fclone == SKB_FCLONE_ORIG && if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
fclone->fclone == SKB_FCLONE_CLONE)) fclone->fclone == SKB_FCLONE_CLONE))
...@@ -1974,7 +1970,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) ...@@ -1974,7 +1970,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, tlp_time_stamp, rto_time_stamp; u32 timeout, tlp_time_stamp, rto_time_stamp;
u32 rtt = tp->srtt >> 3; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
return false; return false;
...@@ -1996,7 +1992,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) ...@@ -1996,7 +1992,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Schedule a loss probe in 2*RTT for SACK capable connections /* Schedule a loss probe in 2*RTT for SACK capable connections
* in Open state, that are either limited by cwnd or application. * in Open state, that are either limited by cwnd or application.
*/ */
if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
return false; return false;
...@@ -3050,8 +3046,9 @@ void tcp_send_delayed_ack(struct sock *sk) ...@@ -3050,8 +3046,9 @@ void tcp_send_delayed_ack(struct sock *sk)
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
* directly. * directly.
*/ */
if (tp->srtt) { if (tp->srtt_us) {
int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
TCP_DELACK_MIN);
if (rtt < max_ato) if (rtt < max_ato)
max_ato = rtt; max_ato = rtt;
......
...@@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, ...@@ -154,7 +154,7 @@ static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
p->snd_wnd = tp->snd_wnd; p->snd_wnd = tp->snd_wnd;
p->rcv_wnd = tp->rcv_wnd; p->rcv_wnd = tp->rcv_wnd;
p->ssthresh = tcp_current_ssthresh(sk); p->ssthresh = tcp_current_ssthresh(sk);
p->srtt = tp->srtt >> 3; p->srtt = tp->srtt_us >> 3;
tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
} }
......
...@@ -306,7 +306,6 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) ...@@ -306,7 +306,6 @@ void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb)
EXPORT_SYMBOL_GPL(tcp_vegas_get_info); EXPORT_SYMBOL_GPL(tcp_vegas_get_info);
static struct tcp_congestion_ops tcp_vegas __read_mostly = { static struct tcp_congestion_ops tcp_vegas __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_vegas_init, .init = tcp_vegas_init,
.ssthresh = tcp_reno_ssthresh, .ssthresh = tcp_reno_ssthresh,
.cong_avoid = tcp_vegas_cong_avoid, .cong_avoid = tcp_vegas_cong_avoid,
......
...@@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk) ...@@ -203,7 +203,6 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
} }
static struct tcp_congestion_ops tcp_veno __read_mostly = { static struct tcp_congestion_ops tcp_veno __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_veno_init, .init = tcp_veno_init,
.ssthresh = tcp_veno_ssthresh, .ssthresh = tcp_veno_ssthresh,
.cong_avoid = tcp_veno_cong_avoid, .cong_avoid = tcp_veno_cong_avoid,
......
...@@ -227,7 +227,6 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) { ...@@ -227,7 +227,6 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) {
} }
static struct tcp_congestion_ops tcp_yeah __read_mostly = { static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.flags = TCP_CONG_RTT_STAMP,
.init = tcp_yeah_init, .init = tcp_yeah_init,
.ssthresh = tcp_yeah_ssthresh, .ssthresh = tcp_yeah_ssthresh,
.cong_avoid = tcp_yeah_cong_avoid, .cong_avoid = tcp_yeah_cong_avoid,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment