Commit bdf24b4b authored by David S. Miller

Merge branch 'tcp-ts-usec-resolution'

Eric Dumazet says:

====================
tcp: add optional usec resolution to TCP TS

As discussed in various public places in 2016, Google adopted
usec resolution in RFC 7323 TS values, at Van Jacobson's suggestion.

Goals were:

1) better observability of delays in networking stacks/fabrics.

2) better disambiguation of events based on TSval/ecr values.

3) building block for congestion control modules needing usec resolution.

Back then we implemented a scheme based on private SYN options
to safely negotiate the feature.

For upstream submission, we chose to use a much simpler route
attribute because this feature is probably going to be used
in private networks.

ip route add 10/8 ... features tcp_usec_ts
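
For illustration only (not part of this series): once the route feature is
set, a peer can verify per-connection resolution from user space via the
TCPI_OPT_USEC_TS bit this series adds to tcp_info. A hedged sketch, assuming
the new uapi bit:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tcp.h>

	#ifndef TCPI_OPT_USEC_TS
	#define TCPI_OPT_USEC_TS 64	/* usec timestamps, from this series */
	#endif

	/* Returns 1 if usec TS is in use on this connection, 0 if ms, -1 on error. */
	static int tcp_has_usec_ts(int fd)
	{
		struct tcp_info info;
		socklen_t len = sizeof(info);

		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
			return -1;
		return !!(info.tcpi_options & TCPI_OPT_USEC_TS);
	}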

References:

https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-tcp-options-for-low-latency-00.pdf
https://datatracker.ietf.org/doc/draft-wang-tcpm-low-latency-opt/

The first two patches fix old minor bugs and might be taken
by stable teams (thanks to the appropriate Fixes: tags).
====================
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 35c1b273 a77a0f5c
@@ -2259,7 +2259,7 @@ static void chtls_rx_ack(struct sock *sk, struct sk_buff *skb)
 
 	if (tp->snd_una != snd_una) {
 		tp->snd_una = snd_una;
-		tp->rcv_tstamp = tcp_time_stamp(tp);
+		tp->rcv_tstamp = tcp_jiffies32;
 		if (tp->snd_una == tp->snd_nxt &&
 		    !csk_flag_nochk(csk, CSK_TX_FAILOVER))
 			csk_reset_flag(csk, CSK_TX_WAIT_IDLE);

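Note on the hunk above: tp->rcv_tstamp is read in jiffies units elsewhere in
the stack (the keepalive and retransmit timers compare it against
tcp_jiffies32), so the driver must store tcp_jiffies32 here; the previous
tcp_time_stamp(tp) value was in TSval (ms) units, not jiffies.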
@@ -152,6 +152,7 @@ struct tcp_request_sock {
 	u64				snt_synack; /* first SYNACK sent time */
 	bool				tfo_listener;
 	bool				is_mptcp;
+	s8				req_usec_ts;
 #if IS_ENABLED(CONFIG_MPTCP)
 	bool				drop_req;
 #endif
@@ -257,7 +258,8 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		tcp_usec_ts:1, /* TSval values in usec */
+		unused:4;
 	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
@@ -576,4 +578,9 @@ void tcp_sock_set_quickack(struct sock *sk, int val);
 int tcp_sock_set_syncnt(struct sock *sk, int val);
 int tcp_sock_set_user_timeout(struct sock *sk, int val);
 
+static inline bool dst_tcp_usec_ts(const struct dst_entry *dst)
+{
+	return dst_feature(dst, RTAX_FEATURE_TCP_USEC_TS);
+}
+
 #endif /* _LINUX_TCP_H */
@@ -67,7 +67,8 @@ struct inet_timewait_sock {
 	/* And these are ours. */
 	unsigned int		tw_transparent  : 1,
 				tw_flowlabel	: 20,
-				tw_pad		: 3,	/* 3 bits hole */
+				tw_usec_ts	: 1,
+				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;

@@ -166,7 +166,12 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
 #define MAX_TCP_KEEPCNT		127
 #define MAX_TCP_SYNCNT		127
 
-#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
+/* Ensure that TCP PAWS checks are relaxed after ~2147 seconds
+ * to avoid overflows. This assumes a clock smaller than 1 MHz.
+ * Default clock is 1 kHz, tcp_usec_ts uses 1 MHz.
+ */
+#define TCP_PAWS_WRAP (INT_MAX / USEC_PER_SEC)
+
 #define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
 					 * after this time. It should be equal
 					 * (or greater than) TCP_TIMEWAIT_LEN
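
Worked numbers for the new bound: INT_MAX / USEC_PER_SEC = 2147483647 / 1000000,
i.e. ~2147 seconds (about 36 minutes), which is how long a 1 MHz TSval clock
takes to cover a 31-bit range. The old 24-day constant was only safe for the
default 1 kHz clock, which wraps after roughly 24.8 days.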
@@ -798,22 +803,31 @@ static inline u64 tcp_clock_us(void)
 	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
 }
 
-/* This should only be used in contexts where tp->tcp_mstamp is up to date */
-static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
+static inline u64 tcp_clock_ms(void)
 {
-	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
+	return div_u64(tcp_clock_ns(), NSEC_PER_MSEC);
 }
 
-/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
-static inline u32 tcp_ns_to_ts(u64 ns)
+/* TCP Timestamp included in TS option (RFC 1323) can either use ms
+ * or usec resolution. Each socket carries a flag to select one or other
+ * resolution, as the route attribute could change anytime.
+ * Each flow must stick to initial resolution.
+ */
+static inline u32 tcp_clock_ts(bool usec_ts)
 {
-	return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
+	return usec_ts ? tcp_clock_us() : tcp_clock_ms();
 }
 
-/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
-static inline u32 tcp_time_stamp_raw(void)
+static inline u32 tcp_time_stamp_ms(const struct tcp_sock *tp)
 {
-	return tcp_ns_to_ts(tcp_clock_ns());
+	return div_u64(tp->tcp_mstamp, USEC_PER_MSEC);
+}
+
+static inline u32 tcp_time_stamp_ts(const struct tcp_sock *tp)
+{
+	if (tp->tcp_usec_ts)
+		return tp->tcp_mstamp;
+
+	return tcp_time_stamp_ms(tp);
 }
 
 void tcp_mstamp_refresh(struct tcp_sock *tp);
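
A user-space mimic of the two resolutions, for intuition only; it borrows the
kernel naming and substitutes CLOCK_MONOTONIC for the kernel's tcp_clock_ns():

	#include <stdbool.h>
	#include <stdint.h>
	#include <time.h>

	#define NSEC_PER_USEC	1000ULL
	#define NSEC_PER_MSEC	1000000ULL

	static uint64_t clock_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	/* TSval generator: usec resolution when usec_ts is set, else ms. */
	static uint32_t clock_ts(bool usec_ts)
	{
		uint64_t ns = clock_ns();

		return usec_ts ? ns / NSEC_PER_USEC : ns / NSEC_PER_MSEC;
	}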
@@ -823,17 +837,30 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
-static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
-{
-	return tcp_ns_to_ts(skb->skb_mstamp_ns);
-}
-
 /* provide the departure time in us unit */
 static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
 {
 	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
 }
 
+/* Provide skb TSval in usec or ms unit */
+static inline u32 tcp_skb_timestamp_ts(bool usec_ts, const struct sk_buff *skb)
+{
+	if (usec_ts)
+		return tcp_skb_timestamp_us(skb);
+
+	return div_u64(skb->skb_mstamp_ns, NSEC_PER_MSEC);
+}
+
+static inline u32 tcp_tw_tsval(const struct tcp_timewait_sock *tcptw)
+{
+	return tcp_clock_ts(tcptw->tw_sk.tw_usec_ts) + tcptw->tw_ts_offset;
+}
+
+static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
+{
+	return tcp_clock_ts(treq->req_usec_ts) + treq->ts_off;
+}
+
 #define tcp_flag_byte(th)	(((u_int8_t *)th)[13])
@@ -1599,7 +1626,7 @@ static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt,
 	if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win)
 		return true;
 	if (unlikely(!time_before32(ktime_get_seconds(),
-				    rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS)))
+				    rx_opt->ts_recent_stamp + TCP_PAWS_WRAP)))
 		return true;
 	/*
 	 * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0,

@@ -502,13 +502,17 @@ enum {
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN	(1 << 0)
-#define RTAX_FEATURE_SACK	(1 << 1)
-#define RTAX_FEATURE_TIMESTAMP	(1 << 2)
-#define RTAX_FEATURE_ALLFRAG	(1 << 3)
-
-#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
-				 RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
+#define RTAX_FEATURE_ECN		(1 << 0)
+#define RTAX_FEATURE_SACK		(1 << 1) /* unused */
+#define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
+#define RTAX_FEATURE_ALLFRAG		(1 << 3)
+#define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
+
+#define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
+				 RTAX_FEATURE_SACK |		\
+				 RTAX_FEATURE_TIMESTAMP |	\
+				 RTAX_FEATURE_ALLFRAG |		\
+				 RTAX_FEATURE_TCP_USEC_TS)
 
 struct rta_session {
 	__u8	proto;

@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail {
 #define TCPI_OPT_ECN		8 /* ECN was negotiated at TCP session init */
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
+#define TCPI_OPT_USEC_TS	64 /* usec timestamps */
 
 /*
  * Sender's congestion state indicating normal or abnormal situations

@@ -41,7 +41,6 @@ static siphash_aligned_key_t syncookie_secret[2];
  * requested/supported by the syn/synack exchange.
  */
 #define TSBITS	6
-#define TSMASK	(((__u32)1 << TSBITS) - 1)
 
 static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		       u32 count, int c)
@@ -52,6 +51,14 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
 		      count, &syncookie_secret[c]);
 }
 
+/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */
+static u64 tcp_ns_to_ts(bool usec_ts, u64 val)
+{
+	if (usec_ts)
+		return div_u64(val, NSEC_PER_USEC);
+
+	return div_u64(val, NSEC_PER_MSEC);
+}
+
 /*
  * when syncookies are in effect and tcp timestamps are enabled we encode
@@ -62,27 +69,24 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
  */
 u64 cookie_init_timestamp(struct request_sock *req, u64 now)
 {
-	struct inet_request_sock *ireq;
-	u32 ts, ts_now = tcp_ns_to_ts(now);
+	const struct inet_request_sock *ireq = inet_rsk(req);
+	u64 ts, ts_now = tcp_ns_to_ts(false, now);
 	u32 options = 0;
 
-	ireq = inet_rsk(req);
-
 	options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
 	if (ireq->sack_ok)
 		options |= TS_OPT_SACK;
 	if (ireq->ecn_ok)
 		options |= TS_OPT_ECN;
 
-	ts = ts_now & ~TSMASK;
+	ts = (ts_now >> TSBITS) << TSBITS;
 	ts |= options;
-	if (ts > ts_now) {
-		ts >>= TSBITS;
-		ts--;
-		ts <<= TSBITS;
-		ts |= options;
-	}
-	return (u64)ts * (NSEC_PER_SEC / TCP_TS_HZ);
+	if (ts > ts_now)
+		ts -= (1UL << TSBITS);
+
+	if (tcp_rsk(req)->req_usec_ts)
+		return ts * NSEC_PER_USEC;
+	return ts * NSEC_PER_MSEC;
 }
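
The rounding logic above, restated as a standalone sketch (assuming TSBITS == 6
as in this file): the low six TSval bits carry the wscale/SACK/ECN options, so
the current timestamp is rounded down to a multiple of 64 ticks before the
options are OR-ed in, and one 64-tick step is subtracted if that pushed the
value past "now", so the advertised TSval never runs ahead of the clock:

	#include <stdint.h>

	#define TSBITS 6

	static uint64_t encode_ts_options(uint64_t ts_now, uint32_t options)
	{
		uint64_t ts = (ts_now >> TSBITS) << TSBITS; /* clear low 6 bits */

		ts |= options;			/* stash options in low bits */
		if (ts > ts_now)		/* never exceed the clock */
			ts -= (1ULL << TSBITS);
		return ts;
	}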
@@ -302,6 +306,8 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 	treq->af_specific = af_ops;
 	treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+	treq->req_usec_ts = -1;
+
 #if IS_ENABLED(CONFIG_MPTCP)
 	treq->is_mptcp = sk_is_mptcp(sk);
 	if (treq->is_mptcp) {

@@ -3629,10 +3629,16 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 		tp->fastopen_no_cookie = val;
 		break;
 	case TCP_TIMESTAMP:
-		if (!tp->repair)
+		if (!tp->repair) {
 			err = -EPERM;
-		else
-			WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw());
+			break;
+		}
+		/* val is an opaque field,
+		 * and low order bit contains usec_ts enable bit.
+		 * It's a best effort, and we do not care if the user makes an error.
+		 */
+		tp->tcp_usec_ts = val & 1;
+		WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
 		break;
 	case TCP_REPAIR_WINDOW:
 		err = tcp_repair_set_window(tp, optval, optlen);
@@ -3754,6 +3760,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
 	if (tp->syn_data_acked)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
+	if (tp->tcp_usec_ts)
+		info->tcpi_options |= TCPI_OPT_USEC_TS;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
 	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
@@ -3817,10 +3825,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_total_rto = tp->total_rto;
 	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
 	info->tcpi_total_rto_time = tp->total_rto_time;
-	if (tp->rto_stamp) {
-		info->tcpi_total_rto_time += tcp_time_stamp_raw() -
-						tp->rto_stamp;
-	}
+	if (tp->rto_stamp)
+		info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
 
 	unlock_sock_fast(sk, slow);
 }
@@ -4145,7 +4151,11 @@ int do_tcp_getsockopt(struct sock *sk, int level,
 		break;
 	case TCP_TIMESTAMP:
-		val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset);
+		val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
+		if (tp->tcp_usec_ts)
+			val |= 1;
+		else
+			val &= ~1;
 		break;
 	case TCP_NOTSENT_LOWAT:
 		val = READ_ONCE(tp->notsent_lowat);

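A hedged checkpoint/restore sketch of the TCP_TIMESTAMP semantics above: the
value stays opaque, but bit 0 now reports (on get) and selects (on set, repair
mode only) usec resolution:

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tcp.h>

	static int save_tcp_timestamp(int fd, int *val)
	{
		socklen_t len = sizeof(*val);

		/* Value includes tsoffset; bit 0 reports usec_ts. */
		return getsockopt(fd, IPPROTO_TCP, TCP_TIMESTAMP, val, &len);
	}

	static int restore_tcp_timestamp(int fd, int val)
	{
		int on = 1;

		/* TCP_TIMESTAMP is only writable in repair mode (CAP_NET_ADMIN). */
		if (setsockopt(fd, IPPROTO_TCP, TCP_REPAIR, &on, sizeof(on)) < 0)
			return -1;
		return setsockopt(fd, IPPROTO_TCP, TCP_TIMESTAMP, &val, sizeof(val));
	}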
@@ -693,6 +693,23 @@ static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
 	tp->rcv_rtt_est.time = tp->tcp_mstamp;
 }
 
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+{
+	u32 delta, delta_us;
+
+	delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+	if (tp->tcp_usec_ts)
+		return delta;
+
+	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+		if (!delta)
+			delta = 1;
+		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+		return delta_us;
+	}
+	return -1;
+}
+
 static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 					  const struct sk_buff *skb)
 {
@@ -704,15 +721,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
-		u32 delta_us;
-
-		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
-			if (!delta)
-				delta = 1;
-			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
-			tcp_rcv_rtt_update(tp, delta_us, 0);
-		}
+		s32 delta = tcp_rtt_tsopt_us(tp);
+
+		if (delta >= 0)
+			tcp_rcv_rtt_update(tp, delta, 0);
 	}
 }
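
Worked example of tcp_rtt_tsopt_us(): with the default 1 kHz clock, a TSecr
delta of 3 ticks becomes 3 * (USEC_PER_SEC / TCP_TS_HZ) = 3000 usec, and a
delta of 0 is clamped to one tick (1000 usec); with tcp_usec_ts the delta is
already in usec and is returned unchanged, so sub-millisecond RTT samples no
longer collapse to a 1 ms floor.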
@@ -2442,7 +2454,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
 				     const struct sk_buff *skb)
 {
 	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
-	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+	       tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
 }
 
 /* Nothing was retransmitted or returned timestamp is less
@@ -2856,7 +2868,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 static void tcp_update_rto_time(struct tcp_sock *tp)
 {
 	if (tp->rto_stamp) {
-		tp->total_rto_time += tcp_time_stamp(tp) - tp->rto_stamp;
+		tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
 		tp->rto_stamp = 0;
 	}
 }
@@ -3146,17 +3158,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 * left edge of the send window.
 	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
 	 */
-	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
-	    flag & FLAG_ACKED) {
-		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
-
-		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
-			if (!delta)
-				delta = 1;
-			seq_rtt_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
-			ca_rtt_us = seq_rtt_us;
-		}
-	}
+	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
+	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
+		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+
 	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
 	if (seq_rtt_us < 0)
 		return false;
@@ -6293,7 +6298,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
-			     tcp_time_stamp(tp))) {
+			     tcp_time_stamp_ts(tp))) {
 			NET_INC_STATS(sock_net(sk),
 				      LINUX_MIB_PAWSACTIVEREJECTED);
 			goto reset_and_undo;
@@ -7042,6 +7047,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	req->syncookie = want_cookie;
 	tcp_rsk(req)->af_specific = af_ops;
 	tcp_rsk(req)->ts_off = 0;
+	tcp_rsk(req)->req_usec_ts = -1;
 #if IS_ENABLED(CONFIG_MPTCP)
 	tcp_rsk(req)->is_mptcp = 0;
 #endif

@@ -296,6 +296,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 		rt = NULL;
 		goto failure;
 	}
+	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
@@ -954,7 +955,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_v4_send_ack(sk, skb,
 			tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
-			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
+			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
 			tcp_twsk_md5_key(tcptw),
@@ -988,7 +989,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 	tcp_v4_send_ack(sk, skb, seq,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent),
 			0,
 			tcp_md5_do_lookup(sk, l3index, addr, AF_INET),

@@ -272,7 +272,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct lp *lp = inet_csk_ca(sk);
-	u32 now = tcp_time_stamp(tp);
+	u32 now = tcp_time_stamp_ts(tp);
 	u32 delta;
 
 	if (sample->rtt_us > 0)

@@ -300,6 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent	= tp->rx_opt.ts_recent;
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
+		tw->tw_usec_ts		= tp->tcp_usec_ts;
 		tcptw->tw_last_oow_ack_time = 0;
 		tcptw->tw_tx_delay	= tp->tcp_tx_delay;
 		tw->tw_txhash		= sk->sk_txhash;
@@ -554,21 +555,29 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->max_window = newtp->snd_wnd;
 
 	if (newtp->rx_opt.tstamp_ok) {
+		newtp->tcp_usec_ts = treq->req_usec_ts;
 		newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent);
 		newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 		newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
 	} else {
+		newtp->tcp_usec_ts = 0;
 		newtp->rx_opt.ts_recent_stamp = 0;
 		newtp->tcp_header_len = sizeof(struct tcphdr);
 	}
 	if (req->num_timeout) {
-		newtp->undo_marker = treq->snt_isn;
-		newtp->retrans_stamp = div_u64(treq->snt_synack,
-					       USEC_PER_SEC / TCP_TS_HZ);
 		newtp->total_rto = req->num_timeout;
-		newtp->total_rto_recoveries = 1;
-		newtp->total_rto_time = tcp_time_stamp_raw() -
-						newtp->retrans_stamp;
+		newtp->undo_marker = treq->snt_isn;
+		if (newtp->tcp_usec_ts) {
+			newtp->retrans_stamp = treq->snt_synack;
+			newtp->total_rto_time = (u32)(tcp_clock_us() -
+						      newtp->retrans_stamp) / USEC_PER_MSEC;
+		} else {
+			newtp->retrans_stamp = div_u64(treq->snt_synack,
+						       USEC_PER_SEC / TCP_TS_HZ);
+			newtp->total_rto_time = tcp_clock_ms() -
+						newtp->retrans_stamp;
+		}
+		newtp->total_rto_recoveries = 1;
 	}
 	newtp->tsoffset = treq->ts_off;
 #ifdef CONFIG_TCP_MD5SIG

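Unit bookkeeping in the hunk above: treq->snt_synack is recorded in usec,
retrans_stamp is kept in TSval units (raw usec when tcp_usec_ts is set,
otherwise usec divided down to ms ticks), while total_rto_time stays in ms on
both paths, hence the extra division by USEC_PER_MSEC on the usec branch.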
@@ -799,7 +799,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps) && !*md5)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+		opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -884,7 +884,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 	}
 	if (likely(ireq->tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
+		opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+			      tcp_rsk(req)->ts_off;
 		opts->tsecr = READ_ONCE(req->ts_recent);
 		remaining -= TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -943,7 +944,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	if (likely(tp->rx_opt.tstamp_ok)) {
 		opts->options |= OPTION_TS;
-		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+		opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+				    tp->tsoffset : 0;
 		opts->tsecr = tp->rx_opt.ts_recent;
 		size += TCPOLEN_TSTAMP_ALIGNED;
 	}
@@ -3379,7 +3381,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 	/* Save stamp of the first (attempted) retransmit. */
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_skb_timestamp(skb);
+		tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
 
 	if (tp->undo_retrans < 0)
 		tp->undo_retrans = 0;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
if (tcp_rsk(req)->req_usec_ts < 0)
tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
......@@ -3961,7 +3965,7 @@ int tcp_connect(struct sock *sk)
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
tp->retrans_stamp = tcp_time_stamp(tp);
tp->retrans_stamp = tcp_time_stamp_ts(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
......
@@ -26,14 +26,18 @@
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
-	u32 elapsed, start_ts, user_timeout;
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 elapsed, user_timeout;
 	s32 remaining;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
 	user_timeout = READ_ONCE(icsk->icsk_user_timeout);
 	if (!user_timeout)
 		return icsk->icsk_rto;
-	elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+
+	elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
+	if (tp->tcp_usec_ts)
+		elapsed /= USEC_PER_MSEC;
+
 	remaining = user_timeout - elapsed;
 	if (remaining <= 0)
 		return 1; /* user timeout has passed; fire ASAP */
@@ -212,12 +216,13 @@ static bool retransmits_timed_out(struct sock *sk,
 				  unsigned int boundary,
 				  unsigned int timeout)
 {
-	unsigned int start_ts;
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int start_ts, delta;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
 
-	start_ts = tcp_sk(sk)->retrans_stamp;
+	start_ts = tp->retrans_stamp;
 	if (likely(timeout == 0)) {
 		unsigned int rto_base = TCP_RTO_MIN;
@@ -226,7 +231,12 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = tcp_model_timeout(sk, boundary, rto_base);
 	}
 
-	return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0;
+	if (tp->tcp_usec_ts) {
+		/* delta may be off by up to a jiffy due to timer granularity. */
+		delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
+		return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
+	}
+	return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
 }
 
 /* A write timeout has occurred. Process the after effects. */
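
On the usec branch above, tp->tcp_mstamp - start_ts is in usec while timeout
is in ms, so the timeout is scaled by USEC_PER_MSEC; adding jiffies_to_usecs(1)
(1000 usec at HZ=1000) lets a timer that fired marginally early still count as
expired, compensating for jiffy-granularity timers.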
@@ -422,7 +432,7 @@ static void tcp_update_rto_stats(struct sock *sk)
 	if (!icsk->icsk_retransmits) {
 		tp->total_rto_recoveries++;
-		tp->rto_stamp = tcp_time_stamp(tp);
+		tp->rto_stamp = tcp_time_stamp_ms(tp);
 	}
 	icsk->icsk_retransmits++;
 	tp->total_rto++;
@@ -462,26 +472,24 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
 	req->num_timeout++;
 	tcp_update_rto_stats(sk);
 	if (!tp->retrans_stamp)
-		tp->retrans_stamp = tcp_time_stamp(tp);
+		tp->retrans_stamp = tcp_time_stamp_ts(tp);
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 				  req->timeout << req->num_timeout, TCP_RTO_MAX);
 }
 
 static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
-				     const struct sk_buff *skb)
+				     const struct sk_buff *skb,
+				     u32 rtx_delta)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const int timeout = TCP_RTO_MAX * 2;
-	u32 rcv_delta, rtx_delta;
+	u32 rcv_delta;
 
 	rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp;
 	if (rcv_delta <= timeout)
 		return false;
 
-	rtx_delta = (u32)msecs_to_jiffies(tcp_time_stamp(tp) -
-			(tp->retrans_stamp ?: tcp_skb_timestamp(skb)));
-
-	return rtx_delta > timeout;
+	return msecs_to_jiffies(rtx_delta) > timeout;
 }
 
 /**
@@ -534,7 +542,11 @@ void tcp_retransmit_timer(struct sock *sk)
 		struct inet_sock *inet = inet_sk(sk);
 		u32 rtx_delta;
 
-		rtx_delta = tcp_time_stamp(tp) - (tp->retrans_stamp ?: tcp_skb_timestamp(skb));
+		rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?:
+				tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+		if (tp->tcp_usec_ts)
+			rtx_delta /= USEC_PER_MSEC;
+
 		if (sk->sk_family == AF_INET) {
 			net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
 					    &inet->inet_daddr, ntohs(inet->inet_dport),
@@ -551,7 +563,7 @@ void tcp_retransmit_timer(struct sock *sk)
 					    rtx_delta);
 		}
 #endif
-		if (tcp_rtx_probe0_timed_out(sk, skb)) {
+		if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
 			tcp_write_err(sk);
 			goto out;
 		}

@@ -286,6 +286,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		goto failure;
 	}
 
+	tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
 	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
 
 	if (!saddr) {
@@ -1096,7 +1097,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
-			tcp_time_stamp_raw() + tcptw->tw_ts_offset,
+			tcp_tw_tsval(tcptw),
 			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
 			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority,
 			tw->tw_txhash);
@@ -1123,7 +1124,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 			tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
 			tcp_rsk(req)->rcv_nxt,
 			req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
-			tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+			tcp_rsk_tsval(tcp_rsk(req)),
 			READ_ONCE(req->ts_recent), sk->sk_bound_dev_if,
 			tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index),
 			ipv6_get_dsfield(ipv6_hdr(skb)), 0,

@@ -153,7 +153,7 @@ void synproxy_init_timestamp_cookie(const struct nf_synproxy_info *info,
 				    struct synproxy_options *opts)
 {
 	opts->tsecr = opts->tsval;
-	opts->tsval = tcp_time_stamp_raw() & ~0x3f;
+	opts->tsval = tcp_clock_ms() & ~0x3f;
 
 	if (opts->options & NF_SYNPROXY_OPT_WSCALE) {
 		opts->tsval |= opts->wscale;

@@ -177,7 +177,7 @@ static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
 	return ns / (NSEC_PER_SEC / TCP_TS_HZ);
 }
 
-static __always_inline __u32 tcp_time_stamp_raw(void)
+static __always_inline __u32 tcp_clock_ms(void)
 {
 	return tcp_ns_to_ts(tcp_clock_ns());
 }
@@ -274,7 +274,7 @@ static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
 	if (!loop_ctx.option_timestamp)
 		return false;
 
-	cookie = tcp_time_stamp_raw() & ~TSMASK;
+	cookie = tcp_clock_ms() & ~TSMASK;
 	cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
 	if (loop_ctx.option_sack)
 		cookie |= TS_OPT_SACK;