Commit 6d5274eb authored by David S. Miller

Merge branch 'tcp-sender-chronographs'

Yuchung Cheng says:

====================
tcp: sender chronographs instrumentation

This patch set provides instrumentation of TCP sender limitations.
While developing the BBR congestion control, we noticed that the
TCP sending process is often limited by factors unrelated to
congestion control: an insufficient sender buffer and/or an
insufficient receive window/buffer to saturate the network
bandwidth. Unfortunately these limits are not visible to users,
and the poor performance is often attributed to the congestion
control of choice.

This patch set aims to give users a high-level understanding of
what the sending process is limited by, similar to the TCP_INFO
design. It is not meant to replace detailed kernel tracing and
instrumentation facilities.

In addition, this patch set provides a new option to the timestamping
framework to instrument these limits per application data unit. For
example, one can combine SO_TIMESTAMPING and this patch set to measure
how long a particular HTTP response was limited by a small receive
window.

The patch set was initially written by Francis Yan and then polished
by Yuchung Cheng, with lots of help from Eric Dumazet and Soheil
Hassas Yeganeh.
====================
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a0909949 1c885808
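
For illustration, here is a minimal user-space sketch of the workflow the
cover letter describes (an assumption-laden example, not part of the patch:
it requires a kernel built with this series, and fd is a connected TCP
socket):

    /* Sketch: request ACK timestamps plus the new per-write stats.
     * Assumes <sys/socket.h> and <linux/net_tstamp.h> from this series.
     */
    int val = SOF_TIMESTAMPING_TX_ACK |	/* timestamp when data is acked */
	      SOF_TIMESTAMPING_SOFTWARE |	/* report software timestamps */
	      SOF_TIMESTAMPING_OPT_ID |	/* tag timestamps with an id */
	      SOF_TIMESTAMPING_OPT_TSONLY |	/* required by OPT_STATS below */
	      SOF_TIMESTAMPING_OPT_STATS;	/* attach the TLV stats */

    if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val)) < 0)
	    perror("setsockopt(SO_TIMESTAMPING)");
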
@@ -182,6 +182,16 @@ SOF_TIMESTAMPING_OPT_TSONLY:
   the timestamp even if sysctl net.core.tstamp_allow_data is 0.
   This option disables SOF_TIMESTAMPING_OPT_CMSG.
 
+SOF_TIMESTAMPING_OPT_STATS:
+  Optional stats that are obtained along with the transmit timestamps.
+  It must be used together with SOF_TIMESTAMPING_OPT_TSONLY. When the
+  transmit timestamp is available, the stats are available in a
+  separate control message of type SCM_TIMESTAMPING_OPT_STATS, as a
+  list of TLVs (struct nlattr) of types. These stats allow the
+  application to associate various transport layer stats with
+  the transmit timestamps, such as how long a certain block of
+  data was limited by peer's receiver window.
+
 New applications are encouraged to pass SOF_TIMESTAMPING_OPT_ID to
 disambiguate timestamps and SOF_TIMESTAMPING_OPT_TSONLY to operate
...
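
On the receive side, the stats arrive as a control message on the socket's
error queue. The following is a hedged sketch of draining and walking the
TLVs, assuming the setsockopt() sketch above and uapi headers from this
series; the hand-rolled nlattr walk stands in for a netlink library:

    /* Assumes <sys/socket.h>, <string.h>, <linux/netlink.h> (struct
     * nlattr, NLA_ALIGN) and <linux/tcp.h> (TCP_NLA_* from this series).
     */
    static void read_tx_stats(int fd)
    {
	    char control[512];
	    struct msghdr msg = {
		    .msg_control	= control,
		    .msg_controllen	= sizeof(control),
	    };
	    struct cmsghdr *cm;

	    if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		    return;

	    for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		    struct nlattr *nla;
		    int len;

		    if (cm->cmsg_level != SOL_SOCKET ||
			cm->cmsg_type != SCM_TIMESTAMPING_OPT_STATS)
			    continue;	/* e.g. the SCM_TIMESTAMPING cmsg */

		    nla = (struct nlattr *)CMSG_DATA(cm);
		    len = cm->cmsg_len - CMSG_LEN(0);
		    while (len >= (int)sizeof(*nla) &&
			   nla->nla_len >= (int)sizeof(*nla) &&
			   nla->nla_len <= len) {
			    if (nla->nla_type == TCP_NLA_RWND_LIMITED) {
				    __u64 usec;

				    memcpy(&usec, (char *)nla + sizeof(*nla),
					   sizeof(usec));
				    /* time this write was rwnd-bound */
			    }
			    len -= NLA_ALIGN(nla->nla_len);
			    nla = (struct nlattr *)((char *)nla +
						    NLA_ALIGN(nla->nla_len));
		    }
	    }
    }
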
@@ -97,4 +97,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -90,5 +90,7 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
@@ -99,4 +99,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_IA64_SOCKET_H */
@@ -90,4 +90,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_M32R_SOCKET_H */
@@ -108,4 +108,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -90,4 +90,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
@@ -89,4 +89,6 @@
 #define SO_CNX_ADVICE		0x402E
 
+#define SCM_TIMESTAMPING_OPT_STATS	0x402F
+
 #endif /* _UAPI_ASM_SOCKET_H */
@@ -97,4 +97,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_POWERPC_SOCKET_H */
@@ -96,4 +96,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _ASM_SOCKET_H */
@@ -86,6 +86,8 @@
 #define SO_CNX_ADVICE		0x0037
 
+#define SCM_TIMESTAMPING_OPT_STATS	0x0038
+
 /* Security levels - as per NRL IPv6 - don't actually do anything */
 #define SO_SECURITY_AUTHENTICATION		0x5001
 #define SO_SECURITY_ENCRYPTION_TRANSPORT	0x5002
...
@@ -101,4 +101,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* _XTENSA_SOCKET_H */
@@ -211,8 +211,11 @@ struct tcp_sock {
 		u8 reord;	/* reordering detected */
 	} rack;
 	u16	advmss;		/* Advertised MSS */
-	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:7;
+	u32	chrono_start;	/* Start time in jiffies of a TCP chrono */
+	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
+	u8	chrono_type:2,	/* current chronograph type */
+		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+		unused:5;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		thin_dupack : 1,/* Fast retransmit on first dupack      */
@@ -425,4 +428,6 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp)
 	tp->saved_syn = NULL;
 }
 
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk);
+
 #endif /* _LINUX_TCP_H */
@@ -1516,11 +1516,26 @@ struct tcp_fastopen_context {
 	struct rcu_head	rcu;
 };
 
+/* Latencies incurred by various limits for a sender. They are
+ * chronograph-like stats that are mutually exclusive.
+ */
+enum tcp_chrono {
+	TCP_CHRONO_UNSPEC,
+	TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */
+	TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */
+	TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */
+	__TCP_CHRONO_MAX,
+};
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type);
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type);
+
 /* write queue abstraction */
 static inline void tcp_write_queue_purge(struct sock *sk)
 {
 	struct sk_buff *skb;
 
+	tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
 	while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
@@ -1579,8 +1594,10 @@ static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *
 static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
 {
-	if (sk->sk_send_head == skb_unlinked)
+	if (sk->sk_send_head == skb_unlinked) {
 		sk->sk_send_head = NULL;
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+	}
 	if (tcp_sk(sk)->highest_sack == skb_unlinked)
 		tcp_sk(sk)->highest_sack = NULL;
 }
@@ -1602,6 +1619,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
 	/* Queue it, remembering where we must start sending. */
 	if (sk->sk_send_head == NULL) {
 		sk->sk_send_head = skb;
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 		if (tcp_sk(sk)->highest_sack == NULL)
 			tcp_sk(sk)->highest_sack = skb;
...
@@ -92,4 +92,6 @@
 #define SO_CNX_ADVICE		53
 
+#define SCM_TIMESTAMPING_OPT_STATS	54
+
 #endif /* __ASM_GENERIC_SOCKET_H */
@@ -25,8 +25,9 @@ enum {
 	SOF_TIMESTAMPING_TX_ACK = (1<<9),
 	SOF_TIMESTAMPING_OPT_CMSG = (1<<10),
 	SOF_TIMESTAMPING_OPT_TSONLY = (1<<11),
+	SOF_TIMESTAMPING_OPT_STATS = (1<<12),
 
-	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_TSONLY,
+	SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_STATS,
 	SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) |
 				 SOF_TIMESTAMPING_LAST
 };
...
@@ -214,6 +214,18 @@ struct tcp_info {
 	__u32	tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
 
 	__u64	tcpi_delivery_rate;
+
+	__u64	tcpi_busy_time;      /* Time (usec) busy sending data */
+	__u64	tcpi_rwnd_limited;   /* Time (usec) limited by receive window */
+	__u64	tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
 };
 
+/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+enum {
+	TCP_NLA_PAD,
+	TCP_NLA_BUSY,		/* Time (usec) busy sending data */
+	TCP_NLA_RWND_LIMITED,	/* Time (usec) limited by receive window */
+	TCP_NLA_SNDBUF_LIMITED,	/* Time (usec) limited by send buffer */
+};
+
 /* for TCP_MD5SIG socket option */
...
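
The same three counters are also exported through the existing TCP_INFO
getsockopt. A minimal sketch, assuming uapi headers that already carry the
new fields and a connected TCP socket fd:

    /* Assumes <stdio.h>, <sys/socket.h> and <netinet/tcp.h> with the
     * struct tcp_info fields added by this series.
     */
    struct tcp_info info;
    socklen_t len = sizeof(info);

    if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0 &&
	len >= sizeof(info))	/* older kernels return a shorter struct */
	    printf("busy %llu us (rwnd-limited %llu us, sndbuf-limited %llu us)\n",
		   (unsigned long long)info.tcpi_busy_time,
		   (unsigned long long)info.tcpi_rwnd_limited,
		   (unsigned long long)info.tcpi_sndbuf_limited);
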
@@ -3839,10 +3839,18 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!skb_may_tx_timestamp(sk, tsonly))
 		return;
 
-	if (tsonly)
-		skb = alloc_skb(0, GFP_ATOMIC);
-	else
+	if (tsonly) {
+#ifdef CONFIG_INET
+		if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+		    sk->sk_protocol == IPPROTO_TCP &&
+		    sk->sk_type == SOCK_STREAM)
+			skb = tcp_get_timestamping_opt_stats(sk);
+		else
+#endif
+			skb = alloc_skb(0, GFP_ATOMIC);
+	} else {
 		skb = skb_clone(orig_skb, GFP_ATOMIC);
+	}
 	if (!skb)
 		return;
...
@@ -854,6 +854,13 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 				sk->sk_tskey = 0;
 			}
 		}
+
+		if (val & SOF_TIMESTAMPING_OPT_STATS &&
+		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
+			ret = -EINVAL;
+			break;
+		}
+
 		sk->sk_tsflags = val;
 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 			sock_enable_timestamp(sk,
...
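
The check added above makes the dependency on SOF_TIMESTAMPING_OPT_TSONLY
explicit. A small sketch of the expected failure mode (fd is any socket;
assumes <errno.h>):

    int bad = SOF_TIMESTAMPING_OPT_STATS;	/* no OPT_TSONLY */
    int ret = setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
			 &bad, sizeof(bad));

    /* With this patch, ret == -1 and errno == EINVAL. */
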
@@ -996,8 +996,11 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 	goto out;
 
 out_err:
 	/* make sure we wake any epoll edge trigger waiter */
-	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+		     err == -EAGAIN)) {
 		sk->sk_write_space(sk);
+		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+	}
 	return sk_stream_error(sk, flags, err);
 }
@@ -1331,8 +1334,11 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 out_err:
 	err = sk_stream_error(sk, flags, err);
 	/* make sure we wake any epoll edge trigger waiter */
-	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
+		     err == -EAGAIN)) {
 		sk->sk_write_space(sk);
+		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+	}
 	release_sock(sk);
 	return err;
 }
...
@@ -2702,6 +2708,25 @@ int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
 EXPORT_SYMBOL(compat_tcp_setsockopt);
 #endif
 
+static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
+				      struct tcp_info *info)
+{
+	u64 stats[__TCP_CHRONO_MAX], total = 0;
+	enum tcp_chrono i;
+
+	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
+		stats[i] = tp->chrono_stat[i - 1];
+		if (i == tp->chrono_type)
+			stats[i] += tcp_time_stamp - tp->chrono_start;
+		stats[i] *= USEC_PER_SEC / HZ;
+		total += stats[i];
+	}
+
+	info->tcpi_busy_time = total;
+	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
+	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
+}
+
 /* Return information about state of tcp endpoint in API format. */
 void tcp_get_info(struct sock *sk, struct tcp_info *info)
 {
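
Note the unit handling in tcp_get_info_chrono_stats() above: chrono_stat[]
accumulates jiffies, the currently running chronograph is credited with its
in-progress interval (tcp_time_stamp - tp->chrono_start), and the
multiplication by USEC_PER_SEC / HZ converts to microseconds. With HZ=1000,
for example, each jiffy contributes 1000 usec; with HZ=250, 4000 usec. Since
total sums all three states, tcpi_busy_time includes the rwnd-limited and
sndbuf-limited time.
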
@@ -2794,6 +2819,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_bytes_acked = tp->bytes_acked;
 	info->tcpi_bytes_received = tp->bytes_received;
 	info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt);
+	tcp_get_info_chrono_stats(tp, info);
 
 	unlock_sock_fast(sk, slow);
@@ -2815,6 +2841,26 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *stats;
+	struct tcp_info info;
+
+	stats = alloc_skb(3 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC);
+	if (!stats)
+		return NULL;
+
+	tcp_get_info_chrono_stats(tp, &info);
+	nla_put_u64_64bit(stats, TCP_NLA_BUSY,
+			  info.tcpi_busy_time, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED,
+			  info.tcpi_rwnd_limited, TCP_NLA_PAD);
+	nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED,
+			  info.tcpi_sndbuf_limited, TCP_NLA_PAD);
+	return stats;
+}
+
 static int do_tcp_getsockopt(struct sock *sk, int level,
 		int optname, char __user *optval, int __user *optlen)
 {
...
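
On the allocation size above: nla_total_size_64bit(sizeof(u64)) reserves
room for one nlattr header plus the 8-byte payload plus the TCP_NLA_PAD
attribute used for 64-bit alignment. If the usual netlink alignment rules
hold, that is 16 bytes per attribute, so the skb holds the three stats in
48 bytes.
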
@@ -3178,6 +3178,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
 		tp->lost_skb_hint = NULL;
 	}
 
+	if (!skb)
+		tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+
 	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
 		tp->snd_up = tp->snd_una;
@@ -5056,8 +5059,11 @@ static void tcp_check_space(struct sock *sk)
 		/* pairs with tcp_poll() */
 		smp_mb__after_atomic();
 		if (sk->sk_socket &&
-		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
 			tcp_new_space(sk);
+			if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+				tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+		}
 	}
 }
...
@@ -1514,6 +1514,18 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 		if (sysctl_tcp_slow_start_after_idle &&
 		    (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
 			tcp_cwnd_application_limited(sk);
+
+		/* The following conditions together indicate the starvation
+		 * is caused by insufficient sender buffer:
+		 * 1) just sent some data (see tcp_write_xmit)
+		 * 2) not cwnd limited (this else condition)
+		 * 3) no more data to send (null tcp_send_head )
+		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
+		 */
+		if (!tcp_send_head(sk) && sk->sk_socket &&
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
+		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
 }
@@ -2081,6 +2093,47 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	return false;
 }
 
+static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
+{
+	const u32 now = tcp_time_stamp;
+
+	if (tp->chrono_type > TCP_CHRONO_UNSPEC)
+		tp->chrono_stat[tp->chrono_type - 1] += now - tp->chrono_start;
+	tp->chrono_start = now;
+	tp->chrono_type = new;
+}
+
+void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* If there are multiple conditions worthy of tracking in a
+	 * chronograph then the highest priority enum takes precedence
+	 * over the other conditions. So that if something "more interesting"
+	 * starts happening, stop the previous chrono and start a new one.
+	 */
+	if (type > tp->chrono_type)
+		tcp_chrono_set(tp, type);
+}
+
+void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* There are multiple conditions worthy of tracking in a
+	 * chronograph, so that the highest priority enum takes
+	 * precedence over the other conditions (see tcp_chrono_start).
+	 * If a condition stops, we only stop chrono tracking if
+	 * it's the "most interesting" or current chrono we are
+	 * tracking and starts busy chrono if we have pending data.
+	 */
+	if (tcp_write_queue_empty(sk))
+		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
+	else if (type == tp->chrono_type)
+		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
+}
+
 /* This routine writes packets to the network. It advances the
  * send_head. This happens as incoming acks open up the remote
  * window for us.
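
To make the precedence rules concrete, here is a small self-contained model
of the chronograph state machine; it is plain user-space C with
tcp_time_stamp stubbed out by a counter, an illustration rather than kernel
code:

    #include <stdio.h>

    enum chrono { UNSPEC, BUSY, RWND_LIMITED, SNDBUF_LIMITED, CHRONO_MAX };

    static unsigned int now;		/* stand-in for tcp_time_stamp */
    static unsigned int start, acc[CHRONO_MAX - 1];
    static enum chrono cur;

    static void chrono_set(enum chrono type)	/* mirrors tcp_chrono_set() */
    {
	    if (cur > UNSPEC)
		    acc[cur - 1] += now - start;
	    start = now;
	    cur = type;
    }

    int main(void)
    {
	    now = 0; chrono_set(BUSY);	/* data queued: busy starts */
	    now = 5;			/* peer's window fills up... */
	    if (RWND_LIMITED > cur)	/* higher priority preempts */
		    chrono_set(RWND_LIMITED);
	    now = 9; chrono_set(BUSY);	/* window opened again */
	    now = 12; chrono_set(UNSPEC);	/* write queue drained */
	    printf("busy=%u rwnd=%u\n", acc[BUSY - 1], acc[RWND_LIMITED - 1]);
	    /* prints busy=8 rwnd=4: 12 units of wall time split into
	     * mutually exclusive states, as chrono_stat[] is kept. */
	    return 0;
    }
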
@@ -2103,7 +2156,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	unsigned int tso_segs, sent_pkts;
 	int cwnd_quota;
 	int result;
-	bool is_cwnd_limited = false;
+	bool is_cwnd_limited = false, is_rwnd_limited = false;
 	u32 max_segs;
 
 	sent_pkts = 0;
@@ -2140,8 +2193,10 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
+		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
+			is_rwnd_limited = true;
 			break;
+		}
 
 		if (tso_segs == 1) {
 			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
@@ -2186,6 +2241,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 	}
 
+	if (is_rwnd_limited)
+		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
+	else
+		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+
 	if (likely(sent_pkts)) {
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += sent_pkts;
@@ -3298,6 +3358,8 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	fo->copied = space;
 
 	tcp_connect_queue_skb(sk, syn_data);
+	if (syn_data->len)
+		tcp_chrono_start(sk, TCP_CHRONO_BUSY);
 
 	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
...
@@ -693,9 +693,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
 	    (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
 	    ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2))
 		empty = 0;
-	if (!empty)
+	if (!empty) {
 		put_cmsg(msg, SOL_SOCKET,
 			 SCM_TIMESTAMPING, sizeof(tss), &tss);
+
+		if (skb->len && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS))
+			put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING_OPT_STATS,
+				 skb->len, skb->data);
+	}
 }
 EXPORT_SYMBOL_GPL(__sock_recv_timestamp);
...