Commit e5a3e259 authored by Daniel Borkmann

Merge branch 'bpf-tcp-rtt-hook'

Stanislav Fomichev says:

====================
The congestion control team would like to have a periodic callback to
track some TCP statistics. Let's add a sock_ops callback that can be
selectively enabled on a socket-by-socket basis and is executed on
every RTT. BPF program frequency can be further controlled by calling
bpf_ktime_get_ns() and bailing out early.
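
For illustration, a minimal sketch of that bail-out pattern (the handler
name is hypothetical; the complete version is the sample program in the
last patch):

	SEC("sockops")
	int rtt_sample(struct bpf_sock_ops *ctx)
	{
		switch (ctx->op) {
		case BPF_SOCK_OPS_TCP_CONNECT_CB:
			/* Opt this socket in to per-RTT callbacks. */
			bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
			break;
		case BPF_SOCK_OPS_RTT_CB:
			/* Rate-limit: compare bpf_ktime_get_ns() against a
			 * per-socket deadline (e.g. kept in sk_storage) and
			 * bail out early if it has not expired yet.
			 */
			break;
		}
		return 1;
	}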

I ran neper tcp_stream and tcp_rr tests with the sample program
from the last patch and didn't observe any noticeable performance
difference.

v2:
* add a comment about second accept() in selftest (Yonghong Song)
* refer to tcp_bpf.readme in sample program (Yonghong Song)
====================
Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents d2f5bbbc d78e3f06
@@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
+
+static inline void tcp_bpf_rtt(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG))
+		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
+}
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
 #endif
...
@@ -1770,6 +1770,7 @@ union bpf_attr {
  *		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
  *		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
  *		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *		* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
  *
  *		Therefore, this function can be used to clear a callback flag by
  *		setting the appropriate bit to zero. e.g. to disable the RTO
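
For context, clearing one bit with bpf_sock_ops_cb_flags_set() might look
like this minimal sketch (assuming ctx is the program's struct bpf_sock_ops
pointer; this snippet is not part of the patch):

	/* Disable only the RTO callback; preserve the other enabled flags. */
	bpf_sock_ops_cb_flags_set(ctx, ctx->bpf_sock_ops_cb_flags &
				       ~BPF_SOCK_OPS_RTO_CB_FLAG);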
@@ -3072,6 +3073,12 @@ struct bpf_tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	__u32 dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
+				 * total number of DSACK blocks received
+				 */
+	__u32 delivered;	/* Total data packets delivered incl. rexmits */
+	__u32 delivered_ce;	/* Like the above but only ECE marked packets */
+	__u32 icsk_retransmits;	/* Number of unrecovered [RTO] timeouts */
 };
 
 struct bpf_sock_tuple {
@@ -3314,7 +3321,8 @@ struct bpf_sock_ops {
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 #define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_RTT_CB_FLAG	(1<<3)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS	0x7	/* Mask of all currently
+#define BPF_SOCK_OPS_ALL_CB_FLAGS	0xF	/* Mask of all currently
 						 * supported cb flags
 						 */
@@ -3369,6 +3377,8 @@ enum {
 	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
 					 * socket transition to LISTEN state.
 					 */
+	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
...
@@ -5194,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
 };
 #endif /* CONFIG_IPV6_SEG6_BPF */
-
-#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT)		\
-do {									\
-	switch (si->off) {						\
-	case offsetof(md_type, snd_cwnd):				\
-		CONVERT(snd_cwnd); break;				\
-	case offsetof(md_type, srtt_us):				\
-		CONVERT(srtt_us); break;				\
-	case offsetof(md_type, snd_ssthresh):				\
-		CONVERT(snd_ssthresh); break;				\
-	case offsetof(md_type, rcv_nxt):				\
-		CONVERT(rcv_nxt); break;				\
-	case offsetof(md_type, snd_nxt):				\
-		CONVERT(snd_nxt); break;				\
-	case offsetof(md_type, snd_una):				\
-		CONVERT(snd_una); break;				\
-	case offsetof(md_type, mss_cache):				\
-		CONVERT(mss_cache); break;				\
-	case offsetof(md_type, ecn_flags):				\
-		CONVERT(ecn_flags); break;				\
-	case offsetof(md_type, rate_delivered):				\
-		CONVERT(rate_delivered); break;				\
-	case offsetof(md_type, rate_interval_us):			\
-		CONVERT(rate_interval_us); break;			\
-	case offsetof(md_type, packets_out):				\
-		CONVERT(packets_out); break;				\
-	case offsetof(md_type, retrans_out):				\
-		CONVERT(retrans_out); break;				\
-	case offsetof(md_type, total_retrans):				\
-		CONVERT(total_retrans); break;				\
-	case offsetof(md_type, segs_in):				\
-		CONVERT(segs_in); break;				\
-	case offsetof(md_type, data_segs_in):				\
-		CONVERT(data_segs_in); break;				\
-	case offsetof(md_type, segs_out):				\
-		CONVERT(segs_out); break;				\
-	case offsetof(md_type, data_segs_out):				\
-		CONVERT(data_segs_out); break;				\
-	case offsetof(md_type, lost_out):				\
-		CONVERT(lost_out); break;				\
-	case offsetof(md_type, sacked_out):				\
-		CONVERT(sacked_out); break;				\
-	case offsetof(md_type, bytes_received):				\
-		CONVERT(bytes_received); break;				\
-	case offsetof(md_type, bytes_acked):				\
-		CONVERT(bytes_acked); break;				\
-	}								\
-} while (0)
 #ifdef CONFIG_INET
 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
 			      int dif, int sdif, u8 family, u8 proto)
@@ -5592,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 				  struct bpf_insn_access_aux *info)
 {
-	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+					  icsk_retransmits))
 		return false;
 
 	if (off % size != 0)
@@ -5623,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 					offsetof(struct tcp_sock, FIELD)); \
 	} while (0)
 
-	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
-				       BPF_TCP_SOCK_GET_COMMON);
+#define BPF_INET_SOCK_GET_COMMON(FIELD)					\
+	do {								\
+		BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock,	\
+					  FIELD) >			\
+			     FIELD_SIZEOF(struct bpf_tcp_sock, FIELD));	\
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			\
+					struct inet_connection_sock,	\
+					FIELD),				\
+				      si->dst_reg, si->src_reg,		\
+				      offsetof(				\
+					struct inet_connection_sock,	\
+					FIELD));			\
+	} while (0)
 	if (insn > insn_buf)
 		return insn - insn_buf;
@@ -5640,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct tcp_sock, rtt_min) +
 				      offsetof(struct minmax_sample, v));
 		break;
+	case offsetof(struct bpf_tcp_sock, snd_cwnd):
+		BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+		break;
+	case offsetof(struct bpf_tcp_sock, srtt_us):
+		BPF_TCP_SOCK_GET_COMMON(srtt_us);
+		break;
+	case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+		BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+		break;
+	case offsetof(struct bpf_tcp_sock, rcv_nxt):
+		BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+		break;
+	case offsetof(struct bpf_tcp_sock, snd_nxt):
+		BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+		break;
+	case offsetof(struct bpf_tcp_sock, snd_una):
+		BPF_TCP_SOCK_GET_COMMON(snd_una);
+		break;
+	case offsetof(struct bpf_tcp_sock, mss_cache):
+		BPF_TCP_SOCK_GET_COMMON(mss_cache);
+		break;
+	case offsetof(struct bpf_tcp_sock, ecn_flags):
+		BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+		break;
+	case offsetof(struct bpf_tcp_sock, rate_delivered):
+		BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+		break;
+	case offsetof(struct bpf_tcp_sock, rate_interval_us):
+		BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+		break;
+	case offsetof(struct bpf_tcp_sock, packets_out):
+		BPF_TCP_SOCK_GET_COMMON(packets_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, retrans_out):
+		BPF_TCP_SOCK_GET_COMMON(retrans_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, total_retrans):
+		BPF_TCP_SOCK_GET_COMMON(total_retrans);
+		break;
+	case offsetof(struct bpf_tcp_sock, segs_in):
+		BPF_TCP_SOCK_GET_COMMON(segs_in);
+		break;
+	case offsetof(struct bpf_tcp_sock, data_segs_in):
+		BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+		break;
+	case offsetof(struct bpf_tcp_sock, segs_out):
+		BPF_TCP_SOCK_GET_COMMON(segs_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, data_segs_out):
+		BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, lost_out):
+		BPF_TCP_SOCK_GET_COMMON(lost_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, sacked_out):
+		BPF_TCP_SOCK_GET_COMMON(sacked_out);
+		break;
+	case offsetof(struct bpf_tcp_sock, bytes_received):
+		BPF_TCP_SOCK_GET_COMMON(bytes_received);
+		break;
+	case offsetof(struct bpf_tcp_sock, bytes_acked):
+		BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+		break;
+	case offsetof(struct bpf_tcp_sock, dsack_dups):
+		BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+		break;
+	case offsetof(struct bpf_tcp_sock, delivered):
+		BPF_TCP_SOCK_GET_COMMON(delivered);
+		break;
+	case offsetof(struct bpf_tcp_sock, delivered_ce):
+		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+		break;
+	case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+		BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+		break;
 	}
 
 	return insn - insn_buf;
@@ -7913,9 +7952,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);		      \
 	} while (0)
 
-	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
-				       SOCK_OPS_GET_TCP_SOCK_FIELD);
-
 	if (insn > insn_buf)
 		return insn - insn_buf;
@@ -8085,6 +8121,69 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
 					  struct sock, type);
 		break;
+	case offsetof(struct bpf_sock_ops, snd_cwnd):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+		break;
+	case offsetof(struct bpf_sock_ops, srtt_us):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+		break;
+	case offsetof(struct bpf_sock_ops, snd_ssthresh):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
+		break;
+	case offsetof(struct bpf_sock_ops, rcv_nxt):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
+		break;
+	case offsetof(struct bpf_sock_ops, snd_nxt):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
+		break;
+	case offsetof(struct bpf_sock_ops, snd_una):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
+		break;
+	case offsetof(struct bpf_sock_ops, mss_cache):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
+		break;
+	case offsetof(struct bpf_sock_ops, ecn_flags):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
+		break;
+	case offsetof(struct bpf_sock_ops, rate_delivered):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
+		break;
+	case offsetof(struct bpf_sock_ops, rate_interval_us):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
+		break;
+	case offsetof(struct bpf_sock_ops, packets_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
+		break;
+	case offsetof(struct bpf_sock_ops, retrans_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
+		break;
+	case offsetof(struct bpf_sock_ops, total_retrans):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
+		break;
+	case offsetof(struct bpf_sock_ops, segs_in):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
+		break;
+	case offsetof(struct bpf_sock_ops, data_segs_in):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
+		break;
+	case offsetof(struct bpf_sock_ops, segs_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
+		break;
+	case offsetof(struct bpf_sock_ops, data_segs_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
+		break;
+	case offsetof(struct bpf_sock_ops, lost_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
+		break;
+	case offsetof(struct bpf_sock_ops, sacked_out):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
+		break;
+	case offsetof(struct bpf_sock_ops, bytes_received):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
+		break;
+	case offsetof(struct bpf_sock_ops, bytes_acked):
+		SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+		break;
 	case offsetof(struct bpf_sock_ops, sk):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
 						struct bpf_sock_ops_kern,
...
@@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
 			tp->rtt_seq = tp->snd_nxt;
 			tp->mdev_max_us = tcp_rto_min_us(sk);
+
+			tcp_bpf_rtt(sk);
 		}
 	} else {
 		/* no previous measure. */
@@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
 		tp->mdev_max_us = tp->rttvar_us;
 		tp->rtt_seq = tp->snd_nxt;
+
+		tcp_bpf_rtt(sk);
 	}
 	tp->srtt_us = max(1U, srtt);
 }
...
@@ -154,6 +154,7 @@ always += tcp_iw_kern.o
 always += tcp_clamp_kern.o
 always += tcp_basertt_kern.o
 always += tcp_tos_reflect_kern.o
+always += tcp_dumpstats_kern.o
 always += xdp_redirect_kern.o
 always += xdp_redirect_map_kern.o
 always += xdp_redirect_cpu_kern.o
...
@@ -25,4 +25,4 @@ attached to the cgroupv2).
 
 To remove (unattach) a socket_ops BPF program from a cgroupv2:
 
-bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
+bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
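
For context, a sketch of the corresponding load-and-attach flow (the object
name tcp_dumpstats_kern.o and the pin path are assumptions here, not part of
this hunk; the authoritative steps are in the elided part of the readme):

	bpftool prog load tcp_dumpstats_kern.o /sys/fs/bpf/tcp_prog
	bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
	bpftool prog tracelog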
// SPDX-License-Identifier: GPL-2.0
/* Refer to samples/bpf/tcp_bpf.readme for the instructions on
 * how to run this sample program.
 */
#include <linux/bpf.h>

#include "bpf_helpers.h"
#include "bpf_endian.h"

#define INTERVAL	1000000000ULL

int _version SEC("version") = 1;
char _license[] SEC("license") = "GPL";

struct {
	__u32 type;
	__u32 map_flags;
	int *key;
	__u64 *value;
} bpf_next_dump SEC(".maps") = {
	.type = BPF_MAP_TYPE_SK_STORAGE,
	.map_flags = BPF_F_NO_PREALLOC,
};

SEC("sockops")
int _sockops(struct bpf_sock_ops *ctx)
{
	struct bpf_tcp_sock *tcp_sk;
	struct bpf_sock *sk;
	__u64 *next_dump;
	__u64 now;

	switch (ctx->op) {
	case BPF_SOCK_OPS_TCP_CONNECT_CB:
		bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
		return 1;
	case BPF_SOCK_OPS_RTT_CB:
		break;
	default:
		return 1;
	}

	sk = ctx->sk;
	if (!sk)
		return 1;

	next_dump = bpf_sk_storage_get(&bpf_next_dump, sk, 0,
				       BPF_SK_STORAGE_GET_F_CREATE);
	if (!next_dump)
		return 1;

	now = bpf_ktime_get_ns();
	if (now < *next_dump)
		return 1;

	tcp_sk = bpf_tcp_sock(sk);
	if (!tcp_sk)
		return 1;

	*next_dump = now + INTERVAL;

	bpf_printk("dsack_dups=%u delivered=%u\n",
		   tcp_sk->dsack_dups, tcp_sk->delivered);
	bpf_printk("delivered_ce=%u icsk_retransmits=%u\n",
		   tcp_sk->delivered_ce, tcp_sk->icsk_retransmits);

	return 1;
}
@@ -1767,6 +1767,7 @@ union bpf_attr {
  *		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
  *		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
  *		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
+ *		* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
  *
  *		Therefore, this function can be used to clear a callback flag by
  *		setting the appropriate bit to zero. e.g. to disable the RTO
@@ -3069,6 +3070,12 @@ struct bpf_tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	__u32 dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
+				 * total number of DSACK blocks received
+				 */
+	__u32 delivered;	/* Total data packets delivered incl. rexmits */
+	__u32 delivered_ce;	/* Like the above but only ECE marked packets */
+	__u32 icsk_retransmits;	/* Number of unrecovered [RTO] timeouts */
 };
 
 struct bpf_sock_tuple {
@@ -3311,7 +3318,8 @@ struct bpf_sock_ops {
 #define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
 #define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
 #define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
+#define BPF_SOCK_OPS_RTT_CB_FLAG	(1<<3)
-#define BPF_SOCK_OPS_ALL_CB_FLAGS	0x7	/* Mask of all currently
+#define BPF_SOCK_OPS_ALL_CB_FLAGS	0xF	/* Mask of all currently
 						 * supported cb flags
 						 */
@@ -3366,6 +3374,8 @@ enum {
 	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
 					 * socket transition to LISTEN state.
 					 */
+	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 */
 };
 
 /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
...
@@ -27,7 +27,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
 	test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
-	test_sockopt_multi
+	test_sockopt_multi test_tcp_rtt
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -107,6 +107,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
 $(OUTPUT)/test_sockopt: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
 $(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
 
 .PHONY: force
...
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include "bpf_helpers.h"

char _license[] SEC("license") = "GPL";
__u32 _version SEC("version") = 1;

struct tcp_rtt_storage {
	__u32 invoked;
	__u32 dsack_dups;
	__u32 delivered;
	__u32 delivered_ce;
	__u32 icsk_retransmits;
};

struct bpf_map_def SEC("maps") socket_storage_map = {
	.type = BPF_MAP_TYPE_SK_STORAGE,
	.key_size = sizeof(int),
	.value_size = sizeof(struct tcp_rtt_storage),
	.map_flags = BPF_F_NO_PREALLOC,
};
BPF_ANNOTATE_KV_PAIR(socket_storage_map, int, struct tcp_rtt_storage);

SEC("sockops")
int _sockops(struct bpf_sock_ops *ctx)
{
	struct tcp_rtt_storage *storage;
	struct bpf_tcp_sock *tcp_sk;
	int op = (int) ctx->op;
	struct bpf_sock *sk;

	sk = ctx->sk;
	if (!sk)
		return 1;

	storage = bpf_sk_storage_get(&socket_storage_map, sk, 0,
				     BPF_SK_STORAGE_GET_F_CREATE);
	if (!storage)
		return 1;

	if (op == BPF_SOCK_OPS_TCP_CONNECT_CB) {
		bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
		return 1;
	}

	if (op != BPF_SOCK_OPS_RTT_CB)
		return 1;

	tcp_sk = bpf_tcp_sock(sk);
	if (!tcp_sk)
		return 1;

	storage->invoked++;

	storage->dsack_dups = tcp_sk->dsack_dups;
	storage->delivered = tcp_sk->delivered;
	storage->delivered_ce = tcp_sk->delivered_ce;
	storage->icsk_retransmits = tcp_sk->icsk_retransmits;

	return 1;
}
// SPDX-License-Identifier: GPL-2.0
#include <error.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#include <pthread.h>

#include <linux/filter.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

#include "bpf_rlimit.h"
#include "bpf_util.h"
#include "cgroup_helpers.h"

#define CG_PATH	"/tcp_rtt"

struct tcp_rtt_storage {
	__u32 invoked;
	__u32 dsack_dups;
	__u32 delivered;
	__u32 delivered_ce;
	__u32 icsk_retransmits;
};

static void send_byte(int fd)
{
	char b = 0x55;

	if (write(fd, &b, sizeof(b)) != 1)
		error(1, errno, "Failed to send single byte");
}

static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
		     __u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
		     __u32 icsk_retransmits)
{
	int err = 0;
	struct tcp_rtt_storage val;

	if (bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)
		error(1, errno, "Failed to read socket storage");

	if (val.invoked != invoked) {
		log_err("%s: unexpected bpf_tcp_sock.invoked %d != %d",
			msg, val.invoked, invoked);
		err++;
	}

	if (val.dsack_dups != dsack_dups) {
		log_err("%s: unexpected bpf_tcp_sock.dsack_dups %d != %d",
			msg, val.dsack_dups, dsack_dups);
		err++;
	}

	if (val.delivered != delivered) {
		log_err("%s: unexpected bpf_tcp_sock.delivered %d != %d",
			msg, val.delivered, delivered);
		err++;
	}

	if (val.delivered_ce != delivered_ce) {
		log_err("%s: unexpected bpf_tcp_sock.delivered_ce %d != %d",
			msg, val.delivered_ce, delivered_ce);
		err++;
	}

	if (val.icsk_retransmits != icsk_retransmits) {
		log_err("%s: unexpected bpf_tcp_sock.icsk_retransmits %d != %d",
			msg, val.icsk_retransmits, icsk_retransmits);
		err++;
	}

	return err;
}
static int connect_to_server(int server_fd)
{
	struct sockaddr_storage addr;
	socklen_t len = sizeof(addr);
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		log_err("Failed to create client socket");
		return -1;
	}

	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
		log_err("Failed to get server addr");
		goto out;
	}

	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
		log_err("Fail to connect to server");
		goto out;
	}

	return fd;

out:
	close(fd);
	return -1;
}

static int run_test(int cgroup_fd, int server_fd)
{
	struct bpf_prog_load_attr attr = {
		.prog_type = BPF_PROG_TYPE_SOCK_OPS,
		.file = "./tcp_rtt.o",
		.expected_attach_type = BPF_CGROUP_SOCK_OPS,
	};
	struct bpf_object *obj;
	struct bpf_map *map;
	int client_fd;
	int prog_fd;
	int map_fd;
	int err;

	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
	if (err) {
		log_err("Failed to load BPF object");
		return -1;
	}

	map = bpf_map__next(NULL, obj);
	map_fd = bpf_map__fd(map);

	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0);
	if (err) {
		log_err("Failed to attach BPF program");
		goto close_bpf_object;
	}

	client_fd = connect_to_server(server_fd);
	if (client_fd < 0) {
		err = -1;
		goto close_bpf_object;
	}

	err += verify_sk(map_fd, client_fd, "syn-ack",
			 /*invoked=*/1,
			 /*dsack_dups=*/0,
			 /*delivered=*/1,
			 /*delivered_ce=*/0,
			 /*icsk_retransmits=*/0);

	send_byte(client_fd);

	err += verify_sk(map_fd, client_fd, "first payload byte",
			 /*invoked=*/2,
			 /*dsack_dups=*/0,
			 /*delivered=*/2,
			 /*delivered_ce=*/0,
			 /*icsk_retransmits=*/0);

	close(client_fd);

close_bpf_object:
	bpf_object__close(obj);
	return err;
}

static int start_server(void)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0) {
		log_err("Failed to create server socket");
		return -1;
	}

	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
		log_err("Failed to bind socket");
		close(fd);
		return -1;
	}

	return fd;
}
static void *server_thread(void *arg)
{
	struct sockaddr_storage addr;
	socklen_t len = sizeof(addr);
	int fd = *(int *)arg;
	int client_fd;

	if (listen(fd, 1) < 0)
		error(1, errno, "Failed to listen on socket");

	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
	if (client_fd < 0)
		error(1, errno, "Failed to accept client");

	/* Wait for the next connection (that never arrives)
	 * to keep this thread alive to prevent calling
	 * close() on client_fd.
	 */
	if (accept(fd, (struct sockaddr *)&addr, &len) >= 0)
		error(1, errno, "Unexpected success in second accept");

	close(client_fd);

	return NULL;
}

int main(int argc, char **argv)
{
	int server_fd, cgroup_fd;
	int err = EXIT_SUCCESS;
	pthread_t tid;

	if (setup_cgroup_environment())
		goto cleanup_obj;

	cgroup_fd = create_and_get_cgroup(CG_PATH);
	if (cgroup_fd < 0)
		goto cleanup_cgroup_env;

	if (join_cgroup(CG_PATH))
		goto cleanup_cgroup;

	server_fd = start_server();
	if (server_fd < 0) {
		err = EXIT_FAILURE;
		goto cleanup_cgroup;
	}

	pthread_create(&tid, NULL, server_thread, (void *)&server_fd);

	if (run_test(cgroup_fd, server_fd))
		err = EXIT_FAILURE;

	close(server_fd);

	printf("test_tcp_rtt: %s\n",
	       err == EXIT_SUCCESS ? "PASSED" : "FAILED");

cleanup_cgroup:
	close(cgroup_fd);

cleanup_cgroup_env:
	cleanup_cgroup_environment();

cleanup_obj:
	return err;
}