Commit 655a51e5 authored by Martin KaFai Lau, committed by Alexei Starovoitov

bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock

This patch adds a helper function BPF_FUNC_tcp_sock and it
is currently available for cg_skb and sched_(cls|act):

struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk);

int cg_skb_foo(struct __sk_buff *skb) {
	struct bpf_tcp_sock *tp;
	struct bpf_sock *sk;
	__u32 snd_cwnd;

	sk = skb->sk;
	if (!sk)
		return 1;

	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 1;

	snd_cwnd = tp->snd_cwnd;
	/* ... */

	return 1;
}

A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide
read-only access.  bpf_tcp_sock has all the existing tcp_sock's fields
that have already been exposed by the bpf_sock_ops.
i.e. no new tcp_sock's fields are exposed in bpf.h.

This helper returns a pointer to the tcp_sock.  If it is not a tcp_sock
or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it
returns NULL.  Hence, the caller needs to check for NULL before
accessing it.

The current use case is to expose members from tcp_sock
to allow a cg_skb_bpf_prog to provide per cgroup traffic
policing/shaping.
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent 9b1f3d6e
......@@ -204,6 +204,7 @@ enum bpf_return_type {
RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */
RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */
RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
};
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
......@@ -259,6 +260,8 @@ enum bpf_reg_type {
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
};
/* The information passed from prog-specific *_is_valid_access
......@@ -956,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
}
#endif
/* Access-check and access-rewrite hooks for struct bpf_tcp_sock.
 *
 * With CONFIG_INET the two functions are only declared here and must be
 * defined elsewhere.  Without CONFIG_INET they collapse to inline stubs:
 * every access is reported as invalid and no instructions are emitted.
 */
#ifdef CONFIG_INET
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info);
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size);
#else
/* Stub: no access to struct bpf_tcp_sock is ever valid without CONFIG_INET. */
static inline bool bpf_tcp_sock_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
return false;
}
/* Stub: reports that zero instructions were written to insn_buf. */
static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog,
u32 *target_size)
{
return 0;
}
#endif /* CONFIG_INET */
#endif /* _LINUX_BPF_H */
......@@ -2337,6 +2337,15 @@ union bpf_attr {
* Return
* A **struct bpf_sock** pointer on success, or NULL in
* case of failure.
*
* struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_tcp_sock** pointer from a
* **struct bpf_sock** pointer.
*
* Return
* A **struct bpf_tcp_sock** pointer on success, or NULL in
* case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2434,7 +2443,8 @@ union bpf_attr {
FN(rc_pointer_rel), \
FN(spin_lock), \
FN(spin_unlock), \
FN(sk_fullsock),
FN(sk_fullsock), \
FN(tcp_sock),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2616,6 +2626,45 @@ struct bpf_sock {
__u32 state;
};
/* TCP socket state exposed to BPF programs.  A pointer to this structure is
 * obtained from the bpf_tcp_sock() helper, which may return NULL.
 * All members are 32-bit except the two trailing 64-bit byte counters.
 */
struct bpf_tcp_sock {
__u32 snd_cwnd; /* Sending congestion window */
__u32 srtt_us; /* smoothed round trip time << 3 in usecs */
__u32 rtt_min;
__u32 snd_ssthresh; /* Slow start size threshold */
__u32 rcv_nxt; /* What we want to receive next */
__u32 snd_nxt; /* Next sequence we send */
__u32 snd_una; /* First byte we want an ack for */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u32 ecn_flags; /* ECN status bits. */
__u32 rate_delivered; /* saved rate sample: packets delivered */
__u32 rate_interval_us; /* saved rate sample: time elapsed */
__u32 packets_out; /* Packets which are "in flight" */
__u32 retrans_out; /* Retransmitted packets out */
__u32 total_retrans; /* Total retransmits for entire connection */
__u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn
* total number of segments in.
*/
__u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn
* total number of data segments in.
*/
__u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut
* The total number of segments sent.
*/
__u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut
* total number of data segments sent.
*/
__u32 lost_out; /* Lost packets */
__u32 sacked_out; /* SACK'd packets */
__u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived
* sum(delta(rcv_nxt)), or how many bytes
* were received.
*/
__u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
};
struct bpf_sock_tuple {
union {
struct {
......
......@@ -334,14 +334,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type)
/* Return true when @type is one of the non-NULL socket pointer register
 * types: PTR_TO_SOCKET, PTR_TO_SOCK_COMMON or PTR_TO_TCP_SOCK.
 *
 * The stale pre-patch line that terminated the expression after
 * PTR_TO_SOCK_COMMON has been dropped, so PTR_TO_TCP_SOCK is really tested.
 */
static bool type_is_sk_pointer(enum bpf_reg_type type)
{
	return type == PTR_TO_SOCKET ||
		type == PTR_TO_SOCK_COMMON ||
		type == PTR_TO_TCP_SOCK;
}
/* Return true when @type is one of the "pointer or NULL" register types:
 * PTR_TO_MAP_VALUE_OR_NULL, PTR_TO_SOCKET_OR_NULL,
 * PTR_TO_SOCK_COMMON_OR_NULL or PTR_TO_TCP_SOCK_OR_NULL.
 *
 * The stale pre-patch line that terminated the expression after
 * PTR_TO_SOCK_COMMON_OR_NULL has been dropped, so PTR_TO_TCP_SOCK_OR_NULL is
 * really tested.
 */
static bool reg_type_may_be_null(enum bpf_reg_type type)
{
	return type == PTR_TO_MAP_VALUE_OR_NULL ||
	       type == PTR_TO_SOCKET_OR_NULL ||
	       type == PTR_TO_SOCK_COMMON_OR_NULL ||
	       type == PTR_TO_TCP_SOCK_OR_NULL;
}
static bool type_is_refcounted(enum bpf_reg_type type)
......@@ -407,6 +409,8 @@ static const char * const reg_type_str[] = {
[PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
[PTR_TO_SOCK_COMMON] = "sock_common",
[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
[PTR_TO_TCP_SOCK] = "tcp_sock",
[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
};
static char slot_type_char[] = {
......@@ -1209,6 +1213,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
return true;
default:
return false;
......@@ -1662,6 +1668,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
case PTR_TO_SOCKET:
valid = bpf_sock_is_valid_access(off, size, t, &info);
break;
case PTR_TO_TCP_SOCK:
valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
break;
default:
valid = false;
}
......@@ -1823,6 +1832,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_SOCK_COMMON:
pointer_desc = "sock_common ";
break;
case PTR_TO_TCP_SOCK:
pointer_desc = "tcp_sock ";
break;
default:
break;
}
......@@ -3148,6 +3160,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
/* For mark_ptr_or_null_reg() */
regs[BPF_REG_0].id = ++env->id_gen;
}
} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
......@@ -3409,6 +3425,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
......@@ -4644,6 +4662,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
reg->type = PTR_TO_SOCKET;
} else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) {
reg->type = PTR_TO_SOCK_COMMON;
} else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) {
reg->type = PTR_TO_TCP_SOCK;
}
if (is_null || !(reg_is_refcounted(reg) ||
reg_may_point_to_spin_lock(reg))) {
......@@ -5839,6 +5859,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
......@@ -6161,6 +6183,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
return false;
default:
return true;
......@@ -7166,6 +7190,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
case PTR_TO_SOCK_COMMON:
convert_ctx_access = bpf_sock_convert_ctx_access;
break;
case PTR_TO_TCP_SOCK:
convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
break;
default:
continue;
}
......
......@@ -5315,6 +5315,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.arg5_type = ARG_ANYTHING,
};
/* Decide whether an access of @size bytes at byte offset @off into
 * struct bpf_tcp_sock is permitted.
 *
 * @off:  byte offset into struct bpf_tcp_sock
 * @size: width of the access in bytes
 *        NOTE(review): assumed to be non-zero, it is used as a divisor
 * @type: not consulted here
 * @info: not consulted here
 *
 * Returns true only for a naturally aligned, full-width access to a single
 * member: 8 bytes for bytes_received and bytes_acked, 4 bytes for every
 * other member.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
				  struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
		return false;

	if (off % size != 0)
		return false;

	/* Match the whole byte range of each 64-bit counter, not only its
	 * first byte.  Comparing against the start offset alone would let a
	 * 4-byte access to the upper half of a counter fall through to the
	 * 32-bit rule below and be accepted.
	 */
	if ((off >= offsetof(struct bpf_tcp_sock, bytes_received) &&
	     off < offsetofend(struct bpf_tcp_sock, bytes_received)) ||
	    (off >= offsetof(struct bpf_tcp_sock, bytes_acked) &&
	     off < offsetofend(struct bpf_tcp_sock, bytes_acked)))
		return size == sizeof(__u64);

	return size == sizeof(__u32);
}
/* Translate a load from struct bpf_tcp_sock (described by @si) into a load
 * from the corresponding member of struct tcp_sock.
 *
 * Replacement instructions are written to @insn_buf, reusing si->dst_reg and
 * si->src_reg.  The return value is the number of instructions written; it
 * is 0 when si->off matches none of the handled members.
 *
 * @type, @prog and @target_size are not referenced directly in this body.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
/* Emitter for members that have the same name in both structures: checks at
 * build time that the tcp_sock member is not wider than its bpf_tcp_sock
 * counterpart, then emits one load of the tcp_sock member at its own width.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD) \
do { \
BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \
FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
si->dst_reg, si->src_reg, \
offsetof(struct tcp_sock, FIELD)); \
} while (0)
/* CONVERT_COMMON_TCP_SOCK_FIELDS is defined outside this view; it is handed
 * the emitter above and is expected to invoke it for the member selected by
 * si->off, if any -- TODO confirm against its definition.
 */
CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
BPF_TCP_SOCK_GET_COMMON);
/* Something was emitted for a common member: done. */
if (insn > insn_buf)
return insn - insn_buf;
switch (si->off) {
case offsetof(struct bpf_tcp_sock, rtt_min):
/* tcp_sock.rtt_min is a struct minmax, not a plain integer.  The
 * build-time checks confirm that and that a struct minmax_sample fits
 * inside it; the emitted 32-bit load reads the 'v' member of a
 * minmax_sample laid over the start of rtt_min.
 */
BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
sizeof(struct minmax));
BUILD_BUG_ON(sizeof(struct minmax) <
sizeof(struct minmax_sample));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
}
return insn - insn_buf;
}
/* bpf_tcp_sock() helper body.
 *
 * Maps @sk through sk_to_full_sk() and hands the result back (as an
 * unsigned long) only when it is a full socket whose protocol is
 * IPPROTO_TCP.  In every other case the helper yields NULL.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
	struct sock *full_sk = sk_to_full_sk(sk);

	if (!sk_fullsock(full_sk))
		return (unsigned long)NULL;

	if (full_sk->sk_protocol != IPPROTO_TCP)
		return (unsigned long)NULL;

	return (unsigned long)full_sk;
}
/* Prototype for the bpf_tcp_sock() helper: not restricted to GPL programs,
 * takes one sock_common pointer argument and returns a tcp_sock pointer
 * that may be NULL.
 */
static const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
.arg1_type = ARG_PTR_TO_SOCK_COMMON,
};
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
......@@ -5470,6 +5543,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
default:
return sk_filter_func_proto(func_id, prog);
}
......@@ -5560,6 +5637,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
default:
return bpf_base_func_proto(func_id);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment