Commit 46f8bc92 authored by Martin KaFai Lau's avatar Martin KaFai Lau Committed by Alexei Starovoitov

bpf: Add a bpf_sock pointer to __sk_buff and a bpf_sk_fullsock helper

In kernel, it is common to check "skb->sk && sk_fullsock(skb->sk)"
before accessing the fields in sock.  For example, in __netdev_pick_tx:

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
			    struct net_device *sb_dev)
{
	/* ... */

	struct sock *sk = skb->sk;

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

	/* ... */

	return queue_index;
}

This patch adds a "struct bpf_sock *sk" pointer to the "struct __sk_buff"
where a few of the convert_ctx_access() in filter.c has already been
accessing the skb->sk sock_common's fields,
e.g. sock_ops_convert_ctx_access().

"__sk_buff->sk" is a PTR_TO_SOCK_COMMON_OR_NULL in the verifier.
Some of the fileds in "bpf_sock" will not be directly
accessible through the "__sk_buff->sk" pointer.  It is limited
by the new "bpf_sock_common_is_valid_access()".
e.g. The existing "type", "protocol", "mark" and "priority" in bpf_sock
     are not allowed.

The newly added "struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)"
can be used to get a sk with all accessible fields in "bpf_sock".
This helper is added to both cg_skb and sched_(cls|act).

int cg_skb_foo(struct __sk_buff *skb) {
	struct bpf_sock *sk;

	sk = skb->sk;
	if (!sk)
		return 1;

	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;

	if (sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP)
		return 1;

	/* some_traffic_shaping(); */

	return 1;
}

(1) The sk is read only

(2) There is no new "struct bpf_sock_common" introduced.

(3) Future kernel sock's members could be added to bpf_sock only
    instead of repeatedly adding at multiple places like currently
    in bpf_sock_ops_md, bpf_sock_addr_md, sk_reuseport_md...etc.

(4) After "sk = skb->sk", the reg holding sk is in type
    PTR_TO_SOCK_COMMON_OR_NULL.

(5) After bpf_sk_fullsock(), the return type will be in type
    PTR_TO_SOCKET_OR_NULL which is the same as the return type of
    bpf_sk_lookup_xxx().

    However, bpf_sk_fullsock() does not take refcnt.  The
    acquire_reference_state() is only depending on the return type now.
    To avoid it, a new is_acquire_function() is checked before calling
    acquire_reference_state().

(6) The WARN_ON in "release_reference_state()" is no longer an
    internal verifier bug.

    When reg->id is not found in state->refs[], it means the
    bpf_prog does something wrong like
    "bpf_sk_release(bpf_sk_fullsock(skb->sk))" where reference has
    never been acquired by calling "bpf_sk_fullsock(skb->sk)".

    A -EINVAL and a verbose are done instead of WARN_ON.  A test is
    added to the test_verifier in a later patch.

    Since the WARN_ON in "release_reference_state()" is no longer
    needed, "__release_reference_state()" is folded into
    "release_reference_state()" also.
Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarMartin KaFai Lau <kafai@fb.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parent 5f456649
......@@ -194,6 +194,7 @@ enum bpf_arg_type {
ARG_ANYTHING, /* any (initialized) argument is ok */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
};
/* type of values returned from helper functions */
......@@ -256,6 +257,8 @@ enum bpf_reg_type {
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
};
/* The information passed from prog-specific *_is_valid_access
......@@ -920,6 +923,9 @@ void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#if defined(CONFIG_NET)
bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info);
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info);
u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
......@@ -928,6 +934,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
struct bpf_prog *prog,
u32 *target_size);
#else
static inline bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
return false;
}
static inline bool bpf_sock_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
......
......@@ -2329,6 +2329,14 @@ union bpf_attr {
* "**y**".
* Return
* 0
*
* struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)
* Description
* This helper gets a **struct bpf_sock** pointer such
* that all the fields in bpf_sock can be accessed.
* Return
* A **struct bpf_sock** pointer on success, or NULL in
* case of failure.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2425,7 +2433,8 @@ union bpf_attr {
FN(msg_pop_data), \
FN(rc_pointer_rel), \
FN(spin_lock), \
FN(spin_unlock),
FN(spin_unlock), \
FN(sk_fullsock),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -2545,6 +2554,7 @@ struct __sk_buff {
__u64 tstamp;
__u32 wire_len;
__u32 gso_segs;
__bpf_md_ptr(struct bpf_sock *, sk);
};
struct bpf_tunnel_key {
......
This diff is collapsed.
......@@ -1793,6 +1793,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
}
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
.func = bpf_sk_fullsock,
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_SOCK_COMMON,
};
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
......@@ -5406,6 +5420,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
default:
return sk_filter_func_proto(func_id, prog);
}
......@@ -5477,6 +5493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
#ifdef CONFIG_XFRM
case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto;
......@@ -5764,6 +5782,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != sizeof(__u64))
return false;
break;
case offsetof(struct __sk_buff, sk):
if (type == BPF_WRITE || size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
break;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
......@@ -5950,6 +5973,18 @@ static bool __sock_filter_check_size(int off, int size,
return size == size_default;
}
bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
switch (off) {
case bpf_ctx_range_till(struct bpf_sock, type, priority):
return false;
default:
return bpf_sock_is_valid_access(off, size, type, info);
}
}
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
......@@ -6748,6 +6783,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off += offsetof(struct qdisc_skb_cb, pkt_len);
*target_size = 4;
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
break;
case offsetof(struct __sk_buff, sk):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, sk));
break;
}
return insn - insn_buf;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment