Commit 46531a30 authored by Pavel Begunkov, committed by Alexei Starovoitov

cgroup/bpf: fast path skb BPF filtering

Even though a static key protects against the overhead of cgroup-bpf skb
filtering when nothing is attached, in many cases it's not enough:
registering a filter for one type ruins the fast path for all the others.
This is observed on production servers I've looked at, but also on laptops,
where registration is done during init by systemd or something else.

Add a per-socket fast path check guarding against such overhead. This
affects both the receive and transmit paths of TCP, UDP and other
protocols. It showed a ~1% tx/s improvement in small-payload UDP send
benchmarks using a real NIC in a server environment, and the number
jumps to 2-3% for preemptible kernels.
Reviewed-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/r/d8c58857113185a764927a46f4b5a058d36d3ec3.1643292455.git.asml.silence@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent cdb5ed97
@@ -8,6 +8,7 @@
 #include <linux/jump_label.h>
 #include <linux/percpu.h>
 #include <linux/rbtree.h>
+#include <net/sock.h>
 #include <uapi/linux/bpf.h>
 
 struct sock;
@@ -165,11 +166,23 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                      void *value, u64 flags);
 
+/* Opportunistic check to see whether we have any BPF program attached */
+static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
+                                           enum cgroup_bpf_attach_type type)
+{
+        struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+        struct bpf_prog_array *array;
+
+        array = rcu_access_pointer(cgrp->bpf.effective[type]);
+        return array != &bpf_empty_prog_array.hdr;
+}
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)                             \
 ({                                                                            \
         int __ret = 0;                                                        \
-        if (cgroup_bpf_enabled(CGROUP_INET_INGRESS))                          \
+        if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) &&                        \
+            cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS))                 \
                 __ret = __cgroup_bpf_run_filter_skb(sk, skb,                  \
                                                     CGROUP_INET_INGRESS);     \
                                                                               \
@@ -181,7 +194,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
         int __ret = 0;                                                        \
         if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) {  \
                 typeof(sk) __sk = sk_to_full_sk(sk);                          \
-                if (sk_fullsock(__sk))                                        \
+                if (sk_fullsock(__sk) &&                                      \
+                    cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS))        \
                         __ret = __cgroup_bpf_run_filter_skb(__sk, skb,        \
                                                     CGROUP_INET_EGRESS);      \
         }                                                                     \
@@ -347,7 +361,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                        kernel_optval)                         \
 ({                                                                            \
         int __ret = 0;                                                        \
-        if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT))                            \
+        if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) &&                          \
+            cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT))                 \
                 __ret = __cgroup_bpf_run_filter_setsockopt(sock, level,       \
                                                            optname, optval,   \
                                                            optlen,            \
@@ -367,7 +382,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
                                        max_optlen, retval)                    \
 ({                                                                            \
         int __ret = retval;                                                   \
-        if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT))                            \
+        if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) &&                          \
+            cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT))                 \
                 if (!(sock)->sk_prot->bpf_bypass_getsockopt ||                \
                     !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \
                                           tcp_bpf_bypass_getsockopt,          \
...
@@ -1233,6 +1233,19 @@ struct bpf_prog_array {
         struct bpf_prog_array_item items[];
 };
 
+struct bpf_empty_prog_array {
+        struct bpf_prog_array hdr;
+        struct bpf_prog *null_prog;
+};
+
+/* To avoid allocating an empty bpf_prog_array for cgroups that don't have
+ * any bpf program attached, use the one global 'bpf_empty_prog_array'.
+ * It will not be modified by the caller of bpf_prog_array_alloc()
+ * (since the caller requested prog_cnt == 0), and that pointer should
+ * still be 'freed' with bpf_prog_array_free().
+ */
+extern struct bpf_empty_prog_array bpf_empty_prog_array;
+
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array *progs);
 int bpf_prog_array_length(struct bpf_prog_array *progs);
...
@@ -1384,20 +1384,6 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 
 #ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
-                                             enum cgroup_bpf_attach_type attach_type)
-{
-        struct bpf_prog_array *prog_array;
-        bool empty;
-
-        rcu_read_lock();
-        prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
-        empty = bpf_prog_array_is_empty(prog_array);
-        rcu_read_unlock();
-
-        return empty;
-}
-
 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
                              struct bpf_sockopt_buf *buf)
 {
@@ -1456,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
         };
         int ret, max_optlen;
 
-        /* Opportunistic check to see whether we have any BPF program
-         * attached to the hook so we don't waste time allocating
-         * memory and locking the socket.
-         */
-        if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
-                return 0;
-
         /* Allocate a bit more than the initial user buffer for
          * BPF program. The canonical use case is overriding
          * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
          */
         max_optlen = max_t(int, 16, *optlen);
-
         max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
         if (max_optlen < 0)
                 return max_optlen;
@@ -1550,15 +1528,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
         };
         int ret;
 
-        /* Opportunistic check to see whether we have any BPF program
-         * attached to the hook so we don't waste time allocating
-         * memory and locking the socket.
-         */
-        if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
-                return retval;
-
         ctx.optlen = max_optlen;
-
         max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
         if (max_optlen < 0)
                 return max_optlen;
...
@@ -1968,18 +1968,10 @@ static struct bpf_prog_dummy {
         },
 };
 
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
-        struct bpf_prog_array hdr;
-        struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
         .null_prog = NULL,
 };
+EXPORT_SYMBOL(bpf_empty_prog_array);
 
 struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
 {
@@ -1989,12 +1981,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
                                (prog_cnt + 1),
                                flags);
 
-        return &empty_prog_array.hdr;
+        return &bpf_empty_prog_array.hdr;
 }
 
 void bpf_prog_array_free(struct bpf_prog_array *progs)
 {
-        if (!progs || progs == &empty_prog_array.hdr)
+        if (!progs || progs == &bpf_empty_prog_array.hdr)
                 return;
         kfree_rcu(progs, rcu);
 }
...
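
A note on the design choice visible in the diff: cgroup_bpf_sock_enabled()
uses rcu_access_pointer() rather than rcu_dereference() because the
effective[] pointer is only compared against &bpf_empty_prog_array.hdr and
never dereferenced, so no RCU read-side critical section is needed. That is
also what lets the patch drop the old __cgroup_bpf_prog_array_is_empty()
helper, which took rcu_read_lock() and walked the program array to answer
the same question.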