Commit d9e8d14b authored by Andrii Nakryiko

Merge branch 'bpf: Allow bpf tcp iter to do bpf_(get|set)sockopt'

Martin KaFai says:

====================

This set is to allow bpf tcp iter to call bpf_(get|set)sockopt.

With bpf-tcp-cc, new algo rollout happens more often.  Instead of
restarting the applications to pick up the new tcp-cc, this set
allows the bpf tcp iter to call bpf_(get|set)sockopt(TCP_CONGESTION).
It is not limited to TCP_CONGESTION, the bpf tcp iter can call
bpf_(get|set)sockopt() with other options.  The bpf tcp iter can read
into all the fields of a tcp_sock, so there is a lot of flexibility
to select the desired sk to do setsockopt(), e.g. it can test for
TCP_LISTEN only and leave the established connections untouched,
or check the addr/port, or check the current tcp-cc name, ...etc.

Patch 1-4 are some cleanup and prep work in the tcp and bpf seq_file.

Patch 5 is to have the tcp seq_file iterate on the
port+addr lhash2 instead of the port only listening_hash.

Patch 6 is to have the bpf tcp iter do batching, which
then allows lock_sock.  lock_sock is needed for setsockopt.

Patch 7 allows the bpf tcp iter to call bpf_(get|set)sockopt.

v2:
- Use __GFP_NOWARN in patch 6
- Add bpf_getsockopt() in patch 7 to give a symmetrical user experience.
  selftest in patch 8 is changed to also cover bpf_getsockopt().
- Remove CAP_NET_ADMIN check in patch 7. Tracing bpf prog has already
  required CAP_SYS_ADMIN or CAP_PERFMON.
- Move some def macros to bpf_tracing_net.h in patch 8
====================
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
parents da97553e eed92afd
......@@ -1442,6 +1442,9 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux,
struct seq_file *seq);
typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux,
struct bpf_link_info *info);
/* Signature of the optional per-target get_func_proto hook stored in
 * struct bpf_iter_reg: given a helper id and the program, it returns the
 * matching bpf_func_proto pointer.
 */
typedef const struct bpf_func_proto *
(*bpf_iter_get_func_proto_t)(enum bpf_func_id func_id,
const struct bpf_prog *prog);
enum bpf_iter_feature {
BPF_ITER_RESCHED = BIT(0),
......@@ -1454,6 +1457,7 @@ struct bpf_iter_reg {
bpf_iter_detach_target_t detach_target;
bpf_iter_show_fdinfo_t show_fdinfo;
bpf_iter_fill_link_info_t fill_link_info;
bpf_iter_get_func_proto_t get_func_proto;
u32 ctx_arg_info_size;
u32 feature;
struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX];
......@@ -1476,6 +1480,8 @@ struct bpf_iter__bpf_map_elem {
int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info);
void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info);
bool bpf_iter_prog_supported(struct bpf_prog *prog);
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog);
int bpf_iter_new_fd(struct bpf_link *link);
bool bpf_link_is_iter(struct bpf_link *link);
......@@ -2050,6 +2056,8 @@ extern const struct bpf_func_proto bpf_task_storage_get_proto;
extern const struct bpf_func_proto bpf_task_storage_delete_proto;
extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);
......
......@@ -160,6 +160,12 @@ struct inet_hashinfo {
____cacheline_aligned_in_smp;
};
/* Iterators over entries linked through icsk_listen_portaddr_node.
 * The _continue form takes no list head and resumes from the current
 * __icsk; the other two walk `list` from its start, the _rcu one via
 * hlist_for_each_entry_rcu().
 */
#define inet_lhash2_for_each_icsk_continue(__icsk) \
hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk(__icsk, list) \
hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
#define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
......
......@@ -1959,7 +1959,6 @@ struct tcp_iter_state {
struct seq_net_private p;
enum tcp_seq_states state;
struct sock *syn_wait_sk;
struct tcp_seq_afinfo *bpf_seq_afinfo;
int bucket, offset, sbucket, num;
loff_t last_pos;
};
......
......@@ -360,6 +360,28 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog)
return supported;
}
/* Look up a helper prototype through the iterator target the program is
 * attached to.
 *
 * Walks the registered targets under targets_mutex, picks the one whose
 * btf_id equals prog->aux->attach_btf_id, and asks its optional
 * get_func_proto hook for @func_id.
 *
 * Returns the hook's result, or NULL when no target matches or the matching
 * target has no hook.
 */
const struct bpf_func_proto *
bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	const struct bpf_func_proto *proto = NULL;
	const struct bpf_iter_target_info *cur;

	mutex_lock(&targets_mutex);
	list_for_each_entry(cur, &targets, list) {
		if (cur->btf_id != prog->aux->attach_btf_id)
			continue;

		/* Only the first matching target is consulted. */
		if (cur->reg_info->get_func_proto)
			proto = cur->reg_info->get_func_proto(func_id, prog);
		break;
	}
	mutex_unlock(&targets_mutex);

	return proto;
}
static void bpf_iter_link_release(struct bpf_link *link)
{
struct bpf_iter_link *iter_link =
......
......@@ -1461,6 +1461,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
const struct bpf_func_proto *fn;
switch (func_id) {
#ifdef CONFIG_NET
case BPF_FUNC_skb_output:
......@@ -1501,7 +1503,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_d_path:
return &bpf_d_path_proto;
default:
return raw_tp_prog_func_proto(func_id, prog);
fn = raw_tp_prog_func_proto(func_id, prog);
if (!fn && prog->expected_attach_type == BPF_TRACE_ITER)
fn = bpf_iter_get_func_proto(func_id, prog);
return fn;
}
}
......
......@@ -5016,6 +5016,40 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}
/* setsockopt helper taking a BTF sock_common pointer as its first argument;
 * all the work is delegated to _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	int err = _bpf_setsockopt(sk, level, optname, optval, optlen);

	return err;
}

/* Prototype for bpf_sk_setsockopt: (sk, level, optname, optval, optlen),
 * where optval/optlen form a memory-pointer + constant-size pair.
 */
const struct bpf_func_proto bpf_sk_setsockopt_proto = {
	.func		= bpf_sk_setsockopt,
	.ret_type	= RET_INTEGER,
	.gpl_only	= false,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,	/* sk */
	.arg2_type	= ARG_ANYTHING,				/* level */
	.arg3_type	= ARG_ANYTHING,				/* optname */
	.arg4_type	= ARG_PTR_TO_MEM,			/* optval */
	.arg5_type	= ARG_CONST_SIZE,			/* optlen */
};
/* getsockopt helper taking a BTF sock_common pointer as its first argument;
 * all the work is delegated to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
	   int, optname, char *, optval, int, optlen)
{
	int err = _bpf_getsockopt(sk, level, optname, optval, optlen);

	return err;
}

/* Prototype for bpf_sk_getsockopt: (sk, level, optname, optval, optlen).
 * Unlike the setsockopt prototype, optval is declared as uninitialized
 * memory.
 */
const struct bpf_func_proto bpf_sk_getsockopt_proto = {
	.func		= bpf_sk_getsockopt,
	.ret_type	= RET_INTEGER,
	.gpl_only	= false,
	.arg1_type	= ARG_PTR_TO_BTF_ID_SOCK_COMMON,	/* sk */
	.arg2_type	= ARG_ANYTHING,				/* level */
	.arg3_type	= ARG_ANYTHING,				/* optname */
	.arg4_type	= ARG_PTR_TO_UNINIT_MEM,		/* optval */
	.arg5_type	= ARG_CONST_SIZE,			/* optlen */
};
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
int, level, int, optname, char *, optval, int, optlen)
{
......
This diff is collapsed.
......@@ -66,17 +66,13 @@ int settimeo(int fd, int timeout_ms)
#define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; })
int start_server(int family, int type, const char *addr_str, __u16 port,
int timeout_ms)
static int __start_server(int type, const struct sockaddr *addr,
socklen_t addrlen, int timeout_ms, bool reuseport)
{
struct sockaddr_storage addr = {};
socklen_t len;
int on = 1;
int fd;
if (make_sockaddr(family, addr_str, port, &addr, &len))
return -1;
fd = socket(family, type, 0);
fd = socket(addr->sa_family, type, 0);
if (fd < 0) {
log_err("Failed to create server socket");
return -1;
......@@ -85,7 +81,13 @@ int start_server(int family, int type, const char *addr_str, __u16 port,
if (settimeo(fd, timeout_ms))
goto error_close;
if (bind(fd, (const struct sockaddr *)&addr, len) < 0) {
/* Optionally enable SO_REUSEPORT before bind().  On failure go through
 * error_close like the other failure paths so the freshly created fd is
 * closed instead of leaked.
 */
if (reuseport &&
    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on))) {
	log_err("Failed to set SO_REUSEPORT");
	goto error_close;
}
if (bind(fd, addr, addrlen) < 0) {
log_err("Failed to bind socket");
goto error_close;
}
......@@ -104,6 +106,69 @@ int start_server(int family, int type, const char *addr_str, __u16 port,
return -1;
}
/* Build a socket address from (family, addr_str, port) and start a single
 * server socket on it without SO_REUSEPORT.
 *
 * Returns whatever __start_server() returns, or -1 if the address could not
 * be constructed.
 */
int start_server(int family, int type, const char *addr_str, __u16 port,
		 int timeout_ms)
{
	struct sockaddr_storage ss;
	socklen_t ss_len;
	int err;

	err = make_sockaddr(family, addr_str, port, &ss, &ss_len);
	if (err)
		return -1;

	return __start_server(type, (struct sockaddr *)&ss, ss_len,
			      timeout_ms, false);
}
/* Start @nr_listens server sockets that all have SO_REUSEPORT requested.
 *
 * The first socket is started on the address built from
 * (family, addr_str, port).  Its bound address is then read back with
 * getsockname() and reused for every remaining socket.
 *
 * Returns a malloc()ed array of @nr_listens fds that the caller releases
 * with free_fds(), or NULL on any failure (fds opened so far are closed).
 */
int *start_reuseport_server(int family, int type, const char *addr_str,
			    __u16 port, int timeout_ms, unsigned int nr_listens)
{
	struct sockaddr_storage ss;
	unsigned int opened = 0;
	socklen_t ss_len;
	int *fds;

	if (nr_listens == 0)
		return NULL;

	if (make_sockaddr(family, addr_str, port, &ss, &ss_len) != 0)
		return NULL;

	fds = malloc(sizeof(*fds) * nr_listens);
	if (fds == NULL)
		return NULL;

	fds[0] = __start_server(type, (struct sockaddr *)&ss, ss_len,
				timeout_ms, true);
	if (fds[0] == -1)
		goto close_fds;
	opened = 1;

	/* From here on use the address the first socket actually got. */
	if (getsockname(fds[0], (struct sockaddr *)&ss, &ss_len) != 0)
		goto close_fds;

	while (opened < nr_listens) {
		fds[opened] = __start_server(type, (struct sockaddr *)&ss,
					     ss_len, timeout_ms, true);
		if (fds[opened] == -1)
			goto close_fds;
		opened++;
	}

	return fds;

close_fds:
	/* Only the first `opened` entries hold valid descriptors. */
	free_fds(fds, opened);
	return NULL;
}
/* Close the first @nr_close_fds descriptors stored in @fds (last one first)
 * and free the array itself.  A NULL @fds is ignored.
 */
void free_fds(int *fds, unsigned int nr_close_fds)
{
	unsigned int idx;

	if (!fds)
		return;

	for (idx = nr_close_fds; idx > 0; idx--)
		close(fds[idx - 1]);

	free(fds);
}
int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
int timeout_ms)
{
......@@ -217,6 +282,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port,
if (family == AF_INET) {
struct sockaddr_in *sin = (void *)addr;
memset(addr, 0, sizeof(*sin));
sin->sin_family = AF_INET;
sin->sin_port = htons(port);
if (addr_str &&
......@@ -230,6 +296,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port,
} else if (family == AF_INET6) {
struct sockaddr_in6 *sin6 = (void *)addr;
memset(addr, 0, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_port = htons(port);
if (addr_str &&
......
......@@ -36,6 +36,10 @@ extern struct ipv6_packet pkt_v6;
int settimeo(int fd, int timeout_ms);
int start_server(int family, int type, const char *addr, __u16 port,
int timeout_ms);
int *start_reuseport_server(int family, int type, const char *addr_str,
__u16 port, int timeout_ms,
unsigned int nr_listens);
void free_fds(int *fds, unsigned int nr_close_fds);
int connect_to_fd(int server_fd, int timeout_ms);
int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms);
int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#define _GNU_SOURCE
#include <sched.h>
#include <test_progs.h>
#include "network_helpers.h"
#include "bpf_dctcp.skel.h"
#include "bpf_cubic.skel.h"
#include "bpf_iter_setsockopt.skel.h"
/* Move the calling process into a new network namespace and bring the
 * loopback device up there.  Returns 0 on success, -1 if either step fails
 * (the failure is recorded through ASSERT_OK).
 */
static int create_netns(void)
{
	int err;

	err = unshare(CLONE_NEWNET);
	if (!ASSERT_OK(err, "create netns"))
		return -1;

	err = system("ip link set dev lo up");
	if (!ASSERT_OK(err, "bring up lo"))
		return -1;

	return 0;
}
/* Set TCP_CONGESTION to "bpf_cubic" on each of the @nr_fds sockets in @fds,
 * stopping at the first failure.
 *
 * Returns the number of sockets successfully switched; this equals @nr_fds
 * when every setsockopt() succeeded.
 */
static unsigned int set_bpf_cubic(int *fds, unsigned int nr_fds)
{
	unsigned int done = 0;

	while (done < nr_fds) {
		if (setsockopt(fds[done], SOL_TCP, TCP_CONGESTION,
			       "bpf_cubic", sizeof("bpf_cubic")) != 0)
			break;
		done++;
	}

	return done;
}
/* Verify that each of the @nr_fds sockets in @fds reports "bpf_dctcp" as its
 * TCP_CONGESTION value, stopping at the first socket that does not (or on
 * which getsockopt() fails).
 *
 * Returns the number of leading sockets that passed; this equals @nr_fds
 * when all of them did.
 */
static unsigned int check_bpf_dctcp(int *fds, unsigned int nr_fds)
{
	unsigned int i;

	for (i = 0; i < nr_fds; i++) {
		/* optlen is value-result: getsockopt() overwrites it with the
		 * returned length, so it (and the buffer) are re-armed for
		 * every socket rather than carried over from the last call.
		 */
		char tcp_cc[16] = "";
		socklen_t optlen = sizeof(tcp_cc);

		if (getsockopt(fds[i], SOL_TCP, TCP_CONGESTION,
			       tcp_cc, &optlen) ||
		    strcmp(tcp_cc, "bpf_dctcp"))
			return i;
	}

	return nr_fds;
}
/* Create @nr_est connections to @listen_fd.
 *
 * For each connection the connecting socket is switched to "bpf_cubic" via
 * set_bpf_cubic() and then the peer is accept()ed from @listen_fd.
 *
 * On success returns a malloc()ed array with the connecting fds and stores a
 * second malloc()ed array with the accepted fds in *@paccepted_fds; both are
 * to be released with free_fds().  On any failure everything created so far
 * is closed and freed, *@paccepted_fds is left untouched, and NULL is
 * returned.
 */
static int *make_established(int listen_fd, unsigned int nr_est,
			     int **paccepted_fds)
{
	int *clients, *servers;
	unsigned int n = 0;

	clients = malloc(sizeof(*clients) * nr_est);
	if (!clients)
		return NULL;

	servers = malloc(sizeof(*servers) * nr_est);
	if (!servers) {
		free(clients);
		return NULL;
	}

	while (n < nr_est) {
		clients[n] = connect_to_fd(listen_fd, 0);
		if (clients[n] == -1)
			break;

		if (set_bpf_cubic(&clients[n], 1) != 1) {
			/* Slot n is not counted, so close it here. */
			close(clients[n]);
			break;
		}

		servers[n] = accept(listen_fd, NULL, NULL);
		if (servers[n] == -1) {
			close(clients[n]);
			break;
		}

		n++;
	}

	/* n < nr_est means the loop bailed out; slots [0, n) are valid in
	 * both arrays.
	 */
	if (!ASSERT_EQ(n, nr_est, "create established fds")) {
		free_fds(servers, n);
		free_fds(clients, n);
		return NULL;
	}

	*paccepted_fds = servers;
	return clients;
}
/* Return the local port of socket @fd in host byte order, or 0 when
 * getsockname() fails.
 *
 * The address is read as a struct sockaddr_in6.  The explicit cast to
 * struct sockaddr * keeps the getsockname() call valid without relying on
 * the _GNU_SOURCE transparent-union prototype.
 */
static unsigned short get_local_port(int fd)
{
	struct sockaddr_in6 addr = {0};
	socklen_t addrlen = sizeof(addr);

	if (getsockname(fd, (struct sockaddr *)&addr, &addrlen))
		return 0;

	return ntohs(addr.sin6_port);
}
/* One full pass of the test: set up listening, reuseport-listening,
 * connecting and accepted IPv6 sockets on ::1 (the listening and connecting
 * ones are explicitly set to bpf_cubic), run the change_tcp_cc iterator
 * once, then check that every one of those sockets reports bpf_dctcp.
 *
 * @iter_skel:    loaded skeleton whose change_tcp_cc link is already attached
 * @random_retry: value copied into the BPF program's random_retry variable
 */
static void do_bpf_iter_setsockopt(struct bpf_iter_setsockopt *iter_skel,
bool random_retry)
{
int *reuse_listen_fds = NULL, *accepted_fds = NULL, *est_fds = NULL;
unsigned int nr_reuse_listens = 256, nr_est = 256;
int err, iter_fd = -1, listen_fd = -1;
char buf;
/* Prepare non-reuseport listen_fd */
listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
if (!ASSERT_GE(listen_fd, 0, "start_server"))
return;
if (!ASSERT_EQ(set_bpf_cubic(&listen_fd, 1), 1,
"set listen_fd to cubic"))
goto done;
/* Tell the BPF program which local port the plain listener is on */
iter_skel->bss->listen_hport = get_local_port(listen_fd);
if (!ASSERT_NEQ(iter_skel->bss->listen_hport, 0,
"get_local_port(listen_fd)"))
goto done;
/* Connect to non-reuseport listen_fd */
est_fds = make_established(listen_fd, nr_est, &accepted_fds);
if (!ASSERT_OK_PTR(est_fds, "create established"))
goto done;
/* Prepare reuseport listen fds */
reuse_listen_fds = start_reuseport_server(AF_INET6, SOCK_STREAM,
"::1", 0, 0,
nr_reuse_listens);
if (!ASSERT_OK_PTR(reuse_listen_fds, "start_reuseport_server"))
goto done;
if (!ASSERT_EQ(set_bpf_cubic(reuse_listen_fds, nr_reuse_listens),
nr_reuse_listens, "set reuse_listen_fds to cubic"))
goto done;
/* Tell the BPF program which local port the reuseport listeners share */
iter_skel->bss->reuse_listen_hport = get_local_port(reuse_listen_fds[0]);
if (!ASSERT_NEQ(iter_skel->bss->reuse_listen_hport, 0,
"get_local_port(reuse_listen_fds[0])"))
goto done;
/* Run bpf tcp iter to switch from bpf_cubic to bpf_dctcp */
iter_skel->bss->random_retry = random_retry;
iter_fd = bpf_iter_create(bpf_link__fd(iter_skel->links.change_tcp_cc));
if (!ASSERT_GE(iter_fd, 0, "create iter_fd"))
goto done;
/* Keep reading while read() fails with EAGAIN; any other result ends
 * the loop and is checked below.
 */
while ((err = read(iter_fd, &buf, sizeof(buf))) == -1 &&
errno == EAGAIN)
;
if (!ASSERT_OK(err, "read iter error"))
goto done;
/* Check reuseport listen fds for dctcp */
ASSERT_EQ(check_bpf_dctcp(reuse_listen_fds, nr_reuse_listens),
nr_reuse_listens,
"check reuse_listen_fds dctcp");
/* Check non reuseport listen fd for dctcp */
ASSERT_EQ(check_bpf_dctcp(&listen_fd, 1), 1,
"check listen_fd dctcp");
/* Check established fds for dctcp */
ASSERT_EQ(check_bpf_dctcp(est_fds, nr_est), nr_est,
"check est_fds dctcp");
/* Check accepted fds for dctcp */
ASSERT_EQ(check_bpf_dctcp(accepted_fds, nr_est), nr_est,
"check accepted_fds dctcp");
done:
/* free_fds() ignores NULL arrays, so this is safe on every exit path */
if (iter_fd != -1)
close(iter_fd);
if (listen_fd != -1)
close(listen_fd);
free_fds(reuse_listen_fds, nr_reuse_listens);
free_fds(accepted_fds, nr_est);
free_fds(est_fds, nr_est);
}
/* Test entry point: in a fresh network namespace, load and attach the
 * change_tcp_cc iterator plus the bpf_cubic and bpf_dctcp struct_ops, then
 * run the scenario twice, first with random_retry enabled and then without.
 */
void test_bpf_iter_setsockopt(void)
{
	struct bpf_link *cubic_link = NULL, *dctcp_link = NULL;
	struct bpf_iter_setsockopt *iter = NULL;
	struct bpf_cubic *cubic = NULL;
	struct bpf_dctcp *dctcp = NULL;

	if (create_netns())
		return;

	/* Iterator program */
	iter = bpf_iter_setsockopt__open_and_load();
	if (!ASSERT_OK_PTR(iter, "iter_skel"))
		return;

	iter->links.change_tcp_cc =
		bpf_program__attach_iter(iter->progs.change_tcp_cc, NULL);
	if (!ASSERT_OK_PTR(iter->links.change_tcp_cc, "attach iter"))
		goto done;

	/* bpf_cubic struct_ops */
	cubic = bpf_cubic__open_and_load();
	if (!ASSERT_OK_PTR(cubic, "cubic_skel"))
		goto done;

	cubic_link = bpf_map__attach_struct_ops(cubic->maps.cubic);
	if (!ASSERT_OK_PTR(cubic_link, "cubic_link"))
		goto done;

	/* bpf_dctcp struct_ops */
	dctcp = bpf_dctcp__open_and_load();
	if (!ASSERT_OK_PTR(dctcp, "dctcp_skel"))
		goto done;

	dctcp_link = bpf_map__attach_struct_ops(dctcp->maps.dctcp);
	if (!ASSERT_OK_PTR(dctcp_link, "dctcp_link"))
		goto done;

	do_bpf_iter_setsockopt(iter, true);
	do_bpf_iter_setsockopt(iter, false);

done:
	bpf_link__destroy(cubic_link);
	bpf_link__destroy(dctcp_link);
	bpf_cubic__destroy(cubic);
	bpf_dctcp__destroy(dctcp);
	bpf_iter_setsockopt__destroy(iter);
}
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021 Facebook */
#include "bpf_iter.h"
#include "bpf_tracing_net.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
/* Convert a sock_common pointer to a tcp_sock.
 *
 * NOTE: this macro assigns to variables named `sk` and `tp` that must exist
 * in the caller's scope.  Both are reset to NULL first; when skc is non-NULL,
 * tp receives the result of bpf_skc_to_tcp_sock() and sk is the same pointer
 * cast to struct sock *.  The expression evaluates to tp.
 */
#define bpf_tcp_sk(skc) ({ \
struct sock_common *_skc = skc; \
sk = NULL; \
tp = NULL; \
if (_skc) { \
tp = bpf_skc_to_tcp_sock(_skc); \
sk = (struct sock *)tp; \
} \
tp; \
})
unsigned short reuse_listen_hport = 0;
unsigned short listen_hport = 0;
char cubic_cc[TCP_CA_NAME_MAX] = "bpf_cubic";
char dctcp_cc[TCP_CA_NAME_MAX] = "bpf_dctcp";
bool random_retry = false;
/* Compare two congestion-control names.
 *
 * At most TCP_CA_NAME_MAX bytes are examined; the comparison ends early at
 * the first differing byte (not equal) or at a NUL present in both (equal).
 */
static bool tcp_cc_eq(const char *a, const char *b)
{
	int pos = 0;

	while (pos < TCP_CA_NAME_MAX) {
		char ca = a[pos];

		if (ca != b[pos])
			return false;
		if (ca == '\0')
			return true;
		pos++;
	}

	/* All TCP_CA_NAME_MAX bytes matched. */
	return true;
}
/* tcp iterator program: switch matching sockets from cubic_cc to dctcp_cc.
 *
 * A socket is acted on only if it is AF_INET6, is in TCP_LISTEN or
 * TCP_ESTABLISHED, and either its local port is reuse_listen_hport or
 * listen_hport, or its destination port is listen_hport.  Its current
 * congestion control must also equal cubic_cc.
 */
SEC("iter/tcp")
int change_tcp_cc(struct bpf_iter__tcp *ctx)
{
	char cur_cc[TCP_CA_NAME_MAX];
	struct tcp_sock *tp;
	struct sock *sk;

	/* bpf_tcp_sk() fills in both tp and sk. */
	if (!bpf_tcp_sk(ctx->sk_common))
		return 0;

	if (sk->sk_family != AF_INET6)
		return 0;

	if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_ESTABLISHED)
		return 0;

	/* sk_num is compared as-is, sk_dport after bpf_ntohs(). */
	if (sk->sk_num != reuse_listen_hport &&
	    sk->sk_num != listen_hport &&
	    bpf_ntohs(sk->sk_dport) != listen_hport)
		return 0;

	if (bpf_getsockopt(tp, SOL_TCP, TCP_CONGESTION,
			   cur_cc, sizeof(cur_cc)) != 0)
		return 0;

	if (!tcp_cc_eq(cur_cc, cubic_cc))
		return 0;

	/* NOTE(review): returning 1 presumably asks the iterator to retry
	 * this socket later - confirm against the bpf_iter documentation.
	 */
	if (random_retry && bpf_get_prandom_u32() % 4 == 1)
		return 1;

	bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, dctcp_cc, sizeof(dctcp_cc));
	return 0;
}
char _license[] SEC("license") = "GPL";
......@@ -5,6 +5,10 @@
#define AF_INET 2
#define AF_INET6 10
#define SOL_TCP 6
#define TCP_CONGESTION 13
#define TCP_CA_NAME_MAX 16
#define ICSK_TIME_RETRANS 1
#define ICSK_TIME_PROBE0 3
#define ICSK_TIME_LOSS_PROBE 5
......@@ -32,6 +36,8 @@
#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
#define sk_num __sk_common.skc_num
#define sk_dport __sk_common.skc_dport
#define sk_family __sk_common.skc_family
#define sk_rmem_alloc sk_backlog.rmem_alloc
#define sk_refcnt __sk_common.skc_refcnt
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment