Commit 39e8111c authored by Martin KaFai Lau

Merge branch 'add TCP_BPF_SOCK_OPS_CB_FLAGS to bpf_*sockopt()'

Alan Maguire says:

====================
As previously discussed here [1], long-lived sockets can miss
a chance to set additional callbacks if a sock ops program
was not attached early in their lifetime.  Adding support
to bpf_setsockopt() to set callback flags (and bpf_getsockopt()
to retrieve them) provides other opportunities to enable callbacks,
either directly via a cgroup/setsockopt intercepted setsockopt()
or via a socket iterator.

Patch 1 adds bpf_[get|set]sockopt() support; patch 2 adds testing
for it via a sockops program, along with verification via a
cgroup/getsockopt program.

Changes since v1 [2]:

- Removed unneeded READ_ONCE() (Martin, patch 1)
- Reworked sockopt test to leave existing tests undisturbed while adding
  test_nonstandard_opt() test to cover the TCP_BPF_SOCK_OPS_CB_FLAGS
  case; test verifies that value set via bpf_setsockopt() is what we
  expect via a call to getsockopt() which is caught by a
  cgroup/getsockopt program to provide the flags value (Martin, patch 2)
- Removed unneeded iterator test (Martin)

[1] https://lore.kernel.org/bpf/f42f157b-6e52-dd4d-3d97-9b86c84c0b00@oracle.com/
[2] https://lore.kernel.org/bpf/20240802152929.2695863-1-alan.maguire@oracle.com/
====================
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
parents 91d516d4 d5305093
......@@ -2851,7 +2851,7 @@ union bpf_attr {
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
* **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
* **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
* **TCP_BPF_RTO_MIN**.
* **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports the following *optname*\ s:
* **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
......@@ -7080,6 +7080,7 @@ enum {
TCP_BPF_SYN = 1005, /* Copy the TCP header */
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
};
enum {
......
......@@ -5278,6 +5278,11 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
return -EINVAL;
inet_csk(sk)->icsk_rto_min = timeout;
break;
case TCP_BPF_SOCK_OPS_CB_FLAGS:
if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
return -EINVAL;
tp->bpf_sock_ops_cb_flags = val;
break;
default:
return -EINVAL;
}
......@@ -5366,6 +5371,17 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
if (*optlen < 1)
return -EINVAL;
break;
case TCP_BPF_SOCK_OPS_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
if (getopt) {
struct tcp_sock *tp = tcp_sk(sk);
int cb_flags = tp->bpf_sock_ops_cb_flags;
memcpy(optval, &cb_flags, *optlen);
return 0;
}
return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
default:
if (getopt)
return -EINVAL;
......
......@@ -2851,7 +2851,7 @@ union bpf_attr {
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**,
* **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**,
* **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**,
* **TCP_BPF_RTO_MIN**.
* **TCP_BPF_RTO_MIN**, **TCP_BPF_SOCK_OPS_CB_FLAGS**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports the following *optname*\ s:
* **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**.
......@@ -7080,6 +7080,7 @@ enum {
TCP_BPF_SYN = 1005, /* Copy the TCP header */
TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */
TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */
TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
};
enum {
......
......@@ -154,6 +154,51 @@ static void test_ktls(int family)
close(sfd);
}
static void test_nonstandard_opt(int family)
{
struct setget_sockopt__bss *bss = skel->bss;
struct bpf_link *getsockopt_link = NULL;
int sfd = -1, fd = -1, cfd = -1, flags;
socklen_t flagslen = sizeof(flags);
memset(bss, 0, sizeof(*bss));
sfd = start_server(family, SOCK_STREAM,
family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
if (!ASSERT_GE(sfd, 0, "start_server"))
return;
fd = connect_to_fd(sfd, 0);
if (!ASSERT_GE(fd, 0, "connect_to_fd_server"))
goto err_out;
/* cgroup/getsockopt prog will intercept getsockopt() below and
* retrieve the tcp socket bpf_sock_ops_cb_flags value for the
* accept()ed socket; this was set earlier in the passive established
* callback for the accept()ed socket via bpf_setsockopt().
*/
getsockopt_link = bpf_program__attach_cgroup(skel->progs._getsockopt, cg_fd);
if (!ASSERT_OK_PTR(getsockopt_link, "getsockopt prog"))
goto err_out;
cfd = accept(sfd, NULL, 0);
if (!ASSERT_GE(cfd, 0, "accept"))
goto err_out;
if (!ASSERT_OK(getsockopt(cfd, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, &flagslen),
"getsockopt_flags"))
goto err_out;
ASSERT_EQ(flags & BPF_SOCK_OPS_STATE_CB_FLAG, BPF_SOCK_OPS_STATE_CB_FLAG,
"cb_flags_set");
err_out:
close(sfd);
if (fd != -1)
close(fd);
if (cfd != -1)
close(cfd);
bpf_link__destroy(getsockopt_link);
}
void test_setget_sockopt(void)
{
cg_fd = test__join_cgroup(CG_NAME);
......@@ -191,6 +236,8 @@ void test_setget_sockopt(void)
test_udp(AF_INET);
test_ktls(AF_INET6);
test_ktls(AF_INET);
test_nonstandard_opt(AF_INET);
test_nonstandard_opt(AF_INET6);
done:
setget_sockopt__destroy(skel);
......
......@@ -59,6 +59,8 @@ static const struct sockopt_test sol_tcp_tests[] = {
{ .opt = TCP_THIN_LINEAR_TIMEOUTS, .flip = 1, },
{ .opt = TCP_USER_TIMEOUT, .new = 123400, .expected = 123400, },
{ .opt = TCP_NOTSENT_LOWAT, .new = 1314, .expected = 1314, },
{ .opt = TCP_BPF_SOCK_OPS_CB_FLAGS, .new = BPF_SOCK_OPS_ALL_CB_FLAGS,
.expected = BPF_SOCK_OPS_ALL_CB_FLAGS, },
{ .opt = 0, },
};
......@@ -353,11 +355,30 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family,
return 1;
}
/* cgroup/getsockopt program attached by the test: answers a getsockopt()
 * for SOL_TCP / TCP_BPF_SOCK_OPS_CB_FLAGS with the socket's current
 * bpf_sock_ops_cb_flags value. Any other request is left untouched.
 * Always returns 1.
 */
SEC("cgroup/getsockopt")
int _getsockopt(struct bpf_sockopt *ctx)
{
struct bpf_sock *sk = ctx->sk;
int *optval = ctx->optval;
struct tcp_sock *tp;
/* No socket, or not the option under test: do not modify the request. */
if (!sk || ctx->level != SOL_TCP || ctx->optname != TCP_BPF_SOCK_OPS_CB_FLAGS)
return 1;
tp = bpf_core_cast(sk, struct tcp_sock);
/* Write the flags only when the optval buffer has room for a full int. */
if (ctx->optval + sizeof(int) <= ctx->optval_end) {
*optval = tp->bpf_sock_ops_cb_flags;
/* Report success to the caller. NOTE(review): presumably this overrides
 * an error from the kernel's own getsockopt() handling of this BPF-only
 * optname - confirm.
 */
ctx->retval = 0;
}
return 1;
}
SEC("sockops")
int skops_sockopt(struct bpf_sock_ops *skops)
{
struct bpf_sock *bpf_sk = skops->sk;
struct sock *sk;
int flags;
if (!bpf_sk)
return 1;
......@@ -384,9 +405,8 @@ int skops_sockopt(struct bpf_sock_ops *skops)
nr_passive += !(bpf_test_sockopt(skops, sk) ||
test_tcp_maxseg(skops, sk) ||
test_tcp_saved_syn(skops, sk));
bpf_sock_ops_cb_flags_set(skops,
skops->bpf_sock_ops_cb_flags |
BPF_SOCK_OPS_STATE_CB_FLAG);
flags = skops->bpf_sock_ops_cb_flags | BPF_SOCK_OPS_STATE_CB_FLAG;
bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SOCK_OPS_CB_FLAGS, &flags, sizeof(flags));
break;
case BPF_SOCK_OPS_STATE_CB:
if (skops->args[1] == BPF_TCP_CLOSE_WAIT)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment