Commit ea7da1d5 authored by Alexei Starovoitov

Merge branch 'Various BPF helper improvements'

Daniel Borkmann says:

====================
This series adds two BPF helpers: one for retrieving the classid of an skb
and another one to redirect via the neigh subsystem. It also improves the
cookie helpers by replacing the global atomic counter with a per-CPU batched
generator. I've added the bpf_tail_call_static() helper, which we've been
using in Cilium for a while now, to the libbpf API, and last but not least
the series adds a few selftests. For details, please check individual
patches, thanks!

v3 -> v4:
  - Removed out_rec error path (Martin)
  - Integrate BPF_F_NEIGH flag into rejecting invalid flags (Martin)
    - I think it's better this way to avoid bit overlaps, given it's
      right in the place that would need to be extended for new flags
v2 -> v3:
  - Removed double skb->dev = dev assignment (David)
  - Added headroom check for v6 path (David)
  - Set flowi4_proto for ip_route_output_flow (David)
  - Rebased onto latest bpf-next
v1 -> v2:
  - Rework cookie generator to support nested contexts (Eric)
  - Use ip_neigh_gw6() and container_of() (David)
  - Rename __throw_build_bug() and improve comments (Andrii)
  - Use bpf_tail_call_static() also in BPF samples (Maciej)
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 963ec27a eef4a011
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_COOKIE_H
#define __LINUX_COOKIE_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <asm/local.h>

struct pcpu_gen_cookie {
	local_t nesting;
	u64 last;
} __aligned(16);

struct gen_cookie {
	struct pcpu_gen_cookie __percpu *local;
	atomic64_t forward_last ____cacheline_aligned_in_smp;
	atomic64_t reverse_last;
};

#define COOKIE_LOCAL_BATCH	4096

#define DEFINE_COOKIE(name)						\
	static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name);	\
	static struct gen_cookie name = {				\
		.local		= &__##name,				\
		.forward_last	= ATOMIC64_INIT(0),			\
		.reverse_last	= ATOMIC64_INIT(0),			\
	}

static __always_inline u64 gen_cookie_next(struct gen_cookie *gc)
{
	struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local);
	u64 val;

	if (likely(local_inc_return(&local->nesting) == 1)) {
		val = local->last;
		if (__is_defined(CONFIG_SMP) &&
		    unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) {
			s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH,
						       &gc->forward_last);
			val = next - COOKIE_LOCAL_BATCH;
		}
		local->last = ++val;
	} else {
		val = atomic64_dec_return(&gc->reverse_last);
	}
	local_dec(&local->nesting);
	return val;
}

#endif /* __LINUX_COOKIE_H */
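
The generator hands out IDs from a per-CPU batch of COOKIE_LOCAL_BATCH values
taken from forward_last, so the common path only touches this CPU's state;
re-entrant contexts (e.g. an interrupt hitting between local_inc_return() and
local_dec()) fall back to decrementing the shared reverse_last counter, so the
two ranges cannot collide. A minimal usage sketch follows; the object type and
wrapper name are hypothetical, the real users being __sock_gen_cookie() and
__net_gen_cookie() further down:

	DEFINE_COOKIE(obj_cookie);	/* hypothetical user */

	/* Caller must have preemption disabled, since gen_cookie_next()
	 * manipulates per-cpu state via this_cpu_ptr().
	 */
	u64 __obj_gen_cookie(struct my_obj *obj)
	{
		while (1) {
			u64 res = atomic64_read(&obj->cookie);

			if (res)
				return res;
			res = gen_cookie_next(&obj_cookie);
			/* Publish at most once; losers re-read the winner's value. */
			atomic64_cmpxchg(&obj->cookie, 0, res);
		}
	}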
......@@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb)
	return skb->mac_header != (typeof(skb->mac_header))~0U;
}

static inline void skb_unset_mac_header(struct sk_buff *skb)
{
	skb->mac_header = (typeof(skb->mac_header))~0U;
}

static inline void skb_reset_mac_header(struct sk_buff *skb)
{
	skb->mac_header = skb->data - skb->head;
......
......@@ -25,7 +25,19 @@ void sock_diag_unregister(const struct sock_diag_handler *h);
void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh));
u64 sock_gen_cookie(struct sock *sk);
u64 __sock_gen_cookie(struct sock *sk);
static inline u64 sock_gen_cookie(struct sock *sk)
{
	u64 cookie;

	preempt_disable();
	cookie = __sock_gen_cookie(sk);
	preempt_enable();
	return cookie;
}
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie);
void sock_diag_save_cookie(struct sock *sk, __u32 *cookie);
......
......@@ -230,7 +230,7 @@ extern struct list_head net_namespace_list;
struct net *get_net_ns_by_pid(pid_t pid);
struct net *get_net_ns_by_fd(int fd);
u64 net_gen_cookie(struct net *net);
u64 __net_gen_cookie(struct net *net);
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
......
......@@ -3643,6 +3643,28 @@ union bpf_attr {
 *		*flags* are identical to those used for bpf_snprintf_btf.
 *	Return
 *		0 on success or a negative error in case of failure.
 *
 * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
 *	Description
 *		See **bpf_get_cgroup_classid**\ () for the main description.
 *		This helper differs from **bpf_get_cgroup_classid**\ () in that
 *		the cgroup v1 net_cls class is retrieved only from the *skb*'s
 *		associated socket instead of the current process.
 *	Return
 *		The id is returned or 0 in case the id could not be retrieved.
 *
 * long bpf_redirect_neigh(u32 ifindex, u64 flags)
 *	Description
 *		Redirect the packet to another net device of index *ifindex*
 *		and fill in L2 addresses from the neighboring subsystem. This
 *		helper is somewhat similar to **bpf_redirect**\ (), except that
 *		it fills in e.g. MAC addresses based on the L3 information from
 *		the packet. This helper is supported for IPv4 and IPv6 protocols.
 *		The *flags* argument is reserved and must be 0. The helper is
 *		currently only supported for tc BPF program types.
 *	Return
 *		The helper returns **TC_ACT_REDIRECT** on success or
 *		**TC_ACT_SHOT** on error.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
......@@ -3796,6 +3818,8 @@ union bpf_attr {
	FN(copy_from_user),		\
	FN(snprintf_btf),		\
	FN(seq_printf_btf),		\
	FN(skb_cgroup_classid),		\
	FN(redirect_neigh),		\
	/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
......
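
As a usage sketch for the two new helpers, a minimal tc classifier might look
as follows (the ifindex constant, program logic and section name are
illustrative; the actual selftest at the end of this series resolves the
ifindex at load time instead):

	#include <linux/bpf.h>
	#include <linux/pkt_cls.h>
	#include <bpf/bpf_helpers.h>

	#define TGT_IFINDEX	42	/* illustrative target device */

	SEC("classifier")
	int tc_fwd(struct __sk_buff *skb)
	{
		/* cgroup v1 net_cls classid of the skb's socket, 0 if unset. */
		if (bpf_skb_cgroup_classid(skb) == 0)
			return TC_ACT_OK;

		/* L3-based redirect: the kernel resolves the neighbor and
		 * rewrites the L2 header; flags must be 0.
		 */
		return bpf_redirect_neigh(TGT_IFINDEX, 0);
	}

	char __license[] SEC("license") = "GPL";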
......@@ -191,7 +191,7 @@ int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
	rcu_read_lock();
	sk = reuseport_array_lookup_elem(map, key);
	if (sk) {
		*(u64 *)value = sock_gen_cookie(sk);
		*(u64 *)value = __sock_gen_cookie(sk);
		err = 0;
	} else {
		err = -ENOENT;
......
This diff is collapsed.
......@@ -19,6 +19,7 @@
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/cookie.h>
#include <net/sock.h>
#include <net/netlink.h>
......@@ -69,16 +70,16 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;

static atomic64_t cookie_gen;
DEFINE_COOKIE(net_cookie);

u64 net_gen_cookie(struct net *net)
u64 __net_gen_cookie(struct net *net)
{
	while (1) {
		u64 res = atomic64_read(&net->net_cookie);

		if (res)
			return res;
		res = atomic64_inc_return(&cookie_gen);
		res = gen_cookie_next(&net_cookie);
		atomic64_cmpxchg(&net->net_cookie, 0, res);
	}
}
......@@ -1101,7 +1102,10 @@ static int __init net_ns_init(void)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
net_gen_cookie(&init_net);
preempt_disable();
__net_gen_cookie(&init_net);
preempt_enable();
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
......
......@@ -11,7 +11,7 @@
#include <linux/tcp.h>
#include <linux/workqueue.h>
#include <linux/nospec.h>
#include <linux/cookie.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
......@@ -19,16 +19,17 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
static DEFINE_MUTEX(sock_diag_table_mutex);
static struct workqueue_struct *broadcast_wq;
static atomic64_t cookie_gen;
u64 sock_gen_cookie(struct sock *sk)
DEFINE_COOKIE(sock_cookie);
u64 __sock_gen_cookie(struct sock *sk)
{
	while (1) {
		u64 res = atomic64_read(&sk->sk_cookie);

		if (res)
			return res;
		res = atomic64_inc_return(&cookie_gen);
		res = gen_cookie_next(&sock_cookie);
		atomic64_cmpxchg(&sk->sk_cookie, 0, res);
	}
}
......
......@@ -401,7 +401,7 @@ static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
	if (!sk)
		return ERR_PTR(-ENOENT);

	sock_gen_cookie(sk);
	__sock_gen_cookie(sk);
	return &sk->sk_cookie;
}
......@@ -1209,7 +1209,7 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
	if (!sk)
		return ERR_PTR(-ENOENT);

	sock_gen_cookie(sk);
	__sock_gen_cookie(sk);
	return &sk->sk_cookie;
}
......
......@@ -31,28 +31,30 @@ struct {
#define PARSE_IP 3
#define PARSE_IPV6 4
/* protocol dispatch routine.
 * It tail-calls next BPF program depending on eth proto
 * Note, we could have used:
 * bpf_tail_call(skb, &jmp_table, proto);
 * but it would need large prog_array
/* Protocol dispatch routine. It tail-calls next BPF program depending
 * on eth proto. Note, we could have used ...
 *
 * bpf_tail_call(skb, &jmp_table, proto);
 *
 * ... but it would need large prog_array and cannot be optimised given
 * the map key is not static.
 */
static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
{
	switch (proto) {
	case ETH_P_8021Q:
	case ETH_P_8021AD:
		bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
		bpf_tail_call_static(skb, &jmp_table, PARSE_VLAN);
		break;
	case ETH_P_MPLS_UC:
	case ETH_P_MPLS_MC:
		bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
		bpf_tail_call_static(skb, &jmp_table, PARSE_MPLS);
		break;
	case ETH_P_IP:
		bpf_tail_call(skb, &jmp_table, PARSE_IP);
		bpf_tail_call_static(skb, &jmp_table, PARSE_IP);
		break;
	case ETH_P_IPV6:
		bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
		bpf_tail_call_static(skb, &jmp_table, PARSE_IPV6);
		break;
	}
}
......
......@@ -3643,6 +3643,28 @@ union bpf_attr {
 *		*flags* are identical to those used for bpf_snprintf_btf.
 *	Return
 *		0 on success or a negative error in case of failure.
 *
 * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
 *	Description
 *		See **bpf_get_cgroup_classid**\ () for the main description.
 *		This helper differs from **bpf_get_cgroup_classid**\ () in that
 *		the cgroup v1 net_cls class is retrieved only from the *skb*'s
 *		associated socket instead of the current process.
 *	Return
 *		The id is returned or 0 in case the id could not be retrieved.
 *
 * long bpf_redirect_neigh(u32 ifindex, u64 flags)
 *	Description
 *		Redirect the packet to another net device of index *ifindex*
 *		and fill in L2 addresses from the neighboring subsystem. This
 *		helper is somewhat similar to **bpf_redirect**\ (), except that
 *		it fills in e.g. MAC addresses based on the L3 information from
 *		the packet. This helper is supported for IPv4 and IPv6 protocols.
 *		The *flags* argument is reserved and must be 0. The helper is
 *		currently only supported for tc BPF program types.
 *	Return
 *		The helper returns **TC_ACT_REDIRECT** on success or
 *		**TC_ACT_SHOT** on error.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
......@@ -3796,6 +3818,8 @@ union bpf_attr {
	FN(copy_from_user),		\
	FN(snprintf_btf),		\
	FN(seq_printf_btf),		\
	FN(skb_cgroup_classid),		\
	FN(redirect_neigh),		\
	/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
......
......@@ -53,6 +53,52 @@
})
#endif
/*
 * Helper macro to throw a compilation error if __bpf_unreachable() gets
 * built into the resulting code. This works given BPF back end does not
 * implement __builtin_trap(). This is useful to assert that certain paths
 * of the program code are never used and hence eliminated by the compiler.
 *
 * For example, consider a switch statement that covers known cases used by
 * the program. __bpf_unreachable() can then reside in the default case. If
 * the program gets extended such that a case is not covered in the switch
 * statement, then it will throw a build error due to the default case not
 * being compiled out.
 */
#ifndef __bpf_unreachable
# define __bpf_unreachable() __builtin_trap()
#endif
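
A minimal sketch of the pattern described above (the constants and function
name are illustrative, and it assumes call sites only pass the handled
constants so the compiler can discard the default branch):

	static __always_inline int l3_version(__u16 proto)	/* illustrative */
	{
		switch (proto) {
		case ETH_P_IP:
			return 4;
		case ETH_P_IPV6:
			return 6;
		default:
			/* All call sites pass one of the constants above, so
			 * this branch is provably dead and compiled out; an
			 * unhandled new constant keeps it live and the build
			 * fails via __builtin_trap().
			 */
			__bpf_unreachable();
		}
	}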
/*
 * Helper function to perform a tail call with a constant/immediate map slot.
 */
static __always_inline void
bpf_tail_call_static(void *ctx, const void *map, const __u32 slot)
{
	if (!__builtin_constant_p(slot))
		__bpf_unreachable();

	/*
	 * Provide a hard guarantee that LLVM won't optimize setting r2 (map
	 * pointer) and r3 (constant map index) from _different paths_ ending
	 * up at the _same_ call insn as otherwise we won't be able to use the
	 * jmpq/nopl retpoline-free patching by the x86-64 JIT in the kernel
	 * given they mismatch. See also d2e4c1e6c294 ("bpf: Constant map key
	 * tracking for prog array pokes") for details on verifier tracking.
	 *
	 * Note on clobber list: we need to stay in-line with BPF calling
	 * convention, so even if we don't end up using r0, r4, r5, we need
	 * to mark them as clobber so that LLVM doesn't end up using them
	 * before / after the call.
	 */
	asm volatile("r1 = %[ctx]\n\t"
		     "r2 = %[map]\n\t"
		     "r3 = %[slot]\n\t"
		     "call 12"
		     :: [ctx]"r"(ctx), [map]"r"(map), [slot]"i"(slot)
		     : "r0", "r1", "r2", "r3", "r4", "r5");
}
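
A usage sketch (the prog array layout is illustrative, mirroring the converted
samples and selftests below): passing a compile-time constant slot enables the
verifier's constant-key tracking and the retpoline-free patching in the x86-64
JIT, while a non-constant slot fails the __builtin_constant_p() check and
aborts the build via __bpf_unreachable():

	struct {
		__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
		__uint(max_entries, 2);
		__uint(key_size, sizeof(__u32));
		__uint(value_size, sizeof(__u32));
	} jmp_table SEC(".maps");

	SEC("classifier")
	int entry(struct __sk_buff *skb)
	{
		bpf_tail_call_static(skb, &jmp_table, 0);	/* constant slot: ok */
		/* bpf_tail_call_static(skb, &jmp_table, skb->mark); would fail
		 * to build, since the slot is not a compile-time constant.
		 */
		return 0;
	}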
/*
 * Helper structure used by eBPF C program
 * to describe BPF map attributes to libbpf loader
......
......@@ -118,18 +118,18 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
	switch (proto) {
	case bpf_htons(ETH_P_IP):
		bpf_tail_call(skb, &jmp_table, IP);
		bpf_tail_call_static(skb, &jmp_table, IP);
		break;
	case bpf_htons(ETH_P_IPV6):
		bpf_tail_call(skb, &jmp_table, IPV6);
		bpf_tail_call_static(skb, &jmp_table, IPV6);
		break;
	case bpf_htons(ETH_P_MPLS_MC):
	case bpf_htons(ETH_P_MPLS_UC):
		bpf_tail_call(skb, &jmp_table, MPLS);
		bpf_tail_call_static(skb, &jmp_table, MPLS);
		break;
	case bpf_htons(ETH_P_8021Q):
	case bpf_htons(ETH_P_8021AD):
		bpf_tail_call(skb, &jmp_table, VLAN);
		bpf_tail_call_static(skb, &jmp_table, VLAN);
		break;
	default:
		/* Protocol not supported */
......@@ -246,10 +246,10 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
	switch (nexthdr) {
	case IPPROTO_HOPOPTS:
	case IPPROTO_DSTOPTS:
		bpf_tail_call(skb, &jmp_table, IPV6OP);
		bpf_tail_call_static(skb, &jmp_table, IPV6OP);
		break;
	case IPPROTO_FRAGMENT:
		bpf_tail_call(skb, &jmp_table, IPV6FR);
		bpf_tail_call_static(skb, &jmp_table, IPV6FR);
		break;
	default:
		return parse_ip_proto(skb, nexthdr);
......
......@@ -26,20 +26,20 @@ int entry(struct __sk_buff *skb)
	/* Multiple locations to make sure we patch
	 * all of them.
	 */
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call(skb, &jmp_table, 2);
	bpf_tail_call(skb, &jmp_table, 2);
	bpf_tail_call(skb, &jmp_table, 2);
	bpf_tail_call(skb, &jmp_table, 2);
	bpf_tail_call_static(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 2);
	bpf_tail_call_static(skb, &jmp_table, 2);
	bpf_tail_call_static(skb, &jmp_table, 2);
	bpf_tail_call_static(skb, &jmp_table, 2);
	return 3;
}
......
......@@ -13,14 +13,14 @@ struct {
SEC("classifier/0")
int bpf_func_0(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 1);
bpf_tail_call_static(skb, &jmp_table, 1);
return 0;
}
SEC("classifier/1")
int bpf_func_1(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 2);
bpf_tail_call_static(skb, &jmp_table, 2);
return 1;
}
......@@ -33,25 +33,25 @@ int bpf_func_2(struct __sk_buff *skb)
SEC("classifier/3")
int bpf_func_3(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 4);
bpf_tail_call_static(skb, &jmp_table, 4);
return 3;
}
SEC("classifier/4")
int bpf_func_4(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 3);
bpf_tail_call_static(skb, &jmp_table, 3);
return 4;
}
SEC("classifier")
int entry(struct __sk_buff *skb)
{
bpf_tail_call(skb, &jmp_table, 0);
bpf_tail_call_static(skb, &jmp_table, 0);
/* Check multi-prog update. */
bpf_tail_call(skb, &jmp_table, 2);
bpf_tail_call_static(skb, &jmp_table, 2);
/* Check tail call limit. */
bpf_tail_call(skb, &jmp_table, 3);
bpf_tail_call_static(skb, &jmp_table, 3);
return 3;
}
......
......@@ -16,14 +16,14 @@ SEC("classifier/0")
int bpf_func_0(struct __sk_buff *skb)
{
	count++;
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 1;
}

SEC("classifier")
int entry(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}
......
......@@ -21,7 +21,7 @@ TAIL_FUNC(1)
static __noinline
int subprog_tail(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return skb->len * 2;
}

......@@ -29,7 +29,7 @@ int subprog_tail(struct __sk_buff *skb)

SEC("classifier")
int entry(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 1);
	return subprog_tail(skb);
}
......
......@@ -14,9 +14,9 @@ static __noinline
int subprog_tail(struct __sk_buff *skb)
{
	if (load_byte(skb, 0))
		bpf_tail_call(skb, &jmp_table, 1);
		bpf_tail_call_static(skb, &jmp_table, 1);
	else
		bpf_tail_call(skb, &jmp_table, 0);
		bpf_tail_call_static(skb, &jmp_table, 0);
	return 1;
}

......@@ -32,7 +32,7 @@ int bpf_func_0(struct __sk_buff *skb)

SEC("classifier")
int entry(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return 0;
}
......
......@@ -16,9 +16,9 @@ int subprog_tail2(struct __sk_buff *skb)
	volatile char arr[64] = {};

	if (load_word(skb, 0) || load_half(skb, 0))
		bpf_tail_call(skb, &jmp_table, 10);
		bpf_tail_call_static(skb, &jmp_table, 10);
	else
		bpf_tail_call(skb, &jmp_table, 1);
		bpf_tail_call_static(skb, &jmp_table, 1);
	return skb->len;
}

......@@ -28,7 +28,7 @@ int subprog_tail(struct __sk_buff *skb)
{
	volatile char arr[64] = {};

	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return skb->len * 2;
}
......
......@@ -14,21 +14,21 @@ static volatile int count;
__noinline
int subprog_tail_2(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 2);
	bpf_tail_call_static(skb, &jmp_table, 2);
	return skb->len * 3;
}

__noinline
int subprog_tail_1(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 1);
	bpf_tail_call_static(skb, &jmp_table, 1);
	return skb->len * 2;
}

__noinline
int subprog_tail(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 0);
	bpf_tail_call_static(skb, &jmp_table, 0);
	return skb->len;
}
......
// SPDX-License-Identifier: GPL-2.0
#include <stdint.h>
#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#ifndef barrier_data
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
#endif
#ifndef ctx_ptr
# define ctx_ptr(field) (void *)(long)(field)
#endif
#define dst_to_src_tmp 0xeeddddeeU
#define src_to_dst_tmp 0xeeffffeeU
#define ip4_src 0xac100164 /* 172.16.1.100 */
#define ip4_dst 0xac100264 /* 172.16.2.100 */
#define ip6_src { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
		  0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
#define ip6_dst { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
		  0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }

#ifndef v6_equal
# define v6_equal(a, b)		(a.s6_addr32[0] == b.s6_addr32[0] && \
				 a.s6_addr32[1] == b.s6_addr32[1] && \
				 a.s6_addr32[2] == b.s6_addr32[2] && \
				 a.s6_addr32[3] == b.s6_addr32[3])
#endif
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
					    __be32 addr)
{
	void *data_end = ctx_ptr(skb->data_end);
	void *data = ctx_ptr(skb->data);
	struct iphdr *ip4h;

	if (data + sizeof(struct ethhdr) > data_end)
		return false;

	ip4h = (struct iphdr *)(data + sizeof(struct ethhdr));
	if ((void *)(ip4h + 1) > data_end)
		return false;

	return ip4h->daddr == addr;
}

static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
					    struct in6_addr addr)
{
	void *data_end = ctx_ptr(skb->data_end);
	void *data = ctx_ptr(skb->data);
	struct ipv6hdr *ip6h;

	if (data + sizeof(struct ethhdr) > data_end)
		return false;

	ip6h = (struct ipv6hdr *)(data + sizeof(struct ethhdr));
	if ((void *)(ip6h + 1) > data_end)
		return false;

	return v6_equal(ip6h->daddr, addr);
}

SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
{
	void *data_end = ctx_ptr(skb->data_end);
	void *data = ctx_ptr(skb->data);
	__u32 *raw = data;

	if (data + sizeof(struct ethhdr) > data_end)
		return TC_ACT_SHOT;

	return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK;
}

SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
{
	int idx = dst_to_src_tmp;
	__u8 zero[ETH_ALEN * 2];
	bool redirect = false;

	switch (skb->protocol) {
	case __bpf_constant_htons(ETH_P_IP):
		redirect = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_src));
		break;
	case __bpf_constant_htons(ETH_P_IPV6):
		redirect = is_remote_ep_v6(skb, (struct in6_addr)ip6_src);
		break;
	}

	if (!redirect)
		return TC_ACT_OK;

	barrier_data(&idx);
	idx = bpf_ntohl(idx);

	__builtin_memset(&zero, 0, sizeof(zero));
	if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
		return TC_ACT_SHOT;

	return bpf_redirect_neigh(idx, 0);
}

SEC("src_ingress") int tc_src(struct __sk_buff *skb)
{
	int idx = src_to_dst_tmp;
	__u8 zero[ETH_ALEN * 2];
	bool redirect = false;

	switch (skb->protocol) {
	case __bpf_constant_htons(ETH_P_IP):
		redirect = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_dst));
		break;
	case __bpf_constant_htons(ETH_P_IPV6):
		redirect = is_remote_ep_v6(skb, (struct in6_addr)ip6_dst);
		break;
	}

	if (!redirect)
		return TC_ACT_OK;

	barrier_data(&idx);
	idx = bpf_ntohl(idx);

	__builtin_memset(&zero, 0, sizeof(zero));
	if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
		return TC_ACT_SHOT;

	return bpf_redirect_neigh(idx, 0);
}

char __license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The netns fwd has veth links to each src and dst. The
# client is in src and server in dst. The test installs a TC BPF program to each
# host facing veth in fwd which calls into bpf_redirect_neigh() to perform the
# neigh addr population and redirect; it also installs a dropper prog on the
# egress side to drop skbs if neigh addrs were not populated.
if [[ $EUID -ne 0 ]]; then
	echo "This script must be run as root"
	echo "FAIL"
	exit 1
fi
# check that nc, dd, ping, ping6 and timeout are present
command -v nc >/dev/null 2>&1 || \
	{ echo >&2 "nc is not available"; exit 1; }
command -v dd >/dev/null 2>&1 || \
	{ echo >&2 "dd is not available"; exit 1; }
command -v timeout >/dev/null 2>&1 || \
	{ echo >&2 "timeout is not available"; exit 1; }
command -v ping >/dev/null 2>&1 || \
	{ echo >&2 "ping is not available"; exit 1; }
command -v ping6 >/dev/null 2>&1 || \
	{ echo >&2 "ping6 is not available"; exit 1; }
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
cleanup()
{
	ip netns del ${NS_SRC}
	ip netns del ${NS_FWD}
	ip netns del ${NS_DST}
}
trap cleanup EXIT
set -e
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically gets a v6 LL address / routes, but it also needs
# a v4 address in order to start ARP probing. The IP4_NET route is added to the
# endpoints so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}')
veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}')
xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o
xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o
ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
rm -f test_tc_neigh.x.o test_tc_neigh.y.o
ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
set +e
TEST="TCPv4 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
if [ $? -ne 0 ]; then
	echo -e "${TEST}: ${RED}FAIL${NC}"
	exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="TCPv6 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
if [ $? -ne 0 ]; then
	echo -e "${TEST}: ${RED}FAIL${NC}"
	exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv4 connectivity test"
ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
if [ $? -ne 0 ]; then
	echo -e "${TEST}: ${RED}FAIL${NC}"
	exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv6 connectivity test"
ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
if [ $? -ne 0 ]; then
	echo -e "${TEST}: ${RED}FAIL${NC}"
	exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"