Commit 673e3752 authored by Alexei Starovoitov

Merge branch 'Follow-up BPF helper improvements'

Daniel Borkmann says:

====================

This series addresses most of the feedback [0] that was to be followed
up from the last series, that is, UAPI helper comment improvements and
getting rid of the ifindex obj file hacks in the selftest by using a
BPF map instead. I'm planning to do the __sk_buff data/data_end pointer
work in a later round, along with the mem*() BPF improvements we have
in Cilium for libbpf. Next, the series adds two features, i) a helper
called redirect_peer() to improve latency on netns switch, and ii) to
allow map in map with dynamic inner array map sizes. Selftests for each
are added as well. For details, please check individual patches, thanks!

  [0] https://lore.kernel.org/bpf/cover.1601477936.git.daniel@iogearbox.net/

v5 -> v6:
  - Going with Andrii's suggestion to make the misconfigured verifier
    test more robust, and only probe on -EOPNOTSUPP (Andrii)
v4 -> v5:
  - Replace cnt == -EOPNOTSUPP check with cnt < 0; I've used < 0
    here as I think it's useful to keep the existing cnt == 0 ||
    cnt >= ARRAY_SIZE(insn_buf) for error detection (Andrii)
v3 -> v4:
  - Rename new array map flag to BPF_F_INNER_MAP (Alexei)
v2 -> v3:
  - Remove tab that slipped into uapi helper desc (Jakub)
  - Rework map in map for array to error from map_gen_lookup (Andrii)
v1 -> v2:
  - Fixed selftest comment wrt inner1/inner2 value (Yonghong)
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents ac53a0d3 9f4c53ca
......@@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
return smp_processor_id() % dev->real_num_rx_queues;
}
/*
 * Return the device stored as this veth's peer in its private data.
 *
 * The peer pointer is read with rcu_dereference(), so callers must be
 * inside an RCU read-side critical section.
 */
static struct net_device *veth_peer_dev(struct net_device *dev)
{
	struct net_device *peer;
	struct veth_priv *veth;

	veth = netdev_priv(dev);
	peer = rcu_dereference(veth->peer);

	return peer;
}
static int veth_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames,
u32 flags, bool ndo_xmit)
......@@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
.ndo_set_rx_headroom = veth_set_rx_headroom,
.ndo_bpf = veth_xdp,
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
.ndo_get_peer_dev = veth_peer_dev,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
......
......@@ -82,7 +82,7 @@ struct bpf_map_ops {
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
void (*map_fd_put_ptr)(void *ptr);
u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
......
......@@ -1277,6 +1277,9 @@ struct netdev_net_notifier {
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
* int cmd);
* Add, change, delete or get information on an IPv4 tunnel.
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
......@@ -1484,6 +1487,7 @@ struct net_device_ops {
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
};
/**
......
......@@ -435,6 +435,9 @@ enum {
/* Share perf_event among processes */
BPF_F_PRESERVE_ELEMS = (1U << 11),
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
};
/* Flags for BPF_PROG_QUERY. */
......@@ -3679,10 +3682,14 @@ union bpf_attr {
* Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it
* fills in e.g. MAC addresses based on the L3 information from
* the packet. This helper is supported for IPv4 and IPv6 protocols.
* populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to
* get the address of the next hop and then relies on the neighbor
* lookup for the L2 address of the nexthop.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types.
* currently only supported for tc BPF program types, and enabled
* for IPv4 and IPv6 protocols.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
......@@ -3715,6 +3722,22 @@ union bpf_attr {
* never return NULL.
* Return
* A pointer pointing to the kernel percpu variable on this cpu.
*
* long bpf_redirect_peer(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_redirect**\ (), except
* that the redirection happens to the *ifindex*' peer device and
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the ingress
* hook and for veth device types. The peer device must reside in a
* different network namespace.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -3872,6 +3895,7 @@ union bpf_attr {
FN(redirect_neigh), \
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(redirect_peer), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
......
......@@ -16,7 +16,7 @@
#define ARRAY_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
BPF_F_PRESERVE_ELEMS)
BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
......@@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr)
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
attr->map_flags & BPF_F_MMAPABLE)
attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
......@@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
......@@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
if (map->map_flags & BPF_F_INNER_MAP)
return -EOPNOTSUPP;
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
......@@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
static bool array_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1)
{
return meta0->max_entries == meta1->max_entries &&
bpf_map_meta_equal(meta0, meta1);
if (!bpf_map_meta_equal(meta0, meta1))
return false;
return meta0->map_flags & BPF_F_INNER_MAP ? true :
meta0->max_entries == meta1->max_entries;
}
struct bpf_iter_seq_array_map_info {
......@@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
static u32 array_of_map_gen_lookup(struct bpf_map *map,
static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
......
......@@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
* bpf_prog
* __htab_map_lookup_elem
*/
static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
......@@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
return __htab_lru_map_lookup_elem(map, key, false);
}
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
......@@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
static u32 htab_of_map_gen_lookup(struct bpf_map *map,
static int htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
......
......@@ -11049,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
......@@ -11079,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
......
......@@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev)
struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
......@@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
*another = true;
break;
}
return NULL;
case TC_ACT_CONSUMED:
return NULL;
......@@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;
......
......@@ -2380,8 +2380,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
/* Internal, non-exposed redirect flags. */
enum {
BPF_F_NEIGH = (1ULL << 1),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
BPF_F_NEIGH = (1ULL << 1),
BPF_F_PEER = (1ULL << 2),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER)
};
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
......@@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
/*
 * Carry out the redirect recorded in the per-CPU bpf_redirect_info
 * (target ifindex in ri->tgt_index, redirect mode in ri->flags).
 *
 * NOTE(review): this text is a flattened diff hunk, so the pre-change lines
 * (the first dev_get_by_index_rcu() call and the braced "!dev" block) sit
 * directly next to their replacements.
 *
 * Flow of the post-change lines:
 *  - The target device is looked up by index in the netns of skb->dev;
 *    ri->tgt_index and ri->flags are reset to 0 once read.
 *  - No such device: the skb is freed and -EINVAL is returned.
 *  - BPF_F_PEER set: the device must provide ndo_get_peer_dev and the skb
 *    must be at tc ingress; the peer must exist, the skb must be forwardable
 *    to it, and the peer must live in a different netns. If so, skb->dev is
 *    switched to the peer and -EAGAIN is returned; any failed check frees
 *    the skb and returns -EINVAL.
 *  - Otherwise the result of __bpf_redirect_neigh() (BPF_F_NEIGH set) or
 *    __bpf_redirect() is returned.
 */
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net *net = dev_net(skb->dev);
struct net_device *dev;
/* Snapshot the flags; ri->flags is cleared below before they are used. */
u32 flags = ri->flags;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
dev = dev_get_by_index_rcu(net, ri->tgt_index);
ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
ri->flags = 0;
if (unlikely(!dev))
goto out_drop;
if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops;
/* Peer redirect needs driver support and an skb at tc ingress. */
if (unlikely(!ops->ndo_get_peer_dev ||
!skb_at_tc_ingress(skb)))
goto out_drop;
dev = ops->ndo_get_peer_dev(dev);
/* Drop when there is no peer, the skb is not forwardable, or the peer is in the same netns. */
if (unlikely(!dev ||
!is_skb_forwardable(dev, skb) ||
net_eq(net, dev_net(dev))))
goto out_drop;
/* Nothing is transmitted here: only the skb's device is switched to the peer. */
skb->dev = dev;
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
__bpf_redirect_neigh(skb, dev) :
__bpf_redirect(skb, dev, flags);
/* Shared failure exit: free the skb and report -EINVAL. */
out_drop:
kfree_skb(skb);
return -EINVAL;
}
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
......@@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
/*
 * bpf_redirect_peer() helper: record a peer redirect towards *ifindex* in
 * the per-CPU bpf_redirect_info. Nothing is redirected here; the helper only
 * stores the target index and the BPF_F_PEER mode.
 *
 * Returns TC_ACT_SHOT when any bit of *flags* is set (the argument is
 * reserved), TC_ACT_REDIRECT otherwise.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
	struct bpf_redirect_info *info;

	/* No flag bits are accepted by this helper. */
	if (unlikely(flags))
		return TC_ACT_SHOT;

	info = this_cpu_ptr(&bpf_redirect_info);
	info->tgt_index = ifindex;
	info->flags = BPF_F_PEER;

	return TC_ACT_REDIRECT;
}
/*
 * Function prototype for bpf_redirect_peer(): gpl_only is false, the return
 * value is an integer, and both arguments (ifindex, flags) are declared as
 * ARG_ANYTHING.
 */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
.func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
......@@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_redirect_proto;
case BPF_FUNC_redirect_neigh:
return &bpf_redirect_neigh_proto;
case BPF_FUNC_redirect_peer:
return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
......
......@@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
struct bpf_insn *insn = insn_buf;
......
......@@ -435,6 +435,9 @@ enum {
/* Share perf_event among processes */
BPF_F_PRESERVE_ELEMS = (1U << 11),
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
};
/* Flags for BPF_PROG_QUERY. */
......@@ -3679,10 +3682,14 @@ union bpf_attr {
* Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it
* fills in e.g. MAC addresses based on the L3 information from
* the packet. This helper is supported for IPv4 and IPv6 protocols.
* populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to
* get the address of the next hop and then relies on the neighbor
* lookup for the L2 address of the nexthop.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types.
* currently only supported for tc BPF program types, and enabled
* for IPv4 and IPv6 protocols.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
......@@ -3715,6 +3722,22 @@ union bpf_attr {
* never return NULL.
* Return
* A pointer pointing to the kernel percpu variable on this cpu.
*
* long bpf_redirect_peer(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_redirect**\ (), except
* that the redirection happens to the *ifindex*' peer device and
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the ingress
* hook and for veth device types. The peer device must reside in a
* different network namespace.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -3872,6 +3895,7 @@ union bpf_attr {
FN(redirect_neigh), \
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(redirect_peer), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
......
......@@ -55,10 +55,10 @@ static int kern_sync_rcu(void)
static void test_lookup_update(void)
{
int err, key = 0, val, i;
int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id;
int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd;
struct test_btf_map_in_map *skel;
int outer_arr_fd, outer_hash_fd;
int fd, map1_fd, map2_fd, map1_id, map2_id;
int err, key = 0, val, i, fd;
skel = test_btf_map_in_map__open_and_load();
if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
......@@ -70,32 +70,45 @@ static void test_lookup_update(void)
map1_fd = bpf_map__fd(skel->maps.inner_map1);
map2_fd = bpf_map__fd(skel->maps.inner_map2);
map3_fd = bpf_map__fd(skel->maps.inner_map3);
map4_fd = bpf_map__fd(skel->maps.inner_map4);
map5_fd = bpf_map__fd(skel->maps.inner_map5);
outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn);
outer_arr_fd = bpf_map__fd(skel->maps.outer_arr);
outer_hash_fd = bpf_map__fd(skel->maps.outer_hash);
/* inner1 = input, inner2 = input + 1 */
map1_fd = bpf_map__fd(skel->maps.inner_map1);
/* inner1 = input, inner2 = input + 1, inner3 = input + 2 */
bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0);
map2_fd = bpf_map__fd(skel->maps.inner_map2);
bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0);
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0);
skel->bss->input = 1;
usleep(1);
bpf_map_lookup_elem(map1_fd, &key, &val);
CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1);
bpf_map_lookup_elem(map2_fd, &key, &val);
CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2);
bpf_map_lookup_elem(map3_fd, &key, &val);
CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3);
/* inner1 = input + 1, inner2 = input */
/* inner2 = input, inner1 = input + 1, inner4 = input + 2 */
bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0);
bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0);
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0);
skel->bss->input = 3;
usleep(1);
bpf_map_lookup_elem(map1_fd, &key, &val);
CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4);
bpf_map_lookup_elem(map2_fd, &key, &val);
CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3);
bpf_map_lookup_elem(map4_fd, &key, &val);
CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5);
/* inner5 = input + 2 */
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0);
skel->bss->input = 5;
usleep(1);
bpf_map_lookup_elem(map5_fd, &key, &val);
CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7);
for (i = 0; i < 5; i++) {
val = i % 2 ? map1_fd : map2_fd;
......@@ -106,7 +119,13 @@ static void test_lookup_update(void)
}
err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0);
if (CHECK_FAIL(err)) {
printf("failed to update hash_of_maps on iter #%d\n", i);
printf("failed to update array_of_maps on iter #%d\n", i);
goto cleanup;
}
val = i % 2 ? map4_fd : map5_fd;
err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0);
if (CHECK_FAIL(err)) {
printf("failed to update array_of_maps (dyn) on iter #%d\n", i);
goto cleanup;
}
}
......
......@@ -41,6 +41,43 @@ struct outer_arr {
.values = { (void *)&inner_map1, 0, (void *)&inner_map2 },
};
/*
 * Two 3-entry int->int array maps created with BPF_F_INNER_MAP. Both are
 * listed as initial values of outer_arr_dyn, whose inner map template
 * declares max_entries = 1.
 */
struct inner_map_sz3 {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 3);
__type(key, int);
__type(value, int);
} inner_map3 SEC(".maps"),
inner_map4 SEC(".maps");
/*
 * A 5-entry int->int array map created with BPF_F_INNER_MAP; it is the third
 * initial value of outer_arr_dyn.
 * NOTE(review): the struct tag says "sz4" but max_entries is 5.
 */
struct inner_map_sz4 {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 5);
__type(key, int);
__type(value, int);
} inner_map5 SEC(".maps");
/*
 * Array-of-maps with three slots whose inner map template is a 1-entry
 * array map flagged BPF_F_INNER_MAP. The slots are pre-populated with
 * inner_map3, inner_map4 and inner_map5, which declare 3, 3 and 5 entries,
 * i.e. sizes that differ from the template.
 */
struct outer_arr_dyn {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 3);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
/* Inner map template: only one entry, but marked BPF_F_INNER_MAP. */
__array(values, struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 1);
__type(key, int);
__type(value, int);
});
} outer_arr_dyn SEC(".maps") = {
.values = {
[0] = (void *)&inner_map3,
[1] = (void *)&inner_map4,
[2] = (void *)&inner_map5,
},
};
struct outer_hash {
__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
__uint(max_entries, 5);
......@@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx)
val = input + 1;
bpf_map_update_elem(inner_map, &key, &val, 0);
inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key);
if (!inner_map)
return 1;
val = input + 2;
bpf_map_update_elem(inner_map, &key, &val, 0);
return 0;
}
......
......@@ -13,17 +13,10 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#ifndef barrier_data
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
#endif
#ifndef ctx_ptr
# define ctx_ptr(field) (void *)(long)(field)
#endif
#define dst_to_src_tmp 0xeeddddeeU
#define src_to_dst_tmp 0xeeffffeeU
#define ip4_src 0xac100164 /* 172.16.1.100 */
#define ip4_dst 0xac100264 /* 172.16.2.100 */
......@@ -39,6 +32,18 @@
a.s6_addr32[3] == b.s6_addr32[3])
#endif
/* Keys into ifindex_map, passed to get_dev_ifindex(). */
enum {
/* slot 0 */
dev_src,
/* slot 1 */
dev_dst,
};
/*
 * Two-entry array map (int key, int value) that get_dev_ifindex() reads the
 * redirect target ifindex from, indexed by dev_src / dev_dst.
 * NOTE(review): presumably populated from user space before traffic flows.
 */
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
__be32 addr)
{
......@@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
return v6_equal(ip6h->daddr, addr);
}
SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{
void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data);
......@@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
{
int idx = dst_to_src_tmp;
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
......@@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
if (!redirect)
return TC_ACT_OK;
barrier_data(&idx);
idx = bpf_ntohl(idx);
__builtin_memset(&zero, 0, sizeof(zero));
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(idx, 0);
return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
{
int idx = src_to_dst_tmp;
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
......@@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
if (!redirect)
return TC_ACT_OK;
barrier_data(&idx);
idx = bpf_ntohl(idx);
__builtin_memset(&zero, 0, sizeof(zero));
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(idx, 0);
return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0);
}
char __license[] SEC("license") = "GPL";
// SPDX-License-Identifier: GPL-2.0
#include <stdint.h>
#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
/* Keys into ifindex_map, passed to get_dev_ifindex(). */
enum {
/* slot 0 */
dev_src,
/* slot 1 */
dev_dst,
};
/*
 * Two-entry array map (int key, int value) that get_dev_ifindex() reads the
 * redirect target ifindex from, indexed by dev_src / dev_dst.
 * NOTE(review): presumably populated from user space before traffic flows.
 */
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
/*
 * "chk_egress" program: unconditionally returns TC_ACT_SHOT, so every skb
 * that reaches the hook it is attached to is dropped.
 */
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{
return TC_ACT_SHOT;
}
/*
 * "dst_ingress" program: redirect every skb to the peer of the device whose
 * ifindex is stored in the dev_src slot of ifindex_map.
 */
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
{
	int target = get_dev_ifindex(dev_src);

	return bpf_redirect_peer(target, 0);
}
/*
 * "src_ingress" program: redirect every skb to the peer of the device whose
 * ifindex is stored in the dev_dst slot of ifindex_map.
 */
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
{
	int target = get_dev_ifindex(dev_dst);

	return bpf_redirect_peer(target, 0);
}
char __license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The netns fwd has veth links to each src and dst. The
# client is in src and server in dst. The test installs a TC BPF program to each
# host facing veth in fwd which calls into bpf_redirect_neigh() to perform the
# neigh addr population and redirect; it also installs a dropper prog on the
# egress side to drop skbs if neigh addrs were not populated.
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
# Bail out early if any of the tools the test relies on is missing.
for tool in nc dd timeout ping ping6; do
	if ! command -v "${tool}" >/dev/null 2>&1; then
		echo >&2 "${tool} is not available"
		exit 1
	fi
done
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
# Delete the three test namespaces (src, fwd, dst), in that order.
cleanup()
{
	local ns

	for ns in ${NS_SRC} ${NS_FWD} ${NS_DST}; do
		ip netns del ${ns}
	done
}
trap cleanup EXIT
set -e
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically get a v6 LL address / routes, but also needs v4
# one in order to start ARP probing. IP4_NET route is added to the endpoints
# so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}')
veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}')
xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o
xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o
ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
rm -f test_tc_neigh.x.o test_tc_neigh.y.o
ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
set +e
TEST="TCPv4 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="TCPv6 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv4 connectivity test"
ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv6 connectivity test"
ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The netns fwd has veth links to each src and dst. The
# client is in src and server in dst. The test installs a TC BPF program to each
# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
# switch from ingress side; it also installs a checker prog on the egress side
# to drop unexpected traffic.
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
# Bail out early if any of the tools the test relies on is missing.
for tool in nc dd timeout ping ping6 perl jq bpftool; do
	if ! command -v "${tool}" >/dev/null 2>&1; then
		echo >&2 "${tool} is not available"
		exit 1
	fi
done
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
# Delete the three test namespaces (src, fwd, dst), in that order.
netns_cleanup()
{
	local ns

	for ns in ${NS_SRC} ${NS_FWD} ${NS_DST}; do
		ip netns del ${ns}
	done
}
# Build the src <-> fwd <-> dst topology: three namespaces, two veth pairs
# (veth_src/veth_src_fwd and veth_dst/veth_dst_fwd), addresses, routes and
# static neighbour entries. Sets the globals fmac_src and fmac_dst to the MAC
# addresses of the two fwd-side devices.
netns_setup()
{
# One namespace per role.
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
# Create both veth pairs, then move each end into its namespace; the
# *_fwd ends go to the forwarding namespace.
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
# IPv4 endpoint addresses.
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically get a v6 LL address / routes, but also
# needs v4 one in order to start ARP probing. IP4_NET route is added
# to the endpoints so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
# IPv6 endpoint addresses, added with the "nodad" flag.
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
# Bring all four devices up.
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
# Routes on the src side: src reaches dst and IP4_NET via veth_src, fwd
# reaches src via veth_src_fwd.
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
# Mirror image of the above for the dst side.
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
# Static neighbour entries: each endpoint resolves the remote endpoint's
# address to the MAC of the fwd-side device it is directly linked to.
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
}
# Print the PASS/FAIL line for the test named in ${TEST}. A non-zero status
# in $1 reports FAIL and terminates the script with exit code 1.
netns_test_report()
{
	if [ $1 -ne 0 ]; then
		echo -e "${TEST}: ${RED}FAIL${NC}"
		exit 1
	fi
	echo -e "${TEST}: ${GREEN}PASS${NC}"
}

# Run the TCPv4, TCPv6, ICMPv4 and ICMPv6 checks from the src namespace
# against the dst namespace. errexit is switched off for the duration so each
# result can be reported, and switched back on before returning.
netns_test_connectivity()
{
	local tcp_send="timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100"

	set +e

	# Background listeners in the dst namespace, one per address family.
	ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
	ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"

	TEST="TCPv4 connectivity test"
	ip netns exec ${NS_SRC} bash -c "${tcp_send} > /dev/tcp/${IP4_DST}/9004"
	netns_test_report $?

	TEST="TCPv6 connectivity test"
	ip netns exec ${NS_SRC} bash -c "${tcp_send} > /dev/tcp/${IP6_DST}/9006"
	netns_test_report $?

	TEST="ICMPv4 connectivity test"
	ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
	netns_test_report $?

	TEST="ICMPv6 connectivity test"
	ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
	netns_test_report $?

	set -e
}
# Print the integer in $1 as space-separated two-digit hex bytes, the form
# passed after "key hex" / "value hex" to bpftool in netns_setup_bpf.
# pack("L") encodes the value as Perl's unsigned long in native byte order;
# unpack("(H2)8") then emits up to eight bytes as hex pairs.
hex_mem_str()
{
perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
}
# Attach the programs from BPF object $1 to both fwd-side veth devices
# (<side>_ingress on ingress, chk_egress on egress), then write the two
# ifindexes into the map of every attached tc program that has one:
# key 0 = ifindex of veth_src_fwd, key 1 = ifindex of veth_dst_fwd.
netns_setup_bpf()
{
	local obj=$1
	local side

	for side in src dst; do
		ip netns exec ${NS_FWD} tc qdisc add dev veth_${side}_fwd clsact
		ip netns exec ${NS_FWD} tc filter add dev veth_${side}_fwd ingress bpf da obj $obj sec ${side}_ingress
		ip netns exec ${NS_FWD} tc filter add dev veth_${side}_fwd egress bpf da obj $obj sec chk_egress
	done

	veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex)
	veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex)

	# IDs of all tc programs attached inside the fwd namespace.
	progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
	for prog in $progs; do
		map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]')
		# Programs without a map have nothing to configure.
		[ -n "$map" ] || continue
		bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src)
		bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst)
	done
}
# Remove the namespaces whenever the script exits, and stop on the first
# failing command.
trap netns_cleanup EXIT
set -e
# First pass: build the topology and test with the programs from
# test_tc_neigh.o.
netns_setup
netns_setup_bpf test_tc_neigh.o
netns_test_connectivity
# Tear down explicitly so the second pass can recreate the same
# namespaces from scratch.
netns_cleanup
# Second pass: same topology and checks with the programs from
# test_tc_peer.o; its cleanup is left to the EXIT trap.
netns_setup
netns_setup_bpf test_tc_peer.o
netns_test_connectivity
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment