Commit ba452c9e authored by Toke Høiland-Jørgensen, committed by Daniel Borkmann

bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop

Based on the discussion in [0], update the bpf_redirect_neigh() helper to
accept an optional parameter specifying the nexthop information. This makes
it possible to combine bpf_fib_lookup() and bpf_redirect_neigh() without
incurring a duplicate FIB lookup - since the FIB lookup helper will return
the nexthop information even if no neighbour is present, this can simply
be passed on to bpf_redirect_neigh() if bpf_fib_lookup() returns
BPF_FIB_LKUP_RET_NO_NEIGH. Thus fix & extend it before helper API is frozen.

  [0] https://lore.kernel.org/bpf/393e17fc-d187-3a8d-2f0d-a627c7c63fca@iogearbox.net/

Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/bpf/160322915615.32199.1187570224032024535.stgit@toke.dk
parent c5eb48e8
...@@ -607,12 +607,21 @@ struct bpf_skb_data_end { ...@@ -607,12 +607,21 @@ struct bpf_skb_data_end {
void *data_end; void *data_end;
}; };
struct bpf_nh_params {
u32 nh_family;
union {
u32 ipv4_nh;
struct in6_addr ipv6_nh;
};
};
struct bpf_redirect_info { struct bpf_redirect_info {
u32 flags; u32 flags;
u32 tgt_index; u32 tgt_index;
void *tgt_value; void *tgt_value;
struct bpf_map *map; struct bpf_map *map;
u32 kern_flags; u32 kern_flags;
struct bpf_nh_params nh;
}; };
DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
......
...@@ -3677,15 +3677,19 @@ union bpf_attr { ...@@ -3677,15 +3677,19 @@ union bpf_attr {
* Return * Return
* The id is returned or 0 in case the id could not be retrieved. * The id is returned or 0 in case the id could not be retrieved.
* *
* long bpf_redirect_neigh(u32 ifindex, u64 flags) * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
* Description * Description
* Redirect the packet to another net device of index *ifindex* * Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper * and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it * is somewhat similar to **bpf_redirect**\ (), except that it
* populates L2 addresses as well, meaning, internally, the helper * populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to * relies on the neighbor lookup for the L2 address of the nexthop.
* get the address of the next hop and then relies on the neighbor *
* lookup for the L2 address of the nexthop. * The helper will perform a FIB lookup based on the skb's
* networking header to get the address of the next hop, unless
* this is supplied by the caller in the *params* argument. The
* *plen* argument indicates the len of *params* and should be set
* to 0 if *params* is NULL.
* *
* The *flags* argument is reserved and must be 0. The helper is * The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types, and enabled * currently only supported for tc BPF program types, and enabled
...@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { ...@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup {
__u8 dmac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */
}; };
struct bpf_redir_neigh {
/* network family for lookup (AF_INET, AF_INET6) */
__u32 nh_family;
/* network address of nexthop; skips fib lookup to find gateway */
union {
__be32 ipv4_nh;
__u32 ipv6_nh[4]; /* in6_addr; network order */
};
};
enum bpf_task_fd_type { enum bpf_task_fd_type {
BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
BPF_FD_TYPE_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */
......
...@@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, ...@@ -2165,12 +2165,12 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
} }
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
struct net_device *dev, struct bpf_nh_params *nh)
{ {
struct dst_entry *dst = skb_dst(skb);
struct net_device *dev = dst->dev;
u32 hh_len = LL_RESERVED_SPACE(dev); u32 hh_len = LL_RESERVED_SPACE(dev);
const struct in6_addr *nexthop; const struct in6_addr *nexthop;
struct dst_entry *dst = NULL;
struct neighbour *neigh; struct neighbour *neigh;
if (dev_xmit_recursion()) { if (dev_xmit_recursion()) {
...@@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) ...@@ -2196,8 +2196,13 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
} }
rcu_read_lock_bh(); rcu_read_lock_bh();
nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst), if (!nh) {
&ipv6_hdr(skb)->daddr); dst = skb_dst(skb);
nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
&ipv6_hdr(skb)->daddr);
} else {
nexthop = &nh->ipv6_nh;
}
neigh = ip_neigh_gw6(dev, nexthop); neigh = ip_neigh_gw6(dev, nexthop);
if (likely(!IS_ERR(neigh))) { if (likely(!IS_ERR(neigh))) {
int ret; int ret;
...@@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb) ...@@ -2210,36 +2215,43 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb)
return ret; return ret;
} }
rcu_read_unlock_bh(); rcu_read_unlock_bh();
IP6_INC_STATS(dev_net(dst->dev), if (dst)
ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); IP6_INC_STATS(dev_net(dst->dev),
ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop: out_drop:
kfree_skb(skb); kfree_skb(skb);
return -ENETDOWN; return -ENETDOWN;
} }
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{ {
const struct ipv6hdr *ip6h = ipv6_hdr(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct net *net = dev_net(dev); struct net *net = dev_net(dev);
int err, ret = NET_XMIT_DROP; int err, ret = NET_XMIT_DROP;
struct dst_entry *dst;
struct flowi6 fl6 = {
.flowi6_flags = FLOWI_FLAG_ANYSRC,
.flowi6_mark = skb->mark,
.flowlabel = ip6_flowinfo(ip6h),
.flowi6_oif = dev->ifindex,
.flowi6_proto = ip6h->nexthdr,
.daddr = ip6h->daddr,
.saddr = ip6h->saddr,
};
dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL); if (!nh) {
if (IS_ERR(dst)) struct dst_entry *dst;
goto out_drop; struct flowi6 fl6 = {
.flowi6_flags = FLOWI_FLAG_ANYSRC,
.flowi6_mark = skb->mark,
.flowlabel = ip6_flowinfo(ip6h),
.flowi6_oif = dev->ifindex,
.flowi6_proto = ip6h->nexthdr,
.daddr = ip6h->daddr,
.saddr = ip6h->saddr,
};
dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
if (IS_ERR(dst))
goto out_drop;
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
} else if (nh->nh_family != AF_INET6) {
goto out_drop;
}
err = bpf_out_neigh_v6(net, skb); err = bpf_out_neigh_v6(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err))) if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++; dev->stats.tx_errors++;
else else
...@@ -2252,7 +2264,8 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) ...@@ -2252,7 +2264,8 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
return ret; return ret;
} }
#else #else
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{ {
kfree_skb(skb); kfree_skb(skb);
return NET_XMIT_DROP; return NET_XMIT_DROP;
...@@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev) ...@@ -2260,11 +2273,9 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev)
#endif /* CONFIG_IPV6 */ #endif /* CONFIG_IPV6 */
#if IS_ENABLED(CONFIG_INET) #if IS_ENABLED(CONFIG_INET)
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
struct net_device *dev, struct bpf_nh_params *nh)
{ {
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = container_of(dst, struct rtable, dst);
struct net_device *dev = dst->dev;
u32 hh_len = LL_RESERVED_SPACE(dev); u32 hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh; struct neighbour *neigh;
bool is_v6gw = false; bool is_v6gw = false;
...@@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) ...@@ -2292,7 +2303,21 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
} }
rcu_read_lock_bh(); rcu_read_lock_bh();
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!nh) {
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = container_of(dst, struct rtable, dst);
neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
} else if (nh->nh_family == AF_INET6) {
neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
is_v6gw = true;
} else if (nh->nh_family == AF_INET) {
neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
} else {
rcu_read_unlock_bh();
goto out_drop;
}
if (likely(!IS_ERR(neigh))) { if (likely(!IS_ERR(neigh))) {
int ret; int ret;
...@@ -2309,33 +2334,37 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb) ...@@ -2309,33 +2334,37 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb)
return -ENETDOWN; return -ENETDOWN;
} }
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{ {
const struct iphdr *ip4h = ip_hdr(skb); const struct iphdr *ip4h = ip_hdr(skb);
struct net *net = dev_net(dev); struct net *net = dev_net(dev);
int err, ret = NET_XMIT_DROP; int err, ret = NET_XMIT_DROP;
struct rtable *rt;
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
.flowi4_tos = RT_TOS(ip4h->tos),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
.saddr = ip4h->saddr,
};
rt = ip_route_output_flow(net, &fl4, NULL); if (!nh) {
if (IS_ERR(rt)) struct flowi4 fl4 = {
goto out_drop; .flowi4_flags = FLOWI_FLAG_ANYSRC,
if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { .flowi4_mark = skb->mark,
ip_rt_put(rt); .flowi4_tos = RT_TOS(ip4h->tos),
goto out_drop; .flowi4_oif = dev->ifindex,
} .flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
.saddr = ip4h->saddr,
};
struct rtable *rt;
rt = ip_route_output_flow(net, &fl4, NULL);
if (IS_ERR(rt))
goto out_drop;
if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
goto out_drop;
}
skb_dst_set(skb, &rt->dst); skb_dst_set(skb, &rt->dst);
}
err = bpf_out_neigh_v4(net, skb); err = bpf_out_neigh_v4(net, skb, dev, nh);
if (unlikely(net_xmit_eval(err))) if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++; dev->stats.tx_errors++;
else else
...@@ -2348,14 +2377,16 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) ...@@ -2348,14 +2377,16 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev)
return ret; return ret;
} }
#else #else
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev) static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{ {
kfree_skb(skb); kfree_skb(skb);
return NET_XMIT_DROP; return NET_XMIT_DROP;
} }
#endif /* CONFIG_INET */ #endif /* CONFIG_INET */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
struct bpf_nh_params *nh)
{ {
struct ethhdr *ethh = eth_hdr(skb); struct ethhdr *ethh = eth_hdr(skb);
...@@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) ...@@ -2370,9 +2401,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
skb_reset_network_header(skb); skb_reset_network_header(skb);
if (skb->protocol == htons(ETH_P_IP)) if (skb->protocol == htons(ETH_P_IP))
return __bpf_redirect_neigh_v4(skb, dev); return __bpf_redirect_neigh_v4(skb, dev, nh);
else if (skb->protocol == htons(ETH_P_IPV6)) else if (skb->protocol == htons(ETH_P_IPV6))
return __bpf_redirect_neigh_v6(skb, dev); return __bpf_redirect_neigh_v6(skb, dev, nh);
out: out:
kfree_skb(skb); kfree_skb(skb);
return -ENOTSUPP; return -ENOTSUPP;
...@@ -2382,7 +2413,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev) ...@@ -2382,7 +2413,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
enum { enum {
BPF_F_NEIGH = (1ULL << 1), BPF_F_NEIGH = (1ULL << 1),
BPF_F_PEER = (1ULL << 2), BPF_F_PEER = (1ULL << 2),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER) BPF_F_NEXTHOP = (1ULL << 3),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
}; };
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
...@@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb) ...@@ -2455,7 +2487,8 @@ int skb_do_redirect(struct sk_buff *skb)
return -EAGAIN; return -EAGAIN;
} }
return flags & BPF_F_NEIGH ? return flags & BPF_F_NEIGH ?
__bpf_redirect_neigh(skb, dev) : __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
&ri->nh : NULL) :
__bpf_redirect(skb, dev, flags); __bpf_redirect(skb, dev, flags);
out_drop: out_drop:
kfree_skb(skb); kfree_skb(skb);
...@@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = { ...@@ -2504,16 +2537,21 @@ static const struct bpf_func_proto bpf_redirect_peer_proto = {
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_ANYTHING,
}; };
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags) BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
int, plen, u64, flags)
{ {
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags)) if (unlikely((plen && plen < sizeof(*params)) || flags))
return TC_ACT_SHOT; return TC_ACT_SHOT;
ri->flags = BPF_F_NEIGH; ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
ri->tgt_index = ifindex; ri->tgt_index = ifindex;
BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
if (plen)
memcpy(&ri->nh, params, sizeof(ri->nh));
return TC_ACT_REDIRECT; return TC_ACT_REDIRECT;
} }
...@@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { ...@@ -2522,7 +2560,9 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
.gpl_only = false, .gpl_only = false,
.ret_type = RET_INTEGER, .ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING, .arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_MEM_OR_NULL,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
}; };
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes) BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
......
...@@ -453,6 +453,7 @@ class PrinterHelpers(Printer): ...@@ -453,6 +453,7 @@ class PrinterHelpers(Printer):
'struct bpf_perf_event_data', 'struct bpf_perf_event_data',
'struct bpf_perf_event_value', 'struct bpf_perf_event_value',
'struct bpf_pidns_info', 'struct bpf_pidns_info',
'struct bpf_redir_neigh',
'struct bpf_sk_lookup', 'struct bpf_sk_lookup',
'struct bpf_sock', 'struct bpf_sock',
'struct bpf_sock_addr', 'struct bpf_sock_addr',
......
...@@ -3677,15 +3677,19 @@ union bpf_attr { ...@@ -3677,15 +3677,19 @@ union bpf_attr {
* Return * Return
* The id is returned or 0 in case the id could not be retrieved. * The id is returned or 0 in case the id could not be retrieved.
* *
* long bpf_redirect_neigh(u32 ifindex, u64 flags) * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags)
* Description * Description
* Redirect the packet to another net device of index *ifindex* * Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper * and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it * is somewhat similar to **bpf_redirect**\ (), except that it
* populates L2 addresses as well, meaning, internally, the helper * populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to * relies on the neighbor lookup for the L2 address of the nexthop.
* get the address of the next hop and then relies on the neighbor *
* lookup for the L2 address of the nexthop. * The helper will perform a FIB lookup based on the skb's
* networking header to get the address of the next hop, unless
* this is supplied by the caller in the *params* argument. The
* *plen* argument indicates the len of *params* and should be set
* to 0 if *params* is NULL.
* *
* The *flags* argument is reserved and must be 0. The helper is * The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types, and enabled * currently only supported for tc BPF program types, and enabled
...@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup { ...@@ -4906,6 +4910,16 @@ struct bpf_fib_lookup {
__u8 dmac[6]; /* ETH_ALEN */ __u8 dmac[6]; /* ETH_ALEN */
}; };
struct bpf_redir_neigh {
/* network family for lookup (AF_INET, AF_INET6) */
__u32 nh_family;
/* network address of nexthop; skips fib lookup to find gateway */
union {
__be32 ipv4_nh;
__u32 ipv6_nh[4]; /* in6_addr; network order */
};
};
enum bpf_task_fd_type { enum bpf_task_fd_type {
BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
BPF_FD_TYPE_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment