Commit 629a0025 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf-tc-tunneling'

Willem de Bruijn says:

====================
BPF allows for dynamic tunneling, choosing the tunnel destination and
features on-demand. Extend bpf_skb_adjust_room to allow for efficient
tunneling at the TC hooks.

Most features are required for large packets with GSO, as these will
be modified after this patch.

Patch 1
  is a performance optimization, avoiding an unnecessary unclone
  for the TCP hot path.

Patches 2..6
  introduce a regression test. These can be squashed, but the code is
  arguably more readable when gradually expanding the feature set.

Patch 7
  is a performance optimization, avoid copying network headers
  that are going to be overwritten. This also simplifies the bpf
  program.

Patch 8
  reenables bpf_skb_adjust_room for UDP packets.

Patch 9
  configures skb tunneling metadata analogous to tunnel devices.

Patches 10..13
  expand the regression test to make use of the new features and
  enable the GSO testcases.

Changes
  v1->v2
  - move BPF_F_ADJ_ROOM_MASK out of uapi as it can be expanded
  - document new flags
  - in tests replace netcat -q flag with coreutils timeout:
      the -q flag is not supported in all netcat versions
  v2->v3
  - move BPF_F_ADJ_ROOM_ENCAP_L3_MASK out of uapi as it has no
    use in userspace
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents f6827526 75a1a9fa
...@@ -1478,13 +1478,27 @@ union bpf_attr { ...@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to * Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*. * *skb* by *len_diff*, and according to the selected *mode*.
* *
* There is a single supported mode at this time: * There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
* *
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header). * (room space is added or removed below the layer 3 header).
* *
* All values for *flags* are reserved for future usage, and must * The following flags are supported at this time:
* be left at zero. *
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
* Use with ENCAP_L3 flags to further specify the tunnel type.
* *
* A call to this helper is susceptible to change the underlaying * A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers * packet buffer. Therefore, at load time, all checks on pointers
...@@ -2624,9 +2638,18 @@ enum bpf_func_id { ...@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */ /* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L) #define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
}; };
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
...@@ -2963,42 +2963,113 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) ...@@ -2963,42 +2963,113 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
} }
} }
static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{ {
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
unsigned int gso_type = SKB_GSO_DODGY;
u16 mac_len, inner_net, inner_trans;
int ret; int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
return -ENOTSUPP; /* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_cow(skb, len_diff); ret = skb_cow_head(skb, len_diff);
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
if (encap) {
if (skb->protocol != htons(ETH_P_IP) &&
skb->protocol != htons(ETH_P_IPV6))
return -ENOTSUPP;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
return -EINVAL;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
if (skb->encapsulation)
return -EALREADY;
mac_len = skb->network_header - skb->mac_header;
inner_net = skb->network_header;
inner_trans = skb->transport_header;
}
ret = bpf_skb_net_hdr_push(skb, off, len_diff); ret = bpf_skb_net_hdr_push(skb, off, len_diff);
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
if (encap) {
/* inner mac == inner_net on l3 encap */
skb->inner_mac_header = inner_net;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
gso_type |= SKB_GSO_UDP_TUNNEL;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
gso_type |= SKB_GSO_GRE;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
gso_type |= SKB_GSO_IPXIP6;
else
gso_type |= SKB_GSO_IPXIP4;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
sizeof(struct ipv6hdr) :
sizeof(struct iphdr);
skb_set_transport_header(skb, mac_len + nh_len);
}
}
if (skb_is_gso(skb)) { if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb); struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header grow, MSS needs to be downgraded. */ /* Due to header grow, MSS needs to be downgraded. */
skb_decrease_gso_size(shinfo, len_diff); if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */ /* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0; shinfo->gso_segs = 0;
} }
return 0; return 0;
} }
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{ {
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret; int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
return -ENOTSUPP; /* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_unclone(skb, GFP_ATOMIC); ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0)) if (unlikely(ret < 0))
...@@ -3012,7 +3083,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) ...@@ -3012,7 +3083,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
struct skb_shared_info *shinfo = skb_shinfo(skb); struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header shrink, MSS can be upgraded. */ /* Due to header shrink, MSS can be upgraded. */
skb_increase_gso_size(shinfo, len_diff); if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */ /* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY; shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0; shinfo->gso_segs = 0;
...@@ -3027,49 +3100,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb) ...@@ -3027,49 +3100,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC; SKB_MAX_ALLOC;
} }
static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{ {
bool trans_same = skb->transport_header == skb->network_header;
u32 len_cur, len_diff_abs = abs(len_diff); u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb); u32 len_min = bpf_skb_net_base_len(skb);
u32 len_max = __bpf_skb_max_len(skb); u32 len_max = __bpf_skb_max_len(skb);
__be16 proto = skb->protocol; __be16 proto = skb->protocol;
bool shrink = len_diff < 0; bool shrink = len_diff < 0;
u32 off;
int ret; int ret;
if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU)) if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT; return -EFAULT;
if (unlikely(proto != htons(ETH_P_IP) && if (unlikely(proto != htons(ETH_P_IP) &&
proto != htons(ETH_P_IPV6))) proto != htons(ETH_P_IPV6)))
return -ENOTSUPP; return -ENOTSUPP;
off = skb_mac_header_len(skb);
switch (mode) {
case BPF_ADJ_ROOM_NET:
off += bpf_skb_net_base_len(skb);
break;
case BPF_ADJ_ROOM_MAC:
break;
default:
return -ENOTSUPP;
}
len_cur = skb->len - skb_network_offset(skb); len_cur = skb->len - skb_network_offset(skb);
if (skb_transport_header_was_set(skb) && !trans_same)
len_cur = skb_network_header_len(skb);
if ((shrink && (len_diff_abs >= len_cur || if ((shrink && (len_diff_abs >= len_cur ||
len_cur - len_diff_abs < len_min)) || len_cur - len_diff_abs < len_min)) ||
(!shrink && (skb->len + len_diff_abs > len_max && (!shrink && (skb->len + len_diff_abs > len_max &&
!skb_is_gso(skb)))) !skb_is_gso(skb))))
return -ENOTSUPP; return -ENOTSUPP;
ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) : ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
bpf_skb_net_grow(skb, len_diff_abs); bpf_skb_net_grow(skb, off, len_diff_abs, flags);
bpf_compute_data_pointers(skb); bpf_compute_data_pointers(skb);
return ret; return ret;
} }
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
if (unlikely(flags))
return -EINVAL;
if (likely(mode == BPF_ADJ_ROOM_NET))
return bpf_skb_adjust_net(skb, len_diff);
return -ENOTSUPP;
}
static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
.func = bpf_skb_adjust_room, .func = bpf_skb_adjust_room,
.gpl_only = false, .gpl_only = false,
......
...@@ -1478,13 +1478,27 @@ union bpf_attr { ...@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to * Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*. * *skb* by *len_diff*, and according to the selected *mode*.
* *
* There is a single supported mode at this time: * There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
* *
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header). * (room space is added or removed below the layer 3 header).
* *
* All values for *flags* are reserved for future usage, and must * The following flags are supported at this time:
* be left at zero. *
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **:
* Use with ENCAP_L3 flags to further specify the tunnel type.
* *
* A call to this helper is susceptible to change the underlaying * A call to this helper is susceptible to change the underlaying
* packet buffer. Therefore, at load time, all checks on pointers * packet buffer. Therefore, at load time, all checks on pointers
...@@ -2624,9 +2638,18 @@ enum bpf_func_id { ...@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */ /* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L) #define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
}; };
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
...@@ -52,7 +52,8 @@ TEST_PROGS := test_kmod.sh \ ...@@ -52,7 +52,8 @@ TEST_PROGS := test_kmod.sh \
test_flow_dissector.sh \ test_flow_dissector.sh \
test_xdp_vlan.sh \ test_xdp_vlan.sh \
test_lwt_ip_encap.sh \ test_lwt_ip_encap.sh \
test_tcp_check_syncookie.sh test_tcp_check_syncookie.sh \
test_tc_tunnel.sh
TEST_PROGS_EXTENDED := with_addr.sh \ TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \ with_tunnels.sh \
......
...@@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y ...@@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y
CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_STREAM_PARSER=y
CONFIG_XDP_SOCKETS=y CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
// SPDX-License-Identifier: GPL-2.0
/* In-place tunneling */
#include <stdbool.h>
#include <string.h>
#include <linux/stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <linux/types.h>
#include "bpf_endian.h"
#include "bpf_helpers.h"
static const int cfg_port = 8000;
struct grev4hdr {
struct iphdr ip;
__be16 flags;
__be16 protocol;
} __attribute__((packed));
struct grev6hdr {
struct ipv6hdr ip;
__be16 flags;
__be16 protocol;
} __attribute__((packed));
static __always_inline void set_ipv4_csum(struct iphdr *iph)
{
__u16 *iph16 = (__u16 *)iph;
__u32 csum;
int i;
iph->check = 0;
#pragma clang loop unroll(full)
for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++)
csum += *iph16++;
iph->check = ~((csum & 0xffff) + (csum >> 16));
}
static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
{
struct grev4hdr h_outer;
struct iphdr iph_inner;
struct tcphdr tcph;
__u64 flags;
int olen;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
sizeof(iph_inner)) < 0)
return TC_ACT_OK;
/* filter only packets we want */
if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP)
return TC_ACT_OK;
if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
&tcph, sizeof(tcph)) < 0)
return TC_ACT_OK;
if (tcph.dest != __bpf_constant_htons(cfg_port))
return TC_ACT_OK;
flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4;
if (with_gre) {
flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
olen = sizeof(h_outer);
} else {
olen = sizeof(h_outer.ip);
}
/* add room between mac and network header */
if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
return TC_ACT_SHOT;
/* prepare new outer network header */
h_outer.ip = iph_inner;
h_outer.ip.tot_len = bpf_htons(olen +
bpf_htons(h_outer.ip.tot_len));
if (with_gre) {
h_outer.ip.protocol = IPPROTO_GRE;
h_outer.protocol = bpf_htons(ETH_P_IP);
h_outer.flags = 0;
} else {
h_outer.ip.protocol = IPPROTO_IPIP;
}
set_ipv4_csum((void *)&h_outer.ip);
/* store new outer network header */
if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
BPF_F_INVALIDATE_HASH) < 0)
return TC_ACT_SHOT;
return TC_ACT_OK;
}
static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
{
struct ipv6hdr iph_inner;
struct grev6hdr h_outer;
struct tcphdr tcph;
__u64 flags;
int olen;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
sizeof(iph_inner)) < 0)
return TC_ACT_OK;
/* filter only packets we want */
if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
&tcph, sizeof(tcph)) < 0)
return TC_ACT_OK;
if (tcph.dest != __bpf_constant_htons(cfg_port))
return TC_ACT_OK;
flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
if (with_gre) {
flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
olen = sizeof(h_outer);
} else {
olen = sizeof(h_outer.ip);
}
/* add room between mac and network header */
if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
return TC_ACT_SHOT;
/* prepare new outer network header */
h_outer.ip = iph_inner;
h_outer.ip.payload_len = bpf_htons(olen +
bpf_ntohs(h_outer.ip.payload_len));
if (with_gre) {
h_outer.ip.nexthdr = IPPROTO_GRE;
h_outer.protocol = bpf_htons(ETH_P_IPV6);
h_outer.flags = 0;
} else {
h_outer.ip.nexthdr = IPPROTO_IPV6;
}
/* store new outer network header */
if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
BPF_F_INVALIDATE_HASH) < 0)
return TC_ACT_SHOT;
return TC_ACT_OK;
}
SEC("encap_ipip")
int __encap_ipip(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
return encap_ipv4(skb, false);
else
return TC_ACT_OK;
}
SEC("encap_gre")
int __encap_gre(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
return encap_ipv4(skb, true);
else
return TC_ACT_OK;
}
SEC("encap_ip6tnl")
int __encap_ip6tnl(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
return encap_ipv6(skb, false);
else
return TC_ACT_OK;
}
SEC("encap_ip6gre")
int __encap_ip6gre(struct __sk_buff *skb)
{
if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
return encap_ipv6(skb, true);
else
return TC_ACT_OK;
}
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{
char buf[sizeof(struct grev6hdr)];
int olen;
switch (proto) {
case IPPROTO_IPIP:
case IPPROTO_IPV6:
olen = len;
break;
case IPPROTO_GRE:
olen = len + 4 /* gre hdr */;
break;
default:
return TC_ACT_OK;
}
if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
BPF_F_ADJ_ROOM_FIXED_GSO))
return TC_ACT_SHOT;
return TC_ACT_OK;
}
static int decap_ipv4(struct __sk_buff *skb)
{
struct iphdr iph_outer;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
sizeof(iph_outer)) < 0)
return TC_ACT_OK;
if (iph_outer.ihl != 5)
return TC_ACT_OK;
return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
iph_outer.protocol);
}
static int decap_ipv6(struct __sk_buff *skb)
{
struct ipv6hdr iph_outer;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
sizeof(iph_outer)) < 0)
return TC_ACT_OK;
return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
iph_outer.nexthdr);
}
SEC("decap")
int decap_f(struct __sk_buff *skb)
{
switch (skb->protocol) {
case __bpf_constant_htons(ETH_P_IP):
return decap_ipv4(skb);
case __bpf_constant_htons(ETH_P_IPV6):
return decap_ipv6(skb);
default:
/* does not match, ignore */
return TC_ACT_OK;
}
}
char __license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# In-place tunneling
# must match the port that the bpf program filters on
readonly port=8000
readonly ns_prefix="ns-$$-"
readonly ns1="${ns_prefix}1"
readonly ns2="${ns_prefix}2"
readonly ns1_v4=192.168.1.1
readonly ns2_v4=192.168.1.2
readonly ns1_v6=fd::1
readonly ns2_v6=fd::2
readonly infile="$(mktemp)"
readonly outfile="$(mktemp)"
setup() {
ip netns add "${ns1}"
ip netns add "${ns2}"
ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \
peer name veth2 mtu 1500 netns "${ns2}"
ip netns exec "${ns1}" ethtool -K veth1 tso off
ip -netns "${ns1}" link set veth1 up
ip -netns "${ns2}" link set veth2 up
ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1
ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2
ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad
ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad
# clamp route to reserve room for tunnel headers
ip -netns "${ns1}" -4 route flush table main
ip -netns "${ns1}" -6 route flush table main
ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1
ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1
sleep 1
dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
}
cleanup() {
ip netns del "${ns2}"
ip netns del "${ns1}"
if [[ -f "${outfile}" ]]; then
rm "${outfile}"
fi
if [[ -f "${infile}" ]]; then
rm "${infile}"
fi
}
server_listen() {
ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" &
server_pid=$!
sleep 0.2
}
client_connect() {
ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}"
echo $?
}
verify_data() {
wait "${server_pid}"
# sha1sum returns two fields [sha1] [filepath]
# convert to bash array and access first elem
insum=($(sha1sum ${infile}))
outsum=($(sha1sum ${outfile}))
if [[ "${insum[0]}" != "${outsum[0]}" ]]; then
echo "data mismatch"
exit 1
fi
}
set -e
# no arguments: automated test, run all
if [[ "$#" -eq "0" ]]; then
echo "ipip"
$0 ipv4 ipip 100
echo "ip6ip6"
$0 ipv6 ip6tnl 100
echo "ip gre"
$0 ipv4 gre 100
echo "ip6 gre"
$0 ipv6 ip6gre 100
echo "ip gre gso"
$0 ipv4 gre 2000
echo "ip6 gre gso"
$0 ipv6 ip6gre 2000
echo "OK. All tests passed"
exit 0
fi
if [[ "$#" -ne "3" ]]; then
echo "Usage: $0"
echo " or: $0 <ipv4|ipv6> <tuntype> <data_len>"
exit 1
fi
case "$1" in
"ipv4")
readonly addr1="${ns1_v4}"
readonly addr2="${ns2_v4}"
readonly netcat_opt=-4
;;
"ipv6")
readonly addr1="${ns1_v6}"
readonly addr2="${ns2_v6}"
readonly netcat_opt=-6
;;
*)
echo "unknown arg: $1"
exit 1
;;
esac
readonly tuntype=$2
readonly datalen=$3
echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}"
trap cleanup EXIT
setup
# basic communication works
echo "test basic connectivity"
server_listen
client_connect
verify_data
# clientside, insert bpf program to encap all TCP to port ${port}
# client can no longer connect
ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
ip netns exec "${ns1}" tc filter add dev veth1 egress \
bpf direct-action object-file ./test_tc_tunnel.o \
section "encap_${tuntype}"
echo "test bpf encap without decap (expect failure)"
server_listen
! client_connect
# serverside, insert decap module
# server is still running
# client can connect again
ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \
remote "${addr1}" local "${addr2}"
ip netns exec "${ns2}" ip link set dev testtun0 up
echo "test bpf encap with tunnel device decap"
client_connect
verify_data
# serverside, use BPF for decap
ip netns exec "${ns2}" ip link del dev testtun0
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
ip netns exec "${ns2}" tc filter add dev veth2 ingress \
bpf direct-action object-file ./test_tc_tunnel.o section decap
server_listen
echo "test bpf encap with bpf decap"
client_connect
verify_data
echo OK
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment