Commit 629a0025 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf-tc-tunneling'

Willem de Bruijn says:

====================
BPF allows for dynamic tunneling, choosing the tunnel destination and
features on-demand. Extend bpf_skb_adjust_room to allow for efficient
tunneling at the TC hooks.

Most features are required for large packets with GSO, as these will
be modified after this patch.

Patch 1
  is a performance optimization, avoiding an unnecessary unclone
  for the TCP hot path.

Patches 2..6
  introduce a regression test. These can be squashed, but the code is
  arguably more readable when gradually expanding the feature set.

Patch 7
  is a performance optimization, avoid copying network headers
  that are going to be overwritten. This also simplifies the bpf
  program.

Patch 8
  reenables bpf_skb_adjust_room for UDP packets.

Patch 9
  configures skb tunneling metadata analogous to tunnel devices.

Patches 10..13
  expand the regression test to make use of the new features and
  enable the GSO testcases.

Changes
  v1->v2
  - move BPF_F_ADJ_ROOM_MASK out of uapi as it can be expanded
  - document new flags
  - in tests replace netcat -q flag with coreutils timeout:
      the -q flag is not supported in all netcat versions
  v2->v3
  - move BPF_F_ADJ_ROOM_ENCAP_L3_MASK out of uapi as it has no
    use in userspace
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents f6827526 75a1a9fa
......@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*.
*
* There is a single supported mode at this time:
* There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
*
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* The following flags are supported at this time:
*
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
* Use with ENCAP_L3 flags to further specify the tunnel type.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
};
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
......@@ -2963,42 +2963,113 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
}
}
static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
unsigned int gso_type = SKB_GSO_DODGY;
u16 mac_len, inner_net, inner_trans;
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
return -ENOTSUPP;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
/* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_cow(skb, len_diff);
ret = skb_cow_head(skb, len_diff);
if (unlikely(ret < 0))
return ret;
if (encap) {
if (skb->protocol != htons(ETH_P_IP) &&
skb->protocol != htons(ETH_P_IPV6))
return -ENOTSUPP;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
return -EINVAL;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
return -EINVAL;
if (skb->encapsulation)
return -EALREADY;
mac_len = skb->network_header - skb->mac_header;
inner_net = skb->network_header;
inner_trans = skb->transport_header;
}
ret = bpf_skb_net_hdr_push(skb, off, len_diff);
if (unlikely(ret < 0))
return ret;
if (encap) {
/* inner mac == inner_net on l3 encap */
skb->inner_mac_header = inner_net;
skb->inner_network_header = inner_net;
skb->inner_transport_header = inner_trans;
skb_set_inner_protocol(skb, skb->protocol);
skb->encapsulation = 1;
skb_set_network_header(skb, mac_len);
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
gso_type |= SKB_GSO_UDP_TUNNEL;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
gso_type |= SKB_GSO_GRE;
else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
gso_type |= SKB_GSO_IPXIP6;
else
gso_type |= SKB_GSO_IPXIP4;
if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
sizeof(struct ipv6hdr) :
sizeof(struct iphdr);
skb_set_transport_header(skb, mac_len + nh_len);
}
}
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header grow, MSS needs to be downgraded. */
skb_decrease_gso_size(shinfo, len_diff);
if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0;
}
return 0;
}
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags)
{
u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb))
return -ENOTSUPP;
if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
/* udp gso_size delineates datagrams, only allow if fixed */
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
return -ENOTSUPP;
}
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
......@@ -3012,7 +3083,9 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header shrink, MSS can be upgraded. */
skb_increase_gso_size(shinfo, len_diff);
if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
skb_increase_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
......@@ -3027,49 +3100,50 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC;
}
static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
bool trans_same = skb->transport_header == skb->network_header;
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
u32 len_max = __bpf_skb_max_len(skb);
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
u32 off;
int ret;
if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
if (unlikely(proto != htons(ETH_P_IP) &&
proto != htons(ETH_P_IPV6)))
return -ENOTSUPP;
off = skb_mac_header_len(skb);
switch (mode) {
case BPF_ADJ_ROOM_NET:
off += bpf_skb_net_base_len(skb);
break;
case BPF_ADJ_ROOM_MAC:
break;
default:
return -ENOTSUPP;
}
len_cur = skb->len - skb_network_offset(skb);
if (skb_transport_header_was_set(skb) && !trans_same)
len_cur = skb_network_header_len(skb);
if ((shrink && (len_diff_abs >= len_cur ||
len_cur - len_diff_abs < len_min)) ||
(!shrink && (skb->len + len_diff_abs > len_max &&
!skb_is_gso(skb))))
return -ENOTSUPP;
ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
bpf_skb_net_grow(skb, len_diff_abs);
ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
bpf_skb_net_grow(skb, off, len_diff_abs, flags);
bpf_compute_data_pointers(skb);
return ret;
}
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
if (unlikely(flags))
return -EINVAL;
if (likely(mode == BPF_ADJ_ROOM_NET))
return bpf_skb_adjust_net(skb, len_diff);
return -ENOTSUPP;
}
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
.func = bpf_skb_adjust_room,
.gpl_only = false,
......
......@@ -1478,13 +1478,27 @@ union bpf_attr {
* Grow or shrink the room for data in the packet associated to
* *skb* by *len_diff*, and according to the selected *mode*.
*
* There is a single supported mode at this time:
* There are two supported modes at this time:
*
* * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer
* (room space is added or removed below the layer 2 header).
*
* * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer
* (room space is added or removed below the layer 3 header).
*
* All values for *flags* are reserved for future usage, and must
* be left at zero.
* The following flags are supported at this time:
*
* * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size.
* Adjusting mss in this way is not allowed for datagrams.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**:
* * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**:
* Any new space is reserved to hold a tunnel header.
* Configure skb offsets and other fields accordingly.
*
* * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**:
* * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**:
* Use with ENCAP_L3 flags to further specify the tunnel type.
*
* A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers
......@@ -2624,9 +2638,18 @@ enum bpf_func_id {
/* Current network namespace */
#define BPF_F_CURRENT_NETNS (-1L)
/* BPF_FUNC_skb_adjust_room flags. */
#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1)
#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2)
#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3)
#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4)
/* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET,
BPF_ADJ_ROOM_MAC,
};
/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */
......
......@@ -52,7 +52,8 @@ TEST_PROGS := test_kmod.sh \
test_flow_dissector.sh \
test_xdp_vlan.sh \
test_lwt_ip_encap.sh \
test_tcp_check_syncookie.sh
test_tcp_check_syncookie.sh \
test_tc_tunnel.sh
TEST_PROGS_EXTENDED := with_addr.sh \
with_tunnels.sh \
......
......@@ -23,3 +23,5 @@ CONFIG_LWTUNNEL=y
CONFIG_BPF_STREAM_PARSER=y
CONFIG_XDP_SOCKETS=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_IPV6_TUNNEL=y
CONFIG_IPV6_GRE=y
// SPDX-License-Identifier: GPL-2.0
/* In-place tunneling */
#include <stdbool.h>
#include <string.h>
#include <linux/stddef.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <linux/pkt_cls.h>
#include <linux/types.h>
#include "bpf_endian.h"
#include "bpf_helpers.h"
/* TCP destination port the encap programs match on; must stay in sync
 * with the port used by the companion shell test.
 */
static const int cfg_port = 8000;

/* Outer IPv4 + GRE header, exactly as written into the packet. */
struct grev4hdr {
	struct iphdr ip;
	__be16 flags;
	__be16 protocol;
} __attribute__((packed));

/* Outer IPv6 + GRE header, exactly as written into the packet. */
struct grev6hdr {
	struct ipv6hdr ip;
	__be16 flags;
	__be16 protocol;
} __attribute__((packed));
/* Recompute the IPv4 header checksum in place.
 *
 * Zeroes the check field, sums the header as ten 16-bit words, folds
 * the carry once, and stores the one's complement of the result.
 */
static __always_inline void set_ipv4_csum(struct iphdr *iph)
{
	const __u16 *word = (const __u16 *)iph;
	__u32 sum = 0;
	int idx;

	iph->check = 0;

#pragma clang loop unroll(full)
	for (idx = 0; idx < sizeof(*iph) >> 1; idx++)
		sum += word[idx];

	iph->check = ~((sum & 0xffff) + (sum >> 16));
}
/* Encapsulate an outgoing IPv4 TCP packet destined to cfg_port in an
 * outer IPv4 header: plain IPIP, or GRE when with_gre is set.
 *
 * Returns TC_ACT_OK for packets left untouched (not ours, or a header
 * load failed) and TC_ACT_SHOT when encapsulation itself fails.
 */
static __always_inline int encap_ipv4(struct __sk_buff *skb, bool with_gre)
{
	struct grev4hdr h_outer;
	struct iphdr iph_inner;
	struct tcphdr tcph;
	__u64 flags;
	int olen;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
			       sizeof(iph_inner)) < 0)
		return TC_ACT_OK;

	/* filter only packets we want: no IP options, TCP payload */
	if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP)
		return TC_ACT_OK;

	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
			       &tcph, sizeof(tcph)) < 0)
		return TC_ACT_OK;

	if (tcph.dest != __bpf_constant_htons(cfg_port))
		return TC_ACT_OK;

	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4;
	if (with_gre) {
		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
		olen = sizeof(h_outer);
	} else {
		olen = sizeof(h_outer.ip);
	}

	/* add room between mac and network header */
	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
		return TC_ACT_SHOT;

	/* prepare new outer network header: copy the inner header, then
	 * grow tot_len by the encap overhead. Convert from network order
	 * with ntohs before the addition (was htons; same byte swap for
	 * 16-bit values, but ntohs states the intent and matches
	 * encap_ipv6).
	 */
	h_outer.ip = iph_inner;
	h_outer.ip.tot_len = bpf_htons(olen +
				       bpf_ntohs(h_outer.ip.tot_len));
	if (with_gre) {
		h_outer.ip.protocol = IPPROTO_GRE;
		h_outer.protocol = bpf_htons(ETH_P_IP);
		h_outer.flags = 0;
	} else {
		h_outer.ip.protocol = IPPROTO_IPIP;
	}

	set_ipv4_csum((void *)&h_outer.ip);

	/* store new outer network header */
	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
				BPF_F_INVALIDATE_HASH) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
/* IPv6 counterpart of encap_ipv4: wrap TCP traffic to cfg_port in an
 * outer IPv6 header (ip6tnl), or ip6gre when with_gre is set.
 */
static __always_inline int encap_ipv6(struct __sk_buff *skb, bool with_gre)
{
	struct grev6hdr h_outer;
	struct ipv6hdr iph_inner;
	struct tcphdr tcph;
	__u64 flags;
	int olen;

	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
			       sizeof(iph_inner)) < 0)
		return TC_ACT_OK;

	/* filter only packets we want */
	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
			       &tcph, sizeof(tcph)) < 0)
		return TC_ACT_OK;
	if (tcph.dest != __bpf_constant_htons(cfg_port))
		return TC_ACT_OK;

	olen = with_gre ? sizeof(h_outer) : sizeof(h_outer.ip);
	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
	if (with_gre)
		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;

	/* add room between mac and network header */
	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
		return TC_ACT_SHOT;

	/* prepare new outer network header: copy inner, then grow
	 * payload_len by the encap overhead
	 */
	h_outer.ip = iph_inner;
	h_outer.ip.payload_len = bpf_htons(olen +
					   bpf_ntohs(h_outer.ip.payload_len));
	if (with_gre) {
		h_outer.ip.nexthdr = IPPROTO_GRE;
		h_outer.protocol = bpf_htons(ETH_P_IPV6);
		h_outer.flags = 0;
	} else {
		h_outer.ip.nexthdr = IPPROTO_IPV6;
	}

	/* store new outer network header */
	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
				BPF_F_INVALIDATE_HASH) < 0)
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
/* TC egress entry point: IPIP-encapsulate IPv4 traffic. */
SEC("encap_ipip")
int __encap_ipip(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
		return TC_ACT_OK;

	return encap_ipv4(skb, false);
}
/* TC egress entry point: GRE-encapsulate IPv4 traffic. */
SEC("encap_gre")
int __encap_gre(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
		return TC_ACT_OK;

	return encap_ipv4(skb, true);
}
/* TC egress entry point: ip6tnl-encapsulate IPv6 traffic. */
SEC("encap_ip6tnl")
int __encap_ip6tnl(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
		return TC_ACT_OK;

	return encap_ipv6(skb, false);
}
/* TC egress entry point: ip6gre-encapsulate IPv6 traffic. */
SEC("encap_ip6gre")
int __encap_ip6gre(struct __sk_buff *skb)
{
	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
		return TC_ACT_OK;

	return encap_ipv6(skb, true);
}
/* Shared decap tail: shrink the packet by the outer network header
 * (plus the 4-byte GRE header for GRE), exposing the inner packet.
 *
 * @off:   offset of the outer network header; currently unused, kept
 *         so the caller interface stays unchanged
 * @len:   length of the outer IPv4/IPv6 header
 * @proto: outer protocol field, selects the encap type to strip
 *
 * Removed an unused local buffer (char buf[sizeof(struct grev6hdr)])
 * that the original declared but never referenced.
 */
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{
	int olen;

	switch (proto) {
	case IPPROTO_IPIP:
	case IPPROTO_IPV6:
		olen = len;
		break;
	case IPPROTO_GRE:
		olen = len + 4 /* gre hdr */;
		break;
	default:
		/* not an encap type we recognize; leave packet alone */
		return TC_ACT_OK;
	}

	/* shrink the room between mac and network header */
	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO))
		return TC_ACT_SHOT;

	return TC_ACT_OK;
}
static int decap_ipv4(struct __sk_buff *skb)
{
struct iphdr iph_outer;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
sizeof(iph_outer)) < 0)
return TC_ACT_OK;
if (iph_outer.ihl != 5)
return TC_ACT_OK;
return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
iph_outer.protocol);
}
static int decap_ipv6(struct __sk_buff *skb)
{
struct ipv6hdr iph_outer;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_outer,
sizeof(iph_outer)) < 0)
return TC_ACT_OK;
return decap_internal(skb, ETH_HLEN, sizeof(iph_outer),
iph_outer.nexthdr);
}
/* TC ingress entry point: dispatch decap by ethertype. */
SEC("decap")
int decap_f(struct __sk_buff *skb)
{
	if (skb->protocol == __bpf_constant_htons(ETH_P_IP))
		return decap_ipv4(skb);
	if (skb->protocol == __bpf_constant_htons(ETH_P_IPV6))
		return decap_ipv6(skb);

	/* does not match, ignore */
	return TC_ACT_OK;
}
/* license declaration, emitted into the "license" ELF section */
char __license[] SEC("license") = "GPL";
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# In-place tunneling

# must match the port that the bpf program filters on
readonly port=8000

readonly ns_prefix="ns-$$-"   # unique per run: derived from the shell pid
readonly ns1="${ns_prefix}1"  # client-side namespace
readonly ns2="${ns_prefix}2"  # server-side namespace

readonly ns1_v4=192.168.1.1
readonly ns2_v4=192.168.1.2
readonly ns1_v6=fd::1
readonly ns2_v6=fd::2

# temp files: payload to send and payload the server received
readonly infile="$(mktemp)"
readonly outfile="$(mktemp)"
# Build the test topology: two namespaces joined by a veth pair, with
# v4/v6 addresses, client routes clamped below 1500 to leave headroom
# for tunnel headers, and a random payload file of ${datalen} bytes.
setup() {
	ip netns add "${ns1}"
	ip netns add "${ns2}"

	ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \
	      peer name veth2 mtu 1500 netns "${ns2}"

	# NOTE(review): tso is disabled on the sender only — presumably so
	# segmentation happens in software after the bpf program runs; confirm
	ip netns exec "${ns1}" ethtool -K veth1 tso off

	ip -netns "${ns1}" link set veth1 up
	ip -netns "${ns2}" link set veth2 up

	ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1
	ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2
	ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad
	ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad

	# clamp route to reserve room for tunnel headers
	ip -netns "${ns1}" -4 route flush table main
	ip -netns "${ns1}" -6 route flush table main
	ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1476 dev veth1
	ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1456 dev veth1

	sleep 1

	dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
}
# Tear down both namespaces and remove the temp files. Installed as the
# EXIT trap, so it also runs when set -e aborts the script early.
cleanup() {
	ip netns del "${ns2}"
	ip netns del "${ns1}"

	if [[ -f "${outfile}" ]]; then
		rm "${outfile}"
	fi
	if [[ -f "${infile}" ]]; then
		rm "${infile}"
	fi
}
# Start a background netcat listener in ns2 that writes whatever it
# receives to ${outfile}; records its pid in server_pid for verify_data.
server_listen() {
	ip netns exec "${ns2}" nc "${netcat_opt}" -l -p "${port}" > "${outfile}" &
	server_pid=$!
	# give the listener a moment to bind before the client connects
	sleep 0.2
}
# Send ${infile} from ns1 to the server and echo the client exit status.
# coreutils timeout bounds the run because the netcat -q flag is not
# supported in all netcat versions.
client_connect() {
	ip netns exec "${ns1}" timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" < "${infile}"
	echo $?
}
# Wait for the background server to finish, then compare sha1 digests of
# the sent and received files; exits 1 on mismatch.
verify_data() {
	wait "${server_pid}"
	# sha1sum returns two fields [sha1] [filepath]
	# convert to bash array and access first elem
	insum=($(sha1sum ${infile}))
	outsum=($(sha1sum ${outfile}))
	if [[ "${insum[0]}" != "${outsum[0]}" ]]; then
		echo "data mismatch"
		exit 1
	fi
}
# abort on the first failing command; cleanup still runs via the EXIT trap
set -e

# no arguments: automated test, run all
# (the 2000-byte cases exceed the clamped route MTU set in setup)
if [[ "$#" -eq "0" ]]; then
	echo "ipip"
	$0 ipv4 ipip 100

	echo "ip6ip6"
	$0 ipv6 ip6tnl 100

	echo "ip gre"
	$0 ipv4 gre 100

	echo "ip6 gre"
	$0 ipv6 ip6gre 100

	echo "ip gre gso"
	$0 ipv4 gre 2000

	echo "ip6 gre gso"
	$0 ipv6 ip6gre 2000

	echo "OK. All tests passed"
	exit 0
fi

if [[ "$#" -ne "3" ]]; then
	echo "Usage: $0"
	echo " or: $0 <ipv4|ipv6> <tuntype> <data_len>"
	exit 1
fi

# select address family specific endpoints and netcat option
case "$1" in
"ipv4")
	readonly addr1="${ns1_v4}"
	readonly addr2="${ns2_v4}"
	readonly netcat_opt=-4
	;;
"ipv6")
	readonly addr1="${ns1_v6}"
	readonly addr2="${ns2_v6}"
	readonly netcat_opt=-6
	;;
*)
	echo "unknown arg: $1"
	exit 1
	;;
esac

readonly tuntype=$2
readonly datalen=$3

echo "encap ${addr1} to ${addr2}, type ${tuntype}, len ${datalen}"

trap cleanup EXIT

setup

# basic communication works
echo "test basic connectivity"
server_listen
client_connect
verify_data

# clientside, insert bpf program to encap all TCP to port ${port}
# client can no longer connect
ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
ip netns exec "${ns1}" tc filter add dev veth1 egress \
	bpf direct-action object-file ./test_tc_tunnel.o \
	section "encap_${tuntype}"
echo "test bpf encap without decap (expect failure)"
server_listen
! client_connect

# serverside, insert decap module
# server is still running
# client can connect again
ip netns exec "${ns2}" ip link add dev testtun0 type "${tuntype}" \
	remote "${addr1}" local "${addr2}"
ip netns exec "${ns2}" ip link set dev testtun0 up
echo "test bpf encap with tunnel device decap"
client_connect
verify_data

# serverside, use BPF for decap
ip netns exec "${ns2}" ip link del dev testtun0
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
ip netns exec "${ns2}" tc filter add dev veth2 ingress \
	bpf direct-action object-file ./test_tc_tunnel.o section decap
server_listen
echo "test bpf encap with bpf decap"
client_connect
verify_data

echo OK
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment