Commit 81bbbb69 authored by Martin KaFai Lau

Merge branch 'bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()'

Ziyang Xuan says:

====================

Add ipip6 and ip6ip decap support for bpf_skb_adjust_room().
The main use case is using cls_bpf on the ingress hook to decapsulate
IPv4 over IPv6 and IPv6 over IPv4 tunnel packets.

And add ipip6 and ip6ip decap testcases to verify that
bpf_skb_adjust_room() correctly decapsulates ipip6 and ip6ip
tunnel packets.

$./test_tc_tunnel.sh
ipip
encap 192.168.1.1 to 192.168.1.2, type ipip, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
ipip6
encap 192.168.1.1 to 192.168.1.2, type ipip6, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
ip6ip6
encap fd::1 to fd::2, type ip6tnl, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
sit
encap fd::1 to fd::2, type sit, mac none len 100
test basic connectivity
0
test bpf encap without decap (expect failure)
Ncat: TIMEOUT.
1
test bpf encap with tunnel device decap
0
test bpf encap with bpf decap
0
OK
...
OK. All tests passed

v3:
  - Fix compilation failure of selftests/bpf.
  - Combine two new branches in bpf_skb_adjust_room().
  - Simplify description for new flags BPF_F_ADJ_ROOM_DECAP_L3_IP*.

v2:
  - Use decap flags to indicate the new IP header.
    Do not rely on skb->encapsulation.
====================
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
parents 1c48391b 7105f76f
...@@ -2647,6 +2647,11 @@ union bpf_attr { ...@@ -2647,6 +2647,11 @@ union bpf_attr {
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
* L2 type as Ethernet. * L2 type as Ethernet.
* *
* * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
* **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
* Indicate the new IP header version after decapsulating the outer
* IP header. Used when the inner and outer IP versions are different.
*
* A call to this helper is susceptible to change the underlying * A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers * packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be * previously done by the verifier are invalidated and must be
...@@ -5807,6 +5812,8 @@ enum { ...@@ -5807,6 +5812,8 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
}; };
enum { enum {
......
...@@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) ...@@ -3381,13 +3381,17 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_F_ADJ_ROOM_ENCAP_L2( \
BPF_ADJ_ROOM_ENCAP_L2_MASK)) BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
BPF_F_ADJ_ROOM_DECAP_L3_MASK)
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
u64 flags) u64 flags)
...@@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, ...@@ -3501,6 +3505,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
int ret; int ret;
if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
BPF_F_ADJ_ROOM_DECAP_L3_MASK |
BPF_F_ADJ_ROOM_NO_CSUM_RESET))) BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
return -EINVAL; return -EINVAL;
...@@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, ...@@ -3519,6 +3524,14 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
skb->protocol = htons(ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
skb->protocol = htons(ETH_P_IP);
if (skb_is_gso(skb)) { if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb); struct skb_shared_info *shinfo = skb_shinfo(skb);
...@@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ...@@ -3608,6 +3621,22 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
return -ENOTSUPP; return -ENOTSUPP;
} }
if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
if (!shrink)
return -EINVAL;
switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
len_min = sizeof(struct iphdr);
break;
case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
len_min = sizeof(struct ipv6hdr);
break;
default:
return -EINVAL;
}
}
len_cur = skb->len - skb_network_offset(skb); len_cur = skb->len - skb_network_offset(skb);
if ((shrink && (len_diff_abs >= len_cur || if ((shrink && (len_diff_abs >= len_cur ||
len_cur - len_diff_abs < len_min)) || len_cur - len_diff_abs < len_min)) ||
......
...@@ -2647,6 +2647,11 @@ union bpf_attr { ...@@ -2647,6 +2647,11 @@ union bpf_attr {
* Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
* L2 type as Ethernet. * L2 type as Ethernet.
* *
* * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**,
* **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**:
* Indicate the new IP header version after decapsulating the outer
* IP header. Used when the inner and outer IP versions are different.
*
* A call to this helper is susceptible to change the underlying * A call to this helper is susceptible to change the underlying
* packet buffer. Therefore, at load time, all checks on pointers * packet buffer. Therefore, at load time, all checks on pointers
* previously done by the verifier are invalidated and must be * previously done by the verifier are invalidated and must be
...@@ -5807,6 +5812,8 @@ enum { ...@@ -5807,6 +5812,8 @@ enum {
BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4),
BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5),
BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6),
BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7),
BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8),
}; };
enum { enum {
......
...@@ -38,6 +38,10 @@ static const int cfg_udp_src = 20000; ...@@ -38,6 +38,10 @@ static const int cfg_udp_src = 20000;
#define VXLAN_FLAGS 0x8 #define VXLAN_FLAGS 0x8
#define VXLAN_VNI 1 #define VXLAN_VNI 1
#ifndef NEXTHDR_DEST
#define NEXTHDR_DEST 60
#endif
/* MPLS label 1000 with S bit (last label) set and ttl of 255. */ /* MPLS label 1000 with S bit (last label) set and ttl of 255. */
static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 | static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
MPLS_LS_S_MASK | 0xff); MPLS_LS_S_MASK | 0xff);
...@@ -363,6 +367,61 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, ...@@ -363,6 +367,61 @@ static __always_inline int __encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
return TC_ACT_OK; return TC_ACT_OK;
} }
static int encap_ipv6_ipip6(struct __sk_buff *skb)
{
struct iphdr iph_inner;
struct v6hdr h_outer;
struct tcphdr tcph;
struct ethhdr eth;
__u64 flags;
int olen;
if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
sizeof(iph_inner)) < 0)
return TC_ACT_OK;
/* filter only packets we want */
if (bpf_skb_load_bytes(skb, ETH_HLEN + (iph_inner.ihl << 2),
&tcph, sizeof(tcph)) < 0)
return TC_ACT_OK;
if (tcph.dest != __bpf_constant_htons(cfg_port))
return TC_ACT_OK;
olen = sizeof(h_outer.ip);
flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
/* add room between mac and network header */
if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
return TC_ACT_SHOT;
/* prepare new outer network header */
memset(&h_outer.ip, 0, sizeof(h_outer.ip));
h_outer.ip.version = 6;
h_outer.ip.hop_limit = iph_inner.ttl;
h_outer.ip.saddr.s6_addr[1] = 0xfd;
h_outer.ip.saddr.s6_addr[15] = 1;
h_outer.ip.daddr.s6_addr[1] = 0xfd;
h_outer.ip.daddr.s6_addr[15] = 2;
h_outer.ip.payload_len = iph_inner.tot_len;
h_outer.ip.nexthdr = IPPROTO_IPIP;
/* store new outer network header */
if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
BPF_F_INVALIDATE_HASH) < 0)
return TC_ACT_SHOT;
/* update eth->h_proto */
if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
return TC_ACT_SHOT;
eth.h_proto = bpf_htons(ETH_P_IPV6);
if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0)
return TC_ACT_SHOT;
return TC_ACT_OK;
}
static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto, static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
__u16 l2_proto) __u16 l2_proto)
{ {
...@@ -461,6 +520,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb) ...@@ -461,6 +520,15 @@ int __encap_ip6tnl_none(struct __sk_buff *skb)
return TC_ACT_OK; return TC_ACT_OK;
} }
/*
 * Program entry for ipip6 encapsulation without an extra L2 header:
 * IPv4 frames are handed to encap_ipv6_ipip6(), anything else is passed
 * through with TC_ACT_OK.
 */
SEC("encap_ipip6_none")
int __encap_ipip6_none(struct __sk_buff *skb)
{
	/* Non-IPv4 traffic is not a candidate for ipip6 encapsulation. */
	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
		return TC_ACT_OK;

	return encap_ipv6_ipip6(skb);
}
SEC("encap_ip6gre_none") SEC("encap_ip6gre_none")
int __encap_ip6gre_none(struct __sk_buff *skb) int __encap_ip6gre_none(struct __sk_buff *skb)
{ {
...@@ -528,13 +596,33 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb) ...@@ -528,13 +596,33 @@ int __encap_ip6vxlan_eth(struct __sk_buff *skb)
static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
{ {
__u64 flags = BPF_F_ADJ_ROOM_FIXED_GSO;
struct ipv6_opt_hdr ip6_opt_hdr;
struct gre_hdr greh; struct gre_hdr greh;
struct udphdr udph; struct udphdr udph;
int olen = len; int olen = len;
switch (proto) { switch (proto) {
case IPPROTO_IPIP: case IPPROTO_IPIP:
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
break;
case IPPROTO_IPV6: case IPPROTO_IPV6:
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
break;
case NEXTHDR_DEST:
if (bpf_skb_load_bytes(skb, off + len, &ip6_opt_hdr,
sizeof(ip6_opt_hdr)) < 0)
return TC_ACT_OK;
switch (ip6_opt_hdr.nexthdr) {
case IPPROTO_IPIP:
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV4;
break;
case IPPROTO_IPV6:
flags |= BPF_F_ADJ_ROOM_DECAP_L3_IPV6;
break;
default:
return TC_ACT_OK;
}
break; break;
case IPPROTO_GRE: case IPPROTO_GRE:
olen += sizeof(struct gre_hdr); olen += sizeof(struct gre_hdr);
...@@ -569,8 +657,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto) ...@@ -569,8 +657,7 @@ static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
return TC_ACT_OK; return TC_ACT_OK;
} }
if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC, flags))
BPF_F_ADJ_ROOM_FIXED_GSO))
return TC_ACT_SHOT; return TC_ACT_SHOT;
return TC_ACT_OK; return TC_ACT_OK;
......
...@@ -100,6 +100,9 @@ if [[ "$#" -eq "0" ]]; then ...@@ -100,6 +100,9 @@ if [[ "$#" -eq "0" ]]; then
echo "ipip" echo "ipip"
$0 ipv4 ipip none 100 $0 ipv4 ipip none 100
echo "ipip6"
$0 ipv4 ipip6 none 100
echo "ip6ip6" echo "ip6ip6"
$0 ipv6 ip6tnl none 100 $0 ipv6 ip6tnl none 100
...@@ -224,6 +227,9 @@ elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then ...@@ -224,6 +227,9 @@ elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then elif [[ "$tuntype" =~ "vxlan" && "$mac" == "eth" ]]; then
ttype="vxlan" ttype="vxlan"
targs="id 1 dstport 8472 udp6zerocsumrx" targs="id 1 dstport 8472 udp6zerocsumrx"
elif [[ "$tuntype" == "ipip6" ]]; then
ttype="ip6tnl"
targs=""
else else
ttype=$tuntype ttype=$tuntype
targs="" targs=""
...@@ -233,6 +239,9 @@ fi ...@@ -233,6 +239,9 @@ fi
if [[ "${tuntype}" == "sit" ]]; then if [[ "${tuntype}" == "sit" ]]; then
link_addr1="${ns1_v4}" link_addr1="${ns1_v4}"
link_addr2="${ns2_v4}" link_addr2="${ns2_v4}"
elif [[ "${tuntype}" == "ipip6" ]]; then
link_addr1="${ns1_v6}"
link_addr2="${ns2_v6}"
else else
link_addr1="${addr1}" link_addr1="${addr1}"
link_addr2="${addr2}" link_addr2="${addr2}"
...@@ -287,12 +296,6 @@ else ...@@ -287,12 +296,6 @@ else
server_listen server_listen
fi fi
# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3.
if [[ "${tuntype}" == "sit" ]]; then
echo OK
exit 0
fi
# serverside, use BPF for decap # serverside, use BPF for decap
ip netns exec "${ns2}" ip link del dev testtun0 ip netns exec "${ns2}" ip link del dev testtun0
ip netns exec "${ns2}" tc qdisc add dev veth2 clsact ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment