Commit 943e398d authored by Alexei Starovoitov

Merge branch 'flow_dissector-input-flags'

Stanislav Fomichev says:

====================
The C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible. The BPF
flow dissector always parses as deep as possible, which is sub-optimal.
Pass the input flags to the BPF flow dissector as well so it can make the
same decisions.

Series outline:
* remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag
* export FLOW_DISSECTOR_F_XXX flags as uapi and pass them to BPF
  flow dissector
* add documentation for the exported flags
* support input flags in BPF_PROG_TEST_RUN via ctx_{in,out}
* sync uapi to tools
* support FLOW_DISSECTOR_F_PARSE_1ST_FRAG in selftest
* support FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL in kernel and selftest
* support FLOW_DISSECTOR_F_STOP_AT_ENCAP in selftest

Pros:
* makes the BPF flow dissector faster by not burning extra cycles
* existing BPF progs continue to work by ignoring the flags and always
  parsing as deep as possible

Cons:
* new UAPI which we need to support (OTOH, if we need to deprecate some
  flags, we can just stop setting them upon calling BPF programs)

Some numbers (with .repeat = 4000000 in test_flow_dissector):
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec

        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec

        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec

        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec

The improvement is around 10%, but it's in a tight cache-hot
BPF_PROG_TEST_RUN loop.
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 03cd1d1a e853ae77
--- a/Documentation/bpf/prog_flow_dissector.rst
+++ b/Documentation/bpf/prog_flow_dissector.rst
@@ -26,6 +26,7 @@ The inputs are:
 * ``nhoff`` - initial offset of the networking header
 * ``thoff`` - initial offset of the transport header, initialized to nhoff
 * ``n_proto`` - L3 protocol type, parsed out of L2 header
+* ``flags`` - optional flags

 Flow dissector BPF program should fill out the rest of the ``struct
 bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be
@@ -101,6 +102,23 @@ can be called for both cases and would have to be written carefully to
 handle both cases.

+Flags
+=====
+
+``flow_keys->flags`` might contain optional input flags that work as follows:
+
+* ``BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG`` - tells BPF flow dissector to
+  continue parsing first fragment; the default expected behavior is that
+  flow dissector returns as soon as it finds out that the packet is fragmented;
+  used by ``eth_get_headlen`` to estimate length of all headers for GRO.
+* ``BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL`` - tells BPF flow dissector to
+  stop parsing as soon as it reaches IPv6 flow label; used by
+  ``___skb_get_hash`` and ``__skb_get_hash_symmetric`` to get flow hash.
+* ``BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP`` - tells BPF flow dissector to stop
+  parsing as soon as it reaches encapsulated headers; used by routing
+  infrastructure.
+
 Reference Implementation
 ========================
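A minimal sketch of how a dissector program can honor these flags (the
program and function names here are illustrative, not part of the patch;
see tools/testing/selftests/bpf/progs/bpf_flow.c below for the real
implementation):

        #include <linux/bpf.h>
        #include "bpf_helpers.h"

        SEC("flow_dissector")
        int flow_sketch(struct __sk_buff *skb)
        {
                struct bpf_flow_keys *keys = skb->flow_keys;

                /* ... outer L2/L3 parsing sets keys->nhoff/thoff/addresses ... */

                keys->is_encap = true;
                if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
                        return BPF_OK; /* caller only wants the outer headers */

                /* ... otherwise keep parsing the inner headers ... */
                return BPF_OK;
        }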
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1271,7 +1271,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)
 struct bpf_flow_dissector;
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen);
+		      __be16 proto, int nhoff, int hlen, unsigned int flags);

 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3507,6 +3507,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };

+#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	(1U << 1)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3528,6 +3532,8 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
+	__be32	flow_label;
 };

 struct bpf_func_info {
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -377,6 +377,22 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 	return ret;
 }

+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+	/* make sure the fields we don't use are zeroed */
+	if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+		return -EINVAL;
+
+	/* flags is allowed */
+
+	if (!range_is_zero(ctx, offsetof(struct bpf_flow_keys, flags) +
+			   FIELD_SIZEOF(struct bpf_flow_keys, flags),
+			   sizeof(struct bpf_flow_keys)))
+		return -EINVAL;
+
+	return 0;
+}
+
 int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 				     const union bpf_attr *kattr,
 				     union bpf_attr __user *uattr)
@@ -384,9 +400,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	u32 size = kattr->test.data_size_in;
 	struct bpf_flow_dissector ctx = {};
 	u32 repeat = kattr->test.repeat;
+	struct bpf_flow_keys *user_ctx;
 	struct bpf_flow_keys flow_keys;
 	u64 time_start, time_spent = 0;
 	const struct ethhdr *eth;
+	unsigned int flags = 0;
 	u32 retval, duration;
 	void *data;
 	int ret;
@@ -395,9 +413,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
 		return -EINVAL;

-	if (kattr->test.ctx_in || kattr->test.ctx_out)
-		return -EINVAL;
-
 	if (size < ETH_HLEN)
 		return -EINVAL;
@@ -410,6 +425,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	if (!repeat)
 		repeat = 1;

+	user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+	if (IS_ERR(user_ctx)) {
+		kfree(data);
+		return PTR_ERR(user_ctx);
+	}
+	if (user_ctx) {
+		ret = verify_user_bpf_flow_keys(user_ctx);
+		if (ret)
+			goto out;
+		flags = user_ctx->flags;
+	}
+
 	ctx.flow_keys = &flow_keys;
 	ctx.data = data;
 	ctx.data_end = (__u8 *)data + size;
@@ -419,7 +446,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size);
+					  size, flags);

 		if (signal_pending(current)) {
 			preempt_enable();
@@ -450,8 +477,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
 			      retval, duration);
+	if (!ret)
+		ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+				     sizeof(struct bpf_flow_keys));

 out:
+	kfree(user_ctx);
 	kfree(data);
 	return ret;
 }
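For reference, range_is_zero(), used by verify_user_bpf_flow_keys() above,
is a pre-existing helper earlier in net/bpf/test_run.c (added with the
ctx_{in,out} support); roughly, it is a memchr_inv() wrapper along these
lines (paraphrased here, not part of this diff):

        static bool range_is_zero(void *buf, size_t from, size_t to)
        {
                return !memchr_inv((u8 *)buf + from, 0, to - from);
        }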
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -737,6 +737,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 	struct flow_dissector_key_basic *key_basic;
 	struct flow_dissector_key_addrs *key_addrs;
 	struct flow_dissector_key_ports *key_ports;
+	struct flow_dissector_key_tags *key_tags;

 	key_control = skb_flow_dissector_target(flow_dissector,
 						FLOW_DISSECTOR_KEY_CONTROL,
@@ -781,10 +782,18 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 		key_ports->src = flow_keys->sport;
 		key_ports->dst = flow_keys->dport;
 	}
+
+	if (dissector_uses_key(flow_dissector,
+			       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+		key_tags = skb_flow_dissector_target(flow_dissector,
+						     FLOW_DISSECTOR_KEY_FLOW_LABEL,
+						     target_container);
+		key_tags->flow_label = ntohl(flow_keys->flow_label);
+	}
 }

 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen)
+		      __be16 proto, int nhoff, int hlen, unsigned int flags)
 {
 	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
 	u32 result;
@@ -795,6 +804,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 	flow_keys->nhoff = nhoff;
 	flow_keys->thoff = flow_keys->nhoff;

+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
+		     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
+		     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
+		     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+	flow_keys->flags = flags;
+
 	preempt_disable();
 	result = BPF_PROG_RUN(prog, ctx);
 	preempt_enable();
@@ -914,7 +931,7 @@ bool __skb_flow_dissect(const struct net *net,
 		}

 		ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
-				       hlen);
+				       hlen, flags);
 		__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
 					 target_container);
 		rcu_read_unlock();
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3504,6 +3504,10 @@ enum bpf_task_fd_type {
 	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
 };

+#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL	(1U << 1)
+#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)
+
 struct bpf_flow_keys {
 	__u16	nhoff;
 	__u16	thoff;
@@ -3525,6 +3529,8 @@ struct bpf_flow_keys {
 			__u32	ipv6_dst[4];	/* in6_addr; network order */
 		};
 	};
+	__u32	flags;
+	__be32	flow_label;
 };

 struct bpf_func_info {
--- a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -5,6 +5,10 @@
 #include <linux/if_tun.h>
 #include <sys/uio.h>

+#ifndef IP_MF
+#define IP_MF 0x2000
+#endif
+
 #define CHECK_FLOW_KEYS(desc, got, expected)				\
 	CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0,		\
 		   desc,						\
@@ -16,6 +20,7 @@
 		   "is_encap=%u/%u "					\
 		   "ip_proto=0x%x/0x%x "				\
 		   "n_proto=0x%x/0x%x "					\
+		   "flow_label=0x%x/0x%x "				\
 		   "sport=%u/%u "					\
 		   "dport=%u/%u\n",					\
 		   got.nhoff, expected.nhoff,				\
@@ -26,6 +31,7 @@
 		   got.is_encap, expected.is_encap,			\
 		   got.ip_proto, expected.ip_proto,			\
 		   got.n_proto, expected.n_proto,			\
+		   got.flow_label, expected.flow_label,			\
 		   got.sport, expected.sport,				\
 		   got.dport, expected.dport)
@@ -35,6 +41,13 @@ struct ipv4_pkt {
 	struct tcphdr tcp;
 } __packed;

+struct ipip_pkt {
+	struct ethhdr eth;
+	struct iphdr iph;
+	struct iphdr iph_inner;
+	struct tcphdr tcp;
+} __packed;
+
 struct svlan_ipv4_pkt {
 	struct ethhdr eth;
 	__u16 vlan_tci;
@@ -49,6 +62,18 @@ struct ipv6_pkt {
 	struct tcphdr tcp;
 } __packed;

+struct ipv6_frag_pkt {
+	struct ethhdr eth;
+	struct ipv6hdr iph;
+	struct frag_hdr {
+		__u8 nexthdr;
+		__u8 reserved;
+		__be16 frag_off;
+		__be32 identification;
+	} ipf;
+	struct tcphdr tcp;
+} __packed;
+
 struct dvlan_ipv6_pkt {
 	struct ethhdr eth;
 	__u16 vlan_tci;
@@ -64,10 +89,13 @@ struct test {
 	union {
 		struct ipv4_pkt ipv4;
 		struct svlan_ipv4_pkt svlan_ipv4;
+		struct ipip_pkt ipip;
 		struct ipv6_pkt ipv6;
+		struct ipv6_frag_pkt ipv6_frag;
 		struct dvlan_ipv6_pkt dvlan_ipv6;
 	} pkt;
 	struct bpf_flow_keys keys;
+	__u32 flags;
 };

 #define VLAN_HLEN	4
@@ -143,6 +171,202 @@ struct test tests[] = {
 			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
 		},
 	},
+	{
+		.name = "ipv4-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv4-no-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				 sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv6-no-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				 sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.sport = 80,
+			.dport = 8080,
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+	},
+	{
+		.name = "ipv6-no-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+	},
+	{
+		.name = "ipip-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES) -
+				sizeof(struct iphdr),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr) +
+				 sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "ipip-no-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES) -
+				sizeof(struct iphdr),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_IPIP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+	},
 };

 static int create_tap(const char *ifname)
@@ -225,6 +449,13 @@ void test_flow_dissector(void)
 			.data_size_in = sizeof(tests[i].pkt),
 			.data_out = &flow_keys,
 		};
+		static struct bpf_flow_keys ctx = {};
+
+		if (tests[i].flags) {
+			tattr.ctx_in = &ctx;
+			tattr.ctx_size_in = sizeof(ctx);
+			ctx.flags = tests[i].flags;
+		}

 		err = bpf_prog_test_run_xattr(&tattr);
 		CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) ||
@@ -251,10 +482,20 @@ void test_flow_dissector(void)
 	CHECK(err, "ifup", "err %d errno %d\n", err, errno);

 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
-		struct bpf_flow_keys flow_keys = {};
+		/* Keep in sync with 'flags' from eth_get_headlen. */
+		__u32 eth_get_headlen_flags =
+			BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
 		struct bpf_prog_test_run_attr tattr = {};
+		struct bpf_flow_keys flow_keys = {};
 		__u32 key = 0;

+		/* For skb-less case we can't pass input flags; run
+		 * only the tests that have a matching set of flags.
+		 */
+		if (tests[i].flags != eth_get_headlen_flags)
+			continue;
+
 		err = tx_tap(tap_fd, &tests[i].pkt, sizeof(tests[i].pkt));
 		CHECK(err < 0, "tx_tap", "err %d errno %d\n", err, errno);
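Distilled from the selftest above, passing input flags through
BPF_PROG_TEST_RUN from user space looks roughly like this (a sketch:
run_with_flags and the prog_fd plumbing are illustrative; the attr fields
are the ones the selftest uses):

        #include <linux/bpf.h>
        #include <bpf/bpf.h>

        /* Only flow_keys.flags may be non-zero on input; the kernel
         * rejects a ctx_in with any other field set.
         */
        int run_with_flags(int prog_fd, void *pkt, __u32 pkt_len)
        {
                struct bpf_flow_keys flow_keys = {};
                struct bpf_flow_keys ctx = {
                        .flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
                };
                struct bpf_prog_test_run_attr tattr = {
                        .prog_fd = prog_fd,
                        .data_in = pkt,
                        .data_size_in = pkt_len,
                        .data_out = &flow_keys,
                        .data_size_out = sizeof(flow_keys),
                        .ctx_in = &ctx,
                        .ctx_size_in = sizeof(ctx),
                };

                return bpf_prog_test_run_xattr(&tattr);
        }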
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -83,6 +83,12 @@ static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
 	return ret;
 }

+#define IPV6_FLOWLABEL_MASK		__bpf_constant_htonl(0x000FFFFF)
+static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
+{
+	return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK;
+}
+
 static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
							  __u16 hdr_size,
							  void *buffer)
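The bare load-and-mask works because the first 32 bits of an IPv6 header
are version (4 bits), traffic class (8 bits) and flow label (20 bits), so
masking the first word with htonl(0x000FFFFF) leaves just the label, still
in network byte order. A worked check against the ipv6-flow-label test
case above:

        /* .iph.flow_lbl = { 0xb, 0xee, 0xef } makes the first IPv6 word
         * 0x60 0x0b 0xee 0xef on the wire (traffic class 0 assumed), so
         * ip6_flowlabel() returns htonl(0x000beeef), matching the expected
         * .flow_label = __bpf_constant_htonl(0xbeeef).
         */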
@@ -153,7 +159,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 	struct tcphdr *tcp, _tcp;
 	struct udphdr *udp, _udp;

-	keys->ip_proto = proto;
 	switch (proto) {
 	case IPPROTO_ICMP:
 		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
@@ -162,9 +167,15 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 		return export_flow_keys(keys, BPF_OK);
 	case IPPROTO_IPIP:
 		keys->is_encap = true;
+		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);
+
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
 	case IPPROTO_IPV6:
 		keys->is_encap = true;
+		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);
+
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
 	case IPPROTO_GRE:
 		gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
@@ -184,6 +195,8 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 		keys->thoff += 4; /* Step over sequence number */

 		keys->is_encap = true;
+		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+			return export_flow_keys(keys, BPF_OK);

 		if (gre->proto == bpf_htons(ETH_P_TEB)) {
 			eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
@@ -231,7 +244,6 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
 {
 	struct bpf_flow_keys *keys = skb->flow_keys;

-	keys->ip_proto = nexthdr;
 	switch (nexthdr) {
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_DSTOPTS:
@@ -266,6 +278,7 @@ PROG(IP)(struct __sk_buff *skb)
 	keys->addr_proto = ETH_P_IP;
 	keys->ipv4_src = iph->saddr;
 	keys->ipv4_dst = iph->daddr;
+	keys->ip_proto = iph->protocol;

 	keys->thoff += iph->ihl << 2;
 	if (data + keys->thoff > data_end)
@@ -273,13 +286,20 @@ PROG(IP)(struct __sk_buff *skb)
 	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
 		keys->is_frag = true;
-		if (iph->frag_off & bpf_htons(IP_OFFSET))
+		if (iph->frag_off & bpf_htons(IP_OFFSET)) {
 			/* From second fragment on, packets do not have headers
 			 * we can parse.
 			 */
 			done = true;
-		else
+		} else {
 			keys->is_first_frag = true;
+
+			/* No need to parse fragmented packet unless
+			 * explicitly asked for.
+			 */
+			if (!(keys->flags &
+			      BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+				done = true;
+		}
 	}

 	if (done)
@@ -301,6 +321,11 @@ PROG(IPV6)(struct __sk_buff *skb)
 	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));

 	keys->thoff += sizeof(struct ipv6hdr);
+	keys->ip_proto = ip6h->nexthdr;
+	keys->flow_label = ip6_flowlabel(ip6h);
+
+	if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
+		return export_flow_keys(keys, BPF_OK);

 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -317,7 +342,8 @@ PROG(IPV6OP)(struct __sk_buff *skb)
 	/* hlen is in 8-octets and does not include the first 8 bytes
 	 * of the header
 	 */
-	skb->flow_keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->thoff += (1 + ip6h->hdrlen) << 3;
+	keys->ip_proto = ip6h->nexthdr;

 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -333,9 +359,18 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 	keys->thoff += sizeof(*fragh);
 	keys->is_frag = true;
-	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
+	keys->ip_proto = fragh->nexthdr;
+
+	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) {
 		keys->is_first_frag = true;

+		/* No need to parse fragmented packet unless
+		 * explicitly asked for.
+		 */
+		if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+			return export_flow_keys(keys, BPF_OK);
+	}
+
 	return parse_ipv6_proto(skb, fragh->nexthdr);
 }