Commit df6f8237 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2021-05-11

The following pull-request contains BPF updates for your *net* tree.

We've added 13 non-merge commits during the last 8 day(s) which contain
a total of 21 files changed, 817 insertions(+), 382 deletions(-).

The main changes are:

1) Fix multiple ringbuf bugs, in particular to prevent writable mmap of
   read-only pages, from Andrii Nakryiko & Thadeu Lima de Souza Cascardo.

2) Fix verifier alu32 known-const subregister bound tracking for bitwise
   operations and/or/xor, from Daniel Borkmann.

3) Reject trampoline attachment for functions with variable arguments,
   and also add a deny list of other forbidden functions, from Jiri Olsa.

4) Fix nested bpf_bprintf_prepare() calls used by various helpers by
   switching to per-CPU buffers, from Florent Revest.

5) Fix kernel compilation with BTF debug info on ppc64 due to pahole
   missing TCP-CC functions like cubictcp_init, from Martin KaFai Lau.

6) Add a kconfig entry to provide an option to disallow unprivileged
   BPF by default, from Daniel Borkmann.

7) Fix libbpf compilation for older libelf when GELF_ST_VISIBILITY()
   macro is not available, from Arnaldo Carvalho de Melo.

8) Migrate test_tc_redirect to test_progs framework as prep work
   for upcoming skb_change_head() fix & selftest, from Jussi Maki.

9) Fix a libbpf segfault in add_dummy_ksym_var() if BTF is not
   present, from Ian Rogers.

10) Fix tx_only micro-benchmark in xdpsock BPF sample with proper frame
    size, from Magnus Karlsson.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 9fe37a80 569c484f
......@@ -1457,11 +1457,22 @@ unprivileged_bpf_disabled
=========================
Writing 1 to this entry will disable unprivileged calls to ``bpf()``;
once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` will return
``-EPERM``.
once disabled, calling ``bpf()`` without ``CAP_SYS_ADMIN`` or ``CAP_BPF``
will return ``-EPERM``. Once set to 1, this can't be cleared from the
running kernel anymore.
Once set, this can't be cleared.
Writing 2 to this entry will also disable unprivileged calls to ``bpf()``,
however, an admin can still change this setting later on, if needed, by
writing 0 or 1 to this entry.
If ``BPF_UNPRIV_DEFAULT_OFF`` is enabled in the kernel config, then this
entry will default to 2 instead of 0.
= =============================================================
0 Unprivileged calls to ``bpf()`` are enabled
1 Unprivileged calls to ``bpf()`` are disabled without recovery
2 Unprivileged calls to ``bpf()`` are disabled
= =============================================================
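As an illustration of the table above, a minimal userspace sketch (not part of
this patch) that exercises the allowed transitions through the procfs knob
could look like the following; the sysctl path is the only assumption:

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* Write a single value to the unprivileged_bpf_disabled sysctl.
   * Returns 0 on success, -1 if the kernel rejected the write
   * (e.g. an attempt to leave the locked state 1). */
  static int set_unpriv_bpf_disabled(int val)
  {
          char buf[4];
          int fd, ret;

          fd = open("/proc/sys/kernel/unprivileged_bpf_disabled", O_WRONLY);
          if (fd < 0)
                  return -1;
          snprintf(buf, sizeof(buf), "%d", val);
          ret = write(fd, buf, strlen(buf)) < 0 ? -1 : 0;
          close(fd);
          return ret;
  }

  int main(void)
  {
          /* 0 <-> 2 transitions are allowed, so an admin can re-enable
           * unprivileged bpf() after a BPF_UNPRIV_DEFAULT_OFF boot. */
          if (set_unpriv_bpf_disabled(0))
                  perror("write 0");
          /* Writing 1 locks the knob; the follow-up write of 0 is then
           * expected to fail with EPERM. */
          if (set_unpriv_bpf_disabled(1) == 0 && set_unpriv_bpf_disabled(0) == 0)
                  fprintf(stderr, "unexpected: left the locked state\n");
          return 0;
  }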
watchdog
========
......
......@@ -442,6 +442,7 @@ config AUDITSYSCALL
source "kernel/irq/Kconfig"
source "kernel/time/Kconfig"
source "kernel/bpf/Kconfig"
source "kernel/Kconfig.preempt"
menu "CPU/Task time and stats accounting"
......@@ -1713,46 +1714,6 @@ config KALLSYMS_BASE_RELATIVE
# syscall, maps, verifier
config BPF_LSM
bool "LSM Instrumentation with BPF"
depends on BPF_EVENTS
depends on BPF_SYSCALL
depends on SECURITY
depends on BPF_JIT
help
Enables instrumentation of the security hooks with eBPF programs for
implementing dynamic MAC and Audit Policies.
If you are unsure how to answer this question, answer N.
config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate eBPF
programs and maps via file descriptors.
config ARCH_WANT_DEFAULT_BPF_JIT
bool
config BPF_JIT_ALWAYS_ON
bool "Permanently enable BPF JIT and remove BPF interpreter"
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
help
Enables BPF JIT and removes BPF interpreter to avoid
speculative execution of BPF instructions by the interpreter
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
source "kernel/bpf/preload/Kconfig"
config USERFAULTFD
bool "Enable userfaultfd() system call"
depends on MMU
......
# SPDX-License-Identifier: GPL-2.0-only
# BPF interpreter that, for example, classic socket filters depend on.
config BPF
bool
# Used by archs to tell that they support BPF JIT compiler plus which
# flavour. Only one of the two can be selected for a specific arch since
# eBPF JIT supersedes the cBPF JIT.
# Classic BPF JIT (cBPF)
config HAVE_CBPF_JIT
bool
# Extended BPF JIT (eBPF)
config HAVE_EBPF_JIT
bool
# Used by archs to tell that they want the BPF JIT compiler enabled by
# default for kernels that were compiled with BPF JIT support.
config ARCH_WANT_DEFAULT_BPF_JIT
bool
menu "BPF subsystem"
config BPF_SYSCALL
bool "Enable bpf() system call"
select BPF
select IRQ_WORK
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if INET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
and maps via file descriptors.
config BPF_JIT
bool "Enable BPF Just In Time compiler"
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
help
BPF programs are normally handled by a BPF interpreter. This option
allows the kernel to generate native code when a program is loaded
into the kernel. This will significantly speed-up processing of BPF
programs.
Note, an admin should enable this feature changing:
/proc/sys/net/core/bpf_jit_enable
/proc/sys/net/core/bpf_jit_harden (optional)
/proc/sys/net/core/bpf_jit_kallsyms (optional)
config BPF_JIT_ALWAYS_ON
bool "Permanently enable BPF JIT and remove BPF interpreter"
depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
help
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
config BPF_UNPRIV_DEFAULT_OFF
bool "Disable unprivileged BPF by default"
depends on BPF_SYSCALL
help
Disables unprivileged BPF by default by setting the corresponding
/proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
still reenable it by setting it to 0 later on, or permanently
disable it by setting it to 1 (from which no other transition to
0 is possible anymore).
source "kernel/bpf/preload/Kconfig"
config BPF_LSM
bool "Enable BPF LSM Instrumentation"
depends on BPF_EVENTS
depends on BPF_SYSCALL
depends on SECURITY
depends on BPF_JIT
help
Enables instrumentation of the security hooks with BPF programs for
implementing dynamic MAC and Audit Policies.
If you are unsure how to answer this question, answer N.
endmenu # "BPF subsystem"
......@@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->ret_size = ret;
for (i = 0; i < nargs; i++) {
if (i == nargs - 1 && args[i].type == 0) {
bpf_log(log,
"The function %s with variable args is unsupported.\n",
tname);
return -EINVAL;
}
ret = __get_type_size(btf, args[i].type, &t);
if (ret < 0) {
bpf_log(log,
......@@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL;
}
if (ret == 0) {
bpf_log(log,
"The function %s has malformed void argument.\n",
tname);
return -EINVAL;
}
m->arg_size[i] = ret;
}
m->nr_args = nargs;
......
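For context, BTF encodes a trailing "..." parameter of a FUNC_PROTO as a
btf_param whose type ID is 0, which is what the new last-argument check above
keys on. A simplified, standalone sketch of the same test (illustrative only,
not the kernel code verbatim):

  /* In the BTF type section a vararg marker is a parameter with
   * name_off == 0 and type == 0 at the end of the argument list. */
  struct btf_param { unsigned int name_off; unsigned int type; };

  static int func_proto_is_variadic(const struct btf_param *args, int nargs)
  {
          return nargs > 0 && args[nargs - 1].type == 0;
  }

Rejecting such functions up front avoids building a trampoline whose argument
count and sizes cannot be described.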
......@@ -696,34 +696,35 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
*/
#define MAX_PRINTF_BUF_LEN 512
struct bpf_printf_buf {
char tmp_buf[MAX_PRINTF_BUF_LEN];
/* Support executing three nested bprintf helper calls on a given CPU */
struct bpf_bprintf_buffers {
char tmp_bufs[3][MAX_PRINTF_BUF_LEN];
};
static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf);
static DEFINE_PER_CPU(int, bpf_printf_buf_used);
static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs);
static DEFINE_PER_CPU(int, bpf_bprintf_nest_level);
static int try_get_fmt_tmp_buf(char **tmp_buf)
{
struct bpf_printf_buf *bufs;
int used;
struct bpf_bprintf_buffers *bufs;
int nest_level;
preempt_disable();
used = this_cpu_inc_return(bpf_printf_buf_used);
if (WARN_ON_ONCE(used > 1)) {
this_cpu_dec(bpf_printf_buf_used);
nest_level = this_cpu_inc_return(bpf_bprintf_nest_level);
if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bufs->tmp_bufs))) {
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
return -EBUSY;
}
bufs = this_cpu_ptr(&bpf_printf_buf);
*tmp_buf = bufs->tmp_buf;
bufs = this_cpu_ptr(&bpf_bprintf_bufs);
*tmp_buf = bufs->tmp_bufs[nest_level - 1];
return 0;
}
void bpf_bprintf_cleanup(void)
{
if (this_cpu_read(bpf_printf_buf_used)) {
this_cpu_dec(bpf_printf_buf_used);
if (this_cpu_read(bpf_bprintf_nest_level)) {
this_cpu_dec(bpf_bprintf_nest_level);
preempt_enable();
}
}
......
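The buffer handling above is a per-CPU "nesting level as index" scheme: each
nested bprintf call on the same CPU gets its own slot, and a fourth level is
refused. A self-contained sketch of the same idea (plain C, a single thread
standing in for one CPU; names are illustrative):

  #include <assert.h>
  #include <stddef.h>

  #define MAX_BUF_LEN   512
  #define MAX_NESTING   3       /* three nested bprintf calls supported */

  static char bufs[MAX_NESTING][MAX_BUF_LEN];
  static int nest_level;        /* a per-CPU counter in the real code */

  /* Hand out the buffer for the current nesting depth, or fail if a
   * fourth level is reached (the kernel returns -EBUSY there). */
  static char *get_buf(void)
  {
          if (nest_level >= MAX_NESTING)
                  return NULL;
          return bufs[nest_level++];
  }

  static void put_buf(void)
  {
          if (nest_level > 0)
                  nest_level--;
  }

  int main(void)
  {
          char *a = get_buf(), *b = get_buf(), *c = get_buf();

          assert(a && b && c);
          assert(get_buf() == NULL);      /* fourth nested call is rejected */
          put_buf(); put_buf(); put_buf();
          return 0;
  }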
......@@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
return -ENOTSUPP;
}
static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
{
size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
/* consumer page + producer page + 2 x data pages */
return RINGBUF_POS_PAGES + 2 * data_pages;
}
static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
{
struct bpf_ringbuf_map *rb_map;
size_t mmap_sz;
rb_map = container_of(map, struct bpf_ringbuf_map, map);
mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
return -EINVAL;
if (vma->vm_flags & VM_WRITE) {
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
} else {
vma->vm_flags &= ~VM_MAYWRITE;
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
vma->vm_pgoff + RINGBUF_PGOFF);
}
......@@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
return NULL;
len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
if (len > rb->mask + 1)
return NULL;
cons_pos = smp_load_acquire(&rb->consumer_pos);
if (in_nmi()) {
......
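For reference, the constraint enforced above matches how a ring buffer
consumer maps the fd from userspace: only the first page (consumer_pos) may
be mapped writable, while the producer page and the doubly-mapped data pages
are read-only. A minimal mmap() sketch under that assumption (map_fd is a
BPF_MAP_TYPE_RINGBUF map fd, data_sz the size of its data area; error
handling trimmed):

  #include <sys/mman.h>
  #include <unistd.h>

  static int map_ringbuf(int map_fd, size_t data_sz,
                         void **consumer_pos, void **producer_and_data)
  {
          long page_size = sysconf(_SC_PAGESIZE);

          /* Page 0: consumer position, the only page mapped writable. */
          *consumer_pos = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                               MAP_SHARED, map_fd, 0);
          if (*consumer_pos == MAP_FAILED)
                  return -1;

          /* Producer position plus the data area (mapped twice so wrap-around
           * records stay contiguous); asking for PROT_WRITE here is exactly
           * what the kernel now rejects with -EPERM. */
          *producer_and_data = mmap(NULL, page_size + 2 * data_sz, PROT_READ,
                                    MAP_SHARED, map_fd, page_size);
          if (*producer_and_data == MAP_FAILED) {
                  munmap(*consumer_pos, page_size);
                  return -1;
          }
          return 0;
  }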
......@@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock);
static DEFINE_IDR(link_idr);
static DEFINE_SPINLOCK(link_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
int sysctl_unprivileged_bpf_disabled __read_mostly =
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
......
......@@ -7084,11 +7084,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umax_val = src_reg->u32_max_value;
/* Assuming scalar64_min_max_and will be called so its safe
* to skip updating register for known 32-bit case.
*/
if (src_known && dst_known)
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
return;
}
/* We get our minimum from the var_off, since that's inherently
* bitwise. Our maximum is the minimum of the operands' maxima.
......@@ -7108,7 +7107,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg,
dst_reg->s32_min_value = dst_reg->u32_min_value;
dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
static void scalar_min_max_and(struct bpf_reg_state *dst_reg,
......@@ -7155,11 +7153,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
s32 smin_val = src_reg->s32_min_value;
u32 umin_val = src_reg->u32_min_value;
/* Assuming scalar64_min_max_or will be called so it is safe
* to skip updating register for known case.
*/
if (src_known && dst_known)
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
return;
}
/* We get our maximum from the var_off, and our minimum is the
* maximum of the operands' minima
......@@ -7224,11 +7221,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg,
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
s32 smin_val = src_reg->s32_min_value;
/* Assuming scalar64_min_max_xor will be called so it is safe
* to skip updating register for known case.
*/
if (src_known && dst_known)
if (src_known && dst_known) {
__mark_reg32_known(dst_reg, var32_off.value);
return;
}
/* We get both minimum and maximum from the var32_off. */
dst_reg->u32_min_value = var32_off.value;
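The change is the same in all three helpers: when both 32-bit subregisters are
known constants, the result is itself a constant, so the subregister bounds
should collapse to that single value instead of being skipped on the
assumption that the 64-bit pass will handle it (which could leave stale
subregister bounds behind). A small sketch of what the bound update amounts
to (illustrative only):

  #include <stdint.h>

  struct bounds32 { uint32_t umin, umax; int32_t smin, smax; };

  /* What __mark_reg32_known() boils down to for a known constant:
   * every 32-bit bound collapses to that single value. */
  static struct bounds32 mark_known32(uint32_t val)
  {
          return (struct bounds32){ val, val, (int32_t)val, (int32_t)val };
  }

  /* Example: w1 = 0xffff0000 and w2 = 0x0000ffff, both known constants:
   *   w1 &= w2  ->  bounds [0x0, 0x0]
   *   w1 |= w2  ->  bounds [0xffffffff, 0xffffffff]
   *   w1 ^= w2  ->  bounds [0xffffffff, 0xffffffff]
   */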
......@@ -13200,6 +13196,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
return 0;
}
BTF_SET_START(btf_id_deny)
BTF_ID_UNUSED
#ifdef CONFIG_SMP
BTF_ID(func, migrate_disable)
BTF_ID(func, migrate_enable)
#endif
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
BTF_SET_END(btf_id_deny)
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
......@@ -13259,6 +13266,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
ret = bpf_lsm_verify_prog(&env->log, prog);
if (ret < 0)
return ret;
} else if (prog->type == BPF_PROG_TYPE_TRACING &&
btf_id_set_contains(&btf_id_deny, btf_id)) {
return -EINVAL;
}
key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id);
......
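btf_id_set_contains() used above is a binary search over the sorted ID array
that BTF_SET_START()/BTF_SET_END() emit at build time. A standalone sketch of
that lookup (mirrors the idea, not the kernel implementation verbatim):

  #include <stdbool.h>
  #include <stdlib.h>

  /* Compare callback over sorted u32 BTF IDs, as bsearch() expects. */
  static int cmp_btf_id(const void *a, const void *b)
  {
          unsigned int x = *(const unsigned int *)a;
          unsigned int y = *(const unsigned int *)b;

          return x < y ? -1 : x > y;
  }

  static bool deny_list_contains(const unsigned int *ids, unsigned int cnt,
                                 unsigned int id)
  {
          return bsearch(&id, ids, cnt, sizeof(id), cmp_btf_id) != NULL;
  }

With that, a BPF_PROG_TYPE_TRACING program whose attach BTF ID resolves to
e.g. migrate_disable() is refused with -EINVAL before a trampoline is
installed.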
......@@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
mutex_unlock(&bpf_stats_enabled_mutex);
return ret;
}
#endif
static int bpf_unpriv_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret, unpriv_enable = *(int *)table->data;
bool locked_state = unpriv_enable == 1;
struct ctl_table tmp = *table;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
tmp.data = &unpriv_enable;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
if (locked_state && unpriv_enable != 1)
return -EPERM;
*(int *)table->data = unpriv_enable;
}
return ret;
}
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
/*
* /proc/sys support
......@@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_unprivileged_bpf_disabled,
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
.mode = 0644,
/* only handle a transition from default "0" to "1" */
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_ONE,
.proc_handler = bpf_unpriv_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
{
.procname = "bpf_stats_enabled",
......
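The handler above boils down to a small policy on top of the usual
CAP_SYS_ADMIN write check: any value in [0, 2] is accepted unless the knob
already reads 1, in which case only rewriting 1 succeeds. A compact sketch of
that rule (illustrative only):

  #include <errno.h>

  /* Mirrors the transition policy enforced by bpf_unpriv_handler(). */
  static int unpriv_transition_ok(int cur, int next)
  {
          if (next < 0 || next > 2)
                  return -EINVAL; /* proc_dointvec_minmax range check */
          if (cur == 1 && next != 1)
                  return -EPERM;  /* the locked state cannot be left */
          return 0;
  }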
......@@ -302,21 +302,6 @@ config BQL
select DQL
default y
config BPF_JIT
bool "enable BPF Just In Time compiler"
depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
depends on MODULES
help
Berkeley Packet Filter filtering capabilities are normally handled
by an interpreter. This option allows kernel to generate a native
code when filter is loaded in memory. This should speedup
packet sniffing (libpcap/tcpdump).
Note, admin should enable this feature changing:
/proc/sys/net/core/bpf_jit_enable
/proc/sys/net/core/bpf_jit_harden (optional)
/proc/sys/net/core/bpf_jit_kallsyms (optional)
config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER"
depends on INET
......@@ -470,15 +455,3 @@ config ETHTOOL_NETLINK
e.g. notification messages.
endif # if NET
# Used by archs to tell that they support BPF JIT compiler plus which flavour.
# Only one of the two can be selected for a specific arch since eBPF JIT supersedes
# the cBPF JIT.
# Classic BPF JIT (cBPF)
config HAVE_CBPF_JIT
bool
# Extended BPF JIT (eBPF)
config HAVE_EBPF_JIT
bool
......@@ -185,6 +185,7 @@ BTF_ID(func, tcp_reno_cong_avoid)
BTF_ID(func, tcp_reno_undo_cwnd)
BTF_ID(func, tcp_slow_start)
BTF_ID(func, tcp_cong_avoid_ai)
#ifdef CONFIG_X86
#ifdef CONFIG_DYNAMIC_FTRACE
#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC)
BTF_ID(func, cubictcp_init)
......@@ -213,6 +214,7 @@ BTF_ID(func, bbr_min_tso_segs)
BTF_ID(func, bbr_set_state)
#endif
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_X86 */
BTF_SET_END(bpf_tcp_ca_kfunc_ids)
static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id)
......
......@@ -1255,7 +1255,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size)
for (i = 0; i < batch_size; i++) {
struct xdp_desc *tx_desc = xsk_ring_prod__tx_desc(&xsk->tx,
idx + i);
tx_desc->addr = (*frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
tx_desc->addr = (*frame_nb + i) * opt_xsk_frame_size;
tx_desc->len = PKT_SIZE;
}
......
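The one-liner above matters because descriptor addresses must line up with the
UMEM frame size actually configured; with a non-default frame size
(e.g. 2048), shifting by the default 4 KiB frame shift yields addresses that
do not match the configured frame layout. A tiny sketch of the addressing
(opt_xsk_frame_size stands in for the sample's option):

  #include <stdint.h>

  /* Address of frame 'idx' in a UMEM carved into fixed-size frames. */
  static uint64_t umem_frame_addr(uint32_t idx, uint32_t frame_size)
  {
          return (uint64_t)idx * frame_size;
  }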
......@@ -3216,6 +3216,9 @@ static int add_dummy_ksym_var(struct btf *btf)
const struct btf_var_secinfo *vs;
const struct btf_type *sec;
if (!btf)
return 0;
sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC,
BTF_KIND_DATASEC);
if (sec_btf_id < 0)
......
......@@ -41,6 +41,11 @@
#define ELF_C_READ_MMAP ELF_C_READ
#endif
/* Older libelf all end up in this expression, for both 32 and 64 bit */
#ifndef GELF_ST_VISIBILITY
#define GELF_ST_VISIBILITY(o) ((o) & 0x03)
#endif
#define BTF_INFO_ENC(kind, kind_flag, vlen) \
((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
#define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type)
......
......@@ -40,7 +40,7 @@ struct ipv6_packet pkt_v6 = {
.tcp.doff = 5,
};
static int settimeo(int fd, int timeout_ms)
int settimeo(int fd, int timeout_ms)
{
struct timeval timeout = { .tv_sec = 3 };
......
......@@ -33,6 +33,7 @@ struct ipv6_packet {
} __packed;
extern struct ipv6_packet pkt_v6;
int settimeo(int fd, int timeout_ms);
int start_server(int family, int type, const char *addr, __u16 port,
int timeout_ms);
int connect_to_fd(int server_fd, int timeout_ms);
......
......@@ -33,17 +33,8 @@
a.s6_addr32[3] == b.s6_addr32[3])
#endif
enum {
dev_src,
dev_dst,
};
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static volatile const __u32 IFINDEX_SRC;
static volatile const __u32 IFINDEX_DST;
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
__be32 addr)
......@@ -79,14 +70,8 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
return v6_equal(ip6h->daddr, addr);
}
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
SEC("classifier/chk_egress")
int tc_chk(struct __sk_buff *skb)
{
void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data);
......@@ -98,7 +83,8 @@ SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
return !raw[0] && !raw[1] && !raw[2] ? TC_ACT_SHOT : TC_ACT_OK;
}
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
......@@ -119,10 +105,11 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0);
return bpf_redirect_neigh(IFINDEX_SRC, NULL, 0, 0);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
......@@ -143,7 +130,7 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0);
return bpf_redirect_neigh(IFINDEX_DST, NULL, 0, 0);
}
char __license[] SEC("license") = "GPL";
......@@ -75,7 +75,8 @@ static __always_inline int fill_fib_params_v6(struct __sk_buff *skb,
return 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
SEC("classifier/chk_egress")
int tc_chk(struct __sk_buff *skb)
{
void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data);
......@@ -142,12 +143,14 @@ static __always_inline int tc_redir(struct __sk_buff *skb)
/* these are identical, but keep them separate for compatibility with the
* section names expected by test_tc_redirect.sh
*/
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{
return tc_redir(skb);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{
return tc_redir(skb);
}
......
......@@ -8,38 +8,25 @@
#include <bpf/bpf_helpers.h>
enum {
dev_src,
dev_dst,
};
static volatile const __u32 IFINDEX_SRC;
static volatile const __u32 IFINDEX_DST;
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
SEC("classifier/chk_egress")
int tc_chk(struct __sk_buff *skb)
{
return TC_ACT_SHOT;
}
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
SEC("classifier/dst_ingress")
int tc_dst(struct __sk_buff *skb)
{
return bpf_redirect_peer(get_dev_ifindex(dev_src), 0);
return bpf_redirect_peer(IFINDEX_SRC, 0);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
SEC("classifier/src_ingress")
int tc_src(struct __sk_buff *skb)
{
return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0);
return bpf_redirect_peer(IFINDEX_DST, 0);
}
char __license[] SEC("license") = "GPL";
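With the ifindex map replaced by the volatile const globals above, the
migrated test harness can feed both interface indexes in through the
skeleton's read-only data before load instead of poking a map with bpftool
afterwards. A hedged sketch of that flow (assumes a bpftool-generated
test_tc_peer.skel.h; the helper name is made up for illustration):

  #include <net/if.h>
  #include "test_tc_peer.skel.h"

  static int load_with_ifindexes(const char *src_dev, const char *dst_dev)
  {
          struct test_tc_peer *skel = test_tc_peer__open();

          if (!skel)
                  return -1;
          /* rodata is frozen at load time, so the constants go in here. */
          skel->rodata->IFINDEX_SRC = if_nametoindex(src_dev);
          skel->rodata->IFINDEX_DST = if_nametoindex(dst_dev);
          if (test_tc_peer__load(skel)) {
                  test_tc_peer__destroy(skel);
                  return -1;
          }
          /* ... attach tc_src/tc_dst/tc_chk to the veth clsact hooks ... */
          return 0;
  }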
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The netns fwd has veth links to each src and dst. The
# client is in src and server in dst. The test installs a TC BPF program to each
# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
# switch from ingress side; it also installs a checker prog on the egress side
# to drop unexpected traffic.
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
# check that needed tools are present
command -v nc >/dev/null 2>&1 || \
{ echo >&2 "nc is not available"; exit 1; }
command -v dd >/dev/null 2>&1 || \
{ echo >&2 "dd is not available"; exit 1; }
command -v timeout >/dev/null 2>&1 || \
{ echo >&2 "timeout is not available"; exit 1; }
command -v ping >/dev/null 2>&1 || \
{ echo >&2 "ping is not available"; exit 1; }
if command -v ping6 >/dev/null 2>&1; then PING6=ping6; else PING6=ping; fi
command -v perl >/dev/null 2>&1 || \
{ echo >&2 "perl is not available"; exit 1; }
command -v jq >/dev/null 2>&1 || \
{ echo >&2 "jq is not available"; exit 1; }
command -v bpftool >/dev/null 2>&1 || \
{ echo >&2 "bpftool is not available"; exit 1; }
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
netns_cleanup()
{
ip netns del ${NS_SRC}
ip netns del ${NS_FWD}
ip netns del ${NS_DST}
}
netns_setup()
{
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically get a v6 LL address / routes, but also
# needs v4 one in order to start ARP probing. IP4_NET route is added
# to the endpoints so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
}
netns_test_connectivity()
{
set +e
ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
TEST="TCPv4 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="TCPv6 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv4 connectivity test"
ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv6 connectivity test"
ip netns exec ${NS_SRC} $PING6 $PING_ARG ${IP6_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
set -e
}
hex_mem_str()
{
perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
}
netns_setup_bpf()
{
local obj=$1
local use_forwarding=${2:-0}
ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress
ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress
if [ "$use_forwarding" -eq "1" ]; then
# bpf_fib_lookup() checks if forwarding is enabled
ip netns exec ${NS_FWD} sysctl -w net.ipv4.ip_forward=1
ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1
ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1
return 0
fi
veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex)
veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex)
progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
for prog in $progs; do
map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]')
if [ ! -z "$map" ]; then
bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src)
bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst)
fi
done
}
trap netns_cleanup EXIT
set -e
netns_setup
netns_setup_bpf test_tc_neigh.o
netns_test_connectivity
netns_cleanup
netns_setup
netns_setup_bpf test_tc_neigh_fib.o 1
netns_test_connectivity
netns_cleanup
netns_setup
netns_setup_bpf test_tc_peer.o
netns_test_connectivity