Commit fdadd049, authored by Daniel Borkmann and committed by Alexei Starovoitov

bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K

Michael and Sandipan report:

  Commit ede95a63 introduced a bpf_jit_limit tuneable to limit BPF
  JIT allocations. At compile time it defaults to PAGE_SIZE * 40000,
  and is adjusted again at init time if MODULES_VADDR is defined.

  For ppc64 kernels, MODULES_VADDR isn't defined, so we're stuck with
  the compile-time default at boot-time, which is 0x9c400000 when
  using 64K page size. This overflows the signed 32-bit bpf_jit_limit
  value:

  root@ubuntu:/tmp# cat /proc/sys/net/core/bpf_jit_limit
  -1673527296

  and can cause various unexpected failures throughout the network
  stack. In one case `strace dhclient eth0` reported:

  setsockopt(5, SOL_SOCKET, SO_ATTACH_FILTER, {len=11, filter=0x105dd27f8},
             16) = -1 ENOTSUPP (Unknown error 524)

  and similar failures can be seen with tools like tcpdump. This doesn't
  always reproduce however, and I'm not sure why. The more consistent
  failure I've seen is an Ubuntu 18.04 KVM guest booted on a POWER9
  host would time out on systemd/netplan configuring a virtio-net NIC
  with no noticeable errors in the logs.

Given this and also given that in the near future some architectures like
arm64 will have a custom area for BPF JIT image allocations, we should
get rid of the BPF_JIT_LIMIT_DEFAULT fallback / default entirely. For
4.21, we have an overridable bpf_jit_alloc_exec(), bpf_jit_free_exec(),
so add another overridable bpf_jit_alloc_exec_limit() helper
function which returns the possible size of the memory area for deriving
the default heuristic in bpf_jit_charge_init().

Like bpf_jit_alloc_exec() and bpf_jit_free_exec(), the new
bpf_jit_alloc_exec_limit() assumes that module_alloc() is the default
JIT memory provider. Therefore, in case archs implement their custom
module_alloc(), we use MODULES_{END,VADDR} for limits, and otherwise, for
vmalloc_exec() cases like on ppc64, we use VMALLOC_{END,START}.

Additionally, for archs supporting large page sizes, we should change
the sysctl to be handled as a long so that it does not run into sysctl
restrictions in the future.

Fixes: ede95a63 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations")
Reported-by: Sandipan Das <sandipan@linux.ibm.com>
Reported-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent aca1a80e
...@@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, ...@@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
extern int bpf_jit_enable; extern int bpf_jit_enable;
extern int bpf_jit_harden; extern int bpf_jit_harden;
extern int bpf_jit_kallsyms; extern int bpf_jit_kallsyms;
extern int bpf_jit_limit; extern long bpf_jit_limit;
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
......
...@@ -365,13 +365,11 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) ...@@ -365,13 +365,11 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
} }
#ifdef CONFIG_BPF_JIT #ifdef CONFIG_BPF_JIT
# define BPF_JIT_LIMIT_DEFAULT (PAGE_SIZE * 40000)
/* All BPF JIT sysctl knobs here. */ /* All BPF JIT sysctl knobs here. */
int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
int bpf_jit_harden __read_mostly; int bpf_jit_harden __read_mostly;
int bpf_jit_kallsyms __read_mostly; int bpf_jit_kallsyms __read_mostly;
int bpf_jit_limit __read_mostly = BPF_JIT_LIMIT_DEFAULT; long bpf_jit_limit __read_mostly;
static __always_inline void static __always_inline void
bpf_get_prog_addr_region(const struct bpf_prog *prog, bpf_get_prog_addr_region(const struct bpf_prog *prog,
...@@ -580,16 +578,27 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, ...@@ -580,16 +578,27 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static atomic_long_t bpf_jit_current; static atomic_long_t bpf_jit_current;
/* Size of the address range from which bpf_jit_charge_init() derives
 * its default limit heuristic: MODULES_END - MODULES_VADDR when
 * MODULES_VADDR is defined, VMALLOC_END - VMALLOC_START otherwise.
 *
 * Declared __weak so it can be overridden by an arch's JIT compiler if
 * it has a custom, dedicated BPF backend memory area, or if neither of
 * the two ranges above apply.
 */
u64 __weak bpf_jit_alloc_exec_limit(void)
{
#if defined(MODULES_VADDR) #if defined(MODULES_VADDR)
return MODULES_END - MODULES_VADDR;
#else
return VMALLOC_END - VMALLOC_START;
#endif
}
static int __init bpf_jit_charge_init(void) static int __init bpf_jit_charge_init(void)
{ {
/* Only used as heuristic here to derive limit. */ /* Only used as heuristic here to derive limit. */
bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2, bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
PAGE_SIZE), INT_MAX); PAGE_SIZE), LONG_MAX);
return 0; return 0;
} }
pure_initcall(bpf_jit_charge_init); pure_initcall(bpf_jit_charge_init);
#endif
static int bpf_jit_charge_modmem(u32 pages) static int bpf_jit_charge_modmem(u32 pages)
{ {
......
...@@ -28,6 +28,8 @@ static int two __maybe_unused = 2; ...@@ -28,6 +28,8 @@ static int two __maybe_unused = 2;
static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS; static int max_skb_frags = MAX_SKB_FRAGS;
static long long_one __maybe_unused = 1;
static long long_max __maybe_unused = LONG_MAX;
static int net_msg_warn; /* Unused, but still a sysctl */ static int net_msg_warn; /* Unused, but still a sysctl */
...@@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, ...@@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
return proc_dointvec_minmax(table, write, buffer, lenp, ppos); return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
} }
/* Sysctl handler that gates proc_doulongvec_minmax() behind
 * CAP_SYS_ADMIN. Returns -EPERM when the caller lacks the capability;
 * otherwise returns whatever proc_doulongvec_minmax() returns for the
 * same arguments.
 */
static int
proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
/* The check does not look at 'write', so reads are refused as well. */
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#endif #endif
static struct ctl_table net_core_table[] = { static struct ctl_table net_core_table[] = {
...@@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = { ...@@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = {
{ {
.procname = "bpf_jit_limit", .procname = "bpf_jit_limit",
.data = &bpf_jit_limit, .data = &bpf_jit_limit,
.maxlen = sizeof(int), .maxlen = sizeof(long),
.mode = 0600, .mode = 0600,
.proc_handler = proc_dointvec_minmax_bpf_restricted, .proc_handler = proc_dolongvec_minmax_bpf_restricted,
.extra1 = &one, .extra1 = &long_one,
.extra2 = &long_max,
}, },
#endif #endif
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment