Commit c45c79e5 authored by Alexei Starovoitov

Merge branch 'Add bpf_copy_from_user_task helper and sleepable bpf iterator programs'

Kenny Yu says:

====================

This patch series makes the following changes:
* Adds a new bpf helper `bpf_copy_from_user_task` to read user space
  memory from a different task.
* Adds the ability to create sleepable bpf iterator programs.

As an example of how this will be used, at Meta we are using bpf task
iterator programs and this new bpf helper to read C++ async stack traces of
a running process for debugging C++ binaries in production.
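
As a hedged sketch of what such a program can look like (the complete
selftest is in the diff below; the program name `read_remote_user` and
the fixed user address here are illustrative only):

/* Sleepable iterator: the "iter.s/task" section allows helpers that
 * may fault, such as bpf_copy_from_user_task().
 */
#include "bpf_iter.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("iter.s/task")
int read_remote_user(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	__u32 word = 0;
	long ret;

	if (!task)
		return 0;

	/* Read 4 bytes from an illustrative address in the remote task's
	 * address space; flags must be 0. On error, word stays zeroed.
	 */
	ret = bpf_copy_from_user_task(&word, sizeof(word),
				      (const void *)0x400000, task, 0);
	if (!ret)
		BPF_SEQ_PRINTF(ctx->meta->seq, "%8d 0x%08x\n", task->pid, word);
	return 0;
}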

Changes since v6:
* Split first patch into two patches: first patch to add support
  for bpf iterators to use sleepable helpers, and the second to add
  the new bpf helper.
* Simplify implementation of `bpf_copy_from_user_task` based on feedback.
* Add to docs that the destination buffer will be zeroed on error.

Changes since v5:
* Rename `bpf_access_process_vm` to `bpf_copy_from_user_task`.
* Change the return value to be all-or-nothing: if we get a partial read,
  memset all bytes to 0 and return -EFAULT (see the sketch at the end of
  this changelog).
* Add to docs that the helper can only be used by sleepable BPF programs.
* Fix nits in selftests.

Changes since v4:
* Make `flags` a u64.
* Use `user_ptr` arg name to be consistent with `bpf_copy_from_user`.
* Add an extra check in selftests to verify access_process_vm calls
  succeeded.

Changes since v3:
* Check if `flags` is 0 and return -EINVAL if not.
* Rebase on latest bpf-next branch and fix merge conflicts.

Changes since v2:
* Reorder arguments in `bpf_access_process_vm` to match existing related
  bpf helpers (e.g. `bpf_probe_read_kernel`, `bpf_probe_read_user`,
  `bpf_copy_from_user`).
* The `flags` argument is provided for future extensibility and is not
  currently used; we always invoke `access_process_vm` with no flags.
* Merge bpf helper patch and `bpf_iter_run_prog` patch together for better
  bisectability in case of failures.
* Clean up formatting and comments in selftests.

Changes since v1:
* Fixed "Invalid wait context" issue in `bpf_iter_run_prog` by using
  `rcu_read_lock_trace()` for sleepable bpf iterator programs.
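
To make the v5 all-or-nothing change above concrete, a hedged sketch of
the resulting calling convention (variable names illustrative):

	__u64 val;
	long err = bpf_copy_from_user_task(&val, sizeof(val), user_ptr, task, 0);
	if (err) {
		/* val is fully zeroed here; err is -EFAULT for a short
		 * read, or another negative error from the underlying copy.
		 */
		return 0;
	}
	/* val now holds exactly sizeof(val) bytes read from user_ptr */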
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents caaba961 45105c2e
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2243,6 +2243,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
 extern const struct bpf_func_proto bpf_loop_proto;
 extern const struct bpf_func_proto bpf_strncmp_proto;
+extern const struct bpf_func_proto bpf_copy_from_user_task_proto;
 
 const struct bpf_func_proto *tracing_prog_func_proto(
 	enum bpf_func_id func_id, const struct bpf_prog *prog);
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5076,6 +5076,16 @@ union bpf_attr {
  *		associated to *xdp_md*, at *offset*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ *		address space, and stores the data in *dst*. *flags* is not
+ *		used yet and is provided for future extensibility. This helper
+ *		can only be used by sleepable programs.
+ *	Return
+ *		0 on success, or a negative error in case of failure. On error
+ *		*dst* buffer is zeroed out.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5269,6 +5279,7 @@ union bpf_attr {
 	FN(xdp_get_buff_len),		\
 	FN(xdp_load_bytes),		\
 	FN(xdp_store_bytes),		\
+	FN(copy_from_user_task),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
 
 struct bpf_iter_target_info {
 	struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
 {
 	int ret;
 
-	rcu_read_lock();
-	migrate_disable();
-	ret = bpf_prog_run(prog, ctx);
-	migrate_enable();
-	rcu_read_unlock();
+	if (prog->aux->sleepable) {
+		rcu_read_lock_trace();
+		migrate_disable();
+		might_fault();
+		ret = bpf_prog_run(prog, ctx);
+		migrate_enable();
+		rcu_read_unlock_trace();
+	} else {
+		rcu_read_lock();
+		migrate_disable();
+		ret = bpf_prog_run(prog, ctx);
+		migrate_enable();
+		rcu_read_unlock();
+	}
 
 	/* bpf program can only return 0 or 1:
 	 * 0 : okay
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/proc_ns.h>
 #include <linux/security.h>
+#include <linux/btf_ids.h>
 
 #include "../../lib/kstrtox.h"
 
@@ -671,6 +672,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+	   const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+	int ret;
+
+	/* flags is not used yet */
+	if (unlikely(flags))
+		return -EINVAL;
+
+	if (unlikely(!size))
+		return 0;
+
+	ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+	if (ret == size)
+		return 0;
+
+	memset(dst, 0, size);
+	/* Return -EFAULT for partial read */
+	return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+	.func		= bpf_copy_from_user_task,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_BTF_ID,
+	.arg4_btf_id	= &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+	.arg5_type	= ARG_ANYTHING
+};
+
 BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
 {
 	if (cpu >= nr_cpu_ids)
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1235,6 +1235,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_task_stack_proto;
 	case BPF_FUNC_copy_from_user:
 		return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+	case BPF_FUNC_copy_from_user_task:
+		return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
 	case BPF_FUNC_snprintf_btf:
 		return &bpf_snprintf_btf_proto;
 	case BPF_FUNC_per_cpu_ptr:
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5076,6 +5076,16 @@ union bpf_attr {
  *		associated to *xdp_md*, at *offset*.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags)
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* in *tsk*'s
+ *		address space, and stores the data in *dst*. *flags* is not
+ *		used yet and is provided for future extensibility. This helper
+ *		can only be used by sleepable programs.
+ *	Return
+ *		0 on success, or a negative error in case of failure. On error
+ *		*dst* buffer is zeroed out.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -5269,6 +5279,7 @@ union bpf_attr {
 	FN(xdp_get_buff_len),		\
 	FN(xdp_load_bytes),		\
 	FN(xdp_store_bytes),		\
+	FN(copy_from_user_task),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8612,6 +8612,7 @@ static const struct bpf_sec_def section_defs[] = {
 	SEC_DEF("lsm/",			LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm),
 	SEC_DEF("lsm.s/",		LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm),
 	SEC_DEF("iter/",		TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter),
+	SEC_DEF("iter.s/",		TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter),
 	SEC_DEF("syscall",		SYSCALL, 0, SEC_SLEEPABLE),
 	SEC_DEF("xdp.frags/devmap",	XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS),
 	SEC_DEF("xdp_devmap/",		XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE),
--- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -138,6 +138,24 @@ static void test_task(void)
 	bpf_iter_task__destroy(skel);
 }
 
+static void test_task_sleepable(void)
+{
+	struct bpf_iter_task *skel;
+
+	skel = bpf_iter_task__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "bpf_iter_task__open_and_load"))
+		return;
+
+	do_dummy_read(skel->progs.dump_task_sleepable);
+
+	ASSERT_GT(skel->bss->num_expected_failure_copy_from_user_task, 0,
+		  "num_expected_failure_copy_from_user_task");
+	ASSERT_GT(skel->bss->num_success_copy_from_user_task, 0,
+		  "num_success_copy_from_user_task");
+
+	bpf_iter_task__destroy(skel);
+}
+
 static void test_task_stack(void)
 {
 	struct bpf_iter_task_stack *skel;
@@ -1252,6 +1270,8 @@ void test_bpf_iter(void)
 		test_bpf_map();
 	if (test__start_subtest("task"))
 		test_task();
+	if (test__start_subtest("task_sleepable"))
+		test_task_sleepable();
 	if (test__start_subtest("task_stack"))
 		test_task_stack();
 	if (test__start_subtest("task_file"))
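
For context, a hedged sketch of the standard libbpf flow that a driver
like do_dummy_read() follows (do_dummy_read()'s body is not part of this
diff; the skeleton type comes from the selftest program below):

#include <stdio.h>
#include <unistd.h>
#include <bpf/libbpf.h>
#include "bpf_iter_task.skel.h"

int main(void)
{
	struct bpf_iter_task *skel;
	struct bpf_link *link;
	char buf[4096];
	ssize_t n;
	int iter_fd;

	skel = bpf_iter_task__open_and_load();
	if (!skel)
		return 1;
	link = bpf_program__attach_iter(skel->progs.dump_task_sleepable, NULL);
	if (!link)
		goto out;
	iter_fd = bpf_iter_create(bpf_link__fd(link));
	if (iter_fd < 0)
		goto out_link;

	/* Each read() runs the iterator program; it executes sleepably,
	 * so bpf_copy_from_user_task() may fault pages in.
	 */
	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(iter_fd);
out_link:
	bpf_link__destroy(link);
out:
	bpf_iter_task__destroy(skel);
	return 0;
}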
--- a/tools/testing/selftests/bpf/progs/bpf_iter_task.c
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2020 Facebook */
 #include "bpf_iter.h"
 #include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
 
 char _license[] SEC("license") = "GPL";
 
@@ -23,3 +24,56 @@ int dump_task(struct bpf_iter__task *ctx)
 	BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
 	return 0;
 }
+
+int num_expected_failure_copy_from_user_task = 0;
+int num_success_copy_from_user_task = 0;
+
+SEC("iter.s/task")
+int dump_task_sleepable(struct bpf_iter__task *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	struct task_struct *task = ctx->task;
+	static const char info[] = "    === END ===";
+	struct pt_regs *regs;
+	void *ptr;
+	uint32_t user_data = 0;
+	int ret;
+
+	if (task == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "%s\n", info);
+		return 0;
+	}
+
+	/* Read an invalid pointer and ensure we get an error */
+	ptr = NULL;
+	ret = bpf_copy_from_user_task(&user_data, sizeof(uint32_t), ptr, task, 0);
+	if (ret) {
+		++num_expected_failure_copy_from_user_task;
+	} else {
+		BPF_SEQ_PRINTF(seq, "%s\n", info);
+		return 0;
+	}
+
+	/* Try to read the contents of the task's instruction pointer from the
+	 * remote task's address space.
+	 */
+	regs = (struct pt_regs *)bpf_task_pt_regs(task);
+	if (regs == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "%s\n", info);
+		return 0;
+	}
+	ptr = (void *)PT_REGS_IP(regs);
+
+	ret = bpf_copy_from_user_task(&user_data, sizeof(uint32_t), ptr, task, 0);
+	if (ret) {
+		BPF_SEQ_PRINTF(seq, "%s\n", info);
+		return 0;
+	}
+	++num_success_copy_from_user_task;
+
+	if (ctx->meta->seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "    tgid      gid     data\n");
+
+	BPF_SEQ_PRINTF(seq, "%8d %8d %8d\n", task->tgid, task->pid, user_data);
+	return 0;
+}