Commit 5cd0aea0 authored by Alexei Starovoitov

Merge branch 'support-bpf_kptr_xchg-into-local-kptr'

Amery Hung says:

====================
Support bpf_kptr_xchg into local kptr

This revision adds substantial changes to patch 2 to support structures
with a kptr as the only special BTF field type. The test is split into
local_kptr_stash and task_kfunc_success to remove a dependency on
bpf_testmod that would break veristat results.

This series allows stashing a kptr into a local kptr. Currently, kptrs
are only allowed to be stashed into map values with bpf_kptr_xchg(). A
motivating use case of this series is to enable adding a referenced kptr
to a bpf_rbtree or bpf_list by using the allocated object both as the
graph node and as the storage for the referenced kptr. For example, a
bpf qdisc [0] can enqueue a referenced struct sk_buff kptr into a
bpf_list serving as a FIFO:

    struct skb_node {
            struct sk_buff __kptr *skb;
            struct bpf_list_node node;
    };

    private(A) struct bpf_spin_lock fifo_lock;
    private(A) struct bpf_list_head fifo __contains(skb_node, node);

    /* In Qdisc_ops.enqueue */
    struct skb_node *skbn;

    skbn = bpf_obj_new(typeof(*skbn));
    if (!skbn)
        goto drop;

    /* skb is a referenced kptr to struct sk_buff acquired earlier
     * but not shown in this code snippet.
     */
    skb = bpf_kptr_xchg(&skbn->skb, skb);
    if (skb)
        /* Should not happen since skbn was just allocated; release skb
         * here to satisfy the verifier.
         */
        ...

    bpf_spin_lock(&fifo_lock);
    bpf_list_push_back(&fifo, &skbn->node);
    bpf_spin_unlock(&fifo_lock);
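
For completeness, the dequeue path can later unstash the skb with another
bpf_kptr_xchg(). Below is a minimal sketch, assuming the definitions above
and the existing bpf_list_pop_front(), container_of() and bpf_obj_drop()
helpers; the dequeue hook itself and the "empty" label are hypothetical and
belong to the bpf qdisc WIP, not to this series:

    /* In Qdisc_ops.dequeue */
    struct bpf_list_node *n;
    struct skb_node *skbn;
    struct sk_buff *skb = NULL;

    bpf_spin_lock(&fifo_lock);
    n = bpf_list_pop_front(&fifo);
    bpf_spin_unlock(&fifo_lock);
    if (!n)
        goto empty;

    skbn = container_of(n, struct skb_node, node);

    /* Take back the referenced kptr stashed at enqueue time; the field
     * in skbn becomes NULL as a result of the exchange.
     */
    skb = bpf_kptr_xchg(&skbn->skb, skb);
    bpf_obj_drop(skbn);
    if (!skb)
        goto empty;

    /* skb must be handed on or released before the program returns. */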

The implementation first searches for BPF_KPTR fields when generating
program BTF. Then, we teach the verifier that the destination argument
of bpf_kptr_xchg() can be a local kptr, and use the btf_record in
program BTF to type-check the source argument.
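
To make the new contract concrete, the following sketch condenses the
pattern exercised by the updated local_kptr_stash selftest (struct and
field names shortened, error handling abbreviated; the "cleanup" label is
hypothetical):

    struct plain_local { long data; };

    struct node_data {
            struct plain_local __kptr *stashed; /* BPF_KPTR found in prog BTF */
            struct bpf_rb_node node;
    };

    struct node_data *res = bpf_obj_new(typeof(*res));
    struct plain_local *inner = bpf_obj_new(typeof(*inner));

    if (!res || !inner)
        goto cleanup; /* drop whichever allocation succeeded */

    /* Previously the destination had to be a map value; now the local
     * kptr field res->stashed is accepted, and the type of inner is
     * checked against the BPF_KPTR record in program BTF.
     */
    inner = bpf_kptr_xchg(&res->stashed, inner);
    if (inner) /* old value; NULL since res was just allocated */
        bpf_obj_drop(inner);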

This series was mostly developed by Dave, who kindly helped and sent me
the patchset. The selftests in the bpf qdisc (WIP) rely on this series
to work.

[0] https://lore.kernel.org/netdev/20240714175130.4051012-10-amery.hung@bytedance.com/
---
v3 -> v4
  - Allow struct in prog btf w/ kptr as the only special field type
  - Split tests of stashing referenced kptr and local kptr
  - v3: https://lore.kernel.org/bpf/20240809005131.3916464-1-amery.hung@bytedance.com/

v2 -> v3
  - Fix prog btf memory leak
  - Test stashing kptr in prog btf
  - Test unstashing kptrs after stashing into local kptrs
  - v2: https://lore.kernel.org/bpf/20240803001145.635887-1-amery.hung@bytedance.com/

v1 -> v2
  - Fix the document for bpf_kptr_xchg()
  - Add a comment explaining changes in the verifier
  - v1: https://lore.kernel.org/bpf/20240728030115.3970543-1-amery.hung@bytedance.com/
====================

Link: https://lore.kernel.org/r/20240813212424.2871455-1-amery.hung@bytedance.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents f727b13d 91c96842
@@ -744,7 +744,7 @@ enum bpf_arg_type {
ARG_PTR_TO_STACK, /* pointer to stack */
ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */
ARG_PTR_TO_TIMER, /* pointer to bpf_timer */
ARG_PTR_TO_KPTR, /* pointer to referenced kptr */
ARG_KPTR_XCHG_DEST, /* pointer to destination that kptrs are bpf_kptr_xchg'd into */
ARG_PTR_TO_DYNPTR, /* pointer to bpf_dynptr. See bpf_type_flag for dynptr type */
__BPF_ARG_TYPE_MAX,
@@ -5519,11 +5519,12 @@ union bpf_attr {
* **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
* invalid arguments are passed.
*
* void *bpf_kptr_xchg(void *map_value, void *ptr)
* void *bpf_kptr_xchg(void *dst, void *ptr)
* Description
* Exchange kptr at pointer *map_value* with *ptr*, and return the
* old value. *ptr* can be NULL, otherwise it must be a referenced
* pointer which will be released when this helper is called.
* Exchange kptr at pointer *dst* with *ptr*, and return the old value.
* *dst* can be map value or local kptr. *ptr* can be NULL, otherwise
* it must be a referenced pointer which will be released when this helper
* is called.
* Return
* The old value of kptr (which can be NULL). The returned pointer
* if not NULL, is a reference which must be released using its
@@ -3754,6 +3754,7 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
return -EINVAL;
}
/* Callers have to ensure the life cycle of btf if it is program BTF */
static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
struct btf_field_info *info)
{
@@ -3782,7 +3783,6 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = NULL;
id = info->kptr.type_id;
kptr_btf = (struct btf *)btf;
btf_get(kptr_btf);
goto found_dtor;
}
if (id < 0)
@@ -5512,36 +5512,70 @@ static const char *alloc_obj_fields[] = {
static struct btf_struct_metas *
btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
{
union {
struct btf_id_set set;
struct {
u32 _cnt;
u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
} _arr;
} aof;
struct btf_struct_metas *tab = NULL;
struct btf_id_set *aof;
int i, n, id, ret;
BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
memset(&aof, 0, sizeof(aof));
aof = kmalloc(sizeof(*aof), GFP_KERNEL | __GFP_NOWARN);
if (!aof)
return ERR_PTR(-ENOMEM);
aof->cnt = 0;
for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
/* Try to find whether this special type exists in user BTF, and
* if so remember its ID so we can easily find it among members
* of structs that we iterate in the next loop.
*/
struct btf_id_set *new_aof;
id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
if (id < 0)
continue;
aof.set.ids[aof.set.cnt++] = id;
new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
goto free_aof;
}
aof = new_aof;
aof->ids[aof->cnt++] = id;
}
n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
/* Try to find if there are kptrs in user BTF and remember their ID */
struct btf_id_set *new_aof;
struct btf_field_info tmp;
const struct btf_type *t;
t = btf_type_by_id(btf, i);
if (!t) {
ret = -EINVAL;
goto free_aof;
}
ret = btf_find_kptr(btf, t, 0, 0, &tmp);
if (ret != BTF_FIELD_FOUND)
continue;
new_aof = krealloc(aof, offsetof(struct btf_id_set, ids[aof->cnt + 1]),
GFP_KERNEL | __GFP_NOWARN);
if (!new_aof) {
ret = -ENOMEM;
goto free_aof;
}
aof = new_aof;
aof->ids[aof->cnt++] = i;
}
if (!aof.set.cnt)
if (!aof->cnt)
return NULL;
sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
sort(&aof->ids, aof->cnt, sizeof(aof->ids[0]), btf_id_cmp_func, NULL);
n = btf_nr_types(btf);
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
@@ -5551,17 +5585,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
int j, tab_cnt;
t = btf_type_by_id(btf, i);
if (!t) {
ret = -EINVAL;
goto free;
}
if (!__btf_type_is_struct(t))
continue;
cond_resched();
for_each_member(j, t, member) {
if (btf_id_set_contains(&aof.set, member->type))
if (btf_id_set_contains(aof, member->type))
goto parse;
}
continue;
@@ -5580,7 +5610,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT |
BPF_KPTR, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
@@ -5589,9 +5620,12 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type->record = record;
tab->cnt++;
}
kfree(aof);
return tab;
free:
btf_struct_metas_free(tab);
free_aof:
kfree(aof);
return ERR_PTR(ret);
}
@@ -1619,9 +1619,9 @@ void bpf_wq_cancel_and_free(void *val)
schedule_work(&work->delete_work);
}
BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
BPF_CALL_2(bpf_kptr_xchg, void *, dst, void *, ptr)
{
unsigned long *kptr = map_value;
unsigned long *kptr = dst;
/* This helper may be inlined by verifier. */
return xchg(kptr, (unsigned long)ptr);
@@ -1636,7 +1636,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
.ret_btf_id = BPF_PTR_POISON,
.arg1_type = ARG_PTR_TO_KPTR,
.arg1_type = ARG_KPTR_XCHG_DEST,
.arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL | OBJ_RELEASE,
.arg2_btf_id = BPF_PTR_POISON,
};
@@ -550,7 +550,8 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_PERCPU:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
btf_put(rec->fields[i].kptr.btf);
if (btf_is_kernel(rec->fields[i].kptr.btf))
btf_put(rec->fields[i].kptr.btf);
break;
case BPF_LIST_HEAD:
case BPF_LIST_NODE:
@@ -596,7 +597,8 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
btf_get(fields[i].kptr.btf);
if (btf_is_kernel(fields[i].kptr.btf))
btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
ret = -ENXIO;
goto free;
@@ -7803,29 +7803,38 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_map *map_ptr = reg->map_ptr;
struct btf_field *kptr_field;
struct bpf_map *map_ptr;
struct btf_record *rec;
u32 kptr_off;
if (type_is_ptr_alloc_obj(reg->type)) {
rec = reg_btf_record(reg);
} else { /* PTR_TO_MAP_VALUE */
map_ptr = reg->map_ptr;
if (!map_ptr->btf) {
verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
map_ptr->name);
return -EINVAL;
}
rec = map_ptr->record;
meta->map_ptr = map_ptr;
}
if (!tnum_is_const(reg->var_off)) {
verbose(env,
"R%d doesn't have constant offset. kptr has to be at the constant offset\n",
regno);
return -EINVAL;
}
if (!map_ptr->btf) {
verbose(env, "map '%s' has to have BTF in order to use bpf_kptr_xchg\n",
map_ptr->name);
return -EINVAL;
}
if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
if (!btf_record_has_field(rec, BPF_KPTR)) {
verbose(env, "R%d has no valid kptr\n", regno);
return -EINVAL;
}
meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
kptr_field = btf_record_find(rec, kptr_off, BPF_KPTR);
if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
@@ -8412,7 +8421,12 @@ static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types timer_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types kptr_xchg_dest_types = {
.types = {
PTR_TO_MAP_VALUE,
PTR_TO_BTF_ID | MEM_ALLOC
}
};
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
@@ -8444,7 +8458,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
[ARG_PTR_TO_KPTR] = &kptr_types,
[ARG_KPTR_XCHG_DEST] = &kptr_xchg_dest_types,
[ARG_PTR_TO_DYNPTR] = &dynptr_types,
};
@@ -8483,7 +8497,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (base_type(arg_type) == ARG_PTR_TO_MEM)
type &= ~DYNPTR_TYPE_FLAG_MASK;
if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type)) {
/* Local kptr types are allowed as the source argument of bpf_kptr_xchg */
if (meta->func_id == BPF_FUNC_kptr_xchg && type_is_alloc(type) && regno == BPF_REG_2) {
type &= ~MEM_ALLOC;
type &= ~MEM_PERCPU;
}
@@ -8576,7 +8591,8 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
if (meta->func_id == BPF_FUNC_kptr_xchg) {
/* Check if local kptr in src arg matches kptr in dst arg */
if (meta->func_id == BPF_FUNC_kptr_xchg && regno == BPF_REG_2) {
if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
}
@@ -8887,7 +8903,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
meta->release_regno = regno;
}
if (reg->ref_obj_id) {
if (reg->ref_obj_id && base_type(arg_type) != ARG_KPTR_XCHG_DEST) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
regno, reg->ref_obj_id,
@@ -9044,7 +9060,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return err;
break;
}
case ARG_PTR_TO_KPTR:
case ARG_KPTR_XCHG_DEST:
err = process_kptr_func(env, regno, meta);
if (err)
return err;
@@ -8,9 +8,12 @@
#include "../bpf_experimental.h"
#include "../bpf_testmod/bpf_testmod_kfunc.h"
struct plain_local;
struct node_data {
long key;
long data;
struct plain_local __kptr * stashed_in_local_kptr;
struct bpf_rb_node node;
};
@@ -85,6 +88,7 @@ static bool less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
static int create_and_stash(int idx, int val)
{
struct plain_local *inner_local_kptr;
struct map_value *mapval;
struct node_data *res;
@@ -92,11 +96,25 @@ static int create_and_stash(int idx, int val)
if (!mapval)
return 1;
inner_local_kptr = bpf_obj_new(typeof(*inner_local_kptr));
if (!inner_local_kptr)
return 2;
res = bpf_obj_new(typeof(*res));
if (!res)
return 1;
if (!res) {
bpf_obj_drop(inner_local_kptr);
return 3;
}
res->key = val;
inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr);
if (inner_local_kptr) {
/* Should never happen, we just obj_new'd res */
bpf_obj_drop(inner_local_kptr);
bpf_obj_drop(res);
return 4;
}
res = bpf_kptr_xchg(&mapval->node, res);
if (res)
bpf_obj_drop(res);
@@ -169,6 +187,7 @@ long stash_local_with_root(void *ctx)
SEC("tc")
long unstash_rb_node(void *ctx)
{
struct plain_local *inner_local_kptr = NULL;
struct map_value *mapval;
struct node_data *res;
long retval;
@@ -180,6 +199,13 @@ long unstash_rb_node(void *ctx)
res = bpf_kptr_xchg(&mapval->node, NULL);
if (res) {
inner_local_kptr = bpf_kptr_xchg(&res->stashed_in_local_kptr, inner_local_kptr);
if (!inner_local_kptr) {
bpf_obj_drop(res);
return 1;
}
bpf_obj_drop(inner_local_kptr);
retval = res->key;
bpf_obj_drop(res);
return retval;
@@ -5,6 +5,7 @@
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>
#include "../bpf_experimental.h"
#include "task_kfunc_common.h"
char _license[] SEC("license") = "GPL";
@@ -143,7 +144,7 @@ SEC("tp_btf/task_newtask")
int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
{
struct task_struct *kptr;
struct __tasks_kfunc_map_value *v;
struct __tasks_kfunc_map_value *v, *local;
long status;
if (!is_test_kfunc_task())
@@ -167,6 +168,29 @@ int BPF_PROG(test_task_xchg_release, struct task_struct *task, u64 clone_flags)
return 0;
}
local = bpf_obj_new(typeof(*local));
if (!local) {
err = 4;
bpf_task_release(kptr);
return 0;
}
kptr = bpf_kptr_xchg(&local->task, kptr);
if (kptr) {
err = 5;
bpf_obj_drop(local);
bpf_task_release(kptr);
return 0;
}
kptr = bpf_kptr_xchg(&local->task, NULL);
if (!kptr) {
err = 6;
bpf_obj_drop(local);
return 0;
}
bpf_obj_drop(local);
bpf_task_release(kptr);
return 0;