Commit f0c5941f authored by Kumar Kartikeya Dwivedi, committed by Alexei Starovoitov

bpf: Support bpf_list_head in map values

Add the support on the map side to parse, recognize, verify, and build
metadata table for a new special field of the type struct bpf_list_head.
To parameterize the bpf_list_head for a certain value type and the
list_node member it will accept in that value type, we use BTF
declaration tags.

The definition of bpf_list_head in a map value will be done as follows:

struct foo {
	struct bpf_list_node node;
	int data;
};

struct map_value {
	struct bpf_list_head head __contains(foo, node);
};

Then, the bpf_list_head only allows adding to the list 'head' using the
bpf_list_node 'node' for the type struct foo.

The 'contains' annotation is a BTF declaration tag composed of three
parts, "contains:name:node", where the name is then used to look up the
type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during
the lookup. The node defines the name of the member in this type that has
the type struct bpf_list_node, which is actually used for linking into
the linked list. For now, the 'kind' is not a separate part of the tag;
it is hardcoded as struct.

This allows building intrusive linked lists in BPF, using container_of
to obtain pointer to entry, while being completely type safe from the
perspective of the verifier. The verifier knows exactly the type of the
nodes, and knows that list helpers return that type at some fixed offset
where the bpf_list_node member used for this list exists. The verifier
also uses this information to disallow adding types that are not
accepted by a certain list.

For now, no elements can be added to such lists. Support for that is
coming in future patches, hence draining and freeing items is done with
a TODO that will be resolved in a future patch.

Note that the bpf_list_head_free function moves the list out to a local
variable under the lock and releases it, doing the actual draining of
the list items outside the lock. While this helps with not holding the
lock for too long, which would pessimize other concurrent list
operations, it is also necessary for deadlock prevention: unless every
function called in the critical section were notrace, a fentry/fexit
program could attach and call bpf_map_update_elem again on the map,
leading to the same lock being acquired if the key matches, and hence to
a deadlock.
While this requires some special effort on part of the BPF programmer to
trigger and is highly unlikely to occur in practice, it is always better
if we can avoid such a condition.

While notrace would prevent this, doing the draining outside the lock
has advantages of its own, hence it is used to also fix the deadlock
related problem.
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parent e5feed0f
...@@ -175,6 +175,7 @@ enum btf_field_type { ...@@ -175,6 +175,7 @@ enum btf_field_type {
BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_UNREF = (1 << 2),
BPF_KPTR_REF = (1 << 3), BPF_KPTR_REF = (1 << 3),
BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF,
BPF_LIST_HEAD = (1 << 4),
}; };
struct btf_field_kptr { struct btf_field_kptr {
...@@ -184,11 +185,18 @@ struct btf_field_kptr { ...@@ -184,11 +185,18 @@ struct btf_field_kptr {
u32 btf_id; u32 btf_id;
}; };
/* Metadata recorded for one bpf_list_head field of a map value. It is
 * filled in by btf_parse_list_head() from the field's "contains" decl tag.
 */
struct btf_field_list_head {
/* BTF object in which value_btf_id is resolved */
struct btf *btf;
/* BTF id of the struct type named by the "contains" tag */
u32 value_btf_id;
/* Byte offset of the bpf_list_node member inside that struct */
u32 node_offset;
};
struct btf_field { struct btf_field {
u32 offset; u32 offset;
enum btf_field_type type; enum btf_field_type type;
union { union {
struct btf_field_kptr kptr; struct btf_field_kptr kptr;
struct btf_field_list_head list_head;
}; };
}; };
...@@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) ...@@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
case BPF_KPTR_UNREF: case BPF_KPTR_UNREF:
case BPF_KPTR_REF: case BPF_KPTR_REF:
return "kptr"; return "kptr";
case BPF_LIST_HEAD:
return "bpf_list_head";
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return "unknown"; return "unknown";
...@@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) ...@@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
case BPF_KPTR_UNREF: case BPF_KPTR_UNREF:
case BPF_KPTR_REF: case BPF_KPTR_REF:
return sizeof(u64); return sizeof(u64);
case BPF_LIST_HEAD:
return sizeof(struct bpf_list_head);
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return 0; return 0;
...@@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) ...@@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
case BPF_KPTR_UNREF: case BPF_KPTR_UNREF:
case BPF_KPTR_REF: case BPF_KPTR_REF:
return __alignof__(u64); return __alignof__(u64);
case BPF_LIST_HEAD:
return __alignof__(struct bpf_list_head);
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return 0; return 0;
...@@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) ...@@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst)
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
bool lock_src); bool lock_src);
void bpf_timer_cancel_and_free(void *timer); void bpf_timer_cancel_and_free(void *timer);
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock);
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
struct bpf_offload_dev; struct bpf_offload_dev;
......
...@@ -6888,6 +6888,16 @@ struct bpf_dynptr { ...@@ -6888,6 +6888,16 @@ struct bpf_dynptr {
__u64 :64; __u64 :64;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
/* Opaque 16-byte, 8-byte-aligned storage for a list head. It exposes no
 * named members; bpf_list_head_free() treats this storage as a struct
 * list_head and build-checks that one fits within it.
 */
struct bpf_list_head {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
/* Opaque 16-byte, 8-byte-aligned storage for a list node. It exposes no
 * named members. A struct that is to be linked into a bpf_list_head
 * embeds one of these, and the head's "contains" tag names that member.
 */
struct bpf_list_node {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
struct bpf_sysctl { struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1). __u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write. * Allows 1,2,4-byte read, but no write.
......
...@@ -3205,9 +3205,15 @@ enum { ...@@ -3205,9 +3205,15 @@ enum {
struct btf_field_info { struct btf_field_info {
enum btf_field_type type; enum btf_field_type type;
u32 off; u32 off;
union {
struct { struct {
u32 type_id; u32 type_id;
} kptr; } kptr;
struct {
const char *node_name;
u32 value_btf_id;
} list_head;
};
}; };
static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
...@@ -3261,6 +3267,63 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, ...@@ -3261,6 +3267,63 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
return BTF_FIELD_FOUND; return BTF_FIELD_FOUND;
} }
/* Find the first decl tag attached to component @comp_idx of type @pt
 * whose string starts with @tag_key.
 *
 * Type ids are scanned linearly from 1 up to btf_nr_types(). A tag matches
 * when it is a decl tag, its target type is @pt, its component index equals
 * @comp_idx and its name begins with @tag_key.
 *
 * Returns a pointer to the text that follows @tag_key inside the tag's
 * name string (nothing is copied or allocated here), or NULL when no
 * matching tag exists.
 */
static const char *btf_find_decl_tag_value(const struct btf *btf,
					   const struct btf_type *pt,
					   int comp_idx, const char *tag_key)
{
	/* tag_key never changes, so measure it once instead of per type id */
	int len = strlen(tag_key);
	int i;

	for (i = 1; i < btf_nr_types(btf); i++) {
		const struct btf_type *t = btf_type_by_id(btf, i);
		const char *tag;

		if (!btf_type_is_decl_tag(t))
			continue;
		if (pt != btf_type_by_id(btf, t->type) ||
		    btf_type_decl_tag(t)->component_idx != comp_idx)
			continue;
		/* Look the name up once and reuse it for compare and return */
		tag = __btf_name_by_offset(btf, t->name_off);
		if (strncmp(tag, tag_key, len))
			continue;
		return tag + len;
	}
	return NULL;
}
/* Recognise a bpf_list_head field and parse its "contains:<type>:<node>"
 * decl tag.
 *
 * @pt/@comp_idx identify the tagged declaration, @t is the candidate field
 * type, @off its offset and @sz the expected size of a bpf_list_head.
 *
 * Returns BTF_FIELD_IGNORE when @t is not a struct of size @sz,
 * BTF_FIELD_FOUND with @info filled in on success, -EINVAL when the tag is
 * missing or malformed, -ENOMEM when the temporary copy of the type name
 * cannot be allocated, or the negative result of the type lookup.
 *
 * On success info->list_head.node_name points at the tail of the tag value
 * returned by btf_find_decl_tag_value(); it is not a separate allocation.
 */
static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
			      const struct btf_type *t, int comp_idx,
			      u32 off, int sz, struct btf_field_info *info)
{
	const char *tag_val, *sep, *type_name;
	s32 type_id;

	if (!__btf_type_is_struct(t) || t->size != sz)
		return BTF_FIELD_IGNORE;

	tag_val = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
	if (!tag_val)
		return -EINVAL;

	/* tag_val is "<type>:<node>"; the separator splits the two halves */
	sep = strstr(tag_val, ":");
	if (!sep)
		return -EINVAL;

	/* The lookup needs a NUL-terminated name, so copy the <type> half */
	type_name = kstrndup(tag_val, sep - tag_val, GFP_KERNEL | __GFP_NOWARN);
	if (!type_name)
		return -ENOMEM;
	type_id = btf_find_by_name_kind(btf, type_name, BTF_KIND_STRUCT);
	kfree(type_name);
	if (type_id < 0)
		return type_id;

	/* Whatever follows the separator is the node member name */
	if (str_is_empty(sep + 1))
		return -EINVAL;

	info->type = BPF_LIST_HEAD;
	info->off = off;
	info->list_head.value_btf_id = type_id;
	info->list_head.node_name = sep + 1;
	return BTF_FIELD_FOUND;
}
static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
int *align, int *sz) int *align, int *sz)
{ {
...@@ -3284,6 +3347,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, ...@@ -3284,6 +3347,12 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
goto end; goto end;
} }
} }
if (field_mask & BPF_LIST_HEAD) {
if (!strcmp(name, "bpf_list_head")) {
type = BPF_LIST_HEAD;
goto end;
}
}
/* Only return BPF_KPTR when all other types with matchable names fail */ /* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & BPF_KPTR) { if (field_mask & BPF_KPTR) {
type = BPF_KPTR_REF; type = BPF_KPTR_REF;
...@@ -3339,6 +3408,12 @@ static int btf_find_struct_field(const struct btf *btf, ...@@ -3339,6 +3408,12 @@ static int btf_find_struct_field(const struct btf *btf,
if (ret < 0) if (ret < 0)
return ret; return ret;
break; break;
case BPF_LIST_HEAD:
ret = btf_find_list_head(btf, t, member_type, i, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
default: default:
return -EFAULT; return -EFAULT;
} }
...@@ -3393,6 +3468,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, ...@@ -3393,6 +3468,12 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
if (ret < 0) if (ret < 0)
return ret; return ret;
break; break;
case BPF_LIST_HEAD:
ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
default: default:
return -EFAULT; return -EFAULT;
} }
...@@ -3491,11 +3572,52 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, ...@@ -3491,11 +3572,52 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
return ret; return ret;
} }
/* Resolve the node member named by a bpf_list_head's "contains" tag and
 * record the list metadata in @field.
 *
 * @info carries the value type id and node member name collected by
 * btf_find_list_head(). Every member of the value type is visited, so a
 * second member with the same name is detected and rejected.
 *
 * Returns 0 on success, -ENOENT when no member has the requested name, and
 * -EINVAL when the name is duplicated, the member is not a struct named
 * "bpf_list_node", or its offset is not byte aligned or not aligned for
 * struct bpf_list_node.
 */
static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
			       struct btf_field_info *info)
{
	const struct btf_type *t, *n = NULL;
	const struct btf_member *member;
	u32 offset;
	int i;

	t = btf_type_by_id(btf, info->list_head.value_btf_id);
	/* We've already checked that value_btf_id is a struct type. We
	 * just need to figure out the offset of the list_node, and
	 * verify its type.
	 */
	for_each_member(i, t, member) {
		if (strcmp(info->list_head.node_name,
			   __btf_name_by_offset(btf, member->name_off)))
			continue;
		/* Invalid BTF, two members with same name */
		if (n)
			return -EINVAL;
		n = btf_type_by_id(btf, member->type);
		if (!__btf_type_is_struct(n))
			return -EINVAL;
		if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
			return -EINVAL;
		/* The member belongs to t, so decode its bit offset against
		 * the containing struct t rather than the member's own type n.
		 */
		offset = __btf_member_bit_offset(t, member);
		if (offset % 8)
			return -EINVAL;
		/* Convert from bits to bytes */
		offset /= 8;
		if (offset % __alignof__(struct bpf_list_node))
			return -EINVAL;

		field->list_head.btf = (struct btf *)btf;
		field->list_head.value_btf_id = info->list_head.value_btf_id;
		field->list_head.node_offset = offset;
	}
	if (!n)
		return -ENOENT;
	return 0;
}
struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
u32 field_mask, u32 value_size) u32 field_mask, u32 value_size)
{ {
struct btf_field_info info_arr[BTF_FIELDS_MAX]; struct btf_field_info info_arr[BTF_FIELDS_MAX];
struct btf_record *rec; struct btf_record *rec;
u32 next_off = 0;
int ret, i, cnt; int ret, i, cnt;
ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
...@@ -3517,6 +3639,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type ...@@ -3517,6 +3639,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
ret = -EFAULT; ret = -EFAULT;
goto end; goto end;
} }
if (info_arr[i].off < next_off) {
ret = -EEXIST;
goto end;
}
next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
rec->field_mask |= info_arr[i].type; rec->field_mask |= info_arr[i].type;
rec->fields[i].offset = info_arr[i].off; rec->fields[i].offset = info_arr[i].off;
...@@ -3539,12 +3666,24 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type ...@@ -3539,12 +3666,24 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
if (ret < 0) if (ret < 0)
goto end; goto end;
break; break;
case BPF_LIST_HEAD:
ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
break;
default: default:
ret = -EFAULT; ret = -EFAULT;
goto end; goto end;
} }
rec->cnt++; rec->cnt++;
} }
/* bpf_list_head requires bpf_spin_lock */
if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
ret = -EINVAL;
goto end;
}
return rec; return rec;
end: end:
btf_record_free(rec); btf_record_free(rec);
......
...@@ -1706,6 +1706,38 @@ bpf_base_func_proto(enum bpf_func_id func_id) ...@@ -1706,6 +1706,38 @@ bpf_base_func_proto(enum bpf_func_id func_id)
} }
} }
/* Detach and free every object linked on the list at @list_head.
 *
 * @field supplies the offset of the list node inside each object,
 * @list_head is the bpf_list_head storage inside the map value and
 * @spin_lock is the lock taken while the list is detached.
 */
void bpf_list_head_free(const struct btf_field *field, void *list_head,
			struct bpf_spin_lock *spin_lock)
{
	struct list_head *orig_head = list_head;
	struct list_head *pos = orig_head;

	BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
	BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));

	/* Only detach the list while holding the lock; the draining happens
	 * after the lock is dropped. This keeps the critical section short and
	 * also avoids a deadlock if a tracing program runs on entry/exit of a
	 * function called inside the critical section and performs a map
	 * operation that calls bpf_list_head_free for the same map value.
	 */
	__bpf_spin_lock_irqsave(spin_lock);
	/* A NULL next pointer or an empty list leaves pos == orig_head, so
	 * the drain loop below does nothing.
	 */
	if (pos->next && !list_empty(pos))
		pos = pos->next;
	INIT_LIST_HEAD(orig_head);
	__bpf_spin_unlock_irqrestore(spin_lock);

	/* The last detached node still points back at orig_head, which is
	 * what terminates the walk.
	 */
	while (pos != orig_head) {
		struct list_head *next = pos->next;

		/* Step back from the embedded node to the start of the object */
		/* TODO: Rework later */
		kfree((void *)pos - field->list_head.node_offset);
		pos = next;
	}
}
BTF_SET8_START(tracing_btf_ids) BTF_SET8_START(tracing_btf_ids)
#ifdef CONFIG_KEXEC_CORE #ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
......
...@@ -536,6 +536,9 @@ void btf_record_free(struct btf_record *rec) ...@@ -536,6 +536,9 @@ void btf_record_free(struct btf_record *rec)
module_put(rec->fields[i].kptr.module); module_put(rec->fields[i].kptr.module);
btf_put(rec->fields[i].kptr.btf); btf_put(rec->fields[i].kptr.btf);
break; break;
case BPF_LIST_HEAD:
/* Nothing to release for bpf_list_head */
break;
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
continue; continue;
...@@ -578,6 +581,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) ...@@ -578,6 +581,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
goto free; goto free;
} }
break; break;
case BPF_LIST_HEAD:
/* Nothing to acquire for bpf_list_head */
break;
default: default:
ret = -EFAULT; ret = -EFAULT;
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
...@@ -637,6 +643,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) ...@@ -637,6 +643,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
case BPF_KPTR_REF: case BPF_KPTR_REF:
field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
break; break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
break;
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
continue; continue;
...@@ -965,7 +976,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ...@@ -965,7 +976,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size) if (!value_type || value_size != map->value_size)
return -EINVAL; return -EINVAL;
map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
map->value_size); map->value_size);
if (!IS_ERR_OR_NULL(map->record)) { if (!IS_ERR_OR_NULL(map->record)) {
int i; int i;
...@@ -1012,6 +1024,14 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ...@@ -1012,6 +1024,14 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
goto free_map_tab; goto free_map_tab;
} }
break; break;
case BPF_LIST_HEAD:
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
default: default:
/* Fail if map_type checks are missing for a field type */ /* Fail if map_type checks are missing for a field type */
ret = -EOPNOTSUPP; ret = -EOPNOTSUPP;
......
...@@ -12814,6 +12814,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, ...@@ -12814,6 +12814,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{ {
enum bpf_prog_type prog_type = resolve_prog_type(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog);
if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
if (is_tracing_prog_type(prog_type)) {
verbose(env, "tracing progs cannot use bpf_list_head yet\n");
return -EINVAL;
}
}
if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
......
...@@ -6888,6 +6888,16 @@ struct bpf_dynptr { ...@@ -6888,6 +6888,16 @@ struct bpf_dynptr {
__u64 :64; __u64 :64;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
/* Opaque 16-byte, 8-byte-aligned storage for a list head. It exposes no
 * named members; bpf_list_head_free() treats this storage as a struct
 * list_head and build-checks that one fits within it.
 */
struct bpf_list_head {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
/* Opaque 16-byte, 8-byte-aligned storage for a list node. It exposes no
 * named members. A struct that is to be linked into a bpf_list_head
 * embeds one of these, and the head's "contains" tag names that member.
 */
struct bpf_list_node {
__u64 :64;
__u64 :64;
} __attribute__((aligned(8)));
struct bpf_sysctl { struct bpf_sysctl {
__u32 write; /* Sysctl is being read (= 0) or written (= 1). __u32 write; /* Sysctl is being read (= 0) or written (= 1).
* Allows 1,2,4-byte read, but no write. * Allows 1,2,4-byte read, but no write.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment