Commit 417759f7 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'tcp-bpf-cc'

Martin Lau says:

====================
This series introduces BPF STRUCT_OPS.  It is an infra to allow
implementing some specific kernel's function pointers in BPF.
The first use case included in this series is to implement
TCP congestion control algorithm in BPF  (i.e. implement
struct tcp_congestion_ops in BPF).

There has been attempt to move the TCP CC to the user space
(e.g. CCP in TCP).   The common arguments are faster turn around,
get away from long-tail kernel versions in production...etc,
which are legit points.

BPF has been the continuous effort to join both kernel and
userspace upsides together (e.g. XDP to gain the performance
advantage without bypassing the kernel).  The recent BPF
advancements (in particular BTF-aware verifier, BPF trampoline,
BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc)
possible in BPF.

The idea is to allow implementing tcp_congestion_ops in bpf.
It allows a faster turnaround for testing algorithm in the
production while leveraging the existing (and continue growing) BPF
feature/framework instead of building one specifically for
userspace TCP CC.

Please see individual patch for details.

The bpftool support will be posted in follow-up patches.

v4:
- Expose tcp_ca_find() to tcp.h in patch 7.
  It is used to check the same bpf-tcp-cc
  does not exist to guarantee the register()
  will succeed.
- set_memory_ro() and then set_memory_x() only after all
  trampolines are written to the image in patch 6. (Daniel)
  spinlock is replaced by mutex because set_memory_*
  requires sleepable context.

v3:
- Fix kbuild error by considering CONFIG_BPF_SYSCALL (kbuild)
- Support anonymous bitfield in patch 4 (Andrii, Yonghong)
- Push boundary safety check to a specific arch's trampoline function
  (in patch 6) (Yonghong).
  Reuse the WANR_ON_ONCE check in arch_prepare_bpf_trampoline() in x86.
- Check module field is 0 in udata in patch 6 (Yonghong)
- Check zero holes in patch 6 (Andrii)
- s/_btf_vmlinux/btf/ in patch 5 and 7 (Andrii)
- s/check_xxx/is_xxx/ in patch 7 (Andrii)
- Use "struct_ops/" convention in patch 11 (Andrii)
- Use the skel instead of bpf_object in patch 11 (Andrii)
- libbpf: Decide BPF_PROG_TYPE_STRUCT_OPS at open phase by using
          find_sec_def()
- libbpf: Avoid a debug message at open phase (Andrii)
- libbpf: Add bpf_program__(is|set)_struct_ops() for consistency (Andrii)
- libbpf: Add "struct_ops" to section_defs (Andrii)
- libbpf: Some code shuffling in init_kern_struct_ops() (Andrii)
- libbpf: A few safety checks (Andrii)

v2:
- Dropped cubic for now.  They will be reposted
  once there are more clarity in "jiffies" on both
  bpf side (about the helper) and
  tcp_cubic side (some of jiffies usages are being replaced
  by tp->tcp_mstamp)
- Remove unnecssary check on bitfield support from btf_struct_access()
  (Yonghong)
- BTF_TYPE_EMIT macro (Yonghong, Andrii)
- value_name's length check to avoid an unlikely
  type match during truncation case (Yonghong)
- BUILD_BUG_ON to ensure no trampoline-image overrun
  in the future (Yonghong)
- Simplify get_next_key() (Yonghong)
- Added comment to explain how to check mandatory
  func ptr in net/ipv4/bpf_tcp_ca.c (Yonghong)
- Rename "__bpf_" to "bpf_struct_ops_" for value prefix (Andrii)
- Add comment to highlight the bpf_dctcp.c is not necessarily
  the same as tcp_dctcp.c. (Alexei, Eric)
- libbpf: Renmae "struct_ops" to ".struct_ops" for elf sec (Andrii)
- libbpf: Expose struct_ops as a bpf_map (Andrii)
- libbpf: Support multiple struct_ops in SEC(".struct_ops") (Andrii)
- libbpf: Add bpf_map__attach_struct_ops()  (Andrii)
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 65726b5b 09903869
......@@ -1328,7 +1328,7 @@ xadd: if (is_imm8(insn->off))
return proglen;
}
static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
int i;
......@@ -1344,7 +1344,7 @@ static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
-(stack_size - i * 8));
}
static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
int i;
......@@ -1361,7 +1361,7 @@ static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
-(stack_size - i * 8));
}
static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_prog **progs, int prog_cnt, int stack_size)
{
u8 *prog = *pprog;
......@@ -1456,7 +1456,8 @@ static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
int arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
......@@ -1523,13 +1524,10 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
EMIT1(0xC3); /* ret */
/* One half of the page has active running trampoline.
* Another half is an area for next trampoline.
* Make sure the trampoline generation logic doesn't overflow.
*/
if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
/* Make sure the trampoline generation logic doesn't overflow */
if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY))
return -EFAULT;
return 0;
return prog - (u8 *)image;
}
static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
......
......@@ -17,6 +17,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/module.h>
struct bpf_verifier_env;
struct bpf_verifier_log;
......@@ -106,6 +107,7 @@ struct bpf_map {
struct btf *btf;
struct bpf_map_memory memory;
char name[BPF_OBJ_NAME_LEN];
u32 btf_vmlinux_value_type_id;
bool unpriv_array;
bool frozen; /* write-once; write-protected by freeze_mutex */
/* 22 bytes hole */
......@@ -183,7 +185,8 @@ static inline bool bpf_map_offload_neutral(const struct bpf_map *map)
static inline bool bpf_map_support_seq_show(const struct bpf_map *map)
{
return map->btf && map->ops->map_seq_show_elem;
return (map->btf_value_type_id || map->btf_vmlinux_value_type_id) &&
map->ops->map_seq_show_elem;
}
int map_check_no_btf(const struct bpf_map *map,
......@@ -349,6 +352,10 @@ struct bpf_verifier_ops {
const struct bpf_insn *src,
struct bpf_insn *dst,
struct bpf_prog *prog, u32 *target_size);
int (*btf_struct_access)(struct bpf_verifier_log *log,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype,
u32 *next_btf_id);
};
struct bpf_prog_offload_ops {
......@@ -437,7 +444,8 @@ struct btf_func_model {
* fentry = a set of program to run before calling original function
* fexit = a set of program to run after original function
*/
int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
int arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call);
......@@ -668,6 +676,73 @@ struct bpf_array_aux {
struct work_struct work;
};
struct bpf_struct_ops_value;
struct btf_type;
struct btf_member;
#define BPF_STRUCT_OPS_MAX_NR_MEMBERS 64
struct bpf_struct_ops {
const struct bpf_verifier_ops *verifier_ops;
int (*init)(struct btf *btf);
int (*check_member)(const struct btf_type *t,
const struct btf_member *member);
int (*init_member)(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata);
int (*reg)(void *kdata);
void (*unreg)(void *kdata);
const struct btf_type *type;
const struct btf_type *value_type;
const char *name;
struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
u32 type_id;
u32 value_id;
};
#if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
#define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id);
void bpf_struct_ops_init(struct btf *btf);
bool bpf_struct_ops_get(const void *kdata);
void bpf_struct_ops_put(const void *kdata);
int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
void *value);
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
if (owner == BPF_MODULE_OWNER)
return bpf_struct_ops_get(data);
else
return try_module_get(owner);
}
static inline void bpf_module_put(const void *data, struct module *owner)
{
if (owner == BPF_MODULE_OWNER)
bpf_struct_ops_put(data);
else
module_put(owner);
}
#else
static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
{
return NULL;
}
static inline void bpf_struct_ops_init(struct btf *btf) { }
static inline bool bpf_try_module_get(const void *data, struct module *owner)
{
return try_module_get(owner);
}
static inline void bpf_module_put(const void *data, struct module *owner)
{
module_put(owner);
}
static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map,
void *key,
void *value)
{
return -EINVAL;
}
#endif
struct bpf_array {
struct bpf_map map;
u32 elem_size;
......
......@@ -65,6 +65,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport,
struct sk_reuseport_md, struct sk_reuseport_kern)
#endif
#if defined(CONFIG_BPF_JIT)
BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops,
void *, void *)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
......@@ -105,3 +109,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
#if defined(CONFIG_BPF_JIT)
BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
#endif
......@@ -7,6 +7,8 @@
#include <linux/types.h>
#include <uapi/linux/btf.h>
#define BTF_TYPE_EMIT(type) ((void)(type *)0)
struct btf;
struct btf_member;
struct btf_type;
......@@ -53,6 +55,22 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
u32 expected_offset, u32 expected_size);
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
bool btf_type_is_void(const struct btf_type *t);
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind);
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id);
const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
u32 id, u32 *res_id);
const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
u32 id, u32 *res_id);
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *total_nelems);
#define for_each_member(i, struct_type, member) \
for (i = 0, member = btf_type_member(struct_type); \
i < btf_type_vlen(struct_type); \
i++, member++)
static inline bool btf_type_is_ptr(const struct btf_type *t)
{
......@@ -84,6 +102,35 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
}
static inline u16 btf_type_vlen(const struct btf_type *t)
{
return BTF_INFO_VLEN(t->info);
}
static inline bool btf_type_kflag(const struct btf_type *t)
{
return BTF_INFO_KFLAG(t->info);
}
static inline u32 btf_member_bit_offset(const struct btf_type *struct_type,
const struct btf_member *member)
{
return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
: member->offset;
}
static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type,
const struct btf_member *member)
{
return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
: 0;
}
static inline const struct btf_member *btf_type_member(const struct btf_type *t)
{
return (const struct btf_member *)(t + 1);
}
#ifdef CONFIG_BPF_SYSCALL
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
......
......@@ -843,6 +843,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
......
......@@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags {
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN 0x2
#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
union tcp_cc_info;
......@@ -1101,6 +1102,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find(const char *name);
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
......
......@@ -136,6 +136,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
};
/* Note that tracing related programs such as
......@@ -174,6 +175,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
BPF_PROG_TYPE_CGROUP_SOCKOPT,
BPF_PROG_TYPE_TRACING,
BPF_PROG_TYPE_STRUCT_OPS,
};
enum bpf_attach_type {
......@@ -397,6 +399,10 @@ union bpf_attr {
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_type_id; /* BTF type_id of the key */
__u32 btf_value_type_id; /* BTF type_id of the value */
__u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
* struct stored as the
* map value
*/
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
......@@ -2831,6 +2837,14 @@ union bpf_attr {
* Return
* On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value.
*
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
* *rcv_nxt* is the ack_seq to be sent out.
* Return
* 0 on success, or a negative error in case of failure.
*
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2948,7 +2962,8 @@ union bpf_attr {
FN(probe_read_user), \
FN(probe_read_kernel), \
FN(probe_read_user_str), \
FN(probe_read_kernel_str),
FN(probe_read_kernel_str), \
FN(tcp_send_ack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -3349,7 +3364,7 @@ struct bpf_map_info {
__u32 map_flags;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
__u32 :32;
__u32 btf_vmlinux_value_type_id;
__u64 netns_dev;
__u64 netns_ino;
__u32 btf_id;
......
......@@ -27,3 +27,6 @@ endif
ifeq ($(CONFIG_SYSFS),y)
obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
endif
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
#endif
#endif
......@@ -180,11 +180,6 @@
*/
#define BTF_MAX_SIZE (16 * 1024 * 1024)
#define for_each_member(i, struct_type, member) \
for (i = 0, member = btf_type_member(struct_type); \
i < btf_type_vlen(struct_type); \
i++, member++)
#define for_each_member_from(i, from, struct_type, member) \
for (i = from, member = btf_type_member(struct_type) + from; \
i < btf_type_vlen(struct_type); \
......@@ -382,6 +377,65 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
const struct btf_type *t;
const char *tname;
u32 i;
for (i = 1; i <= btf->nr_types; i++) {
t = btf->types[i];
if (BTF_INFO_KIND(t->info) != kind)
continue;
tname = btf_name_by_offset(btf, t->name_off);
if (!strcmp(tname, name))
return i;
}
return -ENOENT;
}
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *t = btf_type_by_id(btf, id);
while (btf_type_is_modifier(t)) {
id = t->type;
t = btf_type_by_id(btf, t->type);
}
if (res_id)
*res_id = id;
return t;
}
const struct btf_type *btf_type_resolve_ptr(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *t;
t = btf_type_skip_modifiers(btf, id, NULL);
if (!btf_type_is_ptr(t))
return NULL;
return btf_type_skip_modifiers(btf, t->type, res_id);
}
const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
u32 id, u32 *res_id)
{
const struct btf_type *ptype;
ptype = btf_type_resolve_ptr(btf, id, res_id);
if (ptype && btf_type_is_func_proto(ptype))
return ptype;
return NULL;
}
/* Types that act only as a source, not sink or intermediate
* type when resolving.
*/
......@@ -446,30 +500,6 @@ static const char *btf_int_encoding_str(u8 encoding)
return "UNKN";
}
static u16 btf_type_vlen(const struct btf_type *t)
{
return BTF_INFO_VLEN(t->info);
}
static bool btf_type_kflag(const struct btf_type *t)
{
return BTF_INFO_KFLAG(t->info);
}
static u32 btf_member_bit_offset(const struct btf_type *struct_type,
const struct btf_member *member)
{
return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset)
: member->offset;
}
static u32 btf_member_bitfield_size(const struct btf_type *struct_type,
const struct btf_member *member)
{
return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset)
: 0;
}
static u32 btf_type_int(const struct btf_type *t)
{
return *(u32 *)(t + 1);
......@@ -480,11 +510,6 @@ static const struct btf_array *btf_type_array(const struct btf_type *t)
return (const struct btf_array *)(t + 1);
}
static const struct btf_member *btf_type_member(const struct btf_type *t)
{
return (const struct btf_member *)(t + 1);
}
static const struct btf_enum *btf_type_enum(const struct btf_type *t)
{
return (const struct btf_enum *)(t + 1);
......@@ -1057,7 +1082,7 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *elem_type: same as return type ("struct X")
* *total_nelems: 1
*/
static const struct btf_type *
const struct btf_type *
btf_resolve_size(const struct btf *btf, const struct btf_type *type,
u32 *type_size, const struct btf_type **elem_type,
u32 *total_nelems)
......@@ -1111,8 +1136,10 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
return ERR_PTR(-EINVAL);
*type_size = nelems * size;
*total_nelems = nelems;
*elem_type = type;
if (total_nelems)
*total_nelems = nelems;
if (elem_type)
*elem_type = type;
return array_type ? : type;
}
......@@ -1826,7 +1853,10 @@ static void btf_modifier_seq_show(const struct btf *btf,
u32 type_id, void *data,
u8 bits_offset, struct seq_file *m)
{
t = btf_type_id_resolve(btf, &type_id);
if (btf->resolved_ids)
t = btf_type_id_resolve(btf, &type_id);
else
t = btf_type_skip_modifiers(btf, type_id, NULL);
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
......@@ -3605,6 +3635,8 @@ struct btf *btf_parse_vmlinux(void)
goto errout;
}
bpf_struct_ops_init(btf);
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
......@@ -3677,7 +3709,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t))
if (btf_type_is_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
......@@ -3697,7 +3729,6 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* this is a pointer to another type */
info->reg_type = PTR_TO_BTF_ID;
info->btf_id = t->type;
if (tgt_prog) {
ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type);
......@@ -3708,10 +3739,14 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return false;
}
}
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
while (btf_type_is_modifier(t))
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
}
if (!btf_type_is_struct(t)) {
bpf_log(log,
"func '%s' arg%d type %s is not a struct\n",
......@@ -3737,23 +3772,57 @@ int btf_struct_access(struct bpf_verifier_log *log,
again:
tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct", tname);
bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
for_each_member(i, t, member) {
if (btf_member_bitfield_size(t, member))
/* bitfields are not supported yet */
continue;
if (off + size > t->size) {
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;
}
for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
if (btf_member_bitfield_size(t, member)) {
u32 end_bit = btf_member_bit_offset(t, member) +
btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
* bitfield like the ":16" here:
* struct {
* int :16;
* int x:8;
* };
*/
if (off <= moff &&
BITS_ROUNDUP_BYTES(end_bit) <= off + size)
return SCALAR_VALUE;
/* off may be accessing a following member
*
* or
*
* Doing partial access at either end of this
* bitfield. Continue on this case also to
* treat it as not accessing this bitfield
* and eventually error out as field not
* found to keep it simple.
* It could be relaxed if there was a legit
* partial access case later.
*/
continue;
}
/* In case of "off" is pointing to holes of a struct */
if (off < moff)
continue;
break;
/* type of the field */
mtype = btf_type_by_id(btf_vmlinux, member->type);
......
......@@ -22,7 +22,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
*/
if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE ||
inner_map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
fdput(f);
return ERR_PTR(-ENOTSUPP);
}
......
......@@ -628,7 +628,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
......@@ -642,6 +642,14 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
if (attr->btf_vmlinux_value_type_id) {
if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS ||
attr->btf_key_type_id || attr->btf_value_type_id)
return -EINVAL;
} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
return -EINVAL;
}
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
......@@ -664,32 +672,35 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
if (attr->btf_key_type_id || attr->btf_value_type_id) {
map->spin_lock_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
* to figure out the corresponding kernel's
* counter part. Thus, attr->btf_fd has
* to be valid also.
*/
attr->btf_vmlinux_value_type_id) {
struct btf *btf;
if (!attr->btf_value_type_id) {
err = -EINVAL;
goto free_map;
}
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
goto free_map;
}
map->btf = btf;
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err) {
btf_put(btf);
goto free_map;
if (attr->btf_value_type_id) {
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err)
goto free_map;
}
map->btf = btf;
map->btf_key_type_id = attr->btf_key_type_id;
map->btf_value_type_id = attr->btf_value_type_id;
} else {
map->spin_lock_off = -EINVAL;
map->btf_vmlinux_value_type_id =
attr->btf_vmlinux_value_type_id;
}
err = security_bpf_map_alloc(map);
......@@ -888,6 +899,9 @@ static int map_lookup_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
} else {
rcu_read_lock();
if (map->ops->map_lookup_elem_sys_only)
......@@ -1003,7 +1017,8 @@ static int map_update_elem(union bpf_attr *attr)
goto out;
} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
map->map_type == BPF_MAP_TYPE_SOCKHASH ||
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
map->map_type == BPF_MAP_TYPE_SOCKMAP ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
err = map->ops->map_update_elem(map, key, value, attr->flags);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
......@@ -1092,7 +1107,9 @@ static int map_delete_elem(union bpf_attr *attr)
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
} else if (IS_FD_PROG_ARRAY(map)) {
} else if (IS_FD_PROG_ARRAY(map) ||
map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* These maps require sleepable context */
err = map->ops->map_delete_elem(map, key);
goto out;
}
......@@ -1672,17 +1689,22 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
enum bpf_attach_type expected_attach_type,
u32 btf_id, u32 prog_fd)
{
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
if (btf_id) {
if (btf_id > BTF_MAX_TYPE)
return -EINVAL;
break;
default:
if (btf_id || prog_fd)
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_STRUCT_OPS:
break;
default:
return -EINVAL;
break;
}
}
if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING)
return -EINVAL;
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
switch (expected_attach_type) {
......@@ -2817,6 +2839,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.btf_key_type_id = map->btf_key_type_id;
info.btf_value_type_id = map->btf_value_type_id;
}
info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
if (bpf_map_is_dev_bound(map)) {
err = bpf_map_offload_info_fill(&info, map);
......
......@@ -160,11 +160,12 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
if (fexit_cnt)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
err = arch_prepare_bpf_trampoline(new_image, &tr->func.model, flags,
err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
&tr->func.model, flags,
fentry, fentry_cnt,
fexit, fexit_cnt,
tr->func.addr);
if (err)
if (err < 0)
goto out;
if (tr->selector)
......@@ -296,7 +297,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
}
int __weak
arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
arch_prepare_bpf_trampoline(void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
......
......@@ -2859,11 +2859,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
return -EACCES;
}
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
......@@ -2880,17 +2875,32 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id);
if (env->ops->btf_struct_access) {
ret = env->ops->btf_struct_access(&env->log, t, off, size,
atype, &btf_id);
} else {
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
return -EACCES;
}
ret = btf_struct_access(&env->log, t, off, size, atype,
&btf_id);
}
if (ret < 0)
return ret;
if (ret == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
return 0;
if (atype == BPF_READ) {
if (ret == SCALAR_VALUE) {
mark_reg_unknown(env, regs, value_regno);
return 0;
}
mark_reg_known_zero(env, regs, value_regno);
regs[value_regno].type = PTR_TO_BTF_ID;
regs[value_regno].btf_id = btf_id;
}
mark_reg_known_zero(env, regs, value_regno);
regs[value_regno].type = PTR_TO_BTF_ID;
regs[value_regno].btf_id = btf_id;
return 0;
}
......@@ -6349,8 +6359,30 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
static int check_return_code(struct bpf_verifier_env *env)
{
struct tnum enforce_attach_type_range = tnum_unknown;
const struct bpf_prog *prog = env->prog;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
int err;
/* The struct_ops func-ptr's return type could be "void" */
if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS &&
!prog->aux->attach_func_proto->type)
return 0;
/* eBPF calling convetion is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
* something into it earlier
*/
err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
switch (env->prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
......@@ -8016,21 +8048,6 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
/* eBPF calling convetion is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
* something into it earlier
*/
err = check_reg_arg(env, BPF_REG_0, SRC_OP);
if (err)
return err;
if (is_pointer_value(env, BPF_REG_0)) {
verbose(env, "R0 leaks addr as return value\n");
return -EACCES;
}
err = check_return_code(env);
if (err)
return err;
......@@ -8138,6 +8155,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
verbose(env, "bpf_struct_ops map cannot be used in prog\n");
return -EINVAL;
}
return 0;
}
......@@ -8829,12 +8851,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
break;
case PTR_TO_BTF_ID:
if (type == BPF_WRITE) {
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
} else if (env->prog->type != BPF_PROG_TYPE_STRUCT_OPS) {
verbose(env, "Writes through BTF pointers are not allowed\n");
return -EINVAL;
}
insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code);
env->prog->aux->num_exentries++;
continue;
default:
continue;
......@@ -9502,6 +9526,58 @@ static void print_verification_stats(struct bpf_verifier_env *env)
env->peak_states, env->longest_mark_read_walk);
}
static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
{
const struct btf_type *t, *func_proto;
const struct bpf_struct_ops *st_ops;
const struct btf_member *member;
struct bpf_prog *prog = env->prog;
u32 btf_id, member_idx;
const char *mname;
btf_id = prog->aux->attach_btf_id;
st_ops = bpf_struct_ops_find(btf_id);
if (!st_ops) {
verbose(env, "attach_btf_id %u is not a supported struct\n",
btf_id);
return -ENOTSUPP;
}
t = st_ops->type;
member_idx = prog->expected_attach_type;
if (member_idx >= btf_type_vlen(t)) {
verbose(env, "attach to invalid member idx %u of struct %s\n",
member_idx, st_ops->name);
return -EINVAL;
}
member = &btf_type_member(t)[member_idx];
mname = btf_name_by_offset(btf_vmlinux, member->name_off);
func_proto = btf_type_resolve_func_ptr(btf_vmlinux, member->type,
NULL);
if (!func_proto) {
verbose(env, "attach to invalid member %s(@idx %u) of struct %s\n",
mname, member_idx, st_ops->name);
return -EINVAL;
}
if (st_ops->check_member) {
int err = st_ops->check_member(t, member);
if (err) {
verbose(env, "attach to unsupported member %s of struct %s\n",
mname, st_ops->name);
return err;
}
}
prog->aux->attach_func_proto = func_proto;
prog->aux->attach_func_name = mname;
env->ops = st_ops->verifier_ops;
return 0;
}
static int check_attach_btf_id(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
......@@ -9517,6 +9593,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
long addr;
u64 key;
if (prog->type == BPF_PROG_TYPE_STRUCT_OPS)
return check_struct_ops_btf_id(env);
if (prog->type != BPF_PROG_TYPE_TRACING)
return 0;
......
......@@ -5935,7 +5935,7 @@ bool bpf_helper_changes_pkt_data(void *func)
return false;
}
static const struct bpf_func_proto *
const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
......
......@@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
endif
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/types.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/filter.h>
#include <net/tcp.h>
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
offsetof(struct tcp_congestion_ops, release),
offsetof(struct tcp_congestion_ops, set_state),
offsetof(struct tcp_congestion_ops, cwnd_event),
offsetof(struct tcp_congestion_ops, in_ack_event),
offsetof(struct tcp_congestion_ops, pkts_acked),
offsetof(struct tcp_congestion_ops, min_tso_segs),
offsetof(struct tcp_congestion_ops, sndbuf_expand),
offsetof(struct tcp_congestion_ops, cong_control),
};
static u32 unsupported_ops[] = {
offsetof(struct tcp_congestion_ops, get_info),
};
static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
static int bpf_tcp_ca_init(struct btf *btf)
{
s32 type_id;
type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
if (type_id < 0)
return -EINVAL;
sock_id = type_id;
type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
if (type_id < 0)
return -EINVAL;
tcp_sock_id = type_id;
tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
return 0;
}
static bool is_optional(u32 member_offset)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
if (member_offset == optional_ops[i])
return true;
}
return false;
}
static bool is_unsupported(u32 member_offset)
{
unsigned int i;
for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
if (member_offset == unsupported_ops[i])
return true;
}
return false;
}
extern struct btf *btf_vmlinux;
static bool bpf_tcp_ca_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
if (!btf_ctx_access(off, size, type, prog, info))
return false;
if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id)
/* promote it to tcp_sock */
info->btf_id = tcp_sock_id;
return true;
}
static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
const struct btf_type *t, int off,
int size, enum bpf_access_type atype,
u32 *next_btf_id)
{
size_t end;
if (atype == BPF_READ)
return btf_struct_access(log, t, off, size, atype, next_btf_id);
if (t != tcp_sock_type) {
bpf_log(log, "only read is supported\n");
return -EACCES;
}
switch (off) {
case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
break;
case offsetof(struct inet_connection_sock, icsk_ack.pending):
end = offsetofend(struct inet_connection_sock,
icsk_ack.pending);
break;
case offsetof(struct tcp_sock, snd_cwnd):
end = offsetofend(struct tcp_sock, snd_cwnd);
break;
case offsetof(struct tcp_sock, snd_cwnd_cnt):
end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
break;
case offsetof(struct tcp_sock, snd_ssthresh):
end = offsetofend(struct tcp_sock, snd_ssthresh);
break;
case offsetof(struct tcp_sock, ecn_flags):
end = offsetofend(struct tcp_sock, ecn_flags);
break;
default:
bpf_log(log, "no write support to tcp_sock at off %d\n", off);
return -EACCES;
}
if (off + size > end) {
bpf_log(log,
"write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
off, size, end);
return -EACCES;
}
return NOT_INIT;
}
BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
{
/* bpf_tcp_ca prog cannot have NULL tp */
__tcp_send_ack((struct sock *)tp, rcv_nxt);
return 0;
}
static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
.func = bpf_tcp_send_ack,
.gpl_only = false,
/* In case we want to report error later */
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg2_type = ARG_ANYTHING,
.btf_id = &tcp_sock_id,
};
static const struct bpf_func_proto *
bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_tcp_send_ack:
return &bpf_tcp_send_ack_proto;
default:
return bpf_base_func_proto(func_id);
}
}
static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
.get_func_proto = bpf_tcp_ca_get_func_proto,
.is_valid_access = bpf_tcp_ca_is_valid_access,
.btf_struct_access = bpf_tcp_ca_btf_struct_access,
};
static int bpf_tcp_ca_init_member(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata)
{
const struct tcp_congestion_ops *utcp_ca;
struct tcp_congestion_ops *tcp_ca;
size_t tcp_ca_name_len;
int prog_fd;
u32 moff;
utcp_ca = (const struct tcp_congestion_ops *)udata;
tcp_ca = (struct tcp_congestion_ops *)kdata;
moff = btf_member_bit_offset(t, member) / 8;
switch (moff) {
case offsetof(struct tcp_congestion_ops, flags):
if (utcp_ca->flags & ~TCP_CONG_MASK)
return -EINVAL;
tcp_ca->flags = utcp_ca->flags;
return 1;
case offsetof(struct tcp_congestion_ops, name):
tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
if (!tcp_ca_name_len ||
tcp_ca_name_len == sizeof(utcp_ca->name))
return -EINVAL;
if (tcp_ca_find(utcp_ca->name))
return -EEXIST;
memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
return 1;
}
if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
return 0;
/* Ensure bpf_prog is provided for compulsory func ptr */
prog_fd = (int)(*(unsigned long *)(udata + moff));
if (!prog_fd && !is_optional(moff) && !is_unsupported(moff))
return -EINVAL;
return 0;
}
static int bpf_tcp_ca_check_member(const struct btf_type *t,
const struct btf_member *member)
{
if (is_unsupported(btf_member_bit_offset(t, member) / 8))
return -ENOTSUPP;
return 0;
}
static int bpf_tcp_ca_reg(void *kdata)
{
return tcp_register_congestion_control(kdata);
}
static void bpf_tcp_ca_unreg(void *kdata)
{
tcp_unregister_congestion_control(kdata);
}
/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */
extern struct bpf_struct_ops bpf_tcp_congestion_ops;
struct bpf_struct_ops bpf_tcp_congestion_ops = {
.verifier_ops = &bpf_tcp_ca_verifier_ops,
.reg = bpf_tcp_ca_reg,
.unreg = bpf_tcp_ca_unreg,
.check_member = bpf_tcp_ca_check_member,
.init_member = bpf_tcp_ca_init_member,
.init = bpf_tcp_ca_init,
.name = "tcp_congestion_ops",
};
......@@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
......@@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
if (unlikely(!try_module_get(ca->owner)))
if (unlikely(!bpf_try_module_get(ca, ca->owner)))
ca = &tcp_reno;
icsk->icsk_ca_ops = ca;
rcu_read_unlock();
......@@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk)
if (icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
module_put(icsk->icsk_ca_ops->owner);
bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
......@@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
ca = tcp_ca_find_autoload(net, name);
if (!ca) {
ret = -ENOENT;
} else if (!try_module_get(ca->owner)) {
} else if (!bpf_try_module_get(ca, ca->owner)) {
ret = -EBUSY;
} else {
prev = xchg(&net->ipv4.tcp_congestion_control, ca);
if (prev)
module_put(prev->owner);
bpf_module_put(prev, prev->owner);
ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
......@@ -366,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
} else if (!load) {
const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
if (try_module_get(ca->owner)) {
if (bpf_try_module_get(ca, ca->owner)) {
if (reinit) {
tcp_reinit_congestion_control(sk, ca);
} else {
icsk->icsk_ca_ops = ca;
module_put(old_ca->owner);
bpf_module_put(old_ca, old_ca->owner);
}
} else {
err = -EBUSY;
}
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) {
err = -EPERM;
} else if (!try_module_get(ca->owner)) {
} else if (!bpf_try_module_get(ca, ca->owner)) {
err = -EBUSY;
} else {
tcp_reinit_congestion_control(sk, ca);
......
......@@ -2678,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
int cpu;
if (net->ipv4.tcp_congestion_control)
module_put(net->ipv4.tcp_congestion_control->owner);
bpf_module_put(net->ipv4.tcp_congestion_control,
net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
......@@ -2785,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net)
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
try_module_get(init_net.ipv4.tcp_congestion_control->owner))
bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
......
......@@ -414,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
if (likely(ca && try_module_get(ca->owner))) {
if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
ca_got_dst = true;
......@@ -425,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
/* If no valid choice made yet, assign current system default ca. */
if (!ca_got_dst &&
(!icsk->icsk_ca_setsockopt ||
!try_module_get(icsk->icsk_ca_ops->owner)))
!bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
tcp_assign_congestion_control(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
......
......@@ -3368,8 +3368,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
if (likely(ca && try_module_get(ca->owner))) {
module_put(icsk->icsk_ca_ops->owner);
if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
......
......@@ -136,6 +136,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_STACK,
BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS,
};
/* Note that tracing related programs such as
......@@ -174,6 +175,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
BPF_PROG_TYPE_CGROUP_SOCKOPT,
BPF_PROG_TYPE_TRACING,
BPF_PROG_TYPE_STRUCT_OPS,
};
enum bpf_attach_type {
......@@ -397,6 +399,10 @@ union bpf_attr {
__u32 btf_fd; /* fd pointing to a BTF type data */
__u32 btf_key_type_id; /* BTF type_id of the key */
__u32 btf_value_type_id; /* BTF type_id of the value */
__u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel-
* struct stored as the
* map value
*/
};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
......@@ -2831,6 +2837,14 @@ union bpf_attr {
* Return
* On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value.
*
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
* *rcv_nxt* is the ack_seq to be sent out.
* Return
* 0 on success, or a negative error in case of failure.
*
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
......@@ -2948,7 +2962,8 @@ union bpf_attr {
FN(probe_read_user), \
FN(probe_read_kernel), \
FN(probe_read_user_str), \
FN(probe_read_kernel_str),
FN(probe_read_kernel_str), \
FN(tcp_send_ack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
......@@ -3349,7 +3364,7 @@ struct bpf_map_info {
__u32 map_flags;
char name[BPF_OBJ_NAME_LEN];
__u32 ifindex;
__u32 :32;
__u32 btf_vmlinux_value_type_id;
__u64 netns_dev;
__u64 netns_ino;
__u32 btf_id;
......
......@@ -95,7 +95,11 @@ int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
attr.btf_key_type_id = create_attr->btf_key_type_id;
attr.btf_value_type_id = create_attr->btf_value_type_id;
attr.map_ifindex = create_attr->map_ifindex;
attr.inner_map_fd = create_attr->inner_map_fd;
if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS)
attr.btf_vmlinux_value_type_id =
create_attr->btf_vmlinux_value_type_id;
else
attr.inner_map_fd = create_attr->inner_map_fd;
return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}
......@@ -228,7 +232,9 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
memset(&attr, 0, sizeof(attr));
attr.prog_type = load_attr->prog_type;
attr.expected_attach_type = load_attr->expected_attach_type;
if (attr.prog_type == BPF_PROG_TYPE_TRACING) {
if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS) {
attr.attach_btf_id = load_attr->attach_btf_id;
} else if (attr.prog_type == BPF_PROG_TYPE_TRACING) {
attr.attach_btf_id = load_attr->attach_btf_id;
attr.attach_prog_fd = load_attr->attach_prog_fd;
} else {
......
......@@ -46,7 +46,10 @@ struct bpf_create_map_attr {
__u32 btf_key_type_id;
__u32 btf_value_type_id;
__u32 map_ifindex;
__u32 inner_map_fd;
union {
__u32 inner_map_fd;
__u32 btf_vmlinux_value_type_id;
};
};
LIBBPF_API int
......
This diff is collapsed.
......@@ -239,6 +239,8 @@ bpf_program__attach_raw_tracepoint(struct bpf_program *prog,
LIBBPF_API struct bpf_link *
bpf_program__attach_trace(struct bpf_program *prog);
struct bpf_map;
LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map);
struct bpf_insn;
/*
......@@ -315,6 +317,7 @@ LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog);
LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog);
LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog);
LIBBPF_API void bpf_program__set_type(struct bpf_program *prog,
......@@ -335,6 +338,7 @@ LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog);
LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog);
LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog);
LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog);
LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog);
/*
* No need for __attribute__((packed)), all members of 'bpf_map_def'
......@@ -354,7 +358,6 @@ struct bpf_map_def {
* The 'struct bpf_map' in include/linux/bpf.h is internal to the kernel,
* so no need to worry about a name clash.
*/
struct bpf_map;
LIBBPF_API struct bpf_map *
bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name);
......
......@@ -213,6 +213,7 @@ LIBBPF_0.0.7 {
global:
btf_dump__emit_type_decl;
bpf_link__disconnect;
bpf_map__attach_struct_ops;
bpf_object__find_program_by_name;
bpf_object__attach_skeleton;
bpf_object__destroy_skeleton;
......@@ -223,5 +224,7 @@ LIBBPF_0.0.7 {
bpf_prog_attach_xattr;
bpf_program__attach;
bpf_program__name;
bpf_program__is_struct_ops;
bpf_program__set_struct_ops;
btf__align_of;
} LIBBPF_0.0.6;
......@@ -103,6 +103,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_STRUCT_OPS:
default:
break;
}
......@@ -251,6 +252,7 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
case BPF_MAP_TYPE_XSKMAP:
case BPF_MAP_TYPE_SOCKHASH:
case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
case BPF_MAP_TYPE_STRUCT_OPS:
default:
break;
}
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BPF_TCP_HELPERS_H
#define __BPF_TCP_HELPERS_H
#include <stdbool.h>
#include <linux/types.h>
#include <bpf_helpers.h>
#include <bpf_core_read.h>
#include "bpf_trace_helpers.h"
/* "struct_ops/" is only a convention. not a requirement. */
#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)
struct sock_common {
unsigned char skc_state;
} __attribute__((preserve_access_index));
struct sock {
struct sock_common __sk_common;
} __attribute__((preserve_access_index));
struct inet_sock {
struct sock sk;
} __attribute__((preserve_access_index));
struct inet_connection_sock {
struct inet_sock icsk_inet;
__u8 icsk_ca_state:6,
icsk_ca_setsockopt:1,
icsk_ca_dst_locked:1;
struct {
__u8 pending;
} icsk_ack;
__u64 icsk_ca_priv[104 / sizeof(__u64)];
} __attribute__((preserve_access_index));
struct tcp_sock {
struct inet_connection_sock inet_conn;
__u32 rcv_nxt;
__u32 snd_nxt;
__u32 snd_una;
__u8 ecn_flags;
__u32 delivered;
__u32 delivered_ce;
__u32 snd_cwnd;
__u32 snd_cwnd_cnt;
__u32 snd_cwnd_clamp;
__u32 snd_ssthresh;
__u8 syn_data:1, /* SYN includes data */
syn_fastopen:1, /* SYN includes Fast Open option */
syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
syn_fastopen_ch:1, /* Active TFO re-enabling probe */
syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
save_syn:1, /* Save headers of SYN packet */
is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
syn_smc:1; /* SYN includes SMC */
__u32 max_packets_out;
__u32 lsndtime;
__u32 prior_cwnd;
} __attribute__((preserve_access_index));
static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
return (struct inet_connection_sock *)sk;
}
static __always_inline void *inet_csk_ca(const struct sock *sk)
{
return (void *)inet_csk(sk)->icsk_ca_priv;
}
static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
return (struct tcp_sock *)sk;
}
static __always_inline bool before(__u32 seq1, __u32 seq2)
{
return (__s32)(seq1-seq2) < 0;
}
#define after(seq2, seq1) before(seq1, seq2)
#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
enum inet_csk_ack_state_t {
ICSK_ACK_SCHED = 1,
ICSK_ACK_TIMER = 2,
ICSK_ACK_PUSHED = 4,
ICSK_ACK_PUSHED2 = 8,
ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
};
enum tcp_ca_event {
CA_EVENT_TX_START = 0,
CA_EVENT_CWND_RESTART = 1,
CA_EVENT_COMPLETE_CWR = 2,
CA_EVENT_LOSS = 3,
CA_EVENT_ECN_NO_CE = 4,
CA_EVENT_ECN_IS_CE = 5,
};
enum tcp_ca_state {
TCP_CA_Open = 0,
TCP_CA_Disorder = 1,
TCP_CA_CWR = 2,
TCP_CA_Recovery = 3,
TCP_CA_Loss = 4
};
struct ack_sample {
__u32 pkts_acked;
__s32 rtt_us;
__u32 in_flight;
} __attribute__((preserve_access_index));
struct rate_sample {
__u64 prior_mstamp; /* starting timestamp for interval */
__u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
__s32 delivered; /* number of packets delivered over interval */
long interval_us; /* time for tp->delivered to incr "delivered" */
__u32 snd_interval_us; /* snd interval for delivered packets */
__u32 rcv_interval_us; /* rcv interval for delivered packets */
long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
int losses; /* number of packets marked lost upon ACK */
__u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
__u32 prior_in_flight; /* in flight before this ACK */
bool is_app_limited; /* is sample from packet with bubble in pipe? */
bool is_retrans; /* is sample from retransmission? */
bool is_ack_delayed; /* is this (likely) a delayed ACK? */
} __attribute__((preserve_access_index));
#define TCP_CA_NAME_MAX 16
#define TCP_CONG_NEEDS_ECN 0x2
struct tcp_congestion_ops {
char name[TCP_CA_NAME_MAX];
__u32 flags;
/* initialize private data (optional) */
void (*init)(struct sock *sk);
/* cleanup private data (optional) */
void (*release)(struct sock *sk);
/* return slow start threshold (required) */
__u32 (*ssthresh)(struct sock *sk);
/* do new cwnd calculation (required) */
void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
/* call before changing ca_state (optional) */
void (*set_state)(struct sock *sk, __u8 new_state);
/* call when cwnd event occurs (optional) */
void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
/* call when ack arrives (optional) */
void (*in_ack_event)(struct sock *sk, __u32 flags);
/* new value of cwnd after loss (required) */
__u32 (*undo_cwnd)(struct sock *sk);
/* hook for packet ack accounting (optional) */
void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
/* override sysctl_tcp_min_tso_segs */
__u32 (*min_tso_segs)(struct sock *sk);
/* returns the multiplier used in tcp_sndbuf_expand (optional) */
__u32 (*sndbuf_expand)(struct sock *sk);
/* call when packets are delivered to update cwnd and pacing rate,
* after all the ca_state processing. (optional)
*/
void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
};
#define min(a, b) ((a) < (b) ? (a) : (b))
#define max(a, b) ((a) > (b) ? (a) : (b))
#define min_not_zero(x, y) ({ \
typeof(x) __x = (x); \
typeof(y) __y = (y); \
__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
{
__u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
acked -= cwnd - tp->snd_cwnd;
tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
return acked;
}
static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
{
return tp->snd_cwnd < tp->snd_ssthresh;
}
static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* If in slow start, ensure cwnd grows to twice what was ACKed. */
if (tcp_in_slow_start(tp))
return tp->snd_cwnd < 2 * tp->max_packets_out;
return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
}
static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
{
/* If credits accumulated at a higher w, apply them gently now. */
if (tp->snd_cwnd_cnt >= w) {
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd++;
}
tp->snd_cwnd_cnt += acked;
if (tp->snd_cwnd_cnt >= w) {
__u32 delta = tp->snd_cwnd_cnt / w;
tp->snd_cwnd_cnt -= delta * w;
tp->snd_cwnd += delta;
}
tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
}
#endif
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
#include <linux/err.h>
#include <test_progs.h>
#include "bpf_dctcp.skel.h"
#define min(a, b) ((a) < (b) ? (a) : (b))
static const unsigned int total_bytes = 10 * 1024 * 1024;
static const struct timeval timeo_sec = { .tv_sec = 10 };
static const size_t timeo_optlen = sizeof(timeo_sec);
static int stop, duration;
static int settimeo(int fd)
{
int err;
err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
timeo_optlen);
if (CHECK(err == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
errno))
return -1;
err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
timeo_optlen);
if (CHECK(err == -1, "setsockopt(fd, SO_SNDTIMEO)", "errno:%d\n",
errno))
return -1;
return 0;
}
static int settcpca(int fd, const char *tcp_ca)
{
int err;
err = setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, tcp_ca, strlen(tcp_ca));
if (CHECK(err == -1, "setsockopt(fd, TCP_CONGESTION)", "errno:%d\n",
errno))
return -1;
return 0;
}
static void *server(void *arg)
{
int lfd = (int)(long)arg, err = 0, fd;
ssize_t nr_sent = 0, bytes = 0;
char batch[1500];
fd = accept(lfd, NULL, NULL);
while (fd == -1) {
if (errno == EINTR)
continue;
err = -errno;
goto done;
}
if (settimeo(fd)) {
err = -errno;
goto done;
}
while (bytes < total_bytes && !READ_ONCE(stop)) {
nr_sent = send(fd, &batch,
min(total_bytes - bytes, sizeof(batch)), 0);
if (nr_sent == -1 && errno == EINTR)
continue;
if (nr_sent == -1) {
err = -errno;
break;
}
bytes += nr_sent;
}
CHECK(bytes != total_bytes, "send", "%zd != %u nr_sent:%zd errno:%d\n",
bytes, total_bytes, nr_sent, errno);
done:
if (fd != -1)
close(fd);
if (err) {
WRITE_ONCE(stop, 1);
return ERR_PTR(err);
}
return NULL;
}
static void do_test(const char *tcp_ca)
{
struct sockaddr_in6 sa6 = {};
ssize_t nr_recv = 0, bytes = 0;
int lfd = -1, fd = -1;
pthread_t srv_thread;
socklen_t addrlen = sizeof(sa6);
void *thread_ret;
char batch[1500];
int err;
WRITE_ONCE(stop, 0);
lfd = socket(AF_INET6, SOCK_STREAM, 0);
if (CHECK(lfd == -1, "socket", "errno:%d\n", errno))
return;
fd = socket(AF_INET6, SOCK_STREAM, 0);
if (CHECK(fd == -1, "socket", "errno:%d\n", errno)) {
close(lfd);
return;
}
if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
settimeo(lfd) || settimeo(fd))
goto done;
/* bind, listen and start server thread to accept */
sa6.sin6_family = AF_INET6;
sa6.sin6_addr = in6addr_loopback;
err = bind(lfd, (struct sockaddr *)&sa6, addrlen);
if (CHECK(err == -1, "bind", "errno:%d\n", errno))
goto done;
err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen);
if (CHECK(err == -1, "getsockname", "errno:%d\n", errno))
goto done;
err = listen(lfd, 1);
if (CHECK(err == -1, "listen", "errno:%d\n", errno))
goto done;
err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
if (CHECK(err != 0, "pthread_create", "err:%d\n", err))
goto done;
/* connect to server */
err = connect(fd, (struct sockaddr *)&sa6, addrlen);
if (CHECK(err == -1, "connect", "errno:%d\n", errno))
goto wait_thread;
/* recv total_bytes */
while (bytes < total_bytes && !READ_ONCE(stop)) {
nr_recv = recv(fd, &batch,
min(total_bytes - bytes, sizeof(batch)), 0);
if (nr_recv == -1 && errno == EINTR)
continue;
if (nr_recv == -1)
break;
bytes += nr_recv;
}
CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n",
bytes, total_bytes, nr_recv, errno);
wait_thread:
WRITE_ONCE(stop, 1);
pthread_join(srv_thread, &thread_ret);
CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld",
PTR_ERR(thread_ret));
done:
close(lfd);
close(fd);
}
static void test_dctcp(void)
{
struct bpf_dctcp *dctcp_skel;
struct bpf_link *link;
dctcp_skel = bpf_dctcp__open_and_load();
if (CHECK(!dctcp_skel, "bpf_dctcp__open_and_load", "failed\n"))
return;
link = bpf_map__attach_struct_ops(dctcp_skel->maps.dctcp);
if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n",
PTR_ERR(link))) {
bpf_dctcp__destroy(dctcp_skel);
return;
}
do_test("bpf_dctcp");
bpf_link__destroy(link);
bpf_dctcp__destroy(dctcp_skel);
}
void test_bpf_tcp_ca(void)
{
if (test__start_subtest("dctcp"))
test_dctcp();
}
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2019 Facebook */
/* WARNING: This implemenation is not necessarily the same
* as the tcp_dctcp.c. The purpose is mainly for testing
* the kernel BPF logic.
*/
#include <linux/bpf.h>
#include <linux/types.h>
#include "bpf_tcp_helpers.h"
char _license[] SEC("license") = "GPL";
#define DCTCP_MAX_ALPHA 1024U
struct dctcp {
__u32 old_delivered;
__u32 old_delivered_ce;
__u32 prior_rcv_nxt;
__u32 dctcp_alpha;
__u32 next_seq;
__u32 ce_state;
__u32 loss_cwnd;
};
static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
static __always_inline void dctcp_reset(const struct tcp_sock *tp,
struct dctcp *ca)
{
ca->next_seq = tp->snd_nxt;
ca->old_delivered = tp->delivered;
ca->old_delivered_ce = tp->delivered_ce;
}
BPF_TCP_OPS_1(dctcp_init, void, struct sock *, sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
ca->loss_cwnd = 0;
ca->ce_state = 0;
dctcp_reset(tp, ca);
}
BPF_TCP_OPS_1(dctcp_ssthresh, __u32, struct sock *, sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
ca->loss_cwnd = tp->snd_cwnd;
return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
}
BPF_TCP_OPS_2(dctcp_update_alpha, void,
struct sock *, sk, __u32, flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
__u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
__u32 alpha = ca->dctcp_alpha;
/* alpha = (1 - g) * alpha + g * F */
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
if (delivered_ce) {
__u32 delivered = tp->delivered - ca->old_delivered;
/* If dctcp_shift_g == 1, a 32bit value would overflow
* after 8 M packets.
*/
delivered_ce <<= (10 - dctcp_shift_g);
delivered_ce /= max(1U, delivered);
alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
}
ca->dctcp_alpha = alpha;
dctcp_reset(tp, ca);
}
}
static __always_inline void dctcp_react_to_loss(struct sock *sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
ca->loss_cwnd = tp->snd_cwnd;
tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U);
}
BPF_TCP_OPS_2(dctcp_state, void, struct sock *, sk, __u8, new_state)
{
if (new_state == TCP_CA_Recovery &&
new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state))
dctcp_react_to_loss(sk);
/* We handle RTO in dctcp_cwnd_event to ensure that we perform only
* one loss-adjustment per RTT.
*/
}
static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
{
struct tcp_sock *tp = tcp_sk(sk);
if (ce_state == 1)
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
else
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
/* Minimal DCTP CE state machine:
*
* S: 0 <- last pkt was non-CE
* 1 <- last pkt was CE
*/
static __always_inline
void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
__u32 *prior_rcv_nxt, __u32 *ce_state)
{
__u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
if (*ce_state != new_ce_state) {
/* CE state has changed, force an immediate ACK to
* reflect the new CE state. If an ACK was delayed,
* send that first to reflect the prior CE state.
*/
if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
dctcp_ece_ack_cwr(sk, *ce_state);
bpf_tcp_send_ack(sk, *prior_rcv_nxt);
}
inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
}
*prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
*ce_state = new_ce_state;
dctcp_ece_ack_cwr(sk, new_ce_state);
}
BPF_TCP_OPS_2(dctcp_cwnd_event, void,
struct sock *, sk, enum tcp_ca_event, ev)
{
struct dctcp *ca = inet_csk_ca(sk);
switch (ev) {
case CA_EVENT_ECN_IS_CE:
case CA_EVENT_ECN_NO_CE:
dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
break;
case CA_EVENT_LOSS:
dctcp_react_to_loss(sk);
break;
default:
/* Don't care for the rest. */
break;
}
}
BPF_TCP_OPS_1(dctcp_cwnd_undo, __u32, struct sock *, sk)
{
const struct dctcp *ca = inet_csk_ca(sk);
return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
}
BPF_TCP_OPS_3(tcp_reno_cong_avoid, void,
struct sock *, sk, __u32, ack, __u32, acked)
{
struct tcp_sock *tp = tcp_sk(sk);
if (!tcp_is_cwnd_limited(sk))
return;
/* In "safe" area, increase. */
if (tcp_in_slow_start(tp)) {
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
/* In dangerous area, increase slowly. */
tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
}
SEC(".struct_ops")
struct tcp_congestion_ops dctcp_nouse = {
.init = (void *)dctcp_init,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
.name = "bpf_dctcp_nouse",
};
SEC(".struct_ops")
struct tcp_congestion_ops dctcp = {
.init = (void *)dctcp_init,
.in_ack_event = (void *)dctcp_update_alpha,
.cwnd_event = (void *)dctcp_cwnd_event,
.ssthresh = (void *)dctcp_ssthresh,
.cong_avoid = (void *)tcp_reno_cong_avoid,
.undo_cwnd = (void *)dctcp_cwnd_undo,
.set_state = (void *)dctcp_state,
.flags = TCP_CONG_NEEDS_ECN,
.name = "bpf_dctcp",
};
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment