Commit a3884572 authored by Jakub Kicinski, committed by Daniel Borkmann

bpf: offload: add map offload infrastructure

BPF map offload follows a similar path to program offload.  At creation
time users may specify the ifindex of the device on which they want to
create the map.  The map will be validated by the kernel's
.map_alloc_check callback and the device driver will be called for the
actual allocation.  The map will have an empty set of operations
associated with it (save for the alloc and free callbacks).  The real
device callbacks are kept in map->offload->dev_ops because they
have slightly different signatures.  Map operations are called in
process context, so the driver may communicate with HW freely:
msleep(), wait(), etc.
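For illustration, a minimal sketch of what the driver side of these
callbacks could look like, assuming a hypothetical "foo" device: only
struct bpf_map_dev_ops and struct bpf_offloaded_map come from this
patch, every foo_* name is invented.

#include <linux/bpf.h>

struct foo_map;			/* hypothetical per-device map state */

/* hypothetical HW access helper and remaining callbacks */
int foo_hw_lookup(struct foo_map *fmap, void *key, void *value);
static int foo_map_get_next_key(struct bpf_offloaded_map *offmap,
				void *key, void *next_key);
static int foo_map_update_elem(struct bpf_offloaded_map *offmap,
			       void *key, void *value, u64 flags);
static int foo_map_delete_elem(struct bpf_offloaded_map *offmap, void *key);

static int foo_map_lookup_elem(struct bpf_offloaded_map *offmap,
			       void *key, void *value)
{
	struct foo_map *fmap = offmap->dev_priv;

	/* Process context: sleeping while waiting for the device
	 * (firmware mailbox, msleep()-based polling, ...) is fine here.
	 */
	return foo_hw_lookup(fmap, key, value);
}

static const struct bpf_map_dev_ops foo_bpf_map_dev_ops = {
	.map_get_next_key	= foo_map_get_next_key,
	.map_lookup_elem	= foo_map_lookup_elem,
	.map_update_elem	= foo_map_update_elem,
	.map_delete_elem	= foo_map_delete_elem,
};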

Map alloc and free callbacks are muxed via the existing .ndo_bpf, and
are always called with the rtnl lock held.  Maps and programs are
guaranteed to be destroyed before .ndo_uninit (i.e. before
unregister_netdev() returns).  Map callbacks are invoked with
bpf_devs_lock *read* locked; drivers must take care of exclusive
locking if necessary.
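A hedged sketch of how a driver could mux the two new commands in its
.ndo_bpf callback; the foo_* names are again hypothetical, while
PTR_ERR_OR_ZERO() is the stock helper from <linux/err.h>.

#include <linux/err.h>
#include <linux/netdevice.h>

/* hypothetical HW helpers */
void *foo_hw_map_alloc(struct net_device *netdev,
		       struct bpf_offloaded_map *offmap);
void foo_hw_map_free(void *priv);

static int foo_ndo_bpf(struct net_device *netdev, struct netdev_bpf *bpf)
{
	switch (bpf->command) {
	case BPF_OFFLOAD_MAP_ALLOC:
		/* rtnl is held by the core here */
		bpf->offmap->dev_ops = &foo_bpf_map_dev_ops;
		bpf->offmap->dev_priv = foo_hw_map_alloc(netdev, bpf->offmap);
		return PTR_ERR_OR_ZERO(bpf->offmap->dev_priv);
	case BPF_OFFLOAD_MAP_FREE:
		foo_hw_map_free(bpf->offmap->dev_priv);
		return 0;
	default:
		/* program offload commands omitted in this sketch */
		return -EINVAL;
	}
}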

All offload-specific branches are marked with unlikely() (via
bpf_map_is_dev_bound()); the branch penalty will be negligible
compared to the IO anyway, and we don't want to penalize the
SW path unnecessarily.
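From user space the offload path is selected purely by the new
map_ifindex attribute at creation time.  A minimal sketch using the raw
bpf(2) syscall (CAP_SYS_ADMIN required, error handling elided, key and
value sizes arbitrary):

#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int create_offloaded_map(const char *ifname)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;	/* only type accepted so far */
	attr.key_size    = 4;
	attr.value_size  = 8;
	attr.max_entries = 1024;
	attr.map_ifindex = if_nametoindex(ifname);

	/* returns a map fd, or -1 with errno set */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}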
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parent 5bc2d55c
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -74,6 +74,33 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+struct bpf_offloaded_map;
+
+struct bpf_map_dev_ops {
+	int (*map_get_next_key)(struct bpf_offloaded_map *map,
+				void *key, void *next_key);
+	int (*map_lookup_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value);
+	int (*map_update_elem)(struct bpf_offloaded_map *map,
+			       void *key, void *value, u64 flags);
+	int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key);
+};
+
+struct bpf_offloaded_map {
+	struct bpf_map map;
+	struct net_device *netdev;
+	const struct bpf_map_dev_ops *dev_ops;
+	void *dev_priv;
+	struct list_head offloads;
+};
+
+static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map)
+{
+	return container_of(map, struct bpf_offloaded_map, map);
+}
+
+extern const struct bpf_map_ops bpf_map_offload_ops;
+
 /* function argument constraints */
 enum bpf_arg_type {
 	ARG_DONTCARE = 0,	/* unused argument in helper function */
@@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages);
 void __bpf_prog_uncharge(struct user_struct *user, u32 pages);
 
 void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
@@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog);
 int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 			       struct bpf_prog *prog);
 
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value);
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags);
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key);
+int bpf_map_offload_get_next_key(struct bpf_map *map,
+				 void *key, void *next_key);
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map);
+
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
@@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return aux->offload_requested;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return unlikely(map->ops == &bpf_map_offload_ops);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr);
+void bpf_map_offload_map_free(struct bpf_map *map);
 #else
 static inline int bpf_prog_offload_init(struct bpf_prog *prog,
 					union bpf_attr *attr)
@@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
 {
 	return false;
 }
+
+static inline bool bpf_map_is_dev_bound(struct bpf_map *map)
+{
+	return false;
+}
+
+static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void bpf_map_offload_map_free(struct bpf_map *map)
+{
+}
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -804,6 +804,8 @@ enum bpf_netdev_command {
 	BPF_OFFLOAD_VERIFIER_PREP,
 	BPF_OFFLOAD_TRANSLATE,
 	BPF_OFFLOAD_DESTROY,
+	BPF_OFFLOAD_MAP_ALLOC,
+	BPF_OFFLOAD_MAP_FREE,
 };
 
 struct bpf_prog_offload_ops;
@@ -834,6 +836,10 @@ struct netdev_bpf {
 		struct {
 			struct bpf_prog *prog;
 		} offload;
+		/* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
+		struct {
+			struct bpf_offloaded_map *offmap;
+		};
 	};
 };
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 		 * BPF_F_NUMA_NODE is set).
 		 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,11 +24,13 @@
 #include <linux/rtnetlink.h>
 #include <linux/rwsem.h>
 
-/* Protects bpf_prog_offload_devs and offload members of all progs.
+/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
+ * of all progs.
  * RTNL lock cannot be taken when holding this lock.
  */
 static DECLARE_RWSEM(bpf_devs_lock);
 static LIST_HEAD(bpf_prog_offload_devs);
+static LIST_HEAD(bpf_map_offload_devs);
 
 static int bpf_dev_offload_check(struct net_device *netdev)
 {
@@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
+static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
+			       enum bpf_netdev_command cmd)
+{
+	struct netdev_bpf data = {};
+	struct net_device *netdev;
+
+	ASSERT_RTNL();
+
+	data.command = cmd;
+	data.offmap = offmap;
+	/* Caller must make sure netdev is valid */
+	netdev = offmap->netdev;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, &data);
+}
+
+struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_offloaded_map *offmap;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->map_type != BPF_MAP_TYPE_HASH)
+		return ERR_PTR(-EINVAL);
+
+	offmap = kzalloc(sizeof(*offmap), GFP_USER);
+	if (!offmap)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&offmap->map, attr);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
+	err = bpf_dev_offload_check(offmap->netdev);
+	if (err)
+		goto err_unlock;
+
+	err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
+	if (err)
+		goto err_unlock;
+
+	list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	return &offmap->map;
+
+err_unlock:
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+	kfree(offmap);
+	return ERR_PTR(err);
+}
+
+static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
+{
+	WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
+	/* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
+	bpf_map_free_id(&offmap->map, true);
+	list_del_init(&offmap->offloads);
+	offmap->netdev = NULL;
+}
+
+void bpf_map_offload_map_free(struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+
+	rtnl_lock();
+	down_write(&bpf_devs_lock);
+	if (offmap->netdev)
+		__bpf_map_offload_destroy(offmap);
+	up_write(&bpf_devs_lock);
+	rtnl_unlock();
+
+	kfree(offmap);
+}
+
+int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_update_elem(struct bpf_map *map,
+				void *key, void *value, u64 flags)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_update_elem(offmap, key, value,
+						       flags);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_delete_elem(offmap, key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_offloaded_map *offmap = map_to_offmap(map);
+	int ret = -ENODEV;
+
+	down_read(&bpf_devs_lock);
+	if (offmap->netdev)
+		ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
+{
+	struct bpf_offloaded_map *offmap;
+	struct bpf_prog_offload *offload;
+	bool ret;
+
+	if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map))
+		return false;
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		return true;
+
+	down_read(&bpf_devs_lock);
+	offload = prog->aux->offload;
+	offmap = map_to_offmap(map);
+
+	ret = offload && offload->netdev == offmap->netdev;
+	up_read(&bpf_devs_lock);
+
+	return ret;
+}
+
+static void bpf_offload_orphan_all_progs(struct net_device *netdev)
+{
+	struct bpf_prog_offload *offload, *tmp;
+
+	list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
+		if (offload->netdev == netdev)
+			__bpf_prog_offload_destroy(offload->prog);
+}
+
+static void bpf_offload_orphan_all_maps(struct net_device *netdev)
+{
+	struct bpf_offloaded_map *offmap, *tmp;
+
+	list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
+		if (offmap->netdev == netdev)
+			__bpf_map_offload_destroy(offmap);
+}
+
 static int bpf_offload_notification(struct notifier_block *notifier,
 				    ulong event, void *ptr)
 {
 	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
-	struct bpf_prog_offload *offload, *tmp;
 
 	ASSERT_RTNL();
 
@@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 			break;
 
 		down_write(&bpf_devs_lock);
-		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
-					 offloads) {
-			if (offload->netdev == netdev)
-				__bpf_prog_offload_destroy(offload->prog);
-		}
+		bpf_offload_orphan_all_progs(netdev);
+		bpf_offload_orphan_all_maps(netdev);
 		up_write(&bpf_devs_lock);
 		break;
 	default:
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr,
 	return 0;
 }
 
+const struct bpf_map_ops bpf_map_offload_ops = {
+	.map_alloc = bpf_map_offload_map_alloc,
+	.map_free = bpf_map_offload_map_free,
+};
+
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 {
 	const struct bpf_map_ops *ops;
@@ -111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
 		if (err)
 			return ERR_PTR(err);
 	}
+	if (attr->map_ifindex)
+		ops = &bpf_map_offload_ops;
 	map = ops->map_alloc(attr);
 	if (IS_ERR(map))
 		return map;
@@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
 	unsigned long flags;
 
+	/* Offloaded maps are removed from the IDR store when their device
+	 * disappears - even if someone holds an fd to them they are unusable,
+	 * the memory is gone, all ops will fail; they are simply waiting for
+	 * refcnt to drop to be freed.
+	 */
+	if (!map->id)
+		return;
+
 	if (do_idr_lock)
 		spin_lock_irqsave(&map_idr_lock, flags);
 	else
 		__acquire(&map_idr_lock);
 
 	idr_remove(&map_idr, map->id);
+	map->id = 0;
 
 	if (do_idr_lock)
 		spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD map_name
+#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (!value)
 		goto free_key;
 
-	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
-	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_lookup_elem(map, key, value);
+	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+		   map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
 		err = bpf_percpu_hash_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
 		err = bpf_percpu_array_copy(map, key, value);
@@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
 		goto free_value;
 
 	/* Need to create a kthread, thus must support schedule */
-	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_update_elem(map, key, value, attr->flags);
+		goto out;
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
 		err = map->ops->map_update_elem(map, key, value, attr->flags);
 		goto out;
 	}
@@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
 		goto err_put;
 	}
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_delete_elem(map, key);
+		goto out;
+	}
+
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
 	rcu_read_lock();
@@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
 	rcu_read_unlock();
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
 	kfree(key);
@@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (!next_key)
 		goto free_key;
 
+	if (bpf_map_is_dev_bound(map)) {
+		err = bpf_map_offload_get_next_key(map, key, next_key);
+		goto out;
+	}
+
 	rcu_read_lock();
 	err = map->ops->map_get_next_key(map, key, next_key);
 	rcu_read_unlock();
+out:
 	if (err)
 		goto free_next_key;
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 			return -EINVAL;
 		}
 	}
+
+	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
+	    !bpf_offload_dev_match(prog, map)) {
+		verbose(env, "offload device mismatch between prog and map\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -245,6 +245,7 @@ union bpf_attr {
 		 * BPF_F_NUMA_NODE is set).
 		 */
 		char	map_name[BPF_OBJ_NAME_LEN];
+		__u32	map_ifindex;	/* ifindex of netdev to create on */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */