Commit 8daed767 authored by Daniel Borkmann

Merge branch 'bpf-lookup-devmap'

Toke Høiland-Jørgensen says:

====================
When using the bpf_redirect_map() helper to redirect packets from XDP, the eBPF
program cannot currently know whether the redirect will succeed, which makes it
impossible to handle errors gracefully. Properly fixing this will probably
require deeper changes to the way TX resources are allocated, but one thing that
is fairly straightforward to fix is to allow lookups into devmaps, so programs
can at least know when a redirect is *guaranteed* to fail because there is no
entry in the map. Currently, programs work around this by keeping a shadow map
of another type which indicates whether a map index is valid.
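
For context, the workaround typically looks something like the sketch below (a
rough illustration only; the map names, sizes and key choice are made up for
this example and are not taken from the series):

    /* Pre-series workaround: keep a shadow array alongside the devmap,
     * because the XDP program cannot look into the devmap itself.
     */
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_DEVMAP);
            __uint(max_entries, 64);
            __type(key, __u32);
            __type(value, __u32);        /* target ifindex */
    } tx_port SEC(".maps");

    struct {
            __uint(type, BPF_MAP_TYPE_ARRAY);
            __uint(max_entries, 64);
            __type(key, __u32);
            __type(value, __u32);        /* 1 if the devmap slot is populated */
    } tx_port_shadow SEC(".maps");

    SEC("xdp")
    int xdp_redirect_shadow(struct xdp_md *ctx)
    {
            __u32 key = ctx->rx_queue_index;
            __u32 *valid = bpf_map_lookup_elem(&tx_port_shadow, &key);

            if (!valid || !*valid)
                    return XDP_PASS;   /* the redirect would certainly fail */

            return bpf_redirect_map(&tx_port, key, 0);
    }

    char _license[] SEC("license") = "GPL";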

This series contains two changes that are complementary ways to fix this issue:

- Moving the map lookup into the bpf_redirect_map() helper (and caching the
  result), so the helper can return an error if no value is found in the map.
  This includes a refactoring of the devmap and cpumap code to not care about
  the index on enqueue.

- Allowing regular lookups into devmaps from eBPF programs, using the read-only
  flag to make sure they don't change the values.

The performance impact of the series is negligible, in the sense that I cannot
measure it: the variance between test runs is higher than the difference
before and after the series.
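
To make the intended usage concrete, here is a minimal sketch of what an XDP
program can do once both changes are in (again illustrative only; the devmap
name and sizing are invented, and XDP_PASS is just one possible choice for the
fallback action encoded in the lower bits of the flags argument):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    struct {
            __uint(type, BPF_MAP_TYPE_DEVMAP);
            __uint(max_entries, 64);
            __type(key, __u32);
            __type(value, __u32);        /* target ifindex */
    } tx_port SEC(".maps");

    SEC("xdp")
    int xdp_redirect_checked(struct xdp_md *ctx)
    {
            __u32 key = ctx->rx_queue_index;

            /* Direct lookups are now allowed; the returned value (the target
             * ifindex) is read-only from the BPF side via BPF_F_RDONLY_PROG.
             */
            if (!bpf_map_lookup_elem(&tx_port, &key))
                    return XDP_PASS;

            /* The helper also performs the lookup itself and, on failure,
             * returns the action passed in the lower two bits of flags
             * (XDP_PASS here) instead of unconditionally returning
             * XDP_REDIRECT.
             */
            return bpf_redirect_map(&tx_port, key, XDP_PASS);
    }

    char _license[] SEC("license") = "GPL";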

Changelog:

v6:
  - Factor out list handling in maps to a helper in list.h (new patch 1)
  - Rename variables in struct bpf_redirect_info (new patch 3 + patch 4)
  - Explain why we are clearing out the map in the info struct on lookup failure
  - Remove unneeded check for forwarding target in tracepoint macro

v5:
  - Rebase on latest bpf-next.
  - Update documentation for bpf_redirect_map() with the new meaning of flags.

v4:
  - Fix a few nits from Andrii
  - Lose the #defines in bpf.h and just compare the flags argument directly to
    XDP_TX in bpf_xdp_redirect_map().

v3:
  - Adopt Jonathan's idea of using the lower two bits of the flag value as the
    return code.
  - Always do the lookup, and cache the result for use in xdp_do_redirect(); to
    achieve this, refactor the devmap and cpumap code to get rid of the bitmap
    selecting which devices to flush.

v2:
  - For patch 1, make it clear that the change works for any map type.
  - For patch 2, just use the new BPF_F_RDONLY_PROG flag to make the return
    value read-only.
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 2d6dbb9a 0cdbb4b0

@@ -578,8 +578,9 @@ struct bpf_skb_data_end {
 };
 
 struct bpf_redirect_info {
-        u32 ifindex;
         u32 flags;
+        u32 tgt_index;
+        void *tgt_value;
         struct bpf_map *map;
         struct bpf_map *map_to_flush;
         u32 kern_flags;
...

@@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next)
         WRITE_ONCE(prev->next, next);
 }
 
+/*
+ * Delete a list entry and clear the 'prev' pointer.
+ *
+ * This is a special-purpose list clearing method used in the networking code
+ * for lists allocated as per-cpu, where we don't want to incur the extra
+ * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
+ * needs to check the node 'prev' pointer instead of calling list_empty().
+ */
+static inline void __list_del_clearprev(struct list_head *entry)
+{
+        __list_del(entry->prev, entry->next);
+        entry->prev = NULL;
+}
+
 /**
  * list_del - deletes entry from list.
  * @entry: the element to delete from the list.
...

@@ -175,9 +175,8 @@ struct _bpf_dtab_netdev {
 #endif /* __DEVMAP_OBJ_TYPE */
 
 #define devmap_ifindex(fwd, map)                                \
-        (!fwd ? 0 :                                             \
-         ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?              \
-          ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
+        ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?               \
+         ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)        \
         trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),     \
...

@@ -1571,8 +1571,11 @@ union bpf_attr {
  *              but this is only implemented for native XDP (with driver
  *              support) as of this writing).
  *
- *              All values for *flags* are reserved for future usage, and must
- *              be left at zero.
+ *              The lower two bits of *flags* are used as the return code if
+ *              the map lookup fails. This is so that the return value can be
+ *              one of the XDP program return codes up to XDP_TX, as chosen by
+ *              the caller. Any higher bits in the *flags* argument must be
+ *              unset.
  *
  *              When used to redirect packets to net devices, this helper
  *              provides a high performance increase over **bpf_redirect**\ ().
...

@@ -32,14 +32,19 @@
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
  * same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
  * which queue in bpf_cpu_map_entry contains packets.
  */
 
 #define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
 struct xdp_bulk_queue {
         void *q[CPU_MAP_BULK_SIZE];
+        struct list_head flush_node;
+        struct bpf_cpu_map_entry *obj;
         unsigned int count;
 };

@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
         /* XDP can run multiple RX-ring queues, need __percpu enqueue store */
         struct xdp_bulk_queue __percpu *bulkq;
 
+        struct bpf_cpu_map *cmap;
+
         /* Queue with potential multi-producers, and single-consumer kthread */
         struct ptr_ring *queue;
         struct task_struct *kthread;

@@ -65,23 +72,17 @@ struct bpf_cpu_map {
         struct bpf_map map;
         /* Below members specific for map type */
         struct bpf_cpu_map_entry **cpu_map;
-        unsigned long __percpu *flush_needed;
+        struct list_head __percpu *flush_list;
 };
 
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
-                             struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
-        return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
 
 static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 {
         struct bpf_cpu_map *cmap;
         int err = -ENOMEM;
+        int ret, cpu;
         u64 cost;
-        int ret;
 
         if (!capable(CAP_SYS_ADMIN))
                 return ERR_PTR(-EPERM);

@@ -105,7 +106,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
         /* make sure page count doesn't overflow */
         cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
-        cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+        cost += sizeof(struct list_head) * num_possible_cpus();
 
         /* Notice returns -EPERM on if map size is larger than memlock limit */
         ret = bpf_map_charge_init(&cmap->map.memory, cost);

@@ -114,12 +115,13 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
                 goto free_cmap;
         }
 
-        /* A per cpu bitfield with a bit per possible CPU in map */
-        cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-                                            __alignof__(unsigned long));
-        if (!cmap->flush_needed)
+        cmap->flush_list = alloc_percpu(struct list_head);
+        if (!cmap->flush_list)
                 goto free_charge;
 
+        for_each_possible_cpu(cpu)
+                INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
+
         /* Alloc array for possible remote "destination" CPUs */
         cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
                                            sizeof(struct bpf_cpu_map_entry *),

@@ -129,7 +131,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
         return &cmap->map;
 free_percpu:
-        free_percpu(cmap->flush_needed);
+        free_percpu(cmap->flush_list);
 free_charge:
         bpf_map_charge_finish(&cmap->map.memory);
 free_cmap:

@@ -334,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
 {
         gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
         struct bpf_cpu_map_entry *rcpu;
-        int numa, err;
+        struct xdp_bulk_queue *bq;
+        int numa, err, i;
 
         /* Have map->numa_node, but choose node of redirect target CPU */
         numa = cpu_to_node(cpu);

@@ -349,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
         if (!rcpu->bulkq)
                 goto free_rcu;
 
+        for_each_possible_cpu(i) {
+                bq = per_cpu_ptr(rcpu->bulkq, i);
+                bq->obj = rcpu;
+        }
+
         /* Alloc queue */
         rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
         if (!rcpu->queue)

@@ -405,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
                 struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
 
                 /* No concurrent bq_enqueue can run at this point */
-                bq_flush_to_queue(rcpu, bq, false);
+                bq_flush_to_queue(bq, false);
         }
         free_percpu(rcpu->bulkq);
         /* Cannot kthread_stop() here, last put free rcpu resources */

@@ -488,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
                 rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
                 if (!rcpu)
                         return -ENOMEM;
+                rcpu->cmap = cmap;
         }
         rcu_read_lock();
         __cpu_map_entry_replace(cmap, key_cpu, rcpu);

@@ -514,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
         synchronize_rcu();
 
         /* To ensure all pending flush operations have completed wait for flush
-         * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
-         * Because the above synchronize_rcu() ensures the map is disconnected
-         * from the program we can assume no new bits will be set.
+         * list be empty on _all_ cpus. Because the above synchronize_rcu()
+         * ensures the map is disconnected from the program we can assume no new
+         * items will be added to the list.
          */
         for_each_online_cpu(cpu) {
-                unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+                struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
 
-                while (!bitmap_empty(bitmap, cmap->map.max_entries))
+                while (!list_empty(flush_list))
                         cond_resched();
         }

@@ -538,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
                 /* bq flush and cleanup happens after RCU graze-period */
                 __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
         }
-        free_percpu(cmap->flush_needed);
+        free_percpu(cmap->flush_list);
         bpf_map_area_free(cmap->cpu_map);
         kfree(cmap);
 }

@@ -590,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
         .map_check_btf          = map_check_no_btf,
 };
 
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
-                             struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
 {
+        struct bpf_cpu_map_entry *rcpu = bq->obj;
         unsigned int processed = 0, drops = 0;
         const int to_cpu = rcpu->cpu;
         struct ptr_ring *q;

@@ -621,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
         bq->count = 0;
 
         spin_unlock(&q->producer_lock);
+        __list_del_clearprev(&bq->flush_node);
+
         /* Feedback loop via tracepoints */
         trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
         return 0;

@@ -631,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
  */
 static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
+        struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
         struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
         if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
-                bq_flush_to_queue(rcpu, bq, true);
+                bq_flush_to_queue(bq, true);
 
         /* Notice, xdp_buff/page MUST be queued here, long enough for
          * driver to code invoking us to finished, due to driver

@@ -646,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
          * operation, when completing napi->poll call.
          */
         bq->q[bq->count++] = xdpf;
+
+        if (!bq->flush_node.prev)
+                list_add(&bq->flush_node, flush_list);
+
         return 0;
 }

@@ -665,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
         return 0;
 }
 
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
-        struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-        unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
-        __set_bit(bit, bitmap);
-}
-
 void __cpu_map_flush(struct bpf_map *map)
 {
         struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
-        unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-        u32 bit;
-
-        /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
-         * and __cpu_map_flush() happen on same CPU. Thus, the percpu
-         * bitmap indicate which percpu bulkq have packets.
-         */
-        for_each_set_bit(bit, bitmap, map->max_entries) {
-                struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
-                struct xdp_bulk_queue *bq;
-
-                /* This is possible if entry is removed by user space
-                 * between xdp redirect and flush op.
-                 */
-                if (unlikely(!rcpu))
-                        continue;
-
-                __clear_bit(bit, bitmap);
+        struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+        struct xdp_bulk_queue *bq, *tmp;
 
-                /* Flush all frames in bulkq to real queue */
-                bq = this_cpu_ptr(rcpu->bulkq);
-                bq_flush_to_queue(rcpu, bq, true);
+        list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+                bq_flush_to_queue(bq, true);
 
                 /* If already running, costs spin_lock_irqsave + smb_mb */
-                wake_up_process(rcpu->kthread);
+                wake_up_process(bq->obj->kthread);
         }
 }

@@ -17,9 +17,8 @@
  * datapath always has a valid copy. However, the datapath does a "flush"
  * operation that pushes any pending packets in the driver outside the RCU
  * critical section. Each bpf_dtab_netdev tracks these pending operations using
- * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
- * until all bits are cleared indicating outstanding flush operations have
- * completed.
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
  *
  * BPF syscalls may race with BPF program calls on any of the update, delete
  * or lookup operations. As noted above the xchg() operation also keep the

@@ -48,9 +47,13 @@
         (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
 #define DEV_MAP_BULK_SIZE 16
+struct bpf_dtab_netdev;
+
 struct xdp_bulk_queue {
         struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+        struct list_head flush_node;
         struct net_device *dev_rx;
+        struct bpf_dtab_netdev *obj;
         unsigned int count;
 };

@@ -65,23 +68,18 @@ struct bpf_dtab_netdev {
 struct bpf_dtab {
         struct bpf_map map;
         struct bpf_dtab_netdev **netdev_map;
-        unsigned long __percpu *flush_needed;
+        struct list_head __percpu *flush_list;
         struct list_head list;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
-{
-        return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
-}
-
 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 {
         struct bpf_dtab *dtab;
+        int err, cpu;
         u64 cost;
-        int err;
 
         if (!capable(CAP_NET_ADMIN))
                 return ERR_PTR(-EPERM);

@@ -91,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
             attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                 return ERR_PTR(-EINVAL);
 
+        /* Lookup returns a pointer straight to dev->ifindex, so make sure the
+         * verifier prevents writes from the BPF side
+         */
+        attr->map_flags |= BPF_F_RDONLY_PROG;
+
         dtab = kzalloc(sizeof(*dtab), GFP_USER);
         if (!dtab)
                 return ERR_PTR(-ENOMEM);

@@ -99,7 +102,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
         /* make sure page count doesn't overflow */
         cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
-        cost += dev_map_bitmap_size(attr) * num_possible_cpus();
+        cost += sizeof(struct list_head) * num_possible_cpus();
 
         /* if map size is larger than memlock limit, reject it */
         err = bpf_map_charge_init(&dtab->map.memory, cost);

@@ -108,28 +111,30 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
         err = -ENOMEM;
 
-        /* A per cpu bitfield with a bit per possible net device */
-        dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
-                                                __alignof__(unsigned long),
-                                                GFP_KERNEL | __GFP_NOWARN);
-        if (!dtab->flush_needed)
+        dtab->flush_list = alloc_percpu(struct list_head);
+        if (!dtab->flush_list)
                 goto free_charge;
 
+        for_each_possible_cpu(cpu)
+                INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
+
         dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
                                               sizeof(struct bpf_dtab_netdev *),
                                               dtab->map.numa_node);
         if (!dtab->netdev_map)
-                goto free_charge;
+                goto free_percpu;
 
         spin_lock(&dev_map_lock);
         list_add_tail_rcu(&dtab->list, &dev_map_list);
         spin_unlock(&dev_map_lock);
 
         return &dtab->map;
 
+free_percpu:
+        free_percpu(dtab->flush_list);
 free_charge:
         bpf_map_charge_finish(&dtab->map.memory);
 free_dtab:
-        free_percpu(dtab->flush_needed);
         kfree(dtab);
         return ERR_PTR(err);
 }

@@ -158,14 +163,14 @@ static void dev_map_free(struct bpf_map *map)
         rcu_barrier();
 
         /* To ensure all pending flush operations have completed wait for flush
-         * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+         * list to empty on _all_ cpus.
          * Because the above synchronize_rcu() ensures the map is disconnected
-         * from the program we can assume no new bits will be set.
+         * from the program we can assume no new items will be added.
          */
         for_each_online_cpu(cpu) {
-                unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+                struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
 
-                while (!bitmap_empty(bitmap, dtab->map.max_entries))
+                while (!list_empty(flush_list))
                         cond_resched();
         }

@@ -181,7 +186,7 @@ static void dev_map_free(struct bpf_map *map)
                 kfree(dev);
         }
 
-        free_percpu(dtab->flush_needed);
+        free_percpu(dtab->flush_list);
         bpf_map_area_free(dtab->netdev_map);
         kfree(dtab);
 }

@@ -203,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
         return 0;
 }
 
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
-        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-        unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-
-        __set_bit(bit, bitmap);
-}
-
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
-                       struct xdp_bulk_queue *bq, u32 flags,
+static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
                        bool in_napi_ctx)
 {
+        struct bpf_dtab_netdev *obj = bq->obj;
         struct net_device *dev = obj->dev;
         int sent = 0, drops = 0, err = 0;
         int i;

@@ -241,6 +238,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
         trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
                               sent, drops, bq->dev_rx, dev, err);
         bq->dev_rx = NULL;
+        __list_del_clearprev(&bq->flush_node);
         return 0;
 error:
         /* If ndo_xdp_xmit fails with an errno, no frames have been

@@ -263,31 +261,18 @@
  * from the driver before returning from its napi->poll() routine. The poll()
  * routine is called either from busy_poll context or net_rx_action signaled
  * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the ctx bitmap
- * is zeroed before completing to ensure all flush operations have completed.
+ * net device can be torn down. On devmap tear down we ensure the flush list
+ * is empty before completing to ensure all flush operations have completed.
  */
 void __dev_map_flush(struct bpf_map *map)
 {
         struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-        unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-        u32 bit;
+        struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
+        struct xdp_bulk_queue *bq, *tmp;
 
         rcu_read_lock();
-        for_each_set_bit(bit, bitmap, map->max_entries) {
-                struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
-                struct xdp_bulk_queue *bq;
-
-                /* This is possible if the dev entry is removed by user space
-                 * between xdp redirect and flush op.
-                 */
-                if (unlikely(!dev))
-                        continue;
-
-                bq = this_cpu_ptr(dev->bulkq);
-                bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
-
-                __clear_bit(bit, bitmap);
-        }
+        list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+                bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
+
         rcu_read_unlock();
 }

@@ -314,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
                       struct net_device *dev_rx)
 {
+        struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
         struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
 
         if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
-                bq_xmit_all(obj, bq, 0, true);
+                bq_xmit_all(bq, 0, true);
 
         /* Ingress dev_rx will be the same for all xdp_frame's in
          * bulk_queue, because bq stored per-CPU and must be flushed

@@ -327,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
                 bq->dev_rx = dev_rx;
 
         bq->q[bq->count++] = xdpf;
+
+        if (!bq->flush_node.prev)
+                list_add(&bq->flush_node, flush_list);
+
         return 0;
 }

@@ -377,17 +367,12 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
         if (dev->dev->netdev_ops->ndo_xdp_xmit) {
                 struct xdp_bulk_queue *bq;
-                unsigned long *bitmap;
                 int cpu;
 
                 rcu_read_lock();
                 for_each_online_cpu(cpu) {
-                        bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
-                        __clear_bit(dev->bit, bitmap);
-
                         bq = per_cpu_ptr(dev->bulkq, cpu);
-                        bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
+                        bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
                 }
                 rcu_read_unlock();
         }

@@ -434,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
         struct net *net = current->nsproxy->net_ns;
         gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
         struct bpf_dtab_netdev *dev, *old_dev;
-        u32 i = *(u32 *)key;
         u32 ifindex = *(u32 *)value;
+        struct xdp_bulk_queue *bq;
+        u32 i = *(u32 *)key;
+        int cpu;
 
         if (unlikely(map_flags > BPF_EXIST))
                 return -EINVAL;

@@ -458,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                 return -ENOMEM;
         }
 
+        for_each_possible_cpu(cpu) {
+                bq = per_cpu_ptr(dev->bulkq, cpu);
+                bq->obj = dev;
+        }
+
         dev->dev = dev_get_by_index(net, ifindex);
         if (!dev->dev) {
                 free_percpu(dev->bulkq);
...

@@ -3414,12 +3414,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                 if (func_id != BPF_FUNC_get_local_storage)
                         goto error;
                 break;
-        /* devmap returns a pointer to a live net_device ifindex that we cannot
-         * allow to be modified from bpf side. So do not allow lookup elements
-         * for now.
-         */
         case BPF_MAP_TYPE_DEVMAP:
-                if (func_id != BPF_FUNC_redirect_map)
+                if (func_id != BPF_FUNC_redirect_map &&
+                    func_id != BPF_FUNC_map_lookup_elem)
                         goto error;
                 break;
         /* Restrict bpf side of cpumap and xskmap, open when use-cases
...

@@ -145,8 +145,7 @@ void __xsk_map_flush(struct bpf_map *map)
 
         list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
                 xsk_flush(xs);
-                __list_del(xs->flush_node.prev, xs->flush_node.next);
-                xs->flush_node.prev = NULL;
+                __list_del_clearprev(&xs->flush_node);
         }
 }
...

@@ -2158,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
         if (unlikely(flags & ~(BPF_F_INGRESS)))
                 return TC_ACT_SHOT;
 
-        ri->ifindex = ifindex;
         ri->flags = flags;
+        ri->tgt_index = ifindex;
 
         return TC_ACT_REDIRECT;
 }

@@ -2169,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
         struct net_device *dev;
 
-        dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
-        ri->ifindex = 0;
+        dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+        ri->tgt_index = 0;
         if (unlikely(!dev)) {
                 kfree_skb(skb);
                 return -EINVAL;

@@ -3488,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
                      struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
 {
         struct net_device *fwd;
-        u32 index = ri->ifindex;
+        u32 index = ri->tgt_index;
         int err;
 
         fwd = dev_get_by_index_rcu(dev_net(dev), index);
-        ri->ifindex = 0;
+        ri->tgt_index = 0;
         if (unlikely(!fwd)) {
                 err = -EINVAL;
                 goto err;

@@ -3523,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
                 err = dev_map_enqueue(dst, xdp, dev_rx);
                 if (unlikely(err))
                         return err;
-                __dev_map_insert_ctx(map, index);
                 break;
         }
         case BPF_MAP_TYPE_CPUMAP: {

@@ -3532,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
                 err = cpu_map_enqueue(rcpu, xdp, dev_rx);
                 if (unlikely(err))
                         return err;
-                __cpu_map_insert_ctx(map, index);
                 break;
         }
         case BPF_MAP_TYPE_XSKMAP: {

@@ -3606,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
                                struct bpf_prog *xdp_prog, struct bpf_map *map,
                                struct bpf_redirect_info *ri)
 {
-        u32 index = ri->ifindex;
-        void *fwd = NULL;
+        u32 index = ri->tgt_index;
+        void *fwd = ri->tgt_value;
         int err;
 
-        ri->ifindex = 0;
+        ri->tgt_index = 0;
+        ri->tgt_value = NULL;
         WRITE_ONCE(ri->map, NULL);
 
-        fwd = __xdp_map_lookup_elem(map, index);
-        if (unlikely(!fwd)) {
-                err = -EINVAL;
-                goto err;
-        }
-
         if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
                 xdp_do_flush_map();

@@ -3653,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
                                        struct bpf_map *map)
 {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-        u32 index = ri->ifindex;
-        void *fwd = NULL;
+        u32 index = ri->tgt_index;
+        void *fwd = ri->tgt_value;
         int err = 0;
 
-        ri->ifindex = 0;
+        ri->tgt_index = 0;
+        ri->tgt_value = NULL;
         WRITE_ONCE(ri->map, NULL);
 
-        fwd = __xdp_map_lookup_elem(map, index);
-        if (unlikely(!fwd)) {
-                err = -EINVAL;
-                goto err;
-        }
-
         if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                 struct bpf_dtab_netdev *dst = fwd;

@@ -3697,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
         struct bpf_map *map = READ_ONCE(ri->map);
-        u32 index = ri->ifindex;
+        u32 index = ri->tgt_index;
         struct net_device *fwd;
         int err = 0;
 
         if (map)
                 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
                                                    map);
-        ri->ifindex = 0;
+        ri->tgt_index = 0;
         fwd = dev_get_by_index_rcu(dev_net(dev), index);
         if (unlikely(!fwd)) {
                 err = -EINVAL;

@@ -3732,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
         if (unlikely(flags))
                 return XDP_ABORTED;
 
-        ri->ifindex = ifindex;
         ri->flags = flags;
+        ri->tgt_index = ifindex;
+        ri->tgt_value = NULL;
         WRITE_ONCE(ri->map, NULL);
 
         return XDP_REDIRECT;

@@ -3752,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
 {
         struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
-        if (unlikely(flags))
+        /* Lower bits of the flags are used as return code on lookup failure */
+        if (unlikely(flags > XDP_TX))
                 return XDP_ABORTED;
 
-        ri->ifindex = ifindex;
+        ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+        if (unlikely(!ri->tgt_value)) {
+                /* If the lookup fails we want to clear out the state in the
+                 * redirect_info struct completely, so that if an eBPF program
+                 * performs multiple lookups, the last one always takes
+                 * precedence.
+                 */
+                WRITE_ONCE(ri->map, NULL);
+                return flags;
+        }
+
         ri->flags = flags;
+        ri->tgt_index = ifindex;
         WRITE_ONCE(ri->map, map);
 
         return XDP_REDIRECT;
...