Merge branch 'bpf-dispatcher'

Björn Töpel says: ==================== Overview ======== This is the 6th iteration of the series that introduces the BPF dispatcher, which is a mechanism to avoid indirect calls. The BPF dispatcher is a multi-way branch code generator, targeted for BPF programs. E.g. when an XDP program is executed via the bpf_prog_run_xdp(), it is invoked via an indirect call. With retpolines enabled, the indirect call has a substantial performance impact. The dispatcher is a mechanism that transform indirect calls to direct calls, and therefore avoids the retpoline. The dispatcher is generated using the BPF JIT, and relies on text poking provided by bpf_arch_text_poke(). The dispatcher hijacks a trampoline function it via the __fentry__ nop of the trampoline. One dispatcher instance currently supports up to 48 dispatch points. This can be extended in the future. In this series, only one dispatcher instance is supported, and the only user is XDP. The dispatcher is updated when an XDP program is attached/detached to/from a netdev. An alternative to this could have been to update the dispatcher at program load point, but as there are usually more XDP programs loaded than attached, so the latter was picked. The XDP dispatcher is always enabled, if available, because it helps even when retpolines are disabled. Please refer to the "Performance" section below. The first patch refactors the image allocation from the BPF trampoline code. Patch two introduces the dispatcher, and patch three adds a dispatcher for XDP, and wires up the XDP control-/ fast-path. Patch four adds the dispatcher to BPF_TEST_RUN. Patch five adds a simple selftest, and the last adds alignment to jump targets. I have rebased the series on commit 679152d3 ("libbpf: Fix printf compilation warnings on ppc64le arch"). Generated code, x86-64 ====================== The dispatcher currently has a maximum of 48 entries, where one entry is a unique BPF program. Multiple users of a dispatcher instance using the same BPF program will share that entry. The program/slot lookup is performed by a binary search, O(log n). Let's have a look at the generated code. The trampoline function has the following signature: unsigned int tramp(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)) On Intel x86-64 this means that rdx will contain the bpf_func. To, make it easier to read, I've let the BPF programs have the following range: 0xffffffffffffffff (-1) to 0xfffffffffffffff0 (-16). 0xffffffff81c00f10 is the retpoline thunk, in this case __x86_indirect_thunk_rdx. If retpolines are disabled the thunk will be a regular indirect call. The minimal dispatcher will then look like this: ffffffffc0002000: cmp rdx,0xffffffffffffffff ffffffffc0002007: je 0xffffffffffffffff ; -1 ffffffffc000200d: jmp 0xffffffff81c00f10 A 16 entry dispatcher looks like this: ffffffffc0020000: cmp rdx,0xfffffffffffffff7 ; -9 ffffffffc0020007: jg 0xffffffffc0020130 ffffffffc002000d: cmp rdx,0xfffffffffffffff3 ; -13 ffffffffc0020014: jg 0xffffffffc00200a0 ffffffffc002001a: cmp rdx,0xfffffffffffffff1 ; -15 ffffffffc0020021: jg 0xffffffffc0020060 ffffffffc0020023: cmp rdx,0xfffffffffffffff0 ; -16 ffffffffc002002a: jg 0xffffffffc0020040 ffffffffc002002c: cmp rdx,0xfffffffffffffff0 ; -16 ffffffffc0020033: je 0xfffffffffffffff0 ; -16 ffffffffc0020039: jmp 0xffffffff81c00f10 ffffffffc002003e: xchg ax,ax ffffffffc0020040: cmp rdx,0xfffffffffffffff1 ; -15 ffffffffc0020047: je 0xfffffffffffffff1 ; -15 ffffffffc002004d: jmp 0xffffffff81c00f10 ffffffffc0020052: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002005a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020060: cmp rdx,0xfffffffffffffff2 ; -14 ffffffffc0020067: jg 0xffffffffc0020080 ffffffffc0020069: cmp rdx,0xfffffffffffffff2 ; -14 ffffffffc0020070: je 0xfffffffffffffff2 ; -14 ffffffffc0020076: jmp 0xffffffff81c00f10 ffffffffc002007b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020080: cmp rdx,0xfffffffffffffff3 ; -13 ffffffffc0020087: je 0xfffffffffffffff3 ; -13 ffffffffc002008d: jmp 0xffffffff81c00f10 ffffffffc0020092: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002009a: nop WORD PTR [rax+rax*1+0x0] ffffffffc00200a0: cmp rdx,0xfffffffffffffff5 ; -11 ffffffffc00200a7: jg 0xffffffffc00200f0 ffffffffc00200a9: cmp rdx,0xfffffffffffffff4 ; -12 ffffffffc00200b0: jg 0xffffffffc00200d0 ffffffffc00200b2: cmp rdx,0xfffffffffffffff4 ; -12 ffffffffc00200b9: je 0xfffffffffffffff4 ; -12 ffffffffc00200bf: jmp 0xffffffff81c00f10 ffffffffc00200c4: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00200cc: nop DWORD PTR [rax+0x0] ffffffffc00200d0: cmp rdx,0xfffffffffffffff5 ; -11 ffffffffc00200d7: je 0xfffffffffffffff5 ; -11 ffffffffc00200dd: jmp 0xffffffff81c00f10 ffffffffc00200e2: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00200ea: nop WORD PTR [rax+rax*1+0x0] ffffffffc00200f0: cmp rdx,0xfffffffffffffff6 ; -10 ffffffffc00200f7: jg 0xffffffffc0020110 ffffffffc00200f9: cmp rdx,0xfffffffffffffff6 ; -10 ffffffffc0020100: je 0xfffffffffffffff6 ; -10 ffffffffc0020106: jmp 0xffffffff81c00f10 ffffffffc002010b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020110: cmp rdx,0xfffffffffffffff7 ; -9 ffffffffc0020117: je 0xfffffffffffffff7 ; -9 ffffffffc002011d: jmp 0xffffffff81c00f10 ffffffffc0020122: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002012a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020130: cmp rdx,0xfffffffffffffffb ; -5 ffffffffc0020137: jg 0xffffffffc00201d0 ffffffffc002013d: cmp rdx,0xfffffffffffffff9 ; -7 ffffffffc0020144: jg 0xffffffffc0020190 ffffffffc0020146: cmp rdx,0xfffffffffffffff8 ; -8 ffffffffc002014d: jg 0xffffffffc0020170 ffffffffc002014f: cmp rdx,0xfffffffffffffff8 ; -8 ffffffffc0020156: je 0xfffffffffffffff8 ; -8 ffffffffc002015c: jmp 0xffffffff81c00f10 ffffffffc0020161: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020169: nop DWORD PTR [rax+0x0] ffffffffc0020170: cmp rdx,0xfffffffffffffff9 ; -7 ffffffffc0020177: je 0xfffffffffffffff9 ; -7 ffffffffc002017d: jmp 0xffffffff81c00f10 ffffffffc0020182: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002018a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020190: cmp rdx,0xfffffffffffffffa ; -6 ffffffffc0020197: jg 0xffffffffc00201b0 ffffffffc0020199: cmp rdx,0xfffffffffffffffa ; -6 ffffffffc00201a0: je 0xfffffffffffffffa ; -6 ffffffffc00201a6: jmp 0xffffffff81c00f10 ffffffffc00201ab: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201b0: cmp rdx,0xfffffffffffffffb ; -5 ffffffffc00201b7: je 0xfffffffffffffffb ; -5 ffffffffc00201bd: jmp 0xffffffff81c00f10 ffffffffc00201c2: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201ca: nop WORD PTR [rax+rax*1+0x0] ffffffffc00201d0: cmp rdx,0xfffffffffffffffd ; -3 ffffffffc00201d7: jg 0xffffffffc0020220 ffffffffc00201d9: cmp rdx,0xfffffffffffffffc ; -4 ffffffffc00201e0: jg 0xffffffffc0020200 ffffffffc00201e2: cmp rdx,0xfffffffffffffffc ; -4 ffffffffc00201e9: je 0xfffffffffffffffc ; -4 ffffffffc00201ef: jmp 0xffffffff81c00f10 ffffffffc00201f4: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201fc: nop DWORD PTR [rax+0x0] ffffffffc0020200: cmp rdx,0xfffffffffffffffd ; -3 ffffffffc0020207: je 0xfffffffffffffffd ; -3 ffffffffc002020d: jmp 0xffffffff81c00f10 ffffffffc0020212: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002021a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020220: cmp rdx,0xfffffffffffffffe ; -2 ffffffffc0020227: jg 0xffffffffc0020240 ffffffffc0020229: cmp rdx,0xfffffffffffffffe ; -2 ffffffffc0020230: je 0xfffffffffffffffe ; -2 ffffffffc0020236: jmp 0xffffffff81c00f10 ffffffffc002023b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020240: cmp rdx,0xffffffffffffffff ; -1 ffffffffc0020247: je 0xffffffffffffffff ; -1 ffffffffc002024d: jmp 0xffffffff81c00f10 The nops are there to align jump targets to 16 B. Performance =========== The tests were performed using the xdp_rxq_info sample program with the following command-line: 1. XDP_DRV: # xdp_rxq_info --dev eth0 --action XDP_DROP 2. XDP_SKB: # xdp_rxq_info --dev eth0 -S --action XDP_DROP 3. xdp-perf, from selftests/bpf: # test_progs -v -t xdp_perf Run with mitigations=auto ------------------------- Baseline: 1. 21.7 Mpps (21736190) 2. 3.8 Mpps (3837582) 3. 15 ns Dispatcher: 1. 30.2 Mpps (30176320) 2. 4.0 Mpps (4015579) 3. 5 ns Dispatcher (full; walk all entries, and fallback): 1. 22.0 Mpps (21986704) 2. 3.8 Mpps (3831298) 3. 17 ns Run with mitigations=off ------------------------ Baseline: 1. 29.9 Mpps (29875135) 2. 4.1 Mpps (4100179) 3. 4 ns Dispatcher: 1. 30.4 Mpps (30439241) 2. 4.1 Mpps (4109350) 1. 4 ns Dispatcher (full; walk all entries, and fallback): 1. 28.9 Mpps (28903269) 2. 4.1 Mpps (4080078) 3. 5 ns xdp-perf runs, aliged vs non-aligned jump targets ------------------------------------------------- In this test dispatchers of different sizes, with and without jump target alignment, were exercised. As outlined above the function lookup is performed via binary search. This means that depending on the pointer value of the function, it can reside in the upper or lower part of the search table. The performed tests were: 1. aligned, mititations=auto, function entry < other entries 2. aligned, mititations=auto, function entry > other entries 3. non-aligned, mititations=auto, function entry < other entries 4. non-aligned, mititations=auto, function entry > other entries 5. aligned, mititations=off, function entry < other entries 6. aligned, mititations=off, function entry > other entries 7. non-aligned, mititations=off, function entry < other entries 8. non-aligned, mititations=off, function entry > other entries The micro benchmarks showed that alignment of jump target has some positive impact. A reply to this cover letter will contain complete data for all runs. Multiple xdp-perf baseline with mitigations=auto ------------------------------------------------ Performance counter stats for './test_progs -v -t xdp_perf' (1024 runs): 16.69 msec task-clock # 0.984 CPUs utilized ( +- 0.08% ) 2 context-switches # 0.123 K/sec ( +- 1.11% ) 0 cpu-migrations # 0.000 K/sec ( +- 70.68% ) 97 page-faults # 0.006 M/sec ( +- 0.05% ) 49,254,635 cycles # 2.951 GHz ( +- 0.09% ) (12.28%) 42,138,558 instructions # 0.86 insn per cycle ( +- 0.02% ) (36.15%) 7,315,291 branches # 438.300 M/sec ( +- 0.01% ) (59.43%) 1,011,201 branch-misses # 13.82% of all branches ( +- 0.01% ) (83.31%) 15,440,788 L1-dcache-loads # 925.143 M/sec ( +- 0.00% ) (99.40%) 39,067 L1-dcache-load-misses # 0.25% of all L1-dcache hits ( +- 0.04% ) 6,531 LLC-loads # 0.391 M/sec ( +- 0.05% ) 442 LLC-load-misses # 6.76% of all LL-cache hits ( +- 0.77% ) <not supported> L1-icache-loads 57,964 L1-icache-load-misses ( +- 0.06% ) 15,442,496 dTLB-loads # 925.246 M/sec ( +- 0.00% ) 514 dTLB-load-misses # 0.00% of all dTLB cache hits ( +- 0.73% ) (40.57%) 130 iTLB-loads # 0.008 M/sec ( +- 2.75% ) (16.69%) <not counted> iTLB-load-misses ( +- 8.71% ) (0.60%) <not supported> L1-dcache-prefetches <not supported> L1-dcache-prefetch-misses 0.0169558 +- 0.0000127 seconds time elapsed ( +- 0.07% ) Multiple xdp-perf dispatcher with mitigations=auto -------------------------------------------------- Note that this includes generating the dispatcher. Performance counter stats for './test_progs -v -t xdp_perf' (1024 runs): 4.80 msec task-clock # 0.953 CPUs utilized ( +- 0.06% ) 1 context-switches # 0.258 K/sec ( +- 1.57% ) 0 cpu-migrations # 0.000 K/sec 97 page-faults # 0.020 M/sec ( +- 0.05% ) 14,185,861 cycles # 2.955 GHz ( +- 0.17% ) (50.49%) 45,691,935 instructions # 3.22 insn per cycle ( +- 0.01% ) (99.19%) 8,346,008 branches # 1738.709 M/sec ( +- 0.00% ) 13,046 branch-misses # 0.16% of all branches ( +- 0.10% ) 15,443,735 L1-dcache-loads # 3217.365 M/sec ( +- 0.00% ) 39,585 L1-dcache-load-misses # 0.26% of all L1-dcache hits ( +- 0.05% ) 7,138 LLC-loads # 1.487 M/sec ( +- 0.06% ) 671 LLC-load-misses # 9.40% of all LL-cache hits ( +- 0.73% ) <not supported> L1-icache-loads 56,213 L1-icache-load-misses ( +- 0.08% ) 15,443,735 dTLB-loads # 3217.365 M/sec ( +- 0.00% ) <not counted> dTLB-load-misses (0.00%) <not counted> iTLB-loads (0.00%) <not counted> iTLB-load-misses (0.00%) <not supported> L1-dcache-prefetches <not supported> L1-dcache-prefetch-misses 0.00503705 +- 0.00000546 seconds time elapsed ( +- 0.11% ) Revisions ========= v4->v5: [1] * Fixed s/xdp_ctx/ctx/ type-o (Toke) * Marked dispatcher trampoline with noinline attribute (Alexei) v3->v4: [2] * Moved away from doing dispatcher lookup based on the trampoline function, to a model where the dispatcher instance is explicitly passed to the bpf_dispatcher_change_prog() (Alexei) v2->v3: [3] * Removed xdp_call, and instead make the dispatcher available to all XDP users via bpf_prog_run_xdp() and dev_xdp_install(). (Toke) * Always enable the dispatcher, if available (Alexei) * Reuse BPF trampoline image allocator (Alexei) * Make sure the dispatcher is exercised in selftests (Alexei) * Only allow one dispatcher, and wire it to XDP v1->v2: [4] * Fixed i386 build warning (kbuild robot) * Made bpf_dispatcher_lookup() static (kbuild robot) * Make sure xdp_call.h is only enabled for builtins * Add xdp_call() to ixgbe, mlx4, and mlx5 RFC->v1: [5] * Improved error handling (Edward and Andrii) * Explicit cleanup (Andrii) * Use 32B with sext cmp (Alexei) * Align jump targets to 16B (Alexei) * 4 to 16 entries (Toke) * Added stats to xdp_call_run() [1] https://lore.kernel.org/bpf/20191211123017.13212-1-bjorn.topel@gmail.com/ [2] https://lore.kernel.org/bpf/20191209135522.16576-1-bjorn.topel@gmail.com/ [3] https://lore.kernel.org/bpf/20191123071226.6501-1-bjorn.topel@gmail.com/ [4] https://lore.kernel.org/bpf/20191119160757.27714-1-bjorn.topel@gmail.com/ [5] https://lore.kernel.org/bpf/20191113204737.31623-1-bjorn.topel@gmail.com/ ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Merge branch 'bpf-dispatcher'
Björn Töpel says: ==================== Overview ======== This is the 6th iteration of the series that introduces the BPF dispatcher, which is a mechanism to avoid indirect calls. The BPF dispatcher is a multi-way branch code generator, targeted for BPF programs. E.g. when an XDP program is executed via the bpf_prog_run_xdp(), it is invoked via an indirect call. With retpolines enabled, the indirect call has a substantial performance impact. The dispatcher is a mechanism that transform indirect calls to direct calls, and therefore avoids the retpoline. The dispatcher is generated using the BPF JIT, and relies on text poking provided by bpf_arch_text_poke(). The dispatcher hijacks a trampoline function it via the __fentry__ nop of the trampoline. One dispatcher instance currently supports up to 48 dispatch points. This can be extended in the future. In this series, only one dispatcher instance is supported, and the only user is XDP. The dispatcher is updated when an XDP program is attached/detached to/from a netdev. An alternative to this could have been to update the dispatcher at program load point, but as there are usually more XDP programs loaded than attached, so the latter was picked. The XDP dispatcher is always enabled, if available, because it helps even when retpolines are disabled. Please refer to the "Performance" section below. The first patch refactors the image allocation from the BPF trampoline code. Patch two introduces the dispatcher, and patch three adds a dispatcher for XDP, and wires up the XDP control-/ fast-path. Patch four adds the dispatcher to BPF_TEST_RUN. Patch five adds a simple selftest, and the last adds alignment to jump targets. I have rebased the series on commit 679152d3 ("libbpf: Fix printf compilation warnings on ppc64le arch"). Generated code, x86-64 ====================== The dispatcher currently has a maximum of 48 entries, where one entry is a unique BPF program. Multiple users of a dispatcher instance using the same BPF program will share that entry. The program/slot lookup is performed by a binary search, O(log n). Let's have a look at the generated code. The trampoline function has the following signature: unsigned int tramp(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)) On Intel x86-64 this means that rdx will contain the bpf_func. To, make it easier to read, I've let the BPF programs have the following range: 0xffffffffffffffff (-1) to 0xfffffffffffffff0 (-16). 0xffffffff81c00f10 is the retpoline thunk, in this case __x86_indirect_thunk_rdx. If retpolines are disabled the thunk will be a regular indirect call. The minimal dispatcher will then look like this: ffffffffc0002000: cmp rdx,0xffffffffffffffff ffffffffc0002007: je 0xffffffffffffffff ; -1 ffffffffc000200d: jmp 0xffffffff81c00f10 A 16 entry dispatcher looks like this: ffffffffc0020000: cmp rdx,0xfffffffffffffff7 ; -9 ffffffffc0020007: jg 0xffffffffc0020130 ffffffffc002000d: cmp rdx,0xfffffffffffffff3 ; -13 ffffffffc0020014: jg 0xffffffffc00200a0 ffffffffc002001a: cmp rdx,0xfffffffffffffff1 ; -15 ffffffffc0020021: jg 0xffffffffc0020060 ffffffffc0020023: cmp rdx,0xfffffffffffffff0 ; -16 ffffffffc002002a: jg 0xffffffffc0020040 ffffffffc002002c: cmp rdx,0xfffffffffffffff0 ; -16 ffffffffc0020033: je 0xfffffffffffffff0 ; -16 ffffffffc0020039: jmp 0xffffffff81c00f10 ffffffffc002003e: xchg ax,ax ffffffffc0020040: cmp rdx,0xfffffffffffffff1 ; -15 ffffffffc0020047: je 0xfffffffffffffff1 ; -15 ffffffffc002004d: jmp 0xffffffff81c00f10 ffffffffc0020052: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002005a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020060: cmp rdx,0xfffffffffffffff2 ; -14 ffffffffc0020067: jg 0xffffffffc0020080 ffffffffc0020069: cmp rdx,0xfffffffffffffff2 ; -14 ffffffffc0020070: je 0xfffffffffffffff2 ; -14 ffffffffc0020076: jmp 0xffffffff81c00f10 ffffffffc002007b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020080: cmp rdx,0xfffffffffffffff3 ; -13 ffffffffc0020087: je 0xfffffffffffffff3 ; -13 ffffffffc002008d: jmp 0xffffffff81c00f10 ffffffffc0020092: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002009a: nop WORD PTR [rax+rax*1+0x0] ffffffffc00200a0: cmp rdx,0xfffffffffffffff5 ; -11 ffffffffc00200a7: jg 0xffffffffc00200f0 ffffffffc00200a9: cmp rdx,0xfffffffffffffff4 ; -12 ffffffffc00200b0: jg 0xffffffffc00200d0 ffffffffc00200b2: cmp rdx,0xfffffffffffffff4 ; -12 ffffffffc00200b9: je 0xfffffffffffffff4 ; -12 ffffffffc00200bf: jmp 0xffffffff81c00f10 ffffffffc00200c4: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00200cc: nop DWORD PTR [rax+0x0] ffffffffc00200d0: cmp rdx,0xfffffffffffffff5 ; -11 ffffffffc00200d7: je 0xfffffffffffffff5 ; -11 ffffffffc00200dd: jmp 0xffffffff81c00f10 ffffffffc00200e2: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00200ea: nop WORD PTR [rax+rax*1+0x0] ffffffffc00200f0: cmp rdx,0xfffffffffffffff6 ; -10 ffffffffc00200f7: jg 0xffffffffc0020110 ffffffffc00200f9: cmp rdx,0xfffffffffffffff6 ; -10 ffffffffc0020100: je 0xfffffffffffffff6 ; -10 ffffffffc0020106: jmp 0xffffffff81c00f10 ffffffffc002010b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020110: cmp rdx,0xfffffffffffffff7 ; -9 ffffffffc0020117: je 0xfffffffffffffff7 ; -9 ffffffffc002011d: jmp 0xffffffff81c00f10 ffffffffc0020122: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002012a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020130: cmp rdx,0xfffffffffffffffb ; -5 ffffffffc0020137: jg 0xffffffffc00201d0 ffffffffc002013d: cmp rdx,0xfffffffffffffff9 ; -7 ffffffffc0020144: jg 0xffffffffc0020190 ffffffffc0020146: cmp rdx,0xfffffffffffffff8 ; -8 ffffffffc002014d: jg 0xffffffffc0020170 ffffffffc002014f: cmp rdx,0xfffffffffffffff8 ; -8 ffffffffc0020156: je 0xfffffffffffffff8 ; -8 ffffffffc002015c: jmp 0xffffffff81c00f10 ffffffffc0020161: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020169: nop DWORD PTR [rax+0x0] ffffffffc0020170: cmp rdx,0xfffffffffffffff9 ; -7 ffffffffc0020177: je 0xfffffffffffffff9 ; -7 ffffffffc002017d: jmp 0xffffffff81c00f10 ffffffffc0020182: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002018a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020190: cmp rdx,0xfffffffffffffffa ; -6 ffffffffc0020197: jg 0xffffffffc00201b0 ffffffffc0020199: cmp rdx,0xfffffffffffffffa ; -6 ffffffffc00201a0: je 0xfffffffffffffffa ; -6 ffffffffc00201a6: jmp 0xffffffff81c00f10 ffffffffc00201ab: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201b0: cmp rdx,0xfffffffffffffffb ; -5 ffffffffc00201b7: je 0xfffffffffffffffb ; -5 ffffffffc00201bd: jmp 0xffffffff81c00f10 ffffffffc00201c2: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201ca: nop WORD PTR [rax+rax*1+0x0] ffffffffc00201d0: cmp rdx,0xfffffffffffffffd ; -3 ffffffffc00201d7: jg 0xffffffffc0020220 ffffffffc00201d9: cmp rdx,0xfffffffffffffffc ; -4 ffffffffc00201e0: jg 0xffffffffc0020200 ffffffffc00201e2: cmp rdx,0xfffffffffffffffc ; -4 ffffffffc00201e9: je 0xfffffffffffffffc ; -4 ffffffffc00201ef: jmp 0xffffffff81c00f10 ffffffffc00201f4: nop DWORD PTR [rax+rax*1+0x0] ffffffffc00201fc: nop DWORD PTR [rax+0x0] ffffffffc0020200: cmp rdx,0xfffffffffffffffd ; -3 ffffffffc0020207: je 0xfffffffffffffffd ; -3 ffffffffc002020d: jmp 0xffffffff81c00f10 ffffffffc0020212: nop DWORD PTR [rax+rax*1+0x0] ffffffffc002021a: nop WORD PTR [rax+rax*1+0x0] ffffffffc0020220: cmp rdx,0xfffffffffffffffe ; -2 ffffffffc0020227: jg 0xffffffffc0020240 ffffffffc0020229: cmp rdx,0xfffffffffffffffe ; -2 ffffffffc0020230: je 0xfffffffffffffffe ; -2 ffffffffc0020236: jmp 0xffffffff81c00f10 ffffffffc002023b: nop DWORD PTR [rax+rax*1+0x0] ffffffffc0020240: cmp rdx,0xffffffffffffffff ; -1 ffffffffc0020247: je 0xffffffffffffffff ; -1 ffffffffc002024d: jmp 0xffffffff81c00f10 The nops are there to align jump targets to 16 B. Performance =========== The tests were performed using the xdp_rxq_info sample program with the following command-line: 1. XDP_DRV: # xdp_rxq_info --dev eth0 --action XDP_DROP 2. XDP_SKB: # xdp_rxq_info --dev eth0 -S --action XDP_DROP 3. xdp-perf, from selftests/bpf: # test_progs -v -t xdp_perf Run with mitigations=auto ------------------------- Baseline: 1. 21.7 Mpps (21736190) 2. 3.8 Mpps (3837582) 3. 15 ns Dispatcher: 1. 30.2 Mpps (30176320) 2. 4.0 Mpps (4015579) 3. 5 ns Dispatcher (full; walk all entries, and fallback): 1. 22.0 Mpps (21986704) 2. 3.8 Mpps (3831298) 3. 17 ns Run with mitigations=off ------------------------ Baseline: 1. 29.9 Mpps (29875135) 2. 4.1 Mpps (4100179) 3. 4 ns Dispatcher: 1. 30.4 Mpps (30439241) 2. 4.1 Mpps (4109350) 1. 4 ns Dispatcher (full; walk all entries, and fallback): 1. 28.9 Mpps (28903269) 2. 4.1 Mpps (4080078) 3. 5 ns xdp-perf runs, aliged vs non-aligned jump targets ------------------------------------------------- In this test dispatchers of different sizes, with and without jump target alignment, were exercised. As outlined above the function lookup is performed via binary search. This means that depending on the pointer value of the function, it can reside in the upper or lower part of the search table. The performed tests were: 1. aligned, mititations=auto, function entry < other entries 2. aligned, mititations=auto, function entry > other entries 3. non-aligned, mititations=auto, function entry < other entries 4. non-aligned, mititations=auto, function entry > other entries 5. aligned, mititations=off, function entry < other entries 6. aligned, mititations=off, function entry > other entries 7. non-aligned, mititations=off, function entry < other entries 8. non-aligned, mititations=off, function entry > other entries The micro benchmarks showed that alignment of jump target has some positive impact. A reply to this cover letter will contain complete data for all runs. Multiple xdp-perf baseline with mitigations=auto ------------------------------------------------ Performance counter stats for './test_progs -v -t xdp_perf' (1024 runs): 16.69 msec task-clock # 0.984 CPUs utilized ( +- 0.08% ) 2 context-switches # 0.123 K/sec ( +- 1.11% ) 0 cpu-migrations # 0.000 K/sec ( +- 70.68% ) 97 page-faults # 0.006 M/sec ( +- 0.05% ) 49,254,635 cycles # 2.951 GHz ( +- 0.09% ) (12.28%) 42,138,558 instructions # 0.86 insn per cycle ( +- 0.02% ) (36.15%) 7,315,291 branches # 438.300 M/sec ( +- 0.01% ) (59.43%) 1,011,201 branch-misses # 13.82% of all branches ( +- 0.01% ) (83.31%) 15,440,788 L1-dcache-loads # 925.143 M/sec ( +- 0.00% ) (99.40%) 39,067 L1-dcache-load-misses # 0.25% of all L1-dcache hits ( +- 0.04% ) 6,531 LLC-loads # 0.391 M/sec ( +- 0.05% ) 442 LLC-load-misses # 6.76% of all LL-cache hits ( +- 0.77% ) <not supported> L1-icache-loads 57,964 L1-icache-load-misses ( +- 0.06% ) 15,442,496 dTLB-loads # 925.246 M/sec ( +- 0.00% ) 514 dTLB-load-misses # 0.00% of all dTLB cache hits ( +- 0.73% ) (40.57%) 130 iTLB-loads # 0.008 M/sec ( +- 2.75% ) (16.69%) <not counted> iTLB-load-misses ( +- 8.71% ) (0.60%) <not supported> L1-dcache-prefetches <not supported> L1-dcache-prefetch-misses 0.0169558 +- 0.0000127 seconds time elapsed ( +- 0.07% ) Multiple xdp-perf dispatcher with mitigations=auto -------------------------------------------------- Note that this includes generating the dispatcher. Performance counter stats for './test_progs -v -t xdp_perf' (1024 runs): 4.80 msec task-clock # 0.953 CPUs utilized ( +- 0.06% ) 1 context-switches # 0.258 K/sec ( +- 1.57% ) 0 cpu-migrations # 0.000 K/sec 97 page-faults # 0.020 M/sec ( +- 0.05% ) 14,185,861 cycles # 2.955 GHz ( +- 0.17% ) (50.49%) 45,691,935 instructions # 3.22 insn per cycle ( +- 0.01% ) (99.19%) 8,346,008 branches # 1738.709 M/sec ( +- 0.00% ) 13,046 branch-misses # 0.16% of all branches ( +- 0.10% ) 15,443,735 L1-dcache-loads # 3217.365 M/sec ( +- 0.00% ) 39,585 L1-dcache-load-misses # 0.26% of all L1-dcache hits ( +- 0.05% ) 7,138 LLC-loads # 1.487 M/sec ( +- 0.06% ) 671 LLC-load-misses # 9.40% of all LL-cache hits ( +- 0.73% ) <not supported> L1-icache-loads 56,213 L1-icache-load-misses ( +- 0.08% ) 15,443,735 dTLB-loads # 3217.365 M/sec ( +- 0.00% ) <not counted> dTLB-load-misses (0.00%) <not counted> iTLB-loads (0.00%) <not counted> iTLB-load-misses (0.00%) <not supported> L1-dcache-prefetches <not supported> L1-dcache-prefetch-misses 0.00503705 +- 0.00000546 seconds time elapsed ( +- 0.11% ) Revisions ========= v4->v5: [1] * Fixed s/xdp_ctx/ctx/ type-o (Toke) * Marked dispatcher trampoline with noinline attribute (Alexei) v3->v4: [2] * Moved away from doing dispatcher lookup based on the trampoline function, to a model where the dispatcher instance is explicitly passed to the bpf_dispatcher_change_prog() (Alexei) v2->v3: [3] * Removed xdp_call, and instead make the dispatcher available to all XDP users via bpf_prog_run_xdp() and dev_xdp_install(). (Toke) * Always enable the dispatcher, if available (Alexei) * Reuse BPF trampoline image allocator (Alexei) * Make sure the dispatcher is exercised in selftests (Alexei) * Only allow one dispatcher, and wire it to XDP v1->v2: [4] * Fixed i386 build warning (kbuild robot) * Made bpf_dispatcher_lookup() static (kbuild robot) * Make sure xdp_call.h is only enabled for builtins * Add xdp_call() to ixgbe, mlx4, and mlx5 RFC->v1: [5] * Improved error handling (Edward and Andrii) * Explicit cleanup (Andrii) * Use 32B with sext cmp (Alexei) * Align jump targets to 16B (Alexei) * 4 to 16 entries (Toke) * Added stats to xdp_call_run() [1] https://lore.kernel.org/bpf/20191211123017.13212-1-bjorn.topel@gmail.com/ [2] https://lore.kernel.org/bpf/20191209135522.16576-1-bjorn.topel@gmail.com/ [3] https://lore.kernel.org/bpf/20191123071226.6501-1-bjorn.topel@gmail.com/ [4] https://lore.kernel.org/bpf/20191119160757.27714-1-bjorn.topel@gmail.com/ [5] https://lore.kernel.org/bpf/20191113204737.31623-1-bjorn.topel@gmail.com/ ==================== Signed-off-by: Alexei Starovoitov <ast@kernel.org>
02620d9e · Alexei Starovoitov · 91cbdf74 · 116eb788 · 02620d9e · 02620d9e
Commit 02620d9e authored Dec 13, 2019 by Alexei Starovoitov
11 changed files
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -10,10 +10,12 @@
 #include <linux/if_vlan.h>
 #include <linux/bpf.h>
 #include <linux/memory.h>
+#include <linux/sort.h>
 #include <asm/extable.h>
 #include <asm/set_memory.h>
 #include <asm/nospec-branch.h>
 #include <asm/text-patching.h>
+#include <asm/asm-prototypes.h>

 static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 {
@@ -1530,6 +1532,154 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
 	return 0;
 }

+static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
+{
+	u8 *prog = *pprog;
+	int cnt = 0;
+	s64 offset;
+
+	offset = func - (ip + 2 + 4);
+	if (!is_simm32(offset)) {
+		pr_err("Target %p is out of range\n", func);
+		return -EINVAL;
+	}
+	EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
+	*pprog = prog;
+	return 0;
+}
+
+static void emit_nops(u8 **pprog, unsigned int len)
+{
+	unsigned int i, noplen;
+	u8 *prog = *pprog;
+	int cnt = 0;
+
+	while (len > 0) {
+		noplen = len;
+
+		if (noplen > ASM_NOP_MAX)
+			noplen = ASM_NOP_MAX;
+
+		for (i = 0; i < noplen; i++)
+			EMIT1(ideal_nops[noplen][i]);
+		len -= noplen;
+	}
+
+	*pprog = prog;
+}
+
+static int emit_fallback_jump(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int err = 0;
+
+#ifdef CONFIG_RETPOLINE
+	/* Note that this assumes the the compiler uses external
+	 * thunks for indirect calls. Both clang and GCC use the same
+	 * naming convention for external thunks.
+	 */
+	err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
+#else
+	int cnt = 0;
+
+	EMIT2(0xFF, 0xE2);	/* jmp rdx */
+#endif
+	*pprog = prog;
+	return err;
+}
+
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+{
+	u8 *jg_reloc, *jg_target, *prog = *pprog;
+	int pivot, err, jg_bytes = 1, cnt = 0;
+	s64 jg_offset;
+
+	if (a == b) {
+		/* Leaf node of recursion, i.e. not a range of indices
+		 * anymore.
+		 */
+		EMIT1(add_1mod(0x48, BPF_REG_3));	/* cmp rdx,func */
+		if (!is_simm32(progs[a]))
+			return -1;
+		EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
+			    progs[a]);
+		err = emit_cond_near_jump(&prog,	/* je func */
+					  (void *)progs[a], prog,
+					  X86_JE);
+		if (err)
+			return err;
+
+		err = emit_fallback_jump(&prog);	/* jmp thunk/indirect */
+		if (err)
+			return err;
+
+		*pprog = prog;
+		return 0;
+	}
+
+	/* Not a leaf node, so we pivot, and recursively descend into
+	 * the lower and upper ranges.
+	 */
+	pivot = (b - a) / 2;
+	EMIT1(add_1mod(0x48, BPF_REG_3));		/* cmp rdx,func */
+	if (!is_simm32(progs[a + pivot]))
+		return -1;
+	EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
+
+	if (pivot > 2) {				/* jg upper_part */
+		/* Require near jump. */
+		jg_bytes = 4;
+		EMIT2_off32(0x0F, X86_JG + 0x10, 0);
+	} else {
+		EMIT2(X86_JG, 0);
+	}
+	jg_reloc = prog;
+
+	err = emit_bpf_dispatcher(&prog, a, a + pivot,	/* emit lower_part */
+				  progs);
+	if (err)
+		return err;
+
+	/* From Intel 64 and IA-32 Architectures Optimization
+	 * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
+	 * Coding Rule 11: All branch targets should be 16-byte
+	 * aligned.
+	 */
+	jg_target = PTR_ALIGN(prog, 16);
+	if (jg_target != prog)
+		emit_nops(&prog, jg_target - prog);
+	jg_offset = prog - jg_reloc;
+	emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
+
+	err = emit_bpf_dispatcher(&prog, a + pivot + 1,	/* emit upper_part */
+				  b, progs);
+	if (err)
+		return err;
+
+	*pprog = prog;
+	return 0;
+}
+
+static int cmp_ips(const void *a, const void *b)
+{
+	const s64 *ipa = a;
+	const s64 *ipb = b;
+
+	if (*ipa > *ipb)
+		return 1;
+	if (*ipa < *ipb)
+		return -1;
+	return 0;
+}
+
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+	u8 *prog = image;
+
+	sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
+	return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+}
+
 struct x64_jit_data {
 	struct bpf_binary_header *header;
 	int *addrs;

--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -470,11 +470,69 @@ struct bpf_trampoline {
 	void *image;
 	u64 selector;
 };
+
+#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */
+
+struct bpf_dispatcher_prog {
+	struct bpf_prog *prog;
+	refcount_t users;
+};
+
+struct bpf_dispatcher {
+	/* dispatcher mutex */
+	struct mutex mutex;
+	void *func;
+	struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX];
+	int num_progs;
+	void *image;
+	u32 image_off;
+};
+
+static __always_inline unsigned int bpf_dispatcher_nopfunc(
+	const void *ctx,
+	const struct bpf_insn *insnsi,
+	unsigned int (*bpf_func)(const void *,
+				 const struct bpf_insn *))
+{
+	return bpf_func(ctx, insnsi);
+}
 #ifdef CONFIG_BPF_JIT
 struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
 int bpf_trampoline_link_prog(struct bpf_prog *prog);
 int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
 void bpf_trampoline_put(struct bpf_trampoline *tr);
+void *bpf_jit_alloc_exec_page(void);
+#define BPF_DISPATCHER_INIT(name) {			\
+	.mutex = __MUTEX_INITIALIZER(name.mutex),	\
+	.func = &name##func,				\
+	.progs = {},					\
+	.num_progs = 0,					\
+	.image = NULL,					\
+	.image_off = 0					\
+}
+
+#define DEFINE_BPF_DISPATCHER(name)					\
+	noinline unsigned int name##func(				\
+		const void *ctx,					\
+		const struct bpf_insn *insnsi,				\
+		unsigned int (*bpf_func)(const void *,			\
+					 const struct bpf_insn *))	\
+	{								\
+		return bpf_func(ctx, insnsi);				\
+	}								\
+	EXPORT_SYMBOL(name##func);			\
+	struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name);
+#define DECLARE_BPF_DISPATCHER(name)					\
+	unsigned int name##func(					\
+		const void *ctx,					\
+		const struct bpf_insn *insnsi,				\
+		unsigned int (*bpf_func)(const void *,			\
+					 const struct bpf_insn *));	\
+	extern struct bpf_dispatcher name;
+#define BPF_DISPATCHER_FUNC(name) name##func
+#define BPF_DISPATCHER_PTR(name) (&name)
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+				struct bpf_prog *to);
 #else
 static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
@@ -489,6 +547,13 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
 	return -ENOTSUPP;
 }
 static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
+#define DEFINE_BPF_DISPATCHER(name)
+#define DECLARE_BPF_DISPATCHER(name)
+#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc
+#define BPF_DISPATCHER_PTR(name) NULL
+static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d,
+					      struct bpf_prog *from,
+					      struct bpf_prog *to) {}
 #endif

 struct bpf_func_info_aux {
@@ -940,6 +1005,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,

 int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog);

+struct bpf_prog *bpf_prog_by_id(u32 id);
+
 #else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
@@ -1071,6 +1138,11 @@ static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 static inline void bpf_map_put(struct bpf_map *map)
 {
 }
+
+static inline struct bpf_prog *bpf_prog_by_id(u32 id)
+{
+	return ERR_PTR(-ENOTSUPP);
+}
 #endif /* CONFIG_BPF_SYSCALL */

 static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,

--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -559,23 +559,26 @@ struct sk_filter {

 DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

-#define BPF_PROG_RUN(prog, ctx)	({				\
-	u32 ret;						\
-	cant_sleep();						\
-	if (static_branch_unlikely(&bpf_stats_enabled_key)) {	\
-		struct bpf_prog_stats *stats;			\
-		u64 start = sched_clock();			\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
-		stats = this_cpu_ptr(prog->aux->stats);		\
-		u64_stats_update_begin(&stats->syncp);		\
-		stats->cnt++;					\
-		stats->nsecs += sched_clock() - start;		\
-		u64_stats_update_end(&stats->syncp);		\
-	} else {						\
-		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
-	}							\
+#define __BPF_PROG_RUN(prog, ctx, dfunc)	({			\
+	u32 ret;							\
+	cant_sleep();							\
+	if (static_branch_unlikely(&bpf_stats_enabled_key)) {		\
+		struct bpf_prog_stats *stats;				\
+		u64 start = sched_clock();				\
+		ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
+		stats = this_cpu_ptr(prog->aux->stats);			\
+		u64_stats_update_begin(&stats->syncp);			\
+		stats->cnt++;						\
+		stats->nsecs += sched_clock() - start;			\
+		u64_stats_update_end(&stats->syncp);			\
+	} else {							\
+		ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func);	\
+	}								\
 	ret; })

+#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx,		\
+					       bpf_dispatcher_nopfunc)
+
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

 struct bpf_skb_data_end {
@@ -699,6 +702,8 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return res;
 }

+DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
@@ -708,9 +713,12 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * already takes rcu_read_lock() when fetching the program, so
 	 * it's not necessary here anymore.
 	 */
-	return BPF_PROG_RUN(prog, xdp);
+	return __BPF_PROG_RUN(prog, xdp,
+			      BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp));
 }

+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);
+
 static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
 {
 	return prog->len * sizeof(struct bpf_insn);

--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 obj-$(CONFIG_BPF_JIT) += trampoline.o
 obj-$(CONFIG_BPF_SYSCALL) += btf.o
+obj-$(CONFIG_BPF_JIT) += dispatcher.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o

--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2019 Intel Corporation. */
+
+#include <linux/hash.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+
+/* The BPF dispatcher is a multiway branch code generator. The
+ * dispatcher is a mechanism to avoid the performance penalty of an
+ * indirect call, which is expensive when retpolines are enabled. A
+ * dispatch client registers a BPF program into the dispatcher, and if
+ * there is available room in the dispatcher a direct call to the BPF
+ * program will be generated. All calls to the BPF programs called via
+ * the dispatcher will then be a direct call, instead of an
+ * indirect. The dispatcher hijacks a trampoline function it via the
+ * __fentry__ of the trampoline. The trampoline function has the
+ * following signature:
+ *
+ * unsigned int trampoline(const void *ctx, const struct bpf_insn *insnsi,
+ *                         unsigned int (*bpf_func)(const void *,
+ *                                                  const struct bpf_insn *));
+ */
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_prog(
+	struct bpf_dispatcher *d, struct bpf_prog *prog)
+{
+	int i;
+
+	for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+		if (prog == d->progs[i].prog)
+			return &d->progs[i];
+	}
+	return NULL;
+}
+
+static struct bpf_dispatcher_prog *bpf_dispatcher_find_free(
+	struct bpf_dispatcher *d)
+{
+	return bpf_dispatcher_find_prog(d, NULL);
+}
+
+static bool bpf_dispatcher_add_prog(struct bpf_dispatcher *d,
+				    struct bpf_prog *prog)
+{
+	struct bpf_dispatcher_prog *entry;
+
+	if (!prog)
+		return false;
+
+	entry = bpf_dispatcher_find_prog(d, prog);
+	if (entry) {
+		refcount_inc(&entry->users);
+		return false;
+	}
+
+	entry = bpf_dispatcher_find_free(d);
+	if (!entry)
+		return false;
+
+	bpf_prog_inc(prog);
+	entry->prog = prog;
+	refcount_set(&entry->users, 1);
+	d->num_progs++;
+	return true;
+}
+
+static bool bpf_dispatcher_remove_prog(struct bpf_dispatcher *d,
+				       struct bpf_prog *prog)
+{
+	struct bpf_dispatcher_prog *entry;
+
+	if (!prog)
+		return false;
+
+	entry = bpf_dispatcher_find_prog(d, prog);
+	if (!entry)
+		return false;
+
+	if (refcount_dec_and_test(&entry->users)) {
+		entry->prog = NULL;
+		bpf_prog_put(prog);
+		d->num_progs--;
+		return true;
+	}
+	return false;
+}
+
+int __weak arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+	return -ENOTSUPP;
+}
+
+static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image)
+{
+	s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
+	int i;
+
+	for (i = 0; i < BPF_DISPATCHER_MAX; i++) {
+		if (d->progs[i].prog)
+			*ipsp++ = (s64)(uintptr_t)d->progs[i].prog->bpf_func;
+	}
+	return arch_prepare_bpf_dispatcher(image, &ips[0], d->num_progs);
+}
+
+static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
+{
+	void *old, *new;
+	u32 noff;
+	int err;
+
+	if (!prev_num_progs) {
+		old = NULL;
+		noff = 0;
+	} else {
+		old = d->image + d->image_off;
+		noff = d->image_off ^ (PAGE_SIZE / 2);
+	}
+
+	new = d->num_progs ? d->image + noff : NULL;
+	if (new) {
+		if (bpf_dispatcher_prepare(d, new))
+			return;
+	}
+
+	err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
+	if (err || !new)
+		return;
+
+	d->image_off = noff;
+}
+
+void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
+				struct bpf_prog *to)
+{
+	bool changed = false;
+	int prev_num_progs;
+
+	if (from == to)
+		return;
+
+	mutex_lock(&d->mutex);
+	if (!d->image) {
+		d->image = bpf_jit_alloc_exec_page();
+		if (!d->image)
+			goto out;
+	}
+
+	prev_num_progs = d->num_progs;
+	changed |= bpf_dispatcher_remove_prog(d, from);
+	changed |= bpf_dispatcher_add_prog(d, to);
+
+	if (!changed)
+		goto out;
+
+	bpf_dispatcher_update(d, prev_num_progs);
+out:
+	mutex_unlock(&d->mutex);
+}
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2338,17 +2338,12 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,

 #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id

-static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+struct bpf_prog *bpf_prog_by_id(u32 id)
 {
 	struct bpf_prog *prog;
-	u32 id = attr->prog_id;
-	int fd;
-
-	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
-		return -EINVAL;

-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (!id)
+		return ERR_PTR(-ENOENT);

 	spin_lock_bh(&prog_idr_lock);
 	prog = idr_find(&prog_idr, id);
@@ -2357,7 +2352,22 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	else
 		prog = ERR_PTR(-ENOENT);
 	spin_unlock_bh(&prog_idr_lock);
+	return prog;
+}
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	u32 id = attr->prog_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;

+	prog = bpf_prog_by_id(id);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);


--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -13,6 +13,22 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
 /* serializes access to trampoline_table */
 static DEFINE_MUTEX(trampoline_mutex);

+void *bpf_jit_alloc_exec_page(void)
+{
+	void *image;
+
+	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	if (!image)
+		return NULL;
+
+	set_vm_flush_reset_perms(image);
+	/* Keep image as writeable. The alternative is to keep flipping ro/rw
+	 * everytime new program is attached or detached.
+	 */
+	set_memory_x((long)image, 1);
+	return image;
+}
+
 struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 {
 	struct bpf_trampoline *tr;
@@ -33,7 +49,7 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 		goto out;

 	/* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
-	image = bpf_jit_alloc_exec(PAGE_SIZE);
+	image = bpf_jit_alloc_exec_page();
 	if (!image) {
 		kfree(tr);
 		tr = NULL;
@@ -47,12 +63,6 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
 	mutex_init(&tr->mutex);
 	for (i = 0; i < BPF_TRAMP_MAX; i++)
 		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
-
-	set_vm_flush_reset_perms(image);
-	/* Keep image as writeable. The alternative is to keep flipping ro/rw
-	 * everytime new program is attached or detached.
-	 */
-	set_memory_x((long)image, 1);
 	tr->image = image;
 out:
 	mutex_unlock(&trampoline_mutex);

--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -15,7 +15,7 @@
 #include <trace/events/bpf_test_run.h>

 static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
-			u32 *retval, u32 *time)
+			u32 *retval, u32 *time, bool xdp)
 {
 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL };
 	enum bpf_cgroup_storage_type stype;
@@ -41,7 +41,11 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	time_start = ktime_get_ns();
 	for (i = 0; i < repeat; i++) {
 		bpf_cgroup_storage_set(storage);
-		*retval = BPF_PROG_RUN(prog, ctx);
+
+		if (xdp)
+			*retval = bpf_prog_run_xdp(prog, ctx);
+		else
+			*retval = BPF_PROG_RUN(prog, ctx);

 		if (signal_pending(current)) {
 			ret = -EINTR;
@@ -356,7 +360,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	ret = convert___skb_to_skb(skb, ctx);
 	if (ret)
 		goto out;
-	ret = bpf_test_run(prog, skb, repeat, &retval, &duration);
+	ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false);
 	if (ret)
 		goto out;
 	if (!is_l2) {
@@ -413,8 +417,8 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,

 	rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
 	xdp.rxq = &rxqueue->xdp_rxq;
-
-	ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration);
+	bpf_prog_change_xdp(NULL, prog);
+	ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
 	if (ret)
 		goto out;
 	if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
@@ -422,6 +426,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 		size = xdp.data_end - xdp.data;
 	ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
 out:
+	bpf_prog_change_xdp(prog, NULL);
 	kfree(data);
 	return ret;
 }

--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8542,7 +8542,17 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 			   struct netlink_ext_ack *extack, u32 flags,
 			   struct bpf_prog *prog)
 {
+	bool non_hw = !(flags & XDP_FLAGS_HW_MODE);
+	struct bpf_prog *prev_prog = NULL;
 	struct netdev_bpf xdp;
+	int err;
+
+	if (non_hw) {
+		prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op,
+							   XDP_QUERY_PROG));
+		if (IS_ERR(prev_prog))
+			prev_prog = NULL;
+	}

 	memset(&xdp, 0, sizeof(xdp));
 	if (flags & XDP_FLAGS_HW_MODE)
@@ -8553,7 +8563,14 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 	xdp.flags = flags;
 	xdp.prog = prog;

-	return bpf_op(dev, &xdp);
+	err = bpf_op(dev, &xdp);
+	if (!err && non_hw)
+		bpf_prog_change_xdp(prev_prog, prog);
+
+	if (prev_prog)
+		bpf_prog_put(prev_prog);
+
+	return err;
 }

 static void dev_xdp_uninstall(struct net_device *dev)

--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8940,3 +8940,11 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
 const struct bpf_prog_ops sk_reuseport_prog_ops = {
 };
 #endif /* CONFIG_INET */
+
+DEFINE_BPF_DISPATCHER(bpf_dispatcher_xdp)
+
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+{
+	bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(bpf_dispatcher_xdp),
+				   prev_prog, prog);
+}
--- a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+void test_xdp_perf(void)
+{
+	const char *file = "./xdp_dummy.o";
+	__u32 duration, retval, size;
+	struct bpf_object *obj;
+	char in[128], out[128];
+	int err, prog_fd;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	err = bpf_prog_test_run(prog_fd, 1000000, &in[0], 128,
+				out, &size, &retval, &duration);
+
+	CHECK(err || retval != XDP_PASS || size != 128,
+	      "xdp-perf",
+	      "err %d errno %d retval %d size %d\n",
+	      err, errno, retval, size);
+
+	bpf_object__close(obj);
+}