1. 19 Nov, 2019 10 commits
    • Andrii Nakryiko's avatar
      libbpf: Fix call relocation offset calculation bug · a0d7da26
      Andrii Nakryiko authored
      When relocating subprogram call, libbpf doesn't take into account
      relo->text_off, which comes from symbol's value. This generally works fine for
      subprograms implemented as static functions, but breaks for global functions.
      
      Taking a simplified test_pkt_access.c as an example:
      
      __attribute__ ((noinline))
      static int test_pkt_access_subprog1(volatile struct __sk_buff *skb)
      {
              return skb->len * 2;
      }
      
      __attribute__ ((noinline))
      static int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb)
      {
              return skb->len + val;
      }
      
      SEC("classifier/test_pkt_access")
      int test_pkt_access(struct __sk_buff *skb)
      {
              if (test_pkt_access_subprog1(skb) != skb->len * 2)
                      return TC_ACT_SHOT;
              if (test_pkt_access_subprog2(2, skb) != skb->len + 2)
                      return TC_ACT_SHOT;
              return TC_ACT_UNSPEC;
      }
      
      When compiled, we get two relocations, pointing to '.text' symbol. .text has
      st_value set to 0 (it points to the beginning of .text section):
      
      0000000000000008  000000050000000a R_BPF_64_32            0000000000000000 .text
      0000000000000040  000000050000000a R_BPF_64_32            0000000000000000 .text
      
      test_pkt_access_subprog1 and test_pkt_access_subprog2 offsets (targets of two
      calls) are encoded within call instruction's imm32 part as -1 and 2,
      respectively:
      
      0000000000000000 test_pkt_access_subprog1:
             0:       61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0)
             1:       64 00 00 00 01 00 00 00 w0 <<= 1
             2:       95 00 00 00 00 00 00 00 exit
      
      0000000000000018 test_pkt_access_subprog2:
             3:       61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0)
             4:       04 00 00 00 02 00 00 00 w0 += 2
             5:       95 00 00 00 00 00 00 00 exit
      
      0000000000000000 test_pkt_access:
             0:       bf 16 00 00 00 00 00 00 r6 = r1
      ===>   1:       85 10 00 00 ff ff ff ff call -1
             2:       bc 01 00 00 00 00 00 00 w1 = w0
             3:       b4 00 00 00 02 00 00 00 w0 = 2
             4:       61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0)
             5:       64 02 00 00 01 00 00 00 w2 <<= 1
             6:       5e 21 08 00 00 00 00 00 if w1 != w2 goto +8 <LBB0_3>
             7:       bf 61 00 00 00 00 00 00 r1 = r6
      ===>   8:       85 10 00 00 02 00 00 00 call 2
             9:       bc 01 00 00 00 00 00 00 w1 = w0
            10:       61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0)
            11:       04 02 00 00 02 00 00 00 w2 += 2
            12:       b4 00 00 00 ff ff ff ff w0 = -1
            13:       1e 21 01 00 00 00 00 00 if w1 == w2 goto +1 <LBB0_3>
            14:       b4 00 00 00 02 00 00 00 w0 = 2
      0000000000000078 LBB0_3:
            15:       95 00 00 00 00 00 00 00 exit
      
      Now, if we compile example with global functions, the setup changes.
      Relocations are now against specifically test_pkt_access_subprog1 and
      test_pkt_access_subprog2 symbols, with test_pkt_access_subprog2 pointing 24
      bytes into its respective section (.text), i.e., 3 instructions in:
      
      0000000000000008  000000070000000a R_BPF_64_32            0000000000000000 test_pkt_access_subprog1
      0000000000000048  000000080000000a R_BPF_64_32            0000000000000018 test_pkt_access_subprog2
      
      Calls instructions now encode offsets relative to function symbols and are both
      set ot -1:
      
      0000000000000000 test_pkt_access_subprog1:
             0:       61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0)
             1:       64 00 00 00 01 00 00 00 w0 <<= 1
             2:       95 00 00 00 00 00 00 00 exit
      
      0000000000000018 test_pkt_access_subprog2:
             3:       61 20 00 00 00 00 00 00 r0 = *(u32 *)(r2 + 0)
             4:       0c 10 00 00 00 00 00 00 w0 += w1
             5:       95 00 00 00 00 00 00 00 exit
      
      0000000000000000 test_pkt_access:
             0:       bf 16 00 00 00 00 00 00 r6 = r1
      ===>   1:       85 10 00 00 ff ff ff ff call -1
             2:       bc 01 00 00 00 00 00 00 w1 = w0
             3:       b4 00 00 00 02 00 00 00 w0 = 2
             4:       61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0)
             5:       64 02 00 00 01 00 00 00 w2 <<= 1
             6:       5e 21 09 00 00 00 00 00 if w1 != w2 goto +9 <LBB2_3>
             7:       b4 01 00 00 02 00 00 00 w1 = 2
             8:       bf 62 00 00 00 00 00 00 r2 = r6
      ===>   9:       85 10 00 00 ff ff ff ff call -1
            10:       bc 01 00 00 00 00 00 00 w1 = w0
            11:       61 62 00 00 00 00 00 00 r2 = *(u32 *)(r6 + 0)
            12:       04 02 00 00 02 00 00 00 w2 += 2
            13:       b4 00 00 00 ff ff ff ff w0 = -1
            14:       1e 21 01 00 00 00 00 00 if w1 == w2 goto +1 <LBB2_3>
            15:       b4 00 00 00 02 00 00 00 w0 = 2
      0000000000000080 LBB2_3:
            16:       95 00 00 00 00 00 00 00 exit
      
      Thus the right formula to calculate target call offset after relocation should
      take into account relocation's target symbol value (offset within section),
      call instruction's imm32 offset, and (subtracting, to get relative instruction
      offset) instruction index of call instruction itself. All that is shifted by
      number of instructions in main program, given all sub-programs are copied over
      after main program.
      
      Convert few selftests relying on bpf-to-bpf calls to use global functions
      instead of static ones.
      
      Fixes: 48cca7e4 ("libbpf: add support for bpf_call")
      Reported-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Acked-by: default avatarYonghong Song <yhs@fb.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191119224447.3781271-1-andriin@fb.com
      a0d7da26
    • Luigi Rizzo's avatar
      net-af_xdp: Use correct number of channels from ethtool · 3de88c91
      Luigi Rizzo authored
      Drivers use different fields to report the number of channels, so take
      the maximum of all data channels (rx, tx, combined) when determining the
      size of the xsk map. The current code used only 'combined' which was set
      to 0 in some drivers e.g. mlx4.
      
      Tested: compiled and run xdpsock -q 3 -r -S on mlx4
      Signed-off-by: default avatarLuigi Rizzo <lrizzo@google.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Reviewed-by: default avatarJakub Kicinski <jakub.kicinski@netronome.com>
      Acked-by: default avatarMagnus Karlsson <magnus.karlsson@intel.com>
      Link: https://lore.kernel.org/bpf/20191119001951.92930-1-lrizzo@google.com
      3de88c91
    • Alexei Starovoitov's avatar
      Merge branch 'remove-jited-size-limits' · 0424c5a4
      Alexei Starovoitov authored
      Ilya Leoshkevich says:
      
      ====================
      This patch series introduces usage of relative long jumps and loads in
      order to lift 64/512k size limits on JITed BPF programs on s390.
      
      Patch 1 introduces long relative branches.
      Patch 2 changes the way literal pool is arranged in order to be
      compatible with long relative loads.
      Patch 3 changes the way literal pool base register is loaded for large
      programs.
      Patch 4 replaces regular loads with long relative loads where they are
      totally superior.
      Patch 5 introduces long relative loads as an alternative way to load
      constants in large programs. Regular loads are kept and still used for
      small programs.
      Patch 6 removes the size limit check.
      ====================
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      0424c5a4
    • Ilya Leoshkevich's avatar
      s390/bpf: Remove JITed image size limitations · d1242b10
      Ilya Leoshkevich authored
      Now that jump and long displacement ranges are no longer a problem,
      remove the limit on JITed image size. In practice it's still limited by
      2G, but with verifier allowing "only" 1M instructions, it's not an
      issue.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-7-iii@linux.ibm.com
      d1242b10
    • Ilya Leoshkevich's avatar
      s390/bpf: Use lg(f)rl when long displacement cannot be used · b25c57b6
      Ilya Leoshkevich authored
      If literal pool grows past 524287 mark, it's no longer possible to use
      long displacement to reference literal pool entries. In JIT setting
      maintaining multiple literal pool registers is next to impossible, since
      we operate on one instruction at a time.
      
      Therefore, fall back to loading literal pool entry using PC-relative
      addressing, and then using a register-register form of the following
      machine instruction.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-6-iii@linux.ibm.com
      b25c57b6
    • Ilya Leoshkevich's avatar
      s390/bpf: Use lgrl instead of lg where possible · 451e448f
      Ilya Leoshkevich authored
      lg and lgrl have the same performance characteristics, but the former
      requires a base register and is subject to long displacement range
      limits, while the latter does not. Therefore, lgrl is totally superior
      to lg and should be used instead whenever possible.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-5-iii@linux.ibm.com
      451e448f
    • Ilya Leoshkevich's avatar
      s390/bpf: Load literal pool register using larl · c1aff568
      Ilya Leoshkevich authored
      Currently literal pool register is loaded using basr, which makes it
      point not to the beginning of the literal pool, but rather to the next
      instruction. In case JITed code is larger than 512k, this renders
      literal pool register absolutely useless due to long displacement range
      restrictions.
      
      The solution is to use larl to make literal pool register point to the
      very beginning of the literal pool. This makes it always possible to
      address 512k worth of literal pool entries using long displacement.
      
      However, for short programs, in which the entire literal pool is covered
      by basr-generated base, it is still beneficial to use basr, since it is
      4 bytes shorter than larl.
      
      Detect situations when basr-generated base does not cover the entire
      literal pool, and in such cases use larl instead.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-4-iii@linux.ibm.com
      c1aff568
    • Ilya Leoshkevich's avatar
      s390/bpf: Align literal pool entries · e0491f64
      Ilya Leoshkevich authored
      When literal pool size exceeds 512k, it's no longer possible to
      reference all the entries in it using a single base register and long
      displacement. Therefore, PC-relative lgfrl and lgrl instructions need to
      be used.
      
      Unfortunately, they require their arguments to be aligned to 4- and
      8-byte boundaries respectively. This generates certain overhead due to
      necessary padding bytes. Grouping 4- and 8-byte entries together reduces
      the maximum overhead to 6 bytes (2 for aligning 4-byte entries and 4 for
      aligning 8-byte entries).
      
      While in theory it is possible to detect whether or not alignment is
      needed by comparing the literal pool size with 512k, in practice this
      leads to having two ways of emitting constants, making the code more
      complicated.
      
      Prefer code simplicity over trivial size saving, and always group and
      align literal pool entries.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-3-iii@linux.ibm.com
      e0491f64
    • Ilya Leoshkevich's avatar
      s390/bpf: Use relative long branches · 4e9b4a68
      Ilya Leoshkevich authored
      Currently maximum JITed code size is limited to 64k, because JIT can
      emit only relative short branches, whose range is limited by 64k in both
      directions.
      
      Teach JIT to use relative long branches. There are no compare+branch
      relative long instructions, so using relative long branches consumes
      more space due to having to having to emit an explicit comparison
      instruction. Therefore do this only when relative short branch is not
      enough.
      Signed-off-by: default avatarIlya Leoshkevich <iii@linux.ibm.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118180340.68373-2-iii@linux.ibm.com
      4e9b4a68
    • Colin Ian King's avatar
      bpf: Fix memory leak on object 'data' · a25ecd9d
      Colin Ian King authored
      The error return path on when bpf_fentry_test* tests fail does not
      kfree 'data'. Fix this by adding the missing kfree.
      
      Addresses-Coverity: ("Resource leak")
      
      Fixes: faeb2dce ("bpf: Add kernel test functions for fentry testing")
      Signed-off-by: default avatarColin Ian King <colin.king@canonical.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Link: https://lore.kernel.org/bpf/20191118114059.37287-1-colin.king@canonical.com
      a25ecd9d
  2. 18 Nov, 2019 9 commits
    • Yonghong Song's avatar
      selftests, bpf: Workaround an alu32 sub-register spilling issue · 2ea2612b
      Yonghong Song authored
      Currently, with latest llvm trunk, selftest test_progs failed obj
      file test_seg6_loop.o with the following error in verifier:
      
        infinite loop detected at insn 76
      
      The byte code sequence looks like below, and noted that alu32 has been
      turned off by default for better generated codes in general:
      
            48:       w3 = 100
            49:       *(u32 *)(r10 - 68) = r3
            ...
        ;             if (tlv.type == SR6_TLV_PADDING) {
            76:       if w3 == 5 goto -18 <LBB0_19>
            ...
            85:       r1 = *(u32 *)(r10 - 68)
        ;     for (int i = 0; i < 100; i++) {
            86:       w1 += -1
            87:       if w1 == 0 goto +5 <LBB0_20>
            88:       *(u32 *)(r10 - 68) = r1
      
      The main reason for verification failure is due to partial spills at
      r10 - 68 for induction variable "i".
      
      Current verifier only handles spills with 8-byte values. The above 4-byte
      value spill to stack is treated to STACK_MISC and its content is not
      saved. For the above example:
      
          w3 = 100
            R3_w=inv100 fp-64_w=inv1086626730498
          *(u32 *)(r10 - 68) = r3
            R3_w=inv100 fp-64_w=inv1086626730498
          ...
          r1 = *(u32 *)(r10 - 68)
            R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
            fp-64=inv1086626730498
      
      To resolve this issue, verifier needs to be extended to track sub-registers
      in spilling, or llvm needs to enhanced to prevent sub-register spilling
      in register allocation phase. The former will increase verifier complexity
      and the latter will need some llvm "hacking".
      
      Let us workaround this issue by declaring the induction variable as "long"
      type so spilling will happen at non sub-register level. We can revisit this
      later if sub-register spilling causes similar or other verification issues.
      Signed-off-by: default avatarYonghong Song <yhs@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20191117214036.1309510-1-yhs@fb.com
      2ea2612b
    • Jiri Benc's avatar
      selftests, bpf: Fix test_tc_tunnel hanging · 3b054b71
      Jiri Benc authored
      When run_kselftests.sh is run, it hangs after test_tc_tunnel.sh. The reason
      is test_tc_tunnel.sh ensures the server ('nc -l') is run all the time,
      starting it again every time it is expected to terminate. The exception is
      the final client_connect: the server is not started anymore, which ensures
      no process is kept running after the test is finished.
      
      For a sit test, though, the script is terminated prematurely without the
      final client_connect and the 'nc' process keeps running. This in turn causes
      the run_one function in kselftest/runner.sh to hang forever, waiting for the
      runaway process to finish.
      
      Ensure a remaining server is terminated on cleanup.
      
      Fixes: f6ad6acc ("selftests/bpf: expand test_tc_tunnel with SIT encap")
      Signed-off-by: default avatarJiri Benc <jbenc@redhat.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarWillem de Bruijn <willemb@google.com>
      Link: https://lore.kernel.org/bpf/60919291657a9ee89c708d8aababc28ebe1420be.1573821780.git.jbenc@redhat.com
      3b054b71
    • Jiri Benc's avatar
      selftests, bpf: xdping is not meant to be run standalone · 56bf877a
      Jiri Benc authored
      The actual test to run is test_xdping.sh, which is already in TEST_PROGS.
      The xdping program alone is not runnable with 'make run_tests', it
      immediatelly fails due to missing arguments.
      
      Move xdping to TEST_GEN_PROGS_EXTENDED in order to be built but not run.
      
      Fixes: cd538502 ("selftests/bpf: measure RTT from xdp using xdping")
      Signed-off-by: default avatarJiri Benc <jbenc@redhat.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Reviewed-by: default avatarAlan Maguire <alan.maguire@oracle.com>
      Acked-by: default avatarToke Høiland-Jørgensen <toke@redhat.com>
      Link: https://lore.kernel.org/bpf/4365c81198f62521344c2215909634407184387e.1573821726.git.jbenc@redhat.com
      56bf877a
    • Daniel Borkmann's avatar
      Merge branch 'bpf-array-mmap' · b97e12e5
      Daniel Borkmann authored
      Andrii Nakryiko says:
      
      ====================
      This patch set adds ability to memory-map BPF array maps (single- and
      multi-element). The primary use case is memory-mapping BPF array maps, created
      to back global data variables, created by libbpf implicitly. This allows for
      much better usability, along with avoiding syscalls to read or update data
      completely.
      
      Due to memory-mapping requirements, BPF array map that is supposed to be
      memory-mapped, has to be created with special BPF_F_MMAPABLE attribute, which
      triggers slightly different memory allocation strategy internally. See
      patch 1 for details.
      
      Libbpf is extended to detect kernel support for this flag, and if supported,
      will specify it for all global data maps automatically.
      
      Patch #1 refactors bpf_map_inc() and converts bpf_map's refcnt to atomic64_t
      to make refcounting never fail. Patch #2 does similar refactoring for
      bpf_prog_add()/bpf_prog_inc().
      
      v5->v6:
      - add back uref counting (Daniel);
      
      v4->v5:
      - change bpf_prog's refcnt to atomic64_t (Daniel);
      
      v3->v4:
      - add mmap's open() callback to fix refcounting (Johannes);
      - switch to remap_vmalloc_pages() instead of custom fault handler (Johannes);
      - converted bpf_map's refcnt/usercnt into atomic64_t;
      - provide default bpf_map_default_vmops handling open/close properly;
      
      v2->v3:
      - change allocation strategy to avoid extra pointer dereference (Jakub);
      
      v1->v2:
      - fix map lookup code generation for BPF_F_MMAPABLE case;
      - prevent BPF_F_MMAPABLE flag for all but plain array map type;
      - centralize ref-counting in generic bpf_map_mmap();
      - don't use uref counting (Alexei);
      - use vfree() directly;
      - print flags with %x (Song);
      - extend tests to verify bpf_map_{lookup,update}_elem() logic as well.
      ====================
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      b97e12e5
    • Andrii Nakryiko's avatar
      selftests/bpf: Add BPF_TYPE_MAP_ARRAY mmap() tests · 5051b384
      Andrii Nakryiko authored
      Add selftests validating mmap()-ing BPF array maps: both single-element and
      multi-element ones. Check that plain bpf_map_update_elem() and
      bpf_map_lookup_elem() work correctly with memory-mapped array. Also convert
      CO-RE relocation tests to use memory-mapped views of global data.
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarSong Liu <songliubraving@fb.com>
      Link: https://lore.kernel.org/bpf/20191117172806.2195367-6-andriin@fb.com
      5051b384
    • Andrii Nakryiko's avatar
      libbpf: Make global data internal arrays mmap()-able, if possible · 7fe74b43
      Andrii Nakryiko authored
      Add detection of BPF_F_MMAPABLE flag support for arrays and add it as an extra
      flag to internal global data maps, if supported by kernel. This allows users
      to memory-map global data and use it without BPF map operations, greatly
      simplifying user experience.
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarSong Liu <songliubraving@fb.com>
      Acked-by: default avatarJohn Fastabend <john.fastabend@gmail.com>
      Link: https://lore.kernel.org/bpf/20191117172806.2195367-5-andriin@fb.com
      7fe74b43
    • Andrii Nakryiko's avatar
      bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY · fc970227
      Andrii Nakryiko authored
      Add ability to memory-map contents of BPF array map. This is extremely useful
      for working with BPF global data from userspace programs. It allows to avoid
      typical bpf_map_{lookup,update}_elem operations, improving both performance
      and usability.
      
      There had to be special considerations for map freezing, to avoid having
      writable memory view into a frozen map. To solve this issue, map freezing and
      mmap-ing is happening under mutex now:
        - if map is already frozen, no writable mapping is allowed;
        - if map has writable memory mappings active (accounted in map->writecnt),
          map freezing will keep failing with -EBUSY;
        - once number of writable memory mappings drops to zero, map freezing can be
          performed again.
      
      Only non-per-CPU plain arrays are supported right now. Maps with spinlocks
      can't be memory mapped either.
      
      For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc()
      to be mmap()'able. We also need to make sure that array data memory is
      page-sized and page-aligned, so we over-allocate memory in such a way that
      struct bpf_array is at the end of a single page of memory with array->value
      being aligned with the start of the second page. On deallocation we need to
      accomodate this memory arrangement to free vmalloc()'ed memory correctly.
      
      One important consideration regarding how memory-mapping subsystem functions.
      Memory-mapping subsystem provides few optional callbacks, among them open()
      and close().  close() is called for each memory region that is unmapped, so
      that users can decrease their reference counters and free up resources, if
      necessary. open() is *almost* symmetrical: it's called for each memory region
      that is being mapped, **except** the very first one. So bpf_map_mmap does
      initial refcnt bump, while open() will do any extra ones after that. Thus
      number of close() calls is equal to number of open() calls plus one more.
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarSong Liu <songliubraving@fb.com>
      Acked-by: default avatarJohn Fastabend <john.fastabend@gmail.com>
      Acked-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
      Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com
      fc970227
    • Andrii Nakryiko's avatar
      bpf: Convert bpf_prog refcnt to atomic64_t · 85192dbf
      Andrii Nakryiko authored
      Similarly to bpf_map's refcnt/usercnt, convert bpf_prog's refcnt to atomic64
      and remove artificial 32k limit. This allows to make bpf_prog's refcounting
      non-failing, simplifying logic of users of bpf_prog_add/bpf_prog_inc.
      
      Validated compilation by running allyesconfig kernel build.
      Suggested-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Link: https://lore.kernel.org/bpf/20191117172806.2195367-3-andriin@fb.com
      85192dbf
    • Andrii Nakryiko's avatar
      bpf: Switch bpf_map ref counter to atomic64_t so bpf_map_inc() never fails · 1e0bd5a0
      Andrii Nakryiko authored
      92117d84 ("bpf: fix refcnt overflow") turned refcounting of bpf_map into
      potentially failing operation, when refcount reaches BPF_MAX_REFCNT limit
      (32k). Due to using 32-bit counter, it's possible in practice to overflow
      refcounter and make it wrap around to 0, causing erroneous map free, while
      there are still references to it, causing use-after-free problems.
      
      But having a failing refcounting operations are problematic in some cases. One
      example is mmap() interface. After establishing initial memory-mapping, user
      is allowed to arbitrarily map/remap/unmap parts of mapped memory, arbitrarily
      splitting it into multiple non-contiguous regions. All this happening without
      any control from the users of mmap subsystem. Rather mmap subsystem sends
      notifications to original creator of memory mapping through open/close
      callbacks, which are optionally specified during initial memory mapping
      creation. These callbacks are used to maintain accurate refcount for bpf_map
      (see next patch in this series). The problem is that open() callback is not
      supposed to fail, because memory-mapped resource is set up and properly
      referenced. This is posing a problem for using memory-mapping with BPF maps.
      
      One solution to this is to maintain separate refcount for just memory-mappings
      and do single bpf_map_inc/bpf_map_put when it goes from/to zero, respectively.
      There are similar use cases in current work on tcp-bpf, necessitating extra
      counter as well. This seems like a rather unfortunate and ugly solution that
      doesn't scale well to various new use cases.
      
      Another approach to solve this is to use non-failing refcount_t type, which
      uses 32-bit counter internally, but, once reaching overflow state at UINT_MAX,
      stays there. This utlimately causes memory leak, but prevents use after free.
      
      But given refcounting is not the most performance-critical operation with BPF
      maps (it's not used from running BPF program code), we can also just switch to
      64-bit counter that can't overflow in practice, potentially disadvantaging
      32-bit platforms a tiny bit. This simplifies semantics and allows above
      described scenarios to not worry about failing refcount increment operation.
      
      In terms of struct bpf_map size, we are still good and use the same amount of
      space:
      
      BEFORE (3 cache lines, 8 bytes of padding at the end):
      struct bpf_map {
      	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
      	struct bpf_map *           inner_map_meta;       /*     8     8 */
      	void *                     security;             /*    16     8 */
      	enum bpf_map_type  map_type;                     /*    24     4 */
      	u32                        key_size;             /*    28     4 */
      	u32                        value_size;           /*    32     4 */
      	u32                        max_entries;          /*    36     4 */
      	u32                        map_flags;            /*    40     4 */
      	int                        spin_lock_off;        /*    44     4 */
      	u32                        id;                   /*    48     4 */
      	int                        numa_node;            /*    52     4 */
      	u32                        btf_key_type_id;      /*    56     4 */
      	u32                        btf_value_type_id;    /*    60     4 */
      	/* --- cacheline 1 boundary (64 bytes) --- */
      	struct btf *               btf;                  /*    64     8 */
      	struct bpf_map_memory memory;                    /*    72    16 */
      	bool                       unpriv_array;         /*    88     1 */
      	bool                       frozen;               /*    89     1 */
      
      	/* XXX 38 bytes hole, try to pack */
      
      	/* --- cacheline 2 boundary (128 bytes) --- */
      	atomic_t                   refcnt __attribute__((__aligned__(64))); /*   128     4 */
      	atomic_t                   usercnt;              /*   132     4 */
      	struct work_struct work;                         /*   136    32 */
      	char                       name[16];             /*   168    16 */
      
      	/* size: 192, cachelines: 3, members: 21 */
      	/* sum members: 146, holes: 1, sum holes: 38 */
      	/* padding: 8 */
      	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
      } __attribute__((__aligned__(64)));
      
      AFTER (same 3 cache lines, no extra padding now):
      struct bpf_map {
      	const struct bpf_map_ops  * ops __attribute__((__aligned__(64))); /*     0     8 */
      	struct bpf_map *           inner_map_meta;       /*     8     8 */
      	void *                     security;             /*    16     8 */
      	enum bpf_map_type  map_type;                     /*    24     4 */
      	u32                        key_size;             /*    28     4 */
      	u32                        value_size;           /*    32     4 */
      	u32                        max_entries;          /*    36     4 */
      	u32                        map_flags;            /*    40     4 */
      	int                        spin_lock_off;        /*    44     4 */
      	u32                        id;                   /*    48     4 */
      	int                        numa_node;            /*    52     4 */
      	u32                        btf_key_type_id;      /*    56     4 */
      	u32                        btf_value_type_id;    /*    60     4 */
      	/* --- cacheline 1 boundary (64 bytes) --- */
      	struct btf *               btf;                  /*    64     8 */
      	struct bpf_map_memory memory;                    /*    72    16 */
      	bool                       unpriv_array;         /*    88     1 */
      	bool                       frozen;               /*    89     1 */
      
      	/* XXX 38 bytes hole, try to pack */
      
      	/* --- cacheline 2 boundary (128 bytes) --- */
      	atomic64_t                 refcnt __attribute__((__aligned__(64))); /*   128     8 */
      	atomic64_t                 usercnt;              /*   136     8 */
      	struct work_struct work;                         /*   144    32 */
      	char                       name[16];             /*   176    16 */
      
      	/* size: 192, cachelines: 3, members: 21 */
      	/* sum members: 154, holes: 1, sum holes: 38 */
      	/* forced alignments: 2, forced holes: 1, sum forced holes: 38 */
      } __attribute__((__aligned__(64)));
      
      This patch, while modifying all users of bpf_map_inc, also cleans up its
      interface to match bpf_map_put with separate operations for bpf_map_inc and
      bpf_map_inc_with_uref (to match bpf_map_put and bpf_map_put_with_uref,
      respectively). Also, given there are no users of bpf_map_inc_not_zero
      specifying uref=true, remove uref flag and default to uref=false internally.
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarSong Liu <songliubraving@fb.com>
      Link: https://lore.kernel.org/bpf/20191117172806.2195367-2-andriin@fb.com
      1e0bd5a0
  3. 15 Nov, 2019 21 commits