1. 11 Sep, 2020 4 commits
    • Lorenz Bauer's avatar
      bpf: Plug hole in struct bpf_sk_lookup_kern · d66423fb
      Lorenz Bauer authored
      As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes.
      This leads to suboptimal instructions being generated (IPv4, x86):
      
          1372                    struct bpf_sk_lookup_kern ctx = {
             0xffffffff81b87f30 <+624>:   xor    %eax,%eax
             0xffffffff81b87f32 <+626>:   mov    $0x6,%ecx
             0xffffffff81b87f37 <+631>:   lea    0x90(%rsp),%rdi
             0xffffffff81b87f3f <+639>:   movl   $0x110002,0x88(%rsp)
             0xffffffff81b87f4a <+650>:   rep stos %rax,%es:(%rdi)
             0xffffffff81b87f4d <+653>:   mov    0x8(%rsp),%eax
             0xffffffff81b87f51 <+657>:   mov    %r13d,0x90(%rsp)
             0xffffffff81b87f59 <+665>:   incl   %gs:0x7e4970a0(%rip)
             0xffffffff81b87f60 <+672>:   mov    %eax,0x8c(%rsp)
             0xffffffff81b87f67 <+679>:   movzwl 0x10(%rsp),%eax
             0xffffffff81b87f6c <+684>:   mov    %ax,0xa8(%rsp)
             0xffffffff81b87f74 <+692>:   movzwl 0x38(%rsp),%eax
             0xffffffff81b87f79 <+697>:   mov    %ax,0xaa(%rsp)
      
      Fix this by moving around sport and dport. pahole confirms there
      are no more holes:
      
          struct bpf_sk_lookup_kern {
              u16                        family;       /*     0     2 */
              u16                        protocol;     /*     2     2 */
              __be16                     sport;        /*     4     2 */
              u16                        dport;        /*     6     2 */
              struct {
                      __be32             saddr;        /*     8     4 */
                      __be32             daddr;        /*    12     4 */
              } v4;                                    /*     8     8 */
              struct {
                      const struct in6_addr  * saddr;  /*    16     8 */
                      const struct in6_addr  * daddr;  /*    24     8 */
              } v6;                                    /*    16    16 */
              struct sock *              selected_sk;  /*    32     8 */
              bool                       no_reuseport; /*    40     1 */
      
              /* size: 48, cachelines: 1, members: 8 */
              /* padding: 7 */
              /* last cacheline: 48 bytes */
          };
      
      The assembly also doesn't contain the pesky rep stos anymore:
      
          1372                    struct bpf_sk_lookup_kern ctx = {
             0xffffffff81b87f60 <+624>:   movzwl 0x10(%rsp),%eax
             0xffffffff81b87f65 <+629>:   movq   $0x0,0xa8(%rsp)
             0xffffffff81b87f71 <+641>:   movq   $0x0,0xb0(%rsp)
             0xffffffff81b87f7d <+653>:   mov    %ax,0x9c(%rsp)
             0xffffffff81b87f85 <+661>:   movzwl 0x38(%rsp),%eax
             0xffffffff81b87f8a <+666>:   movq   $0x0,0xb8(%rsp)
             0xffffffff81b87f96 <+678>:   mov    %ax,0x9e(%rsp)
             0xffffffff81b87f9e <+686>:   mov    0x8(%rsp),%eax
             0xffffffff81b87fa2 <+690>:   movq   $0x0,0xc0(%rsp)
             0xffffffff81b87fae <+702>:   movl   $0x110002,0x98(%rsp)
             0xffffffff81b87fb9 <+713>:   mov    %eax,0xa0(%rsp)
             0xffffffff81b87fc0 <+720>:   mov    %r13d,0xa4(%rsp)
      
      1: https://lore.kernel.org/bpf/CAADnVQKE6y9h2fwX6OS837v-Uf+aBXnT_JXiN_bbo2gitZQ3tA@mail.gmail.com/
      
      Fixes: e9ddbb77 ("bpf: Introduce SK_LOOKUP program type with a dedicated attach point")
      Suggested-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Signed-off-by: default avatarLorenz Bauer <lmb@cloudflare.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarJesper Dangaard Brouer <brouer@redhat.com>
      Link: https://lore.kernel.org/bpf/20200910110248.198326-1-lmb@cloudflare.com
      d66423fb
    • Quentin Monnet's avatar
      tools: bpftool: Add "inner_map" to "bpftool map create" outer maps · e3b9626f
      Quentin Monnet authored
      There is no support for creating maps of types array-of-map or
      hash-of-map in bpftool. This is because the kernel needs an inner_map_fd
      to collect metadata on the inner maps to be supported by the new map,
      but bpftool does not provide a way to pass this file descriptor.
      
      Add a new optional "inner_map" keyword that can be used to pass a
      reference to a map, retrieve a fd to that map, and pass it as the
      inner_map_fd.
      
      Add related documentation and bash completion. Note that we can
      reference the inner map by its name, meaning we can have several times
      the keyword "name" with different meanings (mandatory outer map name,
      and possibly a name to use to find the inner_map_fd). The bash
      completion will offer it just once, and will not suggest "name" on the
      following command:
      
          # bpftool map create /sys/fs/bpf/my_outer_map type hash_of_maps \
              inner_map name my_inner_map [TAB]
      
      Fixing that specific case seems too convoluted. Completion will work as
      expected, however, if the outer map name comes first and the "inner_map
      name ..." is passed second.
      Signed-off-by: default avatarQuentin Monnet <quentin@isovalent.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200910102652.10509-4-quentin@isovalent.com
      e3b9626f
    • Quentin Monnet's avatar
      tools: bpftool: Keep errors for map-of-map dumps if distinct from ENOENT · 86233ce3
      Quentin Monnet authored
      When dumping outer maps or prog_array maps, and on lookup failure,
      bpftool simply skips the entry with no error message. This is because
      the kernel returns non-zero when no value is found for the provided key,
      which frequently happens for those maps if they have not been filled.
      
      When such a case occurs, errno is set to ENOENT. It seems unlikely we
      could receive other error codes at this stage (we successfully retrieved
      map info just before), but to be on the safe side, let's skip the entry
      only if errno was ENOENT, and not for the other errors.
      
      v3: New patch
      Signed-off-by: default avatarQuentin Monnet <quentin@isovalent.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200910102652.10509-3-quentin@isovalent.com
      86233ce3
    • Quentin Monnet's avatar
      tools: bpftool: Clean up function to dump map entry · a20693b6
      Quentin Monnet authored
      The function used to dump a map entry in bpftool is a bit difficult to
      follow, as a consequence of earlier refactorings. There is a variable
      ("num_elems") which does not appear to be necessary, and the error
      handling would look cleaner if moved to its own function. Let's clean it
      up. No functional change.
      
      v2:
      - v1 was erroneously removing the check on fd maps in an attempt to get
        support for outer map dumps. This is already working. Instead, v2
        focuses on cleaning up the dump_map_elem() function, to avoid
        similar confusion in the future.
      Signed-off-by: default avatarQuentin Monnet <quentin@isovalent.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200910102652.10509-2-quentin@isovalent.com
      a20693b6
  2. 10 Sep, 2020 8 commits
  3. 09 Sep, 2020 4 commits
    • Andrii Nakryiko's avatar
      perf: Stop using deprecated bpf_program__title() · 8081ede1
      Andrii Nakryiko authored
      Switch from deprecated bpf_program__title() API to
      bpf_program__section_name(). Also drop unnecessary error checks because
      neither bpf_program__title() nor bpf_program__section_name() can fail or
      return NULL.
      
      Fixes: 52109584 ("libbpf: Deprecate notion of BPF program "title" in favor of "section name"")
      Signed-off-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Reviewed-by: default avatarTobias Klauser <tklauser@distanz.ch>
      Acked-by: default avatarJiri Olsa <jolsa@redhat.com>
      Link: https://lore.kernel.org/bpf/20200908180127.1249-1-andriin@fb.com
      8081ede1
    • Yonghong Song's avatar
      selftests/bpf: Fix test_sysctl_loop{1, 2} failure due to clang change · 7fb5eefd
      Yonghong Song authored
      Andrii reported that with the latest clang, when building selftests, we have
      errors like:
        error: progs/test_sysctl_loop1.c:23:16: in function sysctl_tcp_mem i32 (%struct.bpf_sysctl*):
        Looks like the BPF stack limit of 512 bytes is exceeded.
        Please move large on stack variables into BPF per-cpu array map.
      
      The error is triggered by the following LLVM patch:
        https://reviews.llvm.org/D87134
      
      For example, the following code is from test_sysctl_loop1.c:
        static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx)
        {
          volatile char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string";
          ...
        }
      Without the above LLVM patch, the compiler did optimization to load the string
      (59 bytes long) with 7 64bit loads, 1 8bit load and 1 16bit load,
      occupying 64 byte stack size.
      
      With the above LLVM patch, the compiler only uses 8bit loads, but subregister is 32bit.
      So stack requirements become 4 * 59 = 236 bytes. Together with other stuff on
      the stack, total stack size exceeds 512 bytes, hence compiler complains and quits.
      
      To fix the issue, removing the "volatile" keyword or changing "volatile" to
      "const"/"static const" does not work: the string is put in the .rodata.str1.1 section,
      which libbpf does not process, so it errors out with
        libbpf: elf: skipping unrecognized data section(6) .rodata.str1.1
        libbpf: prog 'sysctl_tcp_mem': bad map relo against '.L__const.is_tcp_mem.tcp_mem_name'
                in section '.rodata.str1.1'
      
      Defining the string const as global variable can fix the issue as it puts the string constant
      in '.rodata' section which is recognized by libbpf. In the future, when libbpf can process
      '.rodata.str*.*' properly, the global definition can be changed back to local definition.
      
      Defining tcp_mem_name as a global, however, triggered a verifier failure.
         ./test_progs -n 7/21
        libbpf: load bpf program failed: Permission denied
        libbpf: -- BEGIN DUMP LOG ---
        libbpf:
        invalid stack off=0 size=1
        verification time 6975 usec
        stack depth 160+64
        processed 889 insns (limit 1000000) max_states_per_insn 4 total_states
        14 peak_states 14 mark_read 10
      
        libbpf: -- END LOG --
        libbpf: failed to load program 'sysctl_tcp_mem'
        libbpf: failed to load object 'test_sysctl_loop2.o'
        test_bpf_verif_scale:FAIL:114
        #7/21 test_sysctl_loop2.o:FAIL
      This actually exposed a bpf program bug. In test_sysctl_loop{1,2}, we have code
      like
        const char tcp_mem_name[] = "<...long string...>";
        ...
        char name[64];
        ...
        for (i = 0; i < sizeof(tcp_mem_name); ++i)
            if (name[i] != tcp_mem_name[i])
                return 0;
      In the above code, if sizeof(tcp_mem_name) > 64, name[i] access may be
      out of bound. The sizeof(tcp_mem_name) is 59 for test_sysctl_loop1.c and
      79 for test_sysctl_loop2.c.
      
      Without promotion-to-global change, old compiler generates code where
      the overflowed stack access is actually filled with valid value, so hiding
      the bpf program bug. With promotion-to-global change, the code is different,
      more specifically, the previous loading constants to stack is gone, and
      "name" occupies stack[-64:0] and overflow access triggers a verifier error.
      To fix the issue, adjust "name" buffer size properly.
      Reported-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Signed-off-by: default avatarYonghong Song <yhs@fb.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200909171542.3673449-1-yhs@fb.com
      7fb5eefd
    • Yonghong Song's avatar
      selftests/bpf: Add test for map_ptr arithmetic · e6054fc1
      Yonghong Song authored
      Change selftest map_ptr_kern.c to disable inlining for
      one of the subtests, which would fail the test without the previous
      verifier change. Also added verifier tests for both
      "map_ptr += scalar" and "scalar += map_ptr" arithmetic.
      Signed-off-by: default avatarYonghong Song <yhs@fb.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200908175703.2463721-1-yhs@fb.com
      e6054fc1
    • Yonghong Song's avatar
      bpf: Permit map_ptr arithmetic with opcode add and offset 0 · 7c696732
      Yonghong Song authored
      Commit 41c48f3a ("bpf: Support access
      to bpf map fields") added support to access map fields
      with CORE support. For example,
      
                  struct bpf_map {
                          __u32 max_entries;
                  } __attribute__((preserve_access_index));
      
                  struct bpf_array {
                          struct bpf_map map;
                          __u32 elem_size;
                  } __attribute__((preserve_access_index));
      
                  struct {
                          __uint(type, BPF_MAP_TYPE_ARRAY);
                          __uint(max_entries, 4);
                          __type(key, __u32);
                          __type(value, __u32);
                  } m_array SEC(".maps");
      
                  SEC("cgroup_skb/egress")
                  int cg_skb(void *ctx)
                  {
                          struct bpf_array *array = (struct bpf_array *)&m_array;
      
                          /* .. array->map.max_entries .. */
                  }
      
      In kernel, bpf_htab has similar structure,
      
      	    struct bpf_htab {
      		    struct bpf_map map;
                          ...
                  }
      
      In the above cg_skb(), to access array->map.max_entries, with CORE, clang will
      generate two builtins.
                  base = &m_array;
                  /* access array.map */
                  map_addr = __builtin_preserve_struct_access_info(base, 0, 0);
                  /* access array.map.max_entries */
                  max_entries_addr = __builtin_preserve_struct_access_info(map_addr, 0, 0);
      	    max_entries = *max_entries_addr;
      
      In the current llvm, if the two builtins are in the same function or
      in the same function after inlining, the compiler is smart enough to chain
      them together and generates like below:
                  base = &m_array;
                  max_entries = *(base + reloc_offset); /* reloc_offset = 0 in this case */
      and we are fine.
      
      But if we disable inlining for one of the functions in the test_map_ptr() selftest, e.g.,
      check_default(), the above two __builtin_preserve_* will be in two different
      functions. In this case, we will have code like:
         func check_hash():
                  reloc_offset_map = 0;
                  base = &m_array;
                  map_base = base + reloc_offset_map;
                  check_default(map_base, ...)
         func check_default(map_base, ...):
                  max_entries = *(map_base + reloc_offset_max_entries);
      
      In kernel, map_ptr (CONST_PTR_TO_MAP) does not allow any arithmetic.
      The above "map_base = base + reloc_offset_map" will trigger a verifier failure.
        ; VERIFY(check_default(&hash->map, map));
        0: (18) r7 = 0xffffb4fe8018a004
        2: (b4) w1 = 110
        3: (63) *(u32 *)(r7 +0) = r1
         R1_w=invP110 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0
        ; VERIFY_TYPE(BPF_MAP_TYPE_HASH, check_hash);
        4: (18) r1 = 0xffffb4fe8018a000
        6: (b4) w2 = 1
        7: (63) *(u32 *)(r1 +0) = r2
         R1_w=map_value(id=0,off=0,ks=4,vs=8,imm=0) R2_w=invP1 R7_w=map_value(id=0,off=4,ks=4,vs=8,imm=0) R10=fp0
        8: (b7) r2 = 0
        9: (18) r8 = 0xffff90bcb500c000
        11: (18) r1 = 0xffff90bcb500c000
        13: (0f) r1 += r2
        R1 pointer arithmetic on map_ptr prohibited
      
      To fix the issue, let us permit map_ptr + 0 arithmetic which will
      result in exactly the same map_ptr.
      Signed-off-by: default avatarYonghong Song <yhs@fb.com>
      Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
      Link: https://lore.kernel.org/bpf/20200908175702.2463625-1-yhs@fb.com
      7c696732
  4. 07 Sep, 2020 3 commits
  5. 04 Sep, 2020 20 commits
  6. 02 Sep, 2020 1 commit