• Omar Sandoval's avatar
    btrfs: fix deadlock when defragging transparent huge pages · 24bcb454
    Omar Sandoval authored
    Attempting to defragment a Btrfs file containing a transparent huge page
    immediately deadlocks with the following stack trace:
    
      #0  context_switch (kernel/sched/core.c:4940:2)
      #1  __schedule (kernel/sched/core.c:6287:8)
      #2  schedule (kernel/sched/core.c:6366:3)
      #3  io_schedule (kernel/sched/core.c:8389:2)
      #4  wait_on_page_bit_common (mm/filemap.c:1356:4)
      #5  __lock_page (mm/filemap.c:1648:2)
      #6  lock_page (./include/linux/pagemap.h:625:3)
      #7  pagecache_get_page (mm/filemap.c:1910:4)
      #8  find_or_create_page (./include/linux/pagemap.h:420:9)
      #9  defrag_prepare_one_page (fs/btrfs/ioctl.c:1068:9)
      #10 defrag_one_range (fs/btrfs/ioctl.c:1326:14)
      #11 defrag_one_cluster (fs/btrfs/ioctl.c:1421:9)
      #12 btrfs_defrag_file (fs/btrfs/ioctl.c:1523:9)
      #13 btrfs_ioctl_defrag (fs/btrfs/ioctl.c:3117:9)
      #14 btrfs_ioctl (fs/btrfs/ioctl.c:4872:10)
      #15 vfs_ioctl (fs/ioctl.c:51:10)
      #16 __do_sys_ioctl (fs/ioctl.c:874:11)
      #17 __se_sys_ioctl (fs/ioctl.c:860:1)
      #18 __x64_sys_ioctl (fs/ioctl.c:860:1)
      #19 do_syscall_x64 (arch/x86/entry/common.c:50:14)
      #20 do_syscall_64 (arch/x86/entry/common.c:80:7)
      #21 entry_SYSCALL_64+0x7c/0x15b (arch/x86/entry/entry_64.S:113)
    
    A huge page is represented by a compound page, which consists of a
    struct page for each PAGE_SIZE page within the huge page. The first
    struct page is the "head page", and the remaining are "tail pages".
    
    Defragmentation attempts to lock each page in the range. However,
    lock_page() on a tail page actually locks the corresponding head page.
    So, if defragmentation tries to lock more than one struct page in a
    compound page, it tries to lock the same head page twice and deadlocks
    with itself.
    
    Ideally, we should be able to defragment transparent huge pages.
    However, THP for filesystems is currently read-only, so a lot of code is
    not ready to use huge pages for I/O. For now, let's just return
    ETXTBUSY.
    
    This can be reproduced with the following on a kernel with
    CONFIG_READ_ONLY_THP_FOR_FS=y:
    
      $ cat create_thp_file.c
      #include <fcntl.h>
      #include <stdbool.h>
      #include <stdio.h>
      #include <stdint.h>
      #include <stdlib.h>
      #include <unistd.h>
      #include <sys/mman.h>
    
      static const char zeroes[1024 * 1024];
      static const size_t FILE_SIZE = 2 * 1024 * 1024;
    
      int main(int argc, char **argv)
      {
              if (argc != 2) {
                      fprintf(stderr, "usage: %s PATH\n", argv[0]);
                      return EXIT_FAILURE;
              }
              int fd = creat(argv[1], 0777);
              if (fd == -1) {
                      perror("creat");
                      return EXIT_FAILURE;
              }
              size_t written = 0;
              while (written < FILE_SIZE) {
                      ssize_t ret = write(fd, zeroes,
                                          sizeof(zeroes) < FILE_SIZE - written ?
                                          sizeof(zeroes) : FILE_SIZE - written);
                      if (ret < 0) {
                              perror("write");
                              return EXIT_FAILURE;
                      }
                      written += ret;
              }
              close(fd);
              fd = open(argv[1], O_RDONLY);
              if (fd == -1) {
                      perror("open");
                      return EXIT_FAILURE;
              }
    
              /*
               * Reserve some address space so that we can align the file mapping to
               * the huge page size.
               */
              void *placeholder_map = mmap(NULL, FILE_SIZE * 2, PROT_NONE,
                                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
              if (placeholder_map == MAP_FAILED) {
                      perror("mmap (placeholder)");
                      return EXIT_FAILURE;
              }
    
              void *aligned_address =
                      (void *)(((uintptr_t)placeholder_map + FILE_SIZE - 1) & ~(FILE_SIZE - 1));
    
              void *map = mmap(aligned_address, FILE_SIZE, PROT_READ | PROT_EXEC,
                               MAP_SHARED | MAP_FIXED, fd, 0);
              if (map == MAP_FAILED) {
                      perror("mmap");
                      return EXIT_FAILURE;
              }
              if (madvise(map, FILE_SIZE, MADV_HUGEPAGE) < 0) {
                      perror("madvise");
                      return EXIT_FAILURE;
              }
    
              char *line = NULL;
              size_t line_capacity = 0;
              FILE *smaps_file = fopen("/proc/self/smaps", "r");
              if (!smaps_file) {
                      perror("fopen");
                      return EXIT_FAILURE;
              }
              for (;;) {
                      for (size_t off = 0; off < FILE_SIZE; off += 4096)
                              ((volatile char *)map)[off];
    
                      ssize_t ret;
                      bool this_mapping = false;
                      while ((ret = getline(&line, &line_capacity, smaps_file)) > 0) {
                              unsigned long start, end, huge;
                              if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
                                      this_mapping = (start <= (uintptr_t)map &&
                                                      (uintptr_t)map < end);
                              } else if (this_mapping &&
                                         sscanf(line, "FilePmdMapped: %ld", &huge) == 1 &&
                                         huge > 0) {
                                      return EXIT_SUCCESS;
                              }
                      }
    
                      sleep(6);
                      rewind(smaps_file);
                      fflush(smaps_file);
              }
      }
      $ ./create_thp_file huge
      $ btrfs fi defrag -czstd ./huge
    Reviewed-by: default avatarJosef Bacik <josef@toxicpanda.com>
    Signed-off-by: default avatarOmar Sandoval <osandov@fb.com>
    Reviewed-by: default avatarDavid Sterba <dsterba@suse.com>
    Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
    24bcb454
ioctl.c 124 KB