Commit e66f17ff authored by Naoya Horiguchi, committed by Linus Torvalds

mm/hugetlb: take page table lock in follow_huge_pmd()

We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to take a reference on the page without preventing it from being
freed concurrently.  This race crashes the kernel, so this patch fixes it
by moving the FOLL_GET code for hugepages into follow_huge_pmd(), where it
runs while holding the page table lock.

This patch intentionally removes the page == NULL check after pte_page().
This is justified because pte_page() never returns NULL on any
architecture or configuration.

This patch changes the behavior of follow_huge_pmd() for tail pages: tail
pages can now be pinned and returned.  So the caller must be changed to
properly handle the returned tail pages.

We could add similar locking to follow_huge_(addr|pud) for consistency,
but it's not necessary because these functions currently don't support
the FOLL_GET flag, so let's leave it for future development.

Here is the reproducer:

  $ cat movepages.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <numaif.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000
  #define PS              0x1000

  /*
   * Reproducer (migration side): endlessly migrate the range starting at
   * ADDR_INPUT in the target process, one PS-sized page at a time,
   * alternating the destination between node 1 and node 0.
   *
   * argv[1]: number of hugepages mapped by the target process
   * argv[2]: pid of the target process
   *
   * Exits with status 1 on bad usage, allocation failure, or when
   * numa_move_pages() returns -1.
   */
  int main(int argc, char *argv[]) {
          int i;
          int nr_hp;
          int nr_p;
          int ret;
          int node = 1;   /* first pass targets node 1, as before */
          void **addrs;
          int *status;
          int *nodes;
          pid_t pid;

          if (argc < 3) {
                  fprintf(stderr, "usage: %s <nr_hugepages> <pid>\n", argv[0]);
                  return 1;
          }

          nr_hp = strtol(argv[1], NULL, 0);
          pid = strtol(argv[2], NULL, 0);
          if (nr_hp <= 0) {
                  fprintf(stderr, "nr_hugepages must be positive\n");
                  return 1;
          }
          /* Divide first: nr_hp * HPS overflows int at 1024 hugepages. */
          nr_p = nr_hp * (HPS / PS);

          /* Size each array by its own element type. */
          addrs  = malloc(sizeof(*addrs) * nr_p);
          status = malloc(sizeof(*status) * nr_p);
          nodes  = malloc(sizeof(*nodes) * nr_p);
          if (!addrs || !status || !nodes) {
                  perror("malloc");
                  return 1;
          }

          while (1) {
                  for (i = 0; i < nr_p; i++) {
                          /* Offset in size_t so i * PS cannot overflow int. */
                          addrs[i] = (char *)ADDR_INPUT + (size_t)i * PS;
                          nodes[i] = node;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1) {
                          perror("move_pages");
                          exit(1);
                  }
                  /* Bounce the pages back to the other node next time. */
                  node = !node;
          }
  }

  $ cat hugepage.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/mman.h>

  #define ADDR_INPUT      0x700000000000UL
  #define HPS             0x200000

  /*
   * Reproducer (allocation side): endlessly map, touch and unmap nr_hp
   * anonymous hugepages at ADDR_INPUT, so that hugepages are continually
   * being freed while another process tries to migrate them.
   *
   * argv[1]: number of hugepages to map on each iteration
   *
   * The loop only ends when mmap() does not return ADDR_INPUT, so the
   * program always exits with a failure status.
   */
  int main(int argc, char *argv[]) {
          long nr_hp;
          size_t len;
          char *p;

          if (argc < 2) {
                  fprintf(stderr, "usage: %s <nr_hugepages>\n", argv[0]);
                  return EXIT_FAILURE;
          }

          nr_hp = strtol(argv[1], NULL, 0);
          if (nr_hp <= 0) {
                  fprintf(stderr, "nr_hugepages must be positive\n");
                  return EXIT_FAILURE;
          }
          /* Compute the length in size_t: int math overflows at 1024 pages. */
          len = (size_t)nr_hp * HPS;

          while (1) {
                  p = mmap((void *)ADDR_INPUT, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                  if (p != (void *)ADDR_INPUT) {
                          perror("mmap");
                          break;
                  }
                  /* Touch every page so the hugepages are really allocated. */
                  memset(p, 0, len);
                  munmap(p, len);
          }
          return EXIT_FAILURE;
  }

  $ sysctl vm.nr_hugepages=40
  $ ./hugepage 10 &
  $ ./movepages 10 $(pgrep -f hugepage)

Fixes: e632a938 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org>	[3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent cbef8478
...@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); ...@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
int write); int write);
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write); pmd_t *pmd, int flags);
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write); pud_t *pud, int flags);
int pmd_huge(pmd_t pmd); int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pmd); int pud_huge(pud_t pmd);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
...@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) ...@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
static inline void hugetlb_show_meminfo(void) static inline void hugetlb_show_meminfo(void)
{ {
} }
#define follow_huge_pmd(mm, addr, pmd, write) NULL #define follow_huge_pmd(mm, addr, pmd, flags) NULL
#define follow_huge_pud(mm, addr, pud, write) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL
#define prepare_hugepage_range(file, addr, len) (-EINVAL) #define prepare_hugepage_range(file, addr, len) (-EINVAL)
#define pmd_huge(x) 0 #define pmd_huge(x) 0
#define pud_huge(x) 0 #define pud_huge(x) 0
......
...@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) ...@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
} }
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl);
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address); unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, extern void migration_entry_wait_huge(struct vm_area_struct *vma,
...@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) ...@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp)
} }
#define migration_entry_to_page(swp) NULL #define migration_entry_to_page(swp) NULL
static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void make_migration_entry_read(swp_entry_t *entryp) { }
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl) { }
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
unsigned long address) { } unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma, static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
......
...@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, ...@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pud_none(*pud)) if (pud_none(*pud))
return no_page_table(vma, flags); return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
if (flags & FOLL_GET) page = follow_huge_pud(mm, address, pud, flags);
return NULL; if (page)
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
return page; return page;
return no_page_table(vma, flags);
} }
if (unlikely(pud_bad(*pud))) if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags); return no_page_table(vma, flags);
...@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, ...@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pmd_none(*pmd)) if (pmd_none(*pmd))
return no_page_table(vma, flags); return no_page_table(vma, flags);
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); page = follow_huge_pmd(mm, address, pmd, flags);
if (flags & FOLL_GET) { if (page)
/*
* Refcount on tail pages are not well-defined and
* shouldn't be taken. The caller should handle a NULL
* return when trying to follow tail pages.
*/
if (PageHead(page))
get_page(page);
else
page = NULL;
}
return page; return page;
return no_page_table(vma, flags);
} }
if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
return no_page_table(vma, flags); return no_page_table(vma, flags);
......
...@@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, ...@@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
struct page * __weak struct page * __weak
follow_huge_pmd(struct mm_struct *mm, unsigned long address, follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write) pmd_t *pmd, int flags)
{ {
struct page *page; struct page *page = NULL;
spinlock_t *ptl;
if (!pmd_present(*pmd)) retry:
return NULL; ptl = pmd_lockptr(mm, pmd);
page = pte_page(*(pte_t *)pmd); spin_lock(ptl);
if (page) /*
page += ((address & ~PMD_MASK) >> PAGE_SHIFT); * make sure that the address range covered by this pmd is not
* unmapped from other threads.
*/
if (!pmd_huge(*pmd))
goto out;
if (pmd_present(*pmd)) {
page = pte_page(*(pte_t *)pmd) +
((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
spin_unlock(ptl);
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
goto retry;
}
/*
* hwpoisoned entry is treated as no_page_table in
* follow_page_mask().
*/
}
out:
spin_unlock(ptl);
return page; return page;
} }
struct page * __weak struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address, follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write) pud_t *pud, int flags)
{ {
struct page *page; if (flags & FOLL_GET)
return NULL;
page = pte_page(*(pte_t *)pud); return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
if (page)
page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
return page;
} }
#ifdef CONFIG_MEMORY_FAILURE #ifdef CONFIG_MEMORY_FAILURE
......
...@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) ...@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished. * get to the page and wait until migration is finished.
* When we return from this function the fault will be retried. * When we return from this function the fault will be retried.
*/ */
static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl) spinlock_t *ptl)
{ {
pte_t pte; pte_t pte;
...@@ -1236,6 +1236,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, ...@@ -1236,6 +1236,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set; goto put_and_set;
if (PageHuge(page)) { if (PageHuge(page)) {
if (PageHead(page))
isolate_huge_page(page, &pagelist); isolate_huge_page(page, &pagelist);
goto put_and_set; goto put_and_set;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment