Commit 3565fce3 authored by Dan Williams, committed by Linus Torvalds

mm, x86: get_user_pages() for dax mappings

A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver
has established a devm_memremap_pages() mapping, i.e. when the pfn_t
returned from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when
encountering _PAGE_DEVMAP during a page table walk, we look up and pin a
struct dev_pagemap instance to keep the result of pfn_to_page() valid
until put_page().

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 5c7fb56e
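
As a rough illustration of the lookup-and-pin pattern the message describes (a sketch only, not code from this commit; the helper name is made up and error handling is simplified):

/*
 * Sketch: a devmap pte cannot be trusted to yield a valid struct page
 * until the backing dev_pagemap has been pinned.
 */
static struct page *pin_devmap_page(pte_t pte)	/* hypothetical helper */
{
	struct dev_pagemap *pgmap;
	struct page *page;

	if (!pte_devmap(pte))
		return NULL;

	/* pin the pagemap so pfn_to_page() stays valid */
	pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
	if (!pgmap)
		return NULL;	/* device mapping already torn down */

	page = pfn_to_page(pte_pfn(pte));
	get_page(page);		/* the page reference now keeps the pagemap alive */
	put_dev_pagemap(pgmap);	/* drop the temporary pagemap reference */
	return page;
}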
@@ -479,6 +479,13 @@ static inline int pte_present(pte_t a)
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
#ifdef __HAVE_ARCH_PTE_DEVMAP
static inline int pte_devmap(pte_t a)
{
return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP;
}
#endif
#define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{
...
@@ -9,6 +9,7 @@
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/memremap.h>
#include <asm/pgtable.h>
@@ -63,6 +64,16 @@ static inline pte_t gup_get_pte(pte_t *ptep)
#endif
}
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
ClearPageReferenced(page);
put_page(page);
}
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
@@ -71,7 +82,9 @@ static inline pte_t gup_get_pte(pte_t *ptep)
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
return 0;
}
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
page = pte_page(pte);
if (pte_devmap(pte)) {
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
pte_unmap(ptep);
return 0;
}
} else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr)
SetPageReferenced(page);
}
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
int nr_start = *nr;
unsigned long pfn = pmd_pfn(pmd);
struct dev_pagemap *pgmap = NULL;
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
do {
struct page *page = pfn_to_page(pfn);
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
SetPageReferenced(page);
pages[*nr] = page;
get_page(page);
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
return 1;
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
mask |= _PAGE_RW;
if ((pmd_flags(pmd) & mask) != mask)
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
if (pmd_devmap(pmd))
return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
/* hugepages are never "special" */
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
refs = 0;
head = pmd_page(pmd);
...
@@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
int prot_numa);
int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *,
pfn_t pfn, bool write);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
@@ -55,6 +54,9 @@ enum transparent_hugepage_flag {
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, int flags);
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))
@@ -205,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page)
return false;
}
static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd, int flags)
{
return NULL;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
@@ -16,6 +16,7 @@
#include <linux/mm_types.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
@@ -465,17 +466,6 @@ static inline int page_count(struct page *page)
return atomic_read(&compound_head(page)->_count);
}
static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
}
static inline struct page *virt_to_head_page(const void *x)
{
struct page *page = virt_to_page(x);
@@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page)
void __put_page(struct page *page);
static inline void put_page(struct page *page)
{
page = compound_head(page);
if (put_page_testzero(page))
__put_page(page);
}
void put_pages_list(struct list_head *pages);
void split_page(struct page *page, unsigned int order);
@@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page)
}
#ifdef CONFIG_ZONE_DEVICE
void get_zone_device_page(struct page *page);
void put_zone_device_page(struct page *page);
static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
#else
static inline void get_zone_device_page(struct page *page)
{
}
static inline void put_zone_device_page(struct page *page)
{
}
static inline bool is_zone_device_page(const struct page *page)
{
return false;
}
#endif
static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_count.
*/
VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
atomic_inc(&page->_count);
if (unlikely(is_zone_device_page(page)))
get_zone_device_page(page);
}
static inline void put_page(struct page *page)
{
page = compound_head(page);
if (put_page_testzero(page))
__put_page(page);
if (unlikely(is_zone_device_page(page)))
put_zone_device_page(page);
}
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
@@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm)
}
#endif
#ifndef __HAVE_ARCH_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
return 0;
}
#endif
int vma_wants_writenotify(struct vm_area_struct *vma);
extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
...
@@ -169,6 +169,18 @@ struct page_map {
struct vmem_altmap altmap;
};
void get_zone_device_page(struct page *page)
{
percpu_ref_get(page->pgmap->ref);
}
EXPORT_SYMBOL(get_zone_device_page);
void put_zone_device_page(struct page *page)
{
put_dev_pagemap(page->pgmap);
}
EXPORT_SYMBOL(put_zone_device_page);
static void pgmap_radix_release(struct resource *res)
{
resource_size_t key;
...
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -98,7 +100,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
page = vm_normal_page(vma, address, pte);
if (unlikely(!page)) {
if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
/*
* Only return device mapping pages in the FOLL_GET case since
* they are only valid while holding the pgmap reference.
*/
pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
if (pgmap)
page = pte_page(pte);
else
goto no_page;
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -129,8 +141,15 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto retry;
}
if (flags & FOLL_GET)
if (flags & FOLL_GET) {
get_page(page);
/* drop the pgmap reference now that we hold the page */
if (pgmap) {
put_dev_pagemap(pgmap);
pgmap = NULL;
}
}
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
page = follow_devmap_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(*pmd)))
return follow_page_pte(vma, address, pmd, flags);
...
@@ -23,6 +23,7 @@
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
@@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return VM_FAULT_NOPAGE;
}
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
pmd_t _pmd;
/*
* We should set the dirty bit only for FOLL_WRITE but for now
* the dirty bit in the pmd is meaningless. And if the dirty
* bit will become meaningful and we'll only set it with
* FOLL_WRITE, an atomic set_bit will be required on the pmd to
* set the young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, int flags)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
if (pmd_present(*pmd) && pmd_devmap(*pmd))
/* pass */;
else
return NULL;
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd);
/*
* device mapped pages can only be returned if the
* caller will manage the page reference count.
*/
if (!(flags & FOLL_GET))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
pgmap = get_dev_pagemap(pfn, NULL);
if (!pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
put_dev_pagemap(pgmap);
return page;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
if (flags & FOLL_TOUCH) {
if (flags & FOLL_TOUCH)
pmd_t _pmd;
touch_pmd(vma, addr, pmd);
/*
* We should set the dirty bit only for FOLL_WRITE but
* for now the dirty bit in the pmd is meaningless.
* And if the dirty bit will become meaningful and
* we'll only set it with FOLL_WRITE, an atomic
* set_bit will be required on the pmd to set the
* young bit, instead of the current set_pmd_at.
*/
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
...
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
...
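
As a usage sketch (not part of this commit; the mapping address and page count are placeholders), a caller can now pin pages of a DAX mapping through the fast GUP path and release them with put_page(), which in turn drops the dev_pagemap reference taken above:

	struct page *pages[16];
	int i, pinned;

	/* 'addr' is assumed to fall inside a DAX mapping of the calling process */
	pinned = get_user_pages_fast(addr, 16, 1 /* write */, pages);
	for (i = 0; i < pinned; i++)
		put_page(pages[i]);	/* also releases the dev_pagemap pin */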