Commit 50a5de89 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
 "This series focuses on corner case bug fixes and general clarity
  improvements to hmm_range_fault(). It arose from a review of
  hmm_range_fault() by Christoph, Ralph and myself.

  hmm_range_fault() is being used by these 'SVM' style drivers to
  non-destructively read the page tables. It is very similar to
  get_user_pages() except that the output is an array of PFNs and
  per-pfn flags, and it has various modes of reading.

  This is necessary before RDMA ODP can be converted, as we don't want
  to have weird corner case regressions, which is still a looking
  forward item. Ralph has a nice tester for this routine, but it is
  waiting for feedback from the selftests maintainers.

  Summary:

   - 9 bug fixes

   - Allow pgmap to track the 'owner' of a DEVICE_PRIVATE - in this case
     the owner tells the driver if it can understand the DEVICE_PRIVATE
     page or not. Use this to resolve a bug in nouveau where it could
     touch DEVICE_PRIVATE pages from other drivers.

   - Remove a bunch of dead, redundant or unused code and flags

   - Clarity improvements to hmm_range_fault()"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (25 commits)
  mm/hmm: return error for non-vma snapshots
  mm/hmm: do not set pfns when returning an error code
  mm/hmm: do not unconditionally set pfns when returning EBUSY
  mm/hmm: use device_private_entry_to_pfn()
  mm/hmm: remove HMM_FAULT_SNAPSHOT
  mm/hmm: remove unused code and tidy comments
  mm/hmm: return the fault type from hmm_pte_need_fault()
  mm/hmm: remove pgmap checking for devmap pages
  mm/hmm: check the device private page owner in hmm_range_fault()
  mm: simplify device private page handling in hmm_range_fault
  mm: handle multiple owners of device private pages in migrate_vma
  memremap: add an owner field to struct dev_pagemap
  mm: merge hmm_vma_do_fault into into hmm_vma_walk_hole_
  mm/hmm: don't handle the non-fault case in hmm_vma_walk_hole_()
  mm/hmm: simplify hmm_vma_walk_hugetlb_entry()
  mm/hmm: remove the unused HMM_FAULT_ALLOW_RETRY flag
  mm/hmm: don't provide a stub for hmm_range_fault()
  mm/hmm: do not check pmd_protnone twice in hmm_vma_handle_pmd()
  mm/hmm: add missing call to hmm_pte_need_fault in HMM_PFN_SPECIAL handling
  mm/hmm: return -EFAULT when setting HMM_PFN_ERROR on requested valid pages
  ...
parents 193bc55b bd5d3587
......@@ -161,13 +161,11 @@ device must complete the update before the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
use::
long hmm_range_fault(struct hmm_range *range, unsigned int flags);
long hmm_range_fault(struct hmm_range *range);
With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
Without that flag, it does trigger a page fault on missing or read-only entries
if write access is requested (see below). Page faults use the generic mm page
fault code path just like a CPU page fault.
It will trigger a page fault on missing or read-only entries if write access is
requested (see below). Page faults use the generic mm page fault code path just
like a CPU page fault.
Both functions copy CPU page table entries into their pfns array argument. Each
entry in that array corresponds to an address in the virtual range. HMM
......@@ -197,7 +195,7 @@ The usage pattern is::
again:
range.notifier_seq = mmu_interval_read_begin(&interval_sub);
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
ret = hmm_range_fault(&range);
if (ret) {
up_read(&mm->mmap_sem);
if (ret == -EBUSY)
......
......@@ -563,6 +563,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
mig.end = end;
mig.src = &src_pfn;
mig.dst = &dst_pfn;
mig.src_owner = &kvmppc_uvmem_pgmap;
mutex_lock(&kvm->arch.uvmem_lock);
/* The requested page is already paged-out, nothing to do */
......@@ -779,6 +780,8 @@ int kvmppc_uvmem_init(void)
kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE;
kvmppc_uvmem_pgmap.res = *res;
kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops;
/* just one global instance: */
kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap;
addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE);
if (IS_ERR(addr)) {
ret = PTR_ERR(addr);
......
......@@ -770,7 +770,6 @@ struct amdgpu_ttm_tt {
static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = {
(1 << 0), /* HMM_PFN_VALID */
(1 << 1), /* HMM_PFN_WRITE */
0 /* HMM_PFN_DEVICE_PRIVATE */
};
static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = {
......@@ -851,7 +850,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
range->notifier_seq = mmu_interval_read_begin(&bo->notifier);
down_read(&mm->mmap_sem);
r = hmm_range_fault(range, 0);
r = hmm_range_fault(range);
up_read(&mm->mmap_sem);
if (unlikely(r <= 0)) {
/*
......
......@@ -28,6 +28,7 @@
#include <nvif/class.h>
#include <nvif/object.h>
#include <nvif/if000c.h>
#include <nvif/if500b.h>
#include <nvif/if900b.h>
......@@ -176,6 +177,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
.end = vmf->address + PAGE_SIZE,
.src = &src,
.dst = &dst,
.src_owner = drm->dev,
};
/*
......@@ -526,6 +528,7 @@ nouveau_dmem_init(struct nouveau_drm *drm)
drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
drm->dmem->pagemap.res = *res;
drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops;
drm->dmem->pagemap.owner = drm->dev;
if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap)))
goto out_free;
......@@ -669,12 +672,6 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
return ret;
}
static inline bool
nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
{
return is_device_private_page(page) && drm->dmem == page_to_dmem(page);
}
void
nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
struct hmm_range *range)
......@@ -690,18 +687,12 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
if (page == NULL)
continue;
if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
if (!is_device_private_page(page))
continue;
}
if (!nouveau_dmem_page(drm, page)) {
WARN(1, "Some unknown device memory !\n");
range->pfns[i] = 0;
continue;
}
addr = nouveau_dmem_page_addr(page);
range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM;
}
}
......@@ -367,7 +367,6 @@ static const u64
nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = {
[HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V,
[HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W,
[HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM,
};
static const u64
......@@ -541,7 +540,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
range.default_flags = 0;
range.pfn_flags_mask = -1UL;
down_read(&mm->mmap_sem);
ret = hmm_range_fault(&range, 0);
ret = hmm_range_fault(&range);
up_read(&mm->mmap_sem);
if (ret <= 0) {
if (ret == 0 || ret == -EBUSY)
......
......@@ -3,58 +3,8 @@
* Copyright 2013 Red Hat Inc.
*
* Authors: Jérôme Glisse <jglisse@redhat.com>
*/
/*
* Heterogeneous Memory Management (HMM)
*
* See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it
* is for. Here we focus on the HMM API description, with some explanation of
* the underlying implementation.
*
* Short description: HMM provides a set of helpers to share a virtual address
* space between CPU and a device, so that the device can access any valid
* address of the process (while still obeying memory protection). HMM also
* provides helpers to migrate process memory to device memory, and back. Each
* set of functionality (address space mirroring, and migration to and from
* device memory) can be used independently of the other.
*
*
* HMM address space mirroring API:
*
* Use HMM address space mirroring if you want to mirror a range of the CPU
* page tables of a process into a device page table. Here, "mirror" means "keep
* synchronized". Prerequisites: the device must provide the ability to write-
* protect its page tables (at PAGE_SIZE granularity), and must be able to
* recover from the resulting potential page faults.
*
* HMM guarantees that at any point in time, a given virtual address points to
* either the same memory in both CPU and device page tables (that is: CPU and
* device page tables each point to the same pages), or that one page table (CPU
* or device) points to no entry, while the other still points to the old page
* for the address. The latter case happens when the CPU page table update
* happens first, and then the update is mirrored over to the device page table.
* This does not cause any issue, because the CPU page table cannot start
* pointing to a new page until the device page table is invalidated.
*
* HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
* updates to each device driver that has registered a mirror. It also provides
* some API calls to help with taking a snapshot of the CPU page table, and to
* synchronize with any updates that might happen concurrently.
*
*
* HMM migration to and from device memory:
*
* HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
* a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
* of the device memory, and allows the device driver to manage its memory
* using those struct pages. Having struct pages for device memory makes
* migration easier. Because that memory is not addressable by the CPU it must
* never be pinned to the device; in other words, any CPU page fault can always
* cause the device memory to be migrated (copied/moved) back to regular memory.
*
* A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
* allows use of a device DMA engine to perform the copy operation between
* regular system memory and device memory.
* See Documentation/vm/hmm.rst for reasons and overview of what HMM is.
*/
#ifndef LINUX_HMM_H
#define LINUX_HMM_H
......@@ -74,7 +24,6 @@
* Flags:
* HMM_PFN_VALID: pfn is valid. It has, at least, read permission.
* HMM_PFN_WRITE: CPU page table has write permission set
* HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE)
*
* The driver provides a flags array for mapping page protections to device
* PTE bits. If the driver valid bit for an entry is bit 3,
......@@ -86,7 +35,6 @@
enum hmm_pfn_flag_e {
HMM_PFN_VALID = 0,
HMM_PFN_WRITE,
HMM_PFN_DEVICE_PRIVATE,
HMM_PFN_FLAG_MAX
};
......@@ -122,9 +70,6 @@ enum hmm_pfn_value_e {
*
* @notifier: a mmu_interval_notifier that includes the start/end
* @notifier_seq: result of mmu_interval_read_begin()
* @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
* @start: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @pfns: array of pfns (big enough for the range)
......@@ -132,8 +77,8 @@ enum hmm_pfn_value_e {
* @values: pfn value for some special case (none, special, error, ...)
* @default_flags: default flags for the range (write, read, ... see hmm doc)
* @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
* @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
* @valid: pfns array did not change since it has been fill by an HMM function
* @pfn_shift: pfn shift value (should be <= PAGE_SHIFT)
* @dev_private_owner: owner of device private pages
*/
struct hmm_range {
struct mmu_interval_notifier *notifier;
......@@ -146,6 +91,7 @@ struct hmm_range {
uint64_t default_flags;
uint64_t pfn_flags_mask;
uint8_t pfn_shift;
void *dev_private_owner;
};
/*
......@@ -171,71 +117,10 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang
return pfn_to_page(entry >> range->pfn_shift);
}
/*
* hmm_device_entry_to_pfn() - return pfn value store in a device entry
* @range: range use to decode device entry value
* @entry: device entry to extract pfn from
* Return: pfn value if device entry is valid, -1UL otherwise
*/
static inline unsigned long
hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn)
{
if (pfn == range->values[HMM_PFN_NONE])
return -1UL;
if (pfn == range->values[HMM_PFN_ERROR])
return -1UL;
if (pfn == range->values[HMM_PFN_SPECIAL])
return -1UL;
if (!(pfn & range->flags[HMM_PFN_VALID]))
return -1UL;
return (pfn >> range->pfn_shift);
}
/*
* hmm_device_entry_from_page() - create a valid device entry for a page
* @range: range use to encode HMM pfn value
* @page: page for which to create the device entry
* Return: valid device entry for the page
*/
static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range,
struct page *page)
{
return (page_to_pfn(page) << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
/*
* hmm_device_entry_from_pfn() - create a valid device entry value from pfn
* @range: range use to encode HMM pfn value
* @pfn: pfn value for which to create the device entry
* Return: valid device entry for the pfn
*/
static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
unsigned long pfn)
{
return (pfn << range->pfn_shift) |
range->flags[HMM_PFN_VALID];
}
/*
* Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
*/
#define HMM_FAULT_ALLOW_RETRY (1 << 0)
/* Don't fault in missing PTEs, just snapshot the current state. */
#define HMM_FAULT_SNAPSHOT (1 << 1)
#ifdef CONFIG_HMM_MIRROR
/*
* Please see Documentation/vm/hmm.rst for how to use the range API.
*/
long hmm_range_fault(struct hmm_range *range, unsigned int flags);
#else
static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
return -EOPNOTSUPP;
}
#endif
long hmm_range_fault(struct hmm_range *range);
/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
......
......@@ -103,6 +103,9 @@ struct dev_pagemap_ops {
* @type: memory type: see MEMORY_* in memory_hotplug.h
* @flags: PGMAP_* flags to specify defailed behavior
* @ops: method table
* @owner: an opaque pointer identifying the entity that manages this
* instance. Used by various helpers to make sure that no
* foreign ZONE_DEVICE memory is accessed.
*/
struct dev_pagemap {
struct vmem_altmap altmap;
......@@ -113,6 +116,7 @@ struct dev_pagemap {
enum memory_type type;
unsigned int flags;
const struct dev_pagemap_ops *ops;
void *owner;
};
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
......
......@@ -196,6 +196,14 @@ struct migrate_vma {
unsigned long npages;
unsigned long start;
unsigned long end;
/*
* Set to the owner value also stored in page->pgmap->owner for
* migrating out of device private memory. If set only device
* private pages with this owner are migrated. If not set
* device private pages are not migrated at all.
*/
void *src_owner;
};
int migrate_vma_setup(struct migrate_vma *args);
......
This diff is collapsed.
......@@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
if (!pgmap->owner) {
WARN(1, "Missing owner\n");
return ERR_PTR(-EINVAL);
}
break;
case MEMORY_DEVICE_FS_DAX:
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
......
......@@ -2241,7 +2241,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
unsigned long mpfn, pfn;
unsigned long mpfn = 0, pfn;
struct page *page;
swp_entry_t entry;
pte_t pte;
......@@ -2255,8 +2255,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
}
if (!pte_present(pte)) {
mpfn = 0;
/*
* Only care about unaddressable device page special
* page table entry. Other special swap entries are not
......@@ -2267,11 +2265,16 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
page = device_private_entry_to_page(entry);
if (page->pgmap->owner != migrate->src_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
if (migrate->src_owner)
goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment