Commit 719bbd4a authored by Linus Torvalds

Merge tag 'vfio-v5.12-rc1' of git://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

 - Virtual address update handling (Steve Sistare)

 - s390/zpci fixes and cleanups (Max Gurtovoy)

 - Fixes for dirty bitmap handling, non-mdev page pinning, and improved
   pinned dirty scope tracking (Keqian Zhu)

 - Batched page pinning enhancement (Daniel Jordan)

 - Page access permission fix (Alex Williamson)

* tag 'vfio-v5.12-rc1' of git://github.com/awilliam/linux-vfio: (21 commits)
  vfio/type1: Batch page pinning
  vfio/type1: Prepare for batched pinning with struct vfio_batch
  vfio/type1: Change success value of vaddr_get_pfn()
  vfio/type1: Use follow_pte()
  vfio/pci: remove CONFIG_VFIO_PCI_ZDEV from Kconfig
  vfio/iommu_type1: Fix duplicate included kthread.h
  vfio-pci/zdev: fix possible segmentation fault issue
  vfio-pci/zdev: remove unused vdev argument
  vfio/pci: Fix handling of pci use accessor return codes
  vfio/iommu_type1: Mantain a counter for non_pinned_groups
  vfio/iommu_type1: Fix some sanity checks in detach group
  vfio/iommu_type1: Populate full dirty when detach non-pinned group
  vfio/type1: block on invalid vaddr
  vfio/type1: implement notify callback
  vfio: iommu driver notify callback
  vfio/type1: implement interfaces to update vaddr
  vfio/type1: massage unmap iteration
  vfio: interfaces to update vaddr
  vfio/type1: implement unmap all
  vfio/type1: unmap cleanup
  ...
parents c4fbde84 4d83de6d
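
The virtual-address update work in this pull adds a VFIO_UPDATE_VADDR extension plus VFIO_DMA_UNMAP_FLAG_VADDR and VFIO_DMA_MAP_FLAG_VADDR flags (see the uapi hunks at the end of the diff). As a hedged orientation only, not part of the merge itself, a userspace live-update could drive the new flags roughly as sketched below; the container fd, iova, size and the remapped address are illustrative placeholders.

/*
 * Minimal sketch, assuming a type1 container fd already set up.
 * Invalidate the vaddr for an iova range, later supply the new vaddr.
 */
#include <linux/vfio.h>
#include <sys/ioctl.h>

int update_vaddr(int container_fd, __u64 iova, __u64 size, __u64 new_vaddr)
{
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.flags = VFIO_DMA_UNMAP_FLAG_VADDR,	/* invalidate, don't unmap */
		.iova = iova,
		.size = size,
	};
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_VADDR,	/* update the base vaddr */
		.vaddr = new_vaddr,	/* must address the same memory object */
		.iova = iova,
		.size = size,
	};

	if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UPDATE_VADDR))
		return -1;	/* kernel does not support the vaddr flags */

	/* Block translation of the old vaddr; DMA to mapped pages continues. */
	if (ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap))
		return -1;

	/* ... e.g. re-exec and re-mmap the same memory at new_vaddr ... */

	/* Unblock translation with the updated vaddr for the same iova/size. */
	return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
}

Per the uapi comments added below, iova and size must match the original mapping, protection is unchanged, and the READ/WRITE flags must be 0 on the vaddr-update map call.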
@@ -45,15 +45,3 @@ config VFIO_PCI_NVLINK2
 	depends on VFIO_PCI && PPC_POWERNV
 	help
 	  VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
-
-config VFIO_PCI_ZDEV
-	bool "VFIO PCI ZPCI device CLP support"
-	depends on VFIO_PCI && S390
-	default y
-	help
-	  Enabling this option exposes VFIO capabilities containing hardware
-	  configuration for zPCI devices. This enables userspace (e.g. QEMU)
-	  to supply proper configuration values instead of hard-coded defaults
-	  for zPCI devices passed through via VFIO on s390.
-
-	  Say Y here.
@@ -3,6 +3,6 @@
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
 vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
-vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
+vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
@@ -807,6 +807,7 @@ static long vfio_pci_ioctl(void *device_data,
 		struct vfio_device_info info;
 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 		unsigned long capsz;
+		int ret;
 
 		minsz = offsetofend(struct vfio_device_info, num_irqs);
 
@@ -832,14 +833,11 @@ static long vfio_pci_ioctl(void *device_data,
 		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
 		info.num_irqs = VFIO_PCI_NUM_IRQS;
 
-		if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
-			int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
-
-			if (ret && ret != -ENODEV) {
-				pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
-				return ret;
-			}
-		}
+		ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+		if (ret && ret != -ENODEV) {
+			pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+			return ret;
+		}
 
 		if (caps.size) {
 			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
...
@@ -127,7 +127,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
 
 		ret = pci_user_read_config_byte(pdev, pos, &val);
 		if (ret)
-			return pcibios_err_to_errno(ret);
+			return ret;
 
 		if (copy_to_user(buf + count - size, &val, 1))
 			return -EFAULT;
@@ -141,7 +141,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
 
 		ret = pci_user_read_config_word(pdev, pos, &val);
 		if (ret)
-			return pcibios_err_to_errno(ret);
+			return ret;
 
 		val = cpu_to_le16(val);
 		if (copy_to_user(buf + count - size, &val, 2))
@@ -156,7 +156,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
 
 		ret = pci_user_read_config_dword(pdev, pos, &val);
 		if (ret)
-			return pcibios_err_to_errno(ret);
+			return ret;
 
 		val = cpu_to_le32(val);
 		if (copy_to_user(buf + count - size, &val, 4))
@@ -171,7 +171,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
 
 		ret = pci_user_read_config_word(pdev, pos, &val);
 		if (ret)
-			return pcibios_err_to_errno(ret);
+			return ret;
 
 		val = cpu_to_le16(val);
 		if (copy_to_user(buf + count - size, &val, 2))
@@ -186,7 +186,7 @@ static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
 
 		ret = pci_user_read_config_byte(pdev, pos, &val);
 		if (ret)
-			return pcibios_err_to_errno(ret);
+			return ret;
 
 		if (copy_to_user(buf + count - size, &val, 1))
 			return -EFAULT;
...
@@ -214,7 +214,7 @@ static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
 }
 #endif
 
-#ifdef CONFIG_VFIO_PCI_ZDEV
+#ifdef CONFIG_S390
 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
 				       struct vfio_info_cap *caps);
 #else
...
@@ -24,8 +24,7 @@
 /*
  * Add the Base PCI Function information to the device info region.
  */
-static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
-			 struct vfio_info_cap *caps)
+static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_base cap = {
 		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
@@ -45,8 +44,7 @@ static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
 /*
  * Add the Base PCI Function Group information to the device info region.
  */
-static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
-			  struct vfio_info_cap *caps)
+static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_group cap = {
 		.header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
@@ -66,14 +64,15 @@ static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
 /*
  * Add the device utility string to the device info region.
  */
-static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
-			 struct vfio_info_cap *caps)
+static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_util *cap;
 	int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN;
 	int ret;
 
 	cap = kmalloc(cap_size, GFP_KERNEL);
+	if (!cap)
+		return -ENOMEM;
 
 	cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL;
 	cap->header.version = 1;
@@ -90,14 +89,15 @@ static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
 /*
  * Add the function path string to the device info region.
  */
-static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
-			 struct vfio_info_cap *caps)
+static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_info_cap *caps)
 {
 	struct vfio_device_info_cap_zpci_pfip *cap;
 	int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS;
 	int ret;
 
 	cap = kmalloc(cap_size, GFP_KERNEL);
+	if (!cap)
+		return -ENOMEM;
 
 	cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP;
 	cap->header.version = 1;
@@ -123,21 +123,21 @@ int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
 	if (!zdev)
 		return -ENODEV;
 
-	ret = zpci_base_cap(zdev, vdev, caps);
+	ret = zpci_base_cap(zdev, caps);
 	if (ret)
 		return ret;
 
-	ret = zpci_group_cap(zdev, vdev, caps);
+	ret = zpci_group_cap(zdev, caps);
 	if (ret)
 		return ret;
 
 	if (zdev->util_str_avail) {
-		ret = zpci_util_cap(zdev, vdev, caps);
+		ret = zpci_util_cap(zdev, caps);
 		if (ret)
 			return ret;
 	}
 
-	ret = zpci_pfip_cap(zdev, vdev, caps);
+	ret = zpci_pfip_cap(zdev, caps);
 
 	return ret;
 }
@@ -1220,6 +1220,11 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
 static int vfio_fops_release(struct inode *inode, struct file *filep)
 {
 	struct vfio_container *container = filep->private_data;
+	struct vfio_iommu_driver *driver = container->iommu_driver;
+
+	if (driver && driver->ops->notify)
+		driver->ops->notify(container->iommu_data,
+				    VFIO_IOMMU_CONTAINER_CLOSE);
 
 	filep->private_data = NULL;
...
@@ -24,6 +24,7 @@
 #include <linux/compat.h>
 #include <linux/device.h>
 #include <linux/fs.h>
+#include <linux/highmem.h>
 #include <linux/iommu.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -69,11 +70,15 @@ struct vfio_iommu {
 	struct rb_root		dma_list;
 	struct blocking_notifier_head notifier;
 	unsigned int		dma_avail;
+	unsigned int		vaddr_invalid_count;
 	uint64_t		pgsize_bitmap;
+	uint64_t		num_non_pinned_groups;
+	wait_queue_head_t	vaddr_wait;
 	bool			v2;
 	bool			nesting;
 	bool			dirty_page_tracking;
 	bool			pinned_page_dirty_scope;
+	bool			container_open;
 };
 
 struct vfio_domain {
@@ -92,11 +97,20 @@ struct vfio_dma {
 	int			prot;		/* IOMMU_READ/WRITE */
 	bool			iommu_mapped;
 	bool			lock_cap;	/* capable(CAP_IPC_LOCK) */
+	bool			vaddr_invalid;
 	struct task_struct	*task;
 	struct rb_root		pfn_list;	/* Ex-user pinned pfn list */
 	unsigned long		*bitmap;
 };
 
+struct vfio_batch {
+	struct page		**pages;	/* for pin_user_pages_remote */
+	struct page		*fallback_page; /* if pages alloc fails */
+	int			capacity;	/* length of pages array */
+	int			size;		/* of batch currently */
+	int			offset;		/* of next entry in pages */
+};
+
 struct vfio_group {
 	struct iommu_group	*iommu_group;
 	struct list_head	next;
@@ -143,12 +157,13 @@ struct vfio_regions {
 #define DIRTY_BITMAP_PAGES_MAX	((u64)INT_MAX)
 #define DIRTY_BITMAP_SIZE_MAX	DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 
+#define WAITED 1
+
 static int put_pfn(unsigned long pfn, int prot);
 
 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
					       struct iommu_group *iommu_group);
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
 
 /*
  * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
@@ -173,6 +188,31 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 	return NULL;
 }
 
+static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
+						dma_addr_t start, size_t size)
+{
+	struct rb_node *res = NULL;
+	struct rb_node *node = iommu->dma_list.rb_node;
+	struct vfio_dma *dma_res = NULL;
+
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start < dma->iova + dma->size) {
+			res = node;
+			dma_res = dma;
+			if (start >= dma->iova)
+				break;
+			node = node->rb_left;
+		} else {
+			node = node->rb_right;
+		}
+	}
+	if (res && size && dma_res->iova >= start + size)
+		res = NULL;
+	return res;
+}
+
 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 {
 	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
@@ -236,6 +276,18 @@ static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
 	}
 }
 
+static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
+{
+	struct rb_node *n;
+	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+		struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+		bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
+	}
+}
+
 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
 {
 	struct rb_node *n;
@@ -415,13 +467,54 @@ static int put_pfn(unsigned long pfn, int prot)
 	return 0;
 }
 
+#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
+
+static void vfio_batch_init(struct vfio_batch *batch)
+{
+	batch->size = 0;
+	batch->offset = 0;
+
+	if (unlikely(disable_hugepages))
+		goto fallback;
+
+	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!batch->pages)
+		goto fallback;
+
+	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
+	return;
+
+fallback:
+	batch->pages = &batch->fallback_page;
+	batch->capacity = 1;
+}
+
+static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
+{
+	while (batch->size) {
+		unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
+
+		put_pfn(pfn, dma->prot);
+		batch->offset++;
+		batch->size--;
+	}
+}
+
+static void vfio_batch_fini(struct vfio_batch *batch)
+{
+	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
+		free_page((unsigned long)batch->pages);
+}
+
 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 			    unsigned long vaddr, unsigned long *pfn,
 			    bool write_fault)
 {
+	pte_t *ptep;
+	spinlock_t *ptl;
 	int ret;
 
-	ret = follow_pfn(vma, vaddr, pfn);
+	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
 	if (ret) {
 		bool unlocked = false;
@@ -435,16 +528,28 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 		if (ret)
 			return ret;
 
-		ret = follow_pfn(vma, vaddr, pfn);
+		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+		if (ret)
+			return ret;
 	}
 
+	if (write_fault && !pte_write(*ptep))
+		ret = -EFAULT;
+	else
+		*pfn = pte_pfn(*ptep);
+
+	pte_unmap_unlock(ptep, ptl);
 	return ret;
 }
 
-static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
-			 int prot, unsigned long *pfn)
+/*
+ * Returns the positive number of pfns successfully obtained or a negative
+ * error code.
+ */
+static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
+			  long npages, int prot, unsigned long *pfn,
+			  struct page **pages)
 {
-	struct page *page[1];
 	struct vm_area_struct *vma;
 	unsigned int flags = 0;
 	int ret;
@@ -453,11 +558,10 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 		flags |= FOLL_WRITE;
 
 	mmap_read_lock(mm);
-	ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
-				    page, NULL, NULL);
-	if (ret == 1) {
-		*pfn = page_to_pfn(page[0]);
-		ret = 0;
+	ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
+				    pages, NULL, NULL);
+	if (ret > 0) {
+		*pfn = page_to_pfn(pages[0]);
 		goto done;
 	}
@@ -471,14 +575,73 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 		if (ret == -EAGAIN)
 			goto retry;
 
-		if (!ret && !is_invalid_reserved_pfn(*pfn))
-			ret = -EFAULT;
+		if (!ret) {
+			if (is_invalid_reserved_pfn(*pfn))
+				ret = 1;
+			else
+				ret = -EFAULT;
+		}
 	}
 done:
 	mmap_read_unlock(mm);
 	return ret;
 }
 
+static int vfio_wait(struct vfio_iommu *iommu)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
+	mutex_unlock(&iommu->lock);
+	schedule();
+	mutex_lock(&iommu->lock);
+	finish_wait(&iommu->vaddr_wait, &wait);
+	if (kthread_should_stop() || !iommu->container_open ||
+	    fatal_signal_pending(current)) {
+		return -EFAULT;
+	}
+	return WAITED;
+}
+
+/*
+ * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
+ * if the task waits, but is re-locked on return.  Return result in *dma_p.
+ * Return 0 on success with no waiting, WAITED on success if waited, and -errno
+ * on error.
+ */
+static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
+			       size_t size, struct vfio_dma **dma_p)
+{
+	int ret;
+
+	do {
+		*dma_p = vfio_find_dma(iommu, start, size);
+		if (!*dma_p)
+			ret = -EINVAL;
+		else if (!(*dma_p)->vaddr_invalid)
+			ret = 0;
+		else
+			ret = vfio_wait(iommu);
+	} while (ret > 0);
+
+	return ret;
+}
+
+/*
+ * Wait for all vaddr in the dma_list to become valid.  iommu lock is dropped
+ * if the task waits, but is re-locked on return.  Return 0 on success with no
+ * waiting, WAITED on success if waited, and -errno on error.
+ */
+static int vfio_wait_all_valid(struct vfio_iommu *iommu)
+{
+	int ret = 0;
+
+	while (iommu->vaddr_invalid_count && ret >= 0)
+		ret = vfio_wait(iommu);
+
+	return ret;
+}
+
 /*
  * Attempt to pin pages.  We really don't want to track all the pfns and
  * the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -486,58 +649,67 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
  */
 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				  long npage, unsigned long *pfn_base,
-				  unsigned long limit)
+				  unsigned long limit, struct vfio_batch *batch)
 {
-	unsigned long pfn = 0;
+	unsigned long pfn;
+	struct mm_struct *mm = current->mm;
 	long ret, pinned = 0, lock_acct = 0;
 	bool rsvd;
 	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 
 	/* This code path is only user initiated */
-	if (!current->mm)
+	if (!mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
-	if (ret)
-		return ret;
-
-	pinned++;
-	rsvd = is_invalid_reserved_pfn(*pfn_base);
-
-	/*
-	 * Reserved pages aren't counted against the user, externally pinned
-	 * pages are already counted against the user.
-	 */
-	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-		if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
-			put_pfn(*pfn_base, dma->prot);
-			pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
-					limit << PAGE_SHIFT);
-			return -ENOMEM;
-		}
-		lock_acct++;
+	if (batch->size) {
+		/* Leftover pages in batch from an earlier call. */
+		*pfn_base = page_to_pfn(batch->pages[batch->offset]);
+		pfn = *pfn_base;
+		rsvd = is_invalid_reserved_pfn(*pfn_base);
+	} else {
+		*pfn_base = 0;
 	}
 
-	if (unlikely(disable_hugepages))
-		goto out;
+	while (npage) {
+		if (!batch->size) {
+			/* Empty batch, so refill it. */
+			long req_pages = min_t(long, npage, batch->capacity);
 
-	/* Lock all the consecutive pages from pfn_base */
-	for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
-	     pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
-		ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
-		if (ret)
-			break;
-
-		if (pfn != *pfn_base + pinned ||
-		    rsvd != is_invalid_reserved_pfn(pfn)) {
-			put_pfn(pfn, dma->prot);
-			break;
+			ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
+					     &pfn, batch->pages);
+			if (ret < 0)
+				goto unpin_out;
+
+			batch->size = ret;
+			batch->offset = 0;
+
+			if (!*pfn_base) {
+				*pfn_base = pfn;
+				rsvd = is_invalid_reserved_pfn(*pfn_base);
+			}
 		}
 
+		/*
+		 * pfn is preset for the first iteration of this inner loop and
+		 * updated at the end to handle a VM_PFNMAP pfn.  In that case,
+		 * batch->pages isn't valid (there's no struct page), so allow
+		 * batch->pages to be touched only when there's more than one
+		 * pfn to check, which guarantees the pfns are from a
+		 * !VM_PFNMAP vma.
+		 */
+		while (true) {
+			if (pfn != *pfn_base + pinned ||
+			    rsvd != is_invalid_reserved_pfn(pfn))
+				goto out;
+
+			/*
+			 * Reserved pages aren't counted against the user,
+			 * externally pinned pages are already counted against
+			 * the user.
+			 */
 			if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 				if (!dma->lock_cap &&
-				    current->mm->locked_vm + lock_acct + 1 > limit) {
-					put_pfn(pfn, dma->prot);
+				    mm->locked_vm + lock_acct + 1 > limit) {
 					pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 						__func__, limit << PAGE_SHIFT);
 					ret = -ENOMEM;
@@ -545,17 +717,34 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 				}
 				lock_acct++;
 			}
+
+			pinned++;
+			npage--;
+			vaddr += PAGE_SIZE;
+			iova += PAGE_SIZE;
+			batch->offset++;
+			batch->size--;
+
+			if (!batch->size)
+				break;
+
+			pfn = page_to_pfn(batch->pages[batch->offset]);
+		}
+
+		if (unlikely(disable_hugepages))
+			break;
 	}
 
 out:
 	ret = vfio_lock_acct(dma, lock_acct, false);
 
 unpin_out:
-	if (ret) {
-		if (!rsvd) {
+	if (ret < 0) {
+		if (pinned && !rsvd) {
 			for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
 				put_pfn(pfn, dma->prot);
 		}
+		vfio_batch_unpin(batch, dma);
 
 		return ret;
 	}
@@ -587,6 +776,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 				  unsigned long *pfn_base, bool do_accounting)
 {
+	struct page *pages[1];
 	struct mm_struct *mm;
 	int ret;
 
@@ -594,8 +784,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 	if (!mm)
 		return -ENODEV;
 
-	ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
-	if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
+	ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
+	if (ret == 1 && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
 		ret = vfio_lock_acct(dma, 1, true);
 		if (ret) {
 			put_pfn(*pfn_base, dma->prot);
@@ -640,6 +830,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	unsigned long remote_vaddr;
 	struct vfio_dma *dma;
 	bool do_accounting;
+	dma_addr_t iova;
 
 	if (!iommu || !user_pfn || !phys_pfn)
 		return -EINVAL;
@@ -650,6 +841,22 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
+	/*
+	 * Wait for all necessary vaddr's to be valid so they can be used in
+	 * the main loop without dropping the lock, to avoid racing vs unmap.
+	 */
+again:
+	if (iommu->vaddr_invalid_count) {
+		for (i = 0; i < npage; i++) {
+			iova = user_pfn[i] << PAGE_SHIFT;
+			ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
+			if (ret < 0)
+				goto pin_done;
+			if (ret == WAITED)
+				goto again;
+		}
+	}
+
 	/* Fail if notifier list is empty */
 	if (!iommu->notifier.head) {
 		ret = -EINVAL;
@@ -664,7 +871,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 	do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 
 	for (i = 0; i < npage; i++) {
-		dma_addr_t iova;
 		struct vfio_pfn *vpfn;
 
 		iova = user_pfn[i] << PAGE_SHIFT;
@@ -714,7 +920,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 		group = vfio_iommu_find_iommu_group(iommu, iommu_group);
 		if (!group->pinned_page_dirty_scope) {
 			group->pinned_page_dirty_scope = true;
-			update_pinned_page_dirty_scope(iommu);
+			iommu->num_non_pinned_groups--;
 		}
 	}
 
 	goto pin_done;
@@ -945,10 +1151,15 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 
 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 {
+	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
 	vfio_unmap_unpin(iommu, dma, true);
 	vfio_unlink_dma(iommu, dma);
 	put_task_struct(dma->task);
 	vfio_dma_bitmap_free(dma);
+	if (dma->vaddr_invalid) {
+		iommu->vaddr_invalid_count--;
+		wake_up_all(&iommu->vaddr_wait);
+	}
 	kfree(dma);
 	iommu->dma_avail++;
 }
@@ -991,7 +1202,7 @@ static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
 	 * mark all pages dirty if any IOMMU capable device is not able
 	 * to report dirty pages and all pages are pinned and mapped.
 	 */
-	if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
+	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
 		bitmap_set(dma->bitmap, 0, nbits);
 
 	if (shift) {
@@ -1074,34 +1285,36 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 {
 	struct vfio_dma *dma, *dma_last = NULL;
 	size_t unmapped = 0, pgsize;
-	int ret = 0, retries = 0;
+	int ret = -EINVAL, retries = 0;
 	unsigned long pgshift;
+	dma_addr_t iova = unmap->iova;
+	unsigned long size = unmap->size;
+	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
+	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
+	struct rb_node *n, *first_n;
 
 	mutex_lock(&iommu->lock);
 
 	pgshift = __ffs(iommu->pgsize_bitmap);
 	pgsize = (size_t)1 << pgshift;
 
-	if (unmap->iova & (pgsize - 1)) {
-		ret = -EINVAL;
+	if (iova & (pgsize - 1))
 		goto unlock;
-	}
 
-	if (!unmap->size || unmap->size & (pgsize - 1)) {
-		ret = -EINVAL;
+	if (unmap_all) {
+		if (iova || size)
+			goto unlock;
+		size = SIZE_MAX;
+	} else if (!size || size & (pgsize - 1)) {
 		goto unlock;
 	}
 
-	if (unmap->iova + unmap->size - 1 < unmap->iova ||
-	    unmap->size > SIZE_MAX) {
-		ret = -EINVAL;
+	if (iova + size - 1 < iova || size > SIZE_MAX)
 		goto unlock;
-	}
 
 	/* When dirty tracking is enabled, allow only min supported pgsize */
 	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
 	    (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
-		ret = -EINVAL;
 		goto unlock;
 	}
@@ -1138,21 +1351,25 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 	 * will only return success and a size of zero if there were no
 	 * mappings within the range.
 	 */
-	if (iommu->v2) {
-		dma = vfio_find_dma(iommu, unmap->iova, 1);
-		if (dma && dma->iova != unmap->iova) {
-			ret = -EINVAL;
+	if (iommu->v2 && !unmap_all) {
+		dma = vfio_find_dma(iommu, iova, 1);
+		if (dma && dma->iova != iova)
 			goto unlock;
-		}
 
-		dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
-		if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
-			ret = -EINVAL;
+		dma = vfio_find_dma(iommu, iova + size - 1, 0);
+		if (dma && dma->iova + dma->size != iova + size)
 			goto unlock;
-		}
 	}
 
-	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
-		if (!iommu->v2 && unmap->iova > dma->iova)
+	ret = 0;
+	n = first_n = vfio_find_dma_first_node(iommu, iova, size);
+
+	while (n) {
+		dma = rb_entry(n, struct vfio_dma, node);
+		if (dma->iova >= iova + size)
+			break;
+
+		if (!iommu->v2 && iova > dma->iova)
 			break;
 		/*
 		 * Task with same address space who mapped this iova range is
@@ -1161,6 +1378,27 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 		if (dma->task->mm != current->mm)
 			break;
 
+		if (invalidate_vaddr) {
+			if (dma->vaddr_invalid) {
+				struct rb_node *last_n = n;
+
+				for (n = first_n; n != last_n; n = rb_next(n)) {
+					dma = rb_entry(n,
+						       struct vfio_dma, node);
+					dma->vaddr_invalid = false;
+					iommu->vaddr_invalid_count--;
+				}
+				ret = -EINVAL;
+				unmapped = 0;
+				break;
+			}
+			dma->vaddr_invalid = true;
+			iommu->vaddr_invalid_count++;
+			unmapped += dma->size;
+			n = rb_next(n);
+			continue;
+		}
+
 		if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
 			struct vfio_iommu_type1_dma_unmap nb_unmap;
@@ -1190,12 +1428,13 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 
 		if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
 			ret = update_user_bitmap(bitmap->data, iommu, dma,
-						 unmap->iova, pgsize);
+						 iova, pgsize);
 			if (ret)
 				break;
 		}
 
 		unmapped += dma->size;
+		n = rb_next(n);
 		vfio_remove_dma(iommu, dma);
 	}
@@ -1239,15 +1478,19 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 {
 	dma_addr_t iova = dma->iova;
 	unsigned long vaddr = dma->vaddr;
+	struct vfio_batch batch;
 	size_t size = map_size;
 	long npage;
 	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret = 0;
 
+	vfio_batch_init(&batch);
+
 	while (size) {
 		/* Pin a contiguous chunk of memory */
 		npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
-					      size >> PAGE_SHIFT, &pfn, limit);
+					      size >> PAGE_SHIFT, &pfn, limit,
+					      &batch);
 		if (npage <= 0) {
 			WARN_ON(!npage);
 			ret = (int)npage;
@@ -1260,6 +1503,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		if (ret) {
 			vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
 						npage, true);
+			vfio_batch_unpin(&batch, dma);
 			break;
 		}
@@ -1267,6 +1511,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
 		dma->size += npage << PAGE_SHIFT;
 	}
 
+	vfio_batch_fini(&batch);
 	dma->iommu_mapped = true;
 
 	if (ret)
@@ -1299,6 +1544,7 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
 			   struct vfio_iommu_type1_dma_map *map)
 {
+	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
 	dma_addr_t iova = map->iova;
 	unsigned long vaddr = map->vaddr;
 	size_t size = map->size;
@@ -1316,13 +1562,16 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
 		prot |= IOMMU_READ;
 
+	if ((prot && set_vaddr) || (!prot && !set_vaddr))
+		return -EINVAL;
+
 	mutex_lock(&iommu->lock);
 
 	pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
 
 	WARN_ON((pgsize - 1) & PAGE_MASK);
 
-	if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
+	if (!size || (size | iova | vaddr) & (pgsize - 1)) {
 		ret = -EINVAL;
 		goto out_unlock;
 	}
@@ -1333,7 +1582,21 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
 		goto out_unlock;
 	}
 
-	if (vfio_find_dma(iommu, iova, size)) {
+	dma = vfio_find_dma(iommu, iova, size);
+	if (set_vaddr) {
+		if (!dma) {
+			ret = -ENOENT;
+		} else if (!dma->vaddr_invalid || dma->iova != iova ||
+			   dma->size != size) {
+			ret = -EINVAL;
+		} else {
+			dma->vaddr = vaddr;
+			dma->vaddr_invalid = false;
+			iommu->vaddr_invalid_count--;
+			wake_up_all(&iommu->vaddr_wait);
+		}
+		goto out_unlock;
+	} else if (dma) {
 		ret = -EEXIST;
 		goto out_unlock;
 	}
@@ -1425,16 +1688,23 @@ static int vfio_bus_type(struct device *dev, void *data)
 static int vfio_iommu_replay(struct vfio_iommu *iommu,
 			     struct vfio_domain *domain)
 {
+	struct vfio_batch batch;
 	struct vfio_domain *d = NULL;
 	struct rb_node *n;
 	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	int ret;
 
+	ret = vfio_wait_all_valid(iommu);
+	if (ret < 0)
+		return ret;
+
 	/* Arbitrarily pick the first domain in the list for lookups */
 	if (!list_empty(&iommu->domain_list))
 		d = list_first_entry(&iommu->domain_list,
 				     struct vfio_domain, next);
 
+	vfio_batch_init(&batch);
+
 	n = rb_first(&iommu->dma_list);
 
 	for (; n; n = rb_next(n)) {
@@ -1482,7 +1752,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 
 			npage = vfio_pin_pages_remote(dma, vaddr,
 						      n >> PAGE_SHIFT,
-						      &pfn, limit);
+						      &pfn, limit,
+						      &batch);
 			if (npage <= 0) {
 				WARN_ON(!npage);
 				ret = (int)npage;
@@ -1496,11 +1767,13 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 			ret = iommu_map(domain->domain, iova, phys,
 					size, dma->prot | domain->prot);
 			if (ret) {
-				if (!dma->iommu_mapped)
+				if (!dma->iommu_mapped) {
 					vfio_unpin_pages_remote(dma, iova,
 							phys >> PAGE_SHIFT,
 							size >> PAGE_SHIFT,
 							true);
+					vfio_batch_unpin(&batch, dma);
+				}
 				goto unwind;
 			}
@@ -1515,6 +1788,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		dma->iommu_mapped = true;
 	}
 
+	vfio_batch_fini(&batch);
 	return 0;
 
 unwind:
@@ -1555,6 +1829,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
 		}
 	}
 
+	vfio_batch_fini(&batch);
 	return ret;
 }
@@ -1622,33 +1897,6 @@ static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
 	return group;
 }
 
-static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
-{
-	struct vfio_domain *domain;
-	struct vfio_group *group;
-
-	list_for_each_entry(domain, &iommu->domain_list, next) {
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (!group->pinned_page_dirty_scope) {
-				iommu->pinned_page_dirty_scope = false;
-				return;
-			}
-		}
-	}
-
-	if (iommu->external_domain) {
-		domain = iommu->external_domain;
-		list_for_each_entry(group, &domain->group_list, next) {
-			if (!group->pinned_page_dirty_scope) {
-				iommu->pinned_page_dirty_scope = false;
-				return;
-			}
-		}
-	}
-
-	iommu->pinned_page_dirty_scope = true;
-}
-
 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
 				  phys_addr_t *base)
 {
@@ -2057,8 +2305,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		 * addition of a dirty tracking group.
 		 */
 		group->pinned_page_dirty_scope = true;
-		if (!iommu->pinned_page_dirty_scope)
-			update_pinned_page_dirty_scope(iommu);
 		mutex_unlock(&iommu->lock);
 
 		return 0;
@@ -2188,7 +2434,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 	 * demotes the iommu scope until it declares itself dirty tracking
 	 * capable via the page pinning interface.
 	 */
-	iommu->pinned_page_dirty_scope = false;
+	iommu->num_non_pinned_groups++;
 	mutex_unlock(&iommu->lock);
 	vfio_iommu_resv_free(&group_resv_regions);
@@ -2238,23 +2484,6 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
 	}
 }
 
-static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
-{
-	struct rb_node *n;
-
-	n = rb_first(&iommu->dma_list);
-	for (; n; n = rb_next(n)) {
-		struct vfio_dma *dma;
-
-		dma = rb_entry(n, struct vfio_dma, node);
-
-		if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
-			break;
-	}
-	/* mdev vendor driver must unregister notifier */
-	WARN_ON(iommu->notifier.head);
-}
-
 /*
  * Called when a domain is removed in detach. It is possible that
  * the removed domain decided the iova aperture window. Modify the
@@ -2354,10 +2583,10 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		kfree(group);
 
 		if (list_empty(&iommu->external_domain->group_list)) {
-			vfio_sanity_check_pfn_list(iommu);
-
-			if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
+			if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+				WARN_ON(iommu->notifier.head);
 				vfio_iommu_unmap_unpin_all(iommu);
+			}
 
 			kfree(iommu->external_domain);
 			iommu->external_domain = NULL;
@@ -2391,11 +2620,13 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 		 */
 		if (list_empty(&domain->group_list)) {
 			if (list_is_singular(&iommu->domain_list)) {
-				if (!iommu->external_domain)
+				if (!iommu->external_domain) {
+					WARN_ON(iommu->notifier.head);
 					vfio_iommu_unmap_unpin_all(iommu);
-				else
+				} else {
 					vfio_iommu_unmap_unpin_reaccount(iommu);
+				}
 			}
 			iommu_domain_free(domain->domain);
 			list_del(&domain->next);
 			kfree(domain);
@@ -2415,8 +2646,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
 	 * Removal of a group without dirty tracking may allow the iommu scope
 	 * to be promoted.
 	 */
-	if (update_dirty_scope)
-		update_pinned_page_dirty_scope(iommu);
+	if (update_dirty_scope) {
+		iommu->num_non_pinned_groups--;
+		if (iommu->dirty_page_tracking)
+			vfio_iommu_populate_bitmap_full(iommu);
+	}
 	mutex_unlock(&iommu->lock);
 }
@@ -2446,8 +2680,10 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 	INIT_LIST_HEAD(&iommu->iova_list);
 	iommu->dma_list = RB_ROOT;
 	iommu->dma_avail = dma_entry_limit;
+	iommu->container_open = true;
 	mutex_init(&iommu->lock);
 	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
+	init_waitqueue_head(&iommu->vaddr_wait);
 
 	return iommu;
 }
@@ -2475,7 +2711,6 @@ static void vfio_iommu_type1_release(void *iommu_data)
 
 	if (iommu->external_domain) {
 		vfio_release_domain(iommu->external_domain, true);
-		vfio_sanity_check_pfn_list(iommu);
 		kfree(iommu->external_domain);
 	}
@@ -2517,6 +2752,8 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
 	case VFIO_TYPE1_IOMMU:
 	case VFIO_TYPE1v2_IOMMU:
 	case VFIO_TYPE1_NESTING_IOMMU:
+	case VFIO_UNMAP_ALL:
+	case VFIO_UPDATE_VADDR:
 		return 1;
 	case VFIO_DMA_CC_IOMMU:
 		if (!iommu)
@@ -2688,7 +2925,8 @@ static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
 {
 	struct vfio_iommu_type1_dma_map map;
 	unsigned long minsz;
-	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+	uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
+			VFIO_DMA_MAP_FLAG_VADDR;
 
 	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
@@ -2706,6 +2944,9 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
 {
 	struct vfio_iommu_type1_dma_unmap unmap;
 	struct vfio_bitmap bitmap = { 0 };
+	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
+			VFIO_DMA_UNMAP_FLAG_VADDR |
+			VFIO_DMA_UNMAP_FLAG_ALL;
 	unsigned long minsz;
 	int ret;
@@ -2714,8 +2955,12 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
 	if (copy_from_user(&unmap, (void __user *)arg, minsz))
 		return -EFAULT;
 
-	if (unmap.argsz < minsz ||
-	    unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
+	if (unmap.argsz < minsz || unmap.flags & ~mask)
+		return -EINVAL;
+
+	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+	    (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
+			    VFIO_DMA_UNMAP_FLAG_VADDR)))
 		return -EINVAL;
 
 	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
@@ -2906,12 +3151,13 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
 	struct vfio_dma *dma;
 	bool kthread = current->mm == NULL;
 	size_t offset;
+	int ret;
 
 	*copied = 0;
 
-	dma = vfio_find_dma(iommu, user_iova, 1);
-	if (!dma)
-		return -EINVAL;
+	ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
+	if (ret < 0)
+		return ret;
 
 	if ((write && !(dma->prot & IOMMU_WRITE)) ||
 	    !(dma->prot & IOMMU_READ))
@@ -3003,6 +3249,19 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data,
 	return domain;
 }
 
+static void vfio_iommu_type1_notify(void *iommu_data,
+				    enum vfio_iommu_notify_type event)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	if (event != VFIO_IOMMU_CONTAINER_CLOSE)
+		return;
+	mutex_lock(&iommu->lock);
+	iommu->container_open = false;
+	mutex_unlock(&iommu->lock);
+	wake_up_all(&iommu->vaddr_wait);
+}
+
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.name			= "vfio-iommu-type1",
 	.owner			= THIS_MODULE,
@@ -3017,6 +3276,7 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
 	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
 	.dma_rw			= vfio_iommu_type1_dma_rw,
 	.group_iommu_domain	= vfio_iommu_type1_group_iommu_domain,
+	.notify			= vfio_iommu_type1_notify,
 };
 
 static int __init vfio_iommu_type1_init(void)
...
@@ -57,6 +57,11 @@ extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 extern void *vfio_device_data(struct vfio_device *device);
 
+/* events for the backend driver notify callback */
+enum vfio_iommu_notify_type {
+	VFIO_IOMMU_CONTAINER_CLOSE = 0,
+};
+
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
  */
@@ -92,6 +97,8 @@ struct vfio_iommu_driver_ops {
 					   void *data, size_t count, bool write);
 	struct iommu_domain *(*group_iommu_domain)(void *iommu_data,
 						   struct iommu_group *group);
+	void		(*notify)(void *iommu_data,
+				  enum vfio_iommu_notify_type event);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
...
@@ -46,6 +46,12 @@
  */
 #define VFIO_NOIOMMU_IOMMU		8
 
+/* Supports VFIO_DMA_UNMAP_FLAG_ALL */
+#define VFIO_UNMAP_ALL			9
+
+/* Supports the vaddr flag for DMA map and unmap */
+#define VFIO_UPDATE_VADDR		10
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
@@ -1074,12 +1080,22 @@ struct vfio_iommu_type1_info_dma_avail {
  *
  * Map process virtual addresses to IO virtual addresses using the
  * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and
+ * unblock translation of host virtual addresses in the iova range. The vaddr
+ * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To
+ * maintain memory consistency within the user application, the updated vaddr
+ * must address the same memory object as originally mapped. Failure to do so
+ * will result in user memory corruption and/or device misbehavior. iova and
+ * size must match those in the original MAP_DMA call. Protection is not
+ * changed, and the READ & WRITE flags must be 0.
  */
 struct vfio_iommu_type1_dma_map {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
 	__u64	vaddr;				/* Process virtual address */
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
@@ -1102,6 +1118,7 @@ struct vfio_bitmap {
 * field.  No guarantee is made to the user that arbitrary unmaps of iova
 * or size different from those used in the original mapping call will
 * succeed.
+ *
 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get the dirty bitmap
 * before unmapping IO virtual addresses. When this flag is set, the user must
 * provide a struct vfio_bitmap in data[]. User must provide zero-allocated
@@ -1111,11 +1128,21 @@ struct vfio_bitmap {
 * indicates that the page at that offset from iova is dirty. A Bitmap of the
 * pages in the range of unmapped size is returned in the user-provided
 * vfio_bitmap.data.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_ALL, unmap all addresses.  iova and size
+ * must be 0.  This cannot be combined with the get-dirty-bitmap flag.
+ *
+ * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host
+ * virtual addresses in the iova range.  Tasks that attempt to translate an
+ * iova's vaddr will block.  DMA to already-mapped pages continues.  This
+ * cannot be combined with the get-dirty-bitmap flag.
 */
 struct vfio_iommu_type1_dma_unmap {
 	__u32	argsz;
 	__u32	flags;
 #define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0)
+#define VFIO_DMA_UNMAP_FLAG_ALL		     (1 << 1)
+#define VFIO_DMA_UNMAP_FLAG_VADDR	     (1 << 2)
 	__u64	iova;				/* IO virtual address */
 	__u64	size;				/* Size of mapping (bytes) */
 	__u8    data[];
...
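
For completeness, a similarly hedged sketch of the unmap-all path documented just above: iova and size must be zero with VFIO_DMA_UNMAP_FLAG_ALL, support is advertised through the VFIO_UNMAP_ALL extension, and the container fd is again an illustrative placeholder (same includes as the earlier sketch).

int unmap_all(int container_fd)
{
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.flags = VFIO_DMA_UNMAP_FLAG_ALL,
		.iova = 0,	/* must be 0 with the ALL flag */
		.size = 0,	/* must be 0 with the ALL flag */
	};

	if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_UNMAP_ALL))
		return -1;	/* kernel does not support unmap-all */

	return ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap);
}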