Commit d75671e3 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'vfio-v3.12-rc0' of git://github.com/awilliam/linux-vfio

Pull VFIO update from Alex Williamson:
 "VFIO updates include safer default file flags for VFIO device fds, an
  external user interface exported to allow other modules to hold
  references to VFIO groups, a fix to test for extended config space on
  PCIe and PCI-X, and new hot reset interfaces for PCI devices which
  allow the user to do PCI bus/slot resets when all of the devices
  affected by the reset are owned by the user.

  For this last feature, the PCI bus reset interface, I depend on
  changes already merged from Bjorn's PCI pull request.  I therefore
  merged my tree up to commit cb3e4330, which I think was the correct
  action, but as Stephen Rothwell noted, I failed to provide a commit
  message indicating why the merge was required.  Sorry for that.
  Thanks, Alex"

* tag 'vfio-v3.12-rc0' of git://github.com/awilliam/linux-vfio:
  vfio: fix documentation
  vfio-pci: PCI hot reset interface
  vfio-pci: Test for extended config space
  vfio-pci: Use fdget() rather than eventfd_fget()
  vfio: Add O_CLOEXEC flag to vfio device fd
  vfio: use get_unused_fd_flags(0) instead of get_unused_fd()
  vfio: add external user support
parents bf97293e dac09b57
...@@ -167,8 +167,8 @@ group and can access them as follows: ...@@ -167,8 +167,8 @@ group and can access them as follows:
int container, group, device, i; int container, group, device, i;
struct vfio_group_status group_status = struct vfio_group_status group_status =
{ .argsz = sizeof(group_status) }; { .argsz = sizeof(group_status) };
struct vfio_iommu_x86_info iommu_info = { .argsz = sizeof(iommu_info) }; struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
struct vfio_iommu_x86_dma_map dma_map = { .argsz = sizeof(dma_map) }; struct vfio_iommu_type1_dma_map dma_map = { .argsz = sizeof(dma_map) };
struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
/* Create a new container */ /* Create a new container */
...@@ -193,7 +193,7 @@ group and can access them as follows: ...@@ -193,7 +193,7 @@ group and can access them as follows:
ioctl(group, VFIO_GROUP_SET_CONTAINER, &container); ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
/* Enable the IOMMU model we want */ /* Enable the IOMMU model we want */
ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
/* Get additional IOMMU info */ /* Get additional IOMMU info */
ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info); ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
...@@ -229,7 +229,7 @@ group and can access them as follows: ...@@ -229,7 +229,7 @@ group and can access them as follows:
irq.index = i; irq.index = i;
ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &reg); ioctl(device, VFIO_DEVICE_GET_IRQ_INFO, &irq);
/* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */ /* Setup IRQs... eventfds, VFIO_DEVICE_SET_IRQS */
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/device.h> #include <linux/device.h>
#include <linux/eventfd.h> #include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/iommu.h> #include <linux/iommu.h>
#include <linux/module.h> #include <linux/module.h>
...@@ -227,6 +228,110 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) ...@@ -227,6 +228,110 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
return 0; return 0;
} }
/*
 * Bus-walk callback that counts devices.
 *
 * @pdev: device being visited (not inspected).
 * @data: pointer to an int counter, bumped by one on every call.
 *
 * Always returns 0.
 */
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
{
	int *counter = data;

	*counter += 1;

	return 0;
}
/*
 * Cursor state passed to vfio_pci_fill_devs() while it records the
 * devices visited by a bus/slot walk.
 */
struct vfio_pci_fill_info {
/* Capacity of devices[]; the fill callback fails with -EAGAIN once cur reaches it */
int max;
/* Index of the next entry of devices[] to be written */
int cur;
/* Output array, one entry written per visited device */
struct vfio_pci_dependent_device *devices;
};
/*
 * Bus-walk callback that records one device into the caller's array.
 *
 * @pdev: device being visited.
 * @data: pointer to a struct vfio_pci_fill_info cursor.
 *
 * Returns 0 after storing the device's IOMMU group ID, PCI domain, bus
 * number and devfn in the next free entry; -EAGAIN when the array is
 * already full; -EPERM when the device has no IOMMU group.
 */
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_fill_info *info = data;
	struct iommu_group *grp;
	int idx;

	/* More devices than were counted: something changed, try again */
	if (info->cur == info->max)
		return -EAGAIN;

	grp = iommu_group_get(&pdev->dev);
	if (!grp)
		return -EPERM; /* Cannot reset non-isolated devices */

	idx = info->cur;
	info->devices[idx].group_id = iommu_group_id(grp);
	info->devices[idx].segment = pci_domain_nr(pdev->bus);
	info->devices[idx].bus = pdev->bus->number;
	info->devices[idx].devfn = pdev->devfn;
	info->cur = idx + 1;

	iommu_group_put(grp);

	return 0;
}
/*
 * One user-supplied VFIO group, as collected by the hot reset ioctl.
 */
struct vfio_pci_group_entry {
/* Group reference obtained through vfio_group_get_external_user() */
struct vfio_group *group;
/* IOMMU group ID reported by vfio_external_user_iommu_id() for that group */
int id;
};
/*
 * Set of groups handed to vfio_pci_validate_devs() as its callback data.
 */
struct vfio_pci_group_info {
/* Number of valid entries in groups[] */
int count;
/* Array of group entries whose IDs are matched against each device */
struct vfio_pci_group_entry *groups;
};
/*
 * Bus-walk callback that checks a device belongs to one of the groups
 * supplied by the caller.
 *
 * @pdev: device being visited.
 * @data: pointer to a struct vfio_pci_group_info listing the allowed
 *        IOMMU group IDs.
 *
 * Returns 0 when the device's IOMMU group ID is in the list, -EINVAL
 * when it is not, and -EPERM when the device has no IOMMU group.
 */
static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_group_info *info = data;
	struct iommu_group *grp;
	int ret = -EINVAL;
	int wanted, idx;

	grp = iommu_group_get(&pdev->dev);
	if (!grp)
		return -EPERM;

	wanted = iommu_group_id(grp);

	for (idx = 0; idx < info->count; idx++) {
		if (info->groups[idx].id == wanted) {
			ret = 0;
			break;
		}
	}

	iommu_group_put(grp);

	return ret;
}
/*
 * Test whether @pdev sits in, or below, @slot.
 *
 * Climbs from @pdev through each parent bridge (bus->self) until a
 * device on the slot's own bus is found, then reports whether that
 * device occupies @slot. Returns false if the top of the hierarchy is
 * reached without ever hitting the slot's bus.
 */
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
{
	while (pdev) {
		if (pdev->bus == slot->bus)
			return pdev->slot == slot;

		pdev = pdev->bus->self;
	}

	return false;
}
/*
 * Context for vfio_pci_walk_wrapper(): the real callback plus the
 * optional slot filter applied before invoking it.
 */
struct vfio_pci_walk_info {
/* Callback invoked for every device that passes the filter */
int (*fn)(struct pci_dev *, void *data);
/* Opaque argument forwarded to fn */
void *data;
/* Device whose slot is used as the filter reference */
struct pci_dev *pdev;
/* When true, only devices below pdev's slot are passed to fn */
bool slot;
/* Result of the most recent fn call; starts out as set by the initializer */
int ret;
};
/*
 * pci_walk_bus() callback that applies the optional slot filter.
 *
 * @pdev: device being visited.
 * @data: pointer to a struct vfio_pci_walk_info.
 *
 * When slot filtering is on and @pdev is not below the reference slot,
 * the wrapped callback is skipped and the previously stored result is
 * returned unchanged. Otherwise the callback runs and its result is
 * both stored and returned.
 */
static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
{
	struct vfio_pci_walk_info *ctx = data;

	if (ctx->slot && !vfio_pci_dev_below_slot(pdev, ctx->pdev->slot))
		return ctx->ret;

	ctx->ret = ctx->fn(pdev, ctx->data);

	return ctx->ret;
}
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
int (*fn)(struct pci_dev *,
void *data), void *data,
bool slot)
{
struct vfio_pci_walk_info walk = {
.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
};
pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
return walk.ret;
}
static long vfio_pci_ioctl(void *device_data, static long vfio_pci_ioctl(void *device_data,
unsigned int cmd, unsigned long arg) unsigned int cmd, unsigned long arg)
{ {
...@@ -407,10 +512,189 @@ static long vfio_pci_ioctl(void *device_data, ...@@ -407,10 +512,189 @@ static long vfio_pci_ioctl(void *device_data,
return ret; return ret;
} else if (cmd == VFIO_DEVICE_RESET) } else if (cmd == VFIO_DEVICE_RESET) {
return vdev->reset_works ? return vdev->reset_works ?
pci_reset_function(vdev->pdev) : -EINVAL; pci_reset_function(vdev->pdev) : -EINVAL;
} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
struct vfio_pci_hot_reset_info hdr;
struct vfio_pci_fill_info fill = { 0 };
struct vfio_pci_dependent_device *devices = NULL;
bool slot = false;
int ret = 0;
minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
if (copy_from_user(&hdr, (void __user *)arg, minsz))
return -EFAULT;
if (hdr.argsz < minsz)
return -EINVAL;
hdr.flags = 0;
/* Can we do a slot or bus reset or neither? */
if (!pci_probe_reset_slot(vdev->pdev->slot))
slot = true;
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
/* How many devices are affected? */
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
vfio_pci_count_devs,
&fill.max, slot);
if (ret)
return ret;
WARN_ON(!fill.max); /* Should always be at least one */
/*
* If there's enough space, fill it now, otherwise return
* -ENOSPC and the number of devices affected.
*/
if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
ret = -ENOSPC;
hdr.count = fill.max;
goto reset_info_exit;
}
devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
if (!devices)
return -ENOMEM;
fill.devices = devices;
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
vfio_pci_fill_devs,
&fill, slot);
/*
* If a device was removed between counting and filling,
* we may come up short of fill.max. If a device was
* added, we'll have a return of -EAGAIN above.
*/
if (!ret)
hdr.count = fill.cur;
reset_info_exit:
if (copy_to_user((void __user *)arg, &hdr, minsz))
ret = -EFAULT;
if (!ret) {
if (copy_to_user((void __user *)(arg + minsz), devices,
hdr.count * sizeof(*devices)))
ret = -EFAULT;
}
kfree(devices);
return ret;
} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
struct vfio_pci_hot_reset hdr;
int32_t *group_fds;
struct vfio_pci_group_entry *groups;
struct vfio_pci_group_info info;
bool slot = false;
int i, count = 0, ret = 0;
minsz = offsetofend(struct vfio_pci_hot_reset, count);
if (copy_from_user(&hdr, (void __user *)arg, minsz))
return -EFAULT;
if (hdr.argsz < minsz || hdr.flags)
return -EINVAL;
/* Can we do a slot or bus reset or neither? */
if (!pci_probe_reset_slot(vdev->pdev->slot))
slot = true;
else if (pci_probe_reset_bus(vdev->pdev->bus))
return -ENODEV;
/*
* We can't let userspace give us an arbitrarily large
* buffer to copy, so verify how many we think there
* could be. Note groups can have multiple devices so
* one group per device is the max.
*/
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
vfio_pci_count_devs,
&count, slot);
if (ret)
return ret;
/* Somewhere between 1 and count is OK */
if (!hdr.count || hdr.count > count)
return -EINVAL;
group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL);
if (!group_fds || !groups) {
kfree(group_fds);
kfree(groups);
return -ENOMEM;
}
if (copy_from_user(group_fds, (void __user *)(arg + minsz),
hdr.count * sizeof(*group_fds))) {
kfree(group_fds);
kfree(groups);
return -EFAULT;
}
/*
* For each group_fd, get the group through the vfio external
* user interface and store the group and iommu ID. This
* ensures the group is held across the reset.
*/
for (i = 0; i < hdr.count; i++) {
struct vfio_group *group;
struct fd f = fdget(group_fds[i]);
if (!f.file) {
ret = -EBADF;
break;
}
group = vfio_group_get_external_user(f.file);
fdput(f);
if (IS_ERR(group)) {
ret = PTR_ERR(group);
break;
}
groups[i].group = group;
groups[i].id = vfio_external_user_iommu_id(group);
}
kfree(group_fds);
/* release reference to groups on error */
if (ret)
goto hot_reset_release;
info.count = hdr.count;
info.groups = groups;
/*
* Test whether all the affected devices are contained
* by the set of groups provided by the user.
*/
ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
vfio_pci_validate_devs,
&info, slot);
if (!ret)
/* User has access, do the reset */
ret = slot ? pci_reset_slot(vdev->pdev->slot) :
pci_reset_bus(vdev->pdev->bus);
hot_reset_release:
for (i--; i >= 0; i--)
vfio_group_put_external_user(groups[i].group);
kfree(groups);
return ret;
}
return -ENOTTY; return -ENOTTY;
} }
......
...@@ -1012,6 +1012,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) ...@@ -1012,6 +1012,7 @@ static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos)
static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
{ {
struct pci_dev *pdev = vdev->pdev; struct pci_dev *pdev = vdev->pdev;
u32 dword;
u16 word; u16 word;
u8 byte; u8 byte;
int ret; int ret;
...@@ -1025,7 +1026,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) ...@@ -1025,7 +1026,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
return pcibios_err_to_errno(ret); return pcibios_err_to_errno(ret);
if (PCI_X_CMD_VERSION(word)) { if (PCI_X_CMD_VERSION(word)) {
vdev->extended_caps = true; /* Test for extended capabilities */
pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
vdev->extended_caps = (dword != 0);
return PCI_CAP_PCIX_SIZEOF_V2; return PCI_CAP_PCIX_SIZEOF_V2;
} else } else
return PCI_CAP_PCIX_SIZEOF_V0; return PCI_CAP_PCIX_SIZEOF_V0;
...@@ -1037,9 +1040,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) ...@@ -1037,9 +1040,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
return byte; return byte;
case PCI_CAP_ID_EXP: case PCI_CAP_ID_EXP:
/* length based on version */ /* Test for extended capabilities */
vdev->extended_caps = true; pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword);
vdev->extended_caps = (dword != 0);
/* length based on version */
if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
else else
......
...@@ -130,8 +130,8 @@ static int virqfd_enable(struct vfio_pci_device *vdev, ...@@ -130,8 +130,8 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
void (*thread)(struct vfio_pci_device *, void *), void (*thread)(struct vfio_pci_device *, void *),
void *data, struct virqfd **pvirqfd, int fd) void *data, struct virqfd **pvirqfd, int fd)
{ {
struct file *file = NULL; struct fd irqfd;
struct eventfd_ctx *ctx = NULL; struct eventfd_ctx *ctx;
struct virqfd *virqfd; struct virqfd *virqfd;
int ret = 0; int ret = 0;
unsigned int events; unsigned int events;
...@@ -149,16 +149,16 @@ static int virqfd_enable(struct vfio_pci_device *vdev, ...@@ -149,16 +149,16 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
INIT_WORK(&virqfd->shutdown, virqfd_shutdown); INIT_WORK(&virqfd->shutdown, virqfd_shutdown);
INIT_WORK(&virqfd->inject, virqfd_inject); INIT_WORK(&virqfd->inject, virqfd_inject);
file = eventfd_fget(fd); irqfd = fdget(fd);
if (IS_ERR(file)) { if (!irqfd.file) {
ret = PTR_ERR(file); ret = -EBADF;
goto fail; goto err_fd;
} }
ctx = eventfd_ctx_fileget(file); ctx = eventfd_ctx_fileget(irqfd.file);
if (IS_ERR(ctx)) { if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx); ret = PTR_ERR(ctx);
goto fail; goto err_ctx;
} }
virqfd->eventfd = ctx; virqfd->eventfd = ctx;
...@@ -174,7 +174,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev, ...@@ -174,7 +174,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
if (*pvirqfd) { if (*pvirqfd) {
spin_unlock_irq(&vdev->irqlock); spin_unlock_irq(&vdev->irqlock);
ret = -EBUSY; ret = -EBUSY;
goto fail; goto err_busy;
} }
*pvirqfd = virqfd; *pvirqfd = virqfd;
...@@ -187,7 +187,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev, ...@@ -187,7 +187,7 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
events = file->f_op->poll(file, &virqfd->pt); events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
/* /*
* Check if there was an event already pending on the eventfd * Check if there was an event already pending on the eventfd
...@@ -202,17 +202,14 @@ static int virqfd_enable(struct vfio_pci_device *vdev, ...@@ -202,17 +202,14 @@ static int virqfd_enable(struct vfio_pci_device *vdev,
* Do not drop the file until the irqfd is fully initialized, * Do not drop the file until the irqfd is fully initialized,
* otherwise we might race against the POLLHUP. * otherwise we might race against the POLLHUP.
*/ */
fput(file); fdput(irqfd);
return 0; return 0;
err_busy:
fail: eventfd_ctx_put(ctx);
if (ctx && !IS_ERR(ctx)) err_ctx:
eventfd_ctx_put(ctx); fdput(irqfd);
err_fd:
if (file && !IS_ERR(file))
fput(file);
kfree(virqfd); kfree(virqfd);
return ret; return ret;
......
...@@ -1109,7 +1109,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) ...@@ -1109,7 +1109,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
* We can't use anon_inode_getfd() because we need to modify * We can't use anon_inode_getfd() because we need to modify
* the f_mode flags directly to allow more than just ioctls * the f_mode flags directly to allow more than just ioctls
*/ */
ret = get_unused_fd(); ret = get_unused_fd_flags(O_CLOEXEC);
if (ret < 0) { if (ret < 0) {
device->ops->release(device->device_data); device->ops->release(device->device_data);
break; break;
...@@ -1352,6 +1352,68 @@ static const struct file_operations vfio_device_fops = { ...@@ -1352,6 +1352,68 @@ static const struct file_operations vfio_device_fops = {
.mmap = vfio_device_fops_mmap, .mmap = vfio_device_fops_mmap,
}; };
/**
* External user API, exported as GPL symbols so that other kernel
* modules can hold references to VFIO groups.
*
* The protocol is:
* 1. User space performs the normal VFIO initialization:
* - opening a new container;
* - attaching group(s) to it;
* - setting an IOMMU driver for the container.
* Once an IOMMU is set for a container, all groups in it are
* considered ready for use by an external user.
*
* 2. User space passes a group fd to an external user.
* The external user calls vfio_group_get_external_user()
* to verify that:
* - the group is initialized;
* - an IOMMU is set for it.
* If both checks pass, vfio_group_get_external_user()
* increments the container user counter to prevent
* the VFIO group from being disposed of while the external
* user (e.g. KVM) is still using it.
*
* 3. The external user calls vfio_external_user_iommu_id()
* to obtain the IOMMU group ID.
*
* 4. When the external user (e.g. KVM) is finished, it calls
* vfio_group_put_external_user() to release the VFIO group.
* This call decrements the container user counter.
*/
/*
 * Take a reference on the VFIO group behind @filep on behalf of an
 * external (in-kernel) user.
 *
 * @filep: file expected to be an open VFIO group fd.
 *
 * Returns the group on success. Returns ERR_PTR(-EINVAL) when @filep is
 * not a VFIO group file, when the group has no container users, or when
 * its container has no IOMMU driver or the group is not viable. On
 * success both the container user count and the group reference have
 * been incremented; release with vfio_group_put_external_user().
 */
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group;

	/* Only files backed by the VFIO group fops carry a vfio_group */
	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	group = filep->private_data;

	if (!atomic_inc_not_zero(&group->container_users))
		return ERR_PTR(-EINVAL);

	if (group->container->iommu_driver && vfio_group_viable(group)) {
		vfio_group_get(group);
		return group;
	}

	/* Not usable: undo the container user count taken above */
	atomic_dec(&group->container_users);

	return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
/*
 * Release a group previously obtained with
 * vfio_group_get_external_user().
 *
 * @group: group to release.
 *
 * Drops the group reference first, then calls
 * vfio_group_try_dissolve_container() on the group.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_put(group);

	vfio_group_try_dissolve_container(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
/*
 * Report the IOMMU group ID of a VFIO group.
 *
 * @group: VFIO group to query.
 *
 * Returns the value of iommu_group_id() for the group's iommu_group.
 */
int vfio_external_user_iommu_id(struct vfio_group *group)
{
	int id = iommu_group_id(group->iommu_group);

	return id;
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
/** /**
* Module/class support * Module/class support
*/ */
......
...@@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver( ...@@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
TYPE tmp; \ TYPE tmp; \
offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \ offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
/*
* External user API
*/
extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
extern void vfio_group_put_external_user(struct vfio_group *group);
extern int vfio_external_user_iommu_id(struct vfio_group *group);
#endif /* VFIO_H */ #endif /* VFIO_H */
...@@ -324,6 +324,44 @@ enum { ...@@ -324,6 +324,44 @@ enum {
VFIO_PCI_NUM_IRQS VFIO_PCI_NUM_IRQS
}; };
/**
* VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12,
* struct vfio_pci_hot_reset_info)
*
* Return: 0 on success, -errno on failure:
* -ENOSPC = insufficient buffer, -ENODEV = unsupported for device.
*/
/*
 * Describes one device reported by VFIO_DEVICE_GET_PCI_HOT_RESET_INFO.
 */
struct vfio_pci_dependent_device {
/* IOMMU group ID of the device */
__u32 group_id;
/* PCI domain number of the device's bus */
__u16 segment;
/* Bus number of the device */
__u8 bus;
__u8 devfn; /* Use PCI_SLOT/PCI_FUNC */
};
/*
 * Argument of VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: a fixed header followed
 * by a variable-length array of affected devices.
 */
struct vfio_pci_hot_reset_info {
/* Size of the user buffer; too small for the header gives -EINVAL, too small for all entries gives -ENOSPC */
__u32 argsz;
/* Written as 0 by the handler */
__u32 flags;
/* Devices affected: the number required on -ENOSPC, the number filled in on success */
__u32 count;
/* Entries copied out immediately after the fixed header */
struct vfio_pci_dependent_device devices[];
};
#define VFIO_DEVICE_GET_PCI_HOT_RESET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
/**
* VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13,
* struct vfio_pci_hot_reset)
*
* Return: 0 on success, -errno on failure.
*/
/*
 * Argument of VFIO_DEVICE_PCI_HOT_RESET: a fixed header followed by the
 * file descriptors of the VFIO groups the caller owns.
 */
struct vfio_pci_hot_reset {
/* Size of the user buffer; must cover at least the fixed header */
__u32 argsz;
/* Must be zero, otherwise the handler returns -EINVAL */
__u32 flags;
/* Number of entries in group_fds[]; must be at least 1 and at most the number of affected devices */
__u32 count;
/* VFIO group file descriptors, read from directly after the fixed header */
__s32 group_fds[];
};
#define VFIO_DEVICE_PCI_HOT_RESET _IO(VFIO_TYPE, VFIO_BASE + 13)
/* -------- API for Type1 VFIO IOMMU -------- */ /* -------- API for Type1 VFIO IOMMU -------- */
/** /**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment