Commit 0b2e3b6b authored by Linus Torvalds

Merge tag 'vfio-for-v3.10' of git://github.com/awilliam/linux-vfio

Pull vfio updates from Alex Williamson:
 "Changes include extension to support PCI AER notification to
  userspace, byte granularity of PCI config space and access to
  unarchitected PCI config space, better protection around IOMMU driver
  accesses, default file mode fix, and a few misc cleanups."

* tag 'vfio-for-v3.10' of git://github.com/awilliam/linux-vfio:
  vfio: Set container device mode
  vfio: Use down_reads to protect iommu disconnects
  vfio: Convert container->group_lock to rwsem
  PCI/VFIO: use pcie_flags_reg instead of access PCI-E Capabilities Register
  vfio-pci: Enable raw access to unassigned config space
  vfio-pci: Use byte granularity in config map
  vfio: make local function vfio_pci_intx_unmask_handler() static
  VFIO-AER: Vfio-pci driver changes for supporting AER
  VFIO: Wrapper for getting reference to vfio_device
parents e9589300 664e9386
...@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) ...@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
} }
} } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
if (pci_is_pcie(vdev->pdev))
return 1;
return 0; return 0;
} }
...@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data, ...@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
return -EINVAL; return -EINVAL;
switch (info.index) {
case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
break;
case VFIO_PCI_ERR_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
/* pass thru to return error */
default:
return -EINVAL;
}
info.flags = VFIO_IRQ_INFO_EVENTFD; info.flags = VFIO_IRQ_INFO_EVENTFD;
info.count = vfio_pci_get_irq_count(vdev, info.index); info.count = vfio_pci_get_irq_count(vdev, info.index);
...@@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev) ...@@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
kfree(vdev); kfree(vdev);
} }
/*
 * PCI error_detected callback: forward the error to userspace.
 *
 * Takes a temporary vfio_device reference for @pdev, signals the
 * err_trigger eventfd if one is registered, and drops the reference
 * again.  @state is not examined.
 *
 * Returns PCI_ERS_RESULT_DISCONNECT when no vfio_device or no device
 * data can be found for @pdev, PCI_ERS_RESULT_CAN_RECOVER otherwise.
 */
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
						  pci_channel_state_t state)
{
	pci_ers_result_t result = PCI_ERS_RESULT_DISCONNECT;
	struct vfio_device *device = vfio_device_get_from_dev(&pdev->dev);
	struct vfio_pci_device *vdev;

	if (!device)
		return result;

	vdev = vfio_device_data(device);
	if (vdev) {
		/* Notify userspace only if it registered an eventfd */
		if (vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		result = PCI_ERS_RESULT_CAN_RECOVER;
	}

	/* Balance the reference taken by vfio_device_get_from_dev() */
	vfio_device_put(device);

	return result;
}
/*
 * PCI error handler table for vfio-pci.  Only the error_detected
 * callback is populated; it is routed to vfio_pci_aer_err_detected().
 * The table is attached to the driver through vfio_pci_driver.err_handler.
 */
static struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
};
static struct pci_driver vfio_pci_driver = { static struct pci_driver vfio_pci_driver = {
.name = "vfio-pci", .name = "vfio-pci",
.id_table = NULL, /* only dynamic ids */ .id_table = NULL, /* only dynamic ids */
.probe = vfio_pci_probe, .probe = vfio_pci_probe,
.remove = vfio_pci_remove, .remove = vfio_pci_remove,
.err_handler = &vfio_err_handlers,
}; };
static void __exit vfio_pci_cleanup(void) static void __exit vfio_pci_cleanup(void)
......
...@@ -274,7 +274,8 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, ...@@ -274,7 +274,8 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
return count; return count;
} }
static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, /* Raw access skips any kind of virtualization */
static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
int count, struct perm_bits *perm, int count, struct perm_bits *perm,
int offset, __le32 val) int offset, __le32 val)
{ {
...@@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos, ...@@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
return count; return count;
} }
/* Raw read: skips virtualization and reads through vfio_user_config_read() */ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
int count, struct perm_bits *perm,
int offset, __le32 *val)
{
int ret;
/* perm and offset are not consulted on the raw path */
ret = vfio_user_config_read(vdev->pdev, pos, val, count);
/* A non-zero result is translated with pcibios_err_to_errno() */
if (ret)
return pcibios_err_to_errno(ret);
/* Success: report the number of bytes requested */
return count;
}
/* Default capability regions to read-only, no-virtualization */
static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
}; };
static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
}; };
/*
 * Default unassigned regions to raw read-write access.  Some devices
 * require this to function as they hide registers between the gaps in
 * config space (be2net).  Like MMIO and I/O port registers, we have
 * to trust the hardware isolation.
 *
 * vfio_config_do_rw() selects this table for any config-space byte
 * whose map entry is PCI_CAP_ID_INVALID, so both accessors are the
 * raw (non-virtualized) variants.
 */
static struct perm_bits unassigned_perms = {
.readfn = vfio_raw_config_read,
.writefn = vfio_raw_config_write
};
static void free_perm_bits(struct perm_bits *perm) static void free_perm_bits(struct perm_bits *perm)
{ {
...@@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void) ...@@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void)
/* Capabilities */ /* Capabilities */
ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write; cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write; cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
/* Extended capabilities */ /* Extended capabilities */
ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write; ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
if (ret) if (ret)
vfio_pci_uninit_perm_bits(); vfio_pci_uninit_perm_bits();
...@@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) ...@@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
u8 cap; u8 cap;
int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
PCI_STD_HEADER_SIZEOF; PCI_STD_HEADER_SIZEOF;
base /= 4;
pos /= 4;
cap = vdev->pci_config_map[pos]; cap = vdev->pci_config_map[pos];
if (cap == PCI_CAP_ID_BASIC) if (cap == PCI_CAP_ID_BASIC)
...@@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) ...@@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
pos--; pos--;
return pos * 4; return pos;
} }
static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
...@@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) ...@@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
return byte; return byte;
case PCI_CAP_ID_EXP: case PCI_CAP_ID_EXP:
/* length based on version */ /* length based on version */
ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
if (ret)
return pcibios_err_to_errno(ret);
vdev->extended_caps = true; vdev->extended_caps = true;
if ((word & PCI_EXP_FLAGS_VERS) == 1) if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
else else
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
...@@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) ...@@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
} }
/* Sanity check, do we overlap other capabilities? */ /* Sanity check, do we overlap other capabilities? */
for (i = 0; i < len; i += 4) { for (i = 0; i < len; i++) {
if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID)) if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
continue; continue;
pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
...@@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev) ...@@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
pos + i, map[pos + i], cap); pos + i, map[pos + i], cap);
} }
memset(map + (pos / 4), cap, len / 4); memset(map + pos, cap, len);
ret = vfio_fill_vconfig_bytes(vdev, pos, len); ret = vfio_fill_vconfig_bytes(vdev, pos, len);
if (ret) if (ret)
return ret; return ret;
...@@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) ...@@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
hidden = true; hidden = true;
} }
for (i = 0; i < len; i += 4) { for (i = 0; i < len; i++) {
if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID)) if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
continue; continue;
pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
...@@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) ...@@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
*/ */
BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
memset(map + (epos / 4), ecap, len / 4); memset(map + epos, ecap, len);
ret = vfio_fill_vconfig_bytes(vdev, epos, len); ret = vfio_fill_vconfig_bytes(vdev, epos, len);
if (ret) if (ret)
return ret; return ret;
...@@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev) ...@@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev)
int ret; int ret;
/* /*
* Config space, caps and ecaps are all dword aligned, so we can * Config space, caps and ecaps are all dword aligned, so we could
* use one byte per dword to record the type. * use one byte per dword to record the type. However, there are
* no requirements on the length of a capability, so the gap between
* capabilities needs byte granularity.
*/ */
map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL); map = kmalloc(pdev->cfg_size, GFP_KERNEL);
if (!map) if (!map)
return -ENOMEM; return -ENOMEM;
...@@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev) ...@@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev)
vdev->pci_config_map = map; vdev->pci_config_map = map;
vdev->vconfig = vconfig; vdev->vconfig = vconfig;
memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4); memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID, memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
(pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4); pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
if (ret) if (ret)
...@@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev) ...@@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev)
vdev->msi_perm = NULL; vdev->msi_perm = NULL;
} }
/*
 * Count how many bytes, starting at @pos, can be handled as one chunk:
 * the run of config-map entries equal to the entry at @pos, cut off at
 * the next dword (4-byte) boundary.  The result is always at least 1,
 * since the byte at @pos itself is counted.
 */
static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
					   loff_t pos)
{
	u8 id = vdev->pci_config_map[pos];
	size_t run = 1;

	/*
	 * The boundary test comes first so the map is never indexed
	 * beyond the dword that contains @pos.
	 */
	while ((pos + run) % 4 && vdev->pci_config_map[pos + run] == id)
		run++;

	return run;
}
static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite) size_t count, loff_t *ppos, bool iswrite)
{ {
...@@ -1458,45 +1493,37 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, ...@@ -1458,45 +1493,37 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
__le32 val = 0; __le32 val = 0;
int cap_start = 0, offset; int cap_start = 0, offset;
u8 cap_id; u8 cap_id;
ssize_t ret = count; ssize_t ret;
if (*ppos < 0 || *ppos + count > pdev->cfg_size) if (*ppos < 0 || *ppos >= pdev->cfg_size ||
*ppos + count > pdev->cfg_size)
return -EFAULT; return -EFAULT;
/* /*
* gcc can't seem to figure out we're a static function, only called * Chop accesses into aligned chunks containing no more than a
* with count of 1/2/4 and hits copy_from_user_overflow without this. * single capability. Caller increments to the next chunk.
*/ */
if (count > sizeof(val)) count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
return -EINVAL; if (count >= 4 && !(*ppos % 4))
count = 4;
cap_id = vdev->pci_config_map[*ppos / 4]; else if (count >= 2 && !(*ppos % 2))
count = 2;
else
count = 1;
if (cap_id == PCI_CAP_ID_INVALID) { ret = count;
if (iswrite)
return ret; /* drop */
/* cap_id = vdev->pci_config_map[*ppos];
* Per PCI spec 3.0, section 6.1, reads from reserved and
* unimplemented registers return 0
*/
if (copy_to_user(buf, &val, count))
return -EFAULT;
return ret; if (cap_id == PCI_CAP_ID_INVALID) {
} perm = &unassigned_perms;
cap_start = *ppos;
/* } else {
* All capabilities are minimum 4 bytes and aligned on dword
* boundaries. Since we don't support unaligned accesses, we're
* only ever accessing a single capability.
*/
if (*ppos >= PCI_CFG_SPACE_SIZE) { if (*ppos >= PCI_CFG_SPACE_SIZE) {
WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
perm = &ecap_perms[cap_id]; perm = &ecap_perms[cap_id];
cap_start = vfio_find_cap_start(vdev, *ppos); cap_start = vfio_find_cap_start(vdev, *ppos);
} else { } else {
WARN_ON(cap_id > PCI_CAP_ID_MAX); WARN_ON(cap_id > PCI_CAP_ID_MAX);
...@@ -1508,6 +1535,7 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, ...@@ -1508,6 +1535,7 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
if (cap_id > PCI_CAP_ID_BASIC) if (cap_id > PCI_CAP_ID_BASIC)
cap_start = vfio_find_cap_start(vdev, *ppos); cap_start = vfio_find_cap_start(vdev, *ppos);
} }
}
WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
WARN_ON(cap_start > *ppos); WARN_ON(cap_start > *ppos);
...@@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, ...@@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
pos &= VFIO_PCI_OFFSET_MASK; pos &= VFIO_PCI_OFFSET_MASK;
/*
* We want to both keep the access size the caller users as well as
* support reading large chunks of config space in a single call.
* PCI doesn't support unaligned accesses, so we can safely break
* those apart.
*/
while (count) { while (count) {
if (count >= 4 && !(pos % 4)) ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
else if (count >= 2 && !(pos % 2))
ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
else
ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
if (ret < 0) if (ret < 0)
return ret; return ret;
......
...@@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev) ...@@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
* a signal is necessary, which can then be handled via a work queue * a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller. * or directly depending on the caller.
*/ */
int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused) static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
void *unused)
{ {
struct pci_dev *pdev = vdev->pdev; struct pci_dev *pdev = vdev->pdev;
unsigned long flags; unsigned long flags;
...@@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, ...@@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
return 0; return 0;
} }
/*
 * Configure, clear or fire the error-notification eventfd of @vdev.
 *
 * @index must be VFIO_PCI_ERR_IRQ_INDEX and @flags must carry one of the
 * VFIO_IRQ_SET_DATA_* type bits, otherwise -EINVAL is returned.
 *
 *  - DATA_NONE:    signal the currently registered eventfd, if any.
 *                  @data is not touched on this path.
 *  - DATA_BOOL:    @data points at a uint8_t; a non-zero value signals
 *                  the registered eventfd, if any.
 *  - DATA_EVENTFD: @data points at an int32_t file descriptor.  -1 drops
 *                  the registered eventfd; a value >= 0 replaces it with
 *                  the context obtained from eventfd_ctx_fdget().
 *
 * The payload is only dereferenced on the paths that need it and only
 * after checking that it exists, so a DATA_NONE request without a data
 * buffer no longer reads through @data, and a DATA_BOOL request no
 * longer reads four bytes from a one-byte payload.
 *
 * Returns 0 on success, -EINVAL on a bad index/flags/payload/fd, or the
 * error encoded by eventfd_ctx_fdget().
 */
static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	struct pci_dev *pdev = vdev->pdev;
	struct eventfd_ctx *efdctx;
	int32_t fd;

	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
		return -EINVAL;

	/*
	 * device_lock synchronizes setting and checking of
	 * err_trigger. The vfio_pci_aer_err_detected() is also
	 * called with device_lock held.
	 */

	/* DATA_NONE/DATA_BOOL enables loopback testing */
	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		device_lock(&pdev->dev);
		if (vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		device_unlock(&pdev->dev);
		return 0;
	}

	/*
	 * Every remaining data type carries a payload.
	 * NOTE(review): assumes count is the number of payload elements
	 * in data, so count == 0 means there is nothing valid to read --
	 * confirm against the SET_IRQS ioctl caller.
	 */
	if (!data || !count)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger = *(uint8_t *)data;

		device_lock(&pdev->dev);
		if (trigger && vdev->err_trigger)
			eventfd_signal(vdev->err_trigger, 1);
		device_unlock(&pdev->dev);
		return 0;
	}

	/* Handle SET_DATA_EVENTFD */
	fd = *(int32_t *)data;

	if (fd == -1) {
		/* Tear down: release the registered context, if any */
		device_lock(&pdev->dev);
		if (vdev->err_trigger)
			eventfd_ctx_put(vdev->err_trigger);
		vdev->err_trigger = NULL;
		device_unlock(&pdev->dev);
		return 0;
	}

	if (fd < 0)
		return -EINVAL;

	/* Resolve the new context before taking the lock */
	efdctx = eventfd_ctx_fdget(fd);
	if (IS_ERR(efdctx))
		return PTR_ERR(efdctx);

	device_lock(&pdev->dev);
	if (vdev->err_trigger)
		eventfd_ctx_put(vdev->err_trigger);
	vdev->err_trigger = efdctx;
	device_unlock(&pdev->dev);

	return 0;
}
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count, unsigned index, unsigned start, unsigned count,
void *data) void *data)
...@@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, ...@@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
break; break;
} }
break; break;
case VFIO_PCI_ERR_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_TRIGGER:
if (pci_is_pcie(vdev->pdev))
func = vfio_pci_set_err_trigger;
break;
}
} }
if (!func) if (!func)
......
...@@ -56,6 +56,7 @@ struct vfio_pci_device { ...@@ -56,6 +56,7 @@ struct vfio_pci_device {
bool has_vga; bool has_vga;
struct pci_saved_state *pci_saved_state; struct pci_saved_state *pci_saved_state;
atomic_t refcnt; atomic_t refcnt;
struct eventfd_ctx *err_trigger;
}; };
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h> #include <linux/string.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/vfio.h> #include <linux/vfio.h>
...@@ -57,7 +59,7 @@ struct vfio_iommu_driver { ...@@ -57,7 +59,7 @@ struct vfio_iommu_driver {
struct vfio_container { struct vfio_container {
struct kref kref; struct kref kref;
struct list_head group_list; struct list_head group_list;
struct mutex group_lock; struct rw_semaphore group_lock;
struct vfio_iommu_driver *iommu_driver; struct vfio_iommu_driver *iommu_driver;
void *iommu_data; void *iommu_data;
}; };
...@@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref) ...@@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref)
} }
/* Device reference always implies a group reference */ /* Device reference always implies a group reference */
static void vfio_device_put(struct vfio_device *device) void vfio_device_put(struct vfio_device *device)
{ {
struct vfio_group *group = device->group; struct vfio_group *group = device->group;
kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
vfio_group_put(group); vfio_group_put(group);
} }
EXPORT_SYMBOL_GPL(vfio_device_put);
static void vfio_device_get(struct vfio_device *device) static void vfio_device_get(struct vfio_device *device)
{ {
...@@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev, ...@@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev,
} }
EXPORT_SYMBOL_GPL(vfio_add_group_dev); EXPORT_SYMBOL_GPL(vfio_add_group_dev);
/**
 * vfio_device_get_from_dev - get a vfio_device reference from a struct device
 * @dev: device that is expected to be bound to a vfio driver
 *
 * The driver implicitly holds a vfio_device reference between
 * vfio_add_group_dev and vfio_del_group_dev.  We can therefore use
 * drvdata to increment that reference from the struct device.  This
 * additional reference must be released by calling vfio_device_put.
 *
 * Return: the referenced vfio_device, or NULL when @dev has no drvdata
 * set.  No reference is taken in the NULL case, so the caller must not
 * call vfio_device_put() for it.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);

	/*
	 * Callers test the result for NULL, so a missing drvdata must be
	 * reported back rather than handed on to vfio_device_get().
	 */
	if (device)
		vfio_device_get(device);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
/*
 * Return the device_data pointer stored in @device.
 *
 * Caller must hold a reference to the vfio_device; @device is
 * dereferenced without any check.
 */
void *vfio_device_data(struct vfio_device *device)
{
return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);
/* Given a referenced group, check if it contains the device */ /* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev) static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{ {
...@@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev); ...@@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
static long vfio_ioctl_check_extension(struct vfio_container *container, static long vfio_ioctl_check_extension(struct vfio_container *container,
unsigned long arg) unsigned long arg)
{ {
struct vfio_iommu_driver *driver = container->iommu_driver; struct vfio_iommu_driver *driver;
long ret = 0; long ret = 0;
down_read(&container->group_lock);
driver = container->iommu_driver;
switch (arg) { switch (arg) {
/* No base extensions yet */ /* No base extensions yet */
default: default:
...@@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, ...@@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
VFIO_CHECK_EXTENSION, arg); VFIO_CHECK_EXTENSION, arg);
} }
up_read(&container->group_lock);
return ret; return ret;
} }
/* hold container->group_lock */ /* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container, static int __vfio_container_attach_groups(struct vfio_container *container,
struct vfio_iommu_driver *driver, struct vfio_iommu_driver *driver,
void *data) void *data)
...@@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, ...@@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
struct vfio_iommu_driver *driver; struct vfio_iommu_driver *driver;
long ret = -ENODEV; long ret = -ENODEV;
mutex_lock(&container->group_lock); down_write(&container->group_lock);
/* /*
* The container is designed to be an unprivileged interface while * The container is designed to be an unprivileged interface while
...@@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, ...@@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
* the container is deprivileged and returns to an unset state. * the container is deprivileged and returns to an unset state.
*/ */
if (list_empty(&container->group_list) || container->iommu_driver) { if (list_empty(&container->group_list) || container->iommu_driver) {
mutex_unlock(&container->group_lock); up_write(&container->group_lock);
return -EINVAL; return -EINVAL;
} }
...@@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, ...@@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
mutex_unlock(&vfio.iommu_drivers_lock); mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock: skip_drivers_unlock:
mutex_unlock(&container->group_lock); up_write(&container->group_lock);
return ret; return ret;
} }
...@@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep, ...@@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
if (!container) if (!container)
return ret; return ret;
driver = container->iommu_driver;
data = container->iommu_data;
switch (cmd) { switch (cmd) {
case VFIO_GET_API_VERSION: case VFIO_GET_API_VERSION:
ret = VFIO_API_VERSION; ret = VFIO_API_VERSION;
...@@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep, ...@@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
ret = vfio_ioctl_set_iommu(container, arg); ret = vfio_ioctl_set_iommu(container, arg);
break; break;
default: default:
down_read(&container->group_lock);
driver = container->iommu_driver;
data = container->iommu_data;
if (driver) /* passthrough all unrecognized ioctls */ if (driver) /* passthrough all unrecognized ioctls */
ret = driver->ops->ioctl(data, cmd, arg); ret = driver->ops->ioctl(data, cmd, arg);
up_read(&container->group_lock);
} }
return ret; return ret;
...@@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) ...@@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
return -ENOMEM; return -ENOMEM;
INIT_LIST_HEAD(&container->group_list); INIT_LIST_HEAD(&container->group_list);
mutex_init(&container->group_lock); init_rwsem(&container->group_lock);
kref_init(&container->kref); kref_init(&container->kref);
filep->private_data = container; filep->private_data = container;
...@@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf, ...@@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
size_t count, loff_t *ppos) size_t count, loff_t *ppos)
{ {
struct vfio_container *container = filep->private_data; struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver = container->iommu_driver; struct vfio_iommu_driver *driver;
ssize_t ret = -EINVAL;
if (unlikely(!driver || !driver->ops->read)) down_read(&container->group_lock);
return -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->read))
ret = driver->ops->read(container->iommu_data,
buf, count, ppos);
return driver->ops->read(container->iommu_data, buf, count, ppos); up_read(&container->group_lock);
return ret;
} }
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
size_t count, loff_t *ppos) size_t count, loff_t *ppos)
{ {
struct vfio_container *container = filep->private_data; struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver = container->iommu_driver; struct vfio_iommu_driver *driver;
ssize_t ret = -EINVAL;
if (unlikely(!driver || !driver->ops->write)) down_read(&container->group_lock);
return -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->write))
ret = driver->ops->write(container->iommu_data,
buf, count, ppos);
return driver->ops->write(container->iommu_data, buf, count, ppos); up_read(&container->group_lock);
return ret;
} }
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{ {
struct vfio_container *container = filep->private_data; struct vfio_container *container = filep->private_data;
struct vfio_iommu_driver *driver = container->iommu_driver; struct vfio_iommu_driver *driver;
int ret = -EINVAL;
if (unlikely(!driver || !driver->ops->mmap)) down_read(&container->group_lock);
return -EINVAL;
driver = container->iommu_driver;
if (likely(driver && driver->ops->mmap))
ret = driver->ops->mmap(container->iommu_data, vma);
return driver->ops->mmap(container->iommu_data, vma); up_read(&container->group_lock);
return ret;
} }
static const struct file_operations vfio_fops = { static const struct file_operations vfio_fops = {
...@@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) ...@@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
struct vfio_container *container = group->container; struct vfio_container *container = group->container;
struct vfio_iommu_driver *driver; struct vfio_iommu_driver *driver;
mutex_lock(&container->group_lock); down_write(&container->group_lock);
driver = container->iommu_driver; driver = container->iommu_driver;
if (driver) if (driver)
...@@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group) ...@@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
container->iommu_data = NULL; container->iommu_data = NULL;
} }
mutex_unlock(&container->group_lock); up_write(&container->group_lock);
vfio_container_put(container); vfio_container_put(container);
} }
...@@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) ...@@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
container = f.file->private_data; container = f.file->private_data;
WARN_ON(!container); /* fget ensures we don't race vfio_release */ WARN_ON(!container); /* fget ensures we don't race vfio_release */
mutex_lock(&container->group_lock); down_write(&container->group_lock);
driver = container->iommu_driver; driver = container->iommu_driver;
if (driver) { if (driver) {
...@@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) ...@@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
atomic_inc(&group->container_users); atomic_inc(&group->container_users);
unlock_out: unlock_out:
mutex_unlock(&container->group_lock); up_write(&container->group_lock);
fdput(f); fdput(f);
return ret; return ret;
} }
...@@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = { ...@@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = {
*/ */
static char *vfio_devnode(struct device *dev, umode_t *mode) static char *vfio_devnode(struct device *dev, umode_t *mode)
{ {
if (MINOR(dev->devt) == 0)
*mode = S_IRUGO | S_IWUGO;
return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
} }
......
...@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev, ...@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev,
void *device_data); void *device_data);
extern void *vfio_del_group_dev(struct device *dev); extern void *vfio_del_group_dev(struct device *dev);
extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
extern void vfio_device_put(struct vfio_device *device);
extern void *vfio_device_data(struct vfio_device *device);
/** /**
* struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
......
...@@ -319,6 +319,7 @@ enum { ...@@ -319,6 +319,7 @@ enum {
VFIO_PCI_INTX_IRQ_INDEX, VFIO_PCI_INTX_IRQ_INDEX,
VFIO_PCI_MSI_IRQ_INDEX, VFIO_PCI_MSI_IRQ_INDEX,
VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_MSIX_IRQ_INDEX,
VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_NUM_IRQS VFIO_PCI_NUM_IRQS
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment