Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 65743de..e44bf73 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -47,4 +47,5 @@
source "drivers/vfio/pci/Kconfig"
source "drivers/vfio/platform/Kconfig"
source "drivers/vfio/mdev/Kconfig"
+source "drivers/vfio/fsl-mc/Kconfig"
source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index de67c47..fee73f3 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -9,3 +9,4 @@
obj-$(CONFIG_VFIO_PCI) += pci/
obj-$(CONFIG_VFIO_PLATFORM) += platform/
obj-$(CONFIG_VFIO_MDEV) += mdev/
+obj-$(CONFIG_VFIO_FSL_MC) += fsl-mc/
diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig
new file mode 100644
index 0000000..b1a527d
--- /dev/null
+++ b/drivers/vfio/fsl-mc/Kconfig
@@ -0,0 +1,9 @@
+config VFIO_FSL_MC
+ tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices"
+ depends on VFIO && FSL_MC_BUS && EVENTFD
+ help
+ Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc
+ (Management Complex) devices. This is required to passthrough
+ fsl-mc bus devices using the VFIO framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/fsl-mc/Makefile b/drivers/vfio/fsl-mc/Makefile
new file mode 100644
index 0000000..cad6dbf
--- /dev/null
+++ b/drivers/vfio/fsl-mc/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+
+vfio-fsl-mc-y := vfio_fsl_mc.o vfio_fsl_mc_intr.o
+obj-$(CONFIG_VFIO_FSL_MC) += vfio-fsl-mc.o
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
new file mode 100644
index 0000000..8722f5e
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c
@@ -0,0 +1,707 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2016-2017,2019-2020 NXP
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/vfio.h>
+#include <linux/fsl/mc.h>
+#include <linux/delay.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+
+#include "vfio_fsl_mc_private.h"
+
+static struct fsl_mc_driver vfio_fsl_mc_driver;
+
+static DEFINE_MUTEX(reflck_lock);
+
+static void vfio_fsl_mc_reflck_get(struct vfio_fsl_mc_reflck *reflck)
+{
+ kref_get(&reflck->kref);
+}
+
+static void vfio_fsl_mc_reflck_release(struct kref *kref)
+{
+ struct vfio_fsl_mc_reflck *reflck = container_of(kref,
+ struct vfio_fsl_mc_reflck,
+ kref);
+
+ mutex_destroy(&reflck->lock);
+ kfree(reflck);
+ mutex_unlock(&reflck_lock);
+}
+
+static void vfio_fsl_mc_reflck_put(struct vfio_fsl_mc_reflck *reflck)
+{
+ kref_put_mutex(&reflck->kref, vfio_fsl_mc_reflck_release, &reflck_lock);
+}
+
+static struct vfio_fsl_mc_reflck *vfio_fsl_mc_reflck_alloc(void)
+{
+ struct vfio_fsl_mc_reflck *reflck;
+
+ reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+ if (!reflck)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&reflck->kref);
+ mutex_init(&reflck->lock);
+
+ return reflck;
+}
+
+static int vfio_fsl_mc_reflck_attach(struct vfio_fsl_mc_device *vdev)
+{
+ int ret = 0;
+
+ mutex_lock(&reflck_lock);
+ if (is_fsl_mc_bus_dprc(vdev->mc_dev)) {
+ vdev->reflck = vfio_fsl_mc_reflck_alloc();
+ ret = PTR_ERR_OR_ZERO(vdev->reflck);
+ } else {
+ struct device *mc_cont_dev = vdev->mc_dev->dev.parent;
+ struct vfio_device *device;
+ struct vfio_fsl_mc_device *cont_vdev;
+
+ device = vfio_device_get_from_dev(mc_cont_dev);
+ if (!device) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+
+ cont_vdev = vfio_device_data(device);
+ if (!cont_vdev || !cont_vdev->reflck) {
+ vfio_device_put(device);
+ ret = -ENODEV;
+ goto unlock;
+ }
+ vfio_fsl_mc_reflck_get(cont_vdev->reflck);
+ vdev->reflck = cont_vdev->reflck;
+ vfio_device_put(device);
+ }
+
+unlock:
+ mutex_unlock(&reflck_lock);
+ return ret;
+}
+
+static int vfio_fsl_mc_regions_init(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ int count = mc_dev->obj_desc.region_count;
+ int i;
+
+ vdev->regions = kcalloc(count, sizeof(struct vfio_fsl_mc_region),
+ GFP_KERNEL);
+ if (!vdev->regions)
+ return -ENOMEM;
+
+ for (i = 0; i < count; i++) {
+ struct resource *res = &mc_dev->regions[i];
+ int no_mmap = is_fsl_mc_bus_dprc(mc_dev);
+
+ vdev->regions[i].addr = res->start;
+ vdev->regions[i].size = resource_size(res);
+ vdev->regions[i].type = mc_dev->regions[i].flags & IORESOURCE_BITS;
+ /*
+ * Only regions addressed with PAGE granularity may be
+ * MMAPed securely.
+ */
+ if (!no_mmap && !(vdev->regions[i].addr & ~PAGE_MASK) &&
+ !(vdev->regions[i].size & ~PAGE_MASK))
+ vdev->regions[i].flags |=
+ VFIO_REGION_INFO_FLAG_MMAP;
+ vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+ if (!(mc_dev->regions[i].flags & IORESOURCE_READONLY))
+ vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
+ }
+
+ return 0;
+}
+
+static void vfio_fsl_mc_regions_cleanup(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ int i;
+
+ for (i = 0; i < mc_dev->obj_desc.region_count; i++)
+ iounmap(vdev->regions[i].ioaddr);
+ kfree(vdev->regions);
+}
+
+static int vfio_fsl_mc_open(void *device_data)
+{
+ struct vfio_fsl_mc_device *vdev = device_data;
+ int ret;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mutex_lock(&vdev->reflck->lock);
+ if (!vdev->refcnt) {
+ ret = vfio_fsl_mc_regions_init(vdev);
+ if (ret)
+ goto err_reg_init;
+ }
+ vdev->refcnt++;
+
+ mutex_unlock(&vdev->reflck->lock);
+
+ return 0;
+
+err_reg_init:
+ mutex_unlock(&vdev->reflck->lock);
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static void vfio_fsl_mc_release(void *device_data)
+{
+ struct vfio_fsl_mc_device *vdev = device_data;
+ int ret;
+
+ mutex_lock(&vdev->reflck->lock);
+
+ if (!(--vdev->refcnt)) {
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
+ struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+
+ vfio_fsl_mc_regions_cleanup(vdev);
+
+ /* reset the device before cleaning up the interrupts */
+ ret = dprc_reset_container(mc_cont->mc_io, 0,
+ mc_cont->mc_handle,
+ mc_cont->obj_desc.id,
+ DPRC_RESET_OPTION_NON_RECURSIVE);
+
+ if (ret) {
+ dev_warn(&mc_cont->dev, "VFIO_FLS_MC: reset device has failed (%d)\n",
+ ret);
+ WARN_ON(1);
+ }
+
+ vfio_fsl_mc_irqs_cleanup(vdev);
+
+ fsl_mc_cleanup_irq_pool(mc_cont);
+ }
+
+ mutex_unlock(&vdev->reflck->lock);
+
+ module_put(THIS_MODULE);
+}
+
+static long vfio_fsl_mc_ioctl(void *device_data, unsigned int cmd,
+ unsigned long arg)
+{
+ unsigned long minsz;
+ struct vfio_fsl_mc_device *vdev = device_data;
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ switch (cmd) {
+ case VFIO_DEVICE_GET_INFO:
+ {
+ struct vfio_device_info info;
+
+ minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ info.flags = VFIO_DEVICE_FLAGS_FSL_MC;
+
+ if (is_fsl_mc_bus_dprc(mc_dev))
+ info.flags |= VFIO_DEVICE_FLAGS_RESET;
+
+ info.num_regions = mc_dev->obj_desc.region_count;
+ info.num_irqs = mc_dev->obj_desc.irq_count;
+
+ return copy_to_user((void __user *)arg, &info, minsz) ?
+ -EFAULT : 0;
+ }
+ case VFIO_DEVICE_GET_REGION_INFO:
+ {
+ struct vfio_region_info info;
+
+ minsz = offsetofend(struct vfio_region_info, offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ if (info.index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ /* map offset to the physical address */
+ info.offset = VFIO_FSL_MC_INDEX_TO_OFFSET(info.index);
+ info.size = vdev->regions[info.index].size;
+ info.flags = vdev->regions[info.index].flags;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+ }
+ case VFIO_DEVICE_GET_IRQ_INFO:
+ {
+ struct vfio_irq_info info;
+
+ minsz = offsetofend(struct vfio_irq_info, count);
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ if (info.index >= mc_dev->obj_desc.irq_count)
+ return -EINVAL;
+
+ info.flags = VFIO_IRQ_INFO_EVENTFD;
+ info.count = 1;
+
+ if (copy_to_user((void __user *)arg, &info, minsz))
+ return -EFAULT;
+ return 0;
+ }
+ case VFIO_DEVICE_SET_IRQS:
+ {
+ struct vfio_irq_set hdr;
+ u8 *data = NULL;
+ int ret = 0;
+ size_t data_size = 0;
+
+ minsz = offsetofend(struct vfio_irq_set, count);
+
+ if (copy_from_user(&hdr, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ ret = vfio_set_irqs_validate_and_prepare(&hdr, mc_dev->obj_desc.irq_count,
+ mc_dev->obj_desc.irq_count, &data_size);
+ if (ret)
+ return ret;
+
+ if (data_size) {
+ data = memdup_user((void __user *)(arg + minsz),
+ data_size);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+ }
+
+ mutex_lock(&vdev->igate);
+ ret = vfio_fsl_mc_set_irqs_ioctl(vdev, hdr.flags,
+ hdr.index, hdr.start,
+ hdr.count, data);
+ mutex_unlock(&vdev->igate);
+ kfree(data);
+
+ return ret;
+ }
+ case VFIO_DEVICE_RESET:
+ {
+ int ret;
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ /* reset is supported only for the DPRC */
+ if (!is_fsl_mc_bus_dprc(mc_dev))
+ return -ENOTTY;
+
+ ret = dprc_reset_container(mc_dev->mc_io, 0,
+ mc_dev->mc_handle,
+ mc_dev->obj_desc.id,
+ DPRC_RESET_OPTION_NON_RECURSIVE);
+ return ret;
+
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+static ssize_t vfio_fsl_mc_read(void *device_data, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct vfio_fsl_mc_device *vdev = device_data;
+ unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
+ loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ struct vfio_fsl_mc_region *region;
+ u64 data[8];
+ int i;
+
+ if (index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ region = &vdev->regions[index];
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_READ))
+ return -EINVAL;
+
+ if (!region->ioaddr) {
+ region->ioaddr = ioremap(region->addr, region->size);
+ if (!region->ioaddr)
+ return -ENOMEM;
+ }
+
+ if (count != 64 || off != 0)
+ return -EINVAL;
+
+ for (i = 7; i >= 0; i--)
+ data[i] = readq(region->ioaddr + i * sizeof(uint64_t));
+
+ if (copy_to_user(buf, data, 64))
+ return -EFAULT;
+
+ return count;
+}
+
+#define MC_CMD_COMPLETION_TIMEOUT_MS 5000
+#define MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS 500
+
+static int vfio_fsl_mc_send_command(void __iomem *ioaddr, uint64_t *cmd_data)
+{
+ int i;
+ enum mc_cmd_status status;
+ unsigned long timeout_usecs = MC_CMD_COMPLETION_TIMEOUT_MS * 1000;
+
+ /* Write at command parameter into portal */
+ for (i = 7; i >= 1; i--)
+ writeq_relaxed(cmd_data[i], ioaddr + i * sizeof(uint64_t));
+
+ /* Write command header in the end */
+ writeq(cmd_data[0], ioaddr);
+
+ /* Wait for response before returning to user-space
+ * This can be optimized in future to even prepare response
+ * before returning to user-space and avoid read ioctl.
+ */
+ for (;;) {
+ u64 header;
+ struct mc_cmd_header *resp_hdr;
+
+ header = cpu_to_le64(readq_relaxed(ioaddr));
+
+ resp_hdr = (struct mc_cmd_header *)&header;
+ status = (enum mc_cmd_status)resp_hdr->status;
+ if (status != MC_CMD_STATUS_READY)
+ break;
+
+ udelay(MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS);
+ timeout_usecs -= MC_CMD_COMPLETION_POLLING_MAX_SLEEP_USECS;
+ if (timeout_usecs == 0)
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
+static ssize_t vfio_fsl_mc_write(void *device_data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct vfio_fsl_mc_device *vdev = device_data;
+ unsigned int index = VFIO_FSL_MC_OFFSET_TO_INDEX(*ppos);
+ loff_t off = *ppos & VFIO_FSL_MC_OFFSET_MASK;
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ struct vfio_fsl_mc_region *region;
+ u64 data[8];
+ int ret;
+
+ if (index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ region = &vdev->regions[index];
+
+ if (!(region->flags & VFIO_REGION_INFO_FLAG_WRITE))
+ return -EINVAL;
+
+ if (!region->ioaddr) {
+ region->ioaddr = ioremap(region->addr, region->size);
+ if (!region->ioaddr)
+ return -ENOMEM;
+ }
+
+ if (count != 64 || off != 0)
+ return -EINVAL;
+
+ if (copy_from_user(&data, buf, 64))
+ return -EFAULT;
+
+ ret = vfio_fsl_mc_send_command(region->ioaddr, data);
+ if (ret)
+ return ret;
+
+ return count;
+
+}
+
+static int vfio_fsl_mc_mmap_mmio(struct vfio_fsl_mc_region region,
+ struct vm_area_struct *vma)
+{
+ u64 size = vma->vm_end - vma->vm_start;
+ u64 pgoff, base;
+ u8 region_cacheable;
+
+ pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_FSL_MC_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ base = pgoff << PAGE_SHIFT;
+
+ if (region.size < PAGE_SIZE || base + size > region.size)
+ return -EINVAL;
+
+ region_cacheable = (region.type & FSL_MC_REGION_CACHEABLE) &&
+ (region.type & FSL_MC_REGION_SHAREABLE);
+ if (!region_cacheable)
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+
+ return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ size, vma->vm_page_prot);
+}
+
+static int vfio_fsl_mc_mmap(void *device_data, struct vm_area_struct *vma)
+{
+ struct vfio_fsl_mc_device *vdev = device_data;
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ unsigned int index;
+
+ index = vma->vm_pgoff >> (VFIO_FSL_MC_OFFSET_SHIFT - PAGE_SHIFT);
+
+ if (vma->vm_end < vma->vm_start)
+ return -EINVAL;
+ if (vma->vm_start & ~PAGE_MASK)
+ return -EINVAL;
+ if (vma->vm_end & ~PAGE_MASK)
+ return -EINVAL;
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+ if (index >= mc_dev->obj_desc.region_count)
+ return -EINVAL;
+
+ if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+ return -EINVAL;
+
+ if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)
+ && (vma->vm_flags & VM_READ))
+ return -EINVAL;
+
+ if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)
+ && (vma->vm_flags & VM_WRITE))
+ return -EINVAL;
+
+ vma->vm_private_data = mc_dev;
+
+ return vfio_fsl_mc_mmap_mmio(vdev->regions[index], vma);
+}
+
+static const struct vfio_device_ops vfio_fsl_mc_ops = {
+ .name = "vfio-fsl-mc",
+ .open = vfio_fsl_mc_open,
+ .release = vfio_fsl_mc_release,
+ .ioctl = vfio_fsl_mc_ioctl,
+ .read = vfio_fsl_mc_read,
+ .write = vfio_fsl_mc_write,
+ .mmap = vfio_fsl_mc_mmap,
+};
+
+static int vfio_fsl_mc_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct vfio_fsl_mc_device *vdev = container_of(nb,
+ struct vfio_fsl_mc_device, nb);
+ struct device *dev = data;
+ struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
+ struct fsl_mc_device *mc_cont = to_fsl_mc_device(mc_dev->dev.parent);
+
+ if (action == BUS_NOTIFY_ADD_DEVICE &&
+ vdev->mc_dev == mc_cont) {
+ mc_dev->driver_override = kasprintf(GFP_KERNEL, "%s",
+ vfio_fsl_mc_ops.name);
+ if (!mc_dev->driver_override)
+ dev_warn(dev, "VFIO_FSL_MC: Setting driver override for device in dprc %s failed\n",
+ dev_name(&mc_cont->dev));
+ else
+ dev_info(dev, "VFIO_FSL_MC: Setting driver override for device in dprc %s\n",
+ dev_name(&mc_cont->dev));
+ } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+ vdev->mc_dev == mc_cont) {
+ struct fsl_mc_driver *mc_drv = to_fsl_mc_driver(dev->driver);
+
+ if (mc_drv && mc_drv != &vfio_fsl_mc_driver)
+ dev_warn(dev, "VFIO_FSL_MC: Object %s bound to driver %s while DPRC bound to vfio-fsl-mc\n",
+ dev_name(dev), mc_drv->driver.name);
+ }
+
+ return 0;
+}
+
+static int vfio_fsl_mc_init_device(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ int ret;
+
+ /* Non-dprc devices share mc_io from parent */
+ if (!is_fsl_mc_bus_dprc(mc_dev)) {
+ struct fsl_mc_device *mc_cont = to_fsl_mc_device(mc_dev->dev.parent);
+
+ mc_dev->mc_io = mc_cont->mc_io;
+ return 0;
+ }
+
+ vdev->nb.notifier_call = vfio_fsl_mc_bus_notifier;
+ ret = bus_register_notifier(&fsl_mc_bus_type, &vdev->nb);
+ if (ret)
+ return ret;
+
+ /* open DPRC, allocate a MC portal */
+ ret = dprc_setup(mc_dev);
+ if (ret) {
+ dev_err(&mc_dev->dev, "VFIO_FSL_MC: Failed to setup DPRC (%d)\n", ret);
+ goto out_nc_unreg;
+ }
+ return 0;
+
+out_nc_unreg:
+ bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb);
+ return ret;
+}
+
+static int vfio_fsl_mc_scan_container(struct fsl_mc_device *mc_dev)
+{
+ int ret;
+
+ /* non dprc devices do not scan for other devices */
+ if (!is_fsl_mc_bus_dprc(mc_dev))
+ return 0;
+ ret = dprc_scan_container(mc_dev, false);
+ if (ret) {
+ dev_err(&mc_dev->dev,
+ "VFIO_FSL_MC: Container scanning failed (%d)\n", ret);
+ dprc_remove_devices(mc_dev, NULL, 0);
+ return ret;
+ }
+ return 0;
+}
+
+static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+
+ if (!is_fsl_mc_bus_dprc(mc_dev))
+ return;
+
+ dprc_cleanup(mc_dev);
+ bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb);
+}
+
+static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev)
+{
+ struct iommu_group *group;
+ struct vfio_fsl_mc_device *vdev;
+ struct device *dev = &mc_dev->dev;
+ int ret;
+
+ group = vfio_iommu_group_get(dev);
+ if (!group) {
+ dev_err(dev, "VFIO_FSL_MC: No IOMMU group\n");
+ return -EINVAL;
+ }
+
+ vdev = devm_kzalloc(dev, sizeof(*vdev), GFP_KERNEL);
+ if (!vdev) {
+ ret = -ENOMEM;
+ goto out_group_put;
+ }
+
+ vdev->mc_dev = mc_dev;
+ mutex_init(&vdev->igate);
+
+ ret = vfio_fsl_mc_reflck_attach(vdev);
+ if (ret)
+ goto out_group_put;
+
+ ret = vfio_fsl_mc_init_device(vdev);
+ if (ret)
+ goto out_reflck;
+
+ ret = vfio_add_group_dev(dev, &vfio_fsl_mc_ops, vdev);
+ if (ret) {
+ dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n");
+ goto out_device;
+ }
+
+ /*
+ * This triggers recursion into vfio_fsl_mc_probe() on another device
+ * and the vfio_fsl_mc_reflck_attach() must succeed, which relies on the
+ * vfio_add_group_dev() above. It has no impact on this vdev, so it is
+ * safe to be after the vfio device is made live.
+ */
+ ret = vfio_fsl_mc_scan_container(mc_dev);
+ if (ret)
+ goto out_group_dev;
+ return 0;
+
+out_group_dev:
+ vfio_del_group_dev(dev);
+out_device:
+ vfio_fsl_uninit_device(vdev);
+out_reflck:
+ vfio_fsl_mc_reflck_put(vdev->reflck);
+out_group_put:
+ vfio_iommu_group_put(group, dev);
+ return ret;
+}
+
+static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev)
+{
+ struct vfio_fsl_mc_device *vdev;
+ struct device *dev = &mc_dev->dev;
+
+ vdev = vfio_del_group_dev(dev);
+ if (!vdev)
+ return -EINVAL;
+
+ mutex_destroy(&vdev->igate);
+
+ dprc_remove_devices(mc_dev, NULL, 0);
+ vfio_fsl_uninit_device(vdev);
+ vfio_fsl_mc_reflck_put(vdev->reflck);
+
+ vfio_iommu_group_put(mc_dev->dev.iommu_group, dev);
+
+ return 0;
+}
+
+static struct fsl_mc_driver vfio_fsl_mc_driver = {
+ .probe = vfio_fsl_mc_probe,
+ .remove = vfio_fsl_mc_remove,
+ .driver = {
+ .name = "vfio-fsl-mc",
+ .owner = THIS_MODULE,
+ },
+};
+
+static int __init vfio_fsl_mc_driver_init(void)
+{
+ return fsl_mc_driver_register(&vfio_fsl_mc_driver);
+}
+
+static void __exit vfio_fsl_mc_driver_exit(void)
+{
+ fsl_mc_driver_unregister(&vfio_fsl_mc_driver);
+}
+
+module_init(vfio_fsl_mc_driver_init);
+module_exit(vfio_fsl_mc_driver_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver");
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
new file mode 100644
index 0000000..0d9f300
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2019 NXP
+ */
+
+#include <linux/vfio.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/eventfd.h>
+#include <linux/msi.h>
+
+#include "linux/fsl/mc.h"
+#include "vfio_fsl_mc_private.h"
+
+static int vfio_fsl_mc_irqs_allocate(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ struct vfio_fsl_mc_irq *mc_irq;
+ int irq_count;
+ int ret, i;
+
+ /* Device does not support any interrupt */
+ if (mc_dev->obj_desc.irq_count == 0)
+ return 0;
+
+ /* interrupts were already allocated for this device */
+ if (vdev->mc_irqs)
+ return 0;
+
+ irq_count = mc_dev->obj_desc.irq_count;
+
+ mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL);
+ if (!mc_irq)
+ return -ENOMEM;
+
+ /* Allocate IRQs */
+ ret = fsl_mc_allocate_irqs(mc_dev);
+ if (ret) {
+ kfree(mc_irq);
+ return ret;
+ }
+
+ for (i = 0; i < irq_count; i++) {
+ mc_irq[i].count = 1;
+ mc_irq[i].flags = VFIO_IRQ_INFO_EVENTFD;
+ }
+
+ vdev->mc_irqs = mc_irq;
+
+ return 0;
+}
+
+static irqreturn_t vfio_fsl_mc_irq_handler(int irq_num, void *arg)
+{
+ struct vfio_fsl_mc_irq *mc_irq = (struct vfio_fsl_mc_irq *)arg;
+
+ eventfd_signal(mc_irq->trigger, 1);
+ return IRQ_HANDLED;
+}
+
+static int vfio_set_trigger(struct vfio_fsl_mc_device *vdev,
+ int index, int fd)
+{
+ struct vfio_fsl_mc_irq *irq = &vdev->mc_irqs[index];
+ struct eventfd_ctx *trigger;
+ int hwirq;
+ int ret;
+
+ hwirq = vdev->mc_dev->irqs[index]->msi_desc->irq;
+ if (irq->trigger) {
+ free_irq(hwirq, irq);
+ kfree(irq->name);
+ eventfd_ctx_put(irq->trigger);
+ irq->trigger = NULL;
+ }
+
+ if (fd < 0) /* Disable only */
+ return 0;
+
+ irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)",
+ hwirq, dev_name(&vdev->mc_dev->dev));
+ if (!irq->name)
+ return -ENOMEM;
+
+ trigger = eventfd_ctx_fdget(fd);
+ if (IS_ERR(trigger)) {
+ kfree(irq->name);
+ return PTR_ERR(trigger);
+ }
+
+ irq->trigger = trigger;
+
+ ret = request_irq(hwirq, vfio_fsl_mc_irq_handler, 0,
+ irq->name, irq);
+ if (ret) {
+ kfree(irq->name);
+ eventfd_ctx_put(trigger);
+ irq->trigger = NULL;
+ return ret;
+ }
+
+ return 0;
+}
+
+static int vfio_fsl_mc_set_irq_trigger(struct vfio_fsl_mc_device *vdev,
+ unsigned int index, unsigned int start,
+ unsigned int count, u32 flags,
+ void *data)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ int ret, hwirq;
+ struct vfio_fsl_mc_irq *irq;
+ struct device *cont_dev = fsl_mc_cont_dev(&mc_dev->dev);
+ struct fsl_mc_device *mc_cont = to_fsl_mc_device(cont_dev);
+
+ if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+ return vfio_set_trigger(vdev, index, -1);
+
+ if (start != 0 || count != 1)
+ return -EINVAL;
+
+ mutex_lock(&vdev->reflck->lock);
+ ret = fsl_mc_populate_irq_pool(mc_cont,
+ FSL_MC_IRQ_POOL_MAX_TOTAL_IRQS);
+ if (ret)
+ goto unlock;
+
+ ret = vfio_fsl_mc_irqs_allocate(vdev);
+ if (ret)
+ goto unlock;
+ mutex_unlock(&vdev->reflck->lock);
+
+ if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+ s32 fd = *(s32 *)data;
+
+ return vfio_set_trigger(vdev, index, fd);
+ }
+
+ hwirq = vdev->mc_dev->irqs[index]->msi_desc->irq;
+
+ irq = &vdev->mc_irqs[index];
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ vfio_fsl_mc_irq_handler(hwirq, irq);
+
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ u8 trigger = *(u8 *)data;
+
+ if (trigger)
+ vfio_fsl_mc_irq_handler(hwirq, irq);
+ }
+
+ return 0;
+
+unlock:
+ mutex_unlock(&vdev->reflck->lock);
+ return ret;
+
+}
+
+int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data)
+{
+ if (flags & VFIO_IRQ_SET_ACTION_TRIGGER)
+ return vfio_fsl_mc_set_irq_trigger(vdev, index, start,
+ count, flags, data);
+ else
+ return -EINVAL;
+}
+
+/* Free All IRQs for the given MC object */
+void vfio_fsl_mc_irqs_cleanup(struct vfio_fsl_mc_device *vdev)
+{
+ struct fsl_mc_device *mc_dev = vdev->mc_dev;
+ int irq_count = mc_dev->obj_desc.irq_count;
+ int i;
+
+ /*
+ * Device does not support any interrupt or the interrupts
+ * were not configured
+ */
+ if (!vdev->mc_irqs)
+ return;
+
+ for (i = 0; i < irq_count; i++)
+ vfio_set_trigger(vdev, i, -1);
+
+ fsl_mc_free_irqs(mc_dev);
+ kfree(vdev->mc_irqs);
+ vdev->mc_irqs = NULL;
+}
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
new file mode 100644
index 0000000..a97ee69
--- /dev/null
+++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_private.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */
+/*
+ * Copyright 2013-2016 Freescale Semiconductor Inc.
+ * Copyright 2016,2019-2020 NXP
+ */
+
+#ifndef VFIO_FSL_MC_PRIVATE_H
+#define VFIO_FSL_MC_PRIVATE_H
+
+#define VFIO_FSL_MC_OFFSET_SHIFT 40
+#define VFIO_FSL_MC_OFFSET_MASK (((u64)(1) << VFIO_FSL_MC_OFFSET_SHIFT) - 1)
+
+#define VFIO_FSL_MC_OFFSET_TO_INDEX(off) ((off) >> VFIO_FSL_MC_OFFSET_SHIFT)
+
+#define VFIO_FSL_MC_INDEX_TO_OFFSET(index) \
+ ((u64)(index) << VFIO_FSL_MC_OFFSET_SHIFT)
+
+struct vfio_fsl_mc_irq {
+ u32 flags;
+ u32 count;
+ struct eventfd_ctx *trigger;
+ char *name;
+};
+
+struct vfio_fsl_mc_reflck {
+ struct kref kref;
+ struct mutex lock;
+};
+
+struct vfio_fsl_mc_region {
+ u32 flags;
+ u32 type;
+ u64 addr;
+ resource_size_t size;
+ void __iomem *ioaddr;
+};
+
+struct vfio_fsl_mc_device {
+ struct fsl_mc_device *mc_dev;
+ struct notifier_block nb;
+ int refcnt;
+ struct vfio_fsl_mc_region *regions;
+ struct vfio_fsl_mc_reflck *reflck;
+ struct mutex igate;
+ struct vfio_fsl_mc_irq *mc_irqs;
+};
+
+extern int vfio_fsl_mc_set_irqs_ioctl(struct vfio_fsl_mc_device *vdev,
+ u32 flags, unsigned int index,
+ unsigned int start, unsigned int count,
+ void *data);
+
+void vfio_fsl_mc_irqs_cleanup(struct vfio_fsl_mc_device *vdev);
+
+#endif /* VFIO_FSL_MC_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index ca94768..367ff54 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -74,7 +74,7 @@
return count;
}
-MDEV_TYPE_ATTR_WO(create);
+static MDEV_TYPE_ATTR_WO(create);
static void mdev_type_release(struct kobject *kobj)
{
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index c691127..4e11077 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -46,3 +46,15 @@
depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU
help
VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
+
+config VFIO_PCI_ZDEV
+ bool "VFIO PCI ZPCI device CLP support"
+ depends on VFIO_PCI && S390
+ default y
+ help
+ Enabling this option exposes VFIO capabilities containing hardware
+ configuration for zPCI devices. This enables userspace (e.g. QEMU)
+ to supply proper configuration values instead of hard-coded defaults
+ for zPCI devices passed through via VFIO on s390.
+
+ Say Y here.
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index f027f8a..781e080 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -3,5 +3,6 @@
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
+vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b06d762..57ae8b4 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -9,7 +9,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
#include <linux/device.h>
#include <linux/eventfd.h>
@@ -55,6 +54,16 @@
MODULE_PARM_DESC(disable_idle_d3,
"Disable using the PCI D3 low power state for idle, unused devices");
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
+#endif
+
+static bool disable_denylist;
+module_param(disable_denylist, bool, 0444);
+MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
+
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -64,6 +73,44 @@
#endif
}
+static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
+{
+ switch (pdev->vendor) {
+ case PCI_VENDOR_ID_INTEL:
+ switch (pdev->device) {
+ case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
+ case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
+ case PCI_DEVICE_ID_INTEL_QAT_C62X:
+ case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+ case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+ case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
+{
+ if (!vfio_pci_dev_in_denylist(pdev))
+ return false;
+
+ if (disable_denylist) {
+ pci_warn(pdev,
+ "device denylist disabled - allowing device %04x:%04x.\n",
+ pdev->vendor, pdev->device);
+ return false;
+ }
+
+ pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
+ pdev->vendor, pdev->device);
+
+ return true;
+}
+
/*
* Our VGA arbiter participation is limited since we don't know anything
* about the device itself. However, if the device is the only VGA device
@@ -111,11 +158,13 @@
static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
{
struct resource *res;
- int bar;
+ int i;
struct vfio_pci_dummy_resource *dummy_res;
- for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
- res = vdev->pdev->resource + bar;
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ int bar = i + PCI_STD_RESOURCES;
+
+ res = &vdev->pdev->resource[bar];
if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
goto no_mmap;
@@ -198,6 +247,8 @@
case 0x1580 ... 0x1581:
case 0x1583 ... 0x158b:
case 0x37d0 ... 0x37d2:
+ /* X550 */
+ case 0x1563:
return true;
default:
return false;
@@ -399,7 +450,8 @@
vfio_config_free(vdev);
- for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ bar = i + PCI_STD_RESOURCES;
if (!vdev->barmap[bar])
continue;
pci_iounmap(pdev, vdev->barmap[bar]);
@@ -463,6 +515,44 @@
vfio_pci_set_power_state(vdev, PCI_D3hot);
}
+static struct pci_driver vfio_pci_driver;
+
+static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
+ struct vfio_device **pf_dev)
+{
+ struct pci_dev *physfn = pci_physfn(vdev->pdev);
+
+ if (!vdev->pdev->is_virtfn)
+ return NULL;
+
+ *pf_dev = vfio_device_get_from_dev(&physfn->dev);
+ if (!*pf_dev)
+ return NULL;
+
+ if (pci_dev_driver(physfn) != &vfio_pci_driver) {
+ vfio_device_put(*pf_dev);
+ return NULL;
+ }
+
+ return vfio_device_data(*pf_dev);
+}
+
+static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
+{
+ struct vfio_device *pf_dev;
+ struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+
+ if (!pf_vdev)
+ return;
+
+ mutex_lock(&pf_vdev->vf_token->lock);
+ pf_vdev->vf_token->users += val;
+ WARN_ON(pf_vdev->vf_token->users < 0);
+ mutex_unlock(&pf_vdev->vf_token->lock);
+
+ vfio_device_put(pf_dev);
+}
+
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
@@ -470,16 +560,15 @@
mutex_lock(&vdev->reflck->lock);
if (!(--vdev->refcnt)) {
+ vfio_pci_vf_token_user_add(vdev, -1);
vfio_spapr_pci_eeh_release(vdev->pdev);
vfio_pci_disable(vdev);
+
mutex_lock(&vdev->igate);
if (vdev->err_trigger) {
eventfd_ctx_put(vdev->err_trigger);
vdev->err_trigger = NULL;
}
- mutex_unlock(&vdev->igate);
-
- mutex_lock(&vdev->igate);
if (vdev->req_trigger) {
eventfd_ctx_put(vdev->req_trigger);
vdev->req_trigger = NULL;
@@ -508,6 +597,7 @@
goto error;
vfio_spapr_pci_eeh_open(vdev->pdev);
+ vfio_pci_vf_token_user_add(vdev, 1);
}
vdev->refcnt++;
error:
@@ -715,15 +805,25 @@
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ unsigned long capsz;
minsz = offsetofend(struct vfio_device_info, num_irqs);
+ /* For backward compatibility, cannot require this */
+ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
+ if (info.argsz >= capsz) {
+ minsz = capsz;
+ info.cap_offset = 0;
+ }
+
info.flags = VFIO_DEVICE_FLAGS_PCI;
if (vdev->reset_works)
@@ -732,6 +832,33 @@
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+ int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+
+ if (ret && ret != -ENODEV) {
+ pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+ return ret;
+ }
+ }
+
+ if (caps.size) {
+ info.flags |= VFIO_DEVICE_FLAGS_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+
+ kfree(caps.buf);
+ }
+
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
@@ -898,7 +1025,7 @@
case VFIO_PCI_ERR_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
@@ -1144,7 +1271,7 @@
/*
* We need to get memory_lock for each device, but devices
- * can share mmap_sem, therefore we need to zap and hold
+ * can share mmap_lock, therefore we need to zap and hold
* the vma_lock for each device, and only then get each
* memory_lock.
*/
@@ -1213,6 +1340,65 @@
return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
ioeventfd.data, count, ioeventfd.fd);
+ } else if (cmd == VFIO_DEVICE_FEATURE) {
+ struct vfio_device_feature feature;
+ uuid_t uuid;
+
+ minsz = offsetofend(struct vfio_device_feature, flags);
+
+ if (copy_from_user(&feature, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (feature.argsz < minsz)
+ return -EINVAL;
+
+ /* Check unknown flags */
+ if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
+ VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_GET |
+ VFIO_DEVICE_FEATURE_PROBE))
+ return -EINVAL;
+
+ /* GET & SET are mutually exclusive except with PROBE */
+ if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_GET))
+ return -EINVAL;
+
+ switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
+ if (!vdev->vf_token)
+ return -ENOTTY;
+
+ /*
+ * We do not support GET of the VF Token UUID as this
+ * could expose the token of the previous device user.
+ */
+ if (feature.flags & VFIO_DEVICE_FEATURE_GET)
+ return -EINVAL;
+
+ if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
+ return 0;
+
+ /* Don't SET unless told to do so */
+ if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
+ return -EINVAL;
+
+ if (feature.argsz < minsz + sizeof(uuid))
+ return -EINVAL;
+
+ if (copy_from_user(&uuid, (void __user *)(arg + minsz),
+ sizeof(uuid)))
+ return -EFAULT;
+
+ mutex_lock(&vdev->vf_token->lock);
+ uuid_copy(&vdev->vf_token->uuid, &uuid);
+ mutex_unlock(&vdev->vf_token->lock);
+
+ return 0;
+ default:
+ return -ENOTTY;
+ }
}
return -ENOTTY;
@@ -1275,26 +1461,26 @@
/*
* Lock ordering:
- * vma_lock is nested under mmap_sem for vm_ops callback paths.
+ * vma_lock is nested under mmap_lock for vm_ops callback paths.
* The memory_lock semaphore is used by both code paths calling
* into this function to zap vmas and the vm_ops.fault callback
* to protect the memory enable state of the device.
*
- * When zapping vmas we need to maintain the mmap_sem => vma_lock
+ * When zapping vmas we need to maintain the mmap_lock => vma_lock
* ordering, which requires using vma_lock to walk vma_list to
- * acquire an mm, then dropping vma_lock to get the mmap_sem and
+ * acquire an mm, then dropping vma_lock to get the mmap_lock and
* reacquiring vma_lock. This logic is derived from similar
* requirements in uverbs_user_mmap_disassociate().
*
- * mmap_sem must always be the top-level lock when it is taken.
+ * mmap_lock must always be the top-level lock when it is taken.
* Therefore we can only hold the memory_lock write lock when
- * vma_list is empty, as we'd need to take mmap_sem to clear
+ * vma_list is empty, as we'd need to take mmap_lock to clear
* entries. vma_list can only be guaranteed empty when holding
* vma_lock, thus memory_lock is nested under vma_lock.
*
* This enables the vm_ops.fault callback to acquire vma_lock,
* followed by memory_lock read lock, while already holding
- * mmap_sem without risk of deadlock.
+ * mmap_lock without risk of deadlock.
*/
while (1) {
struct mm_struct *mm = NULL;
@@ -1322,39 +1508,37 @@
mutex_unlock(&vdev->vma_lock);
if (try) {
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!mmap_read_trylock(mm)) {
mmput(mm);
return 0;
}
} else {
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
- if (mmget_still_valid(mm)) {
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock)) {
- up_read(&mm->mmap_sem);
- mmput(mm);
- return 0;
- }
- } else {
- mutex_lock(&vdev->vma_lock);
+ if (try) {
+ if (!mutex_trylock(&vdev->vma_lock)) {
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
}
- list_for_each_entry_safe(mmap_vma, tmp,
- &vdev->vma_list, vma_next) {
- struct vm_area_struct *vma = mmap_vma->vma;
-
- if (vma->vm_mm != mm)
- continue;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
-
- zap_vma_ptes(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
- }
- mutex_unlock(&vdev->vma_lock);
+ } else {
+ mutex_lock(&vdev->vma_lock);
}
- up_read(&mm->mmap_sem);
+ list_for_each_entry_safe(mmap_vma, tmp,
+ &vdev->vma_list, vma_next) {
+ struct vm_area_struct *vma = mmap_vma->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ }
+ mutex_unlock(&vdev->vma_lock);
+ mmap_read_unlock(mm);
mmput(mm);
}
}
@@ -1568,6 +1752,150 @@
mutex_unlock(&vdev->igate);
}
+static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
+ bool vf_token, uuid_t *uuid)
+{
+ /*
+ * There's always some degree of trust or collaboration between SR-IOV
+ * PF and VFs, even if just that the PF hosts the SR-IOV capability and
+ * can disrupt VFs with a reset, but often the PF has more explicit
+ * access to deny service to the VF or access data passed through the
+ * VF. We therefore require an opt-in via a shared VF token (UUID) to
+ * represent this trust. This both prevents that a VF driver might
+ * assume the PF driver is a trusted, in-kernel driver, and also that
+ * a PF driver might be replaced with a rogue driver, unknown to in-use
+ * VF drivers.
+ *
+ * Therefore when presented with a VF, if the PF is a vfio device and
+ * it is bound to the vfio-pci driver, the user needs to provide a VF
+ * token to access the device, in the form of appending a vf_token to
+ * the device name, for example:
+ *
+ * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
+ *
+ * When presented with a PF which has VFs in use, the user must also
+ * provide the current VF token to prove collaboration with existing
+ * VF users. If VFs are not in use, the VF token provided for the PF
+ * device will act to set the VF token.
+ *
+ * If the VF token is provided but unused, an error is generated.
+ */
+ if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
+ return 0; /* No VF token provided or required */
+
+ if (vdev->pdev->is_virtfn) {
+ struct vfio_device *pf_dev;
+ struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+ bool match;
+
+ if (!pf_vdev) {
+ if (!vf_token)
+ return 0; /* PF is not vfio-pci, no VF token */
+
+ pci_info_ratelimited(vdev->pdev,
+ "VF token incorrectly provided, PF not bound to vfio-pci\n");
+ return -EINVAL;
+ }
+
+ if (!vf_token) {
+ vfio_device_put(pf_dev);
+ pci_info_ratelimited(vdev->pdev,
+ "VF token required to access device\n");
+ return -EACCES;
+ }
+
+ mutex_lock(&pf_vdev->vf_token->lock);
+ match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
+ mutex_unlock(&pf_vdev->vf_token->lock);
+
+ vfio_device_put(pf_dev);
+
+ if (!match) {
+ pci_info_ratelimited(vdev->pdev,
+ "Incorrect VF token provided for device\n");
+ return -EACCES;
+ }
+ } else if (vdev->vf_token) {
+ mutex_lock(&vdev->vf_token->lock);
+ if (vdev->vf_token->users) {
+ if (!vf_token) {
+ mutex_unlock(&vdev->vf_token->lock);
+ pci_info_ratelimited(vdev->pdev,
+ "VF token required to access device\n");
+ return -EACCES;
+ }
+
+ if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
+ mutex_unlock(&vdev->vf_token->lock);
+ pci_info_ratelimited(vdev->pdev,
+ "Incorrect VF token provided for device\n");
+ return -EACCES;
+ }
+ } else if (vf_token) {
+ uuid_copy(&vdev->vf_token->uuid, uuid);
+ }
+
+ mutex_unlock(&vdev->vf_token->lock);
+ } else if (vf_token) {
+ pci_info_ratelimited(vdev->pdev,
+ "VF token incorrectly provided, not a PF or VF\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define VF_TOKEN_ARG "vf_token="
+
+static int vfio_pci_match(void *device_data, char *buf)
+{
+ struct vfio_pci_device *vdev = device_data;
+ bool vf_token = false;
+ uuid_t uuid;
+ int ret;
+
+ if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
+ return 0; /* No match */
+
+ if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
+ buf += strlen(pci_name(vdev->pdev));
+
+ if (*buf != ' ')
+ return 0; /* No match: non-whitespace after name */
+
+ while (*buf) {
+ if (*buf == ' ') {
+ buf++;
+ continue;
+ }
+
+ if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
+ strlen(VF_TOKEN_ARG))) {
+ buf += strlen(VF_TOKEN_ARG);
+
+ if (strlen(buf) < UUID_STRING_LEN)
+ return -EINVAL;
+
+ ret = uuid_parse(buf, &uuid);
+ if (ret)
+ return ret;
+
+ vf_token = true;
+ buf += UUID_STRING_LEN;
+ } else {
+ /* Unknown/duplicate option */
+ return -EINVAL;
+ }
+ }
+ }
+
+ ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+ if (ret)
+ return ret;
+
+ return 1; /* Match */
+}
+
static const struct vfio_device_ops vfio_pci_ops = {
.name = "vfio-pci",
.open = vfio_pci_open,
@@ -1577,27 +1905,121 @@
.write = vfio_pci_write,
.mmap = vfio_pci_mmap,
.request = vfio_pci_request,
+ .match = vfio_pci_match,
};
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct vfio_pci_device *vdev = container_of(nb,
+ struct vfio_pci_device, nb);
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_dev *physfn = pci_physfn(pdev);
+
+ if (action == BUS_NOTIFY_ADD_DEVICE &&
+ pdev->is_virtfn && physfn == vdev->pdev) {
+ pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+ pci_name(pdev));
+ pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+ vfio_pci_ops.name);
+ } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+ pdev->is_virtfn && physfn == vdev->pdev) {
+ struct pci_driver *drv = pci_dev_driver(pdev);
+
+ if (drv && drv != &vfio_pci_driver)
+ pci_warn(vdev->pdev,
+ "VF %s bound to driver %s while PF bound to vfio-pci\n",
+ pci_name(pdev), drv->name);
+ }
+
+ return 0;
+}
+
+static int vfio_pci_vf_init(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (!pdev->is_physfn)
+ return 0;
+
+ vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
+ if (!vdev->vf_token)
+ return -ENOMEM;
+
+ mutex_init(&vdev->vf_token->lock);
+ uuid_gen(&vdev->vf_token->uuid);
+
+ vdev->nb.notifier_call = vfio_pci_bus_notifier;
+ ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+ if (ret) {
+ kfree(vdev->vf_token);
+ return ret;
+ }
+ return 0;
+}
+
+static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev)
+{
+ if (!vdev->vf_token)
+ return;
+
+ bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+ WARN_ON(vdev->vf_token->users);
+ mutex_destroy(&vdev->vf_token->lock);
+ kfree(vdev->vf_token);
+}
+
+static int vfio_pci_vga_init(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (!vfio_pci_is_vga(pdev))
+ return 0;
+
+ ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
+ if (ret)
+ return ret;
+ vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false));
+ return 0;
+}
+
+static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (!vfio_pci_is_vga(pdev))
+ return;
+ vga_client_register(pdev, NULL, NULL, NULL);
+ vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+ VGA_RSRC_LEGACY_IO |
+ VGA_RSRC_LEGACY_MEM);
+}
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_device *vdev;
struct iommu_group *group;
int ret;
+ if (vfio_pci_is_denylisted(pdev))
+ return -EINVAL;
+
if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
return -EINVAL;
/*
- * Prevent binding to PFs with VFs enabled, this too easily allows
- * userspace instance with VFs and PFs from the same device, which
- * cannot work. Disabling SR-IOV here would initiate removing the
- * VFs, which would unbind the driver, which is prone to blocking
- * if that VF is also in use by vfio-pci. Just reject these PFs
- * and let the user sort it out.
+ * Prevent binding to PFs with VFs enabled, the VFs might be in use
+ * by the host or other users. We cannot capture the VFs if they
+ * already exist, nor can we track VF users. Disabling SR-IOV here
+ * would initiate removing the VFs, which would unbind the driver,
+ * which is prone to blocking if that VF is also in use by vfio-pci.
+ * Just reject these PFs and let the user sort it out.
*/
if (pci_num_vf(pdev)) {
pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
@@ -1610,8 +2032,8 @@
vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
if (!vdev) {
- vfio_iommu_group_put(group, &pdev->dev);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_group_put;
}
vdev->pdev = pdev;
@@ -1625,26 +2047,15 @@
INIT_LIST_HEAD(&vdev->vma_list);
init_rwsem(&vdev->memory_lock);
- ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
- if (ret) {
- vfio_iommu_group_put(group, &pdev->dev);
- kfree(vdev);
- return ret;
- }
-
ret = vfio_pci_reflck_attach(vdev);
- if (ret) {
- vfio_del_group_dev(&pdev->dev);
- vfio_iommu_group_put(group, &pdev->dev);
- kfree(vdev);
- return ret;
- }
-
- if (vfio_pci_is_vga(pdev)) {
- vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
- vga_set_legacy_decoding(pdev,
- vfio_pci_set_vga_decode(vdev, false));
- }
+ if (ret)
+ goto out_free;
+ ret = vfio_pci_vf_init(vdev);
+ if (ret)
+ goto out_reflck;
+ ret = vfio_pci_vga_init(vdev);
+ if (ret)
+ goto out_vf;
vfio_pci_probe_power_state(vdev);
@@ -1662,6 +2073,23 @@
vfio_pci_set_power_state(vdev, PCI_D3hot);
}
+ ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+ if (ret)
+ goto out_power;
+ return 0;
+
+out_power:
+ if (!disable_idle_d3)
+ vfio_pci_set_power_state(vdev, PCI_D0);
+out_vf:
+ vfio_pci_vf_uninit(vdev);
+out_reflck:
+ vfio_pci_reflck_put(vdev->reflck);
+out_free:
+ kfree(vdev->pm_save);
+ kfree(vdev);
+out_group_put:
+ vfio_iommu_group_put(group, &pdev->dev);
return ret;
}
@@ -1669,28 +2097,25 @@
{
struct vfio_pci_device *vdev;
+ pci_disable_sriov(pdev);
+
vdev = vfio_del_group_dev(&pdev->dev);
if (!vdev)
return;
+ vfio_pci_vf_uninit(vdev);
vfio_pci_reflck_put(vdev->reflck);
+ vfio_pci_vga_uninit(vdev);
vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
- kfree(vdev->region);
- mutex_destroy(&vdev->ioeventfds_lock);
if (!disable_idle_d3)
vfio_pci_set_power_state(vdev, PCI_D0);
+ mutex_destroy(&vdev->ioeventfds_lock);
+ kfree(vdev->region);
kfree(vdev->pm_save);
kfree(vdev);
-
- if (vfio_pci_is_vga(pdev)) {
- vga_client_register(pdev, NULL, NULL, NULL);
- vga_set_legacy_decoding(pdev,
- VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
- VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
- }
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -1721,16 +2146,48 @@
return PCI_ERS_RESULT_CAN_RECOVER;
}
+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+ struct vfio_pci_device *vdev;
+ struct vfio_device *device;
+ int ret = 0;
+
+ might_sleep();
+
+ if (!enable_sriov)
+ return -ENOENT;
+
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (!device)
+ return -ENODEV;
+
+ vdev = vfio_device_data(device);
+ if (!vdev) {
+ vfio_device_put(device);
+ return -ENODEV;
+ }
+
+ if (nr_virtfn == 0)
+ pci_disable_sriov(pdev);
+ else
+ ret = pci_enable_sriov(pdev, nr_virtfn);
+
+ vfio_device_put(device);
+
+ return ret < 0 ? ret : nr_virtfn;
+}
+
static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
};
static struct pci_driver vfio_pci_driver = {
- .name = "vfio-pci",
- .id_table = NULL, /* only dynamic ids */
- .probe = vfio_pci_probe,
- .remove = vfio_pci_remove,
- .err_handler = &vfio_err_handlers,
+ .name = "vfio-pci",
+ .id_table = NULL, /* only dynamic ids */
+ .probe = vfio_pci_probe,
+ .remove = vfio_pci_remove,
+ .sriov_configure = vfio_pci_sriov_configure,
+ .err_handler = &vfio_err_handlers,
};
static DEFINE_MUTEX(reflck_lock);
@@ -2014,6 +2471,9 @@
vfio_pci_fill_ids();
+ if (disable_denylist)
+ pr_warn("device denylist disabled.\n");
+
return 0;
out_driver:
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 50cd17f..47f21a6 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -464,30 +464,35 @@
{
struct pci_dev *pdev = vdev->pdev;
int i;
- __le32 *bar;
+ __le32 *vbar;
u64 mask;
- bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+ if (!vdev->bardirty)
+ return;
- for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
- if (!pci_resource_start(pdev, i)) {
- *bar = 0; /* Unmapped by host = unimplemented to user */
+ vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+
+ for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) {
+ int bar = i + PCI_STD_RESOURCES;
+
+ if (!pci_resource_start(pdev, bar)) {
+ *vbar = 0; /* Unmapped by host = unimplemented to user */
continue;
}
- mask = ~(pci_resource_len(pdev, i) - 1);
+ mask = ~(pci_resource_len(pdev, bar) - 1);
- *bar &= cpu_to_le32((u32)mask);
- *bar |= vfio_generate_bar_flags(pdev, i);
+ *vbar &= cpu_to_le32((u32)mask);
+ *vbar |= vfio_generate_bar_flags(pdev, bar);
- if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
- bar++;
- *bar &= cpu_to_le32((u32)(mask >> 32));
+ if (*vbar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+ vbar++;
+ *vbar &= cpu_to_le32((u32)(mask >> 32));
i++;
}
}
- bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
+ vbar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
/*
* NB. REGION_INFO will have reported zero size if we weren't able
@@ -497,14 +502,14 @@
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
- *bar &= cpu_to_le32((u32)mask);
+ *vbar &= cpu_to_le32((u32)mask);
} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW) {
mask = ~(0x20000 - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
- *bar &= cpu_to_le32((u32)mask);
+ *vbar &= cpu_to_le32((u32)mask);
} else
- *bar = 0;
+ *vbar = 0;
vdev->bardirty = false;
}
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
index 08f1783..9adcf6a 100644
--- a/drivers/vfio/pci/vfio_pci_nvlink2.c
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -67,7 +67,7 @@
*
* This is not fast path anyway.
*/
- sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+ sizealigned = ALIGN(posoff + count, PAGE_SIZE);
ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
if (!ptr)
return -EFAULT;
@@ -161,7 +161,7 @@
data->useraddr = vma->vm_start;
data->mm = current->mm;
- atomic_inc(&data->mm->mm_count);
+ mmgrab(data->mm);
ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
vma_pages(vma), data->gpu_hpa, &data->mem);
@@ -425,8 +425,14 @@
if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
&mmio_atsd)) {
- dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
- mmio_atsd = 0;
+ if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0,
+ &mmio_atsd)) {
+ dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+ mmio_atsd = 0;
+ } else {
+ dev_warn(&vdev->pdev->dev,
+ "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
+ }
}
if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 987b4d3..5c90e56 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -12,6 +12,8 @@
#include <linux/pci.h>
#include <linux/irqbypass.h>
#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/notifier.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@@ -31,12 +33,14 @@
struct vfio_pci_ioeventfd {
struct list_head next;
+ struct vfio_pci_device *vdev;
struct virqfd *virqfd;
void __iomem *addr;
uint64_t data;
loff_t pos;
int bar;
int count;
+ bool test_mem;
};
struct vfio_pci_irq_ctx {
@@ -84,6 +88,12 @@
struct mutex lock;
};
+struct vfio_pci_vf_token {
+ struct mutex lock;
+ uuid_t uuid;
+ int users;
+};
+
struct vfio_pci_mmap_vma {
struct vm_area_struct *vma;
struct list_head vma_next;
@@ -91,8 +101,8 @@
struct vfio_pci_device {
struct pci_dev *pdev;
- void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
- bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
+ void __iomem *barmap[PCI_STD_NUM_BARS];
+ bool bar_mmap_supported[PCI_STD_NUM_BARS];
u8 *pci_config_map;
u8 *vconfig;
struct perm_bits *msi_perm;
@@ -127,6 +137,8 @@
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
struct list_head ioeventfds_list;
+ struct vfio_pci_vf_token *vf_token;
+ struct notifier_block nb;
struct mutex vma_lock;
struct list_head vma_list;
struct rw_semaphore memory_lock;
@@ -201,4 +213,16 @@
return -ENODEV;
}
#endif
+
+#ifdef CONFIG_VFIO_PCI_ZDEV
+extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps);
+#else
+static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ return -ENODEV;
+}
+#endif
+
#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 83f81d2..a0b5fc8 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -37,17 +37,70 @@
#define vfio_ioread8 ioread8
#define vfio_iowrite8 iowrite8
+#define VFIO_IOWRITE(size) \
+static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ vfio_iowrite##size(val, io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOWRITE(8)
+VFIO_IOWRITE(16)
+VFIO_IOWRITE(32)
+#ifdef iowrite64
+VFIO_IOWRITE(64)
+#endif
+
+#define VFIO_IOREAD(size) \
+static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size *val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ *val = vfio_ioread##size(io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOREAD(8)
+VFIO_IOREAD(16)
+VFIO_IOREAD(32)
+
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
* range which is inaccessible. The excluded range drops writes and fills
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
-static ssize_t do_io_rw(void __iomem *io, char __user *buf,
+static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem,
+ void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite)
{
ssize_t done = 0;
+ int ret;
while (count) {
size_t fillable, filled;
@@ -66,9 +119,15 @@
if (copy_from_user(&val, buf, 4))
return -EFAULT;
- vfio_iowrite32(val, io + off);
+ ret = vfio_pci_iowrite32(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread32(io + off);
+ ret = vfio_pci_ioread32(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 4))
return -EFAULT;
@@ -82,9 +141,15 @@
if (copy_from_user(&val, buf, 2))
return -EFAULT;
- vfio_iowrite16(val, io + off);
+ ret = vfio_pci_iowrite16(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread16(io + off);
+ ret = vfio_pci_ioread16(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 2))
return -EFAULT;
@@ -98,9 +163,15 @@
if (copy_from_user(&val, buf, 1))
return -EFAULT;
- vfio_iowrite8(val, io + off);
+ ret = vfio_pci_iowrite8(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread8(io + off);
+ ret = vfio_pci_ioread8(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 1))
return -EFAULT;
@@ -178,14 +249,6 @@
count = min(count, (size_t)(end - pos));
- if (res->flags & IORESOURCE_MEM) {
- down_read(&vdev->memory_lock);
- if (!__vfio_pci_memory_enabled(vdev)) {
- up_read(&vdev->memory_lock);
- return -EIO;
- }
- }
-
if (bar == PCI_ROM_RESOURCE) {
/*
* The ROM can fill less space than the BAR, so we start the
@@ -213,7 +276,8 @@
x_end = vdev->msix_offset + vdev->msix_size;
}
- done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
+ done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+ count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
@@ -221,9 +285,6 @@
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
out:
- if (res->flags & IORESOURCE_MEM)
- up_read(&vdev->memory_lock);
-
return done;
}
@@ -246,7 +307,7 @@
switch ((u32)pos) {
case 0xa0000 ... 0xbffff:
count = min(count, (size_t)(0xc0000 - pos));
- iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
+ iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1);
off = pos - 0xa0000;
rsrc = VGA_RSRC_LEGACY_MEM;
is_ioport = false;
@@ -278,7 +339,12 @@
return ret;
}
- done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
+ /*
+ * VGA MMIO is a legacy, non-BAR resource that hopefully allows
+ * probing, so we don't currently worry about access in relation
+ * to the memory enable bit in the command register.
+ */
+ done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
@@ -290,30 +356,60 @@
return done;
}
-static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
+ bool test_mem)
{
- struct vfio_pci_ioeventfd *ioeventfd = opaque;
-
switch (ioeventfd->count) {
case 1:
- vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite8(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 2:
- vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite16(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 4:
- vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite32(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#ifdef iowrite64
case 8:
- vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite64(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#endif
}
+}
+
+static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+{
+ struct vfio_pci_ioeventfd *ioeventfd = opaque;
+ struct vfio_pci_device *vdev = ioeventfd->vdev;
+
+ if (ioeventfd->test_mem) {
+ if (!down_read_trylock(&vdev->memory_lock))
+ return 1; /* Lock contended, use thread */
+ if (!__vfio_pci_memory_enabled(vdev)) {
+ up_read(&vdev->memory_lock);
+ return 0;
+ }
+ }
+
+ vfio_pci_ioeventfd_do_write(ioeventfd, false);
+
+ if (ioeventfd->test_mem)
+ up_read(&vdev->memory_lock);
return 0;
}
+static void vfio_pci_ioeventfd_thread(void *opaque, void *unused)
+{
+ struct vfio_pci_ioeventfd *ioeventfd = opaque;
+
+ vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem);
+}
+
long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
uint64_t data, int count, int fd)
{
@@ -378,14 +474,17 @@
goto out_unlock;
}
+ ioeventfd->vdev = vdev;
ioeventfd->addr = vdev->barmap[bar] + pos;
ioeventfd->data = data;
ioeventfd->pos = pos;
ioeventfd->bar = bar;
ioeventfd->count = count;
+ ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM;
ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
- NULL, NULL, &ioeventfd->virqfd, fd);
+ vfio_pci_ioeventfd_thread, NULL,
+ &ioeventfd->virqfd, fd);
if (ret) {
kfree(ioeventfd);
goto out_unlock;
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
new file mode 100644
index 0000000..1bb7eda
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2020. All rights reserved.
+ * Author(s): Pierre Morel <pmorel@linux.ibm.com>
+ * Matthew Rosato <mjrosato@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_zdev.h>
+#include <asm/pci_clp.h>
+#include <asm/pci_io.h>
+
+#include "vfio_pci_private.h"
+
+/*
+ * Add the Base PCI Function information to the device info region.
+ */
+static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_base cap = {
+ .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
+ .header.version = 1,
+ .start_dma = zdev->start_dma,
+ .end_dma = zdev->end_dma,
+ .pchid = zdev->pchid,
+ .vfn = zdev->vfn,
+ .fmb_length = zdev->fmb_length,
+ .pft = zdev->pft,
+ .gid = zdev->pfgid
+ };
+
+ return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the Base PCI Function Group information to the device info region.
+ */
+static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_group cap = {
+ .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
+ .header.version = 1,
+ .dasm = zdev->dma_mask,
+ .msi_addr = zdev->msi_addr,
+ .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH,
+ .mui = zdev->fmb_update,
+ .noi = zdev->max_msi,
+ .maxstbl = ZPCI_MAX_WRITE_SIZE,
+ .version = zdev->version
+ };
+
+ return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the device utility string to the device info region.
+ */
+static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_util *cap;
+ int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN;
+ int ret;
+
+ cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
+
+ cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL;
+ cap->header.version = 1;
+ cap->size = CLP_UTIL_STR_LEN;
+ memcpy(cap->util_str, zdev->util_str, cap->size);
+
+ ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+ kfree(cap);
+
+ return ret;
+}
+
+/*
+ * Add the function path string to the device info region.
+ */
+static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_pfip *cap;
+ int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS;
+ int ret;
+
+ cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
+
+ cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP;
+ cap->header.version = 1;
+ cap->size = CLP_PFIP_NR_SEGMENTS;
+ memcpy(cap->pfip, zdev->pfip, cap->size);
+
+ ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+ kfree(cap);
+
+ return ret;
+}
+
+/*
+ * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain.
+ */
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct zpci_dev *zdev = to_zpci(vdev->pdev);
+ int ret;
+
+ if (!zdev)
+ return -ENODEV;
+
+ ret = zpci_base_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+
+ ret = zpci_group_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+
+ if (zdev->util_str_avail) {
+ ret = zpci_util_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+ }
+
+ ret = zpci_pfip_cap(zdev, vdev, caps);
+
+ return ret;
+}
diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
index 2d2babe..abdca90 100644
--- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
+++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
@@ -24,7 +24,7 @@
#define MDIO_AN_INT 0x8002
#define MDIO_AN_INTMASK 0x8001
-static unsigned int xmdio_read(void *ioaddr, unsigned int mmd,
+static unsigned int xmdio_read(void __iomem *ioaddr, unsigned int mmd,
unsigned int reg)
{
unsigned int mmd_address, value;
@@ -35,7 +35,7 @@
return value;
}
-static void xmdio_write(void *ioaddr, unsigned int mmd,
+static void xmdio_write(void __iomem *ioaddr, unsigned int mmd,
unsigned int reg, unsigned int value)
{
unsigned int mmd_address;
@@ -54,13 +54,13 @@
if (!xgmac_regs->ioaddr) {
xgmac_regs->ioaddr =
- ioremap_nocache(xgmac_regs->addr, xgmac_regs->size);
+ ioremap(xgmac_regs->addr, xgmac_regs->size);
if (!xgmac_regs->ioaddr)
return -ENOMEM;
}
if (!xpcs_regs->ioaddr) {
xpcs_regs->ioaddr =
- ioremap_nocache(xpcs_regs->addr, xpcs_regs->size);
+ ioremap(xpcs_regs->addr, xpcs_regs->size);
if (!xpcs_regs->ioaddr)
return -ENOMEM;
}
diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
index 16165a6..96064ef 100644
--- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
+++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
@@ -82,7 +82,7 @@
/* Map FlexRM ring registers if not mapped */
if (!reg->ioaddr) {
- reg->ioaddr = ioremap_nocache(reg->addr, reg->size);
+ reg->ioaddr = ioremap(reg->addr, reg->size);
if (!reg->ioaddr)
return -ENOMEM;
}
diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
index f67bab5..09a9453 100644
--- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
+++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
@@ -52,7 +52,7 @@
if (!reg->ioaddr) {
reg->ioaddr =
- ioremap_nocache(reg->addr, reg->size);
+ ioremap(reg->addr, reg->size);
if (!reg->ioaddr)
return -ENOMEM;
}
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index 6f72703..e83a7cd 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -408,7 +408,7 @@
if (!reg->ioaddr) {
reg->ioaddr =
- ioremap_nocache(reg->addr, reg->size);
+ ioremap(reg->addr, reg->size);
if (!reg->ioaddr)
return -ENOMEM;
@@ -485,7 +485,7 @@
if (!reg->ioaddr) {
reg->ioaddr =
- ioremap_nocache(reg->addr, reg->size);
+ ioremap(reg->addr, reg->size);
if (!reg->ioaddr)
return -ENOMEM;
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 3885979..2151bc7 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -85,6 +85,7 @@
atomic_t opened;
wait_queue_head_t container_q;
bool noiommu;
+ unsigned int dev_counter;
struct kvm *kvm;
struct blocking_notifier_head notifier;
};
@@ -555,6 +556,7 @@
mutex_lock(&group->device_lock);
list_add(&device->group_next, &group->device_list);
+ group->dev_counter++;
mutex_unlock(&group->device_lock);
return device;
@@ -567,6 +569,7 @@
struct vfio_group *group = device->group;
list_del(&device->group_next);
+ group->dev_counter--;
mutex_unlock(&group->device_lock);
dev_set_drvdata(device->dev, NULL);
@@ -624,9 +627,10 @@
* that error notification via MSI can be affected for platforms that handle
* MSI within the same IOVA space as DMA.
*/
-static const char * const vfio_driver_whitelist[] = { "pci-stub" };
+static const char * const vfio_driver_allowed[] = { "pci-stub" };
-static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
+static bool vfio_dev_driver_allowed(struct device *dev,
+ struct device_driver *drv)
{
if (dev_is_pci(dev)) {
struct pci_dev *pdev = to_pci_dev(dev);
@@ -635,8 +639,8 @@
return true;
}
- return match_string(vfio_driver_whitelist,
- ARRAY_SIZE(vfio_driver_whitelist),
+ return match_string(vfio_driver_allowed,
+ ARRAY_SIZE(vfio_driver_allowed),
drv->name) >= 0;
}
@@ -645,7 +649,7 @@
* one of the following states:
* - driver-less
* - bound to a vfio driver
- * - bound to a whitelisted driver
+ * - bound to an otherwise allowed driver
* - a PCI interconnect device
*
* We use two methods to determine whether a device is bound to a vfio
@@ -671,7 +675,7 @@
}
mutex_unlock(&group->unbound_lock);
- if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
+ if (!ret || !drv || vfio_dev_driver_allowed(dev, drv))
return 0;
device = vfio_group_get_device(group, dev);
@@ -875,11 +879,23 @@
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
char *buf)
{
- struct vfio_device *it, *device = NULL;
+ struct vfio_device *it, *device = ERR_PTR(-ENODEV);
mutex_lock(&group->device_lock);
list_for_each_entry(it, &group->device_list, group_next) {
- if (!strcmp(dev_name(it->dev), buf)) {
+ int ret;
+
+ if (it->ops->match) {
+ ret = it->ops->match(it->device_data, buf);
+ if (ret < 0) {
+ device = ERR_PTR(ret);
+ break;
+ }
+ } else {
+ ret = !strcmp(dev_name(it->dev), buf);
+ }
+
+ if (ret) {
device = it;
vfio_device_get(device);
break;
@@ -1184,15 +1200,6 @@
return ret;
}
-#ifdef CONFIG_COMPAT
-static long vfio_fops_compat_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
-{
- arg = (unsigned long)compat_ptr(arg);
- return vfio_fops_unl_ioctl(filep, cmd, arg);
-}
-#endif /* CONFIG_COMPAT */
-
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
struct vfio_container *container;
@@ -1275,9 +1282,7 @@
.read = vfio_fops_read,
.write = vfio_fops_write,
.unlocked_ioctl = vfio_fops_unl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = vfio_fops_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.mmap = vfio_fops_mmap,
};
@@ -1441,8 +1446,8 @@
return -EPERM;
device = vfio_device_get_from_name(group, buf);
- if (!device)
- return -ENODEV;
+ if (IS_ERR(device))
+ return PTR_ERR(device);
ret = device->ops->open(device->device_data);
if (ret) {
@@ -1556,15 +1561,6 @@
return ret;
}
-#ifdef CONFIG_COMPAT
-static long vfio_group_fops_compat_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
-{
- arg = (unsigned long)compat_ptr(arg);
- return vfio_group_fops_unl_ioctl(filep, cmd, arg);
-}
-#endif /* CONFIG_COMPAT */
-
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
struct vfio_group *group;
@@ -1620,9 +1616,7 @@
static const struct file_operations vfio_group_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = vfio_group_fops_unl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = vfio_group_fops_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.open = vfio_group_fops_open,
.release = vfio_group_fops_release,
};
@@ -1687,24 +1681,13 @@
return device->ops->mmap(device->device_data, vma);
}
-#ifdef CONFIG_COMPAT
-static long vfio_device_fops_compat_ioctl(struct file *filep,
- unsigned int cmd, unsigned long arg)
-{
- arg = (unsigned long)compat_ptr(arg);
- return vfio_device_fops_unl_ioctl(filep, cmd, arg);
-}
-#endif /* CONFIG_COMPAT */
-
static const struct file_operations vfio_device_fops = {
.owner = THIS_MODULE,
.release = vfio_device_fops_release,
.read = vfio_device_fops_read,
.write = vfio_device_fops_write,
.unlocked_ioctl = vfio_device_fops_unl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = vfio_device_fops_compat_ioctl,
-#endif
+ .compat_ioctl = compat_ptr_ioctl,
.mmap = vfio_device_fops_mmap,
};
@@ -1753,6 +1736,44 @@
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
+/**
+ * External user API, exported by symbols to be linked dynamically.
+ * The external user passes in a device pointer
+ * to verify that:
+ * - A VFIO group is assiciated with the device;
+ * - IOMMU is set for the group.
+ * If both checks passed, vfio_group_get_external_user_from_dev()
+ * increments the container user counter to prevent the VFIO group
+ * from disposal before external user exits and returns the pointer
+ * to the VFIO group.
+ *
+ * When the external user finishes using the VFIO group, it calls
+ * vfio_group_put_external_user() to release the VFIO group and
+ * decrement the container user counter.
+ *
+ * @dev [in] : device
+ * Return error PTR or pointer to VFIO group.
+ */
+
+struct vfio_group *vfio_group_get_external_user_from_dev(struct device *dev)
+{
+ struct vfio_group *group;
+ int ret;
+
+ group = vfio_group_get_from_dev(dev);
+ if (!group)
+ return ERR_PTR(-ENODEV);
+
+ ret = vfio_group_add_container_user(group);
+ if (ret) {
+ vfio_group_put(group);
+ return ERR_PTR(ret);
+ }
+
+ return group;
+}
+EXPORT_SYMBOL_GPL(vfio_group_get_external_user_from_dev);
+
void vfio_group_put_external_user(struct vfio_group *group)
{
vfio_group_try_dissolve_container(group);
@@ -1928,6 +1949,11 @@
if (!group)
return -ENODEV;
+ if (group->dev_counter > 1) {
+ ret = -EINVAL;
+ goto err_pin_pages;
+ }
+
ret = vfio_group_add_container_user(group);
if (ret)
goto err_pin_pages;
@@ -1935,7 +1961,8 @@
container = group->container;
driver = container->iommu_driver;
if (likely(driver && driver->ops->pin_pages))
- ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
+ ret = driver->ops->pin_pages(container->iommu_data,
+ group->iommu_group, user_pfn,
npage, prot, phys_pfn);
else
ret = -ENOTTY;
@@ -1994,6 +2021,149 @@
}
EXPORT_SYMBOL(vfio_unpin_pages);
+/*
+ * Pin a set of guest IOVA PFNs and return their associated host PFNs for a
+ * VFIO group.
+ *
+ * The caller needs to call vfio_group_get_external_user() or
+ * vfio_group_get_external_user_from_dev() prior to calling this interface,
+ * so as to prevent the VFIO group from disposal in the middle of the call.
+ * But it can keep the reference to the VFIO group for several calls into
+ * this interface.
+ * After finishing using of the VFIO group, the caller needs to release the
+ * VFIO group by calling vfio_group_put_external_user().
+ *
+ * @group [in] : VFIO group
+ * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be pinned.
+ * @npage [in] : count of elements in user_iova_pfn array.
+ * This count should not be greater
+ * VFIO_PIN_PAGES_MAX_ENTRIES.
+ * @prot [in] : protection flags
+ * @phys_pfn [out] : array of host PFNs
+ * Return error or number of pages pinned.
+ */
+int vfio_group_pin_pages(struct vfio_group *group,
+ unsigned long *user_iova_pfn, int npage,
+ int prot, unsigned long *phys_pfn)
+{
+ struct vfio_container *container;
+ struct vfio_iommu_driver *driver;
+ int ret;
+
+ if (!group || !user_iova_pfn || !phys_pfn || !npage)
+ return -EINVAL;
+
+ if (group->dev_counter > 1)
+ return -EINVAL;
+
+ if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
+ return -E2BIG;
+
+ container = group->container;
+ driver = container->iommu_driver;
+ if (likely(driver && driver->ops->pin_pages))
+ ret = driver->ops->pin_pages(container->iommu_data,
+ group->iommu_group, user_iova_pfn,
+ npage, prot, phys_pfn);
+ else
+ ret = -ENOTTY;
+
+ return ret;
+}
+EXPORT_SYMBOL(vfio_group_pin_pages);
+
+/*
+ * Unpin a set of guest IOVA PFNs for a VFIO group.
+ *
+ * The caller needs to call vfio_group_get_external_user() or
+ * vfio_group_get_external_user_from_dev() prior to calling this interface,
+ * so as to prevent the VFIO group from disposal in the middle of the call.
+ * But it can keep the reference to the VFIO group for several calls into
+ * this interface.
+ * After finishing using of the VFIO group, the caller needs to release the
+ * VFIO group by calling vfio_group_put_external_user().
+ *
+ * @group [in] : vfio group
+ * @user_iova_pfn [in] : array of user/guest IOVA PFNs to be unpinned.
+ * @npage [in] : count of elements in user_iova_pfn array.
+ * This count should not be greater than
+ * VFIO_PIN_PAGES_MAX_ENTRIES.
+ * Return error or number of pages unpinned.
+ */
+int vfio_group_unpin_pages(struct vfio_group *group,
+ unsigned long *user_iova_pfn, int npage)
+{
+ struct vfio_container *container;
+ struct vfio_iommu_driver *driver;
+ int ret;
+
+ if (!group || !user_iova_pfn || !npage)
+ return -EINVAL;
+
+ if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
+ return -E2BIG;
+
+ container = group->container;
+ driver = container->iommu_driver;
+ if (likely(driver && driver->ops->unpin_pages))
+ ret = driver->ops->unpin_pages(container->iommu_data,
+ user_iova_pfn, npage);
+ else
+ ret = -ENOTTY;
+
+ return ret;
+}
+EXPORT_SYMBOL(vfio_group_unpin_pages);
+
+
+/*
+ * This interface allows the CPUs to perform some sort of virtual DMA on
+ * behalf of the device.
+ *
+ * CPUs read/write from/into a range of IOVAs pointing to user space memory
+ * into/from a kernel buffer.
+ *
+ * As the read/write of user space memory is conducted via the CPUs and is
+ * not a real device DMA, it is not necessary to pin the user space memory.
+ *
+ * The caller needs to call vfio_group_get_external_user() or
+ * vfio_group_get_external_user_from_dev() prior to calling this interface,
+ * so as to prevent the VFIO group from disposal in the middle of the call.
+ * But it can keep the reference to the VFIO group for several calls into
+ * this interface.
+ * After finishing using of the VFIO group, the caller needs to release the
+ * VFIO group by calling vfio_group_put_external_user().
+ *
+ * @group [in] : VFIO group
+ * @user_iova [in] : base IOVA of a user space buffer
+ * @data [in] : pointer to kernel buffer
+ * @len [in] : kernel buffer length
+ * @write : indicate read or write
+ * Return error code on failure or 0 on success.
+ */
+int vfio_dma_rw(struct vfio_group *group, dma_addr_t user_iova,
+ void *data, size_t len, bool write)
+{
+ struct vfio_container *container;
+ struct vfio_iommu_driver *driver;
+ int ret = 0;
+
+ if (!group || !data || len <= 0)
+ return -EINVAL;
+
+ container = group->container;
+ driver = container->iommu_driver;
+
+ if (likely(driver && driver->ops->dma_rw))
+ ret = driver->ops->dma_rw(container->iommu_data,
+ user_iova, data, len, write);
+ else
+ ret = -ENOTTY;
+
+ return ret;
+}
+EXPORT_SYMBOL(vfio_dma_rw);
+
static int vfio_register_iommu_notifier(struct vfio_group *group,
unsigned long *events,
struct notifier_block *nb)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 26cef65..fe888b5 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -79,7 +79,7 @@
}
BUG_ON(!current->mm);
container->mm = current->mm;
- atomic_inc(&container->mm->mm_count);
+ mmgrab(container->mm);
return 0;
}
@@ -383,7 +383,7 @@
struct page *page;
page = pfn_to_page(hpa >> PAGE_SHIFT);
- put_page(page);
+ unpin_user_page(page);
}
static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
@@ -486,7 +486,7 @@
struct page *page = NULL;
enum dma_data_direction direction = iommu_tce_direction(tce);
- if (get_user_pages_fast(tce & PAGE_MASK, 1,
+ if (pin_user_pages_fast(tce & PAGE_MASK, 1,
direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
&page) != 1)
return -EFAULT;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6b1e8cb..fbd438e 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -28,6 +28,7 @@
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mm.h>
+#include <linux/kthread.h>
#include <linux/rbtree.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
@@ -69,8 +70,11 @@
struct rb_root dma_list;
struct blocking_notifier_head notifier;
unsigned int dma_avail;
+ uint64_t pgsize_bitmap;
bool v2;
bool nesting;
+ bool dirty_page_tracking;
+ bool pinned_page_dirty_scope;
};
struct vfio_domain {
@@ -91,12 +95,14 @@
bool lock_cap; /* capable(CAP_IPC_LOCK) */
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
+ unsigned long *bitmap;
};
struct vfio_group {
struct iommu_group *iommu_group;
struct list_head next;
bool mdev_group; /* An mdev group */
+ bool pinned_page_dirty_scope;
};
struct vfio_iova {
@@ -112,7 +118,7 @@
struct rb_node node;
dma_addr_t iova; /* Device address */
unsigned long pfn; /* Host pfn */
- atomic_t ref_count;
+ unsigned int ref_count;
};
struct vfio_regions {
@@ -125,8 +131,25 @@
#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
(!list_empty(&iommu->domain_list))
+#define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
+
+/*
+ * Input argument of number of bits to bitmap_set() is unsigned integer, which
+ * further casts to signed integer for unaligned multi-bit operation,
+ * __bitmap_set().
+ * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
+ * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
+ * system.
+ */
+#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
+#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
+
static int put_pfn(unsigned long pfn, int prot);
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+ struct iommu_group *iommu_group);
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu);
/*
* This code handles mapping and unmapping of user data buffers
* into DMA'ble space using the IOMMU
@@ -175,6 +198,93 @@
rb_erase(&old->node, &iommu->dma_list);
}
+
+static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
+{
+ uint64_t npages = dma->size / pgsize;
+
+ if (npages > DIRTY_BITMAP_PAGES_MAX)
+ return -EINVAL;
+
+ /*
+ * Allocate extra 64 bits that are used to calculate shift required for
+ * bitmap_shift_left() to manipulate and club unaligned number of pages
+ * in adjacent vfio_dma ranges.
+ */
+ dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
+ GFP_KERNEL);
+ if (!dma->bitmap)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void vfio_dma_bitmap_free(struct vfio_dma *dma)
+{
+ kfree(dma->bitmap);
+ dma->bitmap = NULL;
+}
+
+static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
+{
+ struct rb_node *p;
+ unsigned long pgshift = __ffs(pgsize);
+
+ for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
+ struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
+
+ bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
+ }
+}
+
+static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
+{
+ struct rb_node *n;
+ unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+ for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+ struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+ bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
+ }
+}
+
+static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+ struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+ int ret;
+
+ ret = vfio_dma_bitmap_alloc(dma, pgsize);
+ if (ret) {
+ struct rb_node *p;
+
+ for (p = rb_prev(n); p; p = rb_prev(p)) {
+ struct vfio_dma *dma = rb_entry(n,
+ struct vfio_dma, node);
+
+ vfio_dma_bitmap_free(dma);
+ }
+ return ret;
+ }
+ vfio_dma_populate_bitmap(dma, pgsize);
+ }
+ return 0;
+}
+
+static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
+{
+ struct rb_node *n;
+
+ for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+ struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+ vfio_dma_bitmap_free(dma);
+ }
+}
+
/*
* Helper Functions for host iova-pfn list
*/
@@ -233,7 +343,7 @@
vpfn->iova = iova;
vpfn->pfn = pfn;
- atomic_set(&vpfn->ref_count, 1);
+ vpfn->ref_count = 1;
vfio_link_pfn(dma, vpfn);
return 0;
}
@@ -251,7 +361,7 @@
struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
if (vpfn)
- atomic_inc(&vpfn->ref_count);
+ vpfn->ref_count++;
return vpfn;
}
@@ -259,7 +369,8 @@
{
int ret = 0;
- if (atomic_dec_and_test(&vpfn->ref_count)) {
+ vpfn->ref_count--;
+ if (!vpfn->ref_count) {
ret = put_pfn(vpfn->pfn, dma->prot);
vfio_remove_from_pfn_list(dma, vpfn);
}
@@ -278,11 +389,11 @@
if (!mm)
return -ESRCH; /* process exited */
- ret = down_write_killable(&mm->mmap_sem);
+ ret = mmap_write_lock_killable(mm);
if (!ret) {
ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
dma->lock_cap);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
if (async)
@@ -295,31 +406,13 @@
* Some mappings aren't backed by a struct page, for example an mmap'd
* MMIO range for our own or another device. These use a different
* pfn conversion and shouldn't be tracked as locked pages.
+ * For compound pages, any driver that sets the reserved bit in head
+ * page needs to set the reserved bit in all subpages to be safe.
*/
static bool is_invalid_reserved_pfn(unsigned long pfn)
{
- if (pfn_valid(pfn)) {
- bool reserved;
- struct page *tail = pfn_to_page(pfn);
- struct page *head = compound_head(tail);
- reserved = !!(PageReserved(head));
- if (head != tail) {
- /*
- * "head" is not a dangling pointer
- * (compound_head takes care of that)
- * but the hugepage may have been split
- * from under us (and we may not hold a
- * reference count on the head page so it can
- * be reused before we run PageReferenced), so
- * we've to check PageTail before returning
- * what we just read.
- */
- smp_rmb();
- if (PageTail(tail))
- return reserved;
- }
- return PageReserved(tail);
- }
+ if (pfn_valid(pfn))
+ return PageReserved(pfn_to_page(pfn));
return true;
}
@@ -328,9 +421,8 @@
{
if (!is_invalid_reserved_pfn(pfn)) {
struct page *page = pfn_to_page(pfn);
- if (prot & IOMMU_WRITE)
- SetPageDirty(page);
- put_page(page);
+
+ unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
return 1;
}
return 0;
@@ -348,7 +440,7 @@
if (ret) {
bool unlocked = false;
- ret = fixup_user_fault(NULL, mm, vaddr,
+ ret = fixup_user_fault(mm, vaddr,
FAULT_FLAG_REMOTE |
(write_fault ? FAULT_FLAG_WRITE : 0),
&unlocked);
@@ -377,41 +469,21 @@
{
struct page *page[1];
struct vm_area_struct *vma;
- struct vm_area_struct *vmas[1];
unsigned int flags = 0;
int ret;
if (prot & IOMMU_WRITE)
flags |= FOLL_WRITE;
- down_read(&mm->mmap_sem);
- if (mm == current->mm) {
- ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
- vmas);
- } else {
- ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
- vmas, NULL);
- /*
- * The lifetime of a vaddr_get_pfn() page pin is
- * userspace-controlled. In the fs-dax case this could
- * lead to indefinite stalls in filesystem operations.
- * Disallow attempts to pin fs-dax pages via this
- * interface.
- */
- if (ret > 0 && vma_is_fsdax(vmas[0])) {
- ret = -EOPNOTSUPP;
- put_page(page[0]);
- }
- }
- up_read(&mm->mmap_sem);
-
+ mmap_read_lock(mm);
+ ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
+ page, NULL, NULL);
if (ret == 1) {
*pfn = page_to_pfn(page[0]);
- return 0;
+ ret = 0;
+ goto done;
}
- down_read(&mm->mmap_sem);
-
vaddr = untagged_addr(vaddr);
retry:
@@ -425,8 +497,8 @@
if (!ret && !is_invalid_reserved_pfn(*pfn))
ret = -EFAULT;
}
-
- up_read(&mm->mmap_sem);
+done:
+ mmap_read_unlock(mm);
return ret;
}
@@ -580,11 +652,13 @@
}
static int vfio_iommu_type1_pin_pages(void *iommu_data,
+ struct iommu_group *iommu_group,
unsigned long *user_pfn,
int npage, int prot,
unsigned long *phys_pfn)
{
struct vfio_iommu *iommu = iommu_data;
+ struct vfio_group *group;
int i, j, ret;
unsigned long remote_vaddr;
struct vfio_dma *dma;
@@ -646,9 +720,26 @@
vfio_lock_acct(dma, -1, true);
goto pin_unwind;
}
+
+ if (iommu->dirty_page_tracking) {
+ unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+
+ /*
+ * Bitmap populated with the smallest supported page
+ * size
+ */
+ bitmap_set(dma->bitmap,
+ (iova - dma->iova) >> pgshift, 1);
+ }
+ }
+ ret = i;
+
+ group = vfio_iommu_find_iommu_group(iommu, iommu_group);
+ if (!group->pinned_page_dirty_scope) {
+ group->pinned_page_dirty_scope = true;
+ update_pinned_page_dirty_scope(iommu);
}
- ret = i;
goto pin_done;
pin_unwind:
@@ -707,7 +798,7 @@
long unlocked = 0;
struct vfio_regions *entry, *next;
- iommu_tlb_sync(domain->domain, iotlb_gather);
+ iommu_iotlb_sync(domain->domain, iotlb_gather);
list_for_each_entry_safe(entry, next, regions, list) {
unlocked += vfio_unpin_pages_remote(dma,
@@ -881,19 +972,19 @@
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
+ vfio_dma_bitmap_free(dma);
kfree(dma);
iommu->dma_avail++;
}
-static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
+static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
{
struct vfio_domain *domain;
- unsigned long bitmap = ULONG_MAX;
- mutex_lock(&iommu->lock);
+ iommu->pgsize_bitmap = ULONG_MAX;
+
list_for_each_entry(domain, &iommu->domain_list, next)
- bitmap &= domain->domain->pgsize_bitmap;
- mutex_unlock(&iommu->lock);
+ iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
/*
* In case the IOMMU supports page sizes smaller than PAGE_SIZE
@@ -903,36 +994,143 @@
* granularity while iommu driver can use the sub-PAGE_SIZE size
* to map the buffer.
*/
- if (bitmap & ~PAGE_MASK) {
- bitmap &= PAGE_MASK;
- bitmap |= PAGE_SIZE;
+ if (iommu->pgsize_bitmap & ~PAGE_MASK) {
+ iommu->pgsize_bitmap &= PAGE_MASK;
+ iommu->pgsize_bitmap |= PAGE_SIZE;
+ }
+}
+
+static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
+ struct vfio_dma *dma, dma_addr_t base_iova,
+ size_t pgsize)
+{
+ unsigned long pgshift = __ffs(pgsize);
+ unsigned long nbits = dma->size >> pgshift;
+ unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
+ unsigned long copy_offset = bit_offset / BITS_PER_LONG;
+ unsigned long shift = bit_offset % BITS_PER_LONG;
+ unsigned long leftover;
+
+ /*
+ * mark all pages dirty if any IOMMU capable device is not able
+ * to report dirty pages and all pages are pinned and mapped.
+ */
+ if (!iommu->pinned_page_dirty_scope && dma->iommu_mapped)
+ bitmap_set(dma->bitmap, 0, nbits);
+
+ if (shift) {
+ bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
+ nbits + shift);
+
+ if (copy_from_user(&leftover,
+ (void __user *)(bitmap + copy_offset),
+ sizeof(leftover)))
+ return -EFAULT;
+
+ bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
}
- return bitmap;
+ if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
+ DIRTY_BITMAP_BYTES(nbits + shift)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
+ dma_addr_t iova, size_t size, size_t pgsize)
+{
+ struct vfio_dma *dma;
+ struct rb_node *n;
+ unsigned long pgshift = __ffs(pgsize);
+ int ret;
+
+ /*
+ * GET_BITMAP request must fully cover vfio_dma mappings. Multiple
+ * vfio_dma mappings may be clubbed by specifying large ranges, but
+ * there must not be any previous mappings bisected by the range.
+ * An error will be returned if these conditions are not met.
+ */
+ dma = vfio_find_dma(iommu, iova, 1);
+ if (dma && dma->iova != iova)
+ return -EINVAL;
+
+ dma = vfio_find_dma(iommu, iova + size - 1, 0);
+ if (dma && dma->iova + dma->size != iova + size)
+ return -EINVAL;
+
+ for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
+ struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
+
+ if (dma->iova < iova)
+ continue;
+
+ if (dma->iova > iova + size - 1)
+ break;
+
+ ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
+ if (ret)
+ return ret;
+
+ /*
+ * Re-populate bitmap to include all pinned pages which are
+ * considered as dirty but exclude pages which are unpinned and
+ * pages which are marked dirty by vfio_dma_rw()
+ */
+ bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
+ vfio_dma_populate_bitmap(dma, pgsize);
+ }
+ return 0;
+}
+
+static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
+{
+ if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
+ (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
+ return -EINVAL;
+
+ return 0;
}
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
- struct vfio_iommu_type1_dma_unmap *unmap)
+ struct vfio_iommu_type1_dma_unmap *unmap,
+ struct vfio_bitmap *bitmap)
{
- uint64_t mask;
struct vfio_dma *dma, *dma_last = NULL;
- size_t unmapped = 0;
+ size_t unmapped = 0, pgsize;
int ret = 0, retries = 0;
+ unsigned long pgshift;
- mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
- if (unmap->iova & mask)
- return -EINVAL;
- if (!unmap->size || unmap->size & mask)
- return -EINVAL;
- if (unmap->iova + unmap->size - 1 < unmap->iova ||
- unmap->size > SIZE_MAX)
- return -EINVAL;
-
- WARN_ON(mask & PAGE_MASK);
-again:
mutex_lock(&iommu->lock);
+ pgshift = __ffs(iommu->pgsize_bitmap);
+ pgsize = (size_t)1 << pgshift;
+
+ if (unmap->iova & (pgsize - 1)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!unmap->size || unmap->size & (pgsize - 1)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (unmap->iova + unmap->size - 1 < unmap->iova ||
+ unmap->size > SIZE_MAX) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /* When dirty tracking is enabled, allow only min supported pgsize */
+ if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
+ (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ WARN_ON((pgsize - 1) & PAGE_MASK);
+again:
/*
* vfio-iommu-type1 (v1) - User mappings were coalesced together to
* avoid tracking individual mappings. This means that the granularity
@@ -1010,8 +1208,17 @@
blocking_notifier_call_chain(&iommu->notifier,
VFIO_IOMMU_NOTIFY_DMA_UNMAP,
&nb_unmap);
+ mutex_lock(&iommu->lock);
goto again;
}
+
+ if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+ ret = update_user_bitmap(bitmap->data, iommu, dma,
+ unmap->iova, pgsize);
+ if (ret)
+ break;
+ }
+
unmapped += dma->size;
vfio_remove_dma(iommu, dma);
}
@@ -1043,8 +1250,10 @@
return 0;
unwind:
- list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
+ list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
+ cond_resched();
+ }
return ret;
}
@@ -1118,31 +1327,35 @@
unsigned long vaddr = map->vaddr;
size_t size = map->size;
int ret = 0, prot = 0;
- uint64_t mask;
+ size_t pgsize;
struct vfio_dma *dma;
/* Verify that none of our __u64 fields overflow */
if (map->size != size || map->vaddr != vaddr || map->iova != iova)
return -EINVAL;
- mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
-
- WARN_ON(mask & PAGE_MASK);
-
/* READ/WRITE from device perspective */
if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
prot |= IOMMU_WRITE;
if (map->flags & VFIO_DMA_MAP_FLAG_READ)
prot |= IOMMU_READ;
- if (!prot || !size || (size | iova | vaddr) & mask)
- return -EINVAL;
+ mutex_lock(&iommu->lock);
+
+ pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+
+ WARN_ON((pgsize - 1) & PAGE_MASK);
+
+ if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
/* Don't allow IOVA or virtual address wrap */
- if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
- return -EINVAL;
-
- mutex_lock(&iommu->lock);
+ if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
if (vfio_find_dma(iommu, iova, size)) {
ret = -EEXIST;
@@ -1210,6 +1423,12 @@
else
ret = vfio_pin_map_dma(iommu, dma, size);
+ if (!ret && iommu->dirty_page_tracking) {
+ ret = vfio_dma_bitmap_alloc(dma, pgsize);
+ if (ret)
+ vfio_remove_dma(iommu, dma);
+ }
+
out_unlock:
mutex_unlock(&iommu->lock);
return ret;
@@ -1409,6 +1628,51 @@
return NULL;
}
+static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
+ struct iommu_group *iommu_group)
+{
+ struct vfio_domain *domain;
+ struct vfio_group *group = NULL;
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ group = find_iommu_group(domain, iommu_group);
+ if (group)
+ return group;
+ }
+
+ if (iommu->external_domain)
+ group = find_iommu_group(iommu->external_domain, iommu_group);
+
+ return group;
+}
+
+static void update_pinned_page_dirty_scope(struct vfio_iommu *iommu)
+{
+ struct vfio_domain *domain;
+ struct vfio_group *group;
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ list_for_each_entry(group, &domain->group_list, next) {
+ if (!group->pinned_page_dirty_scope) {
+ iommu->pinned_page_dirty_scope = false;
+ return;
+ }
+ }
+ }
+
+ if (iommu->external_domain) {
+ domain = iommu->external_domain;
+ list_for_each_entry(group, &domain->group_list, next) {
+ if (!group->pinned_page_dirty_scope) {
+ iommu->pinned_page_dirty_scope = false;
+ return;
+ }
+ }
+ }
+
+ iommu->pinned_page_dirty_scope = true;
+}
+
static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
phys_addr_t *base)
{
@@ -1753,6 +2017,7 @@
list_splice_tail(iova_copy, iova);
}
+
static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
@@ -1769,18 +2034,10 @@
mutex_lock(&iommu->lock);
- list_for_each_entry(d, &iommu->domain_list, next) {
- if (find_iommu_group(d, iommu_group)) {
- mutex_unlock(&iommu->lock);
- return -EINVAL;
- }
- }
-
- if (iommu->external_domain) {
- if (find_iommu_group(iommu->external_domain, iommu_group)) {
- mutex_unlock(&iommu->lock);
- return -EINVAL;
- }
+ /* Check for duplicates */
+ if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
+ mutex_unlock(&iommu->lock);
+ return -EINVAL;
}
group = kzalloc(sizeof(*group), GFP_KERNEL);
@@ -1809,12 +2066,23 @@
if (!iommu->external_domain) {
INIT_LIST_HEAD(&domain->group_list);
iommu->external_domain = domain;
+ vfio_update_pgsize_bitmap(iommu);
} else {
kfree(domain);
}
list_add(&group->next,
&iommu->external_domain->group_list);
+ /*
+ * Non-iommu backed group cannot dirty memory directly,
+ * it can only use interfaces that provide dirty
+ * tracking.
+ * The iommu scope can only be promoted with the
+ * addition of a dirty tracking group.
+ */
+ group->pinned_page_dirty_scope = true;
+ if (!iommu->pinned_page_dirty_scope)
+ update_pinned_page_dirty_scope(iommu);
mutex_unlock(&iommu->lock);
return 0;
@@ -1929,14 +2197,22 @@
if (resv_msi) {
ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
- if (ret)
+ if (ret && ret != -ENODEV)
goto out_detach;
}
list_add(&domain->next, &iommu->domain_list);
+ vfio_update_pgsize_bitmap(iommu);
done:
/* Delete the old one and insert new iova list */
vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+
+ /*
+ * An iommu backed group can dirty memory directly and therefore
+ * demotes the iommu scope until it declares itself dirty tracking
+ * capable via the page pinning interface.
+ */
+ iommu->pinned_page_dirty_scope = false;
mutex_unlock(&iommu->lock);
vfio_iommu_resv_free(&group_resv_regions);
@@ -2072,6 +2348,7 @@
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
+ bool update_dirty_scope = false;
LIST_HEAD(iova_copy);
mutex_lock(&iommu->lock);
@@ -2079,6 +2356,7 @@
if (iommu->external_domain) {
group = find_iommu_group(iommu->external_domain, iommu_group);
if (group) {
+ update_dirty_scope = !group->pinned_page_dirty_scope;
list_del(&group->next);
kfree(group);
@@ -2108,6 +2386,7 @@
continue;
vfio_iommu_detach_group(domain, group);
+ update_dirty_scope = !group->pinned_page_dirty_scope;
list_del(&group->next);
kfree(group);
/*
@@ -2130,6 +2409,7 @@
list_del(&domain->next);
kfree(domain);
vfio_iommu_aper_expand(iommu, &iova_copy);
+ vfio_update_pgsize_bitmap(iommu);
}
break;
}
@@ -2140,6 +2420,15 @@
vfio_iommu_iova_free(&iova_copy);
detach_group_done:
+ /*
+ * Removal of a group without dirty tracking may allow the iommu scope
+ * to be promoted.
+ */
+ if (update_dirty_scope) {
+ update_pinned_page_dirty_scope(iommu);
+ if (iommu->dirty_page_tracking)
+ vfio_iommu_populate_bitmap_full(iommu);
+ }
mutex_unlock(&iommu->lock);
}
@@ -2156,7 +2445,7 @@
break;
case VFIO_TYPE1_NESTING_IOMMU:
iommu->nesting = true;
- /* fall through */
+ fallthrough;
case VFIO_TYPE1v2_IOMMU:
iommu->v2 = true;
break;
@@ -2232,6 +2521,23 @@
return ret;
}
+static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
+ unsigned long arg)
+{
+ switch (arg) {
+ case VFIO_TYPE1_IOMMU:
+ case VFIO_TYPE1v2_IOMMU:
+ case VFIO_TYPE1_NESTING_IOMMU:
+ return 1;
+ case VFIO_DMA_CC_IOMMU:
+ if (!iommu)
+ return 0;
+ return vfio_domains_have_iommu_cache(iommu);
+ default:
+ return 0;
+ }
+}
+
static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
size_t size)
@@ -2261,8 +2567,6 @@
size_t size;
int iovas = 0, i = 0, ret;
- mutex_lock(&iommu->lock);
-
list_for_each_entry(iova, &iommu->iova_list, list)
iovas++;
@@ -2271,17 +2575,14 @@
* Return 0 as a container with a single mdev device
* will have an empty list
*/
- ret = 0;
- goto out_unlock;
+ return 0;
}
size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
cap_iovas = kzalloc(size, GFP_KERNEL);
- if (!cap_iovas) {
- ret = -ENOMEM;
- goto out_unlock;
- }
+ if (!cap_iovas)
+ return -ENOMEM;
cap_iovas->nr_iovas = iovas;
@@ -2294,140 +2595,292 @@
ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
kfree(cap_iovas);
-out_unlock:
- mutex_unlock(&iommu->lock);
return ret;
}
+static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+ cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+ cap_mig.header.version = 1;
+
+ cap_mig.flags = 0;
+ /* support minimum pgsize */
+ cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+ cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
+
+ return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
struct vfio_info_cap *caps)
{
struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
- int ret;
- mutex_lock(&iommu->lock);
cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
cap_dma_avail.header.version = 1;
cap_dma_avail.avail = iommu->dma_avail;
- ret = vfio_info_add_capability(caps, &cap_dma_avail.header,
- sizeof(cap_dma_avail));
+ return vfio_info_add_capability(caps, &cap_dma_avail.header,
+ sizeof(cap_dma_avail));
+}
+
+static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
+ unsigned long arg)
+{
+ struct vfio_iommu_type1_info info;
+ unsigned long minsz;
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ unsigned long capsz;
+ int ret;
+
+ minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+
+ /* For backward compatibility, cannot require this */
+ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
+ if (copy_from_user(&info, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (info.argsz < minsz)
+ return -EINVAL;
+
+ if (info.argsz >= capsz) {
+ minsz = capsz;
+ info.cap_offset = 0; /* output, no-recopy necessary */
+ }
+
+ mutex_lock(&iommu->lock);
+ info.flags = VFIO_IOMMU_INFO_PGSIZES;
+
+ info.iova_pgsizes = iommu->pgsize_bitmap;
+
+ ret = vfio_iommu_migration_build_caps(iommu, &caps);
+
+ if (!ret)
+ ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
+
+ if (!ret)
+ ret = vfio_iommu_iova_build_caps(iommu, &caps);
+
mutex_unlock(&iommu->lock);
- return ret;
+
+ if (ret)
+ return ret;
+
+ if (caps.size) {
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+
+ kfree(caps.buf);
+ }
+
+ return copy_to_user((void __user *)arg, &info, minsz) ?
+ -EFAULT : 0;
+}
+
+static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
+ unsigned long arg)
+{
+ struct vfio_iommu_type1_dma_map map;
+ unsigned long minsz;
+ uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+ minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+ if (copy_from_user(&map, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (map.argsz < minsz || map.flags & ~mask)
+ return -EINVAL;
+
+ return vfio_dma_do_map(iommu, &map);
+}
+
+static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
+ unsigned long arg)
+{
+ struct vfio_iommu_type1_dma_unmap unmap;
+ struct vfio_bitmap bitmap = { 0 };
+ unsigned long minsz;
+ int ret;
+
+ minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+ if (copy_from_user(&unmap, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (unmap.argsz < minsz ||
+ unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
+ return -EINVAL;
+
+ if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
+ unsigned long pgshift;
+
+ if (unmap.argsz < (minsz + sizeof(bitmap)))
+ return -EINVAL;
+
+ if (copy_from_user(&bitmap,
+ (void __user *)(arg + minsz),
+ sizeof(bitmap)))
+ return -EFAULT;
+
+ if (!access_ok((void __user *)bitmap.data, bitmap.size))
+ return -EINVAL;
+
+ pgshift = __ffs(bitmap.pgsize);
+ ret = verify_bitmap_size(unmap.size >> pgshift,
+ bitmap.size);
+ if (ret)
+ return ret;
+ }
+
+ ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
+ if (ret)
+ return ret;
+
+ return copy_to_user((void __user *)arg, &unmap, minsz) ?
+ -EFAULT : 0;
+}
+
+static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
+ unsigned long arg)
+{
+ struct vfio_iommu_type1_dirty_bitmap dirty;
+ uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
+ VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
+ VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
+ unsigned long minsz;
+ int ret = 0;
+
+ if (!iommu->v2)
+ return -EACCES;
+
+ minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
+
+ if (copy_from_user(&dirty, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (dirty.argsz < minsz || dirty.flags & ~mask)
+ return -EINVAL;
+
+ /* only one flag should be set at a time */
+ if (__ffs(dirty.flags) != __fls(dirty.flags))
+ return -EINVAL;
+
+ if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
+ size_t pgsize;
+
+ mutex_lock(&iommu->lock);
+ pgsize = 1 << __ffs(iommu->pgsize_bitmap);
+ if (!iommu->dirty_page_tracking) {
+ ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
+ if (!ret)
+ iommu->dirty_page_tracking = true;
+ }
+ mutex_unlock(&iommu->lock);
+ return ret;
+ } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
+ mutex_lock(&iommu->lock);
+ if (iommu->dirty_page_tracking) {
+ iommu->dirty_page_tracking = false;
+ vfio_dma_bitmap_free_all(iommu);
+ }
+ mutex_unlock(&iommu->lock);
+ return 0;
+ } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
+ struct vfio_iommu_type1_dirty_bitmap_get range;
+ unsigned long pgshift;
+ size_t data_size = dirty.argsz - minsz;
+ size_t iommu_pgsize;
+
+ if (!data_size || data_size < sizeof(range))
+ return -EINVAL;
+
+ if (copy_from_user(&range, (void __user *)(arg + minsz),
+ sizeof(range)))
+ return -EFAULT;
+
+ if (range.iova + range.size < range.iova)
+ return -EINVAL;
+ if (!access_ok((void __user *)range.bitmap.data,
+ range.bitmap.size))
+ return -EINVAL;
+
+ pgshift = __ffs(range.bitmap.pgsize);
+ ret = verify_bitmap_size(range.size >> pgshift,
+ range.bitmap.size);
+ if (ret)
+ return ret;
+
+ mutex_lock(&iommu->lock);
+
+ iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
+
+ /* allow only smallest supported pgsize */
+ if (range.bitmap.pgsize != iommu_pgsize) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (range.iova & (iommu_pgsize - 1)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ if (!range.size || range.size & (iommu_pgsize - 1)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (iommu->dirty_page_tracking)
+ ret = vfio_iova_dirty_bitmap(range.bitmap.data,
+ iommu, range.iova,
+ range.size,
+ range.bitmap.pgsize);
+ else
+ ret = -EINVAL;
+out_unlock:
+ mutex_unlock(&iommu->lock);
+
+ return ret;
+ }
+
+ return -EINVAL;
}
static long vfio_iommu_type1_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
struct vfio_iommu *iommu = iommu_data;
- unsigned long minsz;
- if (cmd == VFIO_CHECK_EXTENSION) {
- switch (arg) {
- case VFIO_TYPE1_IOMMU:
- case VFIO_TYPE1v2_IOMMU:
- case VFIO_TYPE1_NESTING_IOMMU:
- return 1;
- case VFIO_DMA_CC_IOMMU:
- if (!iommu)
- return 0;
- return vfio_domains_have_iommu_cache(iommu);
- default:
- return 0;
- }
- } else if (cmd == VFIO_IOMMU_GET_INFO) {
- struct vfio_iommu_type1_info info;
- struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
- unsigned long capsz;
- int ret;
-
- minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
-
- /* For backward compatibility, cannot require this */
- capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
-
- if (copy_from_user(&info, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (info.argsz < minsz)
- return -EINVAL;
-
- if (info.argsz >= capsz) {
- minsz = capsz;
- info.cap_offset = 0; /* output, no-recopy necessary */
- }
-
- info.flags = VFIO_IOMMU_INFO_PGSIZES;
-
- info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
-
- ret = vfio_iommu_iova_build_caps(iommu, &caps);
-
- if (!ret)
- ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
-
- if (ret)
- return ret;
-
- if (caps.size) {
- info.flags |= VFIO_IOMMU_INFO_CAPS;
-
- if (info.argsz < sizeof(info) + caps.size) {
- info.argsz = sizeof(info) + caps.size;
- } else {
- vfio_info_cap_shift(&caps, sizeof(info));
- if (copy_to_user((void __user *)arg +
- sizeof(info), caps.buf,
- caps.size)) {
- kfree(caps.buf);
- return -EFAULT;
- }
- info.cap_offset = sizeof(info);
- }
-
- kfree(caps.buf);
- }
-
- return copy_to_user((void __user *)arg, &info, minsz) ?
- -EFAULT : 0;
-
- } else if (cmd == VFIO_IOMMU_MAP_DMA) {
- struct vfio_iommu_type1_dma_map map;
- uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
-
- if (copy_from_user(&map, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (map.argsz < minsz || map.flags & ~mask)
- return -EINVAL;
-
- return vfio_dma_do_map(iommu, &map);
-
- } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
- struct vfio_iommu_type1_dma_unmap unmap;
- long ret;
-
- minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
-
- if (copy_from_user(&unmap, (void __user *)arg, minsz))
- return -EFAULT;
-
- if (unmap.argsz < minsz || unmap.flags)
- return -EINVAL;
-
- ret = vfio_dma_do_unmap(iommu, &unmap);
- if (ret)
- return ret;
-
- return copy_to_user((void __user *)arg, &unmap, minsz) ?
- -EFAULT : 0;
+ switch (cmd) {
+ case VFIO_CHECK_EXTENSION:
+ return vfio_iommu_type1_check_extension(iommu, arg);
+ case VFIO_IOMMU_GET_INFO:
+ return vfio_iommu_type1_get_info(iommu, arg);
+ case VFIO_IOMMU_MAP_DMA:
+ return vfio_iommu_type1_map_dma(iommu, arg);
+ case VFIO_IOMMU_UNMAP_DMA:
+ return vfio_iommu_type1_unmap_dma(iommu, arg);
+ case VFIO_IOMMU_DIRTY_PAGES:
+ return vfio_iommu_type1_dirty_pages(iommu, arg);
+ default:
+ return -ENOTTY;
}
-
- return -ENOTTY;
}
static int vfio_iommu_type1_register_notifier(void *iommu_data,
@@ -2454,6 +2907,90 @@
return blocking_notifier_chain_unregister(&iommu->notifier, nb);
}
+static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
+ dma_addr_t user_iova, void *data,
+ size_t count, bool write,
+ size_t *copied)
+{
+ struct mm_struct *mm;
+ unsigned long vaddr;
+ struct vfio_dma *dma;
+ bool kthread = current->mm == NULL;
+ size_t offset;
+
+ *copied = 0;
+
+ dma = vfio_find_dma(iommu, user_iova, 1);
+ if (!dma)
+ return -EINVAL;
+
+ if ((write && !(dma->prot & IOMMU_WRITE)) ||
+ !(dma->prot & IOMMU_READ))
+ return -EPERM;
+
+ mm = get_task_mm(dma->task);
+
+ if (!mm)
+ return -EPERM;
+
+ if (kthread)
+ kthread_use_mm(mm);
+ else if (current->mm != mm)
+ goto out;
+
+ offset = user_iova - dma->iova;
+
+ if (count > dma->size - offset)
+ count = dma->size - offset;
+
+ vaddr = dma->vaddr + offset;
+
+ if (write) {
+ *copied = copy_to_user((void __user *)vaddr, data,
+ count) ? 0 : count;
+ if (*copied && iommu->dirty_page_tracking) {
+ unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
+ /*
+ * Bitmap populated with the smallest supported page
+ * size
+ */
+ bitmap_set(dma->bitmap, offset >> pgshift,
+ ((offset + *copied - 1) >> pgshift) -
+ (offset >> pgshift) + 1);
+ }
+ } else
+ *copied = copy_from_user(data, (void __user *)vaddr,
+ count) ? 0 : count;
+ if (kthread)
+ kthread_unuse_mm(mm);
+out:
+ mmput(mm);
+ return *copied ? 0 : -EFAULT;
+}
+
+static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
+ void *data, size_t count, bool write)
+{
+ struct vfio_iommu *iommu = iommu_data;
+ int ret = 0;
+ size_t done;
+
+ mutex_lock(&iommu->lock);
+ while (count > 0) {
+ ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
+ count, write, &done);
+ if (ret)
+ break;
+
+ count -= done;
+ data += done;
+ user_iova += done;
+ }
+
+ mutex_unlock(&iommu->lock);
+ return ret;
+}
+
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.name = "vfio-iommu-type1",
.owner = THIS_MODULE,
@@ -2466,6 +3003,7 @@
.unpin_pages = vfio_iommu_type1_unpin_pages,
.register_notifier = vfio_iommu_type1_register_notifier,
.unregister_notifier = vfio_iommu_type1_unregister_notifier,
+ .dma_rw = vfio_iommu_type1_dma_rw,
};
static int __init vfio_iommu_type1_init(void)