Update Linux to v5.10.109

Sourced from [1].

[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz

Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index c691127..4e11077 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -46,3 +46,15 @@
depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU
help
VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
+
+config VFIO_PCI_ZDEV
+ bool "VFIO PCI ZPCI device CLP support"
+ depends on VFIO_PCI && S390
+ default y
+ help
+ Enabling this option exposes VFIO capabilities containing hardware
+ configuration for zPCI devices. This enables userspace (e.g. QEMU)
+ to supply proper configuration values instead of hard-coded defaults
+ for zPCI devices passed through via VFIO on s390.
+
+ Say Y here.
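
For context, a minimal userspace sketch of consuming the capabilities this option exposes: walking the VFIO_DEVICE_GET_INFO capability chain to find a capability by ID. This assumes the UAPI headers from this series (<linux/vfio.h> with the cap_offset field in struct vfio_device_info); find_cap() is an illustrative helper, not part of the patch:

#include <linux/vfio.h>

static struct vfio_info_cap_header *
find_cap(struct vfio_device_info *info, __u16 id)
{
        struct vfio_info_cap_header *hdr;
        __u32 off = info->cap_offset;

        /* No chain present unless the kernel set the CAPS flag. */
        if (!(info->flags & VFIO_DEVICE_FLAGS_CAPS))
                return NULL;

        while (off) {
                hdr = (struct vfio_info_cap_header *)((char *)info + off);
                if (hdr->id == id)
                        return hdr;
                off = hdr->next;
        }

        return NULL;
}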
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index f027f8a..781e080 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -3,5 +3,6 @@
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
+vfio-pci-$(CONFIG_VFIO_PCI_ZDEV) += vfio_pci_zdev.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index b06d762..57ae8b4 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -9,7 +9,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define dev_fmt pr_fmt
#include <linux/device.h>
#include <linux/eventfd.h>
@@ -55,6 +54,16 @@
MODULE_PARM_DESC(disable_idle_d3,
"Disable using the PCI D3 low power state for idle, unused devices");
+static bool enable_sriov;
+#ifdef CONFIG_PCI_IOV
+module_param(enable_sriov, bool, 0644);
+MODULE_PARM_DESC(enable_sriov, "Enable support for SR-IOV configuration. Enabling SR-IOV on a PF typically requires support of the userspace PF driver, enabling VFs without such support may result in non-functional VFs or PF.");
+#endif
+
+static bool disable_denylist;
+module_param(disable_denylist, bool, 0444);
+MODULE_PARM_DESC(disable_denylist, "Disable use of device denylist. Disabling the denylist allows binding to devices with known errata that may lead to exploitable stability or security issues when accessed by untrusted users.");
+
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -64,6 +73,44 @@
#endif
}
+static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
+{
+ switch (pdev->vendor) {
+ case PCI_VENDOR_ID_INTEL:
+ switch (pdev->device) {
+ case PCI_DEVICE_ID_INTEL_QAT_C3XXX:
+ case PCI_DEVICE_ID_INTEL_QAT_C3XXX_VF:
+ case PCI_DEVICE_ID_INTEL_QAT_C62X:
+ case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
+ case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
+ case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ return false;
+}
+
+static bool vfio_pci_is_denylisted(struct pci_dev *pdev)
+{
+ if (!vfio_pci_dev_in_denylist(pdev))
+ return false;
+
+ if (disable_denylist) {
+ pci_warn(pdev,
+ "device denylist disabled - allowing device %04x:%04x.\n",
+ pdev->vendor, pdev->device);
+ return false;
+ }
+
+ pci_warn(pdev, "%04x:%04x exists in vfio-pci device denylist, driver probing disallowed.\n",
+ pdev->vendor, pdev->device);
+
+ return true;
+}
+
/*
* Our VGA arbiter participation is limited since we don't know anything
* about the device itself. However, if the device is the only VGA device
@@ -111,11 +158,13 @@
static void vfio_pci_probe_mmaps(struct vfio_pci_device *vdev)
{
struct resource *res;
- int bar;
+ int i;
struct vfio_pci_dummy_resource *dummy_res;
- for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
- res = vdev->pdev->resource + bar;
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ int bar = i + PCI_STD_RESOURCES;
+
+ res = &vdev->pdev->resource[bar];
if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
goto no_mmap;
@@ -198,6 +247,8 @@
case 0x1580 ... 0x1581:
case 0x1583 ... 0x158b:
case 0x37d0 ... 0x37d2:
+ /* X550 */
+ case 0x1563:
return true;
default:
return false;
@@ -399,7 +450,8 @@
vfio_config_free(vdev);
- for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
+ for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+ bar = i + PCI_STD_RESOURCES;
if (!vdev->barmap[bar])
continue;
pci_iounmap(pdev, vdev->barmap[bar]);
@@ -463,6 +515,44 @@
vfio_pci_set_power_state(vdev, PCI_D3hot);
}
+static struct pci_driver vfio_pci_driver;
+
+static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev,
+ struct vfio_device **pf_dev)
+{
+ struct pci_dev *physfn = pci_physfn(vdev->pdev);
+
+ if (!vdev->pdev->is_virtfn)
+ return NULL;
+
+ *pf_dev = vfio_device_get_from_dev(&physfn->dev);
+ if (!*pf_dev)
+ return NULL;
+
+ if (pci_dev_driver(physfn) != &vfio_pci_driver) {
+ vfio_device_put(*pf_dev);
+ return NULL;
+ }
+
+ return vfio_device_data(*pf_dev);
+}
+
+static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val)
+{
+ struct vfio_device *pf_dev;
+ struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+
+ if (!pf_vdev)
+ return;
+
+ mutex_lock(&pf_vdev->vf_token->lock);
+ pf_vdev->vf_token->users += val;
+ WARN_ON(pf_vdev->vf_token->users < 0);
+ mutex_unlock(&pf_vdev->vf_token->lock);
+
+ vfio_device_put(pf_dev);
+}
+
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
@@ -470,16 +560,15 @@
mutex_lock(&vdev->reflck->lock);
if (!(--vdev->refcnt)) {
+ vfio_pci_vf_token_user_add(vdev, -1);
vfio_spapr_pci_eeh_release(vdev->pdev);
vfio_pci_disable(vdev);
+
mutex_lock(&vdev->igate);
if (vdev->err_trigger) {
eventfd_ctx_put(vdev->err_trigger);
vdev->err_trigger = NULL;
}
- mutex_unlock(&vdev->igate);
-
- mutex_lock(&vdev->igate);
if (vdev->req_trigger) {
eventfd_ctx_put(vdev->req_trigger);
vdev->req_trigger = NULL;
@@ -508,6 +597,7 @@
goto error;
vfio_spapr_pci_eeh_open(vdev->pdev);
+ vfio_pci_vf_token_user_add(vdev, 1);
}
vdev->refcnt++;
error:
@@ -715,15 +805,25 @@
if (cmd == VFIO_DEVICE_GET_INFO) {
struct vfio_device_info info;
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ unsigned long capsz;
minsz = offsetofend(struct vfio_device_info, num_irqs);
+ /* For backward compatibility, cannot require this */
+ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
+ if (info.argsz >= capsz) {
+ minsz = capsz;
+ info.cap_offset = 0;
+ }
+
info.flags = VFIO_DEVICE_FLAGS_PCI;
if (vdev->reset_works)
@@ -732,6 +832,33 @@
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
info.num_irqs = VFIO_PCI_NUM_IRQS;
+ if (IS_ENABLED(CONFIG_VFIO_PCI_ZDEV)) {
+ int ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
+
+ if (ret && ret != -ENODEV) {
+ pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
+ return ret;
+ }
+ }
+
+ if (caps.size) {
+ info.flags |= VFIO_DEVICE_FLAGS_CAPS;
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+
+ kfree(caps.buf);
+ }
+
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
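
The hunk above implements the kernel side of the usual VFIO argsz protocol: when the capability chain does not fit, argsz is bumped in the reply and cap_offset is left zero so userspace can retry with a larger buffer. A hedged userspace sketch of that two-call pattern (get_device_info() is an illustrative helper, error handling trimmed):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

struct vfio_device_info *get_device_info(int device_fd)
{
        struct vfio_device_info info = { .argsz = sizeof(info) };
        struct vfio_device_info *buf;

        if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info))
                return NULL;

        /* The kernel bumped argsz if the capabilities did not fit. */
        buf = calloc(1, info.argsz);
        if (!buf)
                return NULL;

        buf->argsz = info.argsz;
        if (ioctl(device_fd, VFIO_DEVICE_GET_INFO, buf)) {
                free(buf);
                return NULL;
        }

        return buf;
}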
@@ -898,7 +1025,7 @@
case VFIO_PCI_ERR_IRQ_INDEX:
if (pci_is_pcie(vdev->pdev))
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
@@ -1144,7 +1271,7 @@
/*
* We need to get memory_lock for each device, but devices
- * can share mmap_sem, therefore we need to zap and hold
+ * can share mmap_lock, therefore we need to zap and hold
* the vma_lock for each device, and only then get each
* memory_lock.
*/
@@ -1213,6 +1340,65 @@
return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
ioeventfd.data, count, ioeventfd.fd);
+ } else if (cmd == VFIO_DEVICE_FEATURE) {
+ struct vfio_device_feature feature;
+ uuid_t uuid;
+
+ minsz = offsetofend(struct vfio_device_feature, flags);
+
+ if (copy_from_user(&feature, (void __user *)arg, minsz))
+ return -EFAULT;
+
+ if (feature.argsz < minsz)
+ return -EINVAL;
+
+ /* Check unknown flags */
+ if (feature.flags & ~(VFIO_DEVICE_FEATURE_MASK |
+ VFIO_DEVICE_FEATURE_SET |
+ VFIO_DEVICE_FEATURE_GET |
+ VFIO_DEVICE_FEATURE_PROBE))
+ return -EINVAL;
+
+ /* GET & SET are mutually exclusive except with PROBE */
+ if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
+ (feature.flags & VFIO_DEVICE_FEATURE_GET))
+ return -EINVAL;
+
+ switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
+ case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
+ if (!vdev->vf_token)
+ return -ENOTTY;
+
+ /*
+ * We do not support GET of the VF Token UUID as this
+ * could expose the token of the previous device user.
+ */
+ if (feature.flags & VFIO_DEVICE_FEATURE_GET)
+ return -EINVAL;
+
+ if (feature.flags & VFIO_DEVICE_FEATURE_PROBE)
+ return 0;
+
+ /* Don't SET unless told to do so */
+ if (!(feature.flags & VFIO_DEVICE_FEATURE_SET))
+ return -EINVAL;
+
+ if (feature.argsz < minsz + sizeof(uuid))
+ return -EINVAL;
+
+ if (copy_from_user(&uuid, (void __user *)(arg + minsz),
+ sizeof(uuid)))
+ return -EFAULT;
+
+ mutex_lock(&vdev->vf_token->lock);
+ uuid_copy(&vdev->vf_token->uuid, &uuid);
+ mutex_unlock(&vdev->vf_token->lock);
+
+ return 0;
+ default:
+ return -ENOTTY;
+ }
}
return -ENOTTY;
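
A hypothetical userspace counterpart to the VFIO_DEVICE_FEATURE branch above, setting the VF token on a vfio-pci PF: the 16 raw UUID bytes follow the header in the same buffer, matching the copy_from_user(arg + minsz) in the hunk. set_vf_token() is an illustrative name:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int set_vf_token(int device_fd, const unsigned char uuid[16])
{
        char buf[sizeof(struct vfio_device_feature) + 16];
        struct vfio_device_feature *feature = (void *)buf;

        feature->argsz = sizeof(buf);
        feature->flags = VFIO_DEVICE_FEATURE_SET |
                         VFIO_DEVICE_FEATURE_PCI_VF_TOKEN;
        memcpy(feature->data, uuid, 16);

        return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

Support can first be tested by issuing the same ioctl with VFIO_DEVICE_FEATURE_PROBE set, per the flag handling above.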
@@ -1275,26 +1461,26 @@
/*
* Lock ordering:
- * vma_lock is nested under mmap_sem for vm_ops callback paths.
+ * vma_lock is nested under mmap_lock for vm_ops callback paths.
* The memory_lock semaphore is used by both code paths calling
* into this function to zap vmas and the vm_ops.fault callback
* to protect the memory enable state of the device.
*
- * When zapping vmas we need to maintain the mmap_sem => vma_lock
+ * When zapping vmas we need to maintain the mmap_lock => vma_lock
* ordering, which requires using vma_lock to walk vma_list to
- * acquire an mm, then dropping vma_lock to get the mmap_sem and
+ * acquire an mm, then dropping vma_lock to get the mmap_lock and
* reacquiring vma_lock. This logic is derived from similar
* requirements in uverbs_user_mmap_disassociate().
*
- * mmap_sem must always be the top-level lock when it is taken.
+ * mmap_lock must always be the top-level lock when it is taken.
* Therefore we can only hold the memory_lock write lock when
- * vma_list is empty, as we'd need to take mmap_sem to clear
+ * vma_list is empty, as we'd need to take mmap_lock to clear
* entries. vma_list can only be guaranteed empty when holding
* vma_lock, thus memory_lock is nested under vma_lock.
*
* This enables the vm_ops.fault callback to acquire vma_lock,
* followed by memory_lock read lock, while already holding
- * mmap_sem without risk of deadlock.
+ * mmap_lock without risk of deadlock.
*/
while (1) {
struct mm_struct *mm = NULL;
@@ -1322,39 +1508,37 @@
mutex_unlock(&vdev->vma_lock);
if (try) {
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!mmap_read_trylock(mm)) {
mmput(mm);
return 0;
}
} else {
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
- if (mmget_still_valid(mm)) {
- if (try) {
- if (!mutex_trylock(&vdev->vma_lock)) {
- up_read(&mm->mmap_sem);
- mmput(mm);
- return 0;
- }
- } else {
- mutex_lock(&vdev->vma_lock);
+ if (try) {
+ if (!mutex_trylock(&vdev->vma_lock)) {
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
}
- list_for_each_entry_safe(mmap_vma, tmp,
- &vdev->vma_list, vma_next) {
- struct vm_area_struct *vma = mmap_vma->vma;
-
- if (vma->vm_mm != mm)
- continue;
-
- list_del(&mmap_vma->vma_next);
- kfree(mmap_vma);
-
- zap_vma_ptes(vma, vma->vm_start,
- vma->vm_end - vma->vm_start);
- }
- mutex_unlock(&vdev->vma_lock);
+ } else {
+ mutex_lock(&vdev->vma_lock);
}
- up_read(&mm->mmap_sem);
+ list_for_each_entry_safe(mmap_vma, tmp,
+ &vdev->vma_list, vma_next) {
+ struct vm_area_struct *vma = mmap_vma->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ }
+ mutex_unlock(&vdev->vma_lock);
+ mmap_read_unlock(mm);
mmput(mm);
}
}
@@ -1568,6 +1752,150 @@
mutex_unlock(&vdev->igate);
}
+static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev,
+ bool vf_token, uuid_t *uuid)
+{
+ /*
+ * There's always some degree of trust or collaboration between SR-IOV
+ * PF and VFs, even if just that the PF hosts the SR-IOV capability and
+ * can disrupt VFs with a reset, but often the PF has more explicit
+ * access to deny service to the VF or access data passed through the
+ * VF. We therefore require an opt-in via a shared VF token (UUID) to
+ * represent this trust. This both prevents that a VF driver might
+ * assume the PF driver is a trusted, in-kernel driver, and also that
+ * a PF driver might be replaced with a rogue driver, unknown to in-use
+ * VF drivers.
+ *
+ * Therefore when presented with a VF, if the PF is a vfio device and
+ * it is bound to the vfio-pci driver, the user needs to provide a VF
+ * token to access the device, in the form of appending a vf_token to
+ * the device name, for example:
+ *
+ * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
+ *
+ * When presented with a PF which has VFs in use, the user must also
+ * provide the current VF token to prove collaboration with existing
+ * VF users. If VFs are not in use, the VF token provided for the PF
+ * device will act to set the VF token.
+ *
+ * If the VF token is provided but unused, an error is generated.
+ */
+ if (!vdev->pdev->is_virtfn && !vdev->vf_token && !vf_token)
+ return 0; /* No VF token provided or required */
+
+ if (vdev->pdev->is_virtfn) {
+ struct vfio_device *pf_dev;
+ struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev);
+ bool match;
+
+ if (!pf_vdev) {
+ if (!vf_token)
+ return 0; /* PF is not vfio-pci, no VF token */
+
+ pci_info_ratelimited(vdev->pdev,
+ "VF token incorrectly provided, PF not bound to vfio-pci\n");
+ return -EINVAL;
+ }
+
+ if (!vf_token) {
+ vfio_device_put(pf_dev);
+ pci_info_ratelimited(vdev->pdev,
+ "VF token required to access device\n");
+ return -EACCES;
+ }
+
+ mutex_lock(&pf_vdev->vf_token->lock);
+ match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
+ mutex_unlock(&pf_vdev->vf_token->lock);
+
+ vfio_device_put(pf_dev);
+
+ if (!match) {
+ pci_info_ratelimited(vdev->pdev,
+ "Incorrect VF token provided for device\n");
+ return -EACCES;
+ }
+ } else if (vdev->vf_token) {
+ mutex_lock(&vdev->vf_token->lock);
+ if (vdev->vf_token->users) {
+ if (!vf_token) {
+ mutex_unlock(&vdev->vf_token->lock);
+ pci_info_ratelimited(vdev->pdev,
+ "VF token required to access device\n");
+ return -EACCES;
+ }
+
+ if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
+ mutex_unlock(&vdev->vf_token->lock);
+ pci_info_ratelimited(vdev->pdev,
+ "Incorrect VF token provided for device\n");
+ return -EACCES;
+ }
+ } else if (vf_token) {
+ uuid_copy(&vdev->vf_token->uuid, uuid);
+ }
+
+ mutex_unlock(&vdev->vf_token->lock);
+ } else if (vf_token) {
+ pci_info_ratelimited(vdev->pdev,
+ "VF token incorrectly provided, not a PF or VF\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+#define VF_TOKEN_ARG "vf_token="
+
+static int vfio_pci_match(void *device_data, char *buf)
+{
+ struct vfio_pci_device *vdev = device_data;
+ bool vf_token = false;
+ uuid_t uuid;
+ int ret;
+
+ if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
+ return 0; /* No match */
+
+ if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
+ buf += strlen(pci_name(vdev->pdev));
+
+ if (*buf != ' ')
+ return 0; /* No match: non-whitespace after name */
+
+ while (*buf) {
+ if (*buf == ' ') {
+ buf++;
+ continue;
+ }
+
+ if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
+ strlen(VF_TOKEN_ARG))) {
+ buf += strlen(VF_TOKEN_ARG);
+
+ if (strlen(buf) < UUID_STRING_LEN)
+ return -EINVAL;
+
+ ret = uuid_parse(buf, &uuid);
+ if (ret)
+ return ret;
+
+ vf_token = true;
+ buf += UUID_STRING_LEN;
+ } else {
+ /* Unknown/duplicate option */
+ return -EINVAL;
+ }
+ }
+ }
+
+ ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+ if (ret)
+ return ret;
+
+ return 1; /* Match */
+}
+
static const struct vfio_device_ops vfio_pci_ops = {
.name = "vfio-pci",
.open = vfio_pci_open,
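
For reference, the match string vfio_pci_match() parses is built by userspace when requesting the device fd; a minimal sketch (open_device() is illustrative, the BDF and UUID string are caller-supplied):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int open_device(int group_fd, const char *bdf, const char *uuid_str)
{
        char name[128];

        snprintf(name, sizeof(name), "%s vf_token=%s", bdf, uuid_str);
        return ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, name);
}

e.g. open_device(group_fd, "0000:04:10.0", "bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3") matches the example given in the vfio_pci_validate_vf_token() comment above.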
@@ -1577,27 +1905,121 @@
.write = vfio_pci_write,
.mmap = vfio_pci_mmap,
.request = vfio_pci_request,
+ .match = vfio_pci_match,
};
static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+static int vfio_pci_bus_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct vfio_pci_device *vdev = container_of(nb,
+ struct vfio_pci_device, nb);
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct pci_dev *physfn = pci_physfn(pdev);
+
+ if (action == BUS_NOTIFY_ADD_DEVICE &&
+ pdev->is_virtfn && physfn == vdev->pdev) {
+ pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
+ pci_name(pdev));
+ pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
+ vfio_pci_ops.name);
+ } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
+ pdev->is_virtfn && physfn == vdev->pdev) {
+ struct pci_driver *drv = pci_dev_driver(pdev);
+
+ if (drv && drv != &vfio_pci_driver)
+ pci_warn(vdev->pdev,
+ "VF %s bound to driver %s while PF bound to vfio-pci\n",
+ pci_name(pdev), drv->name);
+ }
+
+ return 0;
+}
+
+static int vfio_pci_vf_init(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (!pdev->is_physfn)
+ return 0;
+
+ vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
+ if (!vdev->vf_token)
+ return -ENOMEM;
+
+ mutex_init(&vdev->vf_token->lock);
+ uuid_gen(&vdev->vf_token->uuid);
+
+ vdev->nb.notifier_call = vfio_pci_bus_notifier;
+ ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
+ if (ret) {
+ kfree(vdev->vf_token);
+ return ret;
+ }
+ return 0;
+}
+
+static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev)
+{
+ if (!vdev->vf_token)
+ return;
+
+ bus_unregister_notifier(&pci_bus_type, &vdev->nb);
+ WARN_ON(vdev->vf_token->users);
+ mutex_destroy(&vdev->vf_token->lock);
+ kfree(vdev->vf_token);
+}
+
+static int vfio_pci_vga_init(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ int ret;
+
+ if (!vfio_pci_is_vga(pdev))
+ return 0;
+
+ ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
+ if (ret)
+ return ret;
+ vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false));
+ return 0;
+}
+
+static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+
+ if (!vfio_pci_is_vga(pdev))
+ return;
+ vga_client_register(pdev, NULL, NULL, NULL);
+ vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
+ VGA_RSRC_LEGACY_IO |
+ VGA_RSRC_LEGACY_MEM);
+}
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_device *vdev;
struct iommu_group *group;
int ret;
+ if (vfio_pci_is_denylisted(pdev))
+ return -EINVAL;
+
if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
return -EINVAL;
/*
- * Prevent binding to PFs with VFs enabled, this too easily allows
- * userspace instance with VFs and PFs from the same device, which
- * cannot work. Disabling SR-IOV here would initiate removing the
- * VFs, which would unbind the driver, which is prone to blocking
- * if that VF is also in use by vfio-pci. Just reject these PFs
- * and let the user sort it out.
+ * Prevent binding to PFs with VFs enabled, the VFs might be in use
+ * by the host or other users. We cannot capture the VFs if they
+ * already exist, nor can we track VF users. Disabling SR-IOV here
+ * would initiate removing the VFs, which would unbind the driver,
+ * which is prone to blocking if that VF is also in use by vfio-pci.
+ * Just reject these PFs and let the user sort it out.
*/
if (pci_num_vf(pdev)) {
pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
@@ -1610,8 +2032,8 @@
vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
if (!vdev) {
- vfio_iommu_group_put(group, &pdev->dev);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_group_put;
}
vdev->pdev = pdev;
@@ -1625,26 +2047,15 @@
INIT_LIST_HEAD(&vdev->vma_list);
init_rwsem(&vdev->memory_lock);
- ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
- if (ret) {
- vfio_iommu_group_put(group, &pdev->dev);
- kfree(vdev);
- return ret;
- }
-
ret = vfio_pci_reflck_attach(vdev);
- if (ret) {
- vfio_del_group_dev(&pdev->dev);
- vfio_iommu_group_put(group, &pdev->dev);
- kfree(vdev);
- return ret;
- }
-
- if (vfio_pci_is_vga(pdev)) {
- vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
- vga_set_legacy_decoding(pdev,
- vfio_pci_set_vga_decode(vdev, false));
- }
+ if (ret)
+ goto out_free;
+ ret = vfio_pci_vf_init(vdev);
+ if (ret)
+ goto out_reflck;
+ ret = vfio_pci_vga_init(vdev);
+ if (ret)
+ goto out_vf;
vfio_pci_probe_power_state(vdev);
@@ -1662,6 +2073,23 @@
vfio_pci_set_power_state(vdev, PCI_D3hot);
}
+ ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
+ if (ret)
+ goto out_power;
+ return 0;
+
+out_power:
+ if (!disable_idle_d3)
+ vfio_pci_set_power_state(vdev, PCI_D0);
+out_vf:
+ vfio_pci_vf_uninit(vdev);
+out_reflck:
+ vfio_pci_reflck_put(vdev->reflck);
+out_free:
+ kfree(vdev->pm_save);
+ kfree(vdev);
+out_group_put:
+ vfio_iommu_group_put(group, &pdev->dev);
return ret;
}
@@ -1669,28 +2097,25 @@
{
struct vfio_pci_device *vdev;
+ pci_disable_sriov(pdev);
+
vdev = vfio_del_group_dev(&pdev->dev);
if (!vdev)
return;
+ vfio_pci_vf_uninit(vdev);
vfio_pci_reflck_put(vdev->reflck);
+ vfio_pci_vga_uninit(vdev);
vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
- kfree(vdev->region);
- mutex_destroy(&vdev->ioeventfds_lock);
if (!disable_idle_d3)
vfio_pci_set_power_state(vdev, PCI_D0);
+ mutex_destroy(&vdev->ioeventfds_lock);
+ kfree(vdev->region);
kfree(vdev->pm_save);
kfree(vdev);
-
- if (vfio_pci_is_vga(pdev)) {
- vga_client_register(pdev, NULL, NULL, NULL);
- vga_set_legacy_decoding(pdev,
- VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
- VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
- }
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -1721,16 +2146,48 @@
return PCI_ERS_RESULT_CAN_RECOVER;
}
+static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
+{
+ struct vfio_pci_device *vdev;
+ struct vfio_device *device;
+ int ret = 0;
+
+ might_sleep();
+
+ if (!enable_sriov)
+ return -ENOENT;
+
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (!device)
+ return -ENODEV;
+
+ vdev = vfio_device_data(device);
+ if (!vdev) {
+ vfio_device_put(device);
+ return -ENODEV;
+ }
+
+ if (nr_virtfn == 0)
+ pci_disable_sriov(pdev);
+ else
+ ret = pci_enable_sriov(pdev, nr_virtfn);
+
+ vfio_device_put(device);
+
+ return ret < 0 ? ret : nr_virtfn;
+}
+
static const struct pci_error_handlers vfio_err_handlers = {
.error_detected = vfio_pci_aer_err_detected,
};
static struct pci_driver vfio_pci_driver = {
- .name = "vfio-pci",
- .id_table = NULL, /* only dynamic ids */
- .probe = vfio_pci_probe,
- .remove = vfio_pci_remove,
- .err_handler = &vfio_err_handlers,
+ .name = "vfio-pci",
+ .id_table = NULL, /* only dynamic ids */
+ .probe = vfio_pci_probe,
+ .remove = vfio_pci_remove,
+ .sriov_configure = vfio_pci_sriov_configure,
+ .err_handler = &vfio_err_handlers,
};
static DEFINE_MUTEX(reflck_lock);
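
With the module loaded with enable_sriov=1, VF creation goes through the standard PCI sysfs interface, which lands in the vfio_pci_sriov_configure() hook above. An illustrative sketch (create_vfs() is not an existing helper, error handling trimmed):

#include <stdio.h>

int create_vfs(const char *pf_bdf, int nr_vfs)
{
        char path[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/bus/pci/devices/%s/sriov_numvfs", pf_bdf);
        f = fopen(path, "w");
        if (!f)
                return -1;

        fprintf(f, "%d\n", nr_vfs);
        return fclose(f);
}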
@@ -2014,6 +2471,9 @@
vfio_pci_fill_ids();
+ if (disable_denylist)
+ pr_warn("device denylist disabled.\n");
+
return 0;
out_driver:
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 50cd17f..47f21a6 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -464,30 +464,35 @@
{
struct pci_dev *pdev = vdev->pdev;
int i;
- __le32 *bar;
+ __le32 *vbar;
u64 mask;
- bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+ if (!vdev->bardirty)
+ return;
- for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) {
- if (!pci_resource_start(pdev, i)) {
- *bar = 0; /* Unmapped by host = unimplemented to user */
+ vbar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0];
+
+ for (i = 0; i < PCI_STD_NUM_BARS; i++, vbar++) {
+ int bar = i + PCI_STD_RESOURCES;
+
+ if (!pci_resource_start(pdev, bar)) {
+ *vbar = 0; /* Unmapped by host = unimplemented to user */
continue;
}
- mask = ~(pci_resource_len(pdev, i) - 1);
+ mask = ~(pci_resource_len(pdev, bar) - 1);
- *bar &= cpu_to_le32((u32)mask);
- *bar |= vfio_generate_bar_flags(pdev, i);
+ *vbar &= cpu_to_le32((u32)mask);
+ *vbar |= vfio_generate_bar_flags(pdev, bar);
- if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
- bar++;
- *bar &= cpu_to_le32((u32)(mask >> 32));
+ if (*vbar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) {
+ vbar++;
+ *vbar &= cpu_to_le32((u32)(mask >> 32));
i++;
}
}
- bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
+ vbar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS];
/*
* NB. REGION_INFO will have reported zero size if we weren't able
@@ -497,14 +502,14 @@
if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
- *bar &= cpu_to_le32((u32)mask);
+ *vbar &= cpu_to_le32((u32)mask);
} else if (pdev->resource[PCI_ROM_RESOURCE].flags &
IORESOURCE_ROM_SHADOW) {
mask = ~(0x20000 - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
- *bar &= cpu_to_le32((u32)mask);
+ *vbar &= cpu_to_le32((u32)mask);
} else
- *bar = 0;
+ *vbar = 0;
vdev->bardirty = false;
}
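
The mask arithmetic above mirrors BAR sizing on real hardware: a region of power-of-two length hardwires its low address bits to zero, so the virtualized BAR advertises its size via ~(len - 1). A minimal sketch of recovering the length on the consumer side (illustrative helper; pass ~0xf of flag bits for memory BARs):

#include <stdint.h>

static uint64_t bar_len_from_sized(uint32_t bar, uint32_t flag_bits)
{
        uint32_t mask = bar & ~flag_bits;

        /* e.g. sized readback 0xfffff000 -> length 0x1000 */
        return mask ? (uint64_t)(~mask) + 1 : 0;
}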
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
index 08f1783..9adcf6a 100644
--- a/drivers/vfio/pci/vfio_pci_nvlink2.c
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -67,7 +67,7 @@
*
* This is not fast path anyway.
*/
- sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+ sizealigned = ALIGN(posoff + count, PAGE_SIZE);
ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
if (!ptr)
return -EFAULT;
@@ -161,7 +161,7 @@
data->useraddr = vma->vm_start;
data->mm = current->mm;
- atomic_inc(&data->mm->mm_count);
+ mmgrab(data->mm);
ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
vma_pages(vma), data->gpu_hpa, &data->mem);
@@ -425,8 +425,14 @@
if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
&mmio_atsd)) {
- dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
- mmio_atsd = 0;
+ if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0,
+ &mmio_atsd)) {
+ dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+ mmio_atsd = 0;
+ } else {
+ dev_warn(&vdev->pdev->dev,
+ "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
+ }
}
if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 987b4d3..5c90e56 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -12,6 +12,8 @@
#include <linux/pci.h>
#include <linux/irqbypass.h>
#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/notifier.h>
#ifndef VFIO_PCI_PRIVATE_H
#define VFIO_PCI_PRIVATE_H
@@ -31,12 +33,14 @@
struct vfio_pci_ioeventfd {
struct list_head next;
+ struct vfio_pci_device *vdev;
struct virqfd *virqfd;
void __iomem *addr;
uint64_t data;
loff_t pos;
int bar;
int count;
+ bool test_mem;
};
struct vfio_pci_irq_ctx {
@@ -84,6 +88,12 @@
struct mutex lock;
};
+struct vfio_pci_vf_token {
+ struct mutex lock;
+ uuid_t uuid;
+ int users;
+};
+
struct vfio_pci_mmap_vma {
struct vm_area_struct *vma;
struct list_head vma_next;
@@ -91,8 +101,8 @@
struct vfio_pci_device {
struct pci_dev *pdev;
- void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
- bool bar_mmap_supported[PCI_STD_RESOURCE_END + 1];
+ void __iomem *barmap[PCI_STD_NUM_BARS];
+ bool bar_mmap_supported[PCI_STD_NUM_BARS];
u8 *pci_config_map;
u8 *vconfig;
struct perm_bits *msi_perm;
@@ -127,6 +137,8 @@
struct list_head dummy_resources_list;
struct mutex ioeventfds_lock;
struct list_head ioeventfds_list;
+ struct vfio_pci_vf_token *vf_token;
+ struct notifier_block nb;
struct mutex vma_lock;
struct list_head vma_list;
struct rw_semaphore memory_lock;
@@ -201,4 +213,16 @@
return -ENODEV;
}
#endif
+
+#ifdef CONFIG_VFIO_PCI_ZDEV
+extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps);
+#else
+static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ return -ENODEV;
+}
+#endif
+
#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 83f81d2..a0b5fc8 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -37,17 +37,70 @@
#define vfio_ioread8 ioread8
#define vfio_iowrite8 iowrite8
+#define VFIO_IOWRITE(size) \
+static int vfio_pci_iowrite##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ vfio_iowrite##size(val, io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOWRITE(8)
+VFIO_IOWRITE(16)
+VFIO_IOWRITE(32)
+#ifdef iowrite64
+VFIO_IOWRITE(64)
+#endif
+
+#define VFIO_IOREAD(size) \
+static int vfio_pci_ioread##size(struct vfio_pci_device *vdev, \
+ bool test_mem, u##size *val, void __iomem *io) \
+{ \
+ if (test_mem) { \
+ down_read(&vdev->memory_lock); \
+ if (!__vfio_pci_memory_enabled(vdev)) { \
+ up_read(&vdev->memory_lock); \
+ return -EIO; \
+ } \
+ } \
+ \
+ *val = vfio_ioread##size(io); \
+ \
+ if (test_mem) \
+ up_read(&vdev->memory_lock); \
+ \
+ return 0; \
+}
+
+VFIO_IOREAD(8)
+VFIO_IOREAD(16)
+VFIO_IOREAD(32)
+
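
For readability, VFIO_IOWRITE(32) above expands to roughly the helper below (modulo the vfio_iowrite32 alias defined at the top of the file). The point of the test_mem path is to refuse MMIO access while the command-register memory enable bit is clear, which on some platforms can otherwise raise fatal errors:

static int vfio_pci_iowrite32(struct vfio_pci_device *vdev,
                              bool test_mem, u32 val, void __iomem *io)
{
        if (test_mem) {
                down_read(&vdev->memory_lock);
                if (!__vfio_pci_memory_enabled(vdev)) {
                        up_read(&vdev->memory_lock);
                        return -EIO;
                }
        }

        iowrite32(val, io);

        if (test_mem)
                up_read(&vdev->memory_lock);

        return 0;
}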
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
* range which is inaccessible. The excluded range drops writes and fills
* reads with -1. This is intended for handling MSI-X vector tables and
* leftover space for ROM BARs.
*/
-static ssize_t do_io_rw(void __iomem *io, char __user *buf,
+static ssize_t do_io_rw(struct vfio_pci_device *vdev, bool test_mem,
+ void __iomem *io, char __user *buf,
loff_t off, size_t count, size_t x_start,
size_t x_end, bool iswrite)
{
ssize_t done = 0;
+ int ret;
while (count) {
size_t fillable, filled;
@@ -66,9 +119,15 @@
if (copy_from_user(&val, buf, 4))
return -EFAULT;
- vfio_iowrite32(val, io + off);
+ ret = vfio_pci_iowrite32(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread32(io + off);
+ ret = vfio_pci_ioread32(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 4))
return -EFAULT;
@@ -82,9 +141,15 @@
if (copy_from_user(&val, buf, 2))
return -EFAULT;
- vfio_iowrite16(val, io + off);
+ ret = vfio_pci_iowrite16(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread16(io + off);
+ ret = vfio_pci_ioread16(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 2))
return -EFAULT;
@@ -98,9 +163,15 @@
if (copy_from_user(&val, buf, 1))
return -EFAULT;
- vfio_iowrite8(val, io + off);
+ ret = vfio_pci_iowrite8(vdev, test_mem,
+ val, io + off);
+ if (ret)
+ return ret;
} else {
- val = vfio_ioread8(io + off);
+ ret = vfio_pci_ioread8(vdev, test_mem,
+ &val, io + off);
+ if (ret)
+ return ret;
if (copy_to_user(buf, &val, 1))
return -EFAULT;
@@ -178,14 +249,6 @@
count = min(count, (size_t)(end - pos));
- if (res->flags & IORESOURCE_MEM) {
- down_read(&vdev->memory_lock);
- if (!__vfio_pci_memory_enabled(vdev)) {
- up_read(&vdev->memory_lock);
- return -EIO;
- }
- }
-
if (bar == PCI_ROM_RESOURCE) {
/*
* The ROM can fill less space than the BAR, so we start the
@@ -213,7 +276,8 @@
x_end = vdev->msix_offset + vdev->msix_size;
}
- done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite);
+ done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+ count, x_start, x_end, iswrite);
if (done >= 0)
*ppos += done;
@@ -221,9 +285,6 @@
if (bar == PCI_ROM_RESOURCE)
pci_unmap_rom(pdev, io);
out:
- if (res->flags & IORESOURCE_MEM)
- up_read(&vdev->memory_lock);
-
return done;
}
@@ -246,7 +307,7 @@
switch ((u32)pos) {
case 0xa0000 ... 0xbffff:
count = min(count, (size_t)(0xc0000 - pos));
- iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1);
+ iomem = ioremap(0xa0000, 0xbffff - 0xa0000 + 1);
off = pos - 0xa0000;
rsrc = VGA_RSRC_LEGACY_MEM;
is_ioport = false;
@@ -278,7 +339,12 @@
return ret;
}
- done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite);
+ /*
+ * VGA MMIO is a legacy, non-BAR resource that hopefully allows
+ * probing, so we don't currently worry about access in relation
+ * to the memory enable bit in the command register.
+ */
+ done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
vga_put(vdev->pdev, rsrc);
@@ -290,30 +356,60 @@
return done;
}
-static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
+ bool test_mem)
{
- struct vfio_pci_ioeventfd *ioeventfd = opaque;
-
switch (ioeventfd->count) {
case 1:
- vfio_iowrite8(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite8(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 2:
- vfio_iowrite16(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite16(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
case 4:
- vfio_iowrite32(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite32(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#ifdef iowrite64
case 8:
- vfio_iowrite64(ioeventfd->data, ioeventfd->addr);
+ vfio_pci_iowrite64(ioeventfd->vdev, test_mem,
+ ioeventfd->data, ioeventfd->addr);
break;
#endif
}
+}
+
+static int vfio_pci_ioeventfd_handler(void *opaque, void *unused)
+{
+ struct vfio_pci_ioeventfd *ioeventfd = opaque;
+ struct vfio_pci_device *vdev = ioeventfd->vdev;
+
+ if (ioeventfd->test_mem) {
+ if (!down_read_trylock(&vdev->memory_lock))
+ return 1; /* Lock contended, use thread */
+ if (!__vfio_pci_memory_enabled(vdev)) {
+ up_read(&vdev->memory_lock);
+ return 0;
+ }
+ }
+
+ vfio_pci_ioeventfd_do_write(ioeventfd, false);
+
+ if (ioeventfd->test_mem)
+ up_read(&vdev->memory_lock);
return 0;
}
+static void vfio_pci_ioeventfd_thread(void *opaque, void *unused)
+{
+ struct vfio_pci_ioeventfd *ioeventfd = opaque;
+
+ vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem);
+}
+
long vfio_pci_ioeventfd(struct vfio_pci_device *vdev, loff_t offset,
uint64_t data, int count, int fd)
{
@@ -378,14 +474,17 @@
goto out_unlock;
}
+ ioeventfd->vdev = vdev;
ioeventfd->addr = vdev->barmap[bar] + pos;
ioeventfd->data = data;
ioeventfd->pos = pos;
ioeventfd->bar = bar;
ioeventfd->count = count;
+ ioeventfd->test_mem = vdev->pdev->resource[bar].flags & IORESOURCE_MEM;
ret = vfio_virqfd_enable(ioeventfd, vfio_pci_ioeventfd_handler,
- NULL, NULL, &ioeventfd->virqfd, fd);
+ vfio_pci_ioeventfd_thread, NULL,
+ &ioeventfd->virqfd, fd);
if (ret) {
kfree(ioeventfd);
goto out_unlock;
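
A hypothetical userspace sketch of arming this path: register an eventfd whose signal triggers a fixed 4-byte BAR write in the kernel, avoiding a trap through the VMM on each doorbell. bar_region_offset must be computed from the region-index offset encoding this driver uses; setup_doorbell() and the doorbell value are illustrative:

#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int setup_doorbell(int device_fd, __u64 bar_region_offset, __u64 doorbell_val)
{
        struct vfio_device_ioeventfd ioeventfd = {
                .argsz  = sizeof(ioeventfd),
                .flags  = VFIO_DEVICE_IOEVENTFD_32,
                .offset = bar_region_offset,    /* region offset + register */
                .data   = doorbell_val,
                .fd     = eventfd(0, EFD_CLOEXEC),
        };

        if (ioeventfd.fd < 0)
                return -1;

        return ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ioeventfd) ?
               -1 : ioeventfd.fd;
}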
diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c
new file mode 100644
index 0000000..1bb7eda
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_zdev.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VFIO ZPCI devices support
+ *
+ * Copyright (C) IBM Corp. 2020. All rights reserved.
+ * Author(s): Pierre Morel <pmorel@linux.ibm.com>
+ * Matthew Rosato <mjrosato@linux.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/vfio_zdev.h>
+#include <asm/pci_clp.h>
+#include <asm/pci_io.h>
+
+#include "vfio_pci_private.h"
+
+/*
+ * Add the Base PCI Function information to the device info region.
+ */
+static int zpci_base_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_base cap = {
+ .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_BASE,
+ .header.version = 1,
+ .start_dma = zdev->start_dma,
+ .end_dma = zdev->end_dma,
+ .pchid = zdev->pchid,
+ .vfn = zdev->vfn,
+ .fmb_length = zdev->fmb_length,
+ .pft = zdev->pft,
+ .gid = zdev->pfgid
+ };
+
+ return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the Base PCI Function Group information to the device info region.
+ */
+static int zpci_group_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_group cap = {
+ .header.id = VFIO_DEVICE_INFO_CAP_ZPCI_GROUP,
+ .header.version = 1,
+ .dasm = zdev->dma_mask,
+ .msi_addr = zdev->msi_addr,
+ .flags = VFIO_DEVICE_INFO_ZPCI_FLAG_REFRESH,
+ .mui = zdev->fmb_update,
+ .noi = zdev->max_msi,
+ .maxstbl = ZPCI_MAX_WRITE_SIZE,
+ .version = zdev->version
+ };
+
+ return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+/*
+ * Add the device utility string to the device info region.
+ */
+static int zpci_util_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_util *cap;
+ int cap_size = sizeof(*cap) + CLP_UTIL_STR_LEN;
+ int ret;
+
+ cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
+
+ cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_UTIL;
+ cap->header.version = 1;
+ cap->size = CLP_UTIL_STR_LEN;
+ memcpy(cap->util_str, zdev->util_str, cap->size);
+
+ ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+ kfree(cap);
+
+ return ret;
+}
+
+/*
+ * Add the function path string to the device info region.
+ */
+static int zpci_pfip_cap(struct zpci_dev *zdev, struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_device_info_cap_zpci_pfip *cap;
+ int cap_size = sizeof(*cap) + CLP_PFIP_NR_SEGMENTS;
+ int ret;
+
+ cap = kmalloc(cap_size, GFP_KERNEL);
+ if (!cap)
+ return -ENOMEM;
+
+ cap->header.id = VFIO_DEVICE_INFO_CAP_ZPCI_PFIP;
+ cap->header.version = 1;
+ cap->size = CLP_PFIP_NR_SEGMENTS;
+ memcpy(cap->pfip, zdev->pfip, cap->size);
+
+ ret = vfio_info_add_capability(caps, &cap->header, cap_size);
+
+ kfree(cap);
+
+ return ret;
+}
+
+/*
+ * Add all supported capabilities to the VFIO_DEVICE_GET_INFO capability chain.
+ */
+int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev,
+ struct vfio_info_cap *caps)
+{
+ struct zpci_dev *zdev = to_zpci(vdev->pdev);
+ int ret;
+
+ if (!zdev)
+ return -ENODEV;
+
+ ret = zpci_base_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+
+ ret = zpci_group_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+
+ if (zdev->util_str_avail) {
+ ret = zpci_util_cap(zdev, vdev, caps);
+ if (ret)
+ return ret;
+ }
+
+ ret = zpci_pfip_cap(zdev, vdev, caps);
+
+ return ret;
+}
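
Finally, a hedged userspace sketch of consuming the base capability emitted by zpci_base_cap() above, reusing the find_cap() chain walker sketched after the Kconfig hunk; field names follow <linux/vfio_zdev.h> from this series, and `info` is assumed to be a populated vfio_device_info buffer:

#include <stdio.h>
#include <linux/vfio.h>
#include <linux/vfio_zdev.h>

void print_zpci_base(struct vfio_device_info *info)
{
        struct vfio_device_info_cap_zpci_base *base;

        base = (void *)find_cap(info, VFIO_DEVICE_INFO_CAP_ZPCI_BASE);
        if (!base)
                return;

        printf("zPCI DMA window: %#llx-%#llx, pchid %u, vfn %u\n",
               (unsigned long long)base->start_dma,
               (unsigned long long)base->end_dma,
               base->pchid, base->vfn);
}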