Update Linux to v5.4.2

This pulls drivers/vfio forward to upstream v5.4.2. Notable changes
visible in this diff:

 - GPL boilerplate is replaced by SPDX license identifiers across
   vfio, mdev and vfio-pci.
 - mdev: uuid_le is converted to guid_t, device creation/removal is
   serialized against parent unregistration through the new unreg_sem
   rw_semaphore, MDEV_STATE uevents are emitted on parent
   (un)registration, and mdev_set_iommu_device()/mdev_get_iommu_device()
   are introduced.
 - vfio-pci: the global driver_lock is replaced by per bus/slot reflck
   locking, vfio_pci_set_power_state() preserves device state across
   D3hot transitions, a VF INTx pin quirk is added, and NVLink2
   regions (NVIDIA V100 GPU RAM and IBM NPU2 ATSD) are supported on
   POWER9 Witherspoon systems.
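The mdev vendor-driver contract is otherwise unchanged; as a quick
reference, a minimal parent registration against the updated API could
look like the sketch below. This is hypothetical and not part of this
patch; the my_* names and the empty type list are placeholders:

  #include <linux/device.h>
  #include <linux/mdev.h>
  #include <linux/module.h>

  /* Real drivers list their supported mdev types here. */
  static struct attribute_group *my_type_groups[] = { NULL };

  static int my_mdev_create(struct kobject *kobj, struct mdev_device *mdev)
  {
          /* The UUID is now handed out as a guid_t pointer. */
          dev_info(mdev_dev(mdev), "create %pUl\n", mdev_uuid(mdev));
          return 0;
  }

  static int my_mdev_remove(struct mdev_device *mdev)
  {
          /*
           * A non-zero return is only logged by the core now; the
           * device is removed regardless (mdev_device_remove_common()).
           */
          return 0;
  }

  static const struct mdev_parent_ops my_parent_ops = {
          .owner                 = THIS_MODULE,
          .supported_type_groups = my_type_groups,
          .create                = my_mdev_create,   /* mandatory */
          .remove                = my_mdev_remove,   /* mandatory */
  };

  static int my_parent_probe(struct device *dev)
  {
          /* Emits the new MDEV_STATE=registered uevent on success. */
          return mdev_register_device(dev, &my_parent_ops);
  }
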
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index c84333e..fd17db9 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config VFIO_IOMMU_TYPE1
tristate
depends on VFIO
@@ -21,11 +22,10 @@
menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
- select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU || ARM_SMMU_V3)
- select ANON_INODES
+ select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64)
help
VFIO provides a framework for secure userspace device drivers.
- See Documentation/vfio.txt for more details.
+ See Documentation/driver-api/vfio.rst for more details.
If you don't know what to do here, say N.
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
index 14fdb10..5da27f2 100644
--- a/drivers/vfio/mdev/Kconfig
+++ b/drivers/vfio/mdev/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config VFIO_MDEV
tristate "Mediated device driver framework"
@@ -5,7 +6,7 @@
default n
help
Provides a framework to virtualize devices.
- See Documentation/vfio-mediated-device.txt for more details.
+ See Documentation/driver-api/vfio-mediated-device.rst for more details.
If you don't know what do here, say N.
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
index fa2d5ea..101516f 100644
--- a/drivers/vfio/mdev/Makefile
+++ b/drivers/vfio/mdev/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index 0212f0e..b558d4c 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Mediated device Core Driver
*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Author: Neo Jia <cjia@nvidia.com>
* Kirti Wankhede <kwankhede@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -60,9 +57,9 @@
}
EXPORT_SYMBOL(mdev_from_dev);
-uuid_le mdev_uuid(struct mdev_device *mdev)
+const guid_t *mdev_uuid(struct mdev_device *mdev)
{
- return mdev->uuid;
+ return &mdev->uuid;
}
EXPORT_SYMBOL(mdev_uuid);
@@ -88,8 +85,7 @@
put_device(dev);
}
-static
-inline struct mdev_parent *mdev_get_parent(struct mdev_parent *parent)
+static struct mdev_parent *mdev_get_parent(struct mdev_parent *parent)
{
if (parent)
kref_get(&parent->ref);
@@ -97,63 +93,42 @@
return parent;
}
-static inline void mdev_put_parent(struct mdev_parent *parent)
+static void mdev_put_parent(struct mdev_parent *parent)
{
if (parent)
kref_put(&parent->ref, mdev_release_parent);
}
-static int mdev_device_create_ops(struct kobject *kobj,
- struct mdev_device *mdev)
+/* Caller must hold parent unreg_sem read or write lock */
+static void mdev_device_remove_common(struct mdev_device *mdev)
{
- struct mdev_parent *parent = mdev->parent;
+ struct mdev_parent *parent;
+ struct mdev_type *type;
int ret;
- ret = parent->ops->create(kobj, mdev);
- if (ret)
- return ret;
-
- ret = sysfs_create_groups(&mdev->dev.kobj,
- parent->ops->mdev_attr_groups);
- if (ret)
- parent->ops->remove(mdev);
-
- return ret;
-}
-
-/*
- * mdev_device_remove_ops gets called from sysfs's 'remove' and when parent
- * device is being unregistered from mdev device framework.
- * - 'force_remove' is set to 'false' when called from sysfs's 'remove' which
- * indicates that if the mdev device is active, used by VMM or userspace
- * application, vendor driver could return error then don't remove the device.
- * - 'force_remove' is set to 'true' when called from mdev_unregister_device()
- * which indicate that parent device is being removed from mdev device
- * framework so remove mdev device forcefully.
- */
-static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove)
-{
- struct mdev_parent *parent = mdev->parent;
- int ret;
-
- /*
- * Vendor driver can return error if VMM or userspace application is
- * using this mdev device.
- */
+ type = to_mdev_type(mdev->type_kobj);
+ mdev_remove_sysfs_files(&mdev->dev, type);
+ device_del(&mdev->dev);
+ parent = mdev->parent;
+ lockdep_assert_held(&parent->unreg_sem);
ret = parent->ops->remove(mdev);
- if (ret && !force_remove)
- return -EBUSY;
+ if (ret)
+ dev_err(&mdev->dev, "Remove failed: err=%d\n", ret);
- sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups);
- return 0;
+ /* Balances with device_initialize() */
+ put_device(&mdev->dev);
+ mdev_put_parent(parent);
}
static int mdev_device_remove_cb(struct device *dev, void *data)
{
- if (!dev_is_mdev(dev))
- return 0;
+ if (dev_is_mdev(dev)) {
+ struct mdev_device *mdev;
- return mdev_device_remove(dev, data ? *(bool *)data : true);
+ mdev = to_mdev_device(dev);
+ mdev_device_remove_common(mdev);
+ }
+ return 0;
}
/*
@@ -168,6 +143,8 @@
{
int ret;
struct mdev_parent *parent;
+ char *env_string = "MDEV_STATE=registered";
+ char *envp[] = { env_string, NULL };
/* check for mandatory ops */
if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups)
@@ -182,6 +159,7 @@
/* Check for duplicate */
parent = __find_parent_device(dev);
if (parent) {
+ parent = NULL;
ret = -EEXIST;
goto add_dev_err;
}
@@ -193,6 +171,7 @@
}
kref_init(&parent->ref);
+ init_rwsem(&parent->unreg_sem);
parent->dev = dev;
parent->ops = ops;
@@ -217,6 +196,8 @@
mutex_unlock(&parent_list_lock);
dev_info(dev, "MDEV: Registered\n");
+ kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
+
return 0;
add_dev_err:
@@ -240,7 +221,8 @@
void mdev_unregister_device(struct device *dev)
{
struct mdev_parent *parent;
- bool force_remove = true;
+ char *env_string = "MDEV_STATE=unregistered";
+ char *envp[] = { env_string, NULL };
mutex_lock(&parent_list_lock);
parent = __find_parent_device(dev);
@@ -252,22 +234,26 @@
dev_info(dev, "MDEV: Unregistering\n");
list_del(&parent->next);
+ mutex_unlock(&parent_list_lock);
+
+ down_write(&parent->unreg_sem);
+
class_compat_remove_link(mdev_bus_compat_class, dev, NULL);
- device_for_each_child(dev, (void *)&force_remove,
- mdev_device_remove_cb);
+ device_for_each_child(dev, NULL, mdev_device_remove_cb);
parent_remove_sysfs_files(parent);
+ up_write(&parent->unreg_sem);
- mutex_unlock(&parent_list_lock);
mdev_put_parent(parent);
+
+ /* We still have the caller's reference to use for the uevent */
+ kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp);
}
EXPORT_SYMBOL(mdev_unregister_device);
-static void mdev_device_release(struct device *dev)
+static void mdev_device_free(struct mdev_device *mdev)
{
- struct mdev_device *mdev = to_mdev_device(dev);
-
mutex_lock(&mdev_list_lock);
list_del(&mdev->next);
mutex_unlock(&mdev_list_lock);
@@ -276,7 +262,15 @@
kfree(mdev);
}
-int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid)
+static void mdev_device_release(struct device *dev)
+{
+ struct mdev_device *mdev = to_mdev_device(dev);
+
+ mdev_device_free(mdev);
+}
+
+int mdev_device_create(struct kobject *kobj,
+ struct device *dev, const guid_t *uuid)
{
int ret;
struct mdev_device *mdev, *tmp;
@@ -291,7 +285,7 @@
/* Check for duplicate */
list_for_each_entry(tmp, &mdev_list, next) {
- if (!uuid_le_cmp(tmp->uuid, uuid)) {
+ if (guid_equal(&tmp->uuid, uuid)) {
mutex_unlock(&mdev_list_lock);
ret = -EEXIST;
goto mdev_fail;
@@ -305,53 +299,61 @@
goto mdev_fail;
}
- memcpy(&mdev->uuid, &uuid, sizeof(uuid_le));
+ guid_copy(&mdev->uuid, uuid);
list_add(&mdev->next, &mdev_list);
mutex_unlock(&mdev_list_lock);
mdev->parent = parent;
- kref_init(&mdev->ref);
- mdev->dev.parent = dev;
- mdev->dev.bus = &mdev_bus_type;
- mdev->dev.release = mdev_device_release;
- dev_set_name(&mdev->dev, "%pUl", uuid.b);
-
- ret = device_register(&mdev->dev);
- if (ret) {
- put_device(&mdev->dev);
+ /* Check if parent unregistration has started */
+ if (!down_read_trylock(&parent->unreg_sem)) {
+ mdev_device_free(mdev);
+ ret = -ENODEV;
goto mdev_fail;
}
- ret = mdev_device_create_ops(kobj, mdev);
+ device_initialize(&mdev->dev);
+ mdev->dev.parent = dev;
+ mdev->dev.bus = &mdev_bus_type;
+ mdev->dev.release = mdev_device_release;
+ dev_set_name(&mdev->dev, "%pUl", uuid);
+ mdev->dev.groups = parent->ops->mdev_attr_groups;
+ mdev->type_kobj = kobj;
+
+ ret = parent->ops->create(kobj, mdev);
if (ret)
- goto create_fail;
+ goto ops_create_fail;
+
+ ret = device_add(&mdev->dev);
+ if (ret)
+ goto add_fail;
ret = mdev_create_sysfs_files(&mdev->dev, type);
- if (ret) {
- mdev_device_remove_ops(mdev, true);
- goto create_fail;
- }
+ if (ret)
+ goto sysfs_fail;
- mdev->type_kobj = kobj;
mdev->active = true;
dev_dbg(&mdev->dev, "MDEV: created\n");
+ up_read(&parent->unreg_sem);
return 0;
-create_fail:
- device_unregister(&mdev->dev);
+sysfs_fail:
+ device_del(&mdev->dev);
+add_fail:
+ parent->ops->remove(mdev);
+ops_create_fail:
+ up_read(&parent->unreg_sem);
+ put_device(&mdev->dev);
mdev_fail:
mdev_put_parent(parent);
return ret;
}
-int mdev_device_remove(struct device *dev, bool force_remove)
+int mdev_device_remove(struct device *dev)
{
struct mdev_device *mdev, *tmp;
struct mdev_parent *parent;
- struct mdev_type *type;
- int ret;
mdev = to_mdev_device(dev);
@@ -374,21 +376,33 @@
mdev->active = false;
mutex_unlock(&mdev_list_lock);
- type = to_mdev_type(mdev->type_kobj);
parent = mdev->parent;
+ /* Check if parent unregistration has started */
+ if (!down_read_trylock(&parent->unreg_sem))
+ return -ENODEV;
- ret = mdev_device_remove_ops(mdev, force_remove);
- if (ret) {
- mdev->active = true;
- return ret;
- }
+ mdev_device_remove_common(mdev);
+ up_read(&parent->unreg_sem);
+ return 0;
+}
- mdev_remove_sysfs_files(dev, type);
- device_unregister(dev);
- mdev_put_parent(parent);
+int mdev_set_iommu_device(struct device *dev, struct device *iommu_device)
+{
+ struct mdev_device *mdev = to_mdev_device(dev);
+
+ mdev->iommu_device = iommu_device;
return 0;
}
+EXPORT_SYMBOL(mdev_set_iommu_device);
+
+struct device *mdev_get_iommu_device(struct device *dev)
+{
+ struct mdev_device *mdev = to_mdev_device(dev);
+
+ return mdev->iommu_device;
+}
+EXPORT_SYMBOL(mdev_get_iommu_device);
static int __init mdev_init(void)
{
diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 6f0391f..0d3223a 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* MDEV driver
*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Author: Neo Jia <cjia@nvidia.com>
* Kirti Wankhede <kwankhede@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/device.h>
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index b5819b7..7d92295 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -1,13 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Mediated device interal definitions
*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Author: Neo Jia <cjia@nvidia.com>
* Kirti Wankhede <kwankhede@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef MDEV_PRIVATE_H
@@ -23,16 +20,18 @@
struct list_head next;
struct kset *mdev_types_kset;
struct list_head type_list;
+ /* Synchronize device creation/removal with parent unregistration */
+ struct rw_semaphore unreg_sem;
};
struct mdev_device {
struct device dev;
struct mdev_parent *parent;
- uuid_le uuid;
+ guid_t uuid;
void *driver_data;
- struct kref ref;
struct list_head next;
struct kobject *type_kobj;
+ struct device *iommu_device;
bool active;
};
@@ -58,7 +57,8 @@
int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type);
void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type);
-int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid);
-int mdev_device_remove(struct device *dev, bool force_remove);
+int mdev_device_create(struct kobject *kobj,
+ struct device *dev, const guid_t *uuid);
+int mdev_device_remove(struct device *dev);
#endif /* MDEV_PRIVATE_H */
diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c
index 249472f..7570c76 100644
--- a/drivers/vfio/mdev/mdev_sysfs.c
+++ b/drivers/vfio/mdev/mdev_sysfs.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* File attributes for Mediated devices
*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Author: Neo Jia <cjia@nvidia.com>
* Kirti Wankhede <kwankhede@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/sysfs.h>
@@ -55,7 +52,7 @@
const char *buf, size_t count)
{
char *str;
- uuid_le uuid;
+ guid_t uuid;
int ret;
if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1))
@@ -65,12 +62,12 @@
if (!str)
return -ENOMEM;
- ret = uuid_le_to_bin(str, &uuid);
+ ret = guid_parse(str, &uuid);
kfree(str);
if (ret)
return ret;
- ret = mdev_device_create(kobj, dev, uuid);
+ ret = mdev_device_create(kobj, dev, &uuid);
if (ret)
return ret;
@@ -92,8 +89,8 @@
.release = mdev_type_release,
};
-struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent,
- struct attribute_group *group)
+static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent,
+ struct attribute_group *group)
{
struct mdev_type *type;
int ret;
@@ -236,11 +233,9 @@
if (val && device_remove_file_self(dev, attr)) {
int ret;
- ret = mdev_device_remove(dev, false);
- if (ret) {
- device_create_file(dev, attr);
+ ret = mdev_device_remove(dev);
+ if (ret)
return ret;
- }
}
return count;
@@ -280,7 +275,7 @@
void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type)
{
+ sysfs_remove_files(&dev->kobj, mdev_device_attrs);
sysfs_remove_link(&dev->kobj, "mdev_type");
sysfs_remove_link(type->devices_kobj, dev_name(dev));
- sysfs_remove_files(&dev->kobj, mdev_device_attrs);
}
diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c
index d230620..30964a4 100644
--- a/drivers/vfio/mdev/vfio_mdev.c
+++ b/drivers/vfio/mdev/vfio_mdev.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO based driver for Mediated device
*
* Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
* Author: Neo Jia <cjia@nvidia.com>
* Kirti Wankhede <kwankhede@nvidia.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/init.h>
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 42dc1d3..ac3c1dd 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config VFIO_PCI
tristate "VFIO support for PCI devices"
depends on VFIO && PCI && EVENTFD
@@ -38,3 +39,9 @@
and LPC bridge config space.
To enable Intel IGD assignment through vfio-pci, say Y.
+
+config VFIO_PCI_NVLINK2
+ def_bool y
+ depends on VFIO_PCI && PPC_POWERNV
+ help
+ VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 76d8ec0..f027f8a 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
+vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o
obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
new file mode 100644
index 0000000..b2aa986
--- /dev/null
+++ b/drivers/vfio/pci/trace.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * VFIO PCI mmap/mmap_fault tracepoints
+ *
+ * Copyright (C) 2018 IBM Corp. All rights reserved.
+ * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM vfio_pci
+
+#if !defined(_TRACE_VFIO_PCI_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_VFIO_PCI_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap_fault,
+ TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+ vm_fault_t ret),
+ TP_ARGS(pdev, hpa, ua, ret),
+
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(unsigned long, hpa)
+ __field(unsigned long, ua)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->name = dev_name(&pdev->dev),
+ __entry->hpa = hpa;
+ __entry->ua = ua;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%s: %lx -> %lx ret=%d", __entry->name, __entry->hpa,
+ __entry->ua, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_nvgpu_mmap,
+ TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+ unsigned long size, int ret),
+ TP_ARGS(pdev, hpa, ua, size, ret),
+
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(unsigned long, hpa)
+ __field(unsigned long, ua)
+ __field(unsigned long, size)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->name = dev_name(&pdev->dev),
+ __entry->hpa = hpa;
+ __entry->ua = ua;
+ __entry->size = size;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+ __entry->ua, __entry->size, __entry->ret)
+);
+
+TRACE_EVENT(vfio_pci_npu2_mmap,
+ TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
+ unsigned long size, int ret),
+ TP_ARGS(pdev, hpa, ua, size, ret),
+
+ TP_STRUCT__entry(
+ __field(const char *, name)
+ __field(unsigned long, hpa)
+ __field(unsigned long, ua)
+ __field(unsigned long, size)
+ __field(int, ret)
+ ),
+
+ TP_fast_assign(
+ __entry->name = dev_name(&pdev->dev),
+ __entry->hpa = hpa;
+ __entry->ua = ua;
+ __entry->size = size;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
+ __entry->ua, __entry->size, __entry->ret)
+);
+
+#endif /* _TRACE_VFIO_PCI_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../drivers/vfio/pci
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cddb453..0220616 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1,17 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define dev_fmt pr_fmt
#include <linux/device.h>
#include <linux/eventfd.h>
@@ -56,8 +54,6 @@
MODULE_PARM_DESC(disable_idle_d3,
"Disable using the PCI D3 low power state for idle, unused devices");
-static DEFINE_MUTEX(driver_lock);
-
static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
@@ -211,6 +207,57 @@
return false;
}
+static void vfio_pci_probe_power_state(struct vfio_pci_device *vdev)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ u16 pmcsr;
+
+ if (!pdev->pm_cap)
+ return;
+
+ pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
+
+ vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
+}
+
+/*
+ * pci_set_power_state() wrapper handling devices which perform a soft reset on
+ * D3->D0 transition. Save state prior to D0/1/2->D3, stash it on the vdev,
+ * restore when returned to D0. Saved separately from pci_saved_state for use
+ * by PM capability emulation and separately from pci_dev internal saved state
+ * to avoid it being overwritten and consumed around other resets.
+ */
+int vfio_pci_set_power_state(struct vfio_pci_device *vdev, pci_power_t state)
+{
+ struct pci_dev *pdev = vdev->pdev;
+ bool needs_restore = false, needs_save = false;
+ int ret;
+
+ if (vdev->needs_pm_restore) {
+ if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
+ pci_save_state(pdev);
+ needs_save = true;
+ }
+
+ if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
+ needs_restore = true;
+ }
+
+ ret = pci_set_power_state(pdev, state);
+
+ if (!ret) {
+ /* D3 might be unsupported via quirk, skip unless in D3 */
+ if (needs_save && pdev->current_state >= PCI_D3hot) {
+ vdev->pm_save = pci_store_saved_state(pdev);
+ } else if (needs_restore) {
+ pci_load_and_free_saved_state(pdev, &vdev->pm_save);
+ pci_restore_state(pdev);
+ }
+ }
+
+ return ret;
+}
+
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
struct pci_dev *pdev = vdev->pdev;
@@ -218,7 +265,7 @@
u16 cmd;
u8 msix_pos;
- pci_set_power_state(pdev, PCI_D0);
+ vfio_pci_set_power_state(vdev, PCI_D0);
/* Don't allow our initial saved state to include busmaster */
pci_clear_master(pdev);
@@ -238,12 +285,11 @@
pci_save_state(pdev);
vdev->pci_saved_state = pci_store_saved_state(pdev);
if (!vdev->pci_saved_state)
- pr_debug("%s: Couldn't store %s saved state\n",
- __func__, dev_name(&pdev->dev));
+ pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
if (likely(!nointxmask)) {
if (vfio_pci_nointx(pdev)) {
- dev_info(&pdev->dev, "Masking broken INTx support\n");
+ pci_info(pdev, "Masking broken INTx support\n");
vdev->nointx = true;
pci_intx(pdev, 0);
} else
@@ -287,16 +333,36 @@
IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
ret = vfio_pci_igd_init(vdev);
if (ret) {
- dev_warn(&vdev->pdev->dev,
- "Failed to setup Intel IGD regions\n");
- vfio_pci_disable(vdev);
- return ret;
+ pci_warn(pdev, "Failed to setup Intel IGD regions\n");
+ goto disable_exit;
+ }
+ }
+
+ if (pdev->vendor == PCI_VENDOR_ID_NVIDIA &&
+ IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+ ret = vfio_pci_nvdia_v100_nvlink2_init(vdev);
+ if (ret && ret != -ENODEV) {
+ pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n");
+ goto disable_exit;
+ }
+ }
+
+ if (pdev->vendor == PCI_VENDOR_ID_IBM &&
+ IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) {
+ ret = vfio_pci_ibm_npu2_init(vdev);
+ if (ret && ret != -ENODEV) {
+ pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n");
+ goto disable_exit;
}
}
vfio_pci_probe_mmaps(vdev);
return 0;
+
+disable_exit:
+ vfio_pci_disable(vdev);
+ return ret;
}
static void vfio_pci_disable(struct vfio_pci_device *vdev)
@@ -357,8 +423,7 @@
* is just busy work.
*/
if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
- pr_info("%s: Couldn't reload %s saved state\n",
- __func__, dev_name(&pdev->dev));
+ pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
if (!vdev->reset_works)
goto out;
@@ -373,11 +438,20 @@
pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
/*
- * Try to reset the device. The success of this is dependent on
- * being able to lock the device, which is not always possible.
+ * Try to get the locks ourselves to prevent a deadlock. The
+ * success of this is dependent on being able to lock the device,
+ * which is not always possible.
+ * We can not use the "try" reset interface here, which will
+ * overwrite the previously restored configuration information.
*/
- if (vdev->reset_works && !pci_try_reset_function(pdev))
- vdev->needs_reset = false;
+ if (vdev->reset_works && pci_cfg_access_trylock(pdev)) {
+ if (device_trylock(&pdev->dev)) {
+ if (!__pci_reset_function_locked(pdev))
+ vdev->needs_reset = false;
+ device_unlock(&pdev->dev);
+ }
+ pci_cfg_access_unlock(pdev);
+ }
pci_restore_state(pdev);
out:
@@ -386,21 +460,21 @@
vfio_pci_try_bus_reset(vdev);
if (!disable_idle_d3)
- pci_set_power_state(pdev, PCI_D3hot);
+ vfio_pci_set_power_state(vdev, PCI_D3hot);
}
static void vfio_pci_release(void *device_data)
{
struct vfio_pci_device *vdev = device_data;
- mutex_lock(&driver_lock);
+ mutex_lock(&vdev->reflck->lock);
if (!(--vdev->refcnt)) {
vfio_spapr_pci_eeh_release(vdev->pdev);
vfio_pci_disable(vdev);
}
- mutex_unlock(&driver_lock);
+ mutex_unlock(&vdev->reflck->lock);
module_put(THIS_MODULE);
}
@@ -413,7 +487,7 @@
if (!try_module_get(THIS_MODULE))
return -ENODEV;
- mutex_lock(&driver_lock);
+ mutex_lock(&vdev->reflck->lock);
if (!vdev->refcnt) {
ret = vfio_pci_enable(vdev);
@@ -424,7 +498,7 @@
}
vdev->refcnt++;
error:
- mutex_unlock(&driver_lock);
+ mutex_unlock(&vdev->reflck->lock);
if (ret)
module_put(THIS_MODULE);
return ret;
@@ -434,10 +508,14 @@
{
if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
u8 pin;
- pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
- if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && !vdev->nointx && pin)
- return 1;
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
+ vdev->nointx || vdev->pdev->is_virtfn)
+ return 0;
+
+ pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
+
+ return pin ? 1 : 0;
} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
u8 pos;
u16 flags;
@@ -683,6 +761,7 @@
{
void __iomem *io;
size_t size;
+ u16 orig_cmd;
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
@@ -698,15 +777,23 @@
break;
}
- /* Is it really there? */
- io = pci_map_rom(pdev, &size);
- if (!io || !size) {
- info.size = 0;
- break;
- }
- pci_unmap_rom(pdev, io);
+ /*
+ * Is it really there? Enable memory decode for
+ * implicit access in pci_map_rom().
+ */
+ pci_read_config_word(pdev, PCI_COMMAND, &orig_cmd);
+ pci_write_config_word(pdev, PCI_COMMAND,
+ orig_cmd | PCI_COMMAND_MEMORY);
- info.flags = VFIO_REGION_INFO_FLAG_READ;
+ io = pci_map_rom(pdev, &size);
+ if (io) {
+ info.flags = VFIO_REGION_INFO_FLAG_READ;
+ pci_unmap_rom(pdev, io);
+ } else {
+ info.size = 0;
+ }
+
+ pci_write_config_word(pdev, PCI_COMMAND, orig_cmd);
break;
}
case VFIO_PCI_VGA_REGION_INDEX:
@@ -746,6 +833,12 @@
if (ret)
return ret;
+ if (vdev->region[i].ops->add_capability) {
+ ret = vdev->region[i].ops->add_capability(vdev,
+ &vdev->region[i], &caps);
+ if (ret)
+ return ret;
+ }
}
}
@@ -1113,6 +1206,15 @@
return -EINVAL;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
+ if (index >= VFIO_PCI_NUM_REGIONS) {
+ int regnum = index - VFIO_PCI_NUM_REGIONS;
+ struct vfio_pci_region *region = vdev->region + regnum;
+
+ if (region && region->ops && region->ops->mmap &&
+ (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+ return region->ops->mmap(vdev, region, vma);
+ return -EINVAL;
+ }
if (index >= VFIO_PCI_ROM_REGION_INDEX)
return -EINVAL;
if (!vdev->bar_mmap_supported[index])
@@ -1155,17 +1257,18 @@
static void vfio_pci_request(void *device_data, unsigned int count)
{
struct vfio_pci_device *vdev = device_data;
+ struct pci_dev *pdev = vdev->pdev;
mutex_lock(&vdev->igate);
if (vdev->req_trigger) {
if (!(count % 10))
- dev_notice_ratelimited(&vdev->pdev->dev,
+ pci_notice_ratelimited(pdev,
"Relaying device request to user (#%u)\n",
count);
eventfd_signal(vdev->req_trigger, 1);
} else if (count == 0) {
- dev_warn(&vdev->pdev->dev,
+ pci_warn(pdev,
"No device request channel registered, blocked until released by user\n");
}
@@ -1183,6 +1286,9 @@
.request = vfio_pci_request,
};
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev);
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck);
+
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
struct vfio_pci_device *vdev;
@@ -1229,12 +1335,22 @@
return ret;
}
+ ret = vfio_pci_reflck_attach(vdev);
+ if (ret) {
+ vfio_del_group_dev(&pdev->dev);
+ vfio_iommu_group_put(group, &pdev->dev);
+ kfree(vdev);
+ return ret;
+ }
+
if (vfio_pci_is_vga(pdev)) {
vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode);
vga_set_legacy_decoding(pdev,
vfio_pci_set_vga_decode(vdev, false));
}
+ vfio_pci_probe_power_state(vdev);
+
if (!disable_idle_d3) {
/*
* pci-core sets the device power state to an unknown value at
@@ -1245,8 +1361,8 @@
* be able to get to D3. Therefore first do a D0 transition
* before going to D3.
*/
- pci_set_power_state(pdev, PCI_D0);
- pci_set_power_state(pdev, PCI_D3hot);
+ vfio_pci_set_power_state(vdev, PCI_D0);
+ vfio_pci_set_power_state(vdev, PCI_D3hot);
}
return ret;
@@ -1260,9 +1376,16 @@
if (!vdev)
return;
+ vfio_pci_reflck_put(vdev->reflck);
+
vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev);
kfree(vdev->region);
mutex_destroy(&vdev->ioeventfds_lock);
+
+ if (!disable_idle_d3)
+ vfio_pci_set_power_state(vdev, PCI_D0);
+
+ kfree(vdev->pm_save);
kfree(vdev);
if (vfio_pci_is_vga(pdev)) {
@@ -1271,9 +1394,6 @@
VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM);
}
-
- if (!disable_idle_d3)
- pci_set_power_state(pdev, PCI_D0);
}
static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -1316,16 +1436,97 @@
.err_handler = &vfio_err_handlers,
};
+static DEFINE_MUTEX(reflck_lock);
+
+static struct vfio_pci_reflck *vfio_pci_reflck_alloc(void)
+{
+ struct vfio_pci_reflck *reflck;
+
+ reflck = kzalloc(sizeof(*reflck), GFP_KERNEL);
+ if (!reflck)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&reflck->kref);
+ mutex_init(&reflck->lock);
+
+ return reflck;
+}
+
+static void vfio_pci_reflck_get(struct vfio_pci_reflck *reflck)
+{
+ kref_get(&reflck->kref);
+}
+
+static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data)
+{
+ struct vfio_pci_reflck **preflck = data;
+ struct vfio_device *device;
+ struct vfio_pci_device *vdev;
+
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (!device)
+ return 0;
+
+ if (pci_dev_driver(pdev) != &vfio_pci_driver) {
+ vfio_device_put(device);
+ return 0;
+ }
+
+ vdev = vfio_device_data(device);
+
+ if (vdev->reflck) {
+ vfio_pci_reflck_get(vdev->reflck);
+ *preflck = vdev->reflck;
+ vfio_device_put(device);
+ return 1;
+ }
+
+ vfio_device_put(device);
+ return 0;
+}
+
+static int vfio_pci_reflck_attach(struct vfio_pci_device *vdev)
+{
+ bool slot = !pci_probe_reset_slot(vdev->pdev->slot);
+
+ mutex_lock(&reflck_lock);
+
+ if (pci_is_root_bus(vdev->pdev->bus) ||
+ vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_reflck_find,
+ &vdev->reflck, slot) <= 0)
+ vdev->reflck = vfio_pci_reflck_alloc();
+
+ mutex_unlock(&reflck_lock);
+
+ return PTR_ERR_OR_ZERO(vdev->reflck);
+}
+
+static void vfio_pci_reflck_release(struct kref *kref)
+{
+ struct vfio_pci_reflck *reflck = container_of(kref,
+ struct vfio_pci_reflck,
+ kref);
+
+ kfree(reflck);
+ mutex_unlock(&reflck_lock);
+}
+
+static void vfio_pci_reflck_put(struct vfio_pci_reflck *reflck)
+{
+ kref_put_mutex(&reflck->kref, vfio_pci_reflck_release, &reflck_lock);
+}
+
struct vfio_devices {
struct vfio_device **devices;
int cur_index;
int max_index;
};
-static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
+static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data)
{
struct vfio_devices *devs = data;
struct vfio_device *device;
+ struct vfio_pci_device *vdev;
if (devs->cur_index == devs->max_index)
return -ENOSPC;
@@ -1339,16 +1540,28 @@
return -EBUSY;
}
+ vdev = vfio_device_data(device);
+
+ /* Fault if the device is not unused */
+ if (vdev->refcnt) {
+ vfio_device_put(device);
+ return -EBUSY;
+ }
+
devs->devices[devs->cur_index++] = device;
return 0;
}
/*
- * Attempt to do a bus/slot reset if there are devices affected by a reset for
- * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt). Callers are required to hold driver_lock when calling this to
- * prevent device opens and concurrent bus reset attempts. We prevent device
- * unbinds by acquiring and holding a reference to the vfio_device.
+ * If a bus or slot reset is available for the provided device and:
+ * - All of the devices affected by that bus or slot reset are unused
+ * (!refcnt)
+ * - At least one of the affected devices is marked dirty via
+ * needs_reset (such as by lack of FLR support)
+ * Then attempt to perform that bus or slot reset. Callers are required
+ * to hold vdev->reflck->lock, protecting the bus/slot reset group from
+ * concurrent opens. A vfio_device reference is acquired for each device
+ * to prevent unbinds during the reset operation.
*
* NB: vfio-core considers a group to be viable even if some devices are
* bound to drivers like pci-stub or pcieport. Here we require all devices
@@ -1359,7 +1572,7 @@
{
struct vfio_devices devs = { .cur_index = 0 };
int i = 0, ret = -EINVAL;
- bool needs_reset = false, slot = false;
+ bool slot = false;
struct vfio_pci_device *tmp;
if (!pci_probe_reset_slot(vdev->pdev->slot))
@@ -1377,28 +1590,36 @@
return;
if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
- vfio_pci_get_devs, &devs, slot))
+ vfio_pci_get_unused_devs,
+ &devs, slot))
goto put_devs;
+ /* Does at least one need a reset? */
for (i = 0; i < devs.cur_index; i++) {
tmp = vfio_device_data(devs.devices[i]);
- if (tmp->needs_reset)
- needs_reset = true;
- if (tmp->refcnt)
- goto put_devs;
+ if (tmp->needs_reset) {
+ ret = pci_reset_bus(vdev->pdev);
+ break;
+ }
}
- if (needs_reset)
- ret = pci_reset_bus(vdev->pdev);
-
put_devs:
for (i = 0; i < devs.cur_index; i++) {
tmp = vfio_device_data(devs.devices[i]);
- if (!ret)
+
+ /*
+ * If reset was successful, affected devices no longer need
+ * a reset and we should return all the collateral devices
+ * to low power. If not successful, we either didn't reset
+ * the bus or timed out waiting for it, so let's not touch
+ * the power state.
+ */
+ if (!ret) {
tmp->needs_reset = false;
- if (!tmp->refcnt && !disable_idle_d3)
- pci_set_power_state(tmp->pdev, PCI_D3hot);
+ if (tmp != vdev && !disable_idle_d3)
+ vfio_pci_set_power_state(tmp, PCI_D3hot);
+ }
vfio_device_put(devs.devices[i]);
}
@@ -1443,11 +1664,11 @@
rc = pci_add_dynid(&vfio_pci_driver, vendor, device,
subvendor, subdevice, class, class_mask, 0);
if (rc)
- pr_warn("failed to add dynamic id [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x (%d)\n",
+ pr_warn("failed to add dynamic id [%04x:%04x[%04x:%04x]] class %#08x/%08x (%d)\n",
vendor, device, subvendor, subdevice,
class, class_mask, rc);
else
- pr_info("add [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x\n",
+ pr_info("add [%04x:%04x[%04x:%04x]] class %#08x/%08x\n",
vendor, device, subvendor, subdevice,
class, class_mask);
}
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 115a36f..f0891bd 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO PCI config space virtualization
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
@@ -412,8 +409,7 @@
if (pdev->is_virtfn)
return;
- pr_info("%s: %s reset recovery - restoring bars\n",
- __func__, dev_name(&pdev->dev));
+ pci_info(pdev, "%s: reset recovery - restoring BARs\n", __func__);
for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++)
pci_user_write_config_dword(pdev, i, *rbar);
@@ -691,7 +687,7 @@
break;
}
- pci_set_power_state(vdev->pdev, state);
+ vfio_pci_set_power_state(vdev, state);
}
return count;
@@ -1180,8 +1176,10 @@
return -ENOMEM;
ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags);
- if (ret)
+ if (ret) {
+ kfree(vdev->msi_perm);
return ret;
+ }
return len;
}
@@ -1296,8 +1294,8 @@
else
return PCI_SATA_SIZEOF_SHORT;
default:
- pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n",
- dev_name(&pdev->dev), __func__, cap, pos);
+ pci_warn(pdev, "%s: unknown length for PCI cap %#x@%#x\n",
+ __func__, cap, pos);
}
return 0;
@@ -1370,8 +1368,8 @@
}
return PCI_TPH_BASE_SIZEOF;
default:
- pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n",
- dev_name(&pdev->dev), __func__, ecap, epos);
+ pci_warn(pdev, "%s: unknown length for PCI ecap %#x@%#x\n",
+ __func__, ecap, epos);
}
return 0;
@@ -1472,8 +1470,8 @@
}
if (!len) {
- pr_info("%s: %s hiding cap 0x%x\n",
- __func__, dev_name(&pdev->dev), cap);
+ pci_info(pdev, "%s: hiding cap %#x@%#x\n", __func__,
+ cap, pos);
*prev = next;
pos = next;
continue;
@@ -1484,9 +1482,8 @@
if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
continue;
- pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
- __func__, dev_name(&pdev->dev),
- pos + i, map[pos + i], cap);
+ pci_warn(pdev, "%s: PCI config conflict @%#x, was cap %#x now cap %#x\n",
+ __func__, pos + i, map[pos + i], cap);
}
BUILD_BUG_ON(PCI_CAP_ID_MAX >= PCI_CAP_ID_INVALID_VIRT);
@@ -1547,8 +1544,8 @@
}
if (!len) {
- pr_info("%s: %s hiding ecap 0x%x@0x%x\n",
- __func__, dev_name(&pdev->dev), ecap, epos);
+ pci_info(pdev, "%s: hiding ecap %#x@%#x\n",
+ __func__, ecap, epos);
/* If not the first in the chain, we can skip over it */
if (prev) {
@@ -1570,9 +1567,8 @@
if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
continue;
- pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
- __func__, dev_name(&pdev->dev),
- epos + i, map[epos + i], ecap);
+ pci_warn(pdev, "%s: PCI config conflict @%#x, was ecap %#x now ecap %#x\n",
+ __func__, epos + i, map[epos + i], ecap);
}
/*
@@ -1610,6 +1606,15 @@
}
/*
+ * Nag about hardware bugs, hopefully to have vendors fix them, but at least
+ * to collect a list of dependencies for the VF INTx pin quirk below.
+ */
+static const struct pci_device_id known_bogus_vf_intx_pin[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x270c) },
+ {}
+};
+
+/*
* For each device we allocate a pci_config_map that indicates the
* capability occupying each dword and thus the struct perm_bits we
* use for read and write. We also allocate a virtualized config
@@ -1674,6 +1679,24 @@
if (pdev->is_virtfn) {
*(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor);
*(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device);
+
+ /*
+ * Per SR-IOV spec rev 1.1, 3.4.1.18 the interrupt pin register
+ * does not apply to VFs and VFs must implement this register
+ * as read-only with value zero. Userspace is not readily able
+ * to identify whether a device is a VF and thus that the pin
+ * definition on the device is bogus should it violate this
+ * requirement. We already virtualize the pin register for
+ * other purposes, so we simply need to replace the bogus value
+ * and consider VFs when we determine INTx IRQ count.
+ */
+ if (vconfig[PCI_INTERRUPT_PIN] &&
+ !pci_match_id(known_bogus_vf_intx_pin, pdev))
+ pci_warn(pdev,
+ "Hardware bug: VF reports bogus INTx pin %d\n",
+ vconfig[PCI_INTERRUPT_PIN]);
+
+ vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
}
if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index 6394b16..53d97f4 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO PCI Intel Graphics support
*
* Copyright (C) 2016 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Register a device specific region through which to provide read-only
* access to the Intel IGD opregion. The register defining the opregion
* address is also virtualized to prevent user modification.
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 1c46045..3fa3f72 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO PCI interrupt handling
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c
new file mode 100644
index 0000000..f2983f0
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_nvlink2.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2.
+ *
+ * Copyright (C) 2018 IBM Corp. All rights reserved.
+ * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ * Register an on-GPU RAM region for cacheable access.
+ *
+ * Derived from original vfio_pci_igd.c:
+ * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
+ * Author: Alex Williamson <alex.williamson@redhat.com>
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
+#include <asm/kvm_ppc.h>
+#include "vfio_pci_private.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
+
+struct vfio_pci_nvgpu_data {
+ unsigned long gpu_hpa; /* GPU RAM physical address */
+ unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+ unsigned long useraddr; /* GPU RAM userspace address */
+ unsigned long size; /* Size of the GPU RAM window (usually 128GB) */
+ struct mm_struct *mm;
+ struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */
+ struct pci_dev *gpdev;
+ struct notifier_block group_notifier;
+};
+
+static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
+ char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+ unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+ struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+ loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
+ size_t sizealigned;
+ void __iomem *ptr;
+
+ if (pos >= vdev->region[i].size)
+ return -EINVAL;
+
+ count = min(count, (size_t)(vdev->region[i].size - pos));
+
+ /*
+ * We map only a bit of GPU RAM for a short time instead of mapping it
+ * for the guest lifetime as:
+ *
+ * 1) we do not know GPU RAM size, only aperture which is 4-8 times
+ * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
+ * 2) mapping GPU RAM allows CPU to prefetch and if this happens
+ * before NVLink bridge is reset (which fences GPU RAM),
+ * hardware management interrupts (HMI) might happen, this
+ * will freeze NVLink bridge.
+ *
+ * This is not fast path anyway.
+ */
+ sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
+ ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
+ if (!ptr)
+ return -EFAULT;
+
+ if (iswrite) {
+ if (copy_from_user(ptr + posoff, buf, count))
+ count = -EFAULT;
+ else
+ *ppos += count;
+ } else {
+ if (copy_to_user(buf, ptr + posoff, count))
+ count = -EFAULT;
+ else
+ *ppos += count;
+ }
+
+ iounmap(ptr);
+
+ return count;
+}
+
+static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region)
+{
+ struct vfio_pci_nvgpu_data *data = region->data;
+ long ret;
+
+ /* If there were any mappings at all... */
+ if (data->mm) {
+ ret = mm_iommu_put(data->mm, data->mem);
+ WARN_ON(ret);
+
+ mmdrop(data->mm);
+ }
+
+ vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+ &data->group_notifier);
+
+ pnv_npu2_unmap_lpar_dev(data->gpdev);
+
+ kfree(data);
+}
+
+static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
+{
+ vm_fault_t ret;
+ struct vm_area_struct *vma = vmf->vma;
+ struct vfio_pci_region *region = vma->vm_private_data;
+ struct vfio_pci_nvgpu_data *data = region->data;
+ unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
+ unsigned long vm_pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ unsigned long pfn = nv2pg + vm_pgoff + vmf_off;
+
+ ret = vmf_insert_pfn(vma, vmf->address, pfn);
+ trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
+ vmf->address, ret);
+
+ return ret;
+}
+
+static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
+ .fault = vfio_pci_nvgpu_mmap_fault,
+};
+
+static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+ int ret;
+ struct vfio_pci_nvgpu_data *data = region->data;
+
+ if (data->useraddr)
+ return -EPERM;
+
+ if (vma->vm_end - vma->vm_start > data->size)
+ return -EINVAL;
+
+ vma->vm_private_data = region;
+ vma->vm_flags |= VM_PFNMAP;
+ vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
+
+ /*
+ * Calling mm_iommu_newdev() here once as the region is not
+ * registered yet and therefore right initialization will happen now.
+ * Other places will use mm_iommu_find() which returns
+ * registered @mem and does not go gup().
+ */
+ data->useraddr = vma->vm_start;
+ data->mm = current->mm;
+
+ atomic_inc(&data->mm->mm_count);
+ ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
+ vma_pages(vma), data->gpu_hpa, &data->mem);
+
+ trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
+ vma->vm_end - vma->vm_start, ret);
+
+ return ret;
+}
+
+static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+ struct vfio_pci_nvgpu_data *data = region->data;
+ struct vfio_region_info_cap_nvlink2_ssatgt cap = {
+ .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
+ .header.version = 1,
+ .tgt = data->gpu_tgt
+ };
+
+ return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
+}
+
+static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
+ .rw = vfio_pci_nvgpu_rw,
+ .release = vfio_pci_nvgpu_release,
+ .mmap = vfio_pci_nvgpu_mmap,
+ .add_capability = vfio_pci_nvgpu_add_capability,
+};
+
+static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
+ unsigned long action, void *opaque)
+{
+ struct kvm *kvm = opaque;
+ struct vfio_pci_nvgpu_data *data = container_of(nb,
+ struct vfio_pci_nvgpu_data,
+ group_notifier);
+
+ if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
+ pnv_npu2_map_lpar_dev(data->gpdev,
+ kvm->arch.lpid, MSR_DR | MSR_PR))
+ return NOTIFY_BAD;
+
+ return NOTIFY_OK;
+}
+
+int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+ int ret;
+ u64 reg[2];
+ u64 tgt = 0;
+ struct device_node *npu_node, *mem_node;
+ struct pci_dev *npu_dev;
+ struct vfio_pci_nvgpu_data *data;
+ uint32_t mem_phandle = 0;
+ unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
+
+ /*
+ * PCI config space does not tell us about NVLink presense but
+ * platform does, use this.
+ */
+ npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
+ if (!npu_dev)
+ return -ENODEV;
+
+ npu_node = pci_device_to_OF_node(npu_dev);
+ if (!npu_node)
+ return -EINVAL;
+
+ if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
+ return -EINVAL;
+
+ mem_node = of_find_node_by_phandle(mem_phandle);
+ if (!mem_node)
+ return -EINVAL;
+
+ if (of_property_read_variable_u64_array(mem_node, "reg", reg,
+ ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
+ ARRAY_SIZE(reg))
+ return -EINVAL;
+
+ if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+ dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+ return -EFAULT;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->gpu_hpa = reg[0];
+ data->gpu_tgt = tgt;
+ data->size = reg[1];
+
+ dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
+ data->gpu_hpa + data->size - 1);
+
+ data->gpdev = vdev->pdev;
+ data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;
+
+ ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+ &events, &data->group_notifier);
+ if (ret)
+ goto free_exit;
+
+ /*
+ * We have just set KVM, we do not need the listener anymore.
+ * Also, keeping it registered means that if more than one GPU is
+ * assigned, we will get several similar notifiers notifying about
+ * the same device again which does not help with anything.
+ */
+ vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
+ &data->group_notifier);
+
+ ret = vfio_pci_register_dev_region(vdev,
+ PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+ VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
+ &vfio_pci_nvgpu_regops,
+ data->size,
+ VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP,
+ data);
+ if (ret)
+ goto free_exit;
+
+ return 0;
+free_exit:
+ kfree(data);
+
+ return ret;
+}
+
+/*
+ * IBM NPU2 bridge
+ */
+struct vfio_pci_npu2_data {
+ void *base; /* ATSD register virtual address, for emulated access */
+ unsigned long mmio_atsd; /* ATSD physical address */
+ unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
+ unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
+};
+
+static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
+ char __user *buf, size_t count, loff_t *ppos, bool iswrite)
+{
+ unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+ struct vfio_pci_npu2_data *data = vdev->region[i].data;
+ loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+ if (pos >= vdev->region[i].size)
+ return -EINVAL;
+
+ count = min(count, (size_t)(vdev->region[i].size - pos));
+
+ if (iswrite) {
+ if (copy_from_user(data->base + pos, buf, count))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(buf, data->base + pos, count))
+ return -EFAULT;
+ }
+ *ppos += count;
+
+ return count;
+}
+
+static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region, struct vm_area_struct *vma)
+{
+ int ret;
+ struct vfio_pci_npu2_data *data = region->data;
+ unsigned long req_len = vma->vm_end - vma->vm_start;
+
+ if (req_len != PAGE_SIZE)
+ return -EINVAL;
+
+ vma->vm_flags |= VM_PFNMAP;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
+ req_len, vma->vm_page_prot);
+ trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
+ vma->vm_end - vma->vm_start, ret);
+
+ return ret;
+}
+
+static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region)
+{
+ struct vfio_pci_npu2_data *data = region->data;
+
+ memunmap(data->base);
+ kfree(data);
+}
+
+static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region, struct vfio_info_cap *caps)
+{
+ struct vfio_pci_npu2_data *data = region->data;
+ struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
+ .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
+ .header.version = 1,
+ .tgt = data->gpu_tgt
+ };
+ struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
+ .header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
+ .header.version = 1,
+ .link_speed = data->link_speed
+ };
+ int ret;
+
+ ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
+ if (ret)
+ return ret;
+
+ return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
+}
+
+static const struct vfio_pci_regops vfio_pci_npu2_regops = {
+ .rw = vfio_pci_npu2_rw,
+ .mmap = vfio_pci_npu2_mmap,
+ .release = vfio_pci_npu2_release,
+ .add_capability = vfio_pci_npu2_add_capability,
+};
+
+int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+ int ret;
+ struct vfio_pci_npu2_data *data;
+ struct device_node *nvlink_dn;
+ u32 nvlink_index = 0;
+ struct pci_dev *npdev = vdev->pdev;
+ struct device_node *npu_node = pci_device_to_OF_node(npdev);
+ struct pci_controller *hose = pci_bus_to_host(npdev->bus);
+ u64 mmio_atsd = 0;
+ u64 tgt = 0;
+ u32 link_speed = 0xff;
+
+ /*
+ * PCI config space does not tell us about NVLink presense but
+ * platform does, use this.
+ */
+ if (!pnv_pci_get_gpu_dev(vdev->pdev))
+ return -ENODEV;
+
+ /*
+ * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
+ * so we can allocate one register per link, using nvlink index as
+ * a key.
+ * There is always at least one ATSD register so as long as at least
+ * NVLink bridge #0 is passed to the guest, ATSD will be available.
+ */
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return -ENODEV;
+
+ if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
+ &mmio_atsd)) {
+ dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
+ mmio_atsd = 0;
+ }
+
+ if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
+ dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
+ return -EFAULT;
+ }
+
+ if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
+ dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
+ return -EFAULT;
+ }
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ data->mmio_atsd = mmio_atsd;
+ data->gpu_tgt = tgt;
+ data->link_speed = link_speed;
+ if (data->mmio_atsd) {
+ data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
+ if (!data->base) {
+ ret = -ENOMEM;
+ goto free_exit;
+ }
+ }
+
+ /*
+ * We want to expose the capability even if this specific NVLink
+ * did not get its own ATSD register because capabilities
+ * belong to VFIO regions and normally there will be ATSD register
+ * assigned to the NVLink bridge.
+ */
+ ret = vfio_pci_register_dev_region(vdev,
+ PCI_VENDOR_ID_IBM |
+ VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+ VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
+ &vfio_pci_npu2_regops,
+ data->mmio_atsd ? PAGE_SIZE : 0,
+ VFIO_REGION_INFO_FLAG_READ |
+ VFIO_REGION_INFO_FLAG_WRITE |
+ VFIO_REGION_INFO_FLAG_MMAP,
+ data);
+ if (ret)
+ goto free_exit;
+
+ return 0;
+
+free_exit:
+ if (data->base)
+ memunmap(data->base);
+ kfree(data);
+
+ return ret;
+}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d..ee6ee91 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -1,11 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
@@ -59,6 +56,12 @@
size_t count, loff_t *ppos, bool iswrite);
void (*release)(struct vfio_pci_device *vdev,
struct vfio_pci_region *region);
+ int (*mmap)(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region,
+ struct vm_area_struct *vma);
+ int (*add_capability)(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region,
+ struct vfio_info_cap *caps);
};
struct vfio_pci_region {
@@ -76,6 +79,11 @@
struct list_head res_next;
};
+struct vfio_pci_reflck {
+ struct kref kref;
+ struct mutex lock;
+};
+
struct vfio_pci_device {
struct pci_dev *pdev;
void __iomem *barmap[PCI_STD_RESOURCE_END + 1];
@@ -103,7 +111,10 @@
bool has_vga;
bool needs_reset;
bool nointx;
+ bool needs_pm_restore;
struct pci_saved_state *pci_saved_state;
+ struct pci_saved_state *pm_save;
+ struct vfio_pci_reflck *reflck;
int refcnt;
int ioeventfds_nr;
struct eventfd_ctx *err_trigger;
@@ -149,6 +160,10 @@
unsigned int type, unsigned int subtype,
const struct vfio_pci_regops *ops,
size_t size, u32 flags, void *data);
+
+extern int vfio_pci_set_power_state(struct vfio_pci_device *vdev,
+ pci_power_t state);
+
#ifdef CONFIG_VFIO_PCI_IGD
extern int vfio_pci_igd_init(struct vfio_pci_device *vdev);
#else
@@ -157,4 +172,18 @@
return -ENODEV;
}
#endif
+#ifdef CONFIG_VFIO_PCI_NVLINK2
+extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
+{
+ return -ENODEV;
+}
+
+static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
+{
+ return -ENODEV;
+}
+#endif
#endif /* VFIO_PCI_PRIVATE_H */
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index a6029d0..0120d83 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO PCI I/O Port & MMIO access
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
index bb30128..dc1a3c4 100644
--- a/drivers/vfio/platform/Kconfig
+++ b/drivers/vfio/platform/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config VFIO_PLATFORM
tristate "VFIO support for platform devices"
depends on VFIO && EVENTFD && (ARM || ARM64)
diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig
index 392e3c0..1edbe9e 100644
--- a/drivers/vfio/platform/reset/Kconfig
+++ b/drivers/vfio/platform/reset/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config VFIO_PLATFORM_CALXEDAXGMAC_RESET
tristate "VFIO support for calxeda xgmac reset"
depends on VFIO_PLATFORM
diff --git a/drivers/vfio/platform/reset/Makefile b/drivers/vfio/platform/reset/Makefile
index 57abd4f..7294c5e 100644
--- a/drivers/vfio/platform/reset/Makefile
+++ b/drivers/vfio/platform/reset/Makefile
@@ -2,8 +2,6 @@
vfio-platform-calxedaxgmac-y := vfio_platform_calxedaxgmac.o
vfio-platform-amdxgbe-y := vfio_platform_amdxgbe.o
-ccflags-y += -Idrivers/vfio/platform
-
obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o
obj-$(CONFIG_VFIO_PLATFORM_AMDXGBE_RESET) += vfio-platform-amdxgbe.o
obj-$(CONFIG_VFIO_PLATFORM_BCMFLEXRM_RESET) += vfio_platform_bcmflexrm.o
diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
index bcd419c..2d2babe 100644
--- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
+++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c
@@ -1,21 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO platform driver specialized for AMD xgbe reset
* reset code is inherited from AMD xgbe native driver
*
* Copyright (c) 2015 Linaro Ltd.
* www.linaro.org
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/module.h>
@@ -25,7 +14,7 @@
#include <uapi/linux/mdio.h>
#include <linux/delay.h>
-#include "vfio_platform_private.h"
+#include "../vfio_platform_private.h"
#define DMA_MR 0x3000
#define MAC_VR 0x0110
@@ -89,7 +78,8 @@
} while ((pcs_value & MDIO_CTRL1_RESET) && --count);
if (pcs_value & MDIO_CTRL1_RESET)
- pr_warn("%s XGBE PHY reset timeout\n", __func__);
+ dev_warn(vdev->device, "%s: XGBE PHY reset timeout\n",
+ __func__);
/* disable auto-negotiation */
value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_CTRL1);
@@ -114,7 +104,7 @@
usleep_range(500, 600);
if (!count)
- pr_warn("%s MAC SW reset failed\n", __func__);
+ dev_warn(vdev->device, "%s: MAC SW reset failed\n", __func__);
return 0;
}
diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
index d45c3be..16165a6 100644
--- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
+++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c
@@ -23,7 +23,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include "vfio_platform_private.h"
+#include "../vfio_platform_private.h"
/* FlexRM configuration */
#define RING_REGS_SIZE 0x10000
diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
index 49e5df6..f67bab5 100644
--- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
+++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO platform driver specialized for Calxeda xgmac reset
* reset code is inherited from calxeda xgmac native driver
@@ -5,18 +6,6 @@
* Copyright 2010-2011 Calxeda, Inc.
* Copyright (c) 2015 Linaro Ltd.
* www.linaro.org
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/module.h>
@@ -24,7 +13,7 @@
#include <linux/init.h>
#include <linux/io.h>
-#include "vfio_platform_private.h"
+#include "../vfio_platform_private.h"
#define DRIVER_VERSION "0.1"
#define DRIVER_AUTHOR "Eric Auger <eric.auger@linaro.org>"
diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c
index 62dfbfe..9636a2a 100644
--- a/drivers/vfio/platform/vfio_amba.c
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2013 - Virtual Open Systems
* Author: Antonios Motakis <a.motakis@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/module.h>
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 6561751..ae1a5eb 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2013 - Virtual Open Systems
* Author: Antonios Motakis <a.motakis@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/module.h>
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index c0cd824..e8f2bdb 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -1,17 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2013 - Virtual Open Systems
* Author: Antonios Motakis <a.motakis@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
+#define dev_fmt(fmt) "VFIO: " fmt
+
#include <linux/device.h>
#include <linux/acpi.h>
#include <linux/iommu.h>
@@ -63,7 +57,7 @@
adev = ACPI_COMPANION(dev);
if (!adev) {
- pr_err("VFIO: ACPI companion device not found for %s\n",
+ dev_err(dev, "ACPI companion device not found for %s\n",
vdev->name);
return -ENODEV;
}
@@ -638,7 +632,7 @@
ret = device_property_read_string(dev, "compatible",
&vdev->compat);
if (ret)
- pr_err("VFIO: Cannot retrieve compat for %s\n", vdev->name);
+ dev_err(dev, "Cannot retrieve compat for %s\n", vdev->name);
return ret;
}
@@ -680,14 +674,14 @@
ret = vfio_platform_get_reset(vdev);
if (ret && vdev->reset_required) {
- pr_err("VFIO: No reset function found for device %s\n",
- vdev->name);
+ dev_err(dev, "No reset function found for device %s\n",
+ vdev->name);
return ret;
}
group = vfio_iommu_group_get(dev);
if (!group) {
- pr_err("VFIO: No IOMMU group for device %s\n", vdev->name);
+ dev_err(dev, "No IOMMU group for device %s\n", vdev->name);
ret = -EINVAL;
goto put_reset;
}
diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c
index 46d4750..c5b09ec 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO platform devices interrupt handling
*
* Copyright (C) 2013 - Virtual Open Systems
* Author: Antonios Motakis <a.motakis@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/eventfd.h>
diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h
index 85ffe5d..2890899 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 - Virtual Open Systems
* Author: Antonios Motakis <a.motakis@virtualopensystems.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#ifndef VFIO_PLATFORM_PRIVATE_H
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 6483387..3885979 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO core
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
@@ -34,6 +31,7 @@
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
+#include <linux/sched/signal.h>
#define DRIVER_VERSION "0.3"
#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
@@ -704,8 +702,8 @@
return 0;
/* TODO Prevent device auto probing */
- WARN(1, "Device %s added to live group %d!\n", dev_name(dev),
- iommu_group_id(group->iommu_group));
+ dev_WARN(dev, "Device added to live group %d!\n",
+ iommu_group_id(group->iommu_group));
return 0;
}
@@ -748,25 +746,22 @@
*/
break;
case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
- pr_debug("%s: Device %s, group %d binding to driver\n",
- __func__, dev_name(dev),
- iommu_group_id(group->iommu_group));
+ dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
+ iommu_group_id(group->iommu_group));
break;
case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
- pr_debug("%s: Device %s, group %d bound to driver %s\n",
- __func__, dev_name(dev),
- iommu_group_id(group->iommu_group), dev->driver->name);
+ dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
+ iommu_group_id(group->iommu_group), dev->driver->name);
BUG_ON(vfio_group_nb_verify(group, dev));
break;
case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
- pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
- __func__, dev_name(dev),
- iommu_group_id(group->iommu_group), dev->driver->name);
+ dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
+ __func__, iommu_group_id(group->iommu_group),
+ dev->driver->name);
break;
case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
- pr_debug("%s: Device %s, group %d unbound from driver\n",
- __func__, dev_name(dev),
- iommu_group_id(group->iommu_group));
+ dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
+ iommu_group_id(group->iommu_group));
/*
* XXX An unbound device in a live group is ok, but we'd
* really like to avoid the above BUG_ON by preventing other
@@ -830,8 +825,8 @@
device = vfio_group_get_device(group, dev);
if (device) {
- WARN(1, "Device %s already exists on group %d\n",
- dev_name(dev), iommu_group_id(iommu_group));
+ dev_WARN(dev, "Device already exists on group %d\n",
+ iommu_group_id(iommu_group));
vfio_device_put(device);
vfio_group_put(group);
return -EBUSY;
@@ -904,30 +899,17 @@
}
EXPORT_SYMBOL_GPL(vfio_device_data);
-/* Given a referenced group, check if it contains the device */
-static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
-{
- struct vfio_device *device;
-
- device = vfio_group_get_device(group, dev);
- if (!device)
- return false;
-
- vfio_device_put(device);
- return true;
-}
-
/*
* Decrement the device reference count and wait for the device to be
* removed. Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct vfio_device *device = dev_get_drvdata(dev);
struct vfio_group *group = device->group;
void *device_data = device->device_data;
struct vfio_unbound_dev *unbound;
unsigned int i = 0;
- long ret;
bool interrupted = false;
/*
@@ -964,6 +946,8 @@
* interval with counter to allow the driver to take escalating
* measures to release the device if it has the ability to do so.
*/
+ add_wait_queue(&vfio.release_q, &wait);
+
do {
device = vfio_group_get_device(group, dev);
if (!device)
@@ -975,12 +959,10 @@
vfio_device_put(device);
if (interrupted) {
- ret = wait_event_timeout(vfio.release_q,
- !vfio_dev_present(group, dev), HZ * 10);
+ wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
} else {
- ret = wait_event_interruptible_timeout(vfio.release_q,
- !vfio_dev_present(group, dev), HZ * 10);
- if (ret == -ERESTARTSYS) {
+ wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
+ if (signal_pending(current)) {
interrupted = true;
dev_warn(dev,
"Device is currently in use, task"
@@ -989,8 +971,10 @@
current->comm, task_pid_nr(current));
}
}
- } while (ret <= 0);
+ } while (1);
+
+ remove_wait_queue(&vfio.release_q, &wait);
/*
* In order to support multiple devices per group, devices can be
* plucked from the group while other devices in the group are still
@@ -2219,12 +2203,12 @@
vfio.class->devnode = vfio_devnode;
- ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio");
+ ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
if (ret)
goto err_alloc_chrdev;
cdev_init(&vfio.group_cdev, &vfio_group_fops);
- ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK);
+ ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
if (ret)
goto err_cdev_add;
@@ -2236,7 +2220,7 @@
return 0;
err_cdev_add:
- unregister_chrdev_region(vfio.group_devt, MINORMASK);
+ unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
err_alloc_chrdev:
class_destroy(vfio.class);
vfio.class = NULL;
@@ -2254,7 +2238,7 @@
#endif
idr_destroy(&vfio.group_idr);
cdev_del(&vfio.group_cdev);
- unregister_chrdev_region(vfio.group_devt, MINORMASK);
+ unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
class_destroy(vfio.class);
vfio.class = NULL;
misc_deregister(&vfio_dev);
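
For context on the vfio_del_group_dev() rework above: the wait_event_*() helpers re-checked the condition via vfio_dev_present(), which takes a mutex and can therefore sleep while the task is already in a sleeping state, so the code now uses the lower-level woken-wait primitives directly. Stripped of the VFIO specifics, that pattern is roughly the following fragment (a sketch with placeholder names `waitq` and `condition_is_met`; needs <linux/wait.h> and <linux/sched/signal.h>):

	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&waitq, &wait);
	while (!condition_is_met()) {
		/*
		 * Sleeps for up to the timeout or until the queue is woken;
		 * unlike wait_event_*(), the condition is evaluated by the
		 * caller while the task is fully TASK_RUNNING.
		 */
		wait_woken(&wait, TASK_INTERRUPTIBLE, 10 * HZ);
		if (signal_pending(current))
			break;	/* caller decides how to handle the signal */
	}
	remove_wait_queue(&waitq, &wait);
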
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 96721b1..26cef65 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO: IOMMU DMA mapping support for TCE on POWER
*
* Copyright (C) 2013 IBM Corp. All rights reserved.
* Author: Alexey Kardashevskiy <aik@ozlabs.ru>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio_iommu_type1.c:
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
@@ -22,6 +19,7 @@
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
+#include <linux/mm.h>
#include <asm/iommu.h>
#include <asm/tce.h>
@@ -34,51 +32,6 @@
static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
-{
- long ret = 0, locked, lock_limit;
-
- if (WARN_ON_ONCE(!mm))
- return -EPERM;
-
- if (!npages)
- return 0;
-
- down_write(&mm->mmap_sem);
- locked = mm->locked_vm + npages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- ret = -ENOMEM;
- else
- mm->locked_vm += npages;
-
- pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
-
- up_write(&mm->mmap_sem);
-
- return ret;
-}
-
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
-{
- if (!mm || !npages)
- return;
-
- down_write(&mm->mmap_sem);
- if (WARN_ON_ONCE(npages > mm->locked_vm))
- npages = mm->locked_vm;
- mm->locked_vm -= npages;
- pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
- npages << PAGE_SHIFT,
- mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK));
- up_write(&mm->mmap_sem);
-}
-
/*
* VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
*
@@ -152,11 +105,12 @@
struct mm_iommu_table_group_mem_t *mem;
struct tce_iommu_prereg *tcemem;
bool found = false;
+ long ret;
if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
return -EINVAL;
- mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
+ mem = mm_iommu_get(container->mm, vaddr, size >> PAGE_SHIFT);
if (!mem)
return -ENOENT;
@@ -168,9 +122,13 @@
}
if (!found)
- return -ENOENT;
+ ret = -ENOENT;
+ else
+ ret = tce_iommu_prereg_free(container, tcemem);
- return tce_iommu_prereg_free(container, tcemem);
+ mm_iommu_put(container->mm, mem);
+
+ return ret;
}
static long tce_iommu_register_pages(struct tce_container *container,
@@ -185,22 +143,24 @@
((vaddr + size) < vaddr))
return -EINVAL;
- mem = mm_iommu_find(container->mm, vaddr, entries);
+ mem = mm_iommu_get(container->mm, vaddr, entries);
if (mem) {
list_for_each_entry(tcemem, &container->prereg_list, next) {
- if (tcemem->mem == mem)
- return -EBUSY;
+ if (tcemem->mem == mem) {
+ ret = -EBUSY;
+ goto put_exit;
+ }
}
+ } else {
+ ret = mm_iommu_new(container->mm, vaddr, entries, &mem);
+ if (ret)
+ return ret;
}
- ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
- if (ret)
- return ret;
-
tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
if (!tcemem) {
- mm_iommu_put(container->mm, mem);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto put_exit;
}
tcemem->mem = mem;
@@ -209,16 +169,28 @@
container->enabled = true;
return 0;
+
+put_exit:
+ mm_iommu_put(container->mm, mem);
+ return ret;
}
-static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
+ unsigned int it_page_shift)
{
+ struct page *page;
+ unsigned long size = 0;
+
+ if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
+ return size == (1UL << it_page_shift);
+
+ page = pfn_to_page(hpa >> PAGE_SHIFT);
/*
* Check that the TCE table granularity is not bigger than the size of
* a page we just found. Otherwise the hardware can get access to
* a bigger memory chunk that it should.
*/
- return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
+ return page_shift(compound_head(page)) >= it_page_shift;
}
static inline bool tce_groups_attached(struct tce_container *container)
@@ -317,7 +289,7 @@
return ret;
locked = table_group->tce32_size >> PAGE_SHIFT;
- ret = try_increment_locked_vm(container->mm, locked);
+ ret = account_locked_vm(container->mm, locked, true);
if (ret)
return ret;
@@ -336,7 +308,7 @@
container->enabled = false;
BUG_ON(!container->mm);
- decrement_locked_vm(container->mm, container->locked_pages);
+ account_locked_vm(container->mm, container->locked_pages, false);
}
static void *tce_iommu_open(unsigned long arg)
@@ -371,6 +343,7 @@
{
struct tce_container *container = iommu_data;
struct tce_iommu_group *tcegrp;
+ struct tce_iommu_prereg *tcemem, *tmtmp;
long i;
while (tce_groups_attached(container)) {
@@ -393,13 +366,8 @@
tce_iommu_free_table(container, tbl);
}
- while (!list_empty(&container->prereg_list)) {
- struct tce_iommu_prereg *tcemem;
-
- tcemem = list_first_entry(&container->prereg_list,
- struct tce_iommu_prereg, next);
- WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
- }
+ list_for_each_entry_safe(tcemem, tmtmp, &container->prereg_list, next)
+ WARN_ON(tce_iommu_prereg_free(container, tcemem));
tce_iommu_disable(container);
if (container->mm)
@@ -444,7 +412,7 @@
struct mm_iommu_table_group_mem_t *mem = NULL;
int ret;
unsigned long hpa = 0;
- __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
if (!pua)
return;
@@ -467,13 +435,33 @@
unsigned long oldhpa;
long ret;
enum dma_data_direction direction;
+ unsigned long lastentry = entry + pages, firstentry = entry;
- for ( ; pages; --pages, ++entry) {
+ for ( ; entry < lastentry; ++entry) {
+ if (tbl->it_indirect_levels && tbl->it_userspace) {
+ /*
+ * For multilevel tables, we can take a shortcut here
+ * and skip some TCEs as we know that the userspace
+ * addresses cache is a mirror of the real TCE table
+ * and if it is missing some indirect levels, then
+ * the hardware table does not have them allocated
+ * either and therefore does not require updating.
+ */
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl,
+ entry);
+ if (!pua) {
+ /* align to level_size which is power of two */
+ entry |= tbl->it_level_size - 1;
+ continue;
+ }
+ }
+
cond_resched();
direction = DMA_NONE;
oldhpa = 0;
- ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
+ ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry, &oldhpa,
+ &direction);
if (ret)
continue;
@@ -488,6 +476,8 @@
tce_iommu_unuse_page(container, oldhpa);
}
+ iommu_tce_kill(tbl, firstentry, pages);
+
return 0;
}
@@ -497,7 +487,8 @@
enum dma_data_direction direction = iommu_tce_direction(tce);
if (get_user_pages_fast(tce & PAGE_MASK, 1,
- direction != DMA_TO_DEVICE, &page) != 1)
+ direction != DMA_TO_DEVICE ? FOLL_WRITE : 0,
+ &page) != 1)
return -EFAULT;
*hpa = __pa((unsigned long) page_address(page));
@@ -511,7 +502,6 @@
enum dma_data_direction direction)
{
long i, ret = 0;
- struct page *page;
unsigned long hpa;
enum dma_data_direction dirtmp;
@@ -522,15 +512,16 @@
if (ret)
break;
- page = pfn_to_page(hpa >> PAGE_SHIFT);
- if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+ if (!tce_page_is_contained(container->mm, hpa,
+ tbl->it_page_shift)) {
ret = -EPERM;
break;
}
hpa |= offset;
dirtmp = direction;
- ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+ ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+ &hpa, &dirtmp);
if (ret) {
tce_iommu_unuse_page(container, hpa);
pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
@@ -547,6 +538,8 @@
if (ret)
tce_iommu_clear(container, tbl, entry, i);
+ else
+ iommu_tce_kill(tbl, entry, pages);
return ret;
}
@@ -557,7 +550,6 @@
enum dma_data_direction direction)
{
long i, ret = 0;
- struct page *page;
unsigned long hpa;
enum dma_data_direction dirtmp;
@@ -570,8 +562,8 @@
if (ret)
break;
- page = pfn_to_page(hpa >> PAGE_SHIFT);
- if (!tce_page_is_contained(page, tbl->it_page_shift)) {
+ if (!tce_page_is_contained(container->mm, hpa,
+ tbl->it_page_shift)) {
ret = -EPERM;
break;
}
@@ -584,7 +576,8 @@
if (mm_iommu_mapped_inc(mem))
break;
- ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
+ ret = iommu_tce_xchg_no_kill(container->mm, tbl, entry + i,
+ &hpa, &dirtmp);
if (ret) {
/* dirtmp cannot be DMA_NONE here */
tce_iommu_unuse_page_v2(container, tbl, entry + i);
@@ -604,6 +597,8 @@
if (ret)
tce_iommu_clear(container, tbl, entry, i);
+ else
+ iommu_tce_kill(tbl, entry, pages);
return ret;
}
@@ -623,7 +618,7 @@
if (!table_size)
return -EINVAL;
- ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
+ ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true);
if (ret)
return ret;
@@ -642,7 +637,7 @@
unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
iommu_tce_table_put(tbl);
- decrement_locked_vm(container->mm, pages);
+ account_locked_vm(container->mm, pages, false);
}
static long tce_iommu_create_window(struct tce_container *container,
@@ -1200,7 +1195,8 @@
}
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
- table_group->ops->unset_window(table_group, i);
+ if (container->tables[i])
+ table_group->ops->unset_window(table_group, i);
table_group->ops->release_ownership(table_group);
}
@@ -1244,7 +1240,7 @@
static int tce_iommu_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
- int ret;
+ int ret = 0;
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
struct tce_iommu_group *tcegrp = NULL;
@@ -1297,13 +1293,13 @@
!table_group->ops->release_ownership) {
if (container->v2) {
ret = -EPERM;
- goto unlock_exit;
+ goto free_exit;
}
ret = tce_iommu_take_ownership(container, table_group);
} else {
if (!container->v2) {
ret = -EPERM;
- goto unlock_exit;
+ goto free_exit;
}
ret = tce_iommu_take_ownership_ddw(container, table_group);
if (!tce_groups_attached(container) && !container->tables[0])
@@ -1315,10 +1311,11 @@
list_add(&tcegrp->next, &container->group_list);
}
-unlock_exit:
+free_exit:
if (ret && tcegrp)
kfree(tcegrp);
+unlock_exit:
mutex_unlock(&container->lock);
return ret;
@@ -1362,7 +1359,7 @@
mutex_unlock(&container->lock);
}
-const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
+static const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
.name = "iommu-vfio-powerpc",
.owner = THIS_MODULE,
.open = tce_iommu_open,
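
The spapr_tce hunks above drop the driver-local try_increment_locked_vm()/decrement_locked_vm() helpers in favour of the generic account_locked_vm() from mm/util.c (declared in <linux/mm.h>, which this file now includes); the same conversion appears in vfio_iommu_type1.c below using the __ variant. A rough sketch of the correspondence, for orientation only:

	/* was: ret = try_increment_locked_vm(container->mm, npages); */
	ret = account_locked_vm(container->mm, npages, true);

	/* was: decrement_locked_vm(container->mm, npages); */
	account_locked_vm(container->mm, npages, false);

	/*
	 * Type1 charges the task that created the mapping and honours its
	 * CAP_IPC_LOCK, so it takes mmap_sem itself and uses the variant
	 * that expects the semaphore to be held:
	 */
	ret = down_write_killable(&mm->mmap_sem);
	if (!ret) {
		ret = __account_locked_vm(mm, abs(npage), npage > 0,
					  dma->task, dma->lock_cap);
		up_write(&mm->mmap_sem);
	}
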
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d9fd318..d864277 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO: IOMMU DMA mapping support for Type1 IOMMU
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Derived from original vfio:
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
* Author: Tom Lyon, pugs@cisco.com
@@ -58,12 +55,19 @@
MODULE_PARM_DESC(disable_hugepages,
"Disable VFIO IOMMU support for IOMMU hugepages.");
+static unsigned int dma_entry_limit __read_mostly = U16_MAX;
+module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
+MODULE_PARM_DESC(dma_entry_limit,
+ "Maximum number of user DMA mappings per container (65535).");
+
struct vfio_iommu {
struct list_head domain_list;
+ struct list_head iova_list;
struct vfio_domain *external_domain; /* domain for external user */
struct mutex lock;
struct rb_root dma_list;
struct blocking_notifier_head notifier;
+ unsigned int dma_avail;
bool v2;
bool nesting;
};
@@ -91,6 +95,13 @@
struct vfio_group {
struct iommu_group *iommu_group;
struct list_head next;
+ bool mdev_group; /* An mdev group */
+};
+
+struct vfio_iova {
+ struct list_head list;
+ dma_addr_t start;
+ dma_addr_t end;
};
/*
@@ -268,21 +279,8 @@
ret = down_write_killable(&mm->mmap_sem);
if (!ret) {
- if (npage > 0) {
- if (!dma->lock_cap) {
- unsigned long limit;
-
- limit = task_rlimit(dma->task,
- RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
- if (mm->locked_vm + npage > limit)
- ret = -ENOMEM;
- }
- }
-
- if (!ret)
- mm->locked_vm += npage;
-
+ ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
+ dma->lock_cap);
up_write(&mm->mmap_sem);
}
@@ -351,7 +349,8 @@
down_read(&mm->mmap_sem);
if (mm == current->mm) {
- ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas);
+ ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
+ vmas);
} else {
ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
vmas, NULL);
@@ -376,6 +375,8 @@
down_read(&mm->mmap_sem);
+ vaddr = untagged_addr(vaddr);
+
vma = find_vma_intersection(mm, vaddr, vaddr + 1);
if (vma && vma->vm_flags & VM_PFNMAP) {
@@ -558,7 +559,7 @@
mutex_lock(&iommu->lock);
/* Fail if notifier list is empty */
- if ((!iommu->external_domain) || (!iommu->notifier.head)) {
+ if (!iommu->notifier.head) {
ret = -EINVAL;
goto pin_done;
}
@@ -640,11 +641,6 @@
mutex_lock(&iommu->lock);
- if (!iommu->external_domain) {
- mutex_unlock(&iommu->lock);
- return -EINVAL;
- }
-
do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
for (i = 0; i < npage; i++) {
struct vfio_dma *dma;
@@ -663,12 +659,13 @@
}
static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
- struct list_head *regions)
+ struct list_head *regions,
+ struct iommu_iotlb_gather *iotlb_gather)
{
long unlocked = 0;
struct vfio_regions *entry, *next;
- iommu_tlb_sync(domain->domain);
+ iommu_tlb_sync(domain->domain, iotlb_gather);
list_for_each_entry_safe(entry, next, regions, list) {
unlocked += vfio_unpin_pages_remote(dma,
@@ -698,18 +695,19 @@
struct vfio_dma *dma, dma_addr_t *iova,
size_t len, phys_addr_t phys, long *unlocked,
struct list_head *unmapped_list,
- int *unmapped_cnt)
+ int *unmapped_cnt,
+ struct iommu_iotlb_gather *iotlb_gather)
{
size_t unmapped = 0;
struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
if (entry) {
- unmapped = iommu_unmap_fast(domain->domain, *iova, len);
+ unmapped = iommu_unmap_fast(domain->domain, *iova, len,
+ iotlb_gather);
if (!unmapped) {
kfree(entry);
} else {
- iommu_tlb_range_add(domain->domain, *iova, unmapped);
entry->iova = *iova;
entry->phys = phys;
entry->len = unmapped;
@@ -725,8 +723,8 @@
* or in case of errors.
*/
if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
- *unlocked += vfio_sync_unpin(dma, domain,
- unmapped_list);
+ *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
+ iotlb_gather);
*unmapped_cnt = 0;
}
@@ -757,6 +755,7 @@
dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
struct vfio_domain *domain, *d;
LIST_HEAD(unmapped_region_list);
+ struct iommu_iotlb_gather iotlb_gather;
int unmapped_region_cnt = 0;
long unlocked = 0;
@@ -781,6 +780,7 @@
cond_resched();
}
+ iommu_iotlb_gather_init(&iotlb_gather);
while (iova < end) {
size_t unmapped, len;
phys_addr_t phys, next;
@@ -809,7 +809,8 @@
*/
unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
&unlocked, &unmapped_region_list,
- &unmapped_region_cnt);
+ &unmapped_region_cnt,
+ &iotlb_gather);
if (!unmapped) {
unmapped = unmap_unpin_slow(domain, dma, &iova, len,
phys, &unlocked);
@@ -820,8 +821,10 @@
dma->iommu_mapped = false;
- if (unmapped_region_cnt)
- unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list);
+ if (unmapped_region_cnt) {
+ unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
+ &iotlb_gather);
+ }
if (do_accounting) {
vfio_lock_acct(dma, -unlocked, true);
@@ -836,6 +839,7 @@
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
kfree(dma);
+ iommu->dma_avail++;
}
static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
@@ -878,7 +882,7 @@
return -EINVAL;
if (!unmap->size || unmap->size & mask)
return -EINVAL;
- if (unmap->iova + unmap->size < unmap->iova ||
+ if (unmap->iova + unmap->size - 1 < unmap->iova ||
unmap->size > SIZE_MAX)
return -EINVAL;
@@ -978,32 +982,6 @@
return ret;
}
-/*
- * Turns out AMD IOMMU has a page table bug where it won't map large pages
- * to a region that previously mapped smaller pages. This should be fixed
- * soon, so this is just a temporary workaround to break mappings down into
- * PAGE_SIZE. Better to map smaller pages than nothing.
- */
-static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
- unsigned long pfn, long npage, int prot)
-{
- long i;
- int ret = 0;
-
- for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
- ret = iommu_map(domain->domain, iova,
- (phys_addr_t)pfn << PAGE_SHIFT,
- PAGE_SIZE, prot | domain->prot);
- if (ret)
- break;
- }
-
- for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
- iommu_unmap(domain->domain, iova, PAGE_SIZE);
-
- return ret;
-}
-
static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
unsigned long pfn, long npage, int prot)
{
@@ -1013,11 +991,8 @@
list_for_each_entry(d, &iommu->domain_list, next) {
ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
npage << PAGE_SHIFT, prot | d->prot);
- if (ret) {
- if (ret != -EBUSY ||
- map_try_harder(d, iova, pfn, npage, prot))
- goto unwind;
- }
+ if (ret)
+ goto unwind;
cond_resched();
}
@@ -1072,6 +1047,27 @@
return ret;
}
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+ dma_addr_t start, dma_addr_t end)
+{
+ struct list_head *iova = &iommu->iova_list;
+ struct vfio_iova *node;
+
+ list_for_each_entry(node, iova, list) {
+ if (start >= node->start && end <= node->end)
+ return true;
+ }
+
+ /*
+ * Check for list_empty() as well since a container with
+ * a single mdev device will have an empty list.
+ */
+ return list_empty(iova);
+}
+
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
@@ -1110,12 +1106,23 @@
goto out_unlock;
}
+ if (!iommu->dma_avail) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
ret = -ENOMEM;
goto out_unlock;
}
+ iommu->dma_avail--;
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
@@ -1298,15 +1305,13 @@
return NULL;
}
-static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
+static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+ phys_addr_t *base)
{
- struct list_head group_resv_regions;
- struct iommu_resv_region *region, *next;
+ struct iommu_resv_region *region;
bool ret = false;
- INIT_LIST_HEAD(&group_resv_regions);
- iommu_get_group_resv_regions(group, &group_resv_regions);
- list_for_each_entry(region, &group_resv_regions, list) {
+ list_for_each_entry(region, group_resv_regions, list) {
/*
* The presence of any 'real' MSI regions should take
* precedence over the software-managed one if the
@@ -1322,21 +1327,341 @@
ret = true;
}
}
- list_for_each_entry_safe(region, next, &group_resv_regions, list)
- kfree(region);
+
return ret;
}
+static struct device *vfio_mdev_get_iommu_device(struct device *dev)
+{
+ struct device *(*fn)(struct device *dev);
+ struct device *iommu_device;
+
+ fn = symbol_get(mdev_get_iommu_device);
+ if (fn) {
+ iommu_device = fn(dev);
+ symbol_put(mdev_get_iommu_device);
+
+ return iommu_device;
+ }
+
+ return NULL;
+}
+
+static int vfio_mdev_attach_domain(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+ struct device *iommu_device;
+
+ iommu_device = vfio_mdev_get_iommu_device(dev);
+ if (iommu_device) {
+ if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
+ return iommu_aux_attach_device(domain, iommu_device);
+ else
+ return iommu_attach_device(domain, iommu_device);
+ }
+
+ return -EINVAL;
+}
+
+static int vfio_mdev_detach_domain(struct device *dev, void *data)
+{
+ struct iommu_domain *domain = data;
+ struct device *iommu_device;
+
+ iommu_device = vfio_mdev_get_iommu_device(dev);
+ if (iommu_device) {
+ if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
+ iommu_aux_detach_device(domain, iommu_device);
+ else
+ iommu_detach_device(domain, iommu_device);
+ }
+
+ return 0;
+}
+
+static int vfio_iommu_attach_group(struct vfio_domain *domain,
+ struct vfio_group *group)
+{
+ if (group->mdev_group)
+ return iommu_group_for_each_dev(group->iommu_group,
+ domain->domain,
+ vfio_mdev_attach_domain);
+ else
+ return iommu_attach_group(domain->domain, group->iommu_group);
+}
+
+static void vfio_iommu_detach_group(struct vfio_domain *domain,
+ struct vfio_group *group)
+{
+ if (group->mdev_group)
+ iommu_group_for_each_dev(group->iommu_group, domain->domain,
+ vfio_mdev_detach_domain);
+ else
+ iommu_detach_group(domain->domain, group->iommu_group);
+}
+
+static bool vfio_bus_is_mdev(struct bus_type *bus)
+{
+ struct bus_type *mdev_bus;
+ bool ret = false;
+
+ mdev_bus = symbol_get(mdev_bus_type);
+ if (mdev_bus) {
+ ret = (bus == mdev_bus);
+ symbol_put(mdev_bus_type);
+ }
+
+ return ret;
+}
+
+static int vfio_mdev_iommu_device(struct device *dev, void *data)
+{
+ struct device **old = data, *new;
+
+ new = vfio_mdev_get_iommu_device(dev);
+ if (!new || (*old && *old != new))
+ return -EINVAL;
+
+ *old = new;
+
+ return 0;
+}
+
+/*
+ * This is a helper function to insert an address range to iova list.
+ * The list is initially created with a single entry corresponding to
+ * the IOMMU domain geometry to which the device group is attached.
+ * The list aperture gets modified when a new domain is added to the
+ * container if the new aperture doesn't conflict with the current one
+ * or with any existing dma mappings. The list is also modified to
+ * exclude any reserved regions associated with the device group.
+ */
+static int vfio_iommu_iova_insert(struct list_head *head,
+ dma_addr_t start, dma_addr_t end)
+{
+ struct vfio_iova *region;
+
+ region = kmalloc(sizeof(*region), GFP_KERNEL);
+ if (!region)
+ return -ENOMEM;
+
+	INIT_LIST_HEAD(&region->list);
+ region->start = start;
+ region->end = end;
+
+	list_add_tail(&region->list, head);
+ return 0;
+}
+
+/*
+ * Check whether the new iommu aperture conflicts with the existing aperture
+ * or with any existing dma mappings.
+ */
+static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+ dma_addr_t start, dma_addr_t end)
+{
+ struct vfio_iova *first, *last;
+ struct list_head *iova = &iommu->iova_list;
+
+ if (list_empty(iova))
+ return false;
+
+ /* Disjoint sets, return conflict */
+ first = list_first_entry(iova, struct vfio_iova, list);
+ last = list_last_entry(iova, struct vfio_iova, list);
+ if (start > last->end || end < first->start)
+ return true;
+
+ /* Check for any existing dma mappings below the new start */
+ if (start > first->start) {
+ if (vfio_find_dma(iommu, first->start, start - first->start))
+ return true;
+ }
+
+ /* Check for any existing dma mappings beyond the new end */
+ if (end < last->end) {
+ if (vfio_find_dma(iommu, end + 1, last->end - end))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Resize iommu iova aperture window. This is called only if the new
+ * aperture has no conflict with existing aperture and dma mappings.
+ */
+static int vfio_iommu_aper_resize(struct list_head *iova,
+ dma_addr_t start, dma_addr_t end)
+{
+ struct vfio_iova *node, *next;
+
+ if (list_empty(iova))
+ return vfio_iommu_iova_insert(iova, start, end);
+
+ /* Adjust iova list start */
+ list_for_each_entry_safe(node, next, iova, list) {
+ if (start < node->start)
+ break;
+ if (start >= node->start && start < node->end) {
+ node->start = start;
+ break;
+ }
+ /* Delete nodes before new start */
+ list_del(&node->list);
+ kfree(node);
+ }
+
+ /* Adjust iova list end */
+ list_for_each_entry_safe(node, next, iova, list) {
+ if (end > node->end)
+ continue;
+ if (end > node->start && end <= node->end) {
+ node->end = end;
+ continue;
+ }
+ /* Delete nodes after new end */
+ list_del(&node->list);
+ kfree(node);
+ }
+
+ return 0;
+}
+
+/*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+ struct list_head *resv_regions)
+{
+ struct iommu_resv_region *region;
+
+ /* Check for conflict with existing dma mappings */
+ list_for_each_entry(region, resv_regions, list) {
+ if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ if (vfio_find_dma(iommu, region->start, region->length))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check iova region overlap with reserved regions and
+ * exclude them from the iommu iova range
+ */
+static int vfio_iommu_resv_exclude(struct list_head *iova,
+ struct list_head *resv_regions)
+{
+ struct iommu_resv_region *resv;
+ struct vfio_iova *n, *next;
+
+ list_for_each_entry(resv, resv_regions, list) {
+ phys_addr_t start, end;
+
+ if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
+ continue;
+
+ start = resv->start;
+ end = resv->start + resv->length - 1;
+
+ list_for_each_entry_safe(n, next, iova, list) {
+ int ret = 0;
+
+ /* No overlap */
+ if (start > n->end || end < n->start)
+ continue;
+ /*
+			 * Insert a new node if the current node overlaps with the
+			 * reserved region, to exclude it from the valid iova range.
+			 * Note that the new node is inserted before the current
+ * node and finally the current node is deleted keeping
+ * the list updated and sorted.
+ */
+ if (start > n->start)
+ ret = vfio_iommu_iova_insert(&n->list, n->start,
+ start - 1);
+ if (!ret && end < n->end)
+ ret = vfio_iommu_iova_insert(&n->list, end + 1,
+ n->end);
+ if (ret)
+ return ret;
+
+ list_del(&n->list);
+ kfree(n);
+ }
+ }
+
+ if (list_empty(iova))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void vfio_iommu_resv_free(struct list_head *resv_regions)
+{
+ struct iommu_resv_region *n, *next;
+
+ list_for_each_entry_safe(n, next, resv_regions, list) {
+ list_del(&n->list);
+ kfree(n);
+ }
+}
+
+static void vfio_iommu_iova_free(struct list_head *iova)
+{
+ struct vfio_iova *n, *next;
+
+ list_for_each_entry_safe(n, next, iova, list) {
+ list_del(&n->list);
+ kfree(n);
+ }
+}
+
+static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
+ struct list_head *iova_copy)
+{
+ struct list_head *iova = &iommu->iova_list;
+ struct vfio_iova *n;
+ int ret;
+
+ list_for_each_entry(n, iova, list) {
+ ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
+ if (ret)
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ vfio_iommu_iova_free(iova_copy);
+ return ret;
+}
+
+static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
+ struct list_head *iova_copy)
+{
+ struct list_head *iova = &iommu->iova_list;
+
+ vfio_iommu_iova_free(iova);
+
+ list_splice_tail(iova_copy, iova);
+}
static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_group *group;
struct vfio_domain *domain, *d;
- struct bus_type *bus = NULL, *mdev_bus;
+ struct bus_type *bus = NULL;
int ret;
bool resv_msi, msi_remap;
- phys_addr_t resv_msi_base;
+ phys_addr_t resv_msi_base = 0;
+ struct iommu_domain_geometry geo;
+ LIST_HEAD(iova_copy);
+ LIST_HEAD(group_resv_regions);
mutex_lock(&iommu->lock);
@@ -1368,23 +1693,30 @@
if (ret)
goto out_free;
- mdev_bus = symbol_get(mdev_bus_type);
+ if (vfio_bus_is_mdev(bus)) {
+ struct device *iommu_device = NULL;
- if (mdev_bus) {
- if ((bus == mdev_bus) && !iommu_present(bus)) {
- symbol_put(mdev_bus_type);
+ group->mdev_group = true;
+
+ /* Determine the isolation type */
+ ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
+ vfio_mdev_iommu_device);
+ if (ret || !iommu_device) {
if (!iommu->external_domain) {
INIT_LIST_HEAD(&domain->group_list);
iommu->external_domain = domain;
- } else
+ } else {
kfree(domain);
+ }
list_add(&group->next,
&iommu->external_domain->group_list);
mutex_unlock(&iommu->lock);
+
return 0;
}
- symbol_put(mdev_bus_type);
+
+ bus = iommu_device->bus;
}
domain->domain = iommu_domain_alloc(bus);
@@ -1402,11 +1734,47 @@
goto out_domain;
}
- ret = iommu_attach_group(domain->domain, iommu_group);
+ ret = vfio_iommu_attach_group(domain, group);
if (ret)
goto out_domain;
- resv_msi = vfio_iommu_has_sw_msi(iommu_group, &resv_msi_base);
+ /* Get aperture info */
+ iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
+
+ if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
+ geo.aperture_end)) {
+ ret = -EINVAL;
+ goto out_detach;
+ }
+
+ ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
+ if (ret)
+ goto out_detach;
+
+ if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
+ ret = -EINVAL;
+ goto out_detach;
+ }
+
+ /*
+ * We don't want to work on the original iova list as the list
+ * gets modified and in case of failure we have to retain the
+ * original list. Get a copy here.
+ */
+ ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
+ if (ret)
+ goto out_detach;
+
+ ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
+ geo.aperture_end);
+ if (ret)
+ goto out_detach;
+
+ ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
+ if (ret)
+ goto out_detach;
+
+ resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
INIT_LIST_HEAD(&domain->group_list);
list_add(&group->next, &domain->group_list);
@@ -1434,16 +1802,15 @@
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
d->prot == domain->prot) {
- iommu_detach_group(domain->domain, iommu_group);
- if (!iommu_attach_group(d->domain, iommu_group)) {
+ vfio_iommu_detach_group(domain, group);
+ if (!vfio_iommu_attach_group(d, group)) {
list_add(&group->next, &d->group_list);
iommu_domain_free(domain->domain);
kfree(domain);
- mutex_unlock(&iommu->lock);
- return 0;
+ goto done;
}
- ret = iommu_attach_group(domain->domain, iommu_group);
+ ret = vfio_iommu_attach_group(domain, group);
if (ret)
goto out_domain;
}
@@ -1463,15 +1830,20 @@
}
list_add(&domain->next, &iommu->domain_list);
-
+done:
+ /* Delete the old one and insert new iova list */
+ vfio_iommu_iova_insert_copy(iommu, &iova_copy);
mutex_unlock(&iommu->lock);
+ vfio_iommu_resv_free(&group_resv_regions);
return 0;
out_detach:
- iommu_detach_group(domain->domain, iommu_group);
+ vfio_iommu_detach_group(domain, group);
out_domain:
iommu_domain_free(domain->domain);
+ vfio_iommu_iova_free(&iova_copy);
+ vfio_iommu_resv_free(&group_resv_regions);
out_free:
kfree(domain);
kfree(group);
@@ -1527,12 +1899,93 @@
WARN_ON(iommu->notifier.head);
}
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+ struct list_head *iova_copy)
+{
+ struct vfio_domain *domain;
+ struct iommu_domain_geometry geo;
+ struct vfio_iova *node;
+ dma_addr_t start = 0;
+ dma_addr_t end = (dma_addr_t)~0;
+
+ if (list_empty(iova_copy))
+ return;
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+ &geo);
+ if (geo.aperture_start > start)
+ start = geo.aperture_start;
+ if (geo.aperture_end < end)
+ end = geo.aperture_end;
+ }
+
+ /* Modify aperture limits. The new aper is either same or bigger */
+ node = list_first_entry(iova_copy, struct vfio_iova, list);
+ node->start = start;
+ node = list_last_entry(iova_copy, struct vfio_iova, list);
+ node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+ struct list_head *iova_copy)
+{
+ struct vfio_domain *d;
+ struct vfio_group *g;
+ struct vfio_iova *node;
+ dma_addr_t start, end;
+ LIST_HEAD(resv_regions);
+ int ret;
+
+ if (list_empty(iova_copy))
+ return -EINVAL;
+
+ list_for_each_entry(d, &iommu->domain_list, next) {
+ list_for_each_entry(g, &d->group_list, next) {
+ ret = iommu_get_group_resv_regions(g->iommu_group,
+ &resv_regions);
+ if (ret)
+ goto done;
+ }
+ }
+
+ node = list_first_entry(iova_copy, struct vfio_iova, list);
+ start = node->start;
+ node = list_last_entry(iova_copy, struct vfio_iova, list);
+ end = node->end;
+
+ /* purge the iova list and create new one */
+ vfio_iommu_iova_free(iova_copy);
+
+ ret = vfio_iommu_aper_resize(iova_copy, start, end);
+ if (ret)
+ goto done;
+
+ /* Exclude current reserved regions from iova ranges */
+ ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
+done:
+ vfio_iommu_resv_free(&resv_regions);
+ return ret;
+}
+
static void vfio_iommu_type1_detach_group(void *iommu_data,
struct iommu_group *iommu_group)
{
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
+ LIST_HEAD(iova_copy);
mutex_lock(&iommu->lock);
@@ -1555,12 +2008,19 @@
}
}
+ /*
+ * Get a copy of iova list. This will be used to update
+ * and to replace the current one later. Please note that
+ * we will leave the original list as it is if update fails.
+ */
+ vfio_iommu_iova_get_copy(iommu, &iova_copy);
+
list_for_each_entry(domain, &iommu->domain_list, next) {
group = find_iommu_group(domain, iommu_group);
if (!group)
continue;
- iommu_detach_group(domain->domain, iommu_group);
+ vfio_iommu_detach_group(domain, group);
list_del(&group->next);
kfree(group);
/*
@@ -1580,10 +2040,16 @@
iommu_domain_free(domain->domain);
list_del(&domain->next);
kfree(domain);
+ vfio_iommu_aper_expand(iommu, &iova_copy);
}
break;
}
+ if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+ vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+ else
+ vfio_iommu_iova_free(&iova_copy);
+
detach_group_done:
mutex_unlock(&iommu->lock);
}
@@ -1611,7 +2077,9 @@
}
INIT_LIST_HEAD(&iommu->domain_list);
+ INIT_LIST_HEAD(&iommu->iova_list);
iommu->dma_list = RB_ROOT;
+ iommu->dma_avail = dma_entry_limit;
mutex_init(&iommu->lock);
BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
@@ -1625,7 +2093,7 @@
list_for_each_entry_safe(group, group_tmp,
&domain->group_list, next) {
if (!external)
- iommu_detach_group(domain->domain, group->iommu_group);
+ vfio_iommu_detach_group(domain, group);
list_del(&group->next);
kfree(group);
}
@@ -1653,6 +2121,9 @@
list_del(&domain->next);
kfree(domain);
}
+
+ vfio_iommu_iova_free(&iommu->iova_list);
+
kfree(iommu);
}
@@ -1673,6 +2144,73 @@
return ret;
}
+static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+ size_t size)
+{
+ struct vfio_info_cap_header *header;
+ struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+
+ header = vfio_info_cap_add(caps, size,
+ VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+ if (IS_ERR(header))
+ return PTR_ERR(header);
+
+ iova_cap = container_of(header,
+ struct vfio_iommu_type1_info_cap_iova_range,
+ header);
+ iova_cap->nr_iovas = cap_iovas->nr_iovas;
+ memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+ cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+ return 0;
+}
+
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+ struct vfio_info_cap *caps)
+{
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+ struct vfio_iova *iova;
+ size_t size;
+ int iovas = 0, i = 0, ret;
+
+ mutex_lock(&iommu->lock);
+
+ list_for_each_entry(iova, &iommu->iova_list, list)
+ iovas++;
+
+ if (!iovas) {
+ /*
+ * Return 0 as a container with a single mdev device
+ * will have an empty list
+ */
+ ret = 0;
+ goto out_unlock;
+ }
+
+ size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
+
+ cap_iovas = kzalloc(size, GFP_KERNEL);
+ if (!cap_iovas) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ cap_iovas->nr_iovas = iovas;
+
+ list_for_each_entry(iova, &iommu->iova_list, list) {
+ cap_iovas->iova_ranges[i].start = iova->start;
+ cap_iovas->iova_ranges[i].end = iova->end;
+ i++;
+ }
+
+ ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+
+ kfree(cap_iovas);
+out_unlock:
+ mutex_unlock(&iommu->lock);
+ return ret;
+}
+
static long vfio_iommu_type1_ioctl(void *iommu_data,
unsigned int cmd, unsigned long arg)
{
@@ -1694,19 +2232,53 @@
}
} else if (cmd == VFIO_IOMMU_GET_INFO) {
struct vfio_iommu_type1_info info;
+ struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+ unsigned long capsz;
+ int ret;
minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
+ /* For backward compatibility, cannot require this */
+ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
+ if (info.argsz >= capsz) {
+ minsz = capsz;
+ info.cap_offset = 0; /* output, no-recopy necessary */
+ }
+
info.flags = VFIO_IOMMU_INFO_PGSIZES;
info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
+ ret = vfio_iommu_iova_build_caps(iommu, &caps);
+ if (ret)
+ return ret;
+
+ if (caps.size) {
+ info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+ if (info.argsz < sizeof(info) + caps.size) {
+ info.argsz = sizeof(info) + caps.size;
+ } else {
+ vfio_info_cap_shift(&caps, sizeof(info));
+ if (copy_to_user((void __user *)arg +
+ sizeof(info), caps.buf,
+ caps.size)) {
+ kfree(caps.buf);
+ return -EFAULT;
+ }
+ info.cap_offset = sizeof(info);
+ }
+
+ kfree(caps.buf);
+ }
+
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
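
With the iova-list tracking added above, VFIO_IOMMU_GET_INFO can report the usable IOVA ranges through the new VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability. A minimal userspace sketch of consuming it follows (assuming `container` is an open /dev/vfio/vfio fd with the type1 IOMMU already set; error handling trimmed):

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Print the valid IOVA ranges advertised by a type1 container. */
static void dump_iova_ranges(int container)
{
	struct vfio_iommu_type1_info probe = { .argsz = sizeof(probe) };
	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *hdr;
	struct vfio_iommu_type1_info_cap_iova_range *cap;
	unsigned int i;

	/* The first call reports the argsz needed to hold the capability chain. */
	if (ioctl(container, VFIO_IOMMU_GET_INFO, &probe))
		return;

	info = calloc(1, probe.argsz);
	if (!info)
		return;
	info->argsz = probe.argsz;
	if (ioctl(container, VFIO_IOMMU_GET_INFO, info) ||
	    !(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset)
		goto out;

	for (hdr = (void *)((char *)info + info->cap_offset); ;
	     hdr = (void *)((char *)info + hdr->next)) {
		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
			cap = (struct vfio_iommu_type1_info_cap_iova_range *)hdr;
			for (i = 0; i < cap->nr_iovas; i++)
				printf("iova range %u: 0x%llx..0x%llx\n", i,
				       (unsigned long long)cap->iova_ranges[i].start,
				       (unsigned long long)cap->iova_ranges[i].end);
		}
		if (!hdr->next)
			break;
	}
out:
	free(info);
}
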
diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
index 38edeb4..67f55ac 100644
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ b/drivers/vfio/vfio_spapr_eeh.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* EEH functionality support for VFIO devices. The feature is only
* available on sPAPR compatible platforms.
*
* Copyright Gavin Shan, IBM Corporation 2014.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -74,13 +71,13 @@
ret = eeh_pe_get_state(pe);
break;
case VFIO_EEH_PE_RESET_DEACTIVATE:
- ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
+ ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE, true);
break;
case VFIO_EEH_PE_RESET_HOT:
- ret = eeh_pe_reset(pe, EEH_RESET_HOT);
+ ret = eeh_pe_reset(pe, EEH_RESET_HOT, true);
break;
case VFIO_EEH_PE_RESET_FUNDAMENTAL:
- ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
+ ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL, true);
break;
case VFIO_EEH_PE_CONFIGURE:
ret = eeh_pe_configure(pe);
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 2a1be85..997cb5d 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VFIO generic eventfd code for IRQFD support.
* Derived from drivers/vfio/pci/vfio_pci_intrs.c
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/vfio.h>