From 1e736f1489563470b58ede3fae5274a624280cf1 Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Mon, 4 Aug 2025 12:44:31 -0700 Subject: [PATCH 01/48] vfio/pci: print vfio-device syspath to fdinfo Print the PCI device syspath to a vfio device's fdinfo. This enables tools to query which device is associated with a given vfio device fd. This results in output like below: $ cat /proc/"$SOME_PID"/fdinfo/"$VFIO_FD" | grep vfio vfio-device-syspath: /sys/devices/pci0000:e0/0000:e0:01.1/0000:e1:00.0/0000:e2:05.0/0000:e8:00.0 Signed-off-by: Alex Mastro Reviewed-by: Amit Machhiwal Tested-by: Amit Machhiwal Link: https://lore.kernel.org/r/20250804-show-fdinfo-v4-1-96b14c5691b3@fb.com Signed-off-by: Alex Williamson --- Documentation/filesystems/proc.rst | 14 ++++++++++++++ drivers/vfio/vfio_main.c | 20 ++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..ed8d23b677ca 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2166,6 +2166,20 @@ DMA Buffer files where 'size' is the size of the DMA buffer in bytes. 'count' is the file count of the DMA buffer file. 'exp_name' is the name of the DMA buffer exporter. +VFIO Device files +~~~~~~~~~~~~~~~~~ + +:: + + pos: 0 + flags: 02000002 + mnt_id: 17 + ino: 5122 + vfio-device-syspath: /sys/devices/pci0000:e0/0000:e0:01.1/0000:e1:00.0/0000:e2:05.0/0000:e8:00.0 + +where 'vfio-device-syspath' is the sysfs path corresponding to the VFIO device +file.
+ 3.9 /proc/<pid>/map_files - Information about memory mapped files --------------------------------------------------------------------- This directory contains symbolic links which represent memory mapped files diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5046cae05222..91a8eae308ea 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1355,6 +1356,22 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) return device->ops->mmap(device, vma); } +#ifdef CONFIG_PROC_FS +static void vfio_device_show_fdinfo(struct seq_file *m, struct file *filep) +{ + char *path; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; + + path = kobject_get_path(&device->dev->kobj, GFP_KERNEL); + if (!path) + return; + + seq_printf(m, "vfio-device-syspath: /sys%s\n", path); + kfree(path); +} +#endif + const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .open = vfio_device_fops_cdev_open, @@ -1364,6 +1381,9 @@ const struct file_operations vfio_device_fops = { .unlocked_ioctl = vfio_device_fops_unl_ioctl, .compat_ioctl = compat_ptr_ioctl, .mmap = vfio_device_fops_mmap, +#ifdef CONFIG_PROC_FS + .show_fdinfo = vfio_device_show_fdinfo, +#endif }; static struct vfio_device *vfio_device_from_file(struct file *file) From 1b1d9ca13475cbedf25727ae4b4ead684ae85c03 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 6 Aug 2025 11:03:11 -0600 Subject: [PATCH 02/48] vfio/fsl-mc: Mark for removal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver has been orphaned for more than a year, mark it for removal.
Reviewed-by: Kevin Tian Reviewed-by: Cédric Le Goater Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20250806170314.3768750-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- MAINTAINERS | 2 +- drivers/vfio/fsl-mc/Kconfig | 5 ++++- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fed6cd812d79..482c18d39f62 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26443,7 +26443,7 @@ F: include/uapi/linux/vfio.h VFIO FSL-MC DRIVER L: kvm@vger.kernel.org -S: Orphan +S: Obsolete F: drivers/vfio/fsl-mc/ VFIO HISILICON PCI DRIVER diff --git a/drivers/vfio/fsl-mc/Kconfig b/drivers/vfio/fsl-mc/Kconfig index 7d1d690348f0..43c145d17971 100644 --- a/drivers/vfio/fsl-mc/Kconfig +++ b/drivers/vfio/fsl-mc/Kconfig @@ -2,9 +2,12 @@ menu "VFIO support for FSL_MC bus devices" depends on FSL_MC_BUS config VFIO_FSL_MC - tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices" + tristate "VFIO support for QorIQ DPAA2 fsl-mc bus devices (DEPRECATED)" select EVENTFD help + The vfio-fsl-mc driver is deprecated and will be removed in a + future kernel release. + Driver to enable support for the VFIO QorIQ DPAA2 fsl-mc (Management Complex) devices. This is required to passthrough fsl-mc bus devices using the VFIO framework. 
diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index f65d91c01f2e..76ccbab0e3d6 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -537,6 +537,8 @@ static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) struct device *dev = &mc_dev->dev; int ret; + dev_err_once(dev, "DEPRECATION: vfio-fsl-mc is deprecated and will be removed in a future kernel release\n"); + vdev = vfio_alloc_device(vfio_fsl_mc_device, vdev, dev, &vfio_fsl_mc_ops); if (IS_ERR(vdev)) From 473c3af395c904ad5e10897bf482a92c4f3f3650 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Mon, 18 Aug 2025 16:52:01 +0800 Subject: [PATCH 03/48] vfio/pci: drop redundant conversion to bool The result of integer comparison already evaluates to bool. No need for explicit conversion. No functional impact. Signed-off-by: Xichao Zhao Link: https://lore.kernel.org/r/20250818085201.510206-1-zhao.xichao@vivo.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 123298a4dc8f..00583909b380 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -677,7 +677,7 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_core_device *vdev, { struct vfio_pci_irq_ctx *ctx; unsigned int i; - bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; + bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX); if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { vfio_msi_disable(vdev, msix); From 767b1ed8b980498978c77dc89497602ae3421af5 Mon Sep 17 00:00:00 2001 From: Morduan Zang Date: Thu, 14 Aug 2025 19:03:58 +0800 Subject: [PATCH 04/48] vfio/nvgrace-gpu: fix grammatical error The word "as" in the comment should be replaced with "is", and there is an extra space in the comment. 
Signed-off-by: Morduan Zang Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/54E1ED6C5A2682C8+20250814110358.285412-1-zhangdandan@uniontech.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index d95761dcdd58..0adaa6150252 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -260,7 +260,7 @@ nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); /* * The region memory size may not be power-of-2 aligned. - * Given that the memory as a BAR and may not be + * Given that the memory is a BAR and may not be * aligned, roundup to the next power-of-2. */ info.size = memregion->bar_size; From 292e9ee22b0adad49c9a6f63708988e32c007da6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:48 +0000 Subject: [PATCH 05/48] selftests: Create tools/testing/selftests/vfio Create the directory tools/testing/selftests/vfio with a stub Makefile and hook it up to the top-level selftests Makefile. This directory will be used in subsequent commits to host selftests for the VFIO subsystem. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-2-dmatlack@google.com Signed-off-by: Alex Williamson --- MAINTAINERS | 7 +++++++ tools/testing/selftests/Makefile | 1 + tools/testing/selftests/vfio/.gitignore | 7 +++++++ tools/testing/selftests/vfio/Makefile | 2 ++ 4 files changed, 17 insertions(+) create mode 100644 tools/testing/selftests/vfio/.gitignore create mode 100644 tools/testing/selftests/vfio/Makefile diff --git a/MAINTAINERS b/MAINTAINERS index 482c18d39f62..fba915fcb30e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26440,6 +26440,7 @@ F: drivers/vfio/ F: include/linux/vfio.h F: include/linux/vfio_pci_core.h F: include/uapi/linux/vfio.h +F: tools/testing/selftests/vfio/ VFIO FSL-MC DRIVER L: kvm@vger.kernel.org @@ -26504,6 +26505,12 @@ L: qat-linux@intel.com S: Supported F: drivers/vfio/pci/qat/ +VFIO SELFTESTS +M: David Matlack +L: kvm@vger.kernel.org +S: Maintained +F: tools/testing/selftests/vfio/ + VFIO VIRTIO PCI DRIVER M: Yishai Hadas L: kvm@vger.kernel.org diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 030da61dbff3..c4e616183aa5 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -124,6 +124,7 @@ TARGETS += uevent TARGETS += user_events TARGETS += vDSO TARGETS += mm +TARGETS += vfio TARGETS += x86 TARGETS += x86/bugs TARGETS += zram diff --git a/tools/testing/selftests/vfio/.gitignore b/tools/testing/selftests/vfio/.gitignore new file mode 100644 index 000000000000..6d9381d60172 --- /dev/null +++ b/tools/testing/selftests/vfio/.gitignore @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +* +!/**/ +!*.c +!*.h +!*.S +!*.sh diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile new file mode 100644 index 000000000000..2bba39aff5d9 --- /dev/null +++ b/tools/testing/selftests/vfio/Makefile @@ -0,0 +1,2 @@ +CFLAGS = $(KHDR_INCLUDES) +include ../lib.mk From 
19faf6fd969c21589b6dd40c35255e4d00d427f8 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:49 +0000 Subject: [PATCH 06/48] vfio: selftests: Add a helper library for VFIO selftests Add a basic helper library to be used by VFIO selftests. The basic unit of the library is struct vfio_pci_device, which represents a single PCI device that is bound to the vfio-pci driver. The library currently only supports a single device per group and container, and VFIO IOMMU types. The code in this library was heavily based on prior work done by Raghavendra Rao Ananta, and the VFIO_ASSERT*() macros were written by Vipin Sharma. Separate the Makefile rules for building the library into a separate script so that the library can be built by and linked into KVM selftests in a subsequent commit. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-3-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 14 + .../selftests/vfio/lib/include/vfio_util.h | 140 +++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 15 + .../selftests/vfio/lib/vfio_pci_device.c | 365 ++++++++++++++++++ 4 files changed, 534 insertions(+) create mode 100644 tools/testing/selftests/vfio/lib/include/vfio_util.h create mode 100644 tools/testing/selftests/vfio/lib/libvfio.mk create mode 100644 tools/testing/selftests/vfio/lib/vfio_pci_device.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 2bba39aff5d9..db3e4db1a6dd 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,2 +1,16 @@ CFLAGS = $(KHDR_INCLUDES) include ../lib.mk +include lib/libvfio.mk + +CFLAGS += -I$(top_srcdir)/tools/include +CFLAGS += -MD +CFLAGS += $(EXTRA_CFLAGS) + +$(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ + +TEST_GEN_PROGS_O = $(patsubst %, %.o, $(TEST_GEN_PROGS))
+TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_PROGS_O) $(LIBVFIO_O)) +-include $(TEST_DEP_FILES) + +EXTRA_CLEAN += $(TEST_GEN_PROGS_O) $(TEST_DEP_FILES) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h new file mode 100644 index 000000000000..b7d2bb8c18ba --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H +#define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H + +#include +#include +#include +#include + +#include "../../../kselftest.h" + +#define VFIO_LOG_AND_EXIT(...) do { \ + fprintf(stderr, " " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(KSFT_FAIL); \ +} while (0) + +#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \ + typeof(_lhs) __lhs = (_lhs); \ + typeof(_rhs) __rhs = (_rhs); \ + \ + if (__lhs _op __rhs) \ + break; \ + \ + fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ + fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ + fprintf(stderr, " Observed: %#lx %s %#lx\n", \ + (u64)__lhs, #_op, (u64)__rhs); \ + fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ + VFIO_LOG_AND_EXIT(__VA_ARGS__); \ +} while (0) + +#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) +#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) +#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) +#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) +#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) +#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) +#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_NULL(_a, ...) 
VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) +#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) + +#define VFIO_FAIL(_fmt, ...) do { \ + fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ + VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ +} while (0) + +struct vfio_pci_bar { + struct vfio_region_info info; + void *vaddr; +}; + +struct vfio_pci_device { + int fd; + int group_fd; + int container_fd; + + struct vfio_device_info info; + struct vfio_region_info config_space; + struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; + + struct vfio_irq_info msi_info; + struct vfio_irq_info msix_info; + + /* eventfds for MSI and MSI-x interrupts */ + int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; +}; + +/* + * Return the BDF string of the device that the test should use. + * + * If a BDF string is provided by the user on the command line (as the last + * element of argv[]), then this function will return that and decrement argc + * by 1. + * + * Otherwise this function will attempt to use the environment variable + * $VFIO_SELFTESTS_BDF. + * + * If BDF cannot be determined then the test will exit with KSFT_SKIP. 
+ */ +const char *vfio_selftests_get_bdf(int *argc, char *argv[]); + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type); +void vfio_pci_device_cleanup(struct vfio_pci_device *device); + +void vfio_pci_dma_map(struct vfio_pci_device *device, u64 iova, u64 size, + void *vaddr); +void vfio_pci_dma_unmap(struct vfio_pci_device *device, u64 iova, u64 size); + +void vfio_pci_config_access(struct vfio_pci_device *device, bool write, + size_t config, size_t size, void *data); + +#define vfio_pci_config_read(_device, _offset, _type) ({ \ + _type __data; \ + vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ + __data; \ +}) + +#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) +#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) +#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) + +#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ + _type __data = (_value); \ + vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ +} while (0) + +#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) +#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) +#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) + +void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, + u32 vector, int count); +void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); +void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); + +static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); +} + +static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, + u32 vector, int 
count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); +} + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk new file mode 100644 index 000000000000..72e55a560eeb --- /dev/null +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -0,0 +1,15 @@ +VFIO_DIR := $(selfdir)/vfio + +LIBVFIO_C := lib/vfio_pci_device.c + +LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) + +LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) +$(shell mkdir -p $(LIBVFIO_O_DIRS)) + +CFLAGS += -I$(VFIO_DIR)/lib/include + +$(LIBVFIO_O): $(OUTPUT)/%.o : $(VFIO_DIR)/%.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(LIBVFIO_O) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c new file mode 100644 index 000000000000..76adb1841f16 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -0,0 +1,365 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../kselftest.h" +#include + +#define VFIO_DEV_PATH "/dev/vfio/vfio" +#define PCI_SYSFS_PATH "/sys/bus/pci/devices" + +#define ioctl_assert(_fd, _op, _arg) do { \ + void *__arg = (_arg); \ + int __ret = ioctl((_fd), (_op), (__arg)); \ + VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ +} while (0) + +static void vfio_pci_irq_set(struct vfio_pci_device *device, + u32 index, u32 vector, u32 count, int *fds) +{ + u8 buf[sizeof(struct vfio_irq_set) + sizeof(int) * count] = {}; + struct vfio_irq_set *irq = (void *)&buf; + int *irq_fds = (void *)&irq->data; + + irq->argsz = sizeof(buf); + irq->flags 
= VFIO_IRQ_SET_ACTION_TRIGGER; + irq->index = index; + irq->start = vector; + irq->count = count; + + if (count) { + irq->flags |= VFIO_IRQ_SET_DATA_EVENTFD; + memcpy(irq_fds, fds, sizeof(int) * count); + } else { + irq->flags |= VFIO_IRQ_SET_DATA_NONE; + } + + ioctl_assert(device->fd, VFIO_DEVICE_SET_IRQS, irq); +} + +void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector) +{ + struct vfio_irq_set irq = { + .argsz = sizeof(irq), + .flags = VFIO_IRQ_SET_ACTION_TRIGGER | VFIO_IRQ_SET_DATA_NONE, + .index = index, + .start = vector, + .count = 1, + }; + + ioctl_assert(device->fd, VFIO_DEVICE_SET_IRQS, &irq); +} + +static void check_supported_irq_index(u32 index) +{ + /* VFIO selftests only supports MSI and MSI-x for now. */ + VFIO_ASSERT_TRUE(index == VFIO_PCI_MSI_IRQ_INDEX || + index == VFIO_PCI_MSIX_IRQ_INDEX, + "Unsupported IRQ index: %u\n", index); +} + +void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, u32 vector, + int count) +{ + int i; + + check_supported_irq_index(index); + + for (i = vector; i < vector + count; i++) { + VFIO_ASSERT_LT(device->msi_eventfds[i], 0); + device->msi_eventfds[i] = eventfd(0, 0); + VFIO_ASSERT_GE(device->msi_eventfds[i], 0); + } + + vfio_pci_irq_set(device, index, vector, count, device->msi_eventfds + vector); +} + +void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index) +{ + int i; + + check_supported_irq_index(index); + + for (i = 0; i < ARRAY_SIZE(device->msi_eventfds); i++) { + if (device->msi_eventfds[i] < 0) + continue; + + VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); + device->msi_eventfds[i] = -1; + } + + vfio_pci_irq_set(device, index, 0, 0, NULL); +} + +static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, + struct vfio_irq_info *irq_info) +{ + irq_info->argsz = sizeof(*irq_info); + irq_info->index = index; + + ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); +} + +void vfio_pci_dma_map(struct vfio_pci_device *device, u64 iova, u64 
size, void *vaddr) +{ + struct vfio_iommu_type1_dma_map map = { + .argsz = sizeof(map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = (u64)vaddr, + .iova = iova, + .size = size, + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &map); +} + +void vfio_pci_dma_unmap(struct vfio_pci_device *device, u64 iova, u64 size) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .iova = iova, + .size = size, + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap); +} + +static void vfio_pci_region_get(struct vfio_pci_device *device, int index, + struct vfio_region_info *info) +{ + memset(info, 0, sizeof(*info)); + + info->argsz = sizeof(*info); + info->index = index; + + ioctl_assert(device->fd, VFIO_DEVICE_GET_REGION_INFO, info); +} + +static void vfio_pci_bar_map(struct vfio_pci_device *device, int index) +{ + struct vfio_pci_bar *bar = &device->bars[index]; + int prot = 0; + + VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); + VFIO_ASSERT_NULL(bar->vaddr); + VFIO_ASSERT_TRUE(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP); + + if (bar->info.flags & VFIO_REGION_INFO_FLAG_READ) + prot |= PROT_READ; + if (bar->info.flags & VFIO_REGION_INFO_FLAG_WRITE) + prot |= PROT_WRITE; + + bar->vaddr = mmap(NULL, bar->info.size, prot, MAP_FILE | MAP_SHARED, + device->fd, bar->info.offset); + VFIO_ASSERT_NE(bar->vaddr, MAP_FAILED); +} + +static void vfio_pci_bar_unmap(struct vfio_pci_device *device, int index) +{ + struct vfio_pci_bar *bar = &device->bars[index]; + + VFIO_ASSERT_LT(index, PCI_STD_NUM_BARS); + VFIO_ASSERT_NOT_NULL(bar->vaddr); + + VFIO_ASSERT_EQ(munmap(bar->vaddr, bar->info.size), 0); + bar->vaddr = NULL; +} + +static void vfio_pci_bar_unmap_all(struct vfio_pci_device *device) +{ + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + if (device->bars[i].vaddr) + vfio_pci_bar_unmap(device, i); + } +} + +void vfio_pci_config_access(struct vfio_pci_device *device, bool write, + size_t config, size_t size, 
void *data) +{ + struct vfio_region_info *config_space = &device->config_space; + int ret; + + if (write) + ret = pwrite(device->fd, data, size, config_space->offset + config); + else + ret = pread(device->fd, data, size, config_space->offset + config); + + VFIO_ASSERT_EQ(ret, size, "Failed to %s PCI config space: 0x%lx\n", + write ? "write to" : "read from", config); +} + +static unsigned int vfio_pci_get_group_from_dev(const char *bdf) +{ + char dev_iommu_group_path[PATH_MAX] = {0}; + char sysfs_path[PATH_MAX] = {0}; + unsigned int group; + int ret; + + snprintf(sysfs_path, PATH_MAX, "%s/%s/iommu_group", PCI_SYSFS_PATH, bdf); + + ret = readlink(sysfs_path, dev_iommu_group_path, sizeof(dev_iommu_group_path)); + VFIO_ASSERT_NE(ret, -1, "Failed to get the IOMMU group for device: %s\n", bdf); + + ret = sscanf(basename(dev_iommu_group_path), "%u", &group); + VFIO_ASSERT_EQ(ret, 1, "Failed to get the IOMMU group for device: %s\n", bdf); + + return group; +} + +static void vfio_pci_container_setup(struct vfio_pci_device *device) +{ + int version; + + device->container_fd = open(VFIO_DEV_PATH, O_RDWR); + VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", VFIO_DEV_PATH); + + version = ioctl(device->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION); +} + +static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status), + }; + char group_path[32]; + int group; + + group = vfio_pci_get_group_from_dev(bdf); + snprintf(group_path, sizeof(group_path), "/dev/vfio/%d", group); + + device->group_fd = open(group_path, O_RDWR); + VFIO_ASSERT_GE(device->group_fd, 0, "open(%s) failed\n", group_path); + + ioctl_assert(device->group_fd, VFIO_GROUP_GET_STATUS, &group_status); + VFIO_ASSERT_TRUE(group_status.flags & VFIO_GROUP_FLAGS_VIABLE); + + ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); +} + +static void 
vfio_pci_iommu_setup(struct vfio_pci_device *device, unsigned long iommu_type) +{ + int ret; + + ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); + VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); + + ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); +} + +static void vfio_pci_device_setup(struct vfio_pci_device *device, const char *bdf) +{ + int i; + + device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); + VFIO_ASSERT_GE(device->fd, 0); + + device->info.argsz = sizeof(device->info); + ioctl_assert(device->fd, VFIO_DEVICE_GET_INFO, &device->info); + + vfio_pci_region_get(device, VFIO_PCI_CONFIG_REGION_INDEX, &device->config_space); + + /* Sanity check VFIO does not advertise mmap for config space */ + VFIO_ASSERT_TRUE(!(device->config_space.flags & VFIO_REGION_INFO_FLAG_MMAP), + "PCI config space should not support mmap()\n"); + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + struct vfio_pci_bar *bar = device->bars + i; + + vfio_pci_region_get(device, i, &bar->info); + if (bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP) + vfio_pci_bar_map(device, i); + } + + vfio_pci_irq_get(device, VFIO_PCI_MSI_IRQ_INDEX, &device->msi_info); + vfio_pci_irq_get(device, VFIO_PCI_MSIX_IRQ_INDEX, &device->msix_info); + + for (i = 0; i < ARRAY_SIZE(device->msi_eventfds); i++) + device->msi_eventfds[i] = -1; +} + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type) +{ + struct vfio_pci_device *device; + + device = calloc(1, sizeof(*device)); + VFIO_ASSERT_NOT_NULL(device); + + vfio_pci_container_setup(device); + vfio_pci_group_setup(device, bdf); + vfio_pci_iommu_setup(device, iommu_type); + vfio_pci_device_setup(device, bdf); + + return device; +} + +void vfio_pci_device_cleanup(struct vfio_pci_device *device) +{ + int i; + + vfio_pci_bar_unmap_all(device); + + VFIO_ASSERT_EQ(close(device->fd), 0); + + for (i = 0; i < ARRAY_SIZE(device->msi_eventfds); i++) { + if 
(device->msi_eventfds[i] < 0) + continue; + + VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); + } + + VFIO_ASSERT_EQ(close(device->group_fd), 0); + VFIO_ASSERT_EQ(close(device->container_fd), 0); + + free(device); +} + +static bool is_bdf(const char *str) +{ + unsigned int s, b, d, f; + int length, count; + + count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); + return count == 4 && length == strlen(str); +} + +const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +{ + char *bdf; + + if (*argc > 1 && is_bdf(argv[*argc - 1])) + return argv[--(*argc)]; + + bdf = getenv("VFIO_SELFTESTS_BDF"); + if (bdf) { + VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); + return bdf; + } + + fprintf(stderr, "Unable to determine which device to use, skipping test.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address via environment variable:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " export VFIO_SELFTESTS_BDF=segment:bus:device.function\n"); + fprintf(stderr, " %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address via argv:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " %s [options] segment:bus:device.function\n", argv[0]); + fprintf(stderr, "\n"); + exit(KSFT_SKIP); +} From 16eadd7c1277284cfff5b7071910920a2d008251 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:50 +0000 Subject: [PATCH 07/48] vfio: selftests: Introduce vfio_pci_device_test Introduce a basic VFIO selftest called vfio_pci_device_test to demonstrate the functionality of the VFIO selftest library and provide some test coverage of basic VFIO operations, including: - Mapping and unmapping DMA - Mapping and unmapping BARs - Enabling, triggering, and disabling MSI and MSI-x - Reading and writing to PCI config space This test should work with most PCI devices, as long as they are bound to vfio-pci. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-4-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_pci_device_test.c | 180 ++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_pci_device_test.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index db3e4db1a6dd..828419537250 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,4 +1,5 @@ CFLAGS = $(KHDR_INCLUDES) +TEST_GEN_PROGS += vfio_pci_device_test include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c new file mode 100644 index 000000000000..3e7049b9c8f6 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +/* + * Limit the number of MSIs enabled/disabled by the test regardless of the + * number of MSIs the device itself supports, e.g. to avoid hitting IRTE limits. 
+ */ +#define MAX_TEST_MSI 16U + +FIXTURE(vfio_pci_device_test) { + struct vfio_pci_device *device; +}; + +FIXTURE_SETUP(vfio_pci_device_test) +{ + self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); +} + +FIXTURE_TEARDOWN(vfio_pci_device_test) +{ + vfio_pci_device_cleanup(self->device); +} + +TEST_F(vfio_pci_device_test, dma_map_unmap) +{ + const u64 size = SZ_2M; + void *mem; + u64 iova; + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(mem, MAP_FAILED); + + iova = (u64)mem; + + vfio_pci_dma_map(self->device, iova, size, mem); + printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); + vfio_pci_dma_unmap(self->device, iova, size); + + ASSERT_TRUE(!munmap(mem, SZ_2M)); +} + +#define read_pci_id_from_sysfs(_file) ({ \ + char __sysfs_path[PATH_MAX]; \ + char __buf[32]; \ + int __fd; \ + \ + snprintf(__sysfs_path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", device_bdf, _file); \ + ASSERT_GT((__fd = open(__sysfs_path, O_RDONLY)), 0); \ + ASSERT_GT(read(__fd, __buf, ARRAY_SIZE(__buf)), 0); \ + ASSERT_EQ(0, close(__fd)); \ + (u16)strtoul(__buf, NULL, 0); \ +}) + +TEST_F(vfio_pci_device_test, config_space_read_write) +{ + u16 vendor, device; + u16 command; + + /* Check that Vendor and Device match what the kernel reports. 
*/ + vendor = read_pci_id_from_sysfs("vendor"); + device = read_pci_id_from_sysfs("device"); + + ASSERT_EQ(vendor, vfio_pci_config_readw(self->device, PCI_VENDOR_ID)); + ASSERT_EQ(device, vfio_pci_config_readw(self->device, PCI_DEVICE_ID)); + + printf("Vendor: %04x, Device: %04x\n", vendor, device); + + command = vfio_pci_config_readw(self->device, PCI_COMMAND); + ASSERT_FALSE(command & PCI_COMMAND_MASTER); + + vfio_pci_config_writew(self->device, PCI_COMMAND, command | PCI_COMMAND_MASTER); + command = vfio_pci_config_readw(self->device, PCI_COMMAND); + ASSERT_TRUE(command & PCI_COMMAND_MASTER); + printf("Enabled Bus Mastering (command: %04x)\n", command); + + vfio_pci_config_writew(self->device, PCI_COMMAND, command & ~PCI_COMMAND_MASTER); + command = vfio_pci_config_readw(self->device, PCI_COMMAND); + ASSERT_FALSE(command & PCI_COMMAND_MASTER); + printf("Disabled Bus Mastering (command: %04x)\n", command); +} + +TEST_F(vfio_pci_device_test, validate_bars) +{ + struct vfio_pci_bar *bar; + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + bar = &self->device->bars[i]; + + if (!(bar->info.flags & VFIO_REGION_INFO_FLAG_MMAP)) { + printf("BAR %d does not support mmap()\n", i); + ASSERT_EQ(NULL, bar->vaddr); + continue; + } + + /* + * BARs that support mmap() should be automatically mapped by + * vfio_pci_device_init(). 
+ */ + ASSERT_NE(NULL, bar->vaddr); + ASSERT_NE(0, bar->info.size); + printf("BAR %d mapped at %p (size 0x%llx)\n", i, bar->vaddr, bar->info.size); + } +} + +FIXTURE(vfio_pci_irq_test) { + struct vfio_pci_device *device; +}; + +FIXTURE_VARIANT(vfio_pci_irq_test) { + int irq_index; +}; + +FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msi) { + .irq_index = VFIO_PCI_MSI_IRQ_INDEX, +}; + +FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) { + .irq_index = VFIO_PCI_MSIX_IRQ_INDEX, +}; + +FIXTURE_SETUP(vfio_pci_irq_test) +{ + self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); +} + +FIXTURE_TEARDOWN(vfio_pci_irq_test) +{ + vfio_pci_device_cleanup(self->device); +} + +TEST_F(vfio_pci_irq_test, enable_trigger_disable) +{ + bool msix = variant->irq_index == VFIO_PCI_MSIX_IRQ_INDEX; + u32 count; + u64 value; + int i; + + if (msix) + count = self->device->msix_info.count; + else + count = self->device->msi_info.count; + + count = min(count, MAX_TEST_MSI); + + if (!count) + SKIP(return, "MSI%s: not supported\n", msix ? "-x" : ""); + + vfio_pci_irq_enable(self->device, variant->irq_index, 0, count); + printf("MSI%s: enabled %d interrupts\n", msix ? "-x" : "", count); + + for (i = 0; i < count; i++) { + vfio_pci_irq_trigger(self->device, variant->irq_index, i); + ASSERT_EQ(8, read(self->device->msi_eventfds[i], &value, 8)); + ASSERT_EQ(1, value); + } + + vfio_pci_irq_disable(self->device, variant->irq_index); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +} From 790588f06e9ce58c281faeada453f47361bc06b6 Mon Sep 17 00:00:00 2001 From: Josh Hilke Date: Fri, 22 Aug 2025 21:24:51 +0000 Subject: [PATCH 08/48] vfio: selftests: Test basic VFIO and IOMMUFD integration Add a vfio test suite which verifies that userspace can bind and unbind devices, allocate I/O address space, and attach a device to an IOMMU domain using the cdev + IOMMUfd VFIO interface. 
Signed-off-by: Josh Hilke Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-5-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_iommufd_setup_test.c | 157 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_iommufd_setup_test.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 828419537250..e4a5d6eadff3 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,4 +1,5 @@ CFLAGS = $(KHDR_INCLUDES) +TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c new file mode 100644 index 000000000000..f45335d9260f --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include "../kselftest_harness.h" + +static const char iommu_dev_path[] = "/dev/iommu"; +static char cdev_path[PATH_MAX] = { '\0' }; + +static void set_cdev_path(const char *bdf) +{ + char dir_path[PATH_MAX]; + DIR *dir; + struct dirent *entry; + + snprintf(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); + + dir = opendir(dir_path); + assert(dir); + + /* Find the file named "vfio" */ + while ((entry = readdir(dir)) != NULL) { + if (!strncmp("vfio", entry->d_name, 4)) { + snprintf(cdev_path, sizeof(cdev_path), "/dev/vfio/devices/%s", + entry->d_name); + break; + } + } + + assert(strlen(cdev_path) > 0); + + closedir(dir); +} + +static int vfio_device_bind_iommufd_ioctl(int cdev_fd, int iommufd) +{ + struct 
vfio_device_bind_iommufd bind_args = { + .argsz = sizeof(bind_args), + .iommufd = iommufd, + }; + + return ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind_args); +} + +static int vfio_device_get_info_ioctl(int cdev_fd) +{ + struct vfio_device_info info_args = { .argsz = sizeof(info_args) }; + + return ioctl(cdev_fd, VFIO_DEVICE_GET_INFO, &info_args); +} + +static int vfio_device_ioas_alloc_ioctl(int iommufd, struct iommu_ioas_alloc *alloc_args) +{ + *alloc_args = (struct iommu_ioas_alloc){ + .size = sizeof(struct iommu_ioas_alloc), + }; + + return ioctl(iommufd, IOMMU_IOAS_ALLOC, alloc_args); +} + +static int vfio_device_attach_iommufd_pt_ioctl(int cdev_fd, u32 pt_id) +{ + struct vfio_device_attach_iommufd_pt attach_args = { + .argsz = sizeof(attach_args), + .pt_id = pt_id, + }; + + return ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_args); +} + +static int vfio_device_detach_iommufd_pt_ioctl(int cdev_fd) +{ + struct vfio_device_detach_iommufd_pt detach_args = { + .argsz = sizeof(detach_args), + }; + + return ioctl(cdev_fd, VFIO_DEVICE_DETACH_IOMMUFD_PT, &detach_args); +} + +FIXTURE(vfio_cdev) { + int cdev_fd; + int iommufd; +}; + +FIXTURE_SETUP(vfio_cdev) +{ + ASSERT_LE(0, (self->cdev_fd = open(cdev_path, O_RDWR, 0))); + ASSERT_LE(0, (self->iommufd = open(iommu_dev_path, O_RDWR, 0))); +} + +FIXTURE_TEARDOWN(vfio_cdev) +{ + ASSERT_EQ(0, close(self->cdev_fd)); + ASSERT_EQ(0, close(self->iommufd)); +} + +TEST_F(vfio_cdev, bind) +{ + ASSERT_EQ(0, vfio_device_bind_iommufd_ioctl(self->cdev_fd, self->iommufd)); + ASSERT_EQ(0, vfio_device_get_info_ioctl(self->cdev_fd)); +} + +TEST_F(vfio_cdev, get_info_without_bind_fails) +{ + ASSERT_NE(0, vfio_device_get_info_ioctl(self->cdev_fd)); +} + +TEST_F(vfio_cdev, bind_bad_iommufd_fails) +{ + ASSERT_NE(0, vfio_device_bind_iommufd_ioctl(self->cdev_fd, -2)); +} + +TEST_F(vfio_cdev, repeated_bind_fails) +{ + ASSERT_EQ(0, vfio_device_bind_iommufd_ioctl(self->cdev_fd, self->iommufd)); + ASSERT_NE(0, 
vfio_device_bind_iommufd_ioctl(self->cdev_fd, self->iommufd)); +} + +TEST_F(vfio_cdev, attach_detatch_pt) +{ + struct iommu_ioas_alloc alloc_args; + + ASSERT_EQ(0, vfio_device_bind_iommufd_ioctl(self->cdev_fd, self->iommufd)); + ASSERT_EQ(0, vfio_device_ioas_alloc_ioctl(self->iommufd, &alloc_args)); + ASSERT_EQ(0, vfio_device_attach_iommufd_pt_ioctl(self->cdev_fd, alloc_args.out_ioas_id)); + ASSERT_EQ(0, vfio_device_detach_iommufd_pt_ioctl(self->cdev_fd)); +} + +TEST_F(vfio_cdev, attach_invalid_pt_fails) +{ + ASSERT_EQ(0, vfio_device_bind_iommufd_ioctl(self->cdev_fd, self->iommufd)); + ASSERT_NE(0, vfio_device_attach_iommufd_pt_ioctl(self->cdev_fd, UINT32_MAX)); +} + +int main(int argc, char *argv[]) +{ + const char *device_bdf = vfio_selftests_get_bdf(&argc, argv); + + set_cdev_path(device_bdf); + printf("Using cdev device %s\n", cdev_path); + + return test_harness_run(argc, argv); +} From b477e7bcd25ecb4da91bb52d5f980611cc77d543 Mon Sep 17 00:00:00 2001 From: Josh Hilke Date: Fri, 22 Aug 2025 21:24:52 +0000 Subject: [PATCH 09/48] vfio: selftests: Move vfio dma mapping test to their own file Move the dma_map_unmap test from vfio_pci_device_test to a new test: vfio_dma_mapping_test. We are going to add more complex dma mapping tests, so it makes sense to separate this from the vfio pci device test which is more of a sanity check for vfio pci functionality. 
Signed-off-by: Josh Hilke Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-6-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_dma_mapping_test.c | 51 +++++++++++++++++++ .../selftests/vfio/vfio_pci_device_test.c | 18 ------- 3 files changed, 52 insertions(+), 18 deletions(-) create mode 100644 tools/testing/selftests/vfio/vfio_dma_mapping_test.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index e4a5d6eadff3..05c5a585cca6 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,4 +1,5 @@ CFLAGS = $(KHDR_INCLUDES) +TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test include ../lib.mk diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c new file mode 100644 index 000000000000..b56cebbf97eb --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include + +#include +#include + +#include + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +FIXTURE(vfio_dma_mapping_test) { + struct vfio_pci_device *device; +}; + +FIXTURE_SETUP(vfio_dma_mapping_test) +{ + self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); +} + +FIXTURE_TEARDOWN(vfio_dma_mapping_test) +{ + vfio_pci_device_cleanup(self->device); +} + +TEST_F(vfio_dma_mapping_test, dma_map_unmap) +{ + const u64 size = SZ_2M; + void *mem; + u64 iova; + + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(mem, MAP_FAILED); + + iova = (u64)mem; + + vfio_pci_dma_map(self->device, iova, size, mem); + printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); + vfio_pci_dma_unmap(self->device, iova, 
size); + + ASSERT_TRUE(!munmap(mem, size)); +} + +int main(int argc, char *argv[]) +{ + device_bdf = vfio_selftests_get_bdf(&argc, argv); + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 3e7049b9c8f6..a2e41398d184 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -36,24 +36,6 @@ FIXTURE_TEARDOWN(vfio_pci_device_test) vfio_pci_device_cleanup(self->device); } -TEST_F(vfio_pci_device_test, dma_map_unmap) -{ - const u64 size = SZ_2M; - void *mem; - u64 iova; - - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(mem, MAP_FAILED); - - iova = (u64)mem; - - vfio_pci_dma_map(self->device, iova, size, mem); - printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); - vfio_pci_dma_unmap(self->device, iova, size); - - ASSERT_TRUE(!munmap(mem, SZ_2M)); -} - #define read_pci_id_from_sysfs(_file) ({ \ char __sysfs_path[PATH_MAX]; \ char __buf[32]; \ From a0fd0af504f7fe11e2f87e48a1924d7e7f5a0590 Mon Sep 17 00:00:00 2001 From: Josh Hilke Date: Fri, 22 Aug 2025 21:24:53 +0000 Subject: [PATCH 10/48] vfio: selftests: Add test to reset vfio device. Add a test to vfio_pci_device_test which resets the device. If reset is not supported by the device, the test is skipped. 
Signed-off-by: Josh Hilke Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-7-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 1 + tools/testing/selftests/vfio/lib/vfio_pci_device.c | 5 +++++ tools/testing/selftests/vfio/vfio_pci_device_test.c | 8 ++++++++ 3 files changed, 14 insertions(+) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index b7d2bb8c18ba..234403b442af 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -83,6 +83,7 @@ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type); void vfio_pci_device_cleanup(struct vfio_pci_device *device); +void vfio_pci_device_reset(struct vfio_pci_device *device); void vfio_pci_dma_map(struct vfio_pci_device *device, u64 iova, u64 size, void *vaddr); diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 76adb1841f16..98cce0a6ecd7 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -202,6 +202,11 @@ void vfio_pci_config_access(struct vfio_pci_device *device, bool write, write ? 
"write to" : "read from", config); } +void vfio_pci_device_reset(struct vfio_pci_device *device) +{ + ioctl_assert(device->fd, VFIO_DEVICE_RESET, NULL); +} + static unsigned int vfio_pci_get_group_from_dev(const char *bdf) { char dev_iommu_group_path[PATH_MAX] = {0}; diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index a2e41398d184..82e3c947f45d 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -155,6 +155,14 @@ TEST_F(vfio_pci_irq_test, enable_trigger_disable) vfio_pci_irq_disable(self->device, variant->irq_index); } +TEST_F(vfio_pci_device_test, reset) +{ + if (!(self->device->info.flags & VFIO_DEVICE_FLAGS_RESET)) + SKIP(return, "Device does not support reset\n"); + + vfio_pci_device_reset(self->device); +} + int main(int argc, char *argv[]) { device_bdf = vfio_selftests_get_bdf(&argc, argv); From 751f6b5d06c301b329b699ab2089c9dcb7eebc47 Mon Sep 17 00:00:00 2001 From: Josh Hilke Date: Fri, 22 Aug 2025 21:24:54 +0000 Subject: [PATCH 11/48] vfio: selftests: Add DMA mapping tests for 2M and 1G HugeTLB Add test coverage of mapping 2M and 1G HugeTLB to vfio_dma_mapping_test using a fixture variant. If there isn't enough HugeTLB memory available for the test, just skip them. 
Signed-off-by: Josh Hilke [switch from command line option to fixture variant] Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-8-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/vfio_dma_mapping_test.c | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index b56cebbf97eb..8f8e6e9e8197 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only -#include - +#include #include +#include +#include +#include #include #include @@ -16,6 +18,25 @@ FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; }; +FIXTURE_VARIANT(vfio_dma_mapping_test) { + u64 size; + int mmap_flags; +}; + +FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous) { + .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE, +}; + +FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous_hugetlb_2mb) { + .size = SZ_2M, + .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_2MB, +}; + +FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous_hugetlb_1gb) { + .size = SZ_1G, + .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, +}; + FIXTURE_SETUP(vfio_dma_mapping_test) { self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); @@ -28,17 +49,24 @@ FIXTURE_TEARDOWN(vfio_dma_mapping_test) TEST_F(vfio_dma_mapping_test, dma_map_unmap) { - const u64 size = SZ_2M; + const u64 size = variant->size ?: getpagesize(); + const int flags = variant->mmap_flags; void *mem; u64 iova; - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(mem, MAP_FAILED); + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + + /* Skip the test if there aren't enough HugeTLB pages available. 
*/ + if (flags & MAP_HUGETLB && mem == MAP_FAILED) + SKIP(return, "mmap() failed: %s (%d)\n", strerror(errno), errno); + else + ASSERT_NE(mem, MAP_FAILED); iova = (u64)mem; vfio_pci_dma_map(self->device, iova, size, mem); printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); + vfio_pci_dma_unmap(self->device, iova, size); ASSERT_TRUE(!munmap(mem, size)); From 47f861048ef7034a59431020c5916a43378a8c0c Mon Sep 17 00:00:00 2001 From: Josh Hilke Date: Fri, 22 Aug 2025 21:24:55 +0000 Subject: [PATCH 12/48] vfio: selftests: Validate 2M/1G HugeTLB are mapped as 2M/1G in IOMMU Update vfio dma mapping test to verify that the IOMMU uses 2M and 1G mappings when 2M and 1G HugeTLB pages are mapped into a device respectively. This validation is done by inspecting the contents of the I/O page tables via /sys/kernel/debug/iommu/intel/. This validation is skipped if that directory is not available (i.e. non-Intel IOMMUs). Signed-off-by: Josh Hilke [reword commit message, refactor code] Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-9-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/vfio_dma_mapping_test.c | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 8f8e6e9e8197..2612f0cabea5 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -14,6 +14,83 @@ static const char *device_bdf; +struct iommu_mapping { + u64 pgd; + u64 p4d; + u64 pud; + u64 pmd; + u64 pte; +}; + +static void parse_next_value(char **line, u64 *value) +{ + char *token; + + token = strtok_r(*line, " \t|\n", line); + if (!token) + return; + + /* Caller verifies `value`. No need to check return value. 
*/ + sscanf(token, "0x%lx", value); +} + +static int intel_iommu_mapping_get(const char *bdf, u64 iova, + struct iommu_mapping *mapping) +{ + char iommu_mapping_path[PATH_MAX], line[PATH_MAX]; + u64 line_iova = -1; + int ret = -ENOENT; + FILE *file; + char *rest; + + snprintf(iommu_mapping_path, sizeof(iommu_mapping_path), + "/sys/kernel/debug/iommu/intel/%s/domain_translation_struct", + bdf); + + printf("Searching for IOVA 0x%lx in %s\n", iova, iommu_mapping_path); + + file = fopen(iommu_mapping_path, "r"); + VFIO_ASSERT_NOT_NULL(file, "fopen(%s) failed", iommu_mapping_path); + + while (fgets(line, sizeof(line), file)) { + rest = line; + + parse_next_value(&rest, &line_iova); + if (line_iova != (iova / getpagesize())) + continue; + + /* + * Ensure each struct field is initialized in case of empty + * page table values. + */ + memset(mapping, 0, sizeof(*mapping)); + parse_next_value(&rest, &mapping->pgd); + parse_next_value(&rest, &mapping->p4d); + parse_next_value(&rest, &mapping->pud); + parse_next_value(&rest, &mapping->pmd); + parse_next_value(&rest, &mapping->pte); + + ret = 0; + break; + } + + fclose(file); + + if (ret) + printf("IOVA not found\n"); + + return ret; +} + +static int iommu_mapping_get(const char *bdf, u64 iova, + struct iommu_mapping *mapping) +{ + if (!access("/sys/kernel/debug/iommu/intel", F_OK)) + return intel_iommu_mapping_get(bdf, iova, mapping); + + return -EOPNOTSUPP; +} + FIXTURE(vfio_dma_mapping_test) { struct vfio_pci_device *device; }; @@ -51,8 +128,10 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) { const u64 size = variant->size ?: getpagesize(); const int flags = variant->mmap_flags; + struct iommu_mapping mapping; void *mem; u64 iova; + int rc; mem = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); @@ -67,7 +146,39 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) vfio_pci_dma_map(self->device, iova, size, mem); printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); + rc = iommu_mapping_get(device_bdf, 
iova, &mapping); + if (rc == -EOPNOTSUPP) + goto unmap; + + ASSERT_EQ(0, rc); + printf("Found IOMMU mappings for IOVA 0x%lx:\n", iova); + printf("PGD: 0x%016lx\n", mapping.pgd); + printf("P4D: 0x%016lx\n", mapping.p4d); + printf("PUD: 0x%016lx\n", mapping.pud); + printf("PMD: 0x%016lx\n", mapping.pmd); + printf("PTE: 0x%016lx\n", mapping.pte); + + switch (size) { + case SZ_4K: + ASSERT_NE(0, mapping.pte); + break; + case SZ_2M: + ASSERT_EQ(0, mapping.pte); + ASSERT_NE(0, mapping.pmd); + break; + case SZ_1G: + ASSERT_EQ(0, mapping.pte); + ASSERT_EQ(0, mapping.pmd); + ASSERT_NE(0, mapping.pud); + break; + default: + VFIO_FAIL("Unrecognized size: 0x%lx\n", size); + } + +unmap: vfio_pci_dma_unmap(self->device, iova, size); + printf("Unmapped IOVA 0x%lx\n", iova); + ASSERT_NE(0, iommu_mapping_get(device_bdf, iova, &mapping)); ASSERT_TRUE(!munmap(mem, size)); } From 346cd58f1fb588f8ff193d76cf0bb455446ace03 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:56 +0000 Subject: [PATCH 13/48] vfio: selftests: Keep track of DMA regions mapped into the device Keep track of the list of DMA regions that are mapped into the device using a linked list and a new struct vfio_dma_region and use that to add {__,}to_iova() for converting host virtual addresses into IOVAs. This will be used in a subsequent commit to map multiple DMA regions into a device that are then used by drivers. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-10-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 23 +++++++-- .../selftests/vfio/lib/vfio_pci_device.c | 49 ++++++++++++++++--- .../selftests/vfio/vfio_dma_mapping_test.c | 31 ++++++------ 3 files changed, 79 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 234403b442af..db08646c2819 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -51,6 +51,17 @@ struct vfio_pci_bar { void *vaddr; }; +typedef u64 iova_t; + +#define INVALID_IOVA UINT64_MAX + +struct vfio_dma_region { + struct list_head link; + void *vaddr; + iova_t iova; + u64 size; +}; + struct vfio_pci_device { int fd; int group_fd; @@ -63,6 +74,8 @@ struct vfio_pci_device { struct vfio_irq_info msi_info; struct vfio_irq_info msix_info; + struct list_head dma_regions; + /* eventfds for MSI and MSI-x interrupts */ int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; }; @@ -85,9 +98,10 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type); void vfio_pci_device_cleanup(struct vfio_pci_device *device); void vfio_pci_device_reset(struct vfio_pci_device *device); -void vfio_pci_dma_map(struct vfio_pci_device *device, u64 iova, u64 size, - void *vaddr); -void vfio_pci_dma_unmap(struct vfio_pci_device *device, u64 iova, u64 size); +void vfio_pci_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region); +void vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region); void vfio_pci_config_access(struct vfio_pci_device *device, bool write, size_t config, size_t size, void *data); @@ -138,4 +152,7 @@ static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) vfio_pci_irq_disable(device, 
VFIO_PCI_MSIX_IRQ_INDEX); } +iova_t __to_iova(struct vfio_pci_device *device, void *vaddr); +iova_t to_iova(struct vfio_pci_device *device, void *vaddr); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 98cce0a6ecd7..36b4b30b75cf 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -26,6 +26,33 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) +{ + struct vfio_dma_region *region; + + list_for_each_entry(region, &device->dma_regions, link) { + if (vaddr < region->vaddr) + continue; + + if (vaddr >= region->vaddr + region->size) + continue; + + return region->iova + (vaddr - region->vaddr); + } + + return INVALID_IOVA; +} + +iova_t to_iova(struct vfio_pci_device *device, void *vaddr) +{ + iova_t iova; + + iova = __to_iova(device, vaddr); + VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into device.\n", vaddr); + + return iova; +} + static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -112,28 +139,34 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -void vfio_pci_dma_map(struct vfio_pci_device *device, u64 iova, u64 size, void *vaddr) +void vfio_pci_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region) { struct vfio_iommu_type1_dma_map map = { .argsz = sizeof(map), .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, - .vaddr = (u64)vaddr, - .iova = iova, - .size = size, + .vaddr = (u64)region->vaddr, + .iova = region->iova, + .size = region->size, }; ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &map); + + list_add(&region->link, &device->dma_regions); } -void
vfio_pci_dma_unmap(struct vfio_pci_device *device, u64 iova, u64 size) +void vfio_pci_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region) { struct vfio_iommu_type1_dma_unmap unmap = { .argsz = sizeof(unmap), - .iova = iova, - .size = size, + .iova = region->iova, + .size = region->size, }; ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap); + + list_del(&region->link); } static void vfio_pci_region_get(struct vfio_pci_device *device, int index, @@ -260,6 +293,8 @@ static void vfio_pci_iommu_setup(struct vfio_pci_device *device, unsigned long i { int ret; + INIT_LIST_HEAD(&device->dma_regions); + ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 2612f0cabea5..4578ee6df0e1 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -128,30 +128,32 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) { const u64 size = variant->size ?: getpagesize(); const int flags = variant->mmap_flags; + struct vfio_dma_region region; struct iommu_mapping mapping; - void *mem; - u64 iova; int rc; - mem = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); + region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); /* Skip the test if there aren't enough HugeTLB pages available.
*/ - if (flags & MAP_HUGETLB && mem == MAP_FAILED) + if (flags & MAP_HUGETLB && region.vaddr == MAP_FAILED) SKIP(return, "mmap() failed: %s (%d)\n", strerror(errno), errno); else - ASSERT_NE(mem, MAP_FAILED); + ASSERT_NE(region.vaddr, MAP_FAILED); - iova = (u64)mem; + region.iova = (u64)region.vaddr; + region.size = size; - vfio_pci_dma_map(self->device, iova, size, mem); - printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", mem, size, iova); + vfio_pci_dma_map(self->device, &region); + printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova); - rc = iommu_mapping_get(device_bdf, iova, &mapping); + ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr)); + + rc = iommu_mapping_get(device_bdf, region.iova, &mapping); if (rc == -EOPNOTSUPP) goto unmap; ASSERT_EQ(0, rc); - printf("Found IOMMU mappings for IOVA 0x%lx:\n", iova); + printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova); printf("PGD: 0x%016lx\n", mapping.pgd); printf("P4D: 0x%016lx\n", mapping.p4d); printf("PUD: 0x%016lx\n", mapping.pud); @@ -176,11 +178,12 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } unmap: - vfio_pci_dma_unmap(self->device, iova, size); - printf("Unmapped IOVA 0x%lx\n", iova); - ASSERT_NE(0, iommu_mapping_get(device_bdf, iova, &mapping)); + vfio_pci_dma_unmap(self->device, &region); + printf("Unmapped IOVA 0x%lx\n", region.iova); + ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr)); + ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping)); - ASSERT_TRUE(!munmap(mem, size)); + ASSERT_TRUE(!munmap(region.vaddr, size)); } int main(int argc, char *argv[]) From 924947804f2b9e564efdc814420d21b239df2dd4 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:57 +0000 Subject: [PATCH 14/48] vfio: selftests: Enable asserting MSI eventfds not firing Make it possible to assert that a given MSI eventfd did _not_ fire by adding a helper to mark an eventfd non-blocking.
Demonstrate this in vfio_pci_device_test by asserting the MSI eventfd did not fire before vfio_pci_irq_trigger(). Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-11-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 12 ++++++++++++ tools/testing/selftests/vfio/vfio_pci_device_test.c | 10 +++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index db08646c2819..9c928fcc00e2 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -2,6 +2,7 @@ #ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H #define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H +#include #include #include #include @@ -130,6 +131,17 @@ void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); +static inline void fcntl_set_nonblock(int fd) +{ + int r; + + r = fcntl(fd, F_GETFL, 0); + VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); + + r = fcntl(fd, F_SETFL, r | O_NONBLOCK); + VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); +} + static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, u32 vector, int count) { diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 82e3c947f45d..1b5c2ff77e3f 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -129,6 +129,7 @@ FIXTURE_TEARDOWN(vfio_pci_irq_test) TEST_F(vfio_pci_irq_test, enable_trigger_disable) { bool msix = variant->irq_index == VFIO_PCI_MSIX_IRQ_INDEX; + int msi_eventfd; u32 count; u64 value; int i; @@ -147,8 
+148,15 @@ TEST_F(vfio_pci_irq_test, enable_trigger_disable) printf("MSI%s: enabled %d interrupts\n", msix ? "-x" : "", count); for (i = 0; i < count; i++) { + msi_eventfd = self->device->msi_eventfds[i]; + + fcntl_set_nonblock(msi_eventfd); + ASSERT_EQ(-1, read(msi_eventfd, &value, 8)); + ASSERT_EQ(EAGAIN, errno); + vfio_pci_irq_trigger(self->device, variant->irq_index, i); - ASSERT_EQ(8, read(self->device->msi_eventfds[i], &value, 8)); + + ASSERT_EQ(8, read(msi_eventfd, &value, 8)); ASSERT_EQ(1, value); } From 50d8fe805f75a159551ddb6b04ecdad26ec50221 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:58 +0000 Subject: [PATCH 15/48] vfio: selftests: Add a helper for matching vendor+device IDs Add a helper function for matching a device against a given vendor and device ID. This will be used in a subsequent commit to match devices against drivers. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-12-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 7 +++++++ tools/testing/selftests/vfio/vfio_pci_device_test.c | 4 +--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 9c928fcc00e2..a51c971004cd 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -167,4 +167,11 @@ static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) iova_t __to_iova(struct vfio_pci_device *device, void *vaddr); iova_t to_iova(struct vfio_pci_device *device, void *vaddr); +static inline bool vfio_pci_device_match(struct vfio_pci_device *device, + u16 vendor_id, u16 device_id) +{ + return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && + (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); +} + #endif /* 
SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 1b5c2ff77e3f..8856205d52a6 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -56,9 +56,7 @@ TEST_F(vfio_pci_device_test, config_space_read_write) /* Check that Vendor and Device match what the kernel reports. */ vendor = read_pci_id_from_sysfs("vendor"); device = read_pci_id_from_sysfs("device"); - - ASSERT_EQ(vendor, vfio_pci_config_readw(self->device, PCI_VENDOR_ID)); - ASSERT_EQ(device, vfio_pci_config_readw(self->device, PCI_DEVICE_ID)); + ASSERT_TRUE(vfio_pci_device_match(self->device, vendor, device)); printf("Vendor: %04x, Device: %04x\n", vendor, device); From 1b197032ac58b9a17350c086fc151390f32080b2 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:24:59 +0000 Subject: [PATCH 16/48] vfio: selftests: Add driver framework Add a driver framework to VFIO selftests, so that devices can generate DMA and interrupts in a common way that can be then utilized by tests. This will enable VFIO selftests to exercise real hardware DMA and interrupt paths, without needing any device-specific code in the test itself. Subsequent commits will introduce drivers for specific devices. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-13-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 92 ++++++++++++++ tools/testing/selftests/vfio/lib/libvfio.mk | 1 + .../selftests/vfio/lib/vfio_pci_device.c | 5 + .../selftests/vfio/lib/vfio_pci_driver.c | 116 ++++++++++++++++++ 4 files changed, 214 insertions(+) create mode 100644 tools/testing/selftests/vfio/lib/vfio_pci_driver.c diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index a51c971004cd..a7d05a4299a1 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -63,6 +63,85 @@ struct vfio_dma_region { u64 size; }; +struct vfio_pci_device; + +struct vfio_pci_driver_ops { + const char *name; + + /** + * @probe() - Check if the driver supports the given device. + * + * Return: 0 on success, non-0 on failure. + */ + int (*probe)(struct vfio_pci_device *device); + + /** + * @init() - Initialize the driver for @device. + * + * Must be called after device->driver.region has been initialized. + */ + void (*init)(struct vfio_pci_device *device); + + /** + * remove() - Deinitialize the driver for @device. + */ + void (*remove)(struct vfio_pci_device *device); + + /** + * memcpy_start() - Kick off @count repeated memcpy operations from + * [@src, @src + @size) to [@dst, @dst + @size). + * + * Guarantees: + * - The device will attempt DMA reads on [src, src + size). + * - The device will attempt DMA writes on [dst, dst + size). + * - The device will not generate any interrupts. + * + * memcpy_start() returns immediately, it does not wait for the + * copies to complete. 
+ */ + void (*memcpy_start)(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count); + + /** + * memcpy_wait() - Wait until the memcpy operations started by + * memcpy_start() have finished. + * + * Guarantees: + * - All in-flight DMAs initiated by memcpy_start() are fully complete + * before memcpy_wait() returns. + * + * Returns non-0 if the driver detects that an error occurred during the + * memcpy, 0 otherwise. + */ + int (*memcpy_wait)(struct vfio_pci_device *device); + + /** + * send_msi() - Make the device send the MSI device->driver.msi. + * + * Guarantees: + * - The device will send the MSI once. + */ + void (*send_msi)(struct vfio_pci_device *device); +}; + +struct vfio_pci_driver { + const struct vfio_pci_driver_ops *ops; + bool initialized; + bool memcpy_in_progress; + + /* Region to be used by the driver (e.g. for in-memory descriptors) */ + struct vfio_dma_region region; + + /* The maximum size that can be passed to memcpy_start(). */ + u64 max_memcpy_size; + + /* The maximum count that can be passed to memcpy_start(). */ + u64 max_memcpy_count; + + /* The MSI vector the device will signal in ops->send_msi(). 
*/ + int msi; +}; + struct vfio_pci_device { int fd; int group_fd; @@ -79,6 +158,8 @@ struct vfio_pci_device { /* eventfds for MSI and MSI-x interrupts */ int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; + + struct vfio_pci_driver driver; }; /* @@ -174,4 +255,15 @@ static inline bool vfio_pci_device_match(struct vfio_pci_device *device, (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); } +void vfio_pci_driver_probe(struct vfio_pci_device *device); +void vfio_pci_driver_init(struct vfio_pci_device *device); +void vfio_pci_driver_remove(struct vfio_pci_device *device); +int vfio_pci_driver_memcpy(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size); +void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count); +int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); +void vfio_pci_driver_send_msi(struct vfio_pci_device *device); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 72e55a560eeb..a3c3bc9a7c00 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -1,6 +1,7 @@ VFIO_DIR := $(selfdir)/vfio LIBVFIO_C := lib/vfio_pci_device.c +LIBVFIO_C += lib/vfio_pci_driver.c LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 36b4b30b75cf..d8bb227e869d 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -344,6 +344,8 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type) vfio_pci_iommu_setup(device, iommu_type); vfio_pci_device_setup(device, bdf); + vfio_pci_driver_probe(device); + return device; } @@ -351,6 +353,9 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) { int i; + if 
(device->driver.initialized) + vfio_pci_driver_remove(device); + vfio_pci_bar_unmap_all(device); VFIO_ASSERT_EQ(close(device->fd), 0); diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c new file mode 100644 index 000000000000..c98bd2d31d8a --- /dev/null +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include "../../../kselftest.h" +#include + +static struct vfio_pci_driver_ops *driver_ops[] = {}; + +void vfio_pci_driver_probe(struct vfio_pci_device *device) +{ + struct vfio_pci_driver_ops *ops; + int i; + + VFIO_ASSERT_NULL(device->driver.ops); + + for (i = 0; i < ARRAY_SIZE(driver_ops); i++) { + ops = driver_ops[i]; + + if (ops->probe(device)) + continue; + + printf("Driver found: %s\n", ops->name); + device->driver.ops = ops; + } +} + +static void vfio_check_driver_op(struct vfio_pci_driver *driver, void *op, + const char *op_name) +{ + VFIO_ASSERT_NOT_NULL(driver->ops); + VFIO_ASSERT_NOT_NULL(op, "Driver has no %s()\n", op_name); + VFIO_ASSERT_EQ(driver->initialized, op != driver->ops->init); + VFIO_ASSERT_EQ(driver->memcpy_in_progress, op == driver->ops->memcpy_wait); +} + +#define VFIO_CHECK_DRIVER_OP(_driver, _op) do { \ + struct vfio_pci_driver *__driver = (_driver); \ + vfio_check_driver_op(__driver, __driver->ops->_op, #_op); \ +} while (0) + +void vfio_pci_driver_init(struct vfio_pci_device *device) +{ + struct vfio_pci_driver *driver = &device->driver; + + VFIO_ASSERT_NOT_NULL(driver->region.vaddr); + VFIO_CHECK_DRIVER_OP(driver, init); + + driver->ops->init(device); + + driver->initialized = true; + + printf("%s: region: vaddr %p, iova 0x%lx, size 0x%lx\n", + driver->ops->name, + driver->region.vaddr, + driver->region.iova, + driver->region.size); + + printf("%s: max_memcpy_size 0x%lx, max_memcpy_count 0x%lx\n", + driver->ops->name, + driver->max_memcpy_size, + driver->max_memcpy_count); +} + +void 
vfio_pci_driver_remove(struct vfio_pci_device *device) +{ + struct vfio_pci_driver *driver = &device->driver; + + VFIO_CHECK_DRIVER_OP(driver, remove); + + driver->ops->remove(device); + driver->initialized = false; +} + +void vfio_pci_driver_send_msi(struct vfio_pci_device *device) +{ + struct vfio_pci_driver *driver = &device->driver; + + VFIO_CHECK_DRIVER_OP(driver, send_msi); + + driver->ops->send_msi(device); +} + +void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count) +{ + struct vfio_pci_driver *driver = &device->driver; + + VFIO_ASSERT_LE(size, driver->max_memcpy_size); + VFIO_ASSERT_LE(count, driver->max_memcpy_count); + VFIO_CHECK_DRIVER_OP(driver, memcpy_start); + + driver->ops->memcpy_start(device, src, dst, size, count); + driver->memcpy_in_progress = true; +} + +int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device) +{ + struct vfio_pci_driver *driver = &device->driver; + int r; + + VFIO_CHECK_DRIVER_OP(driver, memcpy_wait); + + r = driver->ops->memcpy_wait(device); + driver->memcpy_in_progress = false; + + return r; +} + +int vfio_pci_driver_memcpy(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size) +{ + vfio_pci_driver_memcpy_start(device, src, dst, size, 1); + + return vfio_pci_driver_memcpy_wait(device); +} From fded8da4bc38df6e1475ac5998934c490b96215d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:00 +0000 Subject: [PATCH 17/48] vfio: selftests: Add vfio_pci_driver_test Add a new selftest that tests all driver operations. This test serves both as a demonstration of the driver framework, and also as a correctness test for future drivers.
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-14-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + .../selftests/vfio/vfio_pci_driver_test.c | 233 ++++++++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 tools/testing/selftests/vfio/vfio_pci_driver_test.c diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 05c5a585cca6..ee09c027ade5 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -2,6 +2,7 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test +TEST_GEN_PROGS += vfio_pci_driver_test include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c new file mode 100644 index 000000000000..97ed0ff4636d --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include +#include + +#include + +#include "../kselftest_harness.h" + +static const char *device_bdf; + +#define ASSERT_NO_MSI(_eventfd) do { \ + u64 __value; \ + \ + ASSERT_EQ(-1, read(_eventfd, &__value, 8)); \ + ASSERT_EQ(EAGAIN, errno); \ +} while (0) + +static void region_setup(struct vfio_pci_device *device, + struct vfio_dma_region *region, u64 size) +{ + const int flags = MAP_SHARED | MAP_ANONYMOUS; + const int prot = PROT_READ | PROT_WRITE; + void *vaddr; + + vaddr = mmap(NULL, size, prot, flags, -1, 0); + VFIO_ASSERT_NE(vaddr, MAP_FAILED); + + region->vaddr = vaddr; + region->iova = (u64)vaddr; + region->size = size; + + vfio_pci_dma_map(device, region); +} + +static void region_teardown(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + vfio_pci_dma_unmap(device, region); + 
VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0); +} + +FIXTURE(vfio_pci_driver_test) { + struct vfio_pci_device *device; + struct vfio_dma_region memcpy_region; + void *vaddr; + int msi_fd; + + u64 size; + void *src; + void *dst; + iova_t src_iova; + iova_t dst_iova; + iova_t unmapped_iova; +}; + +FIXTURE_SETUP(vfio_pci_driver_test) +{ + struct vfio_pci_driver *driver; + + self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + + driver = &self->device->driver; + + region_setup(self->device, &self->memcpy_region, SZ_1G); + region_setup(self->device, &driver->region, SZ_2M); + + /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ + self->unmapped_iova = 8UL * SZ_1G; + + vfio_pci_driver_init(self->device); + self->msi_fd = self->device->msi_eventfds[driver->msi]; + + /* + * Use the maximum size supported by the device for memcpy operations, + * slimmed down to fit into the memcpy region (divided by 2 so src and + * dst regions do not overlap). + */ + self->size = self->device->driver.max_memcpy_size; + self->size = min(self->size, self->memcpy_region.size / 2); + + self->src = self->memcpy_region.vaddr; + self->dst = self->src + self->size; + + self->src_iova = to_iova(self->device, self->src); + self->dst_iova = to_iova(self->device, self->dst); +} + +FIXTURE_TEARDOWN(vfio_pci_driver_test) +{ + struct vfio_pci_driver *driver = &self->device->driver; + + vfio_pci_driver_remove(self->device); + + region_teardown(self->device, &self->memcpy_region); + region_teardown(self->device, &driver->region); + + vfio_pci_device_cleanup(self->device); +} + +TEST_F(vfio_pci_driver_test, init_remove) +{ + int i; + + for (i = 0; i < 10; i++) { + vfio_pci_driver_remove(self->device); + vfio_pci_driver_init(self->device); + } +} + +TEST_F(vfio_pci_driver_test, memcpy_success) +{ + fcntl_set_nonblock(self->msi_fd); + + memset(self->src, 'x', self->size); + memset(self->dst, 'y', self->size); + + ASSERT_EQ(0, vfio_pci_driver_memcpy(self->device, + 
self->src_iova, + self->dst_iova, + self->size)); + + ASSERT_EQ(0, memcmp(self->src, self->dst, self->size)); + ASSERT_NO_MSI(self->msi_fd); +} + +TEST_F(vfio_pci_driver_test, memcpy_from_unmapped_iova) +{ + fcntl_set_nonblock(self->msi_fd); + + /* + * Ignore the return value since not all devices will detect and report + * accesses to unmapped IOVAs as errors. + */ + vfio_pci_driver_memcpy(self->device, self->unmapped_iova, + self->dst_iova, self->size); + + ASSERT_NO_MSI(self->msi_fd); +} + +TEST_F(vfio_pci_driver_test, memcpy_to_unmapped_iova) +{ + fcntl_set_nonblock(self->msi_fd); + + /* + * Ignore the return value since not all devices will detect and report + * accesses to unmapped IOVAs as errors. + */ + vfio_pci_driver_memcpy(self->device, self->src_iova, + self->unmapped_iova, self->size); + + ASSERT_NO_MSI(self->msi_fd); +} + +TEST_F(vfio_pci_driver_test, send_msi) +{ + u64 value; + + vfio_pci_driver_send_msi(self->device); + ASSERT_EQ(8, read(self->msi_fd, &value, 8)); + ASSERT_EQ(1, value); +} + +TEST_F(vfio_pci_driver_test, mix_and_match) +{ + u64 value; + int i; + + for (i = 0; i < 10; i++) { + memset(self->src, 'x', self->size); + memset(self->dst, 'y', self->size); + + ASSERT_EQ(0, vfio_pci_driver_memcpy(self->device, + self->src_iova, + self->dst_iova, + self->size)); + + ASSERT_EQ(0, memcmp(self->src, self->dst, self->size)); + + vfio_pci_driver_memcpy(self->device, + self->unmapped_iova, + self->dst_iova, + self->size); + + vfio_pci_driver_send_msi(self->device); + ASSERT_EQ(8, read(self->msi_fd, &value, 8)); + ASSERT_EQ(1, value); + } +} + +TEST_F_TIMEOUT(vfio_pci_driver_test, memcpy_storm, 60) +{ + struct vfio_pci_driver *driver = &self->device->driver; + u64 total_size; + u64 count; + + fcntl_set_nonblock(self->msi_fd); + + /* + * Perform up to 250GiB worth of DMA reads and writes across several + * memcpy operations. Some devices can support even more but the test + * will take too long. 
+ */ + total_size = 250UL * SZ_1G; + count = min(total_size / self->size, driver->max_memcpy_count); + + printf("Kicking off %lu memcpys of size 0x%lx\n", count, self->size); + vfio_pci_driver_memcpy_start(self->device, + self->src_iova, + self->dst_iova, + self->size, count); + + ASSERT_EQ(0, vfio_pci_driver_memcpy_wait(self->device)); + ASSERT_NO_MSI(self->msi_fd); +} + +int main(int argc, char *argv[]) +{ + struct vfio_pci_device *device; + + device_bdf = vfio_selftests_get_bdf(&argc, argv); + + device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + if (!device->driver.ops) { + fprintf(stderr, "No driver found for device %s\n", device_bdf); + return KSFT_SKIP; + } + vfio_pci_device_cleanup(device); + + return test_harness_run(argc, argv); +} From 9bf9b185e3ce76c9fddb4c6edb0ec3334b7649df Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:01 +0000 Subject: [PATCH 18/48] tools headers: Add stub definition for __iomem Add an empty definition for __iomem so that kernel headers that use __iomem can be imported into tools/include/ with less modifications. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-15-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/include/linux/compiler.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index 33411ca0cc90..f40bd2b04c29 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -138,6 +138,10 @@ # define __force #endif +#ifndef __iomem +# define __iomem +#endif + #ifndef __weak # define __weak __attribute__((weak)) #endif From 1f9c8edd6a7e9b0fd914cfeef8ce075307e8e702 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:02 +0000 Subject: [PATCH 19/48] tools headers: Import asm-generic MMIO helpers Import the asm-generic MMIO helper functions from the kernel headers into tools/include/. 
The top-level include is <linux/io.h> which then includes the arch-specific <asm/io.h>, which then includes <asm-generic/io.h>. This layout is chosen to match the kernel header layout and to appease checkpatch.pl (which warns against including <asm/io.h> or <asm-generic/io.h> directly). Changes made when importing: - Add missing includes at the top. - Stub out mmiowb_set_pending(). - Stub out _THIS_IP_. - Stub out log_*_mmio() calls. - Drop the CONFIG_64BIT checks, since tools/include/linux/types.h always defines u64. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-16-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/include/asm-generic/io.h | 482 +++++++++++++++++++++++++++++++++ tools/include/asm/io.h | 7 + tools/include/linux/io.h | 4 +- 3 files changed, 492 insertions(+), 1 deletion(-) create mode 100644 tools/include/asm-generic/io.h create mode 100644 tools/include/asm/io.h diff --git a/tools/include/asm-generic/io.h b/tools/include/asm-generic/io.h new file mode 100644 index 000000000000..e5a0b07ad452 --- /dev/null +++ b/tools/include/asm-generic/io.h @@ -0,0 +1,482 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_ASM_GENERIC_IO_H +#define _TOOLS_ASM_GENERIC_IO_H + +#include +#include + +#include +#include +#include + +#ifndef mmiowb_set_pending +#define mmiowb_set_pending() do { } while (0) +#endif + +#ifndef __io_br +#define __io_br() barrier() +#endif + +/* prevent prefetching of coherent DMA data ahead of a dma-complete */ +#ifndef __io_ar +#ifdef rmb +#define __io_ar(v) rmb() +#else +#define __io_ar(v) barrier() +#endif +#endif + +/* flush writes to coherent DMA data before possibly triggering a DMA read */ +#ifndef __io_bw +#ifdef wmb +#define __io_bw() wmb() +#else +#define __io_bw() barrier() +#endif +#endif + +/* serialize device access against a spin_unlock, usually handled there.
*/ +#ifndef __io_aw +#define __io_aw() mmiowb_set_pending() +#endif + +#ifndef __io_pbw +#define __io_pbw() __io_bw() +#endif + +#ifndef __io_paw +#define __io_paw() __io_aw() +#endif + +#ifndef __io_pbr +#define __io_pbr() __io_br() +#endif + +#ifndef __io_par +#define __io_par(v) __io_ar(v) +#endif + +#ifndef _THIS_IP_ +#define _THIS_IP_ 0 +#endif + +static inline void log_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr, unsigned long caller_addr0) {} +static inline void log_post_write_mmio(u64 val, u8 width, volatile void __iomem *addr, + unsigned long caller_addr, unsigned long caller_addr0) {} +static inline void log_read_mmio(u8 width, const volatile void __iomem *addr, + unsigned long caller_addr, unsigned long caller_addr0) {} +static inline void log_post_read_mmio(u64 val, u8 width, const volatile void __iomem *addr, + unsigned long caller_addr, unsigned long caller_addr0) {} + +/* + * __raw_{read,write}{b,w,l,q}() access memory in native endianness. + * + * On some architectures memory mapped IO needs to be accessed differently. + * On the simple architectures, we just read/write the memory location + * directly. 
+ */ + +#ifndef __raw_readb +#define __raw_readb __raw_readb +static inline u8 __raw_readb(const volatile void __iomem *addr) +{ + return *(const volatile u8 __force *)addr; +} +#endif + +#ifndef __raw_readw +#define __raw_readw __raw_readw +static inline u16 __raw_readw(const volatile void __iomem *addr) +{ + return *(const volatile u16 __force *)addr; +} +#endif + +#ifndef __raw_readl +#define __raw_readl __raw_readl +static inline u32 __raw_readl(const volatile void __iomem *addr) +{ + return *(const volatile u32 __force *)addr; +} +#endif + +#ifndef __raw_readq +#define __raw_readq __raw_readq +static inline u64 __raw_readq(const volatile void __iomem *addr) +{ + return *(const volatile u64 __force *)addr; +} +#endif + +#ifndef __raw_writeb +#define __raw_writeb __raw_writeb +static inline void __raw_writeb(u8 value, volatile void __iomem *addr) +{ + *(volatile u8 __force *)addr = value; +} +#endif + +#ifndef __raw_writew +#define __raw_writew __raw_writew +static inline void __raw_writew(u16 value, volatile void __iomem *addr) +{ + *(volatile u16 __force *)addr = value; +} +#endif + +#ifndef __raw_writel +#define __raw_writel __raw_writel +static inline void __raw_writel(u32 value, volatile void __iomem *addr) +{ + *(volatile u32 __force *)addr = value; +} +#endif + +#ifndef __raw_writeq +#define __raw_writeq __raw_writeq +static inline void __raw_writeq(u64 value, volatile void __iomem *addr) +{ + *(volatile u64 __force *)addr = value; +} +#endif + +/* + * {read,write}{b,w,l,q}() access little endian memory and return result in + * native endianness. 
+ */ + +#ifndef readb +#define readb readb +static inline u8 readb(const volatile void __iomem *addr) +{ + u8 val; + + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); + __io_br(); + val = __raw_readb(addr); + __io_ar(val); + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef readw +#define readw readw +static inline u16 readw(const volatile void __iomem *addr) +{ + u16 val; + + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); + __io_br(); + val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); + __io_ar(val); + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef readl +#define readl readl +static inline u32 readl(const volatile void __iomem *addr) +{ + u32 val; + + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); + __io_br(); + val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); + __io_ar(val); + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef readq +#define readq readq +static inline u64 readq(const volatile void __iomem *addr) +{ + u64 val; + + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); + __io_br(); + val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); + __io_ar(val); + log_post_read_mmio(val, 64, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef writeb +#define writeb writeb +static inline void writeb(u8 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + __io_bw(); + __raw_writeb(value, addr); + __io_aw(); + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#ifndef writew +#define writew writew +static inline void writew(u16 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + __io_bw(); + __raw_writew((u16 __force)cpu_to_le16(value), addr); + __io_aw(); + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#ifndef writel +#define writel writel +static inline void writel(u32 value, volatile 
void __iomem *addr) +{ + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + __io_bw(); + __raw_writel((u32 __force)__cpu_to_le32(value), addr); + __io_aw(); + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#ifndef writeq +#define writeq writeq +static inline void writeq(u64 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + __io_bw(); + __raw_writeq((u64 __force)__cpu_to_le64(value), addr); + __io_aw(); + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); +} +#endif + +/* + * {read,write}{b,w,l,q}_relaxed() are like the regular version, but + * are not guaranteed to provide ordering against spinlocks or memory + * accesses. + */ +#ifndef readb_relaxed +#define readb_relaxed readb_relaxed +static inline u8 readb_relaxed(const volatile void __iomem *addr) +{ + u8 val; + + log_read_mmio(8, addr, _THIS_IP_, _RET_IP_); + val = __raw_readb(addr); + log_post_read_mmio(val, 8, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef readw_relaxed +#define readw_relaxed readw_relaxed +static inline u16 readw_relaxed(const volatile void __iomem *addr) +{ + u16 val; + + log_read_mmio(16, addr, _THIS_IP_, _RET_IP_); + val = __le16_to_cpu((__le16 __force)__raw_readw(addr)); + log_post_read_mmio(val, 16, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef readl_relaxed +#define readl_relaxed readl_relaxed +static inline u32 readl_relaxed(const volatile void __iomem *addr) +{ + u32 val; + + log_read_mmio(32, addr, _THIS_IP_, _RET_IP_); + val = __le32_to_cpu((__le32 __force)__raw_readl(addr)); + log_post_read_mmio(val, 32, addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#if defined(readq) && !defined(readq_relaxed) +#define readq_relaxed readq_relaxed +static inline u64 readq_relaxed(const volatile void __iomem *addr) +{ + u64 val; + + log_read_mmio(64, addr, _THIS_IP_, _RET_IP_); + val = __le64_to_cpu((__le64 __force)__raw_readq(addr)); + log_post_read_mmio(val, 64, 
addr, _THIS_IP_, _RET_IP_); + return val; +} +#endif + +#ifndef writeb_relaxed +#define writeb_relaxed writeb_relaxed +static inline void writeb_relaxed(u8 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); + __raw_writeb(value, addr); + log_post_write_mmio(value, 8, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#ifndef writew_relaxed +#define writew_relaxed writew_relaxed +static inline void writew_relaxed(u16 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); + __raw_writew((u16 __force)cpu_to_le16(value), addr); + log_post_write_mmio(value, 16, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#ifndef writel_relaxed +#define writel_relaxed writel_relaxed +static inline void writel_relaxed(u32 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); + __raw_writel((u32 __force)__cpu_to_le32(value), addr); + log_post_write_mmio(value, 32, addr, _THIS_IP_, _RET_IP_); +} +#endif + +#if defined(writeq) && !defined(writeq_relaxed) +#define writeq_relaxed writeq_relaxed +static inline void writeq_relaxed(u64 value, volatile void __iomem *addr) +{ + log_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); + __raw_writeq((u64 __force)__cpu_to_le64(value), addr); + log_post_write_mmio(value, 64, addr, _THIS_IP_, _RET_IP_); +} +#endif + +/* + * {read,write}s{b,w,l,q}() repeatedly access the same memory address in + * native endianness in 8-, 16-, 32- or 64-bit chunks (@count times). 
+ */ +#ifndef readsb +#define readsb readsb +static inline void readsb(const volatile void __iomem *addr, void *buffer, + unsigned int count) +{ + if (count) { + u8 *buf = buffer; + + do { + u8 x = __raw_readb(addr); + *buf++ = x; + } while (--count); + } +} +#endif + +#ifndef readsw +#define readsw readsw +static inline void readsw(const volatile void __iomem *addr, void *buffer, + unsigned int count) +{ + if (count) { + u16 *buf = buffer; + + do { + u16 x = __raw_readw(addr); + *buf++ = x; + } while (--count); + } +} +#endif + +#ifndef readsl +#define readsl readsl +static inline void readsl(const volatile void __iomem *addr, void *buffer, + unsigned int count) +{ + if (count) { + u32 *buf = buffer; + + do { + u32 x = __raw_readl(addr); + *buf++ = x; + } while (--count); + } +} +#endif + +#ifndef readsq +#define readsq readsq +static inline void readsq(const volatile void __iomem *addr, void *buffer, + unsigned int count) +{ + if (count) { + u64 *buf = buffer; + + do { + u64 x = __raw_readq(addr); + *buf++ = x; + } while (--count); + } +} +#endif + +#ifndef writesb +#define writesb writesb +static inline void writesb(volatile void __iomem *addr, const void *buffer, + unsigned int count) +{ + if (count) { + const u8 *buf = buffer; + + do { + __raw_writeb(*buf++, addr); + } while (--count); + } +} +#endif + +#ifndef writesw +#define writesw writesw +static inline void writesw(volatile void __iomem *addr, const void *buffer, + unsigned int count) +{ + if (count) { + const u16 *buf = buffer; + + do { + __raw_writew(*buf++, addr); + } while (--count); + } +} +#endif + +#ifndef writesl +#define writesl writesl +static inline void writesl(volatile void __iomem *addr, const void *buffer, + unsigned int count) +{ + if (count) { + const u32 *buf = buffer; + + do { + __raw_writel(*buf++, addr); + } while (--count); + } +} +#endif + +#ifndef writesq +#define writesq writesq +static inline void writesq(volatile void __iomem *addr, const void *buffer, + unsigned int count) +{ 
+ if (count) { + const u64 *buf = buffer; + + do { + __raw_writeq(*buf++, addr); + } while (--count); + } +} +#endif + +#endif /* _TOOLS_ASM_GENERIC_IO_H */ diff --git a/tools/include/asm/io.h b/tools/include/asm/io.h new file mode 100644 index 000000000000..9ae219b12604 --- /dev/null +++ b/tools/include/asm/io.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_ASM_IO_H +#define _TOOLS_ASM_IO_H + +#include + +#endif /* _TOOLS_ASM_IO_H */ diff --git a/tools/include/linux/io.h b/tools/include/linux/io.h index e129871fe661..4b94b84160b8 100644 --- a/tools/include/linux/io.h +++ b/tools/include/linux/io.h @@ -2,4 +2,6 @@ #ifndef _TOOLS_IO_H #define _TOOLS_IO_H -#endif +#include + +#endif /* _TOOLS_IO_H */ From ce5dc9aa72d9c3d6cb14b3a6aab900124999d8d0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:03 +0000 Subject: [PATCH 20/48] tools headers: Import x86 MMIO helper overrides Import the x86-specific overrides for <asm-generic/io.h> from the kernel headers into tools/include/. Changes made when importing: - Replace CONFIG_X86_64 with __x86_64__.
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-17-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/arch/x86/include/asm/io.h | 75 +++++++++++++++++++++++++++++++++ tools/include/asm/io.h | 4 ++ 2 files changed, 79 insertions(+) create mode 100644 tools/arch/x86/include/asm/io.h diff --git a/tools/arch/x86/include/asm/io.h b/tools/arch/x86/include/asm/io.h new file mode 100644 index 000000000000..4c787a2363de --- /dev/null +++ b/tools/arch/x86/include/asm/io.h @@ -0,0 +1,75 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_ASM_X86_IO_H +#define _TOOLS_ASM_X86_IO_H + +#include +#include + +#define build_mmio_read(name, size, type, reg, barrier) \ +static inline type name(const volatile void __iomem *addr) \ +{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ +:"m" (*(volatile type __force *)addr) barrier); return ret; } + +#define build_mmio_write(name, size, type, reg, barrier) \ +static inline void name(type val, volatile void __iomem *addr) \ +{ asm volatile("mov" size " %0,%1": :reg (val), \ +"m" (*(volatile type __force *)addr) barrier); } + +build_mmio_read(readb, "b", unsigned char, "=q", :"memory") +build_mmio_read(readw, "w", unsigned short, "=r", :"memory") +build_mmio_read(readl, "l", unsigned int, "=r", :"memory") + +build_mmio_read(__readb, "b", unsigned char, "=q", ) +build_mmio_read(__readw, "w", unsigned short, "=r", ) +build_mmio_read(__readl, "l", unsigned int, "=r", ) + +build_mmio_write(writeb, "b", unsigned char, "q", :"memory") +build_mmio_write(writew, "w", unsigned short, "r", :"memory") +build_mmio_write(writel, "l", unsigned int, "r", :"memory") + +build_mmio_write(__writeb, "b", unsigned char, "q", ) +build_mmio_write(__writew, "w", unsigned short, "r", ) +build_mmio_write(__writel, "l", unsigned int, "r", ) + +#define readb readb +#define readw readw +#define readl readl +#define readb_relaxed(a) __readb(a) +#define readw_relaxed(a) __readw(a) +#define 
readl_relaxed(a) __readl(a) +#define __raw_readb __readb +#define __raw_readw __readw +#define __raw_readl __readl + +#define writeb writeb +#define writew writew +#define writel writel +#define writeb_relaxed(v, a) __writeb(v, a) +#define writew_relaxed(v, a) __writew(v, a) +#define writel_relaxed(v, a) __writel(v, a) +#define __raw_writeb __writeb +#define __raw_writew __writew +#define __raw_writel __writel + +#ifdef __x86_64__ + +build_mmio_read(readq, "q", u64, "=r", :"memory") +build_mmio_read(__readq, "q", u64, "=r", ) +build_mmio_write(writeq, "q", u64, "r", :"memory") +build_mmio_write(__writeq, "q", u64, "r", ) + +#define readq_relaxed(a) __readq(a) +#define writeq_relaxed(v, a) __writeq(v, a) + +#define __raw_readq __readq +#define __raw_writeq __writeq + +/* Let people know that we have them */ +#define readq readq +#define writeq writeq + +#endif /* __x86_64__ */ + +#include + +#endif /* _TOOLS_ASM_X86_IO_H */ diff --git a/tools/include/asm/io.h b/tools/include/asm/io.h index 9ae219b12604..eed5066f25c4 100644 --- a/tools/include/asm/io.h +++ b/tools/include/asm/io.h @@ -2,6 +2,10 @@ #ifndef _TOOLS_ASM_IO_H #define _TOOLS_ASM_IO_H +#if defined(__i386__) || defined(__x86_64__) +#include "../../arch/x86/include/asm/io.h" +#else #include +#endif #endif /* _TOOLS_ASM_IO_H */ From dc0e216cf00b74cf61fdc882f8373beb8cdbec5e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:04 +0000 Subject: [PATCH 21/48] tools headers: Add symlink to linux/pci_ids.h Add a symlink to include/linux/pci_ids.h to tools/include/. This will be used by VFIO selftests in subsequent commits to match device and vendor IDs. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-18-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) create mode 120000 tools/include/linux/pci_ids.h diff --git a/tools/include/linux/pci_ids.h b/tools/include/linux/pci_ids.h new file mode 120000 index 000000000000..1c9e88f41261 --- /dev/null +++ b/tools/include/linux/pci_ids.h @@ -0,0 +1 @@ +../../../include/linux/pci_ids.h \ No newline at end of file From b7f086912c1d3ce1e8c2753e0ff329947219d0f6 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:05 +0000 Subject: [PATCH 22/48] dmaengine: ioat: Move system_has_dca_enabled() to dma.h Move the function prototype for system_has_dca_enabled() from hw.h to dma.h. This allows hw.h to be included from tools/, which will be used in a subsequent commit to implement a userspace driver for Intel CBDMA devices in tools/testing/selftests/vfio. No functional change intended. 
Acked-by: Dave Jiang Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-19-dmatlack@google.com Signed-off-by: Alex Williamson --- drivers/dma/ioat/dma.h | 2 ++ drivers/dma/ioat/hw.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index a180171087a8..12a4a4860a74 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -19,6 +19,8 @@ #define IOAT_DMA_DCA_ANY_CPU ~0 +int system_has_dca_enabled(struct pci_dev *pdev); + #define to_ioatdma_device(dev) container_of(dev, struct ioatdma_device, dma_dev) #define to_dev(ioat_chan) (&(ioat_chan)->ioat_dma->pdev->dev) #define to_pdev(ioat_chan) ((ioat_chan)->ioat_dma->pdev) diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h index 79e4e4c09c18..0373c48520c9 100644 --- a/drivers/dma/ioat/hw.h +++ b/drivers/dma/ioat/hw.h @@ -63,9 +63,6 @@ #define IOAT_VER_3_3 0x33 /* Version 3.3 */ #define IOAT_VER_3_4 0x34 /* Version 3.4 */ - -int system_has_dca_enabled(struct pci_dev *pdev); - #define IOAT_DESC_SZ 64 struct ioat_dma_descriptor { From 2223587df5c5e935cc6f973f62a9608eef81bec8 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:06 +0000 Subject: [PATCH 23/48] vfio: selftests: Add driver for Intel CBDMA Add a driver for the Intel CBDMA device. This driver is based on and named after the Linux driver for this device (drivers/dma/ioat/) and also based on previous work from Peter Shier . The driver aims to be as simple as possible. It uses a single descriptor to issue DMA operations, and only supports the copy operation. For "DMA storms", the driver kicks off the maximum number of maximum-sized DMA operations. On Skylake server parts, this was 2^16-1 copies of size 2M and lasts about 15 seconds. Create symlinks to drivers/dma/ioat/{hw.h,registers.h} to get access to various macros (e.g. IOAT_CHANCMD_RESET) and struct ioat_dma_descriptor. 
Cc: Dave Jiang Cc: Dan Williams Acked-by: Dave Jiang Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-20-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/drivers/ioat/hw.h | 1 + .../selftests/vfio/lib/drivers/ioat/ioat.c | 235 ++++++++++++++++++ .../vfio/lib/drivers/ioat/registers.h | 1 + tools/testing/selftests/vfio/lib/libvfio.mk | 7 + .../selftests/vfio/lib/vfio_pci_driver.c | 10 +- 5 files changed, 253 insertions(+), 1 deletion(-) create mode 120000 tools/testing/selftests/vfio/lib/drivers/ioat/hw.h create mode 100644 tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c create mode 120000 tools/testing/selftests/vfio/lib/drivers/ioat/registers.h diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/hw.h b/tools/testing/selftests/vfio/lib/drivers/ioat/hw.h new file mode 120000 index 000000000000..8ab52ddd4458 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/hw.h @@ -0,0 +1 @@ +../../../../../../../drivers/dma/ioat/hw.h \ No newline at end of file diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c new file mode 100644 index 000000000000..c3b91d9b1f59 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include +#include +#include +#include + +#include + +#include "hw.h" +#include "registers.h" + +#define IOAT_DMACOUNT_MAX UINT16_MAX + +struct ioat_state { + /* Single descriptor used to issue DMA memcpy operations */ + struct ioat_dma_descriptor desc; + + /* Copy buffers used by ioat_send_msi() to generate an interrupt. 
*/ + u64 send_msi_src; + u64 send_msi_dst; +}; + +static inline struct ioat_state *to_ioat_state(struct vfio_pci_device *device) +{ + return device->driver.region.vaddr; +} + +static inline void *ioat_channel_registers(struct vfio_pci_device *device) +{ + return device->bars[0].vaddr + IOAT_CHANNEL_MMIO_SIZE; +} + +static int ioat_probe(struct vfio_pci_device *device) +{ + u8 version; + int r; + + if (!vfio_pci_device_match(device, PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_IOAT_SKX)) + return -EINVAL; + + VFIO_ASSERT_NOT_NULL(device->bars[0].vaddr); + + version = readb(device->bars[0].vaddr + IOAT_VER_OFFSET); + switch (version) { + case IOAT_VER_3_2: + case IOAT_VER_3_3: + r = 0; + break; + default: + printf("ioat: Unsupported version: 0x%x\n", version); + r = -EINVAL; + } + return r; +} + +static u64 ioat_channel_status(void *bar) +{ + return readq(bar + IOAT_CHANSTS_OFFSET) & IOAT_CHANSTS_STATUS; +} + +static void ioat_clear_errors(struct vfio_pci_device *device) +{ + void *registers = ioat_channel_registers(device); + u32 errors; + + errors = vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET); + vfio_pci_config_writel(device, IOAT_PCI_CHANERR_INT_OFFSET, errors); + + errors = vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET); + vfio_pci_config_writel(device, IOAT_PCI_DMAUNCERRSTS_OFFSET, errors); + + errors = readl(registers + IOAT_CHANERR_OFFSET); + writel(errors, registers + IOAT_CHANERR_OFFSET); +} + +static void ioat_reset(struct vfio_pci_device *device) +{ + void *registers = ioat_channel_registers(device); + u32 sleep_ms = 1, attempts = 5000 / sleep_ms; + u8 chancmd; + + ioat_clear_errors(device); + + writeb(IOAT_CHANCMD_RESET, registers + IOAT2_CHANCMD_OFFSET); + + for (;;) { + chancmd = readb(registers + IOAT2_CHANCMD_OFFSET); + if (!(chancmd & IOAT_CHANCMD_RESET)) + break; + + VFIO_ASSERT_GT(--attempts, 0); + usleep(sleep_ms * 1000); + } + + VFIO_ASSERT_EQ(ioat_channel_status(registers), IOAT_CHANSTS_HALTED); +} + +static void 
ioat_init(struct vfio_pci_device *device) +{ + struct ioat_state *ioat = to_ioat_state(device); + u8 intrctrl; + + VFIO_ASSERT_GE(device->driver.region.size, sizeof(*ioat)); + + vfio_pci_config_writew(device, PCI_COMMAND, + PCI_COMMAND_MEMORY | + PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + + ioat_reset(device); + + /* Enable the use of MSI-X interrupts for channel interrupts. */ + intrctrl = IOAT_INTRCTRL_MSIX_VECTOR_CONTROL; + writeb(intrctrl, device->bars[0].vaddr + IOAT_INTRCTRL_OFFSET); + + vfio_pci_msix_enable(device, 0, device->msix_info.count); + + device->driver.msi = 0; + device->driver.max_memcpy_size = + 1UL << readb(device->bars[0].vaddr + IOAT_XFERCAP_OFFSET); + device->driver.max_memcpy_count = IOAT_DMACOUNT_MAX; +} + +static void ioat_remove(struct vfio_pci_device *device) +{ + ioat_reset(device); + vfio_pci_msix_disable(device); +} + +static void ioat_handle_error(struct vfio_pci_device *device) +{ + void *registers = ioat_channel_registers(device); + + printf("Error detected during memcpy operation!\n" + " CHANERR: 0x%x\n" + " CHANERR_INT: 0x%x\n" + " DMAUNCERRSTS: 0x%x\n", + readl(registers + IOAT_CHANERR_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); + + ioat_reset(device); +} + +static int ioat_memcpy_wait(struct vfio_pci_device *device) +{ + void *registers = ioat_channel_registers(device); + u64 status; + int r = 0; + + /* Wait until all operations complete. */ + for (;;) { + status = ioat_channel_status(registers); + if (status == IOAT_CHANSTS_DONE) + break; + + if (status == IOAT_CHANSTS_HALTED) { + ioat_handle_error(device); + return -1; + } + } + + /* Put the channel into the SUSPENDED state. 
*/ + writeb(IOAT_CHANCMD_SUSPEND, registers + IOAT2_CHANCMD_OFFSET); + for (;;) { + status = ioat_channel_status(registers); + if (status == IOAT_CHANSTS_SUSPENDED) + break; + } + + return r; +} + +static void __ioat_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u16 count, bool interrupt) +{ + void *registers = ioat_channel_registers(device); + struct ioat_state *ioat = to_ioat_state(device); + u64 desc_iova; + u16 chanctrl; + + desc_iova = to_iova(device, &ioat->desc); + ioat->desc = (struct ioat_dma_descriptor) { + .ctl_f.op = IOAT_OP_COPY, + .ctl_f.int_en = interrupt, + .src_addr = src, + .dst_addr = dst, + .size = size, + .next = desc_iova, + }; + + /* Tell the device the address of the descriptor. */ + writeq(desc_iova, registers + IOAT2_CHAINADDR_OFFSET); + + /* (Re)Enable the channel interrupt and abort on any errors */ + chanctrl = IOAT_CHANCTRL_INT_REARM | IOAT_CHANCTRL_ANY_ERR_ABORT_EN; + writew(chanctrl, registers + IOAT_CHANCTRL_OFFSET); + + /* Kick off @count DMA copy operation(s). 
*/ + writew(count, registers + IOAT_CHAN_DMACOUNT_OFFSET); +} + +static void ioat_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count) +{ + __ioat_memcpy_start(device, src, dst, size, count, false); +} + +static void ioat_send_msi(struct vfio_pci_device *device) +{ + struct ioat_state *ioat = to_ioat_state(device); + + __ioat_memcpy_start(device, + to_iova(device, &ioat->send_msi_src), + to_iova(device, &ioat->send_msi_dst), + sizeof(ioat->send_msi_src), 1, true); + + VFIO_ASSERT_EQ(ioat_memcpy_wait(device), 0); +} + +const struct vfio_pci_driver_ops ioat_ops = { + .name = "ioat", + .probe = ioat_probe, + .init = ioat_init, + .remove = ioat_remove, + .memcpy_start = ioat_memcpy_start, + .memcpy_wait = ioat_memcpy_wait, + .send_msi = ioat_send_msi, +}; diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/registers.h b/tools/testing/selftests/vfio/lib/drivers/ioat/registers.h new file mode 120000 index 000000000000..0b809cfd8fe6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/registers.h @@ -0,0 +1 @@ +../../../../../../../drivers/dma/ioat/registers.h \ No newline at end of file diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index a3c3bc9a7c00..624dc267a879 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -1,8 +1,15 @@ +include $(top_srcdir)/scripts/subarch.include +ARCH ?= $(SUBARCH) + VFIO_DIR := $(selfdir)/vfio LIBVFIO_C := lib/vfio_pci_device.c LIBVFIO_C += lib/vfio_pci_driver.c +ifeq ($(ARCH:x86_64=x86),x86) +LIBVFIO_C += lib/drivers/ioat/ioat.c +endif + LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index c98bd2d31d8a..aa47360e47a9 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ 
b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -4,7 +4,15 @@ #include "../../../kselftest.h" #include -static struct vfio_pci_driver_ops *driver_ops[] = {}; +#ifdef __x86_64__ +extern struct vfio_pci_driver_ops ioat_ops; +#endif + +static struct vfio_pci_driver_ops *driver_ops[] = { +#ifdef __x86_64__ + &ioat_ops, +#endif +}; void vfio_pci_driver_probe(struct vfio_pci_device *device) { From 3fe305773bbeb2d946b9fb3c1224c8b00d6755ef Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:07 +0000 Subject: [PATCH 24/48] tools headers: Import iosubmit_cmds512() Import iosubmit_cmds512() from arch/x86/include/asm/io.h into tools/ so it can be used by VFIO selftests to interact with Intel DSA devices. Also pull in movdir64b() from arch/x86/include/asm/special_insns.h into tools/, which is the underlying instruction used by iosubmit_cmds512(). Changes made when importing: None Acked-by: Vinicius Costa Gomes Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-21-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/arch/x86/include/asm/io.h | 26 +++++++++++++++++++++ tools/arch/x86/include/asm/special_insns.h | 27 ++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 tools/arch/x86/include/asm/special_insns.h diff --git a/tools/arch/x86/include/asm/io.h b/tools/arch/x86/include/asm/io.h index 4c787a2363de..ecad61a3ea52 100644 --- a/tools/arch/x86/include/asm/io.h +++ b/tools/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #include #include +#include "special_insns.h" #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -72,4 +73,29 @@ build_mmio_write(__writeq, "q", u64, "r", ) #include +/** + * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units + * @dst: destination, in MMIO space (must be 512-bit aligned) + * @src: source + * @count: number of 512 bits quantities to submit + * + * 
Submit data from kernel space to MMIO space, in units of 512 bits at a + * time. Order of access is not guaranteed, nor is a memory barrier + * performed afterwards. + * + * Warning: Do not use this helper unless your driver has checked that the CPU + * instruction is supported on the platform. + */ +static inline void iosubmit_cmds512(void __iomem *dst, const void *src, + size_t count) +{ + const u8 *from = src; + const u8 *end = from + count * 64; + + while (from < end) { + movdir64b(dst, from); + from += 64; + } +} + #endif /* _TOOLS_ASM_X86_IO_H */ diff --git a/tools/arch/x86/include/asm/special_insns.h b/tools/arch/x86/include/asm/special_insns.h new file mode 100644 index 000000000000..04af42a99c38 --- /dev/null +++ b/tools/arch/x86/include/asm/special_insns.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TOOLS_ASM_X86_SPECIAL_INSNS_H +#define _TOOLS_ASM_X86_SPECIAL_INSNS_H + +/* The dst parameter must be 64-bytes aligned */ +static inline void movdir64b(void *dst, const void *src) +{ + const struct { char _[64]; } *__src = src; + struct { char _[64]; } *__dst = dst; + + /* + * MOVDIR64B %(rdx), rax. + * + * Both __src and __dst must be memory constraints in order to tell the + * compiler that no other memory accesses should be reordered around + * this one. + * + * Also, both must be supplied as lvalues because this tells + * the compiler what the object is (its size) the instruction accesses. + * I.e., not the pointers but what they point to, thus the deref'ing '*'. 
+ */ + asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" + : "+m" (*__dst) + : "m" (*__src), "a" (__dst), "d" (__src)); +} + +#endif /* _TOOLS_ASM_X86_SPECIAL_INSNS_H */ From 003e6faf2c8ff1670c35622f11ff6211be563a9e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:08 +0000 Subject: [PATCH 25/48] dmaengine: idxd: Allow registers.h to be included from tools/ Allow drivers/dma/idxd/registers.h to be included from userspace in tools/ by adjusting the include path to uapi/linux/idxd.h if __KERNEL__ is not defined. A subsequent commit will use registers.h to implement a userspace driver for Intel DSA devices in tools/testing/selftests/vfio. Acked-by: Vinicius Costa Gomes Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-22-dmatlack@google.com Signed-off-by: Alex Williamson --- drivers/dma/idxd/registers.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h index 9c1c546fe443..02bab136385e 100644 --- a/drivers/dma/idxd/registers.h +++ b/drivers/dma/idxd/registers.h @@ -3,7 +3,11 @@ #ifndef _IDXD_REGISTERS_H_ #define _IDXD_REGISTERS_H_ +#ifdef __KERNEL__ #include +#else +#include +#endif /* PCI Config */ #define PCI_DEVICE_ID_INTEL_DSA_GNRD 0x11fb From 35b05bd96204bc40a2ca8620bad1a5d139c20060 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:09 +0000 Subject: [PATCH 26/48] vfio: selftests: Add driver for Intel DSA Add a driver to VFIO selftests for Intel DSA devices. For now the driver only supports up to 32 batches and 1024 copies per batch, which were the limits of the hardware this commit was tested with. This is sufficient to generate 9+ minutes of DMA memcpys at a rate of over 30 GB/s. This should be plenty to stress test VFIO and the IOMMU. The driver does not yet support requesting interrupt handles, as this commit was not tested against hardware that requires it. 
Cc: Vinicius Costa Gomes Cc: Dave Jiang Acked-by: Vinicius Costa Gomes Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-23-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/drivers/dsa/dsa.c | 416 ++++++++++++++++++ .../vfio/lib/drivers/dsa/registers.h | 1 + tools/testing/selftests/vfio/lib/libvfio.mk | 1 + .../selftests/vfio/lib/vfio_pci_driver.c | 2 + 4 files changed, 420 insertions(+) create mode 100644 tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c create mode 120000 tools/testing/selftests/vfio/lib/drivers/dsa/registers.h diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c new file mode 100644 index 000000000000..0ca2cbc2a316 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -0,0 +1,416 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "registers.h" + +/* Vectors 1+ are available for work queue completion interrupts. */ +#define MSIX_VECTOR 1 + +struct dsa_state { + /* Descriptors for copy and batch operations. */ + struct dsa_hw_desc batch[32]; + struct dsa_hw_desc copy[1024]; + + /* Completion records for copy and batch operations. */ + struct dsa_completion_record copy_completion; + struct dsa_completion_record batch_completion; + + /* Cached device registers (and derived data) for easy access */ + union gen_cap_reg gen_cap; + union wq_cap_reg wq_cap; + union group_cap_reg group_cap; + union engine_cap_reg engine_cap; + union offsets_reg table_offsets; + void *wqcfg_table; + void *grpcfg_table; + u64 max_batches; + u64 max_copies_per_batch; + + /* The number of ongoing memcpy operations. 
*/ + u64 memcpy_count; + + /* Buffers used by dsa_send_msi() to generate an interrupt */ + u64 send_msi_src; + u64 send_msi_dst; +}; + +static inline struct dsa_state *to_dsa_state(struct vfio_pci_device *device) +{ + return device->driver.region.vaddr; +} + +static bool dsa_int_handle_request_required(struct vfio_pci_device *device) +{ + void *bar0 = device->bars[0].vaddr; + union gen_cap_reg gen_cap; + u32 cmd_cap; + + gen_cap.bits = readq(bar0 + IDXD_GENCAP_OFFSET); + if (!gen_cap.cmd_cap) + return false; + + cmd_cap = readl(bar0 + IDXD_CMDCAP_OFFSET); + return (cmd_cap >> IDXD_CMD_REQUEST_INT_HANDLE) & 1; +} + +static int dsa_probe(struct vfio_pci_device *device) +{ + if (!vfio_pci_device_match(device, PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_DSA_SPR0)) + return -EINVAL; + + if (dsa_int_handle_request_required(device)) { + printf("Device requires requesting interrupt handles\n"); + return -EINVAL; + } + + return 0; +} + +static void dsa_check_sw_err(struct vfio_pci_device *device) +{ + void *reg = device->bars[0].vaddr + IDXD_SWERR_OFFSET; + union sw_err_reg err = {}; + int i; + + for (i = 0; i < ARRAY_SIZE(err.bits); i++) { + err.bits[i] = readq(reg + offsetof(union sw_err_reg, bits[i])); + + /* No errors */ + if (i == 0 && !err.valid) + return; + } + + fprintf(stderr, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + err.bits[0], err.bits[1], err.bits[2], err.bits[3]); + + fprintf(stderr, " valid: 0x%x\n", err.valid); + fprintf(stderr, " overflow: 0x%x\n", err.overflow); + fprintf(stderr, " desc_valid: 0x%x\n", err.desc_valid); + fprintf(stderr, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); + fprintf(stderr, " batch: 0x%x\n", err.batch); + fprintf(stderr, " fault_rw: 0x%x\n", err.fault_rw); + fprintf(stderr, " priv: 0x%x\n", err.priv); + fprintf(stderr, " error: 0x%x\n", err.error); + fprintf(stderr, " wq_idx: 0x%x\n", err.wq_idx); + fprintf(stderr, " operation: 0x%x\n", err.operation); + fprintf(stderr, " pasid: 0x%x\n", err.pasid); + fprintf(stderr, " 
batch_idx: 0x%x\n", err.batch_idx); + fprintf(stderr, " invalid_flags: 0x%x\n", err.invalid_flags); + fprintf(stderr, " fault_addr: 0x%lx\n", err.fault_addr); + + VFIO_FAIL("Software Error Detected!\n"); +} + +static void dsa_command(struct vfio_pci_device *device, u32 cmd) +{ + union idxd_command_reg cmd_reg = { .cmd = cmd }; + u32 sleep_ms = 1, attempts = 5000 / sleep_ms; + void *bar0 = device->bars[0].vaddr; + u32 status; + u8 err; + + writel(cmd_reg.bits, bar0 + IDXD_CMD_OFFSET); + + for (;;) { + dsa_check_sw_err(device); + + status = readl(bar0 + IDXD_CMDSTS_OFFSET); + if (!(status & IDXD_CMDSTS_ACTIVE)) + break; + + VFIO_ASSERT_GT(--attempts, 0); + usleep(sleep_ms * 1000); + } + + err = status & IDXD_CMDSTS_ERR_MASK; + VFIO_ASSERT_EQ(err, 0, "Error issuing command 0x%x: 0x%x\n", cmd, err); +} + +static void dsa_wq_init(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = to_dsa_state(device); + union wq_cap_reg wq_cap = dsa->wq_cap; + union wqcfg wqcfg; + u64 wqcfg_size; + int i; + + VFIO_ASSERT_GT((u32)wq_cap.num_wqs, 0); + + wqcfg = (union wqcfg) { + .wq_size = wq_cap.total_wq_size, + .mode = 1, + .priority = 1, + /* + * Disable Address Translation Service (if enabled) so that VFIO + * selftests using this driver can generate I/O page faults. 
+ */ + .wq_ats_disable = wq_cap.wq_ats_support, + .max_xfer_shift = dsa->gen_cap.max_xfer_shift, + .max_batch_shift = dsa->gen_cap.max_batch_shift, + .op_config[0] = BIT(DSA_OPCODE_MEMMOVE) | BIT(DSA_OPCODE_BATCH), + }; + + wqcfg_size = 1UL << (wq_cap.wqcfg_size + IDXD_WQCFG_MIN); + + for (i = 0; i < wqcfg_size / sizeof(wqcfg.bits[0]); i++) + writel(wqcfg.bits[i], dsa->wqcfg_table + offsetof(union wqcfg, bits[i])); +} + +static void dsa_group_init(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = to_dsa_state(device); + union group_cap_reg group_cap = dsa->group_cap; + union engine_cap_reg engine_cap = dsa->engine_cap; + + VFIO_ASSERT_GT((u32)group_cap.num_groups, 0); + VFIO_ASSERT_GT((u32)engine_cap.num_engines, 0); + + /* Assign work queue 0 and engine 0 to group 0 */ + writeq(1, dsa->grpcfg_table + offsetof(struct grpcfg, wqs[0])); + writeq(1, dsa->grpcfg_table + offsetof(struct grpcfg, engines)); +} + +static void dsa_register_cache_init(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = to_dsa_state(device); + void *bar0 = device->bars[0].vaddr; + + dsa->gen_cap.bits = readq(bar0 + IDXD_GENCAP_OFFSET); + dsa->wq_cap.bits = readq(bar0 + IDXD_WQCAP_OFFSET); + dsa->group_cap.bits = readq(bar0 + IDXD_GRPCAP_OFFSET); + dsa->engine_cap.bits = readq(bar0 + IDXD_ENGCAP_OFFSET); + + dsa->table_offsets.bits[0] = readq(bar0 + IDXD_TABLE_OFFSET); + dsa->table_offsets.bits[1] = readq(bar0 + IDXD_TABLE_OFFSET + 8); + + dsa->wqcfg_table = bar0 + dsa->table_offsets.wqcfg * IDXD_TABLE_MULT; + dsa->grpcfg_table = bar0 + dsa->table_offsets.grpcfg * IDXD_TABLE_MULT; + + dsa->max_batches = 1U << (dsa->wq_cap.total_wq_size + IDXD_WQCFG_MIN); + dsa->max_batches = min(dsa->max_batches, ARRAY_SIZE(dsa->batch)); + + dsa->max_copies_per_batch = 1UL << dsa->gen_cap.max_batch_shift; + dsa->max_copies_per_batch = min(dsa->max_copies_per_batch, ARRAY_SIZE(dsa->copy)); +} + +static void dsa_init(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = 
to_dsa_state(device); + + VFIO_ASSERT_GE(device->driver.region.size, sizeof(*dsa)); + + vfio_pci_config_writew(device, PCI_COMMAND, + PCI_COMMAND_MEMORY | + PCI_COMMAND_MASTER | + PCI_COMMAND_INTX_DISABLE); + + dsa_command(device, IDXD_CMD_RESET_DEVICE); + + dsa_register_cache_init(device); + dsa_wq_init(device); + dsa_group_init(device); + + dsa_command(device, IDXD_CMD_ENABLE_DEVICE); + dsa_command(device, IDXD_CMD_ENABLE_WQ); + + vfio_pci_msix_enable(device, MSIX_VECTOR, 1); + + device->driver.max_memcpy_count = + dsa->max_batches * dsa->max_copies_per_batch; + device->driver.max_memcpy_size = 1UL << dsa->gen_cap.max_xfer_shift; + device->driver.msi = MSIX_VECTOR; +} + +static void dsa_remove(struct vfio_pci_device *device) +{ + dsa_command(device, IDXD_CMD_RESET_DEVICE); + vfio_pci_msix_disable(device); +} + +static int dsa_completion_wait(struct vfio_pci_device *device, + struct dsa_completion_record *completion) +{ + u8 status; + + for (;;) { + dsa_check_sw_err(device); + + status = READ_ONCE(completion->status); + if (status) + break; + + usleep(1000); + } + + if (status == DSA_COMP_SUCCESS) + return 0; + + printf("Error detected during memcpy operation: 0x%x\n", status); + return -1; +} + +static void dsa_copy_desc_init(struct vfio_pci_device *device, + struct dsa_hw_desc *desc, + iova_t src, iova_t dst, u64 size, + bool interrupt) +{ + struct dsa_state *dsa = to_dsa_state(device); + u16 flags; + + flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + + if (interrupt) + flags |= IDXD_OP_FLAG_RCI; + + *desc = (struct dsa_hw_desc) { + .opcode = DSA_OPCODE_MEMMOVE, + .flags = flags, + .priv = 1, + .src_addr = src, + .dst_addr = dst, + .xfer_size = size, + .completion_addr = to_iova(device, &dsa->copy_completion), + .int_handle = interrupt ? 
MSIX_VECTOR : 0, + }; +} + +static void dsa_batch_desc_init(struct vfio_pci_device *device, + struct dsa_hw_desc *desc, + u64 count) +{ + struct dsa_state *dsa = to_dsa_state(device); + + *desc = (struct dsa_hw_desc) { + .opcode = DSA_OPCODE_BATCH, + .flags = IDXD_OP_FLAG_CRAV, + .priv = 1, + .completion_addr = to_iova(device, &dsa->batch_completion), + .desc_list_addr = to_iova(device, &dsa->copy[0]), + .desc_count = count, + }; +} + +static void dsa_desc_write(struct vfio_pci_device *device, struct dsa_hw_desc *desc) +{ + /* Write the contents (not address) of the 64-byte descriptor to the device. */ + iosubmit_cmds512(device->bars[2].vaddr, desc, 1); +} + +static void dsa_memcpy_one(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, bool interrupt) +{ + struct dsa_state *dsa = to_dsa_state(device); + + memset(&dsa->copy_completion, 0, sizeof(dsa->copy_completion)); + + dsa_copy_desc_init(device, &dsa->copy[0], src, dst, size, interrupt); + dsa_desc_write(device, &dsa->copy[0]); +} + +static void dsa_memcpy_batch(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count) +{ + struct dsa_state *dsa = to_dsa_state(device); + int i; + + memset(&dsa->batch_completion, 0, sizeof(dsa->batch_completion)); + + for (i = 0; i < ARRAY_SIZE(dsa->copy); i++) { + struct dsa_hw_desc *copy_desc = &dsa->copy[i]; + + dsa_copy_desc_init(device, copy_desc, src, dst, size, false); + + /* Don't request completions for individual copies. */ + copy_desc->flags &= ~IDXD_OP_FLAG_RCR; + } + + for (i = 0; i < ARRAY_SIZE(dsa->batch) && count; i++) { + struct dsa_hw_desc *batch_desc = &dsa->batch[i]; + int nr_copies; + + nr_copies = min(count, dsa->max_copies_per_batch); + count -= nr_copies; + + /* + * Batches must have at least 2 copies, so handle the case where + * there is exactly 1 copy left by doing one less copy in this + * batch and then 2 in the next. 
+ */ + if (count == 1) { + nr_copies--; + count++; + } + + dsa_batch_desc_init(device, batch_desc, nr_copies); + + /* Request a completion for the last batch. */ + if (!count) + batch_desc->flags |= IDXD_OP_FLAG_RCR; + + dsa_desc_write(device, batch_desc); + } + + VFIO_ASSERT_EQ(count, 0, "Failed to start %lu copies.\n", count); +} + +static void dsa_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count) +{ + struct dsa_state *dsa = to_dsa_state(device); + + /* DSA devices require at least 2 copies per batch. */ + if (count == 1) + dsa_memcpy_one(device, src, dst, size, false); + else + dsa_memcpy_batch(device, src, dst, size, count); + + dsa->memcpy_count = count; +} + +static int dsa_memcpy_wait(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = to_dsa_state(device); + int r; + + if (dsa->memcpy_count == 1) + r = dsa_completion_wait(device, &dsa->copy_completion); + else + r = dsa_completion_wait(device, &dsa->batch_completion); + + dsa->memcpy_count = 0; + + return r; +} + +static void dsa_send_msi(struct vfio_pci_device *device) +{ + struct dsa_state *dsa = to_dsa_state(device); + + dsa_memcpy_one(device, + to_iova(device, &dsa->send_msi_src), + to_iova(device, &dsa->send_msi_dst), + sizeof(dsa->send_msi_src), true); + + VFIO_ASSERT_EQ(dsa_completion_wait(device, &dsa->copy_completion), 0); +} + +const struct vfio_pci_driver_ops dsa_ops = { + .name = "dsa", + .probe = dsa_probe, + .init = dsa_init, + .remove = dsa_remove, + .memcpy_start = dsa_memcpy_start, + .memcpy_wait = dsa_memcpy_wait, + .send_msi = dsa_send_msi, +}; diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/registers.h b/tools/testing/selftests/vfio/lib/drivers/dsa/registers.h new file mode 120000 index 000000000000..bde657c3c2af --- /dev/null +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/registers.h @@ -0,0 +1 @@ +../../../../../../../drivers/dma/idxd/registers.h \ No newline at end of file diff --git 
a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 624dc267a879..5d11c3a89a28 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -8,6 +8,7 @@ LIBVFIO_C += lib/vfio_pci_driver.c ifeq ($(ARCH:x86_64=x86),x86) LIBVFIO_C += lib/drivers/ioat/ioat.c +LIBVFIO_C += lib/drivers/dsa/dsa.c endif LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index aa47360e47a9..e5e8723ecb41 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -5,11 +5,13 @@ #include #ifdef __x86_64__ +extern struct vfio_pci_driver_ops dsa_ops; extern struct vfio_pci_driver_ops ioat_ops; #endif static struct vfio_pci_driver_ops *driver_ops[] = { #ifdef __x86_64__ + &dsa_ops, &ioat_ops, #endif }; From 118e073ef6a3446862ada27bdca2b8a53447f428 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:10 +0000 Subject: [PATCH 27/48] vfio: selftests: Move helper to get cdev path to libvfio Move the helper function to get the VFIO cdev path to libvfio so that it can be used in libvfio in a subsequent commit. No functional change intended. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-24-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 1 + .../selftests/vfio/lib/vfio_pci_device.c | 31 +++++++++++++++++ .../selftests/vfio/vfio_iommufd_setup_test.c | 34 ++----------------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index a7d05a4299a1..05a10417e0d7 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -175,6 +175,7 @@ struct vfio_pci_device { * If BDF cannot be determined then the test will exit with KSFT_SKIP. */ const char *vfio_selftests_get_bdf(int *argc, char *argv[]); +const char *vfio_pci_get_cdev_path(const char *bdf); struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type); void vfio_pci_device_cleanup(struct vfio_pci_device *device); diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index d8bb227e869d..d53e2d682c7e 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -332,6 +333,36 @@ static void vfio_pci_device_setup(struct vfio_pci_device *device, const char *bd device->msi_eventfds[i] = -1; } +const char *vfio_pci_get_cdev_path(const char *bdf) +{ + char dir_path[PATH_MAX]; + struct dirent *entry; + char *cdev_path; + DIR *dir; + + cdev_path = calloc(PATH_MAX, 1); + VFIO_ASSERT_NOT_NULL(cdev_path); + + snprintf(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); + + dir = opendir(dir_path); + VFIO_ASSERT_NOT_NULL(dir, "Failed to open directory %s\n", dir_path); + + while ((entry = readdir(dir)) != NULL) { + /* Find 
the file that starts with "vfio" */ + if (strncmp("vfio", entry->d_name, 4)) + continue; + + snprintf(cdev_path, PATH_MAX, "/dev/vfio/devices/%s", entry->d_name); + break; + } + + VFIO_ASSERT_NE(cdev_path[0], 0, "Failed to find vfio cdev file.\n"); + VFIO_ASSERT_EQ(closedir(dir), 0); + + return cdev_path; +} + struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type) { struct vfio_pci_device *device; diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c index f45335d9260f..3655106b912d 100644 --- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -1,8 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include - #include #include #include @@ -11,7 +7,6 @@ #include #include -#include #include #include @@ -19,32 +14,7 @@ #include "../kselftest_harness.h" static const char iommu_dev_path[] = "/dev/iommu"; -static char cdev_path[PATH_MAX] = { '\0' }; - -static void set_cdev_path(const char *bdf) -{ - char dir_path[PATH_MAX]; - DIR *dir; - struct dirent *entry; - - snprintf(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); - - dir = opendir(dir_path); - assert(dir); - - /* Find the file named "vfio" */ - while ((entry = readdir(dir)) != NULL) { - if (!strncmp("vfio", entry->d_name, 4)) { - snprintf(cdev_path, sizeof(cdev_path), "/dev/vfio/devices/%s", - entry->d_name); - break; - } - } - - assert(strlen(cdev_path) > 0); - - closedir(dir); -} +static const char *cdev_path; static int vfio_device_bind_iommufd_ioctl(int cdev_fd, int iommufd) { @@ -150,7 +120,7 @@ int main(int argc, char *argv[]) { const char *device_bdf = vfio_selftests_get_bdf(&argc, argv); - set_cdev_path(device_bdf); + cdev_path = vfio_pci_get_cdev_path(device_bdf); printf("Using cdev device %s\n", cdev_path); return test_harness_run(argc, argv); From 5df9bd6205114fac04c0f9539fa23f996e22a439 Mon Sep 17 00:00:00 
2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:11 +0000 Subject: [PATCH 28/48] vfio: selftests: Encapsulate IOMMU mode Encapsulate the "IOMMU mode" a test should use behind a new struct. In the future this will be used to support other types of IOMMUs besides VFIO_TYPE1_IOMMU, and allow users to select the mode on the command line. No functional change intended. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-25-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 12 +++++- .../selftests/vfio/lib/vfio_pci_device.c | 42 ++++++++++++++++--- .../selftests/vfio/vfio_dma_mapping_test.c | 2 +- .../selftests/vfio/vfio_pci_device_test.c | 4 +- .../selftests/vfio/vfio_pci_driver_test.c | 4 +- 5 files changed, 52 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 05a10417e0d7..d50debd84813 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -47,6 +47,12 @@ VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ } while (0) +struct vfio_iommu_mode { + const char *name; + const char *container_path; + unsigned long iommu_type; +}; + struct vfio_pci_bar { struct vfio_region_info info; void *vaddr; @@ -144,6 +150,8 @@ struct vfio_pci_driver { struct vfio_pci_device { int fd; + + const struct vfio_iommu_mode *iommu_mode; int group_fd; int container_fd; @@ -177,7 +185,9 @@ struct vfio_pci_device { const char *vfio_selftests_get_bdf(int *argc, char *argv[]); const char *vfio_pci_get_cdev_path(const char *bdf); -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type); +extern const char *default_iommu_mode; + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode); void vfio_pci_device_cleanup(struct vfio_pci_device *device); void 
vfio_pci_device_reset(struct vfio_pci_device *device); diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index d53e2d682c7e..15e5adb770c3 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -18,7 +18,6 @@ #include "../../../kselftest.h" #include -#define VFIO_DEV_PATH "/dev/vfio/vfio" #define PCI_SYSFS_PATH "/sys/bus/pci/devices" #define ioctl_assert(_fd, _op, _arg) do { \ @@ -261,10 +260,11 @@ static unsigned int vfio_pci_get_group_from_dev(const char *bdf) static void vfio_pci_container_setup(struct vfio_pci_device *device) { + const char *path = device->iommu_mode->container_path; int version; - device->container_fd = open(VFIO_DEV_PATH, O_RDWR); - VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", VFIO_DEV_PATH); + device->container_fd = open(path, O_RDWR); + VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); version = ioctl(device->container_fd, VFIO_GET_API_VERSION); VFIO_ASSERT_EQ(version, VFIO_API_VERSION); @@ -290,8 +290,9 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); } -static void vfio_pci_iommu_setup(struct vfio_pci_device *device, unsigned long iommu_type) +static void vfio_pci_iommu_setup(struct vfio_pci_device *device) { + unsigned long iommu_type = device->iommu_mode->iommu_type; int ret; INIT_LIST_HEAD(&device->dma_regions); @@ -363,16 +364,45 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, int iommu_type) +static const struct vfio_iommu_mode iommu_modes[] = { + { + .name = "vfio_type1_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1_IOMMU, + }, +}; + +const char *default_iommu_mode = "vfio_type1_iommu"; + +static const struct vfio_iommu_mode 
*lookup_iommu_mode(const char *iommu_mode) +{ + int i; + + if (!iommu_mode) + iommu_mode = default_iommu_mode; + + for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { + if (strcmp(iommu_mode, iommu_modes[i].name)) + continue; + + return &iommu_modes[i]; + } + + VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); +} + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) { struct vfio_pci_device *device; device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); + device->iommu_mode = lookup_iommu_mode(iommu_mode); + vfio_pci_container_setup(device); vfio_pci_group_setup(device, bdf); - vfio_pci_iommu_setup(device, iommu_type); + vfio_pci_iommu_setup(device); vfio_pci_device_setup(device, bdf); vfio_pci_driver_probe(device); diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 4578ee6df0e1..f07bdb602422 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -116,7 +116,7 @@ FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous_hugetlb_1gb) { FIXTURE_SETUP(vfio_dma_mapping_test) { - self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 8856205d52a6..7a270698e4d2 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -28,7 +28,7 @@ FIXTURE(vfio_pci_device_test) { FIXTURE_SETUP(vfio_pci_device_test) { - self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); } FIXTURE_TEARDOWN(vfio_pci_device_test) @@ -116,7 +116,7 @@ FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) { 
FIXTURE_SETUP(vfio_pci_irq_test) { - self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); } FIXTURE_TEARDOWN(vfio_pci_irq_test) diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 97ed0ff4636d..14ec862c0b11 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -60,7 +60,7 @@ FIXTURE_SETUP(vfio_pci_driver_test) { struct vfio_pci_driver *driver; - self->device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); driver = &self->device->driver; @@ -222,7 +222,7 @@ int main(int argc, char *argv[]) device_bdf = vfio_selftests_get_bdf(&argc, argv); - device = vfio_pci_device_init(device_bdf, VFIO_TYPE1_IOMMU); + device = vfio_pci_device_init(device_bdf, default_iommu_mode); if (!device->driver.ops) { fprintf(stderr, "No driver found for device %s\n", device_bdf); return KSFT_SKIP; From 892aff147a545fa7c94d98613093afa84faa25b1 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:12 +0000 Subject: [PATCH 29/48] vfio: selftests: Replicate tests across all iommu_modes Automatically replicate vfio_dma_mapping_test and vfio_pci_driver_test across all supported IOMMU modes using fixture variants. Both of these tests exercise DMA mapping to some degree so having automatic coverage across all IOMMU modes will help catch bugs. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-26-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 8 +++++++ .../selftests/vfio/lib/vfio_pci_device.c | 1 + .../selftests/vfio/vfio_dma_mapping_test.c | 24 +++++++++---------- .../selftests/vfio/vfio_pci_driver_test.c | 13 +++++++++- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index d50debd84813..bf0b636a9c0c 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -53,6 +53,14 @@ struct vfio_iommu_mode { unsigned long iommu_type; }; +/* + * Generator for VFIO selftests fixture variants that replicate across all + * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() + * which should then use FIXTURE_VARIANT_ADD() to create the variant. + */ +#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__) + struct vfio_pci_bar { struct vfio_region_info info; void *vaddr; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 15e5adb770c3..5c4d008f2a25 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -364,6 +364,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } +/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). 
*/ static const struct vfio_iommu_mode iommu_modes[] = { { .name = "vfio_type1_iommu", diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index f07bdb602422..b65949c6b846 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -96,27 +96,25 @@ FIXTURE(vfio_dma_mapping_test) { }; FIXTURE_VARIANT(vfio_dma_mapping_test) { + const char *iommu_mode; u64 size; int mmap_flags; }; -FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous) { - .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE, -}; +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode, _name, _size, _mmap_flags) \ +FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, _iommu_mode ## _ ## _name) { \ + .iommu_mode = #_iommu_mode, \ + .size = (_size), \ + .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE | (_mmap_flags), \ +} -FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous_hugetlb_2mb) { - .size = SZ_2M, - .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_2MB, -}; - -FIXTURE_VARIANT_ADD(vfio_dma_mapping_test, anonymous_hugetlb_1gb) { - .size = SZ_1G, - .mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_1GB, -}; +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous, 0, 0); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_2mb, SZ_2M, MAP_HUGETLB | MAP_HUGE_2MB); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | MAP_HUGE_1GB); FIXTURE_SETUP(vfio_dma_mapping_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index 14ec862c0b11..2dbd70b7db62 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -56,11 +56,22 @@ 
FIXTURE(vfio_pci_driver_test) { iova_t unmapped_iova; }; +FIXTURE_VARIANT(vfio_pci_driver_test) { + const char *iommu_mode; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_pci_driver_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + FIXTURE_SETUP(vfio_pci_driver_test) { struct vfio_pci_driver *driver; - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); driver = &self->device->driver; From 0969c685ba5b248648533a3313f55a3fd9382a9e Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:13 +0000 Subject: [PATCH 30/48] vfio: selftests: Add vfio_type1v2_mode Add a new IOMMU mode for using VFIO_TYPE1v2_IOMMU. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-27-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 3 ++- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index bf0b636a9c0c..981ddc9a52a9 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -59,7 +59,8 @@ struct vfio_iommu_mode { * which should then use FIXTURE_VARIANT_ADD() to create the variant. */ #define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) 
\ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__) +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__) struct vfio_pci_bar { struct vfio_region_info info; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 5c4d008f2a25..cc1b732dd8ba 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -371,6 +371,11 @@ static const struct vfio_iommu_mode iommu_modes[] = { .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1_IOMMU, }, + { + .name = "vfio_type1v2_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, }; const char *default_iommu_mode = "vfio_type1_iommu"; From d1a17495bb878542898d7ca4aa8fde29423a8ee0 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:14 +0000 Subject: [PATCH 31/48] vfio: selftests: Add iommufd_compat_type1{,v2} modes Add new IOMMU modes for using iommufd in compatibility mode with VFIO_TYPE1_IOMMU and VFIO_TYPE1v2_IOMMU. In these modes, VFIO selftests will open /dev/iommu and treat it as a container FD (as if it had opened /dev/vfio/vfio) and the kernel translates the container ioctls to iommufd calls transparently. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-28-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/include/vfio_util.h | 4 +++- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 10 ++++++++++ tools/testing/selftests/vfio/vfio_dma_mapping_test.c | 12 ++++++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 981ddc9a52a9..035ef5b9d678 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -60,7 +60,9 @@ struct vfio_iommu_mode { */ #define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__) +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__) struct vfio_pci_bar { struct vfio_region_info info; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index cc1b732dd8ba..b6fefe2b3ec8 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -376,6 +376,16 @@ static const struct vfio_iommu_mode iommu_modes[] = { .container_path = "/dev/vfio/vfio", .iommu_type = VFIO_TYPE1v2_IOMMU, }, + { + .name = "iommufd_compat_type1", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "iommufd_compat_type1v2", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, }; const char *default_iommu_mode = "vfio_type1_iommu"; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c 
b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index b65949c6b846..ab19c54a774d 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -128,6 +128,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) const int flags = variant->mmap_flags; struct vfio_dma_region region; struct iommu_mapping mapping; + u64 mapping_size = size; int rc; region.vaddr = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); @@ -150,6 +151,13 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) if (rc == -EOPNOTSUPP) goto unmap; + /* + * IOMMUFD compatibility-mode does not support huge mappings when + * using VFIO_TYPE1_IOMMU. + */ + if (!strcmp(variant->iommu_mode, "iommufd_compat_type1")) + mapping_size = SZ_4K; + ASSERT_EQ(0, rc); printf("Found IOMMU mappings for IOVA 0x%lx:\n", region.iova); printf("PGD: 0x%016lx\n", mapping.pgd); @@ -158,7 +166,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) printf("PMD: 0x%016lx\n", mapping.pmd); printf("PTE: 0x%016lx\n", mapping.pte); - switch (size) { + switch (mapping_size) { case SZ_4K: ASSERT_NE(0, mapping.pte); break; @@ -172,7 +180,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) ASSERT_NE(0, mapping.pud); break; default: - VFIO_FAIL("Unrecognized size: 0x%lx\n", size); + VFIO_FAIL("Unrecognized size: 0x%lx\n", mapping_size); } unmap: From 61cbfe5014cbc17b376b6a9b2087f39f379a6b86 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:15 +0000 Subject: [PATCH 32/48] vfio: selftests: Add iommufd mode Add a new IOMMU mode for using iommufd directly. In this mode userspace opens /dev/iommu and binds it to a device FD acquired through /dev/vfio/devices/vfioX. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-29-dmatlack@google.com Signed-off-by: Alex Williamson --- .../selftests/vfio/lib/include/vfio_util.h | 6 +- .../selftests/vfio/lib/vfio_pci_device.c | 181 ++++++++++++++---- 2 files changed, 149 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h index 035ef5b9d678..ed31606e01b7 100644 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ b/tools/testing/selftests/vfio/lib/include/vfio_util.h @@ -62,7 +62,8 @@ struct vfio_iommu_mode { FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__) +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) struct vfio_pci_bar { struct vfio_region_info info; @@ -166,6 +167,9 @@ struct vfio_pci_device { int group_fd; int container_fd; + int iommufd; + u32 ioas_id; + struct vfio_device_info info; struct vfio_region_info config_space; struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index b6fefe2b3ec8..5d8944a37982 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -10,10 +10,12 @@ #include #include +#include #include #include #include #include +#include #include "../../../kselftest.h" #include @@ -139,32 +141,80 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -void vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) +static void 
vfio_iommu_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region) { - struct vfio_iommu_type1_dma_map map = { - .argsz = sizeof(map), + struct vfio_iommu_type1_dma_map args = { + .argsz = sizeof(args), .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, .vaddr = (u64)region->vaddr, .iova = region->iova, .size = region->size, }; - ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &map); + ioctl_assert(device->container_fd, VFIO_IOMMU_MAP_DMA, &args); +} + +static void iommufd_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + struct iommu_ioas_map args = { + .size = sizeof(args), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .user_va = (u64)region->vaddr, + .iova = region->iova, + .length = region->size, + .ioas_id = device->ioas_id, + }; + + ioctl_assert(device->iommufd, IOMMU_IOAS_MAP, &args); +} + +void vfio_pci_dma_map(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + if (device->iommufd) + iommufd_dma_map(device, region); + else + vfio_iommu_dma_map(device, region); list_add(&region->link, &device->dma_regions); } +static void vfio_iommu_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + struct vfio_iommu_type1_dma_unmap args = { + .argsz = sizeof(args), + .iova = region->iova, + .size = region->size, + }; + + ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &args); +} + +static void iommufd_dma_unmap(struct vfio_pci_device *device, + struct vfio_dma_region *region) +{ + struct iommu_ioas_unmap args = { + .size = sizeof(args), + .iova = region->iova, + .length = region->size, + .ioas_id = device->ioas_id, + }; + + ioctl_assert(device->iommufd, IOMMU_IOAS_UNMAP, &args); +} + void vfio_pci_dma_unmap(struct vfio_pci_device *device, struct vfio_dma_region *region) { - struct vfio_iommu_type1_dma_unmap unmap = { - .argsz = sizeof(unmap), - .iova = region->iova, - .size = region->size, - }; - - 
ioctl_assert(device->container_fd, VFIO_IOMMU_UNMAP_DMA, &unmap); + if (device->iommufd) + iommufd_dma_unmap(device, region); + else + vfio_iommu_dma_unmap(device, region); list_del(&region->link); } @@ -258,18 +308,6 @@ static unsigned int vfio_pci_get_group_from_dev(const char *bdf) return group; } -static void vfio_pci_container_setup(struct vfio_pci_device *device) -{ - const char *path = device->iommu_mode->container_path; - int version; - - device->container_fd = open(path, O_RDWR); - VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); - - version = ioctl(device->container_fd, VFIO_GET_API_VERSION); - VFIO_ASSERT_EQ(version, VFIO_API_VERSION); -} - static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) { struct vfio_group_status group_status = { @@ -290,25 +328,33 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); } -static void vfio_pci_iommu_setup(struct vfio_pci_device *device) +static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) { unsigned long iommu_type = device->iommu_mode->iommu_type; + const char *path = device->iommu_mode->container_path; + int version; int ret; - INIT_LIST_HEAD(&device->dma_regions); + device->container_fd = open(path, O_RDWR); + VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); + + version = ioctl(device->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); + + vfio_pci_group_setup(device, bdf); ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); -} - -static void vfio_pci_device_setup(struct vfio_pci_device *device, const char *bdf) -{ - int i; device->fd = ioctl(device->group_fd, 
VFIO_GROUP_GET_DEVICE_FD, bdf); VFIO_ASSERT_GE(device->fd, 0); +} + +static void vfio_pci_device_setup(struct vfio_pci_device *device) +{ + int i; device->info.argsz = sizeof(device->info); ioctl_assert(device->fd, VFIO_DEVICE_GET_INFO, &device->info); @@ -386,6 +432,9 @@ static const struct vfio_iommu_mode iommu_modes[] = { .container_path = "/dev/iommu", .iommu_type = VFIO_TYPE1v2_IOMMU, }, + { + .name = "iommufd", + }, }; const char *default_iommu_mode = "vfio_type1_iommu"; @@ -407,6 +456,57 @@ static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode) VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); } +static void vfio_device_bind_iommufd(int device_fd, int iommufd) +{ + struct vfio_device_bind_iommufd args = { + .argsz = sizeof(args), + .iommufd = iommufd, + }; + + ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); +} + +static u32 iommufd_ioas_alloc(int iommufd) +{ + struct iommu_ioas_alloc args = { + .size = sizeof(args), + }; + + ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); + return args.out_ioas_id; +} + +static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) +{ + struct vfio_device_attach_iommufd_pt args = { + .argsz = sizeof(args), + .pt_id = pt_id, + }; + + ioctl_assert(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &args); +} + +static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *bdf) +{ + const char *cdev_path = vfio_pci_get_cdev_path(bdf); + + device->fd = open(cdev_path, O_RDWR); + VFIO_ASSERT_GE(device->fd, 0); + free((void *)cdev_path); + + /* + * Require device->iommufd to be >0 so that a simple non-0 check can be + * used to check if iommufd is enabled. In practice open() will never + * return 0 unless stdin is closed. 
+ */ + device->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(device->iommufd, 0); + + vfio_device_bind_iommufd(device->fd, device->iommufd); + device->ioas_id = iommufd_ioas_alloc(device->iommufd); + vfio_device_attach_iommufd_pt(device->fd, device->ioas_id); +} + struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) { struct vfio_pci_device *device; @@ -414,13 +514,16 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_ device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); + INIT_LIST_HEAD(&device->dma_regions); + device->iommu_mode = lookup_iommu_mode(iommu_mode); - vfio_pci_container_setup(device); - vfio_pci_group_setup(device, bdf); - vfio_pci_iommu_setup(device); - vfio_pci_device_setup(device, bdf); + if (device->iommu_mode->container_path) + vfio_pci_container_setup(device, bdf); + else + vfio_pci_iommufd_setup(device, bdf); + vfio_pci_device_setup(device); vfio_pci_driver_probe(device); return device; @@ -444,8 +547,12 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); } - VFIO_ASSERT_EQ(close(device->group_fd), 0); - VFIO_ASSERT_EQ(close(device->container_fd), 0); + if (device->iommufd) { + VFIO_ASSERT_EQ(close(device->iommufd), 0); + } else { + VFIO_ASSERT_EQ(close(device->group_fd), 0); + VFIO_ASSERT_EQ(close(device->container_fd), 0); + } free(device); } From 8afcbe20476ad238fd1f331f51d721138eff5172 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:16 +0000 Subject: [PATCH 33/48] vfio: selftests: Make iommufd the default iommu_mode Now that VFIO selftests support iommufd, make it the default mode. IOMMUFD is the successor to VFIO_TYPE1{,v2}_IOMMU and all new features are being added there, so it's a slightly better fit as the default mode. 
Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-30-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/lib/vfio_pci_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 5d8944a37982..0921b2451ba5 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -437,7 +437,7 @@ static const struct vfio_iommu_mode iommu_modes[] = { }, }; -const char *default_iommu_mode = "vfio_type1_iommu"; +const char *default_iommu_mode = "iommufd"; static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode) { From fd134b0f2f8ef9b1b7b0cade8cac4ff831619713 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 22 Aug 2025 21:25:17 +0000 Subject: [PATCH 34/48] vfio: selftests: Add a script to help with running VFIO selftests Introduce run.sh, a script to help with running VFIO selftests. The script is intended to be used both by humans manually running VFIO selftests and by test automation where VFIO selftests may run alongside other tests. As such the script aims to be hermetic, returning the system to the state it was in before the test started. The script takes as input the BDF of a device to use and a command to run (typically the command would be a VFIO selftest). e.g. $ ./run.sh -d 0000:6a:01.0 ./vfio_pci_device_test or $ ./run.sh -d 0000:6a:01.0 -- ./vfio_pci_device_test The script then handles unbinding device 0000:6a:01.0 from its current driver, binding it to vfio-pci, running the test, unbinding from vfio-pci, and binding back to the original driver. When run.sh runs the provided test, it does so by appending the BDF as the last parameter. 
For example: $ ./run.sh -d 0000:6a:01.0 -- echo hello Results in the following being printed to stdout: hello 0000:6a:01.0 The script also supports a mode where it can break out into a shell so that multiple tests can be run manually. $ ./run.sh -d 0000:6a:01.0 -s $ echo $VFIO_SELFTESTS_BDF $ ./vfio_pci_device_test $ exit Choosing which device to use is up to the user. In the future this script should be extensible to tests that want to use multiple devices. The script can support accepting -d BDF multiple times and parse them into an array, setup all the devices, pass the list of BDFs to the test, and then cleanup all the devices. Acked-by: Shuah Khan Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250822212518.4156428-31-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/Makefile | 1 + tools/testing/selftests/vfio/run.sh | 109 ++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100755 tools/testing/selftests/vfio/run.sh diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index ee09c027ade5..324ba0175a33 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_driver_test +TEST_PROGS_EXTENDED := run.sh include ../lib.mk include lib/libvfio.mk diff --git a/tools/testing/selftests/vfio/run.sh b/tools/testing/selftests/vfio/run.sh new file mode 100755 index 000000000000..0476b6d7adc3 --- /dev/null +++ b/tools/testing/selftests/vfio/run.sh @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +# Global variables initialized in main() and then used during cleanup() when +# the script exits. 
+declare DEVICE_BDF +declare NEW_DRIVER +declare OLD_DRIVER +declare OLD_NUMVFS +declare DRIVER_OVERRIDE + +function write_to() { + # Unfortunately set -x does not show redirects so use echo to manually + # tell the user what commands are being run. + echo "+ echo \"${2}\" > ${1}" + echo "${2}" > ${1} +} + +function bind() { + write_to /sys/bus/pci/drivers/${2}/bind ${1} +} + +function unbind() { + write_to /sys/bus/pci/drivers/${2}/unbind ${1} +} + +function set_sriov_numvfs() { + write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} +} + +function set_driver_override() { + write_to /sys/bus/pci/devices/${1}/driver_override ${2} +} + +function clear_driver_override() { + set_driver_override ${1} "" +} + +function cleanup() { + if [ "${NEW_DRIVER}" ]; then unbind ${DEVICE_BDF} ${NEW_DRIVER} ; fi + if [ "${DRIVER_OVERRIDE}" ]; then clear_driver_override ${DEVICE_BDF} ; fi + if [ "${OLD_DRIVER}" ]; then bind ${DEVICE_BDF} ${OLD_DRIVER} ; fi + if [ "${OLD_NUMVFS}" ]; then set_sriov_numvfs ${DEVICE_BDF} ${OLD_NUMVFS} ; fi +} + +function usage() { + echo "usage: $0 [-d segment:bus:device.function] [-s] [-h] [cmd ...]" >&2 + echo >&2 + echo " -d: The BDF of the device to use for the test (required)" >&2 + echo " -h: Show this help message" >&2 + echo " -s: Drop into a shell rather than running a command" >&2 + echo >&2 + echo " cmd: The command to run and arguments to pass to it." >&2 + echo " Required when not using -s. The SBDF will be " >&2 + echo " appended to the argument list." >&2 + exit 1 +} + +function main() { + local shell + + while getopts "d:hs" opt; do + case $opt in + d) DEVICE_BDF="$OPTARG" ;; + s) shell=true ;; + *) usage ;; + esac + done + + # Shift past all optional arguments. + shift $((OPTIND - 1)) + + # Check that the user passed in the command to run. + [ ! "${shell}" ] && [ $# = 0 ] && usage + + # Check that the user passed in a BDF. 
+ [ "${DEVICE_BDF}" ] || usage + + trap cleanup EXIT + set -e + + test -d /sys/bus/pci/devices/${DEVICE_BDF} + + if [ -f /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs ]; then + OLD_NUMVFS=$(cat /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs) + set_sriov_numvfs ${DEVICE_BDF} 0 + fi + + if [ -L /sys/bus/pci/devices/${DEVICE_BDF}/driver ]; then + OLD_DRIVER=$(basename $(readlink -m /sys/bus/pci/devices/${DEVICE_BDF}/driver)) + unbind ${DEVICE_BDF} ${OLD_DRIVER} + fi + + set_driver_override ${DEVICE_BDF} vfio-pci + DRIVER_OVERRIDE=true + + bind ${DEVICE_BDF} vfio-pci + NEW_DRIVER=vfio-pci + + echo + if [ "${shell}" ]; then + echo "Dropping into ${SHELL} with VFIO_SELFTESTS_BDF=${DEVICE_BDF}" + VFIO_SELFTESTS_BDF=${DEVICE_BDF} ${SHELL} + else + "$@" ${DEVICE_BDF} + fi + echo +} + +main "$@" From fcf9ae9ec9761802b69294d1b3f98d51f14e5175 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 27 Aug 2025 15:32:15 +0100 Subject: [PATCH 35/48] MAINTAINERS: Update Shameer Kolothum's email address Changed jobs and Huawei email is no longer valid. Also, since I no longer have access to HiSilicon hardware, remove myself from HISILICON PCI DRIVER maintainer entry. 
Acked-by: Jonathan Cameron Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20250827143215.2311-1-skolothumtho@nvidia.com Signed-off-by: Alex Williamson --- .mailmap | 1 + MAINTAINERS | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index a124aeed52a2..12caec550f9e 100644 --- a/.mailmap +++ b/.mailmap @@ -705,6 +705,7 @@ Sergey Senozhatsky Sergey Senozhatsky Seth Forshee Shakeel Butt +Shameer Kolothum Shannon Nelson Shannon Nelson Shannon Nelson diff --git a/MAINTAINERS b/MAINTAINERS index fba915fcb30e..839f910bad6b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26449,7 +26449,6 @@ F: drivers/vfio/fsl-mc/ VFIO HISILICON PCI DRIVER M: Longfang Liu -M: Shameer Kolothum L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/pci/hisilicon/ @@ -26478,7 +26477,7 @@ F: drivers/vfio/pci/nvgrace-gpu/ VFIO PCI DEVICE SPECIFIC DRIVERS R: Jason Gunthorpe R: Yishai Hadas -R: Shameer Kolothum +R: Shameer Kolothum R: Kevin Tian L: kvm@vger.kernel.org S: Maintained From ab1d8dda32e9507ca3bfb6b43661aeaa27f7bd82 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Tue, 26 Aug 2025 10:08:51 +0530 Subject: [PATCH 36/48] cdx: don't select CONFIG_GENERIC_MSI_IRQ x86 does not use CONFIG_GENERIC_MSI_IRQ, and trying to enable it anyway results in a build failure: In file included from include/linux/ssb/ssb.h:10, from drivers/ssb/pcihost_wrapper.c:18: include/linux/gpio/driver.h:41:33: error: field 'msiinfo' has incomplete type 41 | msi_alloc_info_t msiinfo; | ^~~~~~~ In file included from include/linux/kvm_host.h:19, from arch/x86/events/intel/core.c:17: include/linux/msi.h:528:33: error: field 'alloc_info' has incomplete type 528 | msi_alloc_info_t alloc_info; Change the driver to actually build without this symbol and remove the incorrect 'select' statements. 
Fixes: e8b18c11731d ("cdx: Fix missing GENERIC_MSI_IRQ on compile test") Reviewed-by: Robin Murphy Reviewed-by: Nikhil Agarwal Signed-off-by: Arnd Bergmann Signed-off-by: Nipun Gupta Link: https://lore.kernel.org/r/20250826043852.2206008-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- drivers/cdx/Kconfig | 1 - drivers/cdx/cdx.c | 4 ++-- drivers/cdx/controller/Kconfig | 1 - drivers/cdx/controller/cdx_controller.c | 3 ++- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cdx/Kconfig b/drivers/cdx/Kconfig index 3af41f51cf38..1f1e360507d7 100644 --- a/drivers/cdx/Kconfig +++ b/drivers/cdx/Kconfig @@ -8,7 +8,6 @@ config CDX_BUS bool "CDX Bus driver" depends on OF && ARM64 || COMPILE_TEST - select GENERIC_MSI_IRQ help Driver to enable Composable DMA Transfer(CDX) Bus. CDX bus exposes Fabric devices which uses composable DMA IP to the diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index 092306ca2541..3d50f8cd9c0b 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -310,7 +310,7 @@ static int cdx_probe(struct device *dev) * Setup MSI device data so that generic MSI alloc/free can * be used by the device driver. 
*/ - if (cdx->msi_domain) { + if (IS_ENABLED(CONFIG_GENERIC_MSI_IRQ) && cdx->msi_domain) { error = msi_setup_device_data(&cdx_dev->dev); if (error) return error; @@ -833,7 +833,7 @@ int cdx_device_add(struct cdx_dev_params *dev_params) ((cdx->id << CDX_CONTROLLER_ID_SHIFT) | (cdx_dev->bus_num & CDX_BUS_NUM_MASK)), cdx_dev->dev_num); - if (cdx->msi_domain) { + if (IS_ENABLED(CONFIG_GENERIC_MSI_IRQ) && cdx->msi_domain) { cdx_dev->num_msi = dev_params->num_msi; dev_set_msi_domain(&cdx_dev->dev, cdx->msi_domain); } diff --git a/drivers/cdx/controller/Kconfig b/drivers/cdx/controller/Kconfig index 0641a4c21e66..a480b62cbd1f 100644 --- a/drivers/cdx/controller/Kconfig +++ b/drivers/cdx/controller/Kconfig @@ -10,7 +10,6 @@ if CDX_BUS config CDX_CONTROLLER tristate "CDX bus controller" depends on HAS_DMA - select GENERIC_MSI_IRQ select REMOTEPROC select RPMSG help diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index fca83141e3e6..5e3fd89b6b56 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -193,7 +193,8 @@ static int xlnx_cdx_probe(struct platform_device *pdev) cdx->ops = &cdx_ops; /* Create MSI domain */ - cdx->msi_domain = cdx_msi_domain_init(&pdev->dev); + if (IS_ENABLED(CONFIG_GENERIC_MSI_IRQ)) + cdx->msi_domain = cdx_msi_domain_init(&pdev->dev); if (!cdx->msi_domain) { ret = dev_err_probe(&pdev->dev, -ENODEV, "cdx_msi_domain_init() failed"); goto cdx_msi_fail; From 9f3acb3d9a1872e2fa36af068ca2e93a8a864089 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Tue, 26 Aug 2025 10:08:52 +0530 Subject: [PATCH 37/48] vfio/cdx: update driver to build without CONFIG_GENERIC_MSI_IRQ Define dummy MSI related APIs in VFIO CDX driver to build the driver without enabling CONFIG_GENERIC_MSI_IRQ flag. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508070308.opy5dIFX-lkp@intel.com/ Reviewed-by: Nikhil Agarwal Reviewed-by: Alex Williamson Signed-off-by: Nipun Gupta Link: https://lore.kernel.org/r/20250826043852.2206008-2-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/Makefile | 6 +++++- drivers/vfio/cdx/private.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/cdx/Makefile b/drivers/vfio/cdx/Makefile index df92b320122a..dadbef2419ea 100644 --- a/drivers/vfio/cdx/Makefile +++ b/drivers/vfio/cdx/Makefile @@ -5,4 +5,8 @@ obj-$(CONFIG_VFIO_CDX) += vfio-cdx.o -vfio-cdx-objs := main.o intr.o +vfio-cdx-objs := main.o + +ifdef CONFIG_GENERIC_MSI_IRQ +vfio-cdx-objs += intr.o +endif diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index dc56729b3114..172e48caa3a0 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -38,11 +38,25 @@ struct vfio_cdx_device { u8 config_msi; }; +#ifdef CONFIG_GENERIC_MSI_IRQ int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, u32 flags, unsigned int index, unsigned int start, unsigned int count, void *data); void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev); +#else +static int vfio_cdx_set_irqs_ioctl(struct vfio_cdx_device *vdev, + u32 flags, unsigned int index, + unsigned int start, unsigned int count, + void *data) +{ + return -EINVAL; +} + +static void vfio_cdx_irqs_cleanup(struct vfio_cdx_device *vdev) +{ +} +#endif #endif /* VFIO_CDX_PRIVATE_H */ From 03e073bc4dbc3d64ce0beb21fbe793ae7787e062 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 28 Aug 2025 18:58:14 +0000 Subject: [PATCH 38/48] vfio: selftests: Fix .gitignore for already tracked files Fix the rules in tools/testing/selftests/vfio/.gitignore to not ignore some already tracked files (.gitignore, Makefile, lib/libvfio.mk). 
This change should be a no-op, since these files are already tracked by git and thus git will not ignore updates to them even though they match the ignore rules in the VFIO selftests .gitignore file. However, they do generate warnings with W=1, as reported by the kernel test robot. $ KBUILD_EXTRA_WARN=1 scripts/misc-check tools/testing/selftests/vfio/.gitignore: warning: ignored by one of the .gitignore files tools/testing/selftests/vfio/Makefile: warning: ignored by one of the .gitignore files tools/testing/selftests/vfio/lib/libvfio.mk: warning: ignored by one of the .gitignore files Fix this by explicitly un-ignoring the tracked files. Fixes: 292e9ee22b0a ("selftests: Create tools/testing/selftests/vfio") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508280918.rFRyiLEU-lkp@intel.com/ Signed-off-by: David Matlack Link: https://lore.kernel.org/r/20250828185815.382215-1-dmatlack@google.com Signed-off-by: Alex Williamson --- tools/testing/selftests/vfio/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/vfio/.gitignore b/tools/testing/selftests/vfio/.gitignore index 6d9381d60172..7fadc19d3bca 100644 --- a/tools/testing/selftests/vfio/.gitignore +++ b/tools/testing/selftests/vfio/.gitignore @@ -5,3 +5,6 @@ !*.h !*.S !*.sh +!*.mk +!.gitignore +!Makefile From 093458c58f830d0a713fab0de037df5f0ce24fef Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 28 Aug 2025 14:36:24 -0600 Subject: [PATCH 39/48] docs: proc.rst: Fix VFIO Device title formatting Title underline is one character too short. 
Cc: Alex Mastro Cc: Jonathan Corbet Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20250828123035.2f0c74e7@canb.auug.org.au Fixes: 1e736f148956 ("vfio/pci: print vfio-device syspath to fdinfo") Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20250828203629.283418-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- Documentation/filesystems/proc.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index ed8d23b677ca..ff09f668cdeb 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2167,7 +2167,7 @@ where 'size' is the size of the DMA buffer in bytes. 'count' is the file count o the DMA buffer file. 'exp_name' is the name of the DMA buffer exporter. VFIO Device files -~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ :: From 6dbcc6ed4bd340f96fde4335a901d985401dd45b Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Wed, 20 Aug 2025 20:31:02 +0000 Subject: [PATCH 40/48] MAINTAINERS: Add myself as VFIO-platform reviewer Based on discussion: https://lore.kernel.org/kvm/20250806170314.3768750-3-alex.williamson@redhat.com/ I will start looking into adding support for modern HW and more features to VFIO-platform. 
Signed-off-by: Mostafa Saleh Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20250820203102.2034333-1-smostafa@google.com Signed-off-by: Alex Williamson --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 839f910bad6b..3a80b7fc7935 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26493,6 +26493,7 @@ F: drivers/vfio/pci/pds/ VFIO PLATFORM DRIVER M: Eric Auger +R: Mostafa Saleh L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/platform/ From 08fb9897f75719947303acfb23b8c41039118a2d Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Mon, 1 Sep 2025 19:16:19 +0000 Subject: [PATCH 41/48] MAINTAINERS: Add myself as VFIO-platform reviewer While my work at Google Cloud focuses on various areas of the kernel, my background in IOMMU and the VFIO subsystem motivates me to help with the maintenance effort for vfio-platform (based on the discussion [1]) and ensure its continued health. Link: https://lore.kernel.org/all/aKxpyyKvYcd84Ayi@google.com/ [1] Signed-off-by: Pranjal Shrivastava Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20250901191619.183116-1-praan@google.com Signed-off-by: Alex Williamson --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 3a80b7fc7935..3f9890ec18a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26494,6 +26494,7 @@ F: drivers/vfio/pci/pds/ VFIO PLATFORM DRIVER M: Eric Auger R: Mostafa Saleh +R: Pranjal Shrivastava L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/platform/ From fd0f75308bfde358e39b0ebd25a50750b6139ae5 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 25 Aug 2025 11:58:00 -0600 Subject: [PATCH 42/48] vfio/amba: Mark for removal vfio-amba has only been touched to keep up with the rest of the code base for the past 10 years. We have no basis to believe that it's currently tested or used. Mark it for deprecation. 
Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20250825175807.3264083-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/Kconfig | 5 ++++- drivers/vfio/platform/vfio_amba.c | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig index 88fcde51f024..c6be29b2c24b 100644 --- a/drivers/vfio/platform/Kconfig +++ b/drivers/vfio/platform/Kconfig @@ -17,10 +17,13 @@ config VFIO_PLATFORM If you don't know what to do here, say N. config VFIO_AMBA - tristate "VFIO support for AMBA devices" + tristate "VFIO support for AMBA devices (DEPRECATED)" depends on ARM_AMBA || COMPILE_TEST select VFIO_PLATFORM_BASE help + The vfio-amba driver is deprecated and will be removed in a + future kernel release. + Support for ARM AMBA devices with VFIO. This is required to make use of ARM AMBA devices present on the system using the VFIO framework. diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index ff8ff8480968..9f5c527baa8a 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -70,6 +70,8 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) struct vfio_platform_device *vdev; int ret; + dev_err_once(&adev->dev, "DEPRECATION: vfio-amba is deprecated and will be removed in a future kernel release\n"); + vdev = vfio_alloc_device(vfio_platform_device, vdev, &adev->dev, &vfio_amba_ops); if (IS_ERR(vdev)) From 801ca4ce0bce45aae1da2c8914d2f86cb68f8b55 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 25 Aug 2025 11:58:01 -0600 Subject: [PATCH 43/48] vfio/platform: Mark reset drivers for removal While vfio-platform itself is on a reprieve from being removed[1], these reset drivers don't support any current hardware, are not being tested, and suggest a level of support that doesn't really exist. 
Mark them for removal to surface any remaining user such that we can potentially drop them and simplify the code if none appear. Link: https://lore.kernel.org/all/20250806170314.3768750-3-alex.williamson@redhat.com [1] Reviewed-by: Pranjal Shrivastava Reviewed-by: Mostafa Saleh Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20250825175807.3264083-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/reset/Kconfig | 6 +++--- drivers/vfio/platform/reset/vfio_platform_amdxgbe.c | 2 ++ drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c | 2 ++ drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/platform/reset/Kconfig b/drivers/vfio/platform/reset/Kconfig index dcc08dc145a5..70af0dbe293b 100644 --- a/drivers/vfio/platform/reset/Kconfig +++ b/drivers/vfio/platform/reset/Kconfig @@ -1,21 +1,21 @@ # SPDX-License-Identifier: GPL-2.0-only if VFIO_PLATFORM config VFIO_PLATFORM_CALXEDAXGMAC_RESET - tristate "VFIO support for calxeda xgmac reset" + tristate "VFIO support for calxeda xgmac reset (DEPRECATED)" help Enables the VFIO platform driver to handle reset for Calxeda xgmac If you don't know what to do here, say N. config VFIO_PLATFORM_AMDXGBE_RESET - tristate "VFIO support for AMD XGBE reset" + tristate "VFIO support for AMD XGBE reset (DEPRECATED)" help Enables the VFIO platform driver to handle reset for AMD XGBE If you don't know what to do here, say N. 
config VFIO_PLATFORM_BCMFLEXRM_RESET - tristate "VFIO support for Broadcom FlexRM reset" + tristate "VFIO support for Broadcom FlexRM reset (DEPRECATED)" depends on ARCH_BCM_IPROC || COMPILE_TEST default ARCH_BCM_IPROC help diff --git a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c index abdca900802d..45f386a042a9 100644 --- a/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c +++ b/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c @@ -52,6 +52,8 @@ static int vfio_platform_amdxgbe_reset(struct vfio_platform_device *vdev) u32 dma_mr_value, pcs_value, value; unsigned int count; + dev_err_once(vdev->device, "DEPRECATION: VFIO AMD XGBE platform reset is deprecated and will be removed in a future kernel release\n"); + if (!xgmac_regs->ioaddr) { xgmac_regs->ioaddr = ioremap(xgmac_regs->addr, xgmac_regs->size); diff --git a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c index 1131ebe4837d..51c9d156f307 100644 --- a/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c +++ b/drivers/vfio/platform/reset/vfio_platform_bcmflexrm.c @@ -72,6 +72,8 @@ static int vfio_platform_bcmflexrm_reset(struct vfio_platform_device *vdev) int rc = 0, ret = 0, ring_num = 0; struct vfio_platform_region *reg = &vdev->regions[0]; + dev_err_once(vdev->device, "DEPRECATION: VFIO Broadcom FlexRM platform reset is deprecated and will be removed in a future kernel release\n"); + /* Map FlexRM ring registers if not mapped */ if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); diff --git a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c index 63cc7f0b2e4a..a298045a8e19 100644 --- a/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c +++ b/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -50,6 +50,8 @@ static int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev) { 
struct vfio_platform_region *reg = &vdev->regions[0]; + dev_err_once(vdev->device, "DEPRECATION: VFIO Calxeda xgmac platform reset is deprecated and will be removed in a future kernel release\n"); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); From eaba58355ecd124b4a8c91df7335970ad9fe2624 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 1 Sep 2025 16:18:08 +0800 Subject: [PATCH 44/48] hisi_acc_vfio_pci: Fix reference leak in hisi_acc_vfio_debug_init The debugfs_lookup() function returns a dentry with an increased reference count that must be released by calling dput(). Fixes: b398f91779b8 ("hisi_acc_vfio_pci: register debugfs for hisilicon migration driver") Cc: stable@vger.kernel.org Signed-off-by: Miaoqian Lin Reviewed-by: Longfang Liu Link: https://lore.kernel.org/r/20250901081809.2286649-1-linmq006@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 397f5e445136..fde33f54e99e 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1612,8 +1612,10 @@ static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vd } migf = kzalloc(sizeof(*migf), GFP_KERNEL); - if (!migf) + if (!migf) { + dput(vfio_dev_migration); return; + } hisi_acc_vdev->debug_migf = migf; vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration); @@ -1623,6 +1625,8 @@ static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vd hisi_acc_vf_migf_read); debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc, hisi_acc_vf_debug_cmd); + + dput(vfio_dev_migration); } static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev) From 16df67f2189a71a8310bcebddb87ed569e8352be Mon Sep 17 00:00:00 2001 From: Alex Mastro Date: Mon, 8 Sep 
2025 08:58:40 -0700 Subject: [PATCH 45/48] vfio: return -ENOTTY for unsupported device feature The two implementers of vfio_device_ops.device_feature, vfio_cdx_ioctl_feature and vfio_pci_core_ioctl_feature, return -ENOTTY in the fallthrough case when the feature is unsupported. For consistency, the base case, vfio_ioctl_device_feature, should do the same when device_feature == NULL, indicating an implementation has no feature extensions. Signed-off-by: Alex Mastro Link: https://lore.kernel.org/r/20250908-vfio-enotty-v1-1-4428e1539e2e@fb.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 91a8eae308ea..38c8e9350a60 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1252,7 +1252,7 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) - return -EINVAL; + return -ENOTTY; return device->ops->device_feature(device, feature.flags, arg->data, feature.argsz - minsz); From acb59a4bb8ed34e738a4c3463127bf3f6b5e11a9 Mon Sep 17 00:00:00 2001 From: Zilin Guan Date: Sat, 13 Sep 2025 15:31:54 +0000 Subject: [PATCH 46/48] vfio/pds: replace bitmap_free with vfree host_seq_bmp is allocated with vzalloc but is currently freed with bitmap_free, which uses kfree internally. This mismatch prevents the resource from being released properly and may result in memory leaks or other issues. Fix this by freeing host_seq_bmp with vfree to match the vzalloc allocation.
Fixes: f232836a9152 ("vfio/pds: Add support for dirty page tracking") Signed-off-by: Zilin Guan Reviewed-by: Brett Creeley Link: https://lore.kernel.org/r/20250913153154.1028835-1-zilin@seu.edu.cn Signed-off-by: Alex Williamson --- drivers/vfio/pci/pds/dirty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c index c51f5e4c3dd6..481992142f79 100644 --- a/drivers/vfio/pci/pds/dirty.c +++ b/drivers/vfio/pci/pds/dirty.c @@ -82,7 +82,7 @@ static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region, host_ack_bmp = vzalloc(bytes); if (!host_ack_bmp) { - bitmap_free(host_seq_bmp); + vfree(host_seq_bmp); return -ENOMEM; } From 8b9f128947dd72e0fcf256088a673abac9b720bf Mon Sep 17 00:00:00 2001 From: Timothy Pearson Date: Tue, 23 Sep 2025 12:04:33 -0500 Subject: [PATCH 47/48] vfio/pci: Fix INTx handling on legacy non-PCI 2.3 devices PCI devices prior to PCI 2.3 both use level interrupts and do not support interrupt masking, leading to a failure when passed through to a KVM guest on at least the ppc64 platform. This failure manifests as receiving and acknowledging a single interrupt in the guest, while the device continues to assert the level interrupt indicating a need for further servicing. When lazy IRQ masking is used on DisINTx- (non-PCI 2.3) hardware, the following sequence occurs: * Level IRQ assertion on device * IRQ marked disabled in kernel * Host interrupt handler exits without clearing the interrupt on the device * Eventfd is delivered to userspace * Guest processes IRQ and clears device interrupt * Device de-asserts INTx, then re-asserts INTx while the interrupt is masked * Newly asserted interrupt acknowledged by kernel VMM without being handled * Software mask removed by VFIO driver * Device INTx still asserted, host controller does not see new edge after EOI The behavior is now platform-dependent. 
Some platforms (amd64) will continue to spew IRQs for as long as the INTX line remains asserted, therefore the IRQ will be handled by the host as soon as the mask is dropped. Others (ppc64) will only send the one request, and if it is not handled no further interrupts will be sent. The former behavior theoretically leaves the system vulnerable to interrupt storm, and the latter will result in the device stalling after receiving exactly one interrupt in the guest. Work around this by disabling lazy IRQ masking for DisINTx- INTx devices. Signed-off-by: Timothy Pearson Link: https://lore.kernel.org/r/333803015.1744464.1758647073336.JavaMail.zimbra@raptorengineeringinc.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_intrs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 00583909b380..30d3e921cb0d 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -304,9 +304,14 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; + if (!vdev->pci_2_3) + irq_set_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY); + ret = request_irq(pdev->irq, vfio_intx_handler, irqflags, ctx->name, ctx); if (ret) { + if (!vdev->pci_2_3) + irq_clear_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY); vdev->irq_type = VFIO_PCI_NUM_IRQS; kfree(name); vfio_irq_ctx_free(vdev, ctx, 0); @@ -352,6 +357,8 @@ static void vfio_intx_disable(struct vfio_pci_core_device *vdev) vfio_virqfd_disable(&ctx->unmask); vfio_virqfd_disable(&ctx->mask); free_irq(pdev->irq, ctx); + if (!vdev->pci_2_3) + irq_clear_status_flags(pdev->irq, IRQ_DISABLE_UNLAZY); if (ctx->trigger) eventfd_ctx_put(ctx->trigger); kfree(ctx->name); From 407aa63018d15c35a34938633868e61174d2ef6e Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Thu, 25 Sep 2025 12:09:35 -0500 Subject: [PATCH 48/48] vfio/nvgrace-gpu: Add GB300 SKU to the devid table GB300 is NVIDIA's Grace 
Blackwell Ultra Superchip. Add the GB300 SKU device-id to nvgrace_gpu_vfio_pci_table. Signed-off-by: Tushar Dave Reviewed-by: Ankit Agrawal Link: https://lore.kernel.org/r/20250925170935.121587-1-tdave@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 0adaa6150252..e346392b72f6 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -995,6 +995,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, /* GB200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, + /* GB300 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x31C2) }, {} };