From 2cc3643df568c9d274781ef896368fd50514fb3e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 2 May 2025 07:22:38 -0700 Subject: [PATCH 01/28] vfio/container: ram discard disable helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a helper to set ram discard disable, generate error messages, and cleanup on failure. The second vfio_ram_block_discard_disable call site now performs VFIO_GROUP_UNSET_CONTAINER immediately on failure, instead of relying on the close of the container fd to do so in the kernel, but this is equivalent. Signed-off-by: Steve Sistare Reviewed-by: Cedric Le Goater Link: https://lore.kernel.org/qemu-devel/1746195760-101443-2-git-send-email-steven.sistare@oracle.com [ clg: vfio_attach_discard_disable() -> vfio_container_attach_discard_disable() ] Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 48 +++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 77ff56b43f..2ea137fb02 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -511,16 +511,10 @@ static bool vfio_legacy_setup(VFIOContainerBase *bcontainer, Error **errp) return true; } -static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, - Error **errp) +static bool vfio_container_attach_discard_disable(VFIOContainer *container, + VFIOGroup *group, Error **errp) { - VFIOContainer *container; - VFIOContainerBase *bcontainer; - int ret, fd; - VFIOAddressSpace *space; - VFIOIOMMUClass *vioc; - - space = vfio_address_space_get(as); + int ret; /* * VFIO is currently incompatible with discarding of RAM insofar as the @@ -553,18 +547,32 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, * details once we know which type of IOMMU we are using. 
*/ + ret = vfio_ram_block_discard_disable(container, true); + if (ret) { + error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) { + error_report("vfio: error disconnecting group %d from" + " container", group->groupid); + } + } + return !ret; +} + +static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, + Error **errp) +{ + VFIOContainer *container; + VFIOContainerBase *bcontainer; + int ret, fd; + VFIOAddressSpace *space; + VFIOIOMMUClass *vioc; + + space = vfio_address_space_get(as); + QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, - "Cannot set discarding of RAM broken"); - if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, - &container->fd)) { - error_report("vfio: error disconnecting group %d from" - " container", group->groupid); - } + if (!vfio_container_attach_discard_disable(container, group, errp)) { return false; } group->container = container; @@ -596,9 +604,7 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, goto free_container_exit; } - ret = vfio_ram_block_discard_disable(container, true); - if (ret) { - error_setg_errno(errp, -ret, "Cannot set discarding of RAM broken"); + if (!vfio_container_attach_discard_disable(container, group, errp)) { goto unregister_container_exit; } From a1f267a7d4d9f60ca013e89ca7562cbb483d7d83 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 2 May 2025 07:22:39 -0700 Subject: [PATCH 02/28] vfio/container: reform vfio_container_connect cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the proliferation of exit labels in vfio_container_connect with conditionals for cleaning each piece of state. 
No functional change. Signed-off-by: Steve Sistare Reviewed-by: Cedric Le Goater Link: https://lore.kernel.org/qemu-devel/1746195760-101443-3-git-send-email-steven.sistare@oracle.com [ clg: vfio_attach_discard_disable() -> vfio_container_attach_discard_disable() ] Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 60 +++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 2ea137fb02..4b2864cfa8 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -563,9 +563,12 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, { VFIOContainer *container; VFIOContainerBase *bcontainer; - int ret, fd; + int ret, fd = -1; VFIOAddressSpace *space; - VFIOIOMMUClass *vioc; + VFIOIOMMUClass *vioc = NULL; + bool new_container = false; + bool group_was_added = false; + bool discard_disabled = false; space = vfio_address_space_get(as); @@ -584,35 +587,37 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp); if (fd < 0) { - goto put_space_exit; + goto fail; } ret = ioctl(fd, VFIO_GET_API_VERSION); if (ret != VFIO_API_VERSION) { error_setg(errp, "supported vfio version: %d, " "reported version: %d", VFIO_API_VERSION, ret); - goto close_fd_exit; + goto fail; } container = vfio_create_container(fd, group, errp); if (!container) { - goto close_fd_exit; + goto fail; } + new_container = true; bcontainer = &container->bcontainer; if (!vfio_cpr_register_container(bcontainer, errp)) { - goto free_container_exit; + goto fail; } if (!vfio_container_attach_discard_disable(container, group, errp)) { - goto unregister_container_exit; + goto fail; } + discard_disabled = true; vioc = VFIO_IOMMU_GET_CLASS(bcontainer); assert(vioc->setup); if (!vioc->setup(bcontainer, errp)) { - goto enable_discards_exit; + goto fail; } vfio_group_add_kvm_device(group); @@ -621,35 +626,36 @@ static bool 
vfio_container_connect(VFIOGroup *group, AddressSpace *as, group->container = container; QLIST_INSERT_HEAD(&container->group_list, group, container_next); + group_was_added = true; if (!vfio_listener_register(bcontainer, errp)) { - goto listener_release_exit; + goto fail; } bcontainer->initialized = true; return true; -listener_release_exit: - QLIST_REMOVE(group, container_next); - vfio_group_del_kvm_device(group); + +fail: vfio_listener_unregister(bcontainer); - if (vioc->release) { + + if (group_was_added) { + QLIST_REMOVE(group, container_next); + vfio_group_del_kvm_device(group); + } + if (vioc && vioc->release) { vioc->release(bcontainer); } - -enable_discards_exit: - vfio_ram_block_discard_disable(container, false); - -unregister_container_exit: - vfio_cpr_unregister_container(bcontainer); - -free_container_exit: - object_unref(container); - -close_fd_exit: - close(fd); - -put_space_exit: + if (discard_disabled) { + vfio_ram_block_discard_disable(container, false); + } + if (new_container) { + vfio_cpr_unregister_container(bcontainer); + object_unref(container); + } + if (fd >= 0) { + close(fd); + } vfio_address_space_put(space); return false; From 07f86929e5e1f8506fbdd4c3522c91d74b206ad3 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Fri, 2 May 2025 07:22:40 -0700 Subject: [PATCH 03/28] vfio/container: vfio_container_group_add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add vfio_container_group_add to de-dup some code. No functional change. 
Signed-off-by: Steve Sistare Reviewed-by: Cedric Le Goater Link: https://lore.kernel.org/qemu-devel/1746195760-101443-4-git-send-email-steven.sistare@oracle.com [ clg: vfio_attach_discard_disable() -> vfio_container_attach_discard_disable() ] Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 47 ++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index 4b2864cfa8..a761f0958b 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -558,6 +558,26 @@ static bool vfio_container_attach_discard_disable(VFIOContainer *container, return !ret; } +static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup *group, + Error **errp) +{ + if (!vfio_container_attach_discard_disable(container, group, errp)) { + return false; + } + group->container = container; + QLIST_INSERT_HEAD(&container->group_list, group, container_next); + vfio_group_add_kvm_device(group); + return true; +} + +static void vfio_container_group_del(VFIOContainer *container, VFIOGroup *group) +{ + QLIST_REMOVE(group, container_next); + group->container = NULL; + vfio_group_del_kvm_device(group); + vfio_ram_block_discard_disable(container, false); +} + static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, Error **errp) { @@ -568,20 +588,13 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, VFIOIOMMUClass *vioc = NULL; bool new_container = false; bool group_was_added = false; - bool discard_disabled = false; space = vfio_address_space_get(as); QLIST_FOREACH(bcontainer, &space->containers, next) { container = container_of(bcontainer, VFIOContainer, bcontainer); if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) { - if (!vfio_container_attach_discard_disable(container, group, errp)) { - return false; - } - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); - vfio_group_add_kvm_device(group); - 
return true; + return vfio_container_group_add(container, group, errp); } } @@ -608,11 +621,6 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, goto fail; } - if (!vfio_container_attach_discard_disable(container, group, errp)) { - goto fail; - } - discard_disabled = true; - vioc = VFIO_IOMMU_GET_CLASS(bcontainer); assert(vioc->setup); @@ -620,12 +628,11 @@ static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as, goto fail; } - vfio_group_add_kvm_device(group); - vfio_address_space_insert(space, bcontainer); - group->container = container; - QLIST_INSERT_HEAD(&container->group_list, group, container_next); + if (!vfio_container_group_add(container, group, errp)) { + goto fail; + } group_was_added = true; if (!vfio_listener_register(bcontainer, errp)) { @@ -640,15 +647,11 @@ fail: vfio_listener_unregister(bcontainer); if (group_was_added) { - QLIST_REMOVE(group, container_next); - vfio_group_del_kvm_device(group); + vfio_container_group_del(container, group); } if (vioc && vioc->release) { vioc->release(bcontainer); } - if (discard_disabled) { - vfio_ram_block_discard_disable(container, false); - } if (new_container) { vfio_cpr_unregister_container(bcontainer); object_unref(container); From dd69d846046f697863ebbd18f9a3544d36720476 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:02:57 +0800 Subject: [PATCH 04/28] vfio/igd: Restrict legacy mode to Gen6-9 devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel only provides legacy VBIOS for IGD up to Gen9, and there is no CSM support on later devices. Additionally, Seabios can only handle 32-bit BDSM register used until Gen9. Since legacy mode requires VGA capability, restrict it to Gen6 through Gen9 devices. 
Link: https://lore.kernel.org/qemu-devel/20250325172239.27926-1-tomitamoeko@gmail.com/T/ Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-2-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- docs/igd-assign.txt | 1 + hw/vfio/igd.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt index 3aed7956d5..95beb76812 100644 --- a/docs/igd-assign.txt +++ b/docs/igd-assign.txt @@ -47,6 +47,7 @@ Intel document [1] shows how to dump VBIOS to file. For UEFI Option ROM, see QEMU also provides a "Legacy" mode that implicitly enables full functionality on IGD, it is automatically enabled when +* IGD generation is 6 to 9 (Sandy Bridge to Comet Lake) * Machine type is i440fx * IGD is assigned to guest BDF 00:02.0 * ROM BAR or romfile is present diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index d7e4728fdc..e06484c911 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -517,11 +517,13 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) /* * For backward compatibility, enable legacy mode when + * - Device geneation is 6 to 9 (including both) * - Machine type is i440fx (pc_piix) * - IGD device is at guest BDF 00:02.0 * - Not manually disabled by x-igd-legacy-mode=off */ if ((vdev->igd_legacy_mode != ON_OFF_AUTO_OFF) && + (gen >= 6 && gen <= 9) && !strcmp(MACHINE_GET_CLASS(qdev_get_machine())->family, "pc_piix") && (&vdev->pdev == pci_find_device(pci_device_root_bus(&vdev->pdev), 0, PCI_DEVFN(0x2, 0)))) { @@ -566,7 +568,9 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) vdev->features |= VFIO_FEATURE_ENABLE_IGD_LPC; } else if (vdev->igd_legacy_mode == ON_OFF_AUTO_ON) { error_setg(&err, - "Machine is not i440fx or assigned BDF is not 00:02.0"); + "Machine is not i440fx, assigned BDF is not 00:02.0, " + "or device %04x (gen %d) doesn't support legacy 
mode", + vdev->device_id, gen); goto error; } From 1d5f84f349d27f1d3ea6a0a6261253269fc1cf68 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:02:58 +0800 Subject: [PATCH 05/28] vfio/igd: Always emulate ASLS (OpRegion) register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ASLS register represents the base address of OpRegion, and it is programmed with HPA. In IGD passthrough scenario, it needs to be reprogrammed with GPA by guest firmware. To prevent guest accessing wrong memory range, ASLS should always be emulated and cleared. In GVT-g scenario, emulating ASLS is unnecessary as access is handled by kvmgt backend [1]. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/gpu/drm/i915/gvt/cfg_space.c?h=v6.14#n295 Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-3-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index e06484c911..b1fce76f72 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -182,10 +182,6 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, trace_vfio_pci_igd_opregion_enabled(vdev->vbasedev.name); - pci_set_long(vdev->pdev.config + IGD_ASLS, 0); - pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); - return true; } @@ -584,7 +580,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_LPC) && !vfio_pci_igd_setup_lpc_bridge(vdev, errp)) { goto error; - } + } + + /* + * ASLS (OpRegion address) is read-only, emulated + * It contains HPA, guest firmware need to reprogram it with GPA. 
+ */ + pci_set_long(vdev->pdev.config + IGD_ASLS, 0); + pci_set_long(vdev->pdev.wmask + IGD_ASLS, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_ASLS, ~0); /* * Allow user to override dsm size using x-igd-gms option, in multiples of From c0273e77f2d7aab3312eb557b49332da528ff66b Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:02:59 +0800 Subject: [PATCH 06/28] vfio/igd: Detect IGD device by OpRegion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is currently no straightforward way to distinguish whether an Intel graphics device is an IGD or a discrete GPU. However, only IGD devices have OpRegion. Use the presence of VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION to identify IGD devices. Still, OpRegion on a hotplugged IGD device is not supported. Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-4-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index b1fce76f72..347253d08c 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -185,9 +185,10 @@ static bool vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, return true; } -static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) +static bool vfio_pci_igd_opregion_detect(VFIOPCIDevice *vdev, + struct vfio_region_info **opregion, + Error **errp) { - g_autofree struct vfio_region_info *opregion = NULL; int ret; /* Hotplugging is not supported for opregion access */ @@ -198,17 +199,13 @@ static bool vfio_pci_igd_setup_opregion(VFIOPCIDevice *vdev, Error **errp) ret = vfio_device_get_region_info_type(&vdev->vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, opregion); if (ret) { 
error_setg_errno(errp, -ret, "Device does not supports IGD OpRegion feature"); return false; } - if (!vfio_pci_igd_opregion_init(vdev, opregion, errp)) { - return false; - } - return true; } @@ -480,6 +477,7 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { + g_autofree struct vfio_region_info *opregion = NULL; int ret, gen; uint64_t gms_size; uint64_t *bdsm_size; @@ -487,16 +485,17 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) bool legacy_mode_enabled = false; Error *err = NULL; - /* - * This must be an Intel VGA device at address 00:02.0 for us to even - * consider enabling legacy mode. The vBIOS has dependencies on the - * PCI bus address. - */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || !vfio_is_vga(vdev)) { return true; } + /* IGD device always comes with OpRegion */ + if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + return true; + } + info_report("OpRegion detected on Intel display %x.", vdev->device_id); + /* * IGD is not a standard, they like to change their specs often. 
We * only attempt to support back to SandBridge and we hope that newer @@ -572,7 +571,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) /* Setup OpRegion access */ if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && - !vfio_pci_igd_setup_opregion(vdev, errp)) { + !vfio_pci_igd_opregion_init(vdev, opregion, errp)) { goto error; } @@ -672,8 +671,11 @@ error: */ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) { + g_autofree struct vfio_region_info *opregion = NULL; + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && - !vfio_pci_igd_setup_opregion(vdev, errp)) { + (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp) || + !vfio_pci_igd_opregion_init(vdev, opregion, errp))) { return false; } From 2bd33abcf16b50de8e71b1db98186d6fa67f9a39 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:00 +0800 Subject: [PATCH 07/28] vfio/igd: Check vendor and device ID on GVT-g mdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check the vendor and device ID on GVT-g mdev to ensure it is a supported device [1]. This extra check is required for automatically enabling OpRegion access later. Note that Cherryview and Gemini Lake are marked as supported here since current code cannot distinguish them with other Gen8 and Gen9 devices. Since mdev cannot be created on these devices, this has no functional impact. 
[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/gpu/drm/i915/intel_gvt.c?h=v6.14#n52 Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-5-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 347253d08c..f5dd475028 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -672,6 +672,18 @@ error: static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) { g_autofree struct vfio_region_info *opregion = NULL; + int gen; + + if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || + !vfio_is_vga(vdev)) { + return true; + } + + /* FIXME: Cherryview is Gen8, but don't support GVT-g */ + gen = igd_gen(vdev); + if (gen != 8 && gen != 9) { + return true; + } if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp) || From 106cdbcef4ef1c99c2c66b68f8c3349363fac97b Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:01 +0800 Subject: [PATCH 08/28] vfio/igd: Check OpRegion support on GVT-g mdev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Intel GVT-g backend `kvmgt` always emulates OpRegion for vGPU, make sure the OpRegion is present for enabling access to it automatically later. Also, hotplugging GVT-g vGPU is now always disallowed regardless of OpRegion to prevent potential issues. Intel has never claimed support for GVT-g hotplugging. 
Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-6-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index f5dd475028..9dec32818c 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -685,9 +685,13 @@ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) return true; } + if (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp)) { + /* Should never reach here, KVMGT always emulates OpRegion */ + return false; + } + if ((vdev->features & VFIO_FEATURE_ENABLE_IGD_OPREGION) && - (!vfio_pci_igd_opregion_detect(vdev, &opregion, errp) || - !vfio_pci_igd_opregion_init(vdev, opregion, errp))) { + !vfio_pci_igd_opregion_init(vdev, opregion, errp)) { return false; } From 16cbb43302a228f804c067992f6f61787934f3e9 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:02 +0800 Subject: [PATCH 09/28] vfio/igd: Enable OpRegion by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The presence of OpRegion is now used to detect IGD devices, and guest drivers usually depend on OpRegion to work. Enable OpRegion on IGD devices by default for an out-of-the-box passthrough experience (except pre-boot display output), especially for libvirt users. Example of IGD passthrough with libvirt:
Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-7-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- docs/igd-assign.txt | 4 ++-- hw/vfio/pci.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt index 95beb76812..fc444503ff 100644 --- a/docs/igd-assign.txt +++ b/docs/igd-assign.txt @@ -102,7 +102,7 @@ digital formats work well. Options ======= -* x-igd-opregion=[on|*off*] +* x-igd-opregion=[*on*|off] Copy host IGD OpRegion and expose it to guest with fw_cfg * x-igd-lpc=[on|*off*] @@ -124,7 +124,7 @@ Examples * Adding IGD with OpRegion and LPC ID hack, but without VGA ranges (For UEFI guests) - -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0,x-igd-legacy-mode=off,x-igd-opregion=on,x-igd-lpc=on,romfile=efi_oprom.rom + -device vfio-pci,host=00:02.0,id=hostdev0,addr=2.0,x-igd-legacy-mode=off,x-igd-lpc=on,romfile=efi_oprom.rom Guest firmware diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 6908bcc0d3..e1fab21b47 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -3385,7 +3385,7 @@ static const Property vfio_pci_dev_properties[] = { DEFINE_PROP_BIT("x-req", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, - VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), + VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, true), DEFINE_PROP_BIT("x-igd-lpc", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_LPC_BIT, false), DEFINE_PROP_ON_OFF_AUTO("x-igd-legacy-mode", VFIOPCIDevice, From 395a1f7941f4e46044c865ea0c39d1eef0eaddc6 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:03 +0800 Subject: [PATCH 10/28] vfio/igd: Allow overriding GMS with 0xf0 to 0xfe on Gen9+ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Gen9 and later IGD devices, GMS 0xf0 to 0xfe represents 4MB to 60MB 
pre-allocated memory size in 4MB increments. Allow users overriding GMS with these values. Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-8-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 59 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 9dec32818c..8ad9d723e2 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -412,6 +412,44 @@ static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) return true; } +static bool vfio_pci_igd_override_gms(int gen, uint32_t gms, uint32_t *gmch) +{ + bool ret = false; + + if (gen == -1) { + error_report("x-igd-gms is not supported on this device"); + } else if (gen < 8) { + if (gms <= 0x10) { + *gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN6_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x10"); + } + } else if (gen == 8) { + if (gms <= 0x40) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, "x-igd-gms", "0~0x40"); + } + } else { + /* 0x0 to 0x40: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if ((gms <= 0x40) || (gms >= 0xf0 && gms <= 0xfe)) { + *gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); + *gmch |= gms << IGD_GMCH_GEN8_GMS_SHIFT; + ret = true; + } else { + error_report(QERR_INVALID_PARAMETER_VALUE, + "x-igd-gms", "0~0x40 or 0xf0~0xfe"); + } + } + + return ret; +} + #define IGD_GGC_MMIO_OFFSET 0x108040 #define IGD_BDSM_MMIO_OFFSET 0x1080C0 @@ -594,24 +632,9 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * 32MiB. 
This option should only be used when the desired size cannot be * set from DVMT Pre-Allocated option in host BIOS. */ - if (vdev->igd_gms) { - if (gen < 8) { - if (vdev->igd_gms <= 0x10) { - gmch &= ~(IGD_GMCH_GEN6_GMS_MASK << IGD_GMCH_GEN6_GMS_SHIFT); - gmch |= vdev->igd_gms << IGD_GMCH_GEN6_GMS_SHIFT; - } else { - error_report(QERR_INVALID_PARAMETER_VALUE, - "x-igd-gms", "0~0x10"); - } - } else { - if (vdev->igd_gms <= 0x40) { - gmch &= ~(IGD_GMCH_GEN8_GMS_MASK << IGD_GMCH_GEN8_GMS_SHIFT); - gmch |= vdev->igd_gms << IGD_GMCH_GEN8_GMS_SHIFT; - } else { - error_report(QERR_INVALID_PARAMETER_VALUE, - "x-igd-gms", "0~0x40"); - } - } + if (vdev->igd_gms && + !vfio_pci_igd_override_gms(gen, vdev->igd_gms, &gmch)) { + return false; } gms_size = igd_stolen_memory_size(gen, gmch); From 36e4047a9b1c6577f3f09a75abf39f886837cb60 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:04 +0800 Subject: [PATCH 11/28] vfio/igd: Only emulate GGC register when x-igd-gms is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit x-igd-gms is used for overriding DSM region size in GGC register in both config space and MMIO BAR0, by default host value is used. There is no need to emulate it in default case. 
Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-9-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- hw/vfio/igd.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 8ad9d723e2..886d44f017 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -478,22 +478,24 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) return; } - ggc_quirk = vfio_quirk_alloc(1); - ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror)); - ggc_mirror->mem = ggc_quirk->mem; - ggc_mirror->vdev = vdev; - ggc_mirror->bar = nr; - ggc_mirror->offset = IGD_GGC_MMIO_OFFSET; - ggc_mirror->config_offset = IGD_GMCH; + if (vdev->igd_gms) { + ggc_quirk = vfio_quirk_alloc(1); + ggc_mirror = ggc_quirk->data = g_malloc0(sizeof(*ggc_mirror)); + ggc_mirror->mem = ggc_quirk->mem; + ggc_mirror->vdev = vdev; + ggc_mirror->bar = nr; + ggc_mirror->offset = IGD_GGC_MMIO_OFFSET; + ggc_mirror->config_offset = IGD_GMCH; - memory_region_init_io(ggc_mirror->mem, OBJECT(vdev), - &vfio_generic_mirror_quirk, ggc_mirror, - "vfio-igd-ggc-quirk", 2); - memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, - ggc_mirror->offset, ggc_mirror->mem, - 1); + memory_region_init_io(ggc_mirror->mem, OBJECT(vdev), + &vfio_generic_mirror_quirk, ggc_mirror, + "vfio-igd-ggc-quirk", 2); + memory_region_add_subregion_overlap(vdev->bars[nr].region.mem, + ggc_mirror->offset, ggc_mirror->mem, + 1); - QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next); + QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, ggc_quirk, next); + } bdsm_quirk = vfio_quirk_alloc(1); bdsm_mirror = bdsm_quirk->data = g_malloc0(sizeof(*bdsm_mirror)); @@ -632,9 +634,15 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * 32MiB. 
This option should only be used when the desired size cannot be * set from DVMT Pre-Allocated option in host BIOS. */ - if (vdev->igd_gms && - !vfio_pci_igd_override_gms(gen, vdev->igd_gms, &gmch)) { - return false; + if (vdev->igd_gms) { + if (!vfio_pci_igd_override_gms(gen, vdev->igd_gms, &gmch)) { + return false; + } + + /* GMCH is read-only, emulated */ + pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); + pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); + pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); } gms_size = igd_stolen_memory_size(gen, gmch); @@ -652,11 +660,6 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", bdsm_size, sizeof(*bdsm_size)); - /* GMCH is read-only, emulated */ - pci_set_long(vdev->pdev.config + IGD_GMCH, gmch); - pci_set_long(vdev->pdev.wmask + IGD_GMCH, 0); - pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); - /* BDSM is read-write, emulated. The BIOS needs to be able to write it */ if (gen < 11) { pci_set_long(vdev->pdev.config + IGD_BDSM, 0); From 7969cf4639794e0af84862a269daac72adcfb554 Mon Sep 17 00:00:00 2001 From: Tomita Moeko Date: Tue, 6 May 2025 01:03:05 +0800 Subject: [PATCH 12/28] vfio/igd: Remove generation limitation for IGD passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting from Intel Core Ultra Series (Meteor Lake), Data Stolen Memory has become a part of LMEMBAR (MMIO BAR2) [1][2], meaning that BDSM and GGC register quirks are no longer needed on these platforms. To support Meteor/Arrow/Lunar Lake and future IGD devices, remove the generation limitation in IGD passthrough, and apply BDSM and GGC quirks only to known Gen6-12 devices. 
[1] https://edc.intel.com/content/www/us/en/design/publications/14th-generation-core-processors-cfg-and-mem-registers/d2-f0-processor-graphics-registers/ [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/gpu/drm/i915/gem/i915_gem_stolen.c?h=v6.14#n142 Signed-off-by: Tomita Moeko Reviewed-by: Corvin Köhne Reviewed-by: Alex Williamson Tested-by: Alex Williamson Link: https://lore.kernel.org/qemu-devel/20250505170305.23622-10-tomitamoeko@gmail.com Signed-off-by: Cédric Le Goater --- docs/igd-assign.txt | 6 +++++ hw/vfio/igd.c | 58 ++++++++++++++++----------------------------- 2 files changed, 27 insertions(+), 37 deletions(-) diff --git a/docs/igd-assign.txt b/docs/igd-assign.txt index fc444503ff..af4e8391fc 100644 --- a/docs/igd-assign.txt +++ b/docs/igd-assign.txt @@ -157,6 +157,12 @@ fw_cfg requirements on the VM firmware: it's expected that this fw_cfg file is only relevant to a single PCI class VGA device with Intel vendor ID, appearing at PCI bus address 00:02.0. + Starting from Meteor Lake, IGD devices access stolen memory via its MMIO + BAR2 (LMEMBAR) and removed the BDSM register in config space. There is + no need for guest firmware to allocate data stolen memory in guest address + space and write it to BDSM register. Value of this fw_cfg file is 0 in + such case. + Upstream Seabios has OpRegion and BDSM (pre-Gen11 device only) support. However, the support is not accepted by upstream EDK2/OVMF. A recommended solution is to create a virtual OpRom with following DXE drivers: diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 886d44f017..3ee1a73b57 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -103,6 +103,7 @@ static int igd_gen(VFIOPCIDevice *vdev) /* * Unfortunately, Intel changes it's specification quite often. This makes * it impossible to use a suitable default value for unknown devices. + * Return -1 for not applying any generation-specific quirks. 
*/ return -1; } @@ -459,20 +460,12 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) VFIOConfigMirrorQuirk *ggc_mirror, *bdsm_mirror; int gen; - /* - * This must be an Intel VGA device at address 00:02.0 for us to even - * consider enabling legacy mode. Some driver have dependencies on the PCI - * bus address. - */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || !vfio_is_vga(vdev) || nr != 0) { return; } - /* - * Only on IGD devices of gen 11 and above, the BDSM register is mirrored - * into MMIO space and read from MMIO space by the Windows driver. - */ + /* Only on IGD Gen6-12 device needs quirks in BAR 0 */ gen = igd_gen(vdev); if (gen < 6) { return; @@ -519,7 +512,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { g_autofree struct vfio_region_info *opregion = NULL; int ret, gen; - uint64_t gms_size; + uint64_t gms_size = 0; uint64_t *bdsm_size; uint32_t gmch; bool legacy_mode_enabled = false; @@ -536,18 +529,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) } info_report("OpRegion detected on Intel display %x.", vdev->device_id); - /* - * IGD is not a standard, they like to change their specs often. We - * only attempt to support back to SandBridge and we hope that newer - * devices maintain compatibility with generation 8. - */ gen = igd_gen(vdev); - if (gen == -1) { - error_report("IGD device %s is unsupported in legacy mode, " - "try SandyBridge or newer", vdev->vbasedev.name); - return true; - } - gmch = vfio_pci_read_config(&vdev->pdev, IGD_GMCH, 4); /* @@ -645,32 +627,34 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) pci_set_long(vdev->emulated_config_bits + IGD_GMCH, ~0); } - gms_size = igd_stolen_memory_size(gen, gmch); + if (gen > 0) { + gms_size = igd_stolen_memory_size(gen, gmch); + + /* BDSM is read-write, emulated. 
BIOS needs to be able to write it */ + if (gen < 11) { + pci_set_long(vdev->pdev.config + IGD_BDSM, 0); + pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); + pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); + } else { + pci_set_quad(vdev->pdev.config + IGD_BDSM_GEN11, 0); + pci_set_quad(vdev->pdev.wmask + IGD_BDSM_GEN11, ~0); + pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); + } + } /* * Request reserved memory for stolen memory via fw_cfg. VM firmware * must allocate a 1MB aligned reserved memory region below 4GB with - * the requested size (in bytes) for use by the Intel PCI class VGA - * device at VM address 00:02.0. The base address of this reserved - * memory region must be written to the device BDSM register at PCI - * config offset 0x5C. + * the requested size (in bytes) for use by the IGD device. The base + * address of this reserved memory region must be written to the + * device BDSM register. + * For newer device without BDSM register, this fw_cfg item is 0. */ bdsm_size = g_malloc(sizeof(*bdsm_size)); *bdsm_size = cpu_to_le64(gms_size); fw_cfg_add_file(fw_cfg_find(), "etc/igd-bdsm-size", bdsm_size, sizeof(*bdsm_size)); - /* BDSM is read-write, emulated. 
The BIOS needs to be able to write it */ - if (gen < 11) { - pci_set_long(vdev->pdev.config + IGD_BDSM, 0); - pci_set_long(vdev->pdev.wmask + IGD_BDSM, ~0); - pci_set_long(vdev->emulated_config_bits + IGD_BDSM, ~0); - } else { - pci_set_quad(vdev->pdev.config + IGD_BDSM_GEN11, 0); - pci_set_quad(vdev->pdev.wmask + IGD_BDSM_GEN11, ~0); - pci_set_quad(vdev->emulated_config_bits + IGD_BDSM_GEN11, ~0); - } - trace_vfio_pci_igd_bdsm_enabled(vdev->vbasedev.name, (gms_size / MiB)); return true; From f4df9f261866452a842762b7bad6031d3f5f7c35 Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Fri, 25 Apr 2025 01:23:56 -0400 Subject: [PATCH 13/28] linux-header: update-linux-header script changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel commit 8a141be3233a changed from using __ASSEMBLY__ to __ASSEMBLER__. Update the update-linux-headers script to match. Signed-off-by: Rorie Reyes Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250425052401.8287-2-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- scripts/update-linux-headers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh index 8913e4fb99..b43b8ef75a 100755 --- a/scripts/update-linux-headers.sh +++ b/scripts/update-linux-headers.sh @@ -177,7 +177,7 @@ EOF # Remove everything except the macros from bootparam.h avoiding the # unnecessary import of several video/ist/etc headers - sed -e '/__ASSEMBLY__/,/__ASSEMBLY__/d' \ + sed -e '/__ASSEMBLER__/,/__ASSEMBLER__/d' \ "$hdrdir/include/asm/bootparam.h" > "$hdrdir/bootparam.h" cp_portable "$hdrdir/bootparam.h" \ "$output/include/standard-headers/asm-$arch" From 1cab5a02ab8144aad2abd001835e49104e4aae0f Mon Sep 17 00:00:00 2001 From: Rorie Reyes Date: Fri, 25 Apr 2025 01:23:57 -0400 Subject: [PATCH 14/28] linux-headers: Update to Linux v6.15-rc3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Update headers to retrieve uapi information for vfio-ap Signed-off-by: Rorie Reyes Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250425052401.8287-3-rreyes@linux.ibm.com Signed-off-by: Cédric Le Goater --- include/standard-headers/asm-x86/setup_data.h | 4 +- include/standard-headers/drm/drm_fourcc.h | 41 ++++++ include/standard-headers/linux/const.h | 2 +- include/standard-headers/linux/ethtool.h | 22 +++ include/standard-headers/linux/fuse.h | 12 +- include/standard-headers/linux/pci_regs.h | 13 +- include/standard-headers/linux/virtio_net.h | 13 ++ include/standard-headers/linux/virtio_snd.h | 2 +- linux-headers/asm-arm64/kvm.h | 11 ++ linux-headers/asm-arm64/unistd_64.h | 1 + linux-headers/asm-generic/mman-common.h | 1 + linux-headers/asm-generic/unistd.h | 4 +- linux-headers/asm-loongarch/unistd_64.h | 1 + linux-headers/asm-mips/unistd_n32.h | 1 + linux-headers/asm-mips/unistd_n64.h | 1 + linux-headers/asm-mips/unistd_o32.h | 1 + linux-headers/asm-powerpc/unistd_32.h | 1 + linux-headers/asm-powerpc/unistd_64.h | 1 + linux-headers/asm-riscv/kvm.h | 2 + linux-headers/asm-riscv/unistd_32.h | 1 + linux-headers/asm-riscv/unistd_64.h | 1 + linux-headers/asm-s390/unistd_32.h | 1 + linux-headers/asm-s390/unistd_64.h | 1 + linux-headers/asm-x86/kvm.h | 3 + linux-headers/asm-x86/unistd_32.h | 1 + linux-headers/asm-x86/unistd_64.h | 1 + linux-headers/asm-x86/unistd_x32.h | 1 + linux-headers/linux/bits.h | 8 +- linux-headers/linux/const.h | 2 +- linux-headers/linux/iommufd.h | 129 +++++++++++++++++- linux-headers/linux/kvm.h | 1 + linux-headers/linux/psp-sev.h | 21 ++- linux-headers/linux/stddef.h | 2 + linux-headers/linux/vfio.h | 30 ++-- 34 files changed, 301 insertions(+), 36 deletions(-) diff --git a/include/standard-headers/asm-x86/setup_data.h b/include/standard-headers/asm-x86/setup_data.h index 09355f54c5..a483d72f42 100644 --- a/include/standard-headers/asm-x86/setup_data.h +++ 
b/include/standard-headers/asm-x86/setup_data.h @@ -18,7 +18,7 @@ #define SETUP_INDIRECT (1<<31) #define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include "standard-headers/linux/types.h" @@ -78,6 +78,6 @@ struct ima_setup_data { uint64_t size; } QEMU_PACKED; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_SETUP_DATA_H */ diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index 708647776f..a8b759dcbc 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -420,6 +420,7 @@ extern "C" { #define DRM_FORMAT_MOD_VENDOR_ARM 0x08 #define DRM_FORMAT_MOD_VENDOR_ALLWINNER 0x09 #define DRM_FORMAT_MOD_VENDOR_AMLOGIC 0x0a +#define DRM_FORMAT_MOD_VENDOR_MTK 0x0b /* add more to the end as needed */ @@ -1452,6 +1453,46 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) */ #define AMLOGIC_FBC_OPTION_MEM_SAVING (1ULL << 0) +/* MediaTek modifiers + * Bits Parameter Notes + * ----- ------------------------ --------------------------------------------- + * 7: 0 TILE LAYOUT Values are MTK_FMT_MOD_TILE_* + * 15: 8 COMPRESSION Values are MTK_FMT_MOD_COMPRESS_* + * 23:16 10 BIT LAYOUT Values are MTK_FMT_MOD_10BIT_LAYOUT_* + * + */ + +#define DRM_FORMAT_MOD_MTK(__flags) fourcc_mod_code(MTK, __flags) + +/* + * MediaTek Tiled Modifier + * The lowest 8 bits of the modifier is used to specify the tiling + * layout. Only the 16L_32S tiling is used for now, but we define an + * "untiled" version and leave room for future expansion. 
+ */ +#define MTK_FMT_MOD_TILE_MASK 0xf +#define MTK_FMT_MOD_TILE_NONE 0x0 +#define MTK_FMT_MOD_TILE_16L32S 0x1 + +/* + * Bits 8-15 specify compression options + */ +#define MTK_FMT_MOD_COMPRESS_MASK (0xf << 8) +#define MTK_FMT_MOD_COMPRESS_NONE (0x0 << 8) +#define MTK_FMT_MOD_COMPRESS_V1 (0x1 << 8) + +/* + * Bits 16-23 specify how the bits of 10 bit formats are + * stored out in memory + */ +#define MTK_FMT_MOD_10BIT_LAYOUT_MASK (0xf << 16) +#define MTK_FMT_MOD_10BIT_LAYOUT_PACKED (0x0 << 16) +#define MTK_FMT_MOD_10BIT_LAYOUT_LSBTILED (0x1 << 16) +#define MTK_FMT_MOD_10BIT_LAYOUT_LSBRASTER (0x2 << 16) + +/* alias for the most common tiling format */ +#define DRM_FORMAT_MOD_MTK_16L_32S_TILE DRM_FORMAT_MOD_MTK(MTK_FMT_MOD_TILE_16L32S) + /* * AMD modifiers * diff --git a/include/standard-headers/linux/const.h b/include/standard-headers/linux/const.h index 2122610de7..95ede23342 100644 --- a/include/standard-headers/linux/const.h +++ b/include/standard-headers/linux/const.h @@ -33,7 +33,7 @@ * Missing __asm__ support * * __BIT128() would not work in the __asm__ code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. 
* diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h index e83382531c..5d1ad5fdea 100644 --- a/include/standard-headers/linux/ethtool.h +++ b/include/standard-headers/linux/ethtool.h @@ -2059,6 +2059,24 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_10baseT1S_Half_BIT = 100, ETHTOOL_LINK_MODE_10baseT1S_P2MP_Half_BIT = 101, ETHTOOL_LINK_MODE_10baseT1BRR_Full_BIT = 102, + ETHTOOL_LINK_MODE_200000baseCR_Full_BIT = 103, + ETHTOOL_LINK_MODE_200000baseKR_Full_BIT = 104, + ETHTOOL_LINK_MODE_200000baseDR_Full_BIT = 105, + ETHTOOL_LINK_MODE_200000baseDR_2_Full_BIT = 106, + ETHTOOL_LINK_MODE_200000baseSR_Full_BIT = 107, + ETHTOOL_LINK_MODE_200000baseVR_Full_BIT = 108, + ETHTOOL_LINK_MODE_400000baseCR2_Full_BIT = 109, + ETHTOOL_LINK_MODE_400000baseKR2_Full_BIT = 110, + ETHTOOL_LINK_MODE_400000baseDR2_Full_BIT = 111, + ETHTOOL_LINK_MODE_400000baseDR2_2_Full_BIT = 112, + ETHTOOL_LINK_MODE_400000baseSR2_Full_BIT = 113, + ETHTOOL_LINK_MODE_400000baseVR2_Full_BIT = 114, + ETHTOOL_LINK_MODE_800000baseCR4_Full_BIT = 115, + ETHTOOL_LINK_MODE_800000baseKR4_Full_BIT = 116, + ETHTOOL_LINK_MODE_800000baseDR4_Full_BIT = 117, + ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT = 118, + ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT = 119, + ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT = 120, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -2271,6 +2289,10 @@ static inline int ethtool_validate_duplex(uint8_t duplex) * be exploited to reduce the RSS queue spread. 
*/ #define RXH_XFRM_SYM_XOR (1 << 0) +/* Similar to SYM_XOR, except that one copy of the XOR'ed fields is replaced by + * an OR of the same fields + */ +#define RXH_XFRM_SYM_OR_XOR (1 << 1) #define RXH_XFRM_NO_CHANGE 0xff /* L2-L4 network traffic flow types */ diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index d303effb2a..a2b5815d89 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -229,6 +229,9 @@ * - FUSE_URING_IN_OUT_HEADER_SZ * - FUSE_URING_OP_IN_OUT_SZ * - enum fuse_uring_cmd + * + * 7.43 + * - add FUSE_REQUEST_TIMEOUT */ #ifndef _LINUX_FUSE_H @@ -260,7 +263,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 42 +#define FUSE_KERNEL_MINOR_VERSION 43 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -431,6 +434,8 @@ struct fuse_file_lock { * of the request ID indicates resend requests * FUSE_ALLOW_IDMAP: allow creation of idmapped mounts * FUSE_OVER_IO_URING: Indicate that client supports io-uring + * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests. 
+ * init_out.request_timeout contains the timeout (in secs) */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -473,11 +478,11 @@ struct fuse_file_lock { #define FUSE_PASSTHROUGH (1ULL << 37) #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) #define FUSE_HAS_RESEND (1ULL << 39) - /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP #define FUSE_ALLOW_IDMAP (1ULL << 40) #define FUSE_OVER_IO_URING (1ULL << 41) +#define FUSE_REQUEST_TIMEOUT (1ULL << 42) /** * CUSE INIT request/reply flags @@ -905,7 +910,8 @@ struct fuse_init_out { uint16_t map_alignment; uint32_t flags2; uint32_t max_stack_depth; - uint32_t unused[6]; + uint16_t request_timeout; + uint16_t unused[11]; }; #define CUSE_INIT_INFO_MAX 4096 diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index 3445c4970e..ba326710f9 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -486,6 +486,7 @@ #define PCI_EXP_TYPE_RC_EC 0xa /* Root Complex Event Collector */ #define PCI_EXP_FLAGS_SLOT 0x0100 /* Slot implemented */ #define PCI_EXP_FLAGS_IRQ 0x3e00 /* Interrupt message number */ +#define PCI_EXP_FLAGS_FLIT 0x8000 /* Flit Mode Supported */ #define PCI_EXP_DEVCAP 0x04 /* Device capabilities */ #define PCI_EXP_DEVCAP_PAYLOAD 0x00000007 /* Max_Payload_Size */ #define PCI_EXP_DEVCAP_PHANTOM 0x00000018 /* Phantom functions */ @@ -795,6 +796,8 @@ #define PCI_ERR_CAP_ECRC_CHKC 0x00000080 /* ECRC Check Capable */ #define PCI_ERR_CAP_ECRC_CHKE 0x00000100 /* ECRC Check Enable */ #define PCI_ERR_CAP_PREFIX_LOG_PRESENT 0x00000800 /* TLP Prefix Log Present */ +#define PCI_ERR_CAP_TLP_LOG_FLIT 0x00040000 /* TLP was logged in Flit Mode */ +#define PCI_ERR_CAP_TLP_LOG_SIZE 0x00f80000 /* Logged TLP Size (only in Flit mode) */ #define PCI_ERR_HEADER_LOG 0x1c /* Header Log Register (16 bytes) */ #define PCI_ERR_ROOT_COMMAND 0x2c /* Root Error Command */ #define 
PCI_ERR_ROOT_CMD_COR_EN 0x00000001 /* Correctable Err Reporting Enable */ @@ -1013,7 +1016,7 @@ /* Resizable BARs */ #define PCI_REBAR_CAP 4 /* capability register */ -#define PCI_REBAR_CAP_SIZES 0x00FFFFF0 /* supported BAR sizes */ +#define PCI_REBAR_CAP_SIZES 0xFFFFFFF0 /* supported BAR sizes */ #define PCI_REBAR_CTRL 8 /* control register */ #define PCI_REBAR_CTRL_BAR_IDX 0x00000007 /* BAR index */ #define PCI_REBAR_CTRL_NBAR_MASK 0x000000E0 /* # of resizable BARs */ @@ -1061,8 +1064,9 @@ #define PCI_EXP_DPC_CAP_RP_EXT 0x0020 /* Root Port Extensions */ #define PCI_EXP_DPC_CAP_POISONED_TLP 0x0040 /* Poisoned TLP Egress Blocking Supported */ #define PCI_EXP_DPC_CAP_SW_TRIGGER 0x0080 /* Software Triggering Supported */ -#define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0x0F00 /* RP PIO Log Size */ +#define PCI_EXP_DPC_RP_PIO_LOG_SIZE 0x0F00 /* RP PIO Log Size [3:0] */ #define PCI_EXP_DPC_CAP_DL_ACTIVE 0x1000 /* ERR_COR signal on DL_Active supported */ +#define PCI_EXP_DPC_RP_PIO_LOG_SIZE4 0x2000 /* RP PIO Log Size [4] */ #define PCI_EXP_DPC_CTL 0x06 /* DPC control */ #define PCI_EXP_DPC_CTL_EN_FATAL 0x0001 /* Enable trigger on ERR_FATAL message */ @@ -1205,9 +1209,12 @@ #define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX 0x000000ff #define PCI_DOE_DATA_OBJECT_DISC_REQ_3_VER 0x0000ff00 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID 0x0000ffff -#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL 0x00ff0000 +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE 0x00ff0000 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX 0xff000000 +/* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */ +#define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE + /* Compute Express Link (CXL r3.1, sec 8.1.5) */ #define PCI_DVSEC_CXL_PORT 3 #define PCI_DVSEC_CXL_PORT_CTL 0x0c diff --git a/include/standard-headers/linux/virtio_net.h b/include/standard-headers/linux/virtio_net.h index fc594fe5fc..982e854f14 100644 --- a/include/standard-headers/linux/virtio_net.h +++ 
b/include/standard-headers/linux/virtio_net.h @@ -327,6 +327,19 @@ struct virtio_net_rss_config { uint8_t hash_key_data[/* hash_key_length */]; }; +struct virtio_net_rss_config_hdr { + uint32_t hash_types; + uint16_t indirection_table_mask; + uint16_t unclassified_queue; + uint16_t indirection_table[/* 1 + indirection_table_mask */]; +}; + +struct virtio_net_rss_config_trailer { + uint16_t max_tx_vq; + uint8_t hash_key_length; + uint8_t hash_key_data[/* hash_key_length */]; +}; + #define VIRTIO_NET_CTRL_MQ_RSS_CONFIG 1 /* diff --git a/include/standard-headers/linux/virtio_snd.h b/include/standard-headers/linux/virtio_snd.h index 860f12e0a4..160d57899f 100644 --- a/include/standard-headers/linux/virtio_snd.h +++ b/include/standard-headers/linux/virtio_snd.h @@ -25,7 +25,7 @@ struct virtio_snd_config { uint32_t streams; /* # of available channel maps */ uint32_t chmaps; - /* # of available control elements */ + /* # of available control elements (if VIRTIO_SND_F_CTLS) */ uint32_t controls; }; diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index ec1e82bdc8..4e6aff08df 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -105,6 +105,7 @@ struct kvm_regs { #define KVM_ARM_VCPU_PTRAUTH_ADDRESS 5 /* VCPU uses address authentication */ #define KVM_ARM_VCPU_PTRAUTH_GENERIC 6 /* VCPU uses generic authentication */ #define KVM_ARM_VCPU_HAS_EL2 7 /* Support nested virtualization */ +#define KVM_ARM_VCPU_HAS_EL2_E2H0 8 /* Limit NV support to E2H RES0 */ struct kvm_vcpu_init { __u32 target; @@ -365,6 +366,7 @@ enum { KVM_REG_ARM_STD_HYP_BIT_PV_TIME = 0, }; +/* Vendor hyper call function numbers 0-63 */ #define KVM_REG_ARM_VENDOR_HYP_BMAP KVM_REG_ARM_FW_FEAT_BMAP_REG(2) enum { @@ -372,6 +374,14 @@ enum { KVM_REG_ARM_VENDOR_HYP_BIT_PTP = 1, }; +/* Vendor hyper call function numbers 64-127 */ +#define KVM_REG_ARM_VENDOR_HYP_BMAP_2 KVM_REG_ARM_FW_FEAT_BMAP_REG(3) + +enum { + KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_VER = 0, + 
KVM_REG_ARM_VENDOR_HYP_BIT_DISCOVER_IMPL_CPUS = 1, +}; + /* Device Control API on vm fd */ #define KVM_ARM_VM_SMCCC_CTRL 0 #define KVM_ARM_VM_SMCCC_FILTER 0 @@ -394,6 +404,7 @@ enum { #define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 #define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 #define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 +#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 #define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) diff --git a/linux-headers/asm-arm64/unistd_64.h b/linux-headers/asm-arm64/unistd_64.h index d4e90fff76..ee9aaebdf3 100644 --- a/linux-headers/asm-arm64/unistd_64.h +++ b/linux-headers/asm-arm64/unistd_64.h @@ -323,6 +323,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-generic/mman-common.h b/linux-headers/asm-generic/mman-common.h index 1ea2c4c33b..ef1c27fa3c 100644 --- a/linux-headers/asm-generic/mman-common.h +++ b/linux-headers/asm-generic/mman-common.h @@ -85,6 +85,7 @@ /* compatibility flags */ #define MAP_FILE 0 +#define PKEY_UNRESTRICTED 0x0 #define PKEY_DISABLE_ACCESS 0x1 #define PKEY_DISABLE_WRITE 0x2 #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index 88dc393c2b..2892a45023 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -849,9 +849,11 @@ __SYSCALL(__NR_getxattrat, sys_getxattrat) __SYSCALL(__NR_listxattrat, sys_listxattrat) #define __NR_removexattrat 466 __SYSCALL(__NR_removexattrat, sys_removexattrat) +#define __NR_open_tree_attr 467 +__SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) #undef __NR_syscalls -#define __NR_syscalls 467 +#define __NR_syscalls 468 /* * 32 bit systems traditionally used different diff --git a/linux-headers/asm-loongarch/unistd_64.h 
b/linux-headers/asm-loongarch/unistd_64.h index 23fb96a8a7..50d22df8f7 100644 --- a/linux-headers/asm-loongarch/unistd_64.h +++ b/linux-headers/asm-loongarch/unistd_64.h @@ -319,6 +319,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 9a75719644..bdcc2f460b 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -395,5 +395,6 @@ #define __NR_getxattrat (__NR_Linux + 464) #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) +#define __NR_open_tree_attr (__NR_Linux + 467) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index 7086783b0c..3b6b0193b6 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -371,5 +371,6 @@ #define __NR_getxattrat (__NR_Linux + 464) #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) +#define __NR_open_tree_attr (__NR_Linux + 467) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index b3825823e4..4609a4b4d3 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -441,5 +441,6 @@ #define __NR_getxattrat (__NR_Linux + 464) #define __NR_listxattrat (__NR_Linux + 465) #define __NR_removexattrat (__NR_Linux + 466) +#define __NR_open_tree_attr (__NR_Linux + 467) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 38ee4dc35d..5d38a427e0 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -448,6 +448,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define 
__NR_open_tree_attr 467 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 5e5f156834..860a488e4d 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -420,6 +420,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index f06bc5efcd..5f59fd226c 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -182,6 +182,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_SVVPTC, KVM_RISCV_ISA_EXT_ZABHA, KVM_RISCV_ISA_EXT_ZICCRSE, + KVM_RISCV_ISA_EXT_ZAAMO, + KVM_RISCV_ISA_EXT_ZALRSC, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/linux-headers/asm-riscv/unistd_32.h b/linux-headers/asm-riscv/unistd_32.h index 74f6127aed..a5e769f1d9 100644 --- a/linux-headers/asm-riscv/unistd_32.h +++ b/linux-headers/asm-riscv/unistd_32.h @@ -314,6 +314,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-riscv/unistd_64.h b/linux-headers/asm-riscv/unistd_64.h index bb6a15a2ec..8df4d64841 100644 --- a/linux-headers/asm-riscv/unistd_64.h +++ b/linux-headers/asm-riscv/unistd_64.h @@ -324,6 +324,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 620201cb36..85eedbd18e 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -439,5 +439,6 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_S390_UNISTD_32_H */ diff --git 
a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index e7e4a10aaf..c03b1b9701 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -387,5 +387,6 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index 86f2c34e7a..dc591fb17e 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -557,6 +557,9 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) #define KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA (1 << 8) +#define KVM_XEN_MSR_MIN_INDEX 0x40000000u +#define KVM_XEN_MSR_MAX_INDEX 0x4fffffffu + struct kvm_xen_hvm_config { __u32 flags; __u32 msr; diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index a2eb492a75..491d6b4eb6 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -457,6 +457,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index 2f5fc400f5..7cf88bf9bd 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -380,6 +380,7 @@ #define __NR_getxattrat 464 #define __NR_listxattrat 465 #define __NR_removexattrat 466 +#define __NR_open_tree_attr 467 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index fecd832e7f..82959111e6 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -333,6 +333,7 @@ #define __NR_getxattrat (__X32_SYSCALL_BIT + 464) #define __NR_listxattrat (__X32_SYSCALL_BIT + 465) #define __NR_removexattrat (__X32_SYSCALL_BIT + 466) +#define __NR_open_tree_attr (__X32_SYSCALL_BIT + 
467) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/bits.h b/linux-headers/linux/bits.h index c0d00c0a98..58596d18f4 100644 --- a/linux-headers/linux/bits.h +++ b/linux-headers/linux/bits.h @@ -4,13 +4,9 @@ #ifndef _LINUX_BITS_H #define _LINUX_BITS_H -#define __GENMASK(h, l) \ - (((~_UL(0)) - (_UL(1) << (l)) + 1) & \ - (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) \ - (((~_ULL(0)) - (_ULL(1) << (l)) + 1) & \ - (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/linux-headers/linux/const.h b/linux-headers/linux/const.h index 2122610de7..95ede23342 100644 --- a/linux-headers/linux/const.h +++ b/linux-headers/linux/const.h @@ -33,7 +33,7 @@ * Missing __asm__ support * * __BIT128() would not work in the __asm__ code, as it shifts an - * 'unsigned __init128' data type as direct representation of + * 'unsigned __int128' data type as direct representation of * 128 bit constants is not supported in the gcc compiler, as * they get silently truncated. * diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index ccbdca5e11..cb0f7d6b4d 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -55,6 +55,7 @@ enum { IOMMUFD_CMD_VIOMMU_ALLOC = 0x90, IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, + IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, }; /** @@ -392,6 +393,9 @@ struct iommu_vfio_ioas { * Any domain attached to the non-PASID part of the * device must also be flagged, otherwise attaching a * PASID will blocked. 
+ * For the user that wants to attach PASID, ioas is + * not recommended for both the non-PASID part + * and PASID part of the device. * If IOMMU does not support PASID it will return * error (-EOPNOTSUPP). */ @@ -608,9 +612,17 @@ enum iommu_hw_info_type { * IOMMU_HWPT_GET_DIRTY_BITMAP * IOMMU_HWPT_SET_DIRTY_TRACKING * + * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it + * when the struct + * iommu_hw_info::out_max_pasid_log2 is zero. */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, + IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, + IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; /** @@ -626,6 +638,9 @@ enum iommufd_hw_capabilities { * iommu_hw_info_type. * @out_capabilities: Output the generic iommu capability info type as defined * in the enum iommu_hw_capabilities. + * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support. + * PCI devices turn to out_capabilities to check if the + * specific capabilities is supported or not. 
* @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -649,7 +664,8 @@ struct iommu_hw_info { __u32 data_len; __aligned_u64 data_uptr; __u32 out_data_type; - __u32 __reserved; + __u8 out_max_pasid_log2; + __u8 __reserved[3]; __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) @@ -1014,4 +1030,115 @@ struct iommu_ioas_change_process { #define IOMMU_IOAS_CHANGE_PROCESS \ _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS) +/** + * enum iommu_veventq_flag - flag for struct iommufd_vevent_header + * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs + */ +enum iommu_veventq_flag { + IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0), +}; + +/** + * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status + * @flags: Combination of enum iommu_veventq_flag + * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of + * [0, INT_MAX] where the following index of INT_MAX is 0 + * + * Each iommufd_vevent_header reports a sequence index of the following vEVENT: + * + * +----------------------+-------+----------------------+-------+---+-------+ + * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN | + * +----------------------+-------+----------------------+-------+---+-------+ + * + * And this sequence index is expected to be monotonic to the sequence index of + * the previous vEVENT. If two adjacent sequence indexes has a delta larger than + * 1, it means that delta - 1 number of vEVENTs has lost, e.g. two lost vEVENTs: + * + * +-----+----------------------+-------+----------------------+-------+-----+ + * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... 
| + * +-----+----------------------+-------+----------------------+-------+-----+ + * + * If a vEVENT lost at the tail of the vEVENTQ and there is no following vEVENT + * providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS header + * would be added to the tail, and no data would follow this header: + * + * +--+----------------------+-------+-----------------------------------------+ + * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} | + * +--+----------------------+-------+-----------------------------------------+ + */ +struct iommufd_vevent_header { + __u32 flags; + __u32 sequence; +}; + +/** + * enum iommu_veventq_type - Virtual Event Queue Type + * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue + */ +enum iommu_veventq_type { + IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; +}; + +/** + * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) + * @size: sizeof(struct iommu_veventq_alloc) + * @flags: Must be 0 + * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with + * @type: Type of the vEVENTQ. 
Must be defined in enum iommu_veventq_type + * @veventq_depth: Maximum number of events in the vEVENTQ + * @out_veventq_id: The ID of the new vEVENTQ + * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the + * successfully returned fd after using it + * @__reserved: Must be 0 + * + * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU + * can have multiple FDs for different types, but is confined to one per @type. + * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ, + * if there are vEVENTs available. A vEVENTQ will lose events due to overflow, + * if the number of the vEVENTs hits @veventq_depth. + * + * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by + * a type-specific data structure, in a normal case: + * + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * | | header0 | data0 | header1 | data1 | ... | headerN | dataN | | + * +-+---------+-------+---------+-------+-----+---------+-------+-+ + * + * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to + * struct iommufd_vevent_header). 
+ */ +struct iommu_veventq_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 veventq_depth; + __u32 out_veventq_id; + __u32 out_veventq_fd; + __u32 __reserved; +}; +#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 27181b3dd8..e5f3e8b5a0 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -921,6 +921,7 @@ struct kvm_enable_cap { #define KVM_CAP_PRE_FAULT_MEMORY 236 #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 #define KVM_CAP_X86_GUEST_MODE 238 +#define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/linux-headers/linux/psp-sev.h b/linux-headers/linux/psp-sev.h index 17bf191573..113c4ceb78 100644 --- a/linux-headers/linux/psp-sev.h +++ b/linux-headers/linux/psp-sev.h @@ -73,13 +73,20 @@ typedef enum { SEV_RET_INVALID_PARAM, SEV_RET_RESOURCE_LIMIT, SEV_RET_SECURE_DATA_INVALID, - SEV_RET_INVALID_KEY = 0x27, - SEV_RET_INVALID_PAGE_SIZE, - SEV_RET_INVALID_PAGE_STATE, - SEV_RET_INVALID_MDATA_ENTRY, - SEV_RET_INVALID_PAGE_OWNER, - SEV_RET_INVALID_PAGE_AEAD_OFLOW, - SEV_RET_RMP_INIT_REQUIRED, + SEV_RET_INVALID_PAGE_SIZE = 0x0019, + SEV_RET_INVALID_PAGE_STATE = 0x001A, + SEV_RET_INVALID_MDATA_ENTRY = 0x001B, + SEV_RET_INVALID_PAGE_OWNER = 0x001C, + SEV_RET_AEAD_OFLOW = 0x001D, + SEV_RET_EXIT_RING_BUFFER = 0x001F, + SEV_RET_RMP_INIT_REQUIRED = 0x0020, + SEV_RET_BAD_SVN = 0x0021, + SEV_RET_BAD_VERSION = 0x0022, + SEV_RET_SHUTDOWN_REQUIRED = 0x0023, + SEV_RET_UPDATE_FAILED = 0x0024, + SEV_RET_RESTORE_REQUIRED = 0x0025, + SEV_RET_RMP_INITIALIZATION_FAILED = 0x0026, + SEV_RET_INVALID_KEY = 0x0027, SEV_RET_MAX, } sev_ret_code; diff --git a/linux-headers/linux/stddef.h b/linux-headers/linux/stddef.h index e1416f7937..e1fcfcf3b3 100644 --- a/linux-headers/linux/stddef.h +++ b/linux-headers/linux/stddef.h @@ -70,4 +70,6 @@ #define __counted_by_be(m) #endif +#define __kernel_nonstring + 
#endif /* _LINUX_STDDEF_H */ diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 1b5e254d6a..79bf8c0cc5 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -671,6 +671,7 @@ enum { */ enum { VFIO_AP_REQ_IRQ_INDEX, + VFIO_AP_CFG_CHG_IRQ_INDEX, VFIO_AP_NUM_IRQS }; @@ -931,29 +932,34 @@ struct vfio_device_bind_iommufd { * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, * struct vfio_device_attach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for attach. * @pt_id: Input the target id which can represent an ioas or a hwpt * allocated via iommufd subsystem. * Output the input ioas id or the attached hwpt id which could * be the specified hwpt itself or a hwpt automatically created * for the specified ioas by kernel during the attachment. + * @pasid: The pasid to be attached, only meaningful when + * VFIO_DEVICE_ATTACH_PASID is set in @flags * * Associate the device with an address space within the bound iommufd. * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * - * If a vfio device is currently attached to a valid hw_pagetable, without doing - * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl - * passing in another hw_pagetable (hwpt) id is allowed. This action, also known - * as a hw_pagetable replacement, will replace the device's currently attached - * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * If a vfio device or a pasid of this device is currently attached to a valid + * hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second + * VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed. + * This action, also known as a hw_pagetable replacement, will replace the + * currently attached hwpt of the device or the pasid of this device with a new + * hwpt corresponding to the given pt_id. 
* * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_ATTACH_PASID (1 << 0) __u32 pt_id; + __u32 pasid; }; #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) @@ -962,17 +968,21 @@ struct vfio_device_attach_iommufd_pt { * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, * struct vfio_device_detach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for detach. + * @pasid: The pasid to be detached, only meaningful when + * VFIO_DEVICE_DETACH_PASID is set in @flags * - * Remove the association of the device and its current associated address - * space. After it, the device should be in a blocking DMA state. This is only - * allowed on cdev fds. + * Remove the association of the device or a pasid of the device and its current + * associated address space. After it, the device or the pasid should be in a + * blocking DMA state. This is only allowed on cdev fds. * * Return: 0 on success, -errno on failure. */ struct vfio_device_detach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_DETACH_PASID (1 << 0) + __u32 pasid; }; #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) From a901682f53b51c07dc27aab7e30256855a2a1f2f Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:06 +0100 Subject: [PATCH 15/28] vfio: add vfio_device_prepare() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commonize some initialization code shared by the legacy and iommufd vfio implementations. 
Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-2-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 14 ++------------ hw/vfio/device.c | 14 ++++++++++++++ hw/vfio/iommufd.c | 9 +-------- include/hw/vfio/vfio-device.h | 3 +++ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index a761f0958b..d30c1a141d 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -826,18 +826,14 @@ static bool vfio_device_get(VFIOGroup *group, const char *name, } } + vfio_device_prepare(vbasedev, &group->container->bcontainer, info); + vbasedev->fd = fd; vbasedev->group = group; QLIST_INSERT_HEAD(&group->device_list, vbasedev, next); - vbasedev->num_irqs = info->num_irqs; - vbasedev->num_regions = info->num_regions; - vbasedev->flags = info->flags; - trace_vfio_device_get(name, info->flags, info->num_regions, info->num_irqs); - vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); - return true; } @@ -890,7 +886,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, int groupid = vfio_device_get_groupid(vbasedev, errp); VFIODevice *vbasedev_iter; VFIOGroup *group; - VFIOContainerBase *bcontainer; if (groupid < 0) { return false; @@ -919,11 +914,6 @@ static bool vfio_legacy_attach_device(const char *name, VFIODevice *vbasedev, goto device_put_exit; } - bcontainer = &group->container->bcontainer; - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); - return true; device_put_exit: diff --git a/hw/vfio/device.c b/hw/vfio/device.c index d625a7c4db..f3b9902d21 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -398,3 +398,17 @@ void vfio_device_detach(VFIODevice *vbasedev) } VFIO_IOMMU_GET_CLASS(vbasedev->bcontainer)->detach_device(vbasedev); } + +void 
vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, + struct vfio_device_info *info) +{ + vbasedev->num_irqs = info->num_irqs; + vbasedev->num_regions = info->num_regions; + vbasedev->flags = info->flags; + vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET); + + vbasedev->bcontainer = bcontainer; + QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); + + QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 232c06dd15..83033c352a 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -588,14 +588,7 @@ found_container: iommufd_cdev_ram_block_discard_disable(false); } - vbasedev->group = 0; - vbasedev->num_irqs = dev_info.num_irqs; - vbasedev->num_regions = dev_info.num_regions; - vbasedev->flags = dev_info.flags; - vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET); - vbasedev->bcontainer = bcontainer; - QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); - QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + vfio_device_prepare(vbasedev, bcontainer, &dev_info); trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs, vbasedev->num_regions, vbasedev->flags); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 81c95bb51e..081929ca4b 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -134,6 +134,9 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIODeviceList vfio_device_list; #ifdef CONFIG_LINUX +void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, + struct vfio_device_info *info); + int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, From d60fb709cf948b3dc508eb88162f5666a49762ae Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:07 +0100 
Subject: [PATCH 16/28] vfio: add vfio_device_unprepare() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper that's the inverse of vfio_device_prepare(). Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-3-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 6 +++--- hw/vfio/device.c | 7 +++++++ hw/vfio/iommufd.c | 4 +--- include/hw/vfio/vfio-device.h | 2 ++ 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index d30c1a141d..cf23aa799f 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -927,10 +927,10 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev) { VFIOGroup *group = vbasedev->group; - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; trace_vfio_device_detach(vbasedev->name, group->groupid); + + vfio_device_unprepare(vbasedev); + object_unref(vbasedev->hiod); vfio_device_put(vbasedev); vfio_group_put(group); diff --git a/hw/vfio/device.c b/hw/vfio/device.c index f3b9902d21..31c441a3df 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -412,3 +412,10 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); } + +void vfio_device_unprepare(VFIODevice *vbasedev) +{ + QLIST_REMOVE(vbasedev, container_next); + QLIST_REMOVE(vbasedev, global_next); + vbasedev->bcontainer = NULL; +} diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 83033c352a..62ecb758f1 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -615,9 +615,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev) VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); - QLIST_REMOVE(vbasedev, global_next); - QLIST_REMOVE(vbasedev, container_next); - vbasedev->bcontainer = NULL; + 
vfio_device_unprepare(vbasedev); if (!vbasedev->ram_block_discard_allowed) { iommufd_cdev_ram_block_discard_disable(false); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 081929ca4b..342c4ba3bf 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -137,6 +137,8 @@ extern VFIODeviceList vfio_device_list; void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, struct vfio_device_info *info); +void vfio_device_unprepare(VFIODevice *vbasedev); + int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info); int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, From ef73671f0bfc7bd852439e42ab4260104d902055 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:08 +0100 Subject: [PATCH 17/28] vfio: add vfio_attach_device_by_iommu_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow attachment by explicitly passing a TYPE_VFIO_IOMMU_* string; vfio-user will use this later. 
Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-4-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 22 +++++++++++++++------- include/hw/vfio/vfio-device.h | 3 +++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 31c441a3df..9673b0717e 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -376,21 +376,29 @@ VFIODevice *vfio_get_vfio_device(Object *obj) } } -bool vfio_device_attach(char *name, VFIODevice *vbasedev, - AddressSpace *as, Error **errp) +bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name, + VFIODevice *vbasedev, AddressSpace *as, + Error **errp) { const VFIOIOMMUClass *ops = - VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_LEGACY)); - - if (vbasedev->iommufd) { - ops = VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD)); - } + VFIO_IOMMU_CLASS(object_class_by_name(iommu_type)); assert(ops); return ops->attach_device(name, vbasedev, as, errp); } +bool vfio_device_attach(char *name, VFIODevice *vbasedev, + AddressSpace *as, Error **errp) +{ + const char *iommu_type = vbasedev->iommufd ? 
+ TYPE_VFIO_IOMMU_IOMMUFD : + TYPE_VFIO_IOMMU_LEGACY; + + return vfio_device_attach_by_iommu_type(iommu_type, name, vbasedev, + as, errp); +} + void vfio_device_detach(VFIODevice *vbasedev) { if (!vbasedev->bcontainer) { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 342c4ba3bf..8b1437ba66 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -127,6 +127,9 @@ bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev, const char *typename, Error **errp); bool vfio_device_attach(char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); +bool vfio_device_attach_by_iommu_type(const char *iommu_type, char *name, + VFIODevice *vbasedev, AddressSpace *as, + Error **errp); void vfio_device_detach(VFIODevice *vbasedev); VFIODevice *vfio_get_vfio_device(Object *obj); From 5321e623ebe27deef7cfb793a71d23affa77d157 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:09 +0100 Subject: [PATCH 18/28] vfio: add vfio_device_get_irq_info() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper similar to vfio_device_get_region_info() and use it everywhere. Replace a couple of needless allocations with stack variables. As a side-effect, this fixes a minor error reporting issue in the call from vfio_msix_early_setup(). 
Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-5-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/ap.c | 19 ++++++++++--------- hw/vfio/ccw.c | 20 +++++++++++--------- hw/vfio/device.c | 15 +++++++++++++++ hw/vfio/pci.c | 23 +++++++++++------------ hw/vfio/platform.c | 6 +++--- include/hw/vfio/vfio-device.h | 3 +++ 6 files changed, 53 insertions(+), 33 deletions(-) diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c index 1207c08d8d..785c0a0197 100644 --- a/hw/vfio/ap.c +++ b/hw/vfio/ap.c @@ -74,10 +74,10 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, unsigned int irq, Error **errp) { int fd; - size_t argsz; + int ret; IOHandler *fd_read; EventNotifier *notifier; - g_autofree struct vfio_irq_info *irq_info = NULL; + struct vfio_irq_info irq_info; VFIODevice *vdev = &vapdev->vdev; switch (irq) { @@ -96,14 +96,15 @@ static bool vfio_ap_register_irq_notifier(VFIOAPDevice *vapdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - irq_info->argsz = argsz; + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index fde0c3fbef..ab3fabf991 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -376,8 +376,8 @@ static bool vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, Error **errp) { VFIODevice *vdev = &vcdev->vdev; - g_autofree struct vfio_irq_info *irq_info = NULL; - size_t argsz; + struct vfio_irq_info irq_info; + int ret; int fd; EventNotifier *notifier; IOHandler *fd_read; @@ -406,13 +406,15 @@ static bool 
vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, return false; } - argsz = sizeof(*irq_info); - irq_info = g_malloc0(argsz); - irq_info->index = irq; - irq_info->argsz = argsz; - if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, - irq_info) < 0 || irq_info->count < 1) { - error_setg_errno(errp, errno, "vfio: Error getting irq info"); + ret = vfio_device_get_irq_info(vdev, irq, &irq_info); + + if (ret < 0) { + error_setg_errno(errp, -ret, "vfio: Error getting irq info"); + return false; + } + + if (irq_info.count < 1) { + error_setg(errp, "vfio: Error getting irq info, count=0"); return false; } diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 9673b0717e..5d837092cb 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -185,6 +185,21 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex return false; } +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info *info) +{ + int ret; + + memset(info, 0, sizeof(*info)); + + info->argsz = sizeof(*info); + info->index = index; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? 
-errno : ret; +} + int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index e1fab21b47..5ccfc67aef 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1555,8 +1555,7 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) uint16_t ctrl; uint32_t table, pba; int ret, fd = vdev->vbasedev.fd; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_MSIX_IRQ_INDEX }; + struct vfio_irq_info irq_info; VFIOMSIXInfo *msix; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); @@ -1593,7 +1592,8 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX, + &irq_info); if (ret < 0) { error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); g_free(msix); @@ -2736,7 +2736,7 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; g_autofree struct vfio_region_info *reg_info = NULL; - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + struct vfio_irq_info irq_info; int i, ret = -1; /* Sanity check device */ @@ -2797,12 +2797,10 @@ static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } - irq_info.index = VFIO_PCI_ERR_IRQ_INDEX; - - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + ret = vfio_device_get_irq_info(vbasedev, VFIO_PCI_ERR_IRQ_INDEX, &irq_info); if (ret) { /* This can fail for an old kernel or legacy PCI dev */ - trace_vfio_populate_device_get_irq_info_failure(strerror(errno)); + trace_vfio_populate_device_get_irq_info_failure(strerror(-ret)); } else if (irq_info.count == 1) { vdev->pci_aer = true; } else { @@ -2911,17 +2909,18 @@ static void 
vfio_req_notifier_handler(void *opaque) static void vfio_register_req_notifier(VFIOPCIDevice *vdev) { - struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), - .index = VFIO_PCI_REQ_IRQ_INDEX }; + struct vfio_irq_info irq_info; Error *err = NULL; int32_t fd; + int ret; if (!(vdev->features & VFIO_FEATURE_ENABLE_REQ)) { return; } - if (ioctl(vdev->vbasedev.fd, - VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0 || irq_info.count < 1) { + ret = vfio_device_get_irq_info(&vdev->vbasedev, VFIO_PCI_REQ_IRQ_INDEX, + &irq_info); + if (ret < 0 || irq_info.count < 1) { return; } diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c index ffb3681607..9a21f2e50a 100644 --- a/hw/vfio/platform.c +++ b/hw/vfio/platform.c @@ -474,10 +474,10 @@ static bool vfio_populate_device(VFIODevice *vbasedev, Error **errp) QSIMPLEQ_INIT(&vdev->pending_intp_queue); for (i = 0; i < vbasedev->num_irqs; i++) { - struct vfio_irq_info irq = { .argsz = sizeof(irq) }; + struct vfio_irq_info irq; + + ret = vfio_device_get_irq_info(vbasedev, i, &irq); - irq.index = i; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq); if (ret) { error_setg_errno(errp, -ret, "failed to get device irq info"); goto irq_err; diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 8b1437ba66..a7eaaa31e7 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -147,6 +147,9 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, uint32_t subtype, struct vfio_region_info **info); bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type); + +int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, + struct vfio_irq_info *info); #endif /* Returns 0 on success, or a negative errno. 
*/ From 2e27becf17be231bc15588a51c0b61efec68d021 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:10 +0100 Subject: [PATCH 19/28] vfio: consistently handle return value for helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various bits of code that call vfio device APIs should consistently use the "return -errno" approach for passing errors back, rather than presuming errno is (still) set correctly. Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-6-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 5ccfc67aef..866cf58d04 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -398,7 +398,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - return ret; + return ret < 0 ? -errno : ret; } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -459,7 +459,7 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) g_free(irq_set); - return ret; + return ret < 0 ? 
-errno : ret; } static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, @@ -581,7 +581,8 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, vfio_device_irq_disable(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { Error *err = NULL; @@ -695,7 +696,8 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) if (vdev->nr_vectors) { ret = vfio_enable_vectors(vdev, true); if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); + error_report("vfio: failed to enable vectors, %s", + strerror(-ret)); } } else { /* @@ -712,7 +714,8 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) */ ret = vfio_enable_msix_no_vec(vdev); if (ret) { - error_report("vfio: failed to enable MSI-X, %d", ret); + error_report("vfio: failed to enable MSI-X, %s", + strerror(-ret)); } } @@ -765,7 +768,8 @@ retry: ret = vfio_enable_vectors(vdev, false); if (ret) { if (ret < 0) { - error_report("vfio: Error: Failed to setup MSI fds: %m"); + error_report("vfio: Error: Failed to setup MSI fds: %s", + strerror(-ret)); } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); @@ -882,17 +886,21 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { g_autofree struct vfio_region_info *reg_info = NULL; + VFIODevice *vbasedev = &vdev->vbasedev; uint64_t size; off_t off = 0; ssize_t bytes; + int ret; - if (vfio_device_get_region_info(&vdev->vbasedev, - VFIO_PCI_ROM_REGION_INDEX, &reg_info)) { - error_report("vfio: Error getting ROM info: %m"); + ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_ROM_REGION_INDEX, + &reg_info); + + if (ret != 0) { + error_report("vfio: Error getting ROM info: %s", strerror(-ret)); return; } - trace_vfio_pci_load_rom(vdev->vbasedev.name, 
(unsigned long)reg_info->size, + trace_vfio_pci_load_rom(vbasedev->name, (unsigned long)reg_info->size, (unsigned long)reg_info->offset, (unsigned long)reg_info->flags); @@ -901,8 +909,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) if (!vdev->rom_size) { vdev->rom_read_failed = true; - error_report("vfio-pci: Cannot read device rom at " - "%s", vdev->vbasedev.name); + error_report("vfio-pci: Cannot read device rom at %s", vbasedev->name); error_printf("Device option ROM contents are probably invalid " "(check dmesg).\nSkip option ROM probe with rombar=0, " "or load from file with romfile=\n"); @@ -913,7 +920,7 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vdev->vbasedev.fd, vdev->rom + off, + bytes = pread(vbasedev->fd, vdev->rom + off, size, vdev->rom_offset + off); if (bytes == 0) { break; From 5363a1a117ea6e1af520d1faed303f944975f760 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:11 +0100 Subject: [PATCH 20/28] vfio: add strread/writeerror() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add simple helpers to correctly report failures from read/write routines using the return -errno style. Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-7-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- include/hw/vfio/vfio-device.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index a7eaaa31e7..4a32202943 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -115,6 +115,20 @@ struct VFIODeviceOps { int (*vfio_load_config)(VFIODevice *vdev, QEMUFile *f); }; +/* + * Given a return value of either a short number of bytes read or -errno, + * construct a meaningful error message. + */ +#define strreaderror(ret) \ + (ret < 0 ? 
strerror(-ret) : "short read") + +/* + * Given a return value of either a short number of bytes written or -errno, + * construct a meaningful error message. + */ +#define strwriteerror(ret) \ + (ret < 0 ? strerror(-ret) : "short write") + void vfio_device_irq_disable(VFIODevice *vbasedev, int index); void vfio_device_irq_unmask(VFIODevice *vbasedev, int index); void vfio_device_irq_mask(VFIODevice *vbasedev, int index); From cae04b56347be59718f1a778d0ad588a205bf409 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:12 +0100 Subject: [PATCH 21/28] vfio: add vfio_pci_config_space_read/write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add these helpers that access config space and return an -errno style return. Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-8-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/pci.c | 123 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 80 insertions(+), 43 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 866cf58d04..f65c9463ce 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -967,6 +967,28 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) } } +/* "Raw" read of underlying config space. */ +static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + ssize_t ret; + + ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); + + return ret < 0 ? -errno : (int)ret; +} + +/* "Raw" write of underlying config space. */ +static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, + uint32_t size, void *data) +{ + ssize_t ret; + + ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); + + return ret < 0 ? 
-errno : (int)ret; +} + static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) { VFIOPCIDevice *vdev = opaque; @@ -1019,10 +1041,9 @@ static const MemoryRegionOps vfio_rom_ops = { static void vfio_pci_size_rom(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t orig, size = cpu_to_le32((uint32_t)PCI_ROM_ADDRESS_MASK); - off_t offset = vdev->config_offset + PCI_ROM_ADDRESS; char *name; - int fd = vdev->vbasedev.fd; if (vdev->pdev.romfile || !vdev->pdev.rom_bar) { /* Since pci handles romfile, just print a message and return */ @@ -1039,11 +1060,12 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) * Use the same size ROM BAR as the physical device. The contents * will get filled in later when the guest tries to read it. */ - if (pread(fd, &orig, 4, offset) != 4 || - pwrite(fd, &size, 4, offset) != 4 || - pread(fd, &size, 4, offset) != 4 || - pwrite(fd, &orig, 4, offset) != 4) { - error_report("%s(%s) failed: %m", __func__, vdev->vbasedev.name); + if (vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_read(vdev, PCI_ROM_ADDRESS, 4, &size) != 4 || + vfio_pci_config_space_write(vdev, PCI_ROM_ADDRESS, 4, &orig) != 4) { + + error_report("%s(%s) ROM access failed", __func__, vbasedev->name); return; } @@ -1223,6 +1245,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; memcpy(&emu_bits, vdev->emulated_config_bits + addr, len); @@ -1235,12 +1258,12 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) if (~emu_bits & (0xffffffffU >> (32 - len * 8))) { ssize_t ret; - ret = pread(vdev->vbasedev.fd, &phys_val, len, - vdev->config_offset + addr); + ret = 
vfio_pci_config_space_read(vdev, addr, len, &phys_val); if (ret != len) { - error_report("%s(%s, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, len); - return -errno; + error_report("%s(%s, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, len, + strreaderror(ret)); + return -1; } phys_val = le32_to_cpu(phys_val); } @@ -1256,15 +1279,18 @@ void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); + int ret; trace_vfio_pci_write_config(vdev->vbasedev.name, addr, val, len); /* Write everything to VFIO, let it filter out what we can't write */ - if (pwrite(vdev->vbasedev.fd, &val_le, len, vdev->config_offset + addr) - != len) { - error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %m", - __func__, vdev->vbasedev.name, addr, val, len); + ret = vfio_pci_config_space_write(vdev, addr, len, &val_le); + if (ret != len) { + error_report("%s(%s, 0x%x, 0x%x, 0x%x) failed: %s", + __func__, vbasedev->name, addr, val, len, + strwriteerror(ret)); } /* MSI/MSI-X Enabling/Disabling */ @@ -1352,9 +1378,11 @@ static bool vfio_msi_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret, entries; Error *err = NULL; - if (pread(vdev->vbasedev.fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed reading MSI PCI_CAP_FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_CAP_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed reading MSI PCI_CAP_FLAGS: %s", + strreaderror(ret)); return false; } ctrl = le16_to_cpu(ctrl); @@ -1561,30 +1589,35 @@ static bool vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int ret, fd = vdev->vbasedev.fd; struct vfio_irq_info irq_info; VFIOMSIXInfo *msix; + int ret; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); if 
(!pos) { return true; } - if (pread(fd, &ctrl, sizeof(ctrl), - vdev->config_offset + pos + PCI_MSIX_FLAGS) != sizeof(ctrl)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX FLAGS"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_FLAGS, + sizeof(ctrl), &ctrl); + if (ret != sizeof(ctrl)) { + error_setg(errp, "failed to read PCI MSIX FLAGS: %s", + strreaderror(ret)); return false; } - if (pread(fd, &table, sizeof(table), - vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX TABLE"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_TABLE, + sizeof(table), &table); + if (ret != sizeof(table)) { + error_setg(errp, "failed to read PCI MSIX TABLE: %s", + strreaderror(ret)); return false; } - if (pread(fd, &pba, sizeof(pba), - vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) { - error_setg_errno(errp, errno, "failed to read PCI MSIX PBA"); + ret = vfio_pci_config_space_read(vdev, pos + PCI_MSIX_PBA, + sizeof(pba), &pba); + if (ret != sizeof(pba)) { + error_setg(errp, "failed to read PCI MSIX PBA: %s", strreaderror(ret)); return false; } @@ -1744,10 +1777,10 @@ static void vfio_bar_prepare(VFIOPCIDevice *vdev, int nr) } /* Determine what type of BAR this is for registration */ - ret = pread(vdev->vbasedev.fd, &pci_bar, sizeof(pci_bar), - vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr)); + ret = vfio_pci_config_space_read(vdev, PCI_BASE_ADDRESS_0 + (4 * nr), + sizeof(pci_bar), &pci_bar); if (ret != sizeof(pci_bar)) { - error_report("vfio: Failed to read BAR %d (%m)", nr); + error_report("vfio: Failed to read BAR %d: %s", nr, strreaderror(ret)); return; } @@ -2450,21 +2483,23 @@ void vfio_pci_pre_reset(VFIOPCIDevice *vdev) void vfio_pci_post_reset(VFIOPCIDevice *vdev) { + VFIODevice *vbasedev = &vdev->vbasedev; Error *err = NULL; - int nr; + int ret, nr; if (!vfio_intx_enable(vdev, &err)) { error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); } for (nr = 0; nr < 
PCI_NUM_REGIONS - 1; ++nr) { - off_t addr = vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr); + off_t addr = PCI_BASE_ADDRESS_0 + (4 * nr); uint32_t val = 0; uint32_t len = sizeof(val); - if (pwrite(vdev->vbasedev.fd, &val, len, addr) != len) { - error_report("%s(%s) reset bar %d failed: %m", __func__, - vdev->vbasedev.name, nr); + ret = vfio_pci_config_space_write(vdev, addr, len, &val); + if (ret != len) { + error_report("%s(%s) reset bar %d failed: %s", __func__, + vbasedev->name, nr, strwriteerror(ret)); } } @@ -3101,6 +3136,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) int i, ret; char uuid[UUID_STR_LEN]; g_autofree char *name = NULL; + uint32_t config_space_size; if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || @@ -3155,13 +3191,14 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } + config_space_size = MIN(pci_config_size(&vdev->pdev), vdev->config_size); + /* Get a copy of config space */ - ret = pread(vbasedev->fd, vdev->pdev.config, - MIN(pci_config_size(&vdev->pdev), vdev->config_size), - vdev->config_offset); - if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { - ret = ret < 0 ? -errno : -EFAULT; - error_setg_errno(errp, -ret, "failed to read device config space"); + ret = vfio_pci_config_space_read(vdev, 0, config_space_size, + vdev->pdev.config); + if (ret < (int)config_space_size) { + ret = ret < 0 ? -ret : EFAULT; + error_setg_errno(errp, ret, "failed to read device config space"); goto error; } From 5a22b505914bcb1d19a533eb5a36c907169b5ee3 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:13 +0100 Subject: [PATCH 22/28] vfio: add unmap_all flag to DMA unmap callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We'll use this parameter shortly; this just adds the plumbing. 
Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-9-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/container-base.c | 4 ++-- hw/vfio/container.c | 8 ++++++-- hw/vfio/iommufd.c | 6 +++++- hw/vfio/listener.c | 8 ++++---- include/hw/vfio/vfio-container-base.h | 15 +++++++++++++-- 5 files changed, 30 insertions(+), 11 deletions(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 09340fd97a..3ff473a45c 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -85,12 +85,12 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, bool unmap_all) { VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer); g_assert(vioc->dma_unmap); - return vioc->dma_unmap(bcontainer, iova, size, iotlb); + return vioc->dma_unmap(bcontainer, iova, size, iotlb, unmap_all); } bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, diff --git a/hw/vfio/container.c b/hw/vfio/container.c index cf23aa799f..d5f4e66f1c 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -124,7 +124,7 @@ unmap_exit: */ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, bool unmap_all) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -138,6 +138,10 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, int ret; Error *local_err = NULL; + if (unmap_all) { + return -ENOTSUP; + } + if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { @@ -205,7 +209,7 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, */ if 
(ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0 || (errno == EBUSY && - vfio_legacy_dma_unmap(bcontainer, iova, size, NULL) == 0 && + vfio_legacy_dma_unmap(bcontainer, iova, size, NULL, false) == 0 && ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map) == 0)) { return 0; } diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 62ecb758f1..6b2764c044 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -46,11 +46,15 @@ static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova, static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb) + IOMMUTLBEntry *iotlb, bool unmap_all) { const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + if (unmap_all) { + return -ENOTSUP; + } + /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ return iommufd_backend_unmap_dma(container->be, container->ioas_id, iova, size); diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index 6f77e18a7a..c5183700db 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -172,7 +172,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) } } else { ret = vfio_container_dma_unmap(bcontainer, iova, - iotlb->addr_mask + 1, iotlb); + iotlb->addr_mask + 1, iotlb, false); if (ret) { error_setg(&local_err, "vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " @@ -201,7 +201,7 @@ static void vfio_ram_discard_notify_discard(RamDiscardListener *rdl, int ret; /* Unmap with a single call. */ - ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL); + ret = vfio_container_dma_unmap(bcontainer, iova, size , NULL, false); if (ret) { error_report("%s: vfio_container_dma_unmap() failed: %s", __func__, strerror(-ret)); @@ -638,7 +638,7 @@ static void vfio_listener_region_del(MemoryListener *listener, /* The unmap ioctl doesn't accept a full 64-bit span. 
*/ llsize = int128_rshift(llsize, 1); ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); + int128_get64(llsize), NULL, false); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", @@ -648,7 +648,7 @@ static void vfio_listener_region_del(MemoryListener *listener, iova += int128_get64(llsize); } ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL); + int128_get64(llsize), NULL, false); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 5527e02722..59f07d26e8 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -81,7 +81,7 @@ int vfio_container_dma_map(VFIOContainerBase *bcontainer, void *vaddr, bool readonly); int vfio_container_dma_unmap(VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb); + IOMMUTLBEntry *iotlb, bool unmap_all); bool vfio_container_add_section_window(VFIOContainerBase *bcontainer, MemoryRegionSection *section, Error **errp); @@ -120,9 +120,20 @@ struct VFIOIOMMUClass { int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly); + /** + * @dma_unmap + * + * Unmap an address range from the container. 
+ * + * @bcontainer: #VFIOContainerBase to use for unmap + * @iova: start address to unmap + * @size: size of the range to unmap + * @iotlb: The IOMMU TLB mapping entry (or NULL) + * @unmap_all: if set, unmap the entire address space + */ int (*dma_unmap)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb); + IOMMUTLBEntry *iotlb, bool unmap_all); bool (*attach_device)(const char *name, VFIODevice *vbasedev, AddressSpace *as, Error **errp); void (*detach_device)(VFIODevice *vbasedev); From 9458d9b4dc6a9d1c51772fc8a29a48ab47521430 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:14 +0100 Subject: [PATCH 23/28] vfio: implement unmap all for DMA unmap callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle unmap_all in the DMA unmap handlers rather than in the caller. Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-10-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/container.c | 41 +++++++++++++++++++++++++++++++---------- hw/vfio/iommufd.c | 15 ++++++++++++++- hw/vfio/listener.c | 19 ++++++------------- 3 files changed, 51 insertions(+), 24 deletions(-) diff --git a/hw/vfio/container.c b/hw/vfio/container.c index d5f4e66f1c..a9f0dbaec4 100644 --- a/hw/vfio/container.c +++ b/hw/vfio/container.c @@ -119,12 +119,9 @@ unmap_exit: return ret; } -/* - * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 - */ -static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, - hwaddr iova, ram_addr_t size, - IOMMUTLBEntry *iotlb, bool unmap_all) +static int vfio_legacy_dma_unmap_one(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb) { const VFIOContainer *container = container_of(bcontainer, VFIOContainer, bcontainer); @@ -138,10 +135,6 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, 
int ret; Error *local_err = NULL; - if (unmap_all) { - return -ENOTSUP; - } - if (iotlb && vfio_container_dirty_tracking_is_started(bcontainer)) { if (!vfio_container_devices_dirty_tracking_is_supported(bcontainer) && bcontainer->dirty_pages_supported) { @@ -185,6 +178,34 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, return 0; } +/* + * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86 + */ +static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer, + hwaddr iova, ram_addr_t size, + IOMMUTLBEntry *iotlb, bool unmap_all) +{ + int ret; + + if (unmap_all) { + /* The unmap ioctl doesn't accept a full 64-bit span. */ + Int128 llsize = int128_rshift(int128_2_64(), 1); + + ret = vfio_legacy_dma_unmap_one(bcontainer, 0, int128_get64(llsize), + iotlb); + + if (ret == 0) { + ret = vfio_legacy_dma_unmap_one(bcontainer, int128_get64(llsize), + int128_get64(llsize), iotlb); + } + + } else { + ret = vfio_legacy_dma_unmap_one(bcontainer, iova, size, iotlb); + } + + return ret; +} + static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly) { diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c index 6b2764c044..af1c7ab10a 100644 --- a/hw/vfio/iommufd.c +++ b/hw/vfio/iommufd.c @@ -51,8 +51,21 @@ static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer, const VFIOIOMMUFDContainer *container = container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer); + /* unmap in halves */ if (unmap_all) { - return -ENOTSUP; + Int128 llsize = int128_rshift(int128_2_64(), 1); + int ret; + + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + 0, int128_get64(llsize)); + + if (ret == 0) { + ret = iommufd_backend_unmap_dma(container->be, container->ioas_id, + int128_get64(llsize), + int128_get64(llsize)); + } + + return ret; } /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */ diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index 
c5183700db..e7ade7d62e 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -634,21 +634,14 @@ static void vfio_listener_region_del(MemoryListener *listener, } if (try_unmap) { + bool unmap_all = false; + if (int128_eq(llsize, int128_2_64())) { - /* The unmap ioctl doesn't accept a full 64-bit span. */ - llsize = int128_rshift(llsize, 1); - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL, false); - if (ret) { - error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " - "0x%"HWADDR_PRIx") = %d (%s)", - bcontainer, iova, int128_get64(llsize), ret, - strerror(-ret)); - } - iova += int128_get64(llsize); + unmap_all = true; + llsize = int128_zero(); } - ret = vfio_container_dma_unmap(bcontainer, iova, - int128_get64(llsize), NULL, false); + ret = vfio_container_dma_unmap(bcontainer, iova, int128_get64(llsize), + NULL, unmap_all); if (ret) { error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", " "0x%"HWADDR_PRIx") = %d (%s)", From 38bf025d0dc80c1ae6e8f36093c5145c08b332d9 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:15 +0100 Subject: [PATCH 24/28] vfio: add device IO ops vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For vfio-user, device operations such as IRQ handling and region read/writes are implemented in userspace over the control socket, not ioctl() to the vfio kernel driver; add an ops vector to generalize this, and implement vfio_device_io_ops_ioctl for interacting with the kernel vfio driver. 
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-11-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/container-base.c | 6 +-- hw/vfio/device.c | 74 +++++++++++++++++++++++++++++------ hw/vfio/listener.c | 13 +++--- hw/vfio/pci.c | 10 ++--- include/hw/vfio/vfio-device.h | 38 ++++++++++++++++++ 5 files changed, 114 insertions(+), 27 deletions(-) diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c index 3ff473a45c..1c6ca94b60 100644 --- a/hw/vfio/container-base.c +++ b/hw/vfio/container-base.c @@ -198,11 +198,7 @@ static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova, feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT; - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { - return -errno; - } - - return 0; + return vbasedev->io_ops->device_feature(vbasedev, feature); } static int vfio_container_iommu_query_dirty_bitmap(const VFIOContainerBase *bcontainer, diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 5d837092cb..40a196bfb9 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -82,7 +82,7 @@ void vfio_device_irq_disable(VFIODevice *vbasedev, int index) .count = 0, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) @@ -95,7 +95,7 @@ void vfio_device_irq_unmask(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } void vfio_device_irq_mask(VFIODevice *vbasedev, int index) @@ -108,7 +108,7 @@ void vfio_device_irq_mask(VFIODevice *vbasedev, int index) .count = 1, }; - ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + vbasedev->io_ops->set_irqs(vbasedev, &irq_set); } static inline const char 
*action_to_str(int action) @@ -167,7 +167,7 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex pfd = (int32_t *)&irq_set->data; *pfd = fd; - if (!ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irq_set)) { + if (!vbasedev->io_ops->set_irqs(vbasedev, irq_set)) { return true; } @@ -188,22 +188,19 @@ bool vfio_device_irq_set_signaling(VFIODevice *vbasedev, int index, int subindex int vfio_device_get_irq_info(VFIODevice *vbasedev, int index, struct vfio_irq_info *info) { - int ret; - memset(info, 0, sizeof(*info)); info->argsz = sizeof(*info); info->index = index; - ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); - - return ret < 0 ? -errno : ret; + return vbasedev->io_ops->get_irq_info(vbasedev, info); } int vfio_device_get_region_info(VFIODevice *vbasedev, int index, struct vfio_region_info **info) { size_t argsz = sizeof(struct vfio_region_info); + int ret; *info = g_malloc0(argsz); @@ -211,10 +208,11 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, retry: (*info)->argsz = argsz; - if (ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, *info)) { + ret = vbasedev->io_ops->get_region_info(vbasedev, *info); + if (ret != 0) { g_free(*info); *info = NULL; - return -errno; + return ret; } if ((*info)->argsz > argsz) { @@ -320,11 +318,14 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char *str, Error **errp) vbasedev->fd = fd; } +static VFIODeviceIOOps vfio_device_io_ops_ioctl; + void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops, DeviceState *dev, bool ram_discard) { vbasedev->type = type; vbasedev->ops = ops; + vbasedev->io_ops = &vfio_device_io_ops_ioctl; vbasedev->dev = dev; vbasedev->fd = -1; @@ -442,3 +443,54 @@ void vfio_device_unprepare(VFIODevice *vbasedev) QLIST_REMOVE(vbasedev, global_next); vbasedev->bcontainer = NULL; } + +/* + * Traditional ioctl() based io + */ + +static int vfio_device_io_device_feature(VFIODevice *vbasedev, + struct vfio_device_feature *feature) +{ + int 
ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_region_info(VFIODevice *vbasedev, + struct vfio_region_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_get_irq_info(VFIODevice *vbasedev, + struct vfio_irq_info *info) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, info); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_set_irqs(VFIODevice *vbasedev, + struct vfio_irq_set *irqs) +{ + int ret; + + ret = ioctl(vbasedev->fd, VFIO_DEVICE_SET_IRQS, irqs); + + return ret < 0 ? -errno : ret; +} + +static VFIODeviceIOOps vfio_device_io_ops_ioctl = { + .device_feature = vfio_device_io_device_feature, + .get_region_info = vfio_device_io_get_region_info, + .get_irq_info = vfio_device_io_get_irq_info, + .set_irqs = vfio_device_io_set_irqs, +}; diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index e7ade7d62e..2b93ca55b6 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -794,13 +794,17 @@ static void vfio_devices_dma_logging_stop(VFIOContainerBase *bcontainer) VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP; QLIST_FOREACH(vbasedev, &bcontainer->device_list, container_next) { + int ret; + if (!vbasedev->dirty_tracking) { continue; } - if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) { + ret = vbasedev->io_ops->device_feature(vbasedev, feature); + + if (ret != 0) { warn_report("%s: Failed to stop DMA logging, err %d (%s)", - vbasedev->name, -errno, strerror(errno)); + vbasedev->name, -ret, strerror(-ret)); } vbasedev->dirty_tracking = false; } @@ -901,10 +905,9 @@ static bool vfio_devices_dma_logging_start(VFIOContainerBase *bcontainer, continue; } - ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature); + ret = vbasedev->io_ops->device_feature(vbasedev, feature); if (ret) { - ret = -errno; - error_setg_errno(errp, errno, "%s: Failed 
to start DMA logging", + error_setg_errno(errp, -ret, "%s: Failed to start DMA logging", vbasedev->name); goto out; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index f65c9463ce..da2ffc9bf3 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -381,7 +381,7 @@ static void vfio_msi_interrupt(void *opaque) static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) { g_autofree struct vfio_irq_set *irq_set = NULL; - int ret = 0, argsz; + int argsz; int32_t *fd; argsz = sizeof(*irq_set) + sizeof(*fd); @@ -396,9 +396,7 @@ static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) fd = (int32_t *)&irq_set->data; *fd = -1; - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); - - return ret < 0 ? -errno : ret; + return vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); } static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) @@ -455,11 +453,11 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) fds[i] = fd; } - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + ret = vdev->vbasedev.io_ops->set_irqs(&vdev->vbasedev, irq_set); g_free(irq_set); - return ret < 0 ? 
-errno : ret; + return ret; } static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 4a32202943..7e1e81e76b 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -41,6 +41,7 @@ enum { }; typedef struct VFIODeviceOps VFIODeviceOps; +typedef struct VFIODeviceIOOps VFIODeviceIOOps; typedef struct VFIOMigration VFIOMigration; typedef struct IOMMUFDBackend IOMMUFDBackend; @@ -66,6 +67,7 @@ typedef struct VFIODevice { OnOffAuto migration_multifd_transfer; bool migration_events; VFIODeviceOps *ops; + VFIODeviceIOOps *io_ops; unsigned int num_irqs; unsigned int num_regions; unsigned int flags; @@ -151,6 +153,42 @@ typedef QLIST_HEAD(VFIODeviceList, VFIODevice) VFIODeviceList; extern VFIODeviceList vfio_device_list; #ifdef CONFIG_LINUX +/* + * How devices communicate with the server. The default option is through + * ioctl() to the kernel VFIO driver, but vfio-user can use a socket to a remote + * process. + */ +struct VFIODeviceIOOps { + /** + * @device_feature + * + * Fill in feature info for the given device. + */ + int (*device_feature)(VFIODevice *vdev, struct vfio_device_feature *); + + /** + * @get_region_info + * + * Fill in @info with information on the region given by @info->index. + */ + int (*get_region_info)(VFIODevice *vdev, + struct vfio_region_info *info); + + /** + * @get_irq_info + * + * Fill in @irq with information on the IRQ given by @info->index. + */ + int (*get_irq_info)(VFIODevice *vdev, struct vfio_irq_info *irq); + + /** + * @set_irqs + * + * Configure IRQs as defined by @irqs. 
+ */ + int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); +}; + void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, struct vfio_device_info *info); From 95cdb024515b107e2574c5cbef0a43cafa0db77f Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:16 +0100 Subject: [PATCH 25/28] vfio: add region info cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of requesting region information on demand with VFIO_DEVICE_GET_REGION_INFO, maintain a cache: this will become necessary for performance for vfio-user, where this call becomes a message over the control socket, so is of higher overhead than the traditional path. We will also need it to generalize region accesses, as that means we can't use ->config_offset for configuration space accesses, but must look up the region offset (if relevant) each time. Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-12-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/ccw.c | 5 ----- hw/vfio/device.c | 25 +++++++++++++++++++++---- hw/vfio/igd.c | 10 +++++----- hw/vfio/pci.c | 6 +++--- hw/vfio/region.c | 2 +- include/hw/vfio/vfio-device.h | 1 + 6 files changed, 31 insertions(+), 18 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index ab3fabf991..cea9d6e005 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -504,7 +504,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->io_region_offset = info->offset; vcdev->io_region = g_malloc0(info->size); - g_free(info); /* check for the optional async command region */ ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, @@ -517,7 +516,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->async_cmd_region_offset = info->offset; 
vcdev->async_cmd_region = g_malloc0(info->size); - g_free(info); } ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, @@ -530,7 +528,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->schib_region_offset = info->offset; vcdev->schib_region = g_malloc(info->size); - g_free(info); } ret = vfio_device_get_region_info_type(vdev, VFIO_REGION_TYPE_CCW, @@ -544,7 +541,6 @@ static bool vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) } vcdev->crw_region_offset = info->offset; vcdev->crw_region = g_malloc(info->size); - g_free(info); } return true; @@ -554,7 +550,6 @@ out_err: g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); - g_free(info); return false; } diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 40a196bfb9..77b0675abe 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -202,6 +202,12 @@ int vfio_device_get_region_info(VFIODevice *vbasedev, int index, size_t argsz = sizeof(struct vfio_region_info); int ret; + /* check cache */ + if (vbasedev->reginfo[index] != NULL) { + *info = vbasedev->reginfo[index]; + return 0; + } + *info = g_malloc0(argsz); (*info)->index = index; @@ -222,6 +228,9 @@ retry: goto retry; } + /* fill cache */ + vbasedev->reginfo[index] = *info; + return 0; } @@ -240,7 +249,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, hdr = vfio_get_region_info_cap(*info, VFIO_REGION_INFO_CAP_TYPE); if (!hdr) { - g_free(*info); continue; } @@ -252,8 +260,6 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, if (cap_type->type == type && cap_type->subtype == subtype) { return 0; } - - g_free(*info); } *info = NULL; @@ -262,7 +268,7 @@ int vfio_device_get_region_info_type(VFIODevice *vbasedev, uint32_t type, bool vfio_device_has_region_cap(VFIODevice *vbasedev, int region, uint16_t cap_type) { - g_autofree struct vfio_region_info *info = NULL; + struct vfio_region_info *info = NULL; bool ret = false; if 
(!vfio_device_get_region_info(vbasedev, region, &info)) { @@ -435,10 +441,21 @@ void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next); QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next); + + vbasedev->reginfo = g_new0(struct vfio_region_info *, + vbasedev->num_regions); } void vfio_device_unprepare(VFIODevice *vbasedev) { + int i; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vbasedev->reginfo[i]); + } + g_free(vbasedev->reginfo); + vbasedev->reginfo = NULL; + QLIST_REMOVE(vbasedev, container_next); QLIST_REMOVE(vbasedev, global_next); vbasedev->bcontainer = NULL; diff --git a/hw/vfio/igd.c b/hw/vfio/igd.c index 3ee1a73b57..e7952d15a0 100644 --- a/hw/vfio/igd.c +++ b/hw/vfio/igd.c @@ -349,8 +349,8 @@ static int vfio_pci_igd_lpc_init(VFIOPCIDevice *vdev, static bool vfio_pci_igd_setup_lpc_bridge(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *host = NULL; - g_autofree struct vfio_region_info *lpc = NULL; + struct vfio_region_info *host = NULL; + struct vfio_region_info *lpc = NULL; PCIDevice *lpc_bridge; int ret; @@ -510,7 +510,7 @@ void vfio_probe_igd_bar0_quirk(VFIOPCIDevice *vdev, int nr) static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *opregion = NULL; + struct vfio_region_info *opregion = NULL; int ret, gen; uint64_t gms_size = 0; uint64_t *bdsm_size; @@ -551,7 +551,7 @@ static bool vfio_pci_igd_config_quirk(VFIOPCIDevice *vdev, Error **errp) * - OpRegion * - Same LPC bridge and Host bridge VID/DID/SVID/SSID as host */ - g_autofree struct vfio_region_info *rom = NULL; + struct vfio_region_info *rom = NULL; legacy_mode_enabled = true; info_report("IGD legacy mode enabled, " @@ -681,7 +681,7 @@ error: */ static bool vfio_pci_kvmgt_config_quirk(VFIOPCIDevice *vdev, Error **errp) { - g_autofree struct vfio_region_info *opregion = NULL; + struct vfio_region_info 
*opregion = NULL; int gen; if (!vfio_pci_is(vdev, PCI_VENDOR_ID_INTEL, PCI_ANY_ID) || diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index da2ffc9bf3..9136cf52c8 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -883,8 +883,8 @@ static void vfio_update_msi(VFIOPCIDevice *vdev) static void vfio_pci_load_rom(VFIOPCIDevice *vdev) { - g_autofree struct vfio_region_info *reg_info = NULL; VFIODevice *vbasedev = &vdev->vbasedev; + struct vfio_region_info *reg_info = NULL; uint64_t size; off_t off = 0; ssize_t bytes; @@ -2710,7 +2710,7 @@ static VFIODeviceOps vfio_pci_ops = { bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; int ret; ret = vfio_device_get_region_info(vbasedev, VFIO_PCI_VGA_REGION_INDEX, &reg_info); @@ -2775,7 +2775,7 @@ bool vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp) static bool vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) { VFIODevice *vbasedev = &vdev->vbasedev; - g_autofree struct vfio_region_info *reg_info = NULL; + struct vfio_region_info *reg_info = NULL; struct vfio_irq_info irq_info; int i, ret = -1; diff --git a/hw/vfio/region.c b/hw/vfio/region.c index 04bf9eb098..ef2630cac3 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -182,7 +182,7 @@ static int vfio_setup_region_sparse_mmaps(VFIORegion *region, int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region, int index, const char *name) { - g_autofree struct vfio_region_info *info = NULL; + struct vfio_region_info *info = NULL; int ret; ret = vfio_device_get_region_info(vbasedev, index, &info); diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 7e1e81e76b..4fff3dcee3 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -83,6 +83,7 @@ typedef struct VFIODevice { IOMMUFDBackend *iommufd; VFIOIOASHwpt *hwpt; QLIST_ENTRY(VFIODevice) hwpt_next; + struct
vfio_region_info **reginfo; } VFIODevice; struct VFIODeviceOps { From 776066ac90a2b57fedb6b0186b30c5a9e9e1c9bd Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:17 +0100 Subject: [PATCH 26/28] vfio: add read/write to device IO ops vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now we have the region info cache, add ->region_read/write device I/O operations instead of explicit pread()/pwrite() system calls. Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-13-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 34 ++++++++++++++++++++++++++++++++++ hw/vfio/pci.c | 28 ++++++++++++++-------------- hw/vfio/region.c | 17 +++++++++++------ include/hw/vfio/vfio-device.h | 18 ++++++++++++++++++ 4 files changed, 77 insertions(+), 20 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 77b0675abe..0b2cd90d64 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -505,9 +505,43 @@ static int vfio_device_io_set_irqs(VFIODevice *vbasedev, return ret < 0 ? -errno : ret; } +static int vfio_device_io_region_read(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pread(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? -errno : ret; +} + +static int vfio_device_io_region_write(VFIODevice *vbasedev, uint8_t index, + off_t off, uint32_t size, void *data) +{ + struct vfio_region_info *info; + int ret; + + ret = vfio_device_get_region_info(vbasedev, index, &info); + if (ret != 0) { + return ret; + } + + ret = pwrite(vbasedev->fd, data, size, info->offset + off); + + return ret < 0 ? 
-errno : ret; +} + static VFIODeviceIOOps vfio_device_io_ops_ioctl = { .device_feature = vfio_device_io_device_feature, .get_region_info = vfio_device_io_get_region_info, .get_irq_info = vfio_device_io_get_irq_info, .set_irqs = vfio_device_io_set_irqs, + .region_read = vfio_device_io_region_read, + .region_write = vfio_device_io_region_write, }; diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 9136cf52c8..1236de315d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -918,18 +918,22 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) memset(vdev->rom, 0xff, size); while (size) { - bytes = pread(vbasedev->fd, vdev->rom + off, - size, vdev->rom_offset + off); + bytes = vbasedev->io_ops->region_read(vbasedev, + VFIO_PCI_ROM_REGION_INDEX, + off, size, vdev->rom + off); + if (bytes == 0) { break; } else if (bytes > 0) { off += bytes; size -= bytes; } else { - if (errno == EINTR || errno == EAGAIN) { + if (bytes == -EINTR || bytes == -EAGAIN) { continue; } - error_report("vfio: Error reading device ROM: %m"); + error_report("vfio: Error reading device ROM: %s", + strreaderror(bytes)); + break; } } @@ -969,22 +973,18 @@ static void vfio_pci_load_rom(VFIOPCIDevice *vdev) static int vfio_pci_config_space_read(VFIOPCIDevice *vdev, off_t offset, uint32_t size, void *data) { - ssize_t ret; - - ret = pread(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); - - return ret < 0 ? -errno : (int)ret; + return vdev->vbasedev.io_ops->region_read(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); } /* "Raw" write of underlying config space. */ static int vfio_pci_config_space_write(VFIOPCIDevice *vdev, off_t offset, uint32_t size, void *data) { - ssize_t ret; - - ret = pwrite(vdev->vbasedev.fd, data, size, vdev->config_offset + offset); - - return ret < 0 ? 
-errno : (int)ret; + return vdev->vbasedev.io_ops->region_write(&vdev->vbasedev, + VFIO_PCI_CONFIG_REGION_INDEX, + offset, size, data); } static uint64_t vfio_rom_read(void *opaque, hwaddr addr, unsigned size) diff --git a/hw/vfio/region.c b/hw/vfio/region.c index ef2630cac3..34752c3f65 100644 --- a/hw/vfio/region.c +++ b/hw/vfio/region.c @@ -45,6 +45,7 @@ void vfio_region_write(void *opaque, hwaddr addr, uint32_t dword; uint64_t qword; } buf; + int ret; switch (size) { case 1: @@ -64,11 +65,13 @@ void vfio_region_write(void *opaque, hwaddr addr, break; } - if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { + ret = vbasedev->io_ops->region_write(vbasedev, region->nr, + addr, size, &buf); + if (ret != size) { error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64 - ",%d) failed: %m", + ",%d) failed: %s", __func__, vbasedev->name, region->nr, - addr, data, size); + addr, data, size, strwriteerror(ret)); } trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size); @@ -96,11 +99,13 @@ uint64_t vfio_region_read(void *opaque, uint64_t qword; } buf; uint64_t data = 0; + int ret; - if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) { - error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m", + ret = vbasedev->io_ops->region_read(vbasedev, region->nr, addr, size, &buf); + if (ret != size) { + error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %s", __func__, vbasedev->name, region->nr, - addr, size); + addr, size, strreaderror(ret)); return (uint64_t)-1; } switch (size) { diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h index 4fff3dcee3..8bcb3c19f6 100644 --- a/include/hw/vfio/vfio-device.h +++ b/include/hw/vfio/vfio-device.h @@ -188,6 +188,24 @@ struct VFIODeviceIOOps { * Configure IRQs as defined by @irqs. 
*/ int (*set_irqs)(VFIODevice *vdev, struct vfio_irq_set *irqs); + + /** + * @region_read + * + * Read @size bytes from the region @nr at offset @off into the buffer + * @data. + */ + int (*region_read)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); + + /** + * @region_write + * + * Write @size bytes to the region @nr at offset @off from the buffer + * @data. + */ + int (*region_write)(VFIODevice *vdev, uint8_t nr, off_t off, uint32_t size, + void *data); }; void vfio_device_prepare(VFIODevice *vbasedev, VFIOContainerBase *bcontainer, From d4e392d0a99b8018453e26f907e33b89724697a6 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:18 +0100 Subject: [PATCH 27/28] vfio: add vfio-pci-base class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split out parts of TYPE_VFIO_PCI into a base TYPE_VFIO_PCI_BASE, although we have not yet introduced another subclass, so all the properties have remained in TYPE_VFIO_PCI. Note that currently there is no need for additional data for TYPE_VFIO_PCI, so it shares the same C struct type as TYPE_VFIO_PCI_BASE, VFIOPCIDevice. 
Originally-by: John Johnson Signed-off-by: Elena Ufimtseva Signed-off-by: Jagannathan Raman Signed-off-by: John Levon Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-14-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/device.c | 2 +- hw/vfio/pci.c | 62 +++++++++++++++++++++++++++++++----------------- hw/vfio/pci.h | 10 +++++++- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/hw/vfio/device.c b/hw/vfio/device.c index 0b2cd90d64..9fba2c7272 100644 --- a/hw/vfio/device.c +++ b/hw/vfio/device.c @@ -392,7 +392,7 @@ bool vfio_device_hiod_create_and_realize(VFIODevice *vbasedev, VFIODevice *vfio_get_vfio_device(Object *obj) { if (object_dynamic_cast(obj, TYPE_VFIO_PCI)) { - return &VFIO_PCI(obj)->vbasedev; + return &VFIO_PCI_BASE(obj)->vbasedev; } else { return NULL; } diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 1236de315d..a1bfdfe375 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -241,7 +241,7 @@ static void vfio_intx_update(VFIOPCIDevice *vdev, PCIINTxRoute *route) static void vfio_intx_routing_notifier(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); PCIINTxRoute route; if (vdev->interrupt != VFIO_INT_INTx) { @@ -514,7 +514,7 @@ static void vfio_update_kvm_msi_virq(VFIOMSIVector *vector, MSIMessage msg, static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, MSIMessage *msg, IOHandler *handler) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector; int ret; bool resizing = !!(vdev->nr_vectors < nr + 1); @@ -620,7 +620,7 @@ static int vfio_msix_vector_use(PCIDevice *pdev, static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIOMSIVector *vector = &vdev->msi_vectors[nr]; trace_vfio_msix_vector_release(vdev->vbasedev.name, nr); @@ -1196,7 +1196,7 @@ static 
const MemoryRegionOps vfio_vga_ops = { */ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIORegion *region = &vdev->bars[bar].region; MemoryRegion *mmap_mr, *region_mr, *base_mr; PCIIORegion *r; @@ -1242,7 +1242,7 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) */ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; uint32_t emu_bits = 0, emu_val = 0, phys_val = 0, val; @@ -1276,7 +1276,7 @@ uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len) void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr, uint32_t val, int len) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; uint32_t val_le = cpu_to_le32(val); int ret; @@ -3129,7 +3129,7 @@ static bool vfio_interrupt_setup(VFIOPCIDevice *vdev, Error **errp) static void vfio_realize(PCIDevice *pdev, Error **errp) { ERRP_GUARD(); - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; int i, ret; char uuid[UUID_STR_LEN]; @@ -3300,7 +3300,7 @@ error: static void vfio_instance_finalize(Object *obj) { - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); vfio_display_finalize(vdev); vfio_bars_finalize(vdev); @@ -3318,7 +3318,7 @@ static void vfio_instance_finalize(Object *obj) static void vfio_exitfn(PCIDevice *pdev) { - VFIOPCIDevice *vdev = VFIO_PCI(pdev); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(pdev); VFIODevice *vbasedev = &vdev->vbasedev; vfio_unregister_req_notifier(vdev); @@ -3342,7 +3342,7 @@ static void vfio_exitfn(PCIDevice *pdev) static void vfio_pci_reset(DeviceState *dev) { - VFIOPCIDevice *vdev = VFIO_PCI(dev); + VFIOPCIDevice *vdev = 
VFIO_PCI_BASE(dev); trace_vfio_pci_reset(vdev->vbasedev.name); @@ -3382,7 +3382,7 @@ post_reset: static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); - VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, @@ -3403,6 +3403,31 @@ static void vfio_instance_init(Object *obj) pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; } +static void vfio_pci_base_dev_class_init(ObjectClass *klass, const void *data) +{ + DeviceClass *dc = DEVICE_CLASS(klass); + PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass); + + dc->desc = "VFIO PCI base device"; + set_bit(DEVICE_CATEGORY_MISC, dc->categories); + pdc->exit = vfio_exitfn; + pdc->config_read = vfio_pci_read_config; + pdc->config_write = vfio_pci_write_config; +} + +static const TypeInfo vfio_pci_base_dev_info = { + .name = TYPE_VFIO_PCI_BASE, + .parent = TYPE_PCI_DEVICE, + .instance_size = 0, + .abstract = true, + .class_init = vfio_pci_base_dev_class_init, + .interfaces = (const InterfaceInfo[]) { + { INTERFACE_PCIE_DEVICE }, + { INTERFACE_CONVENTIONAL_PCI_DEVICE }, + { } + }, +}; + static PropertyInfo vfio_pci_migration_multifd_transfer_prop; static const Property vfio_pci_dev_properties[] = { @@ -3473,7 +3498,8 @@ static const Property vfio_pci_dev_properties[] = { #ifdef CONFIG_IOMMUFD static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) { - vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); + VFIOPCIDevice *vdev = VFIO_PCI_BASE(obj); + vfio_device_set_fd(&vdev->vbasedev, str, errp); } #endif @@ -3488,11 +3514,7 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); #endif dc->desc = "VFIO-based PCI device assignment"; - set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; - pdc->exit = vfio_exitfn; - pdc->config_read = vfio_pci_read_config; - 
pdc->config_write = vfio_pci_write_config; object_class_property_set_description(klass, /* 1.3 */ "host", @@ -3617,16 +3639,11 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, const void *data) static const TypeInfo vfio_pci_dev_info = { .name = TYPE_VFIO_PCI, - .parent = TYPE_PCI_DEVICE, + .parent = TYPE_VFIO_PCI_BASE, .instance_size = sizeof(VFIOPCIDevice), .class_init = vfio_pci_dev_class_init, .instance_init = vfio_instance_init, .instance_finalize = vfio_instance_finalize, - .interfaces = (const InterfaceInfo[]) { - { INTERFACE_PCIE_DEVICE }, - { INTERFACE_CONVENTIONAL_PCI_DEVICE }, - { } - }, }; static const Property vfio_pci_dev_nohotplug_properties[] = { @@ -3673,6 +3690,7 @@ static void register_vfio_pci_dev_type(void) vfio_pci_migration_multifd_transfer_prop = qdev_prop_on_off_auto; vfio_pci_migration_multifd_transfer_prop.realized_set_allowed = true; + type_register_static(&vfio_pci_base_dev_info); type_register_static(&vfio_pci_dev_info); type_register_static(&vfio_pci_nohotplug_dev_info); } diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index f835b1dbc2..5ce0fb916f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -118,8 +118,16 @@ typedef struct VFIOMSIXInfo { bool noresize; } VFIOMSIXInfo; +/* + * TYPE_VFIO_PCI_BASE is an abstract type used to share code + * between VFIO implementations that use a kernel driver + * with those that use user sockets. + */ +#define TYPE_VFIO_PCI_BASE "vfio-pci-base" +OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI_BASE) + #define TYPE_VFIO_PCI "vfio-pci" -OBJECT_DECLARE_SIMPLE_TYPE(VFIOPCIDevice, VFIO_PCI) +/* TYPE_VFIO_PCI shares struct VFIOPCIDevice. 
*/ struct VFIOPCIDevice { PCIDevice pdev; From d9b7d8b6993b5193480e5a972902e3e9bbc4d8a1 Mon Sep 17 00:00:00 2001 From: John Levon Date: Wed, 7 May 2025 16:20:19 +0100 Subject: [PATCH 28/28] vfio/container: pass listener_begin/commit callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vfio-user container will later need to hook into these callbacks; set up vfio to use them, and optionally pass them through to the container. Reviewed-by: Cédric Le Goater Signed-off-by: John Levon Link: https://lore.kernel.org/qemu-devel/20250507152020.1254632-15-john.levon@nutanix.com Signed-off-by: Cédric Le Goater --- hw/vfio/listener.c | 28 +++++++++++++++++++++++++++ include/hw/vfio/vfio-container-base.h | 2 ++ 2 files changed, 30 insertions(+) diff --git a/hw/vfio/listener.c b/hw/vfio/listener.c index 2b93ca55b6..bfacb3d8d9 100644 --- a/hw/vfio/listener.c +++ b/hw/vfio/listener.c @@ -411,6 +411,32 @@ static bool vfio_get_section_iova_range(VFIOContainerBase *bcontainer, return true; } +static void vfio_listener_begin(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + void (*listener_begin)(VFIOContainerBase *bcontainer); + + listener_begin = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_begin; + + if (listener_begin) { + listener_begin(bcontainer); + } +} + +static void vfio_listener_commit(MemoryListener *listener) +{ + VFIOContainerBase *bcontainer = container_of(listener, VFIOContainerBase, + listener); + void (*listener_commit)(VFIOContainerBase *bcontainer); + + listener_commit = VFIO_IOMMU_GET_CLASS(bcontainer)->listener_commit; + + if (listener_commit) { + listener_commit(bcontainer); + } +} + static void vfio_device_error_append(VFIODevice *vbasedev, Error **errp) { /* @@ -1161,6 +1187,8 @@ static void vfio_listener_log_sync(MemoryListener *listener, static const MemoryListener vfio_memory_listener = { .name = "vfio", + .begin = vfio_listener_begin, + 
.commit = vfio_listener_commit, .region_add = vfio_listener_region_add, .region_del = vfio_listener_region_del, .log_global_start = vfio_listener_log_global_start, diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h index 59f07d26e8..3d392b0fd8 100644 --- a/include/hw/vfio/vfio-container-base.h +++ b/include/hw/vfio/vfio-container-base.h @@ -117,6 +117,8 @@ struct VFIOIOMMUClass { /* basic feature */ bool (*setup)(VFIOContainerBase *bcontainer, Error **errp); + void (*listener_begin)(VFIOContainerBase *bcontainer); + void (*listener_commit)(VFIOContainerBase *bcontainer); int (*dma_map)(const VFIOContainerBase *bcontainer, hwaddr iova, ram_addr_t size, void *vaddr, bool readonly);