From 1bcf209154445a5837a8fc0fc9d88200fedb25b2 Mon Sep 17 00:00:00 2001
From: Marcel Apfelbaum <marcel@redhat.com>
Date: Wed, 3 Jan 2018 17:02:29 +0200
Subject: [PATCH 01/29] MAINTAINERS: Add myself as maintainer to X86 machines

Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4770f105d4..753e7996ce 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -852,6 +852,7 @@ X86 Machines
 ------------
 PC
 M: Michael S. Tsirkin <mst@redhat.com>
+M: Marcel Apfelbaum <marcel@redhat.com>
 S: Supported
 F: include/hw/i386/
 F: hw/i386/

From 4c3e257b5e6ccba6bd34f780fab8008e0d79680a Mon Sep 17 00:00:00 2001
From: Changpeng Liu <changpeng.liu@intel.com>
Date: Thu, 4 Jan 2018 09:53:31 +0800
Subject: [PATCH 02/29] vhost-user: add new vhost user messages to support
 virtio config space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add VHOST_USER_GET_CONFIG/VHOST_USER_SET_CONFIG messages, which can be
used for live migration of vhost-user devices. Vhost-user devices can also
use these messages to get/set the virtio config space from/to the I/O
target. To support virtio config space changes, the
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG message is added as an event notifier
for when the virtio config space changes in the slave I/O target.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 docs/interop/vhost-user.txt       |  55 ++++++++++++++
 hw/virtio/vhost-user.c            | 118 ++++++++++++++++++++++++++++++
 hw/virtio/vhost.c                 |  32 ++++++++
 include/hw/virtio/vhost-backend.h |  12 +++
 include/hw/virtio/vhost.h         |  15 ++++
 5 files changed, 232 insertions(+)

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index d49444e037..0875ef4ec3 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -119,6 +119,19 @@ Depending on the request type, payload can be:
     - 3: IOTLB invalidate
     - 4: IOTLB access fail
 
+ * Virtio device config space
+   -----------------------------------
+   | offset | size | flags | payload |
+   -----------------------------------
+
+   Offset: a 32-bit offset of virtio device's configuration space
+   Size: a 32-bit configuration space access size in bytes
+   Flags: a 32-bit value:
+    - 0: Vhost master messages used for writeable fields
+    - 1: Vhost master messages used for live migration
+   Payload: Size bytes array holding the contents of the virtio
+       device's configuration space
+
 In QEMU the vhost-user message is implemented with the following struct:
 
 typedef struct VhostUserMsg {
@@ -132,6 +145,7 @@ typedef struct VhostUserMsg {
         VhostUserMemory memory;
         VhostUserLog log;
         struct vhost_iotlb_msg iotlb;
+        VhostUserConfig config;
     };
 } QEMU_PACKED VhostUserMsg;
 
@@ -623,6 +637,32 @@ Master message types
       and expect this message once (per VQ) during device configuration
       (ie. before the master starts the VQ).
 
+ * VHOST_USER_GET_CONFIG
+
+      Id: 24
+      Equivalent ioctl: N/A
+      Master payload: virtio device config space
+      Slave payload: virtio device config space
+
+      Submitted by the vhost-user master to fetch the contents of the virtio
+      device configuration space. The slave's payload size MUST match the
+      master's request; the slave replies with a zero-length payload to
+      indicate an error to the master. The vhost-user master may cache the
+      contents to avoid repeated VHOST_USER_GET_CONFIG calls.
+
+ * VHOST_USER_SET_CONFIG
+
+      Id: 25
+      Equivalent ioctl: N/A
+      Master payload: virtio device config space
+      Slave payload: N/A
+
+      Submitted by the vhost-user master when the Guest changes the virtio
+      device configuration space and also can be used for live migration
+      on the destination host. The vhost-user slave must check the flags
+      field, and slaves MUST NOT accept SET_CONFIG for read-only
+      configuration space fields unless the live migration bit is set.
+
 Slave message types
 -------------------
 
@@ -641,6 +681,21 @@ Slave message types
       This request should be send only when VIRTIO_F_IOMMU_PLATFORM feature
       has been successfully negotiated.
 
+ * VHOST_USER_SLAVE_CONFIG_CHANGE_MSG
+
+      Id: 2
+      Equivalent ioctl: N/A
+      Slave payload: N/A
+      Master payload: N/A
+
+      The vhost-user slave sends this message to notify the master that the
+      virtio device's configuration space has changed. For host devices that
+      support this feature, the host driver can then send a
+      VHOST_USER_GET_CONFIG message to the slave to get the latest content.
+      If VHOST_USER_PROTOCOL_F_REPLY_ACK is negotiated and the slave sets the
+      VHOST_USER_NEED_REPLY flag, the master must respond with zero when the
+      operation completed successfully, or non-zero otherwise.
+
 VHOST_USER_PROTOCOL_F_REPLY_ACK:
 -------------------------------
 The original vhost-user specification only demands replies for certain
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 093675ed98..8b946880fe 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -26,6 +26,11 @@
 #define VHOST_MEMORY_MAX_NREGIONS    8
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
 enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_MQ = 0,
     VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
@@ -65,12 +70,15 @@ typedef enum VhostUserRequest {
     VHOST_USER_SET_SLAVE_REQ_FD = 21,
     VHOST_USER_IOTLB_MSG = 22,
     VHOST_USER_SET_VRING_ENDIAN = 23,
+    VHOST_USER_GET_CONFIG = 24,
+    VHOST_USER_SET_CONFIG = 25,
     VHOST_USER_MAX
 } VhostUserRequest;
 
 typedef enum VhostUserSlaveRequest {
     VHOST_USER_SLAVE_NONE = 0,
     VHOST_USER_SLAVE_IOTLB_MSG = 1,
+    VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
     VHOST_USER_SLAVE_MAX
 }  VhostUserSlaveRequest;
 
@@ -92,6 +100,18 @@ typedef struct VhostUserLog {
     uint64_t mmap_offset;
 } VhostUserLog;
 
+typedef struct VhostUserConfig {
+    uint32_t offset;    /* offset into the device's config space */
+    uint32_t size;      /* config space access size in bytes */
+    uint32_t flags;     /* 0: writeable-field access, 1: live migration */
+    uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];    /* config space bytes */
+} VhostUserConfig;
+
+/* Size of the fixed offset/size/flags header that precedes region[]. */
+#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(((VhostUserConfig *)0)->offset) \
+                                   + sizeof(((VhostUserConfig *)0)->size) \
+                                   + sizeof(((VhostUserConfig *)0)->flags))
+
 typedef struct VhostUserMsg {
     VhostUserRequest request;
 
@@ -109,6 +129,7 @@ typedef struct VhostUserMsg {
         VhostUserMemory memory;
         VhostUserLog log;
         struct vhost_iotlb_msg iotlb;
+        VhostUserConfig config;
     } payload;
 } QEMU_PACKED VhostUserMsg;
 
@@ -608,6 +629,21 @@ static int vhost_user_reset_device(struct vhost_dev *dev)
     return 0;
 }
 
+/*
+ * Relay a slave's config-change notification to the device's notifier.
+ * Returns the notifier's result, or -1 if no notifier is registered.
+ */
+static int vhost_user_slave_handle_config_change(struct vhost_dev *dev)
+{
+    const VhostDevConfigOps *ops = dev->config_ops;
+
+    if (!ops || !ops->vhost_dev_config_notifier) {
+        return -1;
+    }
+
+    return ops->vhost_dev_config_notifier(dev);
+}
+
 static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
@@ -640,6 +676,9 @@ static void slave_read(void *opaque)
     case VHOST_USER_SLAVE_IOTLB_MSG:
         ret = vhost_backend_handle_iotlb_msg(dev, &msg.payload.iotlb);
         break;
+    case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
+        ret = vhost_user_slave_handle_config_change(dev);
+        break;
     default:
         error_report("Received unexpected msg type.");
         ret = -EINVAL;
@@ -922,6 +961,83 @@ static void vhost_user_set_iotlb_callback(struct vhost_dev *dev, int enabled)
     /* No-op as the receive channel is not dedicated to IOTLB messages. */
 }
 
+/* Read config_len bytes of config space into config; returns 0 or -1. */
+static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
+                                 uint32_t config_len)
+{
+    VhostUserMsg msg = {
+        .request = VHOST_USER_GET_CONFIG,
+        .flags = VHOST_USER_VERSION,
+        .size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
+    };
+
+    if (config_len > VHOST_USER_MAX_CONFIG_SIZE) {
+        return -1;
+    }
+    /* Always read from the start of the config space. */
+    msg.payload.config.offset = 0;
+    msg.payload.config.size = config_len;
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        return -1;
+    }
+
+    if (vhost_user_read(dev, &msg) < 0) {
+        return -1;
+    }
+
+    if (msg.request != VHOST_USER_GET_CONFIG) {
+        error_report("Received unexpected msg type. Expected %d received %d",
+                     VHOST_USER_GET_CONFIG, msg.request);
+        return -1;
+    }
+    /* Any reply whose size differs from the request is rejected. */
+    if (msg.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
+        error_report("Received bad msg size.");
+        return -1;
+    }
+
+    memcpy(config, msg.payload.config.region, config_len);
+    return 0;
+}
+
+/* Send size bytes of data for config space offset; returns 0 or an error. */
+static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data,
+                                 uint32_t offset, uint32_t size, uint32_t flags)
+{
+    bool reply_supported = virtio_has_feature(dev->protocol_features,
+                                              VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+    VhostUserMsg msg = {
+        .request = VHOST_USER_SET_CONFIG,
+        .flags = VHOST_USER_VERSION,
+        .size = VHOST_USER_CONFIG_HDR_SIZE + size,
+    };
+
+    if (reply_supported) {
+        msg.flags |= VHOST_USER_NEED_REPLY_MASK;
+    }
+
+    /* region[] holds at most VHOST_USER_MAX_CONFIG_SIZE bytes. */
+    if (size > VHOST_USER_MAX_CONFIG_SIZE) {
+        return -1;
+    }
+
+    msg.payload.config.offset = offset;
+    msg.payload.config.size = size;
+    msg.payload.config.flags = flags;
+    memcpy(msg.payload.config.region, data, size);
+
+    if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
+        return -1;
+    }
+
+    if (reply_supported) {
+        return process_message_reply(dev, &msg);
+    }
+
+    return 0;
+}
+
 const VhostOps user_ops = {
         .backend_type = VHOST_BACKEND_TYPE_USER,
         .vhost_backend_init = vhost_user_init,
@@ -948,4 +1064,6 @@ const VhostOps user_ops = {
         .vhost_net_set_mtu = vhost_user_net_set_mtu,
         .vhost_set_iotlb_callback = vhost_user_set_iotlb_callback,
         .vhost_send_device_iotlb_msg = vhost_user_send_device_iotlb_msg,
+        .vhost_get_config = vhost_user_get_config,
+        .vhost_set_config = vhost_user_set_config,
 };
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index e4290ce93d..386aef85be 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1505,6 +1505,38 @@ void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
     }
 }
 
+/* Read device config via the backend; -1 if the backend lacks the hook. */
+int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
+                         uint32_t config_len)
+{
+    assert(hdev->vhost_ops);
+
+    if (!hdev->vhost_ops->vhost_get_config) {
+        return -1;
+    }
+    return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
+}
+
+/* Write device config via the backend; -1 if the backend lacks the hook. */
+int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
+                         uint32_t offset, uint32_t size, uint32_t flags)
+{
+    assert(hdev->vhost_ops);
+
+    if (!hdev->vhost_ops->vhost_set_config) {
+        return -1;
+    }
+    return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
+                                             size, flags);
+}
+
+void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
+                                   const VhostDevConfigOps *ops)
+{
+    assert(hdev->vhost_ops);
+    hdev->config_ops = ops; /* ops is stored by pointer, not copied */
+}
+
 /* Host notifiers must be enabled at this point. */
 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
 {
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index a7a5f22bc6..592254f40d 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -20,6 +20,11 @@ typedef enum VhostBackendType {
     VHOST_BACKEND_TYPE_MAX = 3,
 } VhostBackendType;
 
+typedef enum VhostSetConfigType {
+    VHOST_SET_CONFIG_TYPE_MASTER = 0,    /* master write to writeable fields */
+    VHOST_SET_CONFIG_TYPE_MIGRATION = 1, /* used for live migration */
+} VhostSetConfigType;
+
 struct vhost_dev;
 struct vhost_log;
 struct vhost_memory;
@@ -84,6 +89,11 @@ typedef void (*vhost_set_iotlb_callback_op)(struct vhost_dev *dev,
                                            int enabled);
 typedef int (*vhost_send_device_iotlb_msg_op)(struct vhost_dev *dev,
                                               struct vhost_iotlb_msg *imsg);
+typedef int (*vhost_set_config_op)(struct vhost_dev *dev, const uint8_t *data,
+                                   uint32_t offset, uint32_t size,
+                                   uint32_t flags);
+typedef int (*vhost_get_config_op)(struct vhost_dev *dev, uint8_t *config,
+                                   uint32_t config_len);
 
 typedef struct VhostOps {
     VhostBackendType backend_type;
@@ -118,6 +128,8 @@ typedef struct VhostOps {
     vhost_vsock_set_running_op vhost_vsock_set_running;
     vhost_set_iotlb_callback_op vhost_set_iotlb_callback;
     vhost_send_device_iotlb_msg_op vhost_send_device_iotlb_msg;
+    vhost_get_config_op vhost_get_config;
+    vhost_set_config_op vhost_set_config;
 } VhostOps;
 
 extern const VhostOps user_ops;
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index 467dc7794b..1dc2d73d76 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -46,6 +46,12 @@ struct vhost_iommu {
     QLIST_ENTRY(vhost_iommu) iommu_next;
 };
 
+typedef struct VhostDevConfigOps {
+    /* Called when the backend reports a device config space change; the
+     * return value is passed back to the code that invoked the notifier. */
+    int (*vhost_dev_config_notifier)(struct vhost_dev *dev);
+} VhostDevConfigOps;
+
 struct vhost_memory;
 struct vhost_dev {
     VirtIODevice *vdev;
@@ -76,6 +82,7 @@ struct vhost_dev {
     QLIST_ENTRY(vhost_dev) entry;
     QLIST_HEAD(, vhost_iommu) iommu_list;
     IOMMUNotifier n;
+    const VhostDevConfigOps *config_ops;
 };
 
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
@@ -106,4 +113,12 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
                           struct vhost_vring_file *file);
 
 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
+int vhost_dev_get_config(struct vhost_dev *dev, uint8_t *config,
+                         uint32_t config_len);
+int vhost_dev_set_config(struct vhost_dev *dev, const uint8_t *data,
+                         uint32_t offset, uint32_t size, uint32_t flags);
+/* Register the callbacks to invoke when the vhost device's config space
+ * changes. */
+void vhost_dev_set_config_notifier(struct vhost_dev *dev,
+                                   const VhostDevConfigOps *ops);
 #endif

From 00343e4b54ba0685e9ebe928ec5713b0cf7f1d1c Mon Sep 17 00:00:00 2001
From: Changpeng Liu <changpeng.liu@intel.com>
Date: Thu, 4 Jan 2018 09:53:32 +0800
Subject: [PATCH 03/29] vhost-user-blk: introduce a new vhost-user-blk host
 device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces a new vhost-user device for block. It uses a
chardev to connect to the backend; as with the QEMU virtio-blk device,
the guest OS still uses the virtio-blk frontend driver.

To use it, start QEMU with command line like this:

qemu-system-x86_64 \
    -chardev socket,id=char0,path=/path/vhost.socket \
    -device vhost-user-blk-pci,chardev=char0,num-queues=2, \
            bootindex=2... \

Users can use different parameters for `num-queues` and `bootindex`.

Unlike the existing QEMU virtio-blk host device, it makes it easier
for users to implement their own I/O processing logic, such as an
all-userspace I/O stack on top of a hardware block device. It uses the new
vhost message (VHOST_USER_GET_CONFIG) to get the virtio block config
information from the backend process.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 default-configs/pci.mak            |   1 +
 default-configs/s390x-softmmu.mak  |   1 +
 hw/block/Makefile.objs             |   3 +
 hw/block/vhost-user-blk.c          | 359 +++++++++++++++++++++++++++++
 hw/virtio/virtio-pci.c             |  55 +++++
 hw/virtio/virtio-pci.h             |  18 ++
 include/hw/virtio/vhost-user-blk.h |  41 ++++
 7 files changed, 478 insertions(+)
 create mode 100644 hw/block/vhost-user-blk.c
 create mode 100644 include/hw/virtio/vhost-user-blk.h

diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index e514bdef42..49a0f285ac 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -43,3 +43,4 @@ CONFIG_VGA_PCI=y
 CONFIG_IVSHMEM_DEVICE=$(CONFIG_IVSHMEM)
 CONFIG_ROCKER=y
 CONFIG_VHOST_USER_SCSI=$(call land,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))
+CONFIG_VHOST_USER_BLK=$(call land,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))
diff --git a/default-configs/s390x-softmmu.mak b/default-configs/s390x-softmmu.mak
index 444bf16b80..2f4bfe73b4 100644
--- a/default-configs/s390x-softmmu.mak
+++ b/default-configs/s390x-softmmu.mak
@@ -1,6 +1,7 @@
 CONFIG_PCI=y
 CONFIG_VIRTIO_PCI=$(CONFIG_PCI)
 CONFIG_VHOST_USER_SCSI=$(call land,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))
+CONFIG_VHOST_USER_BLK=$(call land,$(CONFIG_VHOST_USER),$(CONFIG_LINUX))
 CONFIG_VIRTIO=y
 CONFIG_SCLPCONSOLE=y
 CONFIG_TERMINAL3270=y
diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
index e0ed980c90..4c19a583c8 100644
--- a/hw/block/Makefile.objs
+++ b/hw/block/Makefile.objs
@@ -13,3 +13,6 @@ obj-$(CONFIG_SH4) += tc58128.o
 
 obj-$(CONFIG_VIRTIO) += virtio-blk.o
 obj-$(CONFIG_VIRTIO) += dataplane/
+ifeq ($(CONFIG_VIRTIO),y)
+obj-$(CONFIG_VHOST_USER_BLK) += vhost-user-blk.o
+endif
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
new file mode 100644
index 0000000000..b53b4c9c57
--- /dev/null
+++ b/hw/block/vhost-user-blk.c
@@ -0,0 +1,359 @@
+/*
+ * vhost-user-blk host device
+ *
+ * Copyright(C) 2017 Intel Corporation.
+ *
+ * Authors:
+ *  Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * Largely based on the "vhost-user-scsi.c" and "vhost-scsi.c" implemented by:
+ * Felipe Franciosi <felipe@nutanix.com>
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Nicholas Bellinger <nab@risingtidesystems.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/typedefs.h"
+#include "qemu/cutils.h"
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user-blk.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+
+static const int user_feature_bits[] = {
+    VIRTIO_BLK_F_SIZE_MAX,
+    VIRTIO_BLK_F_SEG_MAX,
+    VIRTIO_BLK_F_GEOMETRY,
+    VIRTIO_BLK_F_BLK_SIZE,
+    VIRTIO_BLK_F_TOPOLOGY,
+    VIRTIO_BLK_F_MQ,
+    VIRTIO_BLK_F_RO,
+    VIRTIO_BLK_F_FLUSH,
+    VIRTIO_BLK_F_CONFIG_WCE,
+    VIRTIO_F_VERSION_1,
+    VIRTIO_RING_F_INDIRECT_DESC,
+    VIRTIO_RING_F_EVENT_IDX,
+    VIRTIO_F_NOTIFY_ON_EMPTY,
+    VHOST_INVALID_FEATURE_BIT
+};
+
+static void vhost_user_blk_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    /* Served from the cached s->blkcfg copy; the backend is not queried. */
+    memcpy(config, &s->blkcfg, sizeof(struct virtio_blk_config));
+}
+
+/* Forward a guest change of the wce config field to the backend. */
+static void vhost_user_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    const struct virtio_blk_config *blkcfg = (const void *)config;
+
+    if (blkcfg->wce == s->blkcfg.wce) {
+        return;
+    }
+
+    if (vhost_dev_set_config(&s->dev, &blkcfg->wce,
+                             offsetof(struct virtio_blk_config, wce),
+                             sizeof(blkcfg->wce),
+                             VHOST_SET_CONFIG_TYPE_MASTER)) {
+        error_report("set device config space failed");
+        return;
+    }
+
+    /* Update the cached copy only if the set request did not fail. */
+    s->blkcfg.wce = blkcfg->wce;
+}
+
+/* Config-change notifier: re-read the backend config, propagate a resize. */
+static int vhost_user_blk_handle_config_change(struct vhost_dev *dev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(dev->vdev);
+    struct virtio_blk_config fresh;
+
+    if (vhost_dev_get_config(dev, (uint8_t *)&fresh, sizeof(fresh)) < 0) {
+        error_report("get config space failed");
+        return -1;
+    }
+
+    /* Only a capacity change (resize) is acted upon. */
+    if (fresh.capacity == s->blkcfg.capacity) {
+        return 0;
+    }
+
+    s->blkcfg.capacity = fresh.capacity;
+    memcpy(dev->vdev->config, &s->blkcfg, sizeof(s->blkcfg));
+    virtio_notify_config(dev->vdev);
+
+    return 0;
+}
+
+static const VhostDevConfigOps blk_ops = {
+    .vhost_dev_config_notifier = vhost_user_blk_handle_config_change,
+}; /* handed to vhost_dev_set_config_notifier() in realize */
+
+static void vhost_user_blk_start(VirtIODevice *vdev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int i, ret;
+
+    if (!k->set_guest_notifiers) {
+        error_report("binding does not support guest notifiers");
+        return;
+    }
+    /* Bring-up order: host notifiers, guest notifiers, vhost device. */
+    ret = vhost_dev_enable_notifiers(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("Error enabling host notifiers: %d", -ret);
+        return;
+    }
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
+    if (ret < 0) {
+        error_report("Error binding guest notifier: %d", -ret);
+        goto err_host_notifiers;
+    }
+    /* Record the guest's feature bits as acked before starting vhost. */
+    s->dev.acked_features = vdev->guest_features;
+    ret = vhost_dev_start(&s->dev, vdev);
+    if (ret < 0) {
+        error_report("Error starting vhost: %d", -ret);
+        goto err_guest_notifiers;
+    }
+
+    /* guest_notifier_mask/pending not used yet, so just unmask
+     * everything here. virtio-pci will do the right thing by
+     * enabling/disabling irqfd.
+     */
+    for (i = 0; i < s->dev.nvqs; i++) {
+        vhost_virtqueue_mask(&s->dev, vdev, i, false);
+    }
+
+    return;
+    /* Error unwinding: undo the steps above in reverse order. */
+err_guest_notifiers:
+    k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+err_host_notifiers:
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+}
+
+static void vhost_user_blk_stop(VirtIODevice *vdev)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    int ret;
+
+    if (!k->set_guest_notifiers) {
+        return;
+    }
+    /* Tear down in the reverse order of vhost_user_blk_start(). */
+    vhost_dev_stop(&s->dev, vdev);
+
+    ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false);
+    if (ret < 0) {
+        error_report("vhost guest notifier cleanup failed: %d", ret);
+        return; /* host notifiers are left enabled on this path */
+    }
+
+    vhost_dev_disable_notifiers(&s->dev, vdev);
+}
+
+/* Start or stop the vhost backend so that it runs only while the guest
+ * driver has set DRIVER_OK and the VM is running.
+ */
+static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    bool want_running = vdev->vm_running &&
+                        (status & VIRTIO_CONFIG_S_DRIVER_OK);
+
+    /* Nothing to do when the backend is already in the wanted state. */
+    if (s->dev.started == want_running) {
+        return;
+    }
+
+    if (want_running) {
+        vhost_user_blk_start(vdev);
+    } else {
+        vhost_user_blk_stop(vdev);
+    }
+}
+
+/* Build the offered feature set: fixed virtio-blk bits plus the ones chosen
+ * by device properties, passed through vhost_get_features().
+ */
+static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev,
+                                            uint64_t features,
+                                            Error **errp)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+
+    /* Turn on pre-defined features */
+    virtio_add_feature(&features, VIRTIO_BLK_F_SEG_MAX);
+    virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
+    virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
+    virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
+    virtio_add_feature(&features, VIRTIO_BLK_F_FLUSH);
+
+    if (s->config_wce) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_CONFIG_WCE);
+    }
+    if (s->config_ro) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_RO);
+    }
+    if (s->num_queues > 1) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
+    }
+
+    return vhost_get_features(&s->dev, user_feature_bits, features);
+}
+
+static void vhost_user_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
+{
+    /* Deliberately empty -- presumably the backend services the queue. */
+}
+
+/* Realize: check properties, create queues and connect to the backend. */
+static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    struct vhost_virtqueue *vqs;
+    int i, ret;
+
+    if (!s->chardev.chr) {
+        error_setg(errp, "vhost-user-blk: chardev is mandatory");
+        return;
+    }
+
+    if (!s->num_queues || s->num_queues > VIRTIO_QUEUE_MAX) {
+        error_setg(errp, "vhost-user-blk: invalid number of IO queues");
+        return;
+    }
+
+    if (!s->queue_size) {
+        error_setg(errp, "vhost-user-blk: queue size must be non-zero");
+        return;
+    }
+
+    virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
+                sizeof(struct virtio_blk_config));
+
+    for (i = 0; i < s->num_queues; i++) {
+        virtio_add_queue(vdev, s->queue_size,
+                         vhost_user_blk_handle_output);
+    }
+
+    s->dev.nvqs = s->num_queues;
+    vqs = g_new(struct vhost_virtqueue, s->dev.nvqs);
+    s->dev.vqs = vqs;
+    s->dev.vq_index = 0;
+    s->dev.backend_features = 0;
+
+    ret = vhost_dev_init(&s->dev, &s->chardev, VHOST_BACKEND_TYPE_USER, 0);
+    if (ret < 0) {
+        error_setg(errp, "vhost-user-blk: vhost initialization failed: %s",
+                   strerror(-ret));
+        goto virtio_err;
+    }
+
+    ret = vhost_dev_get_config(&s->dev, (uint8_t *)&s->blkcfg,
+                              sizeof(struct virtio_blk_config));
+    if (ret < 0) {
+        error_setg(errp, "vhost-user-blk: get block config failed");
+        goto vhost_err;
+    }
+
+    /* The num-queues property overrides the value the backend reported. */
+    s->blkcfg.num_queues = s->num_queues;
+    vhost_dev_set_config_notifier(&s->dev, &blk_ops);
+    return;
+
+vhost_err:
+    vhost_dev_cleanup(&s->dev);
+virtio_err:
+    g_free(vqs); /* not s->dev.vqs: vhost_dev_cleanup() zeroes s->dev */
+    virtio_cleanup(vdev);
+}
+
+static void vhost_user_blk_device_unrealize(DeviceState *dev, Error **errp)
+{
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VHostUserBlk *s = VHOST_USER_BLK(dev);
+    struct vhost_virtqueue *vqs = s->dev.vqs; /* saved: cleanup zeroes s->dev */
+    vhost_user_blk_set_status(vdev, 0);
+    vhost_dev_cleanup(&s->dev);
+    g_free(vqs);
+    virtio_cleanup(vdev);
+}
+
+static void vhost_user_blk_instance_init(Object *obj)
+{
+    VHostUserBlk *s = VHOST_USER_BLK(obj);
+
+    device_add_bootindex_property(obj, &s->bootindex, "bootindex",
+                                  "/disk@0,0", DEVICE(obj), NULL);
+}
+
+static const VMStateDescription vmstate_vhost_user_blk = {
+    .name = "vhost-user-blk",
+    .minimum_version_id = 1,
+    .version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_VIRTIO_DEVICE,
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+static Property vhost_user_blk_properties[] = {
+    DEFINE_PROP_CHR("chardev", VHostUserBlk, chardev),
+    DEFINE_PROP_UINT16("num-queues", VHostUserBlk, num_queues, 1),
+    DEFINE_PROP_UINT32("queue-size", VHostUserBlk, queue_size, 128),
+    DEFINE_PROP_BIT("config-wce", VHostUserBlk, config_wce, 0, true),
+    DEFINE_PROP_BIT("config-ro", VHostUserBlk, config_ro, 0, false),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_blk_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
+
+    dc->props = vhost_user_blk_properties;
+    dc->vmsd = &vmstate_vhost_user_blk;
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    vdc->realize = vhost_user_blk_device_realize;
+    vdc->unrealize = vhost_user_blk_device_unrealize;
+    vdc->get_config = vhost_user_blk_update_config;
+    vdc->set_config = vhost_user_blk_set_config;
+    vdc->get_features = vhost_user_blk_get_features;
+    vdc->set_status = vhost_user_blk_set_status;
+}
+
+static const TypeInfo vhost_user_blk_info = {
+    .name = TYPE_VHOST_USER_BLK,
+    .parent = TYPE_VIRTIO_DEVICE,
+    .instance_size = sizeof(VHostUserBlk),
+    .instance_init = vhost_user_blk_instance_init,
+    .class_init = vhost_user_blk_class_init,
+};
+
+static void virtio_register_types(void)
+{
+    type_register_static(&vhost_user_blk_info);
+}
+
+type_init(virtio_register_types)
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 6c75cca88a..9ae10f0cdd 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1978,6 +1978,58 @@ static const TypeInfo virtio_blk_pci_info = {
     .class_init    = virtio_blk_pci_class_init,
 };
 
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+/* vhost-user-blk */
+
+static Property vhost_user_blk_pci_properties[] = {
+    DEFINE_PROP_UINT32("class", VirtIOPCIProxy, class_code, 0),
+    DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_blk_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+    VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(vpci_dev);
+    DeviceState *vdev = DEVICE(&dev->vdev);
+    /* Put the embedded device on the proxy's bus, then realize it. */
+    qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus));
+    object_property_set_bool(OBJECT(vdev), true, "realized", errp);
+}
+
+static void vhost_user_blk_pci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+    PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+
+    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+    dc->props = vhost_user_blk_pci_properties;
+    k->realize = vhost_user_blk_pci_realize;
+    pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+    pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_BLOCK;
+    pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
+    pcidev_k->class_id = PCI_CLASS_STORAGE_SCSI;
+}
+
+static void vhost_user_blk_pci_instance_init(Object *obj)
+{
+    VHostUserBlkPCI *dev = VHOST_USER_BLK_PCI(obj);
+
+    virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+                                TYPE_VHOST_USER_BLK);
+    object_property_add_alias(obj, "bootindex", OBJECT(&dev->vdev),
+                              "bootindex", &error_abort);
+}
+
+static const TypeInfo vhost_user_blk_pci_info = {
+    .name           = TYPE_VHOST_USER_BLK_PCI,
+    .parent         = TYPE_VIRTIO_PCI,
+    .instance_size  = sizeof(VHostUserBlkPCI),
+    .instance_init  = vhost_user_blk_pci_instance_init,
+    .class_init     = vhost_user_blk_pci_class_init,
+};
+#endif
+
 /* virtio-scsi-pci */
 
 static Property virtio_scsi_pci_properties[] = {
@@ -2624,6 +2676,9 @@ static void virtio_pci_register_types(void)
     type_register_static(&virtio_9p_pci_info);
 #endif
     type_register_static(&virtio_blk_pci_info);
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+    type_register_static(&vhost_user_blk_pci_info);
+#endif
     type_register_static(&virtio_scsi_pci_info);
     type_register_static(&virtio_balloon_pci_info);
     type_register_static(&virtio_serial_pci_info);
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 12d3a90686..813082b0d7 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -27,6 +27,9 @@
 #include "hw/virtio/virtio-gpu.h"
 #include "hw/virtio/virtio-crypto.h"
 #include "hw/virtio/vhost-user-scsi.h"
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+#include "hw/virtio/vhost-user-blk.h"
+#endif
 
 #ifdef CONFIG_VIRTFS
 #include "hw/9pfs/virtio-9p.h"
@@ -46,6 +49,7 @@ typedef struct VirtIOSerialPCI VirtIOSerialPCI;
 typedef struct VirtIONetPCI VirtIONetPCI;
 typedef struct VHostSCSIPCI VHostSCSIPCI;
 typedef struct VHostUserSCSIPCI VHostUserSCSIPCI;
+typedef struct VHostUserBlkPCI VHostUserBlkPCI;
 typedef struct VirtIORngPCI VirtIORngPCI;
 typedef struct VirtIOInputPCI VirtIOInputPCI;
 typedef struct VirtIOInputHIDPCI VirtIOInputHIDPCI;
@@ -244,6 +248,20 @@ struct VHostUserSCSIPCI {
     VHostUserSCSI vdev;
 };
 
+#if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
+/*
+ * vhost-user-blk-pci: This extends VirtioPCIProxy.
+ */
+#define TYPE_VHOST_USER_BLK_PCI "vhost-user-blk-pci"
+#define VHOST_USER_BLK_PCI(obj) \
+        OBJECT_CHECK(VHostUserBlkPCI, (obj), TYPE_VHOST_USER_BLK_PCI)
+
+struct VHostUserBlkPCI {
+    VirtIOPCIProxy parent_obj;
+    VHostUserBlk vdev;
+};
+#endif
+
 /*
  * virtio-blk-pci: This extends VirtioPCIProxy.
  */
diff --git a/include/hw/virtio/vhost-user-blk.h b/include/hw/virtio/vhost-user-blk.h
new file mode 100644
index 0000000000..5804cc904a
--- /dev/null
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -0,0 +1,41 @@
+/*
+ * vhost-user-blk host device
+ * Copyright(C) 2017 Intel Corporation.
+ *
+ * Authors:
+ *  Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * Based on vhost-scsi.h, Copyright IBM, Corp. 2011
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef VHOST_USER_BLK_H
+#define VHOST_USER_BLK_H
+
+#include "standard-headers/linux/virtio_blk.h"
+#include "qemu-common.h"
+#include "hw/qdev.h"
+#include "hw/block/block.h"
+#include "chardev/char-fe.h"
+#include "hw/virtio/vhost.h"
+
+#define TYPE_VHOST_USER_BLK "vhost-user-blk"
+#define VHOST_USER_BLK(obj) \
+        OBJECT_CHECK(VHostUserBlk, (obj), TYPE_VHOST_USER_BLK)
+
+typedef struct VHostUserBlk {
+    VirtIODevice parent_obj;
+    CharBackend chardev;
+    int32_t bootindex;
+    struct virtio_blk_config blkcfg;
+    uint16_t num_queues;
+    uint32_t queue_size;
+    uint32_t config_wce;
+    uint32_t config_ro;
+    struct vhost_dev dev;
+} VHostUserBlk;
+
+#endif

From 0bc24d831e4979cce8f1ab28ae36c20597b92edc Mon Sep 17 00:00:00 2001
From: Changpeng Liu <changpeng.liu@intel.com>
Date: Thu, 4 Jan 2018 09:53:33 +0800
Subject: [PATCH 04/29] contrib/libvhost-user: enable virtio config space
 messages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enable the VHOST_USER_GET_CONFIG/VHOST_USER_SET_CONFIG messages in the
libvhost-user library so that users can implement their own I/O target
based on the library. This enables the virtio config space to be
delivered between the QEMU host device and the I/O target.

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 contrib/libvhost-user/libvhost-user.c | 42 +++++++++++++++++++++++++++
 contrib/libvhost-user/libvhost-user.h | 33 +++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index f409bd3d41..27cc59791b 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -84,6 +84,8 @@ vu_request_to_string(unsigned int req)
         REQ(VHOST_USER_SET_SLAVE_REQ_FD),
         REQ(VHOST_USER_IOTLB_MSG),
         REQ(VHOST_USER_SET_VRING_ENDIAN),
+        REQ(VHOST_USER_GET_CONFIG),
+        REQ(VHOST_USER_SET_CONFIG),
         REQ(VHOST_USER_MAX),
     };
 #undef REQ
@@ -797,6 +799,42 @@ vu_set_slave_req_fd(VuDev *dev, VhostUserMsg *vmsg)
     return false;
 }
 
+static bool
+vu_get_config(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int ret = -1;
+
+    if (dev->iface->get_config) {
+        ret = dev->iface->get_config(dev, vmsg->payload.config.region,
+                                     vmsg->payload.config.size);
+    }
+
+    if (ret) {
+        /* resize to zero to indicate an error to master */
+        vmsg->size = 0;
+    }
+
+    return true;
+}
+
+static bool
+vu_set_config(VuDev *dev, VhostUserMsg *vmsg)
+{
+    int ret = -1;
+
+    if (dev->iface->set_config) {
+        ret = dev->iface->set_config(dev, vmsg->payload.config.region,
+                                     vmsg->payload.config.offset,
+                                     vmsg->payload.config.size,
+                                     vmsg->payload.config.flags);
+        if (ret) {
+            vu_panic(dev, "Set virtio configuration space failed");
+        }
+    }
+
+    return false;
+}
+
 static bool
 vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
 {
@@ -862,6 +900,10 @@ vu_process_message(VuDev *dev, VhostUserMsg *vmsg)
         return vu_set_vring_enable_exec(dev, vmsg);
     case VHOST_USER_SET_SLAVE_REQ_FD:
         return vu_set_slave_req_fd(dev, vmsg);
+    case VHOST_USER_GET_CONFIG:
+        return vu_get_config(dev, vmsg);
+    case VHOST_USER_SET_CONFIG:
+        return vu_set_config(dev, vmsg);
     case VHOST_USER_NONE:
         break;
     default:
diff --git a/contrib/libvhost-user/libvhost-user.h b/contrib/libvhost-user/libvhost-user.h
index 2f5864b5c4..f8a730b725 100644
--- a/contrib/libvhost-user/libvhost-user.h
+++ b/contrib/libvhost-user/libvhost-user.h
@@ -30,6 +30,16 @@
 
 #define VHOST_MEMORY_MAX_NREGIONS 8
 
+typedef enum VhostSetConfigType {
+    VHOST_SET_CONFIG_TYPE_MASTER = 0,
+    VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
+} VhostSetConfigType;
+
+/*
+ * Maximum size of virtio device config space
+ */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
 enum VhostUserProtocolFeature {
     VHOST_USER_PROTOCOL_F_MQ = 0,
     VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
@@ -69,6 +79,8 @@ typedef enum VhostUserRequest {
     VHOST_USER_SET_SLAVE_REQ_FD = 21,
     VHOST_USER_IOTLB_MSG = 22,
     VHOST_USER_SET_VRING_ENDIAN = 23,
+    VHOST_USER_GET_CONFIG = 24,
+    VHOST_USER_SET_CONFIG = 25,
     VHOST_USER_MAX
 } VhostUserRequest;
 
@@ -90,6 +102,18 @@ typedef struct VhostUserLog {
     uint64_t mmap_offset;
 } VhostUserLog;
 
+typedef struct VhostUserConfig {
+    uint32_t offset;
+    uint32_t size;
+    uint32_t flags;
+    uint8_t region[VHOST_USER_MAX_CONFIG_SIZE];
+} VhostUserConfig;
+
+static VhostUserConfig c __attribute__ ((unused));
+#define VHOST_USER_CONFIG_HDR_SIZE (sizeof(c.offset) \
+                                   + sizeof(c.size) \
+                                   + sizeof(c.flags))
+
 #if defined(_WIN32)
 # define VU_PACKED __attribute__((gcc_struct, packed))
 #else
@@ -112,6 +136,7 @@ typedef struct VhostUserMsg {
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
         VhostUserLog log;
+        VhostUserConfig config;
     } payload;
 
     int fds[VHOST_MEMORY_MAX_NREGIONS];
@@ -140,6 +165,10 @@ typedef int (*vu_process_msg_cb) (VuDev *dev, VhostUserMsg *vmsg,
                                   int *do_reply);
 typedef void (*vu_queue_set_started_cb) (VuDev *dev, int qidx, bool started);
 typedef bool (*vu_queue_is_processed_in_order_cb) (VuDev *dev, int qidx);
+typedef int (*vu_get_config_cb) (VuDev *dev, uint8_t *config, uint32_t len);
+typedef int (*vu_set_config_cb) (VuDev *dev, const uint8_t *data,
+                                 uint32_t offset, uint32_t size,
+                                 uint32_t flags);
 
 typedef struct VuDevIface {
     /* called by VHOST_USER_GET_FEATURES to get the features bitmask */
@@ -162,6 +191,10 @@ typedef struct VuDevIface {
      * on unmanaged exit/crash.
      */
     vu_queue_is_processed_in_order_cb queue_is_processed_in_order;
+    /* get the config space of the device */
+    vu_get_config_cb get_config;
+    /* set the config space of the device */
+    vu_set_config_cb set_config;
 } VuDevIface;
 
 typedef void (*vu_queue_handler_cb) (VuDev *dev, int qidx);

From 406d2aa2cc0770526081da00780ed2124cff1654 Mon Sep 17 00:00:00 2001
From: Changpeng Liu <changpeng.liu@intel.com>
Date: Thu, 4 Jan 2018 09:53:34 +0800
Subject: [PATCH 05/29] contrib/vhost-user-blk: introduce a vhost-user-blk
 sample application
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit introduces a vhost-user-blk backend device, it uses UNIX
domain socket to communicate with QEMU. The vhost-user-blk sample
application should be used with QEMU vhost-user-blk-pci device.

To use it, compile with:
make vhost-user-blk

and start like this:
vhost-user-blk -b /dev/sdb -s /path/vhost.socket

Signed-off-by: Changpeng Liu <changpeng.liu@intel.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 .gitignore                              |   1 +
 Makefile                                |   3 +
 Makefile.objs                           |   1 +
 contrib/vhost-user-blk/Makefile.objs    |   1 +
 contrib/vhost-user-blk/vhost-user-blk.c | 545 ++++++++++++++++++++++++
 5 files changed, 551 insertions(+)
 create mode 100644 contrib/vhost-user-blk/Makefile.objs
 create mode 100644 contrib/vhost-user-blk/vhost-user-blk.c

diff --git a/.gitignore b/.gitignore
index 433f64f429..704b22285d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,6 +54,7 @@
 /module_block.h
 /scsi/qemu-pr-helper
 /vhost-user-scsi
+/vhost-user-blk
 /fsdev/virtfs-proxy-helper
 *.tmp
 *.[1-9]
diff --git a/Makefile b/Makefile
index f26ef1b1df..d835bb92e7 100644
--- a/Makefile
+++ b/Makefile
@@ -334,6 +334,7 @@ dummy := $(call unnest-vars,, \
                 ivshmem-server-obj-y \
                 libvhost-user-obj-y \
                 vhost-user-scsi-obj-y \
+                vhost-user-blk-obj-y \
                 qga-vss-dll-obj-y \
                 block-obj-y \
                 block-obj-m \
@@ -565,6 +566,8 @@ ivshmem-server$(EXESUF): $(ivshmem-server-obj-y) $(COMMON_LDADDS)
 endif
 vhost-user-scsi$(EXESUF): $(vhost-user-scsi-obj-y) libvhost-user.a
 	$(call LINK, $^)
+vhost-user-blk$(EXESUF): $(vhost-user-blk-obj-y) libvhost-user.a
+	$(call LINK, $^)
 
 module_block.h: $(SRC_PATH)/scripts/modules/module_block.py config-host.mak
 	$(call quiet-command,$(PYTHON) $< $@ \
diff --git a/Makefile.objs b/Makefile.objs
index c8b1bba593..669d8d684d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -115,6 +115,7 @@ libvhost-user-obj-y = contrib/libvhost-user/
 vhost-user-scsi.o-cflags := $(LIBISCSI_CFLAGS)
 vhost-user-scsi.o-libs := $(LIBISCSI_LIBS)
 vhost-user-scsi-obj-y = contrib/vhost-user-scsi/
+vhost-user-blk-obj-y = contrib/vhost-user-blk/
 
 ######################################################################
 trace-events-subdirs =
diff --git a/contrib/vhost-user-blk/Makefile.objs b/contrib/vhost-user-blk/Makefile.objs
new file mode 100644
index 0000000000..72e2cdc3ad
--- /dev/null
+++ b/contrib/vhost-user-blk/Makefile.objs
@@ -0,0 +1 @@
+vhost-user-blk-obj-y = vhost-user-blk.o
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
new file mode 100644
index 0000000000..67dac8155a
--- /dev/null
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -0,0 +1,545 @@
+/*
+ * vhost-user-blk sample application
+ *
+ * Copyright (c) 2017 Intel Corporation. All rights reserved.
+ *
+ * Author:
+ *  Changpeng Liu <changpeng.liu@intel.com>
+ *
+ * This work is based on the "vhost-user-scsi" sample and "virtio-blk" driver
+ * implementation by:
+ *  Felipe Franciosi <felipe@nutanix.com>
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 only.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "standard-headers/linux/virtio_blk.h"
+#include "contrib/libvhost-user/libvhost-user-glib.h"
+#include "contrib/libvhost-user/libvhost-user.h"
+
+#include <glib.h>
+
+struct virtio_blk_inhdr {
+    unsigned char status;
+};
+
+/* vhost user block device */
+typedef struct VubDev {
+    VugDev parent;
+    int blk_fd;
+    struct virtio_blk_config blkcfg;
+    char *blk_name;
+    GMainLoop *loop;
+} VubDev;
+
+typedef struct VubReq {
+    VuVirtqElement *elem;
+    int64_t sector_num;
+    size_t size;
+    struct virtio_blk_inhdr *in;
+    struct virtio_blk_outhdr *out;
+    VubDev *vdev_blk;
+    struct VuVirtq *vq;
+} VubReq;
+
+/* refer util/iov.c */
+static size_t vub_iov_size(const struct iovec *iov,
+                              const unsigned int iov_cnt)
+{
+    size_t len;
+    unsigned int i;
+
+    len = 0;
+    for (i = 0; i < iov_cnt; i++) {
+        len += iov[i].iov_len;
+    }
+    return len;
+}
+
+static void vub_panic_cb(VuDev *vu_dev, const char *buf)
+{
+    VugDev *gdev;
+    VubDev *vdev_blk;
+
+    assert(vu_dev);
+
+    gdev = container_of(vu_dev, VugDev, parent);
+    vdev_blk = container_of(gdev, VubDev, parent);
+    if (buf) {
+        g_warning("vu_panic: %s", buf);
+    }
+
+    g_main_loop_quit(vdev_blk->loop);
+}
+
+static void vub_req_complete(VubReq *req)
+{
+    VugDev *gdev = &req->vdev_blk->parent;
+    VuDev *vu_dev = &gdev->parent;
+
+    /* IO size with 1 extra status byte */
+    vu_queue_push(vu_dev, req->vq, req->elem,
+                  req->size + 1);
+    vu_queue_notify(vu_dev, req->vq);
+
+    if (req->elem) {
+        free(req->elem);
+    }
+
+    g_free(req);
+}
+
+static int vub_open(const char *file_name, bool wce)
+{
+    int fd;
+    int flags = O_RDWR;
+
+    if (!wce) {
+        flags |= O_DIRECT;
+    }
+
+    fd = open(file_name, flags);
+    if (fd < 0) {
+        fprintf(stderr, "Cannot open file %s, %s\n", file_name,
+                strerror(errno));
+        return -1;
+    }
+
+    return fd;
+}
+
+static ssize_t
+vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt)
+{
+    VubDev *vdev_blk = req->vdev_blk;
+    ssize_t rc;
+
+    if (!iovcnt) {
+        fprintf(stderr, "Invalid Read IOV count\n");
+        return -1;
+    }
+    /* req->size is the summed iovec length, not the byte count read */
+    req->size = vub_iov_size(iov, iovcnt);
+    rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
+    if (rc < 0) {
+        fprintf(stderr, "%s, Sector %"PRId64", Size %zu failed with %s\n",
+                vdev_blk->blk_name, req->sector_num, req->size,
+                strerror(errno));
+        return -1;
+    }
+
+    return rc;
+}
+
+static ssize_t
+vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt)
+{
+    VubDev *vdev_blk = req->vdev_blk;
+    ssize_t rc;
+
+    if (!iovcnt) {
+        fprintf(stderr, "Invalid Write IOV count\n");
+        return -1;
+    }
+    /* req->size is the summed iovec length, not the byte count written */
+    req->size = vub_iov_size(iov, iovcnt);
+    rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
+    if (rc < 0) {
+        fprintf(stderr, "%s, Sector %"PRId64", Size %zu failed with %s\n",
+                vdev_blk->blk_name, req->sector_num, req->size,
+                strerror(errno));
+        return -1;
+    }
+
+    return rc;
+}
+
+static void
+vub_flush(VubReq *req)
+{
+    VubDev *vdev_blk = req->vdev_blk;
+
+    fdatasync(vdev_blk->blk_fd);
+}
+
+static int vub_virtio_process_req(VubDev *vdev_blk,
+                                     VuVirtq *vq)
+{
+    VugDev *gdev = &vdev_blk->parent;
+    VuDev *vu_dev = &gdev->parent;
+    VuVirtqElement *elem;
+    uint32_t type;
+    unsigned in_num;
+    unsigned out_num;
+    VubReq *req;
+
+    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
+    if (!elem) {
+        return -1;
+    }
+
+    /* refer to hw/block/virtio_blk.c */
+    if (elem->out_num < 1 || elem->in_num < 1) {
+        fprintf(stderr, "virtio-blk request missing headers\n");
+        free(elem);
+        return -1;
+    }
+
+    req = g_new0(VubReq, 1);
+    req->vdev_blk = vdev_blk;
+    req->vq = vq;
+    req->elem = elem;
+
+    in_num = elem->in_num;
+    out_num = elem->out_num;
+
+    /* don't support VIRTIO_F_ANY_LAYOUT and virtio 1.0 only */
+    if (elem->out_sg[0].iov_len < sizeof(struct virtio_blk_outhdr)) {
+        fprintf(stderr, "Invalid outhdr size\n");
+        goto err;
+    }
+    req->out = (struct virtio_blk_outhdr *)elem->out_sg[0].iov_base;
+    out_num--;
+
+    if (elem->in_sg[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+        fprintf(stderr, "Invalid inhdr size\n");
+        goto err;
+    }
+    req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
+    in_num--;
+
+    type = le32toh(req->out->type);
+    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
+        case VIRTIO_BLK_T_IN: {
+            ssize_t ret = 0;
+            bool is_write = type & VIRTIO_BLK_T_OUT;
+            req->sector_num = le64toh(req->out->sector);
+            if (is_write) {
+                ret  = vub_writev(req, &elem->out_sg[1], out_num);
+            } else {
+                ret = vub_readv(req, &elem->in_sg[0], in_num);
+            }
+            if (ret >= 0) {
+                req->in->status = VIRTIO_BLK_S_OK;
+            } else {
+                req->in->status = VIRTIO_BLK_S_IOERR;
+            }
+            vub_req_complete(req);
+            break;
+        }
+        case VIRTIO_BLK_T_FLUSH: {
+            vub_flush(req);
+            req->in->status = VIRTIO_BLK_S_OK;
+            vub_req_complete(req);
+            break;
+        }
+        case VIRTIO_BLK_T_GET_ID: {
+            size_t size = MIN(vub_iov_size(&elem->in_sg[0], in_num),
+                              VIRTIO_BLK_ID_BYTES);
+            snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
+            req->in->status = VIRTIO_BLK_S_OK;
+            req->size = elem->in_sg[0].iov_len;
+            vub_req_complete(req);
+            break;
+        }
+        default: {
+            req->in->status = VIRTIO_BLK_S_UNSUPP;
+            vub_req_complete(req);
+            break;
+        }
+    }
+
+    return 0;
+
+err:
+    free(elem);
+    g_free(req);
+    return -1;
+}
+
+static void vub_process_vq(VuDev *vu_dev, int idx)
+{
+    VugDev *gdev;
+    VubDev *vdev_blk;
+    VuVirtq *vq;
+    int ret;
+
+    if ((idx < 0) || (idx >= VHOST_MAX_NR_VIRTQUEUE)) {
+        fprintf(stderr, "VQ Index out of range: %d\n", idx);
+        vub_panic_cb(vu_dev, NULL);
+        return;
+    }
+
+    gdev = container_of(vu_dev, VugDev, parent);
+    vdev_blk = container_of(gdev, VubDev, parent);
+    assert(vdev_blk);
+
+    vq = vu_get_queue(vu_dev, idx);
+    assert(vq);
+
+    while (1) {
+        ret = vub_virtio_process_req(vdev_blk, vq);
+        if (ret) {
+            break;
+        }
+    }
+}
+
+static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
+{
+    VuVirtq *vq;
+
+    assert(vu_dev);
+
+    vq = vu_get_queue(vu_dev, idx);
+    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
+}
+
+static uint64_t
+vub_get_features(VuDev *dev)
+{
+    return 1ull << VIRTIO_BLK_F_SIZE_MAX |
+           1ull << VIRTIO_BLK_F_SEG_MAX |
+           1ull << VIRTIO_BLK_F_TOPOLOGY |
+           1ull << VIRTIO_BLK_F_BLK_SIZE |
+           1ull << VIRTIO_BLK_F_FLUSH |
+           1ull << VIRTIO_BLK_F_CONFIG_WCE |
+           1ull << VIRTIO_F_VERSION_1 |
+           1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+}
+
+static int
+vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
+{
+    VugDev *gdev = container_of(vu_dev, VugDev, parent);
+    VubDev *vdev_blk = container_of(gdev, VubDev, parent);
+    /* reject requests that would read past the end of blkcfg */
+    if (len > sizeof(vdev_blk->blkcfg)) {
+        return -1;
+    }
+    memcpy(config, &vdev_blk->blkcfg, len);
+    return 0;
+}
+
+static int
+vub_set_config(VuDev *vu_dev, const uint8_t *data,
+               uint32_t offset, uint32_t size, uint32_t flags)
+{
+    VugDev *gdev;
+    VubDev *vdev_blk;
+    uint8_t wce;
+    int fd;
+
+    /* don't support live migration */
+    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
+        return -1;
+    }
+
+    gdev = container_of(vu_dev, VugDev, parent);
+    vdev_blk = container_of(gdev, VubDev, parent);
+
+    if (offset != offsetof(struct virtio_blk_config, wce) ||
+        size != 1) {
+        return -1;
+    }
+
+    wce = *data;
+    if (wce == vdev_blk->blkcfg.wce) {
+        /* Do nothing as same with old configuration */
+        return 0;
+    }
+
+    vdev_blk->blkcfg.wce = wce;
+    fprintf(stdout, "Write Cache Policy Changed\n");
+    if (vdev_blk->blk_fd >= 0) {
+        close(vdev_blk->blk_fd);
+        vdev_blk->blk_fd = -1;
+    }
+
+    fd = vub_open(vdev_blk->blk_name, wce);
+    if (fd < 0) {
+        fprintf(stderr, "Error to open block device %s\n", vdev_blk->blk_name);
+        vdev_blk->blk_fd = -1;
+        return -1;
+    }
+    vdev_blk->blk_fd = fd;
+
+    return 0;
+}
+
+static const VuDevIface vub_iface = {
+    .get_features = vub_get_features,
+    .queue_set_started = vub_queue_set_started,
+    .get_config = vub_get_config,
+    .set_config = vub_set_config,
+};
+
+static int unix_sock_new(char *unix_fn)
+{
+    int sock;
+    struct sockaddr_un un;
+    size_t len;
+
+    assert(unix_fn);
+    /* socket() returns -1 on failure; 0 is a valid descriptor */
+    sock = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (sock < 0) {
+        perror("socket");
+        return -1;
+    }
+
+    un.sun_family = AF_UNIX;
+    (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn);
+    len = sizeof(un.sun_family) + strlen(un.sun_path);
+
+    (void)unlink(unix_fn);
+    if (bind(sock, (struct sockaddr *)&un, len) < 0) {
+        perror("bind");
+        goto fail;
+    }
+
+    if (listen(sock, 1) < 0) {
+        perror("listen");
+        goto fail;
+    }
+
+    return sock;
+
+fail:
+    (void)close(sock);
+
+    return -1;
+}
+
+static void vub_free(struct VubDev *vdev_blk)
+{
+    if (!vdev_blk) {
+        return;
+    }
+
+    g_main_loop_unref(vdev_blk->loop);
+    if (vdev_blk->blk_fd >= 0) {
+        close(vdev_blk->blk_fd);
+    }
+    g_free(vdev_blk);
+}
+
+static uint32_t
+vub_get_blocksize(int fd)
+{
+    uint32_t blocksize = 512;
+    /* 512 stays in effect when BLKSSZGET is unavailable or the ioctl fails */
+#if defined(__linux__) && defined(BLKSSZGET)
+    if (ioctl(fd, BLKSSZGET, &blocksize) == 0) {
+        return blocksize;
+    }
+#endif
+
+    return blocksize;
+}
+
+static void
+vub_initialize_config(int fd, struct virtio_blk_config *config)
+{
+    off64_t capacity;
+
+    capacity = lseek64(fd, 0, SEEK_END);
+    config->capacity = capacity >> 9;
+    config->blk_size = vub_get_blocksize(fd);
+    config->size_max = 65536;
+    config->seg_max = 128 - 2;
+    config->min_io_size = 1;
+    config->opt_io_size = 1;
+    config->num_queues = 1;
+}
+
+static VubDev *
+vub_new(char *blk_file)
+{
+    VubDev *vdev_blk;
+
+    vdev_blk = g_new0(VubDev, 1);
+    vdev_blk->loop = g_main_loop_new(NULL, FALSE);
+    vdev_blk->blk_fd = vub_open(blk_file, 0);
+    if (vdev_blk->blk_fd  < 0) {
+        fprintf(stderr, "Error to open block device %s\n", blk_file);
+        vub_free(vdev_blk);
+        return NULL;
+    }
+    vdev_blk->blkcfg.wce = 0;
+    vdev_blk->blk_name = blk_file;
+
+    /* fill virtio_blk_config with block parameters */
+    vub_initialize_config(vdev_blk->blk_fd, &vdev_blk->blkcfg);
+
+    return vdev_blk;
+}
+
+int main(int argc, char **argv)
+{
+    int opt;
+    char *unix_socket = NULL;
+    char *blk_file = NULL;
+    int lsock = -1, csock = -1, ret = -1;
+    VubDev *vdev_blk = NULL;
+
+    while ((opt = getopt(argc, argv, "b:s:h")) != -1) {
+        switch (opt) {
+        case 'b':
+            blk_file = g_strdup(optarg);
+            break;
+        case 's':
+            unix_socket = g_strdup(optarg);
+            break;
+        case 'h':
+        default:
+            printf("Usage: %s [-b block device or file, -s UNIX domain socket]"
+                   " | [ -h ]\n", argv[0]);
+            return 0;
+        }
+    }
+
+    if (!unix_socket || !blk_file) {
+        printf("Usage: %s [-b block device or file, -s UNIX domain socket] |"
+               " [ -h ]\n", argv[0]);
+        return -1;
+    }
+
+    lsock = unix_sock_new(unix_socket);
+    if (lsock < 0) {
+        goto err;
+    }
+
+    csock = accept(lsock, (void *)0, (void *)0);
+    if (csock < 0) {
+        fprintf(stderr, "Accept error %s\n", strerror(errno));
+        goto err;
+    }
+
+    vdev_blk = vub_new(blk_file);
+    if (!vdev_blk) {
+        goto err;
+    }
+
+    vug_init(&vdev_blk->parent, csock, vub_panic_cb, &vub_iface);
+
+    g_main_loop_run(vdev_blk->loop);
+    vug_deinit(&vdev_blk->parent);
+    ret = 0; /* setup succeeded and the main loop has returned */
+
+err:
+    vub_free(vdev_blk);
+    if (csock >= 0) {
+        close(csock);
+    }
+    if (lsock >= 0) {
+        close(lsock);
+    }
+    g_free(unix_socket);
+    g_free(blk_file);
+
+    return ret;
+}

From f87d72f5c5bff0837d409a56bd34f439a90119ca Mon Sep 17 00:00:00 2001
From: Gal Hammer <ghammer@redhat.com>
Date: Sun, 14 Jan 2018 12:06:54 +0200
Subject: [PATCH 06/29] qemu: add a cleanup callback function to EventNotifier

Add a cleanup callback function to the EventNotifier struct,
which allows users to execute event_notifier_cleanup in a
different context.

Signed-off-by: Gal Hammer <ghammer@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/qemu/event_notifier.h | 1 +
 util/event_notifier-posix.c   | 5 ++++-
 util/event_notifier-win32.c   | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/qemu/event_notifier.h b/include/qemu/event_notifier.h
index 599c99f1a5..b30a45474f 100644
--- a/include/qemu/event_notifier.h
+++ b/include/qemu/event_notifier.h
@@ -26,6 +26,7 @@ struct EventNotifier {
     int rfd;
     int wfd;
 #endif
+    void (*cleanup)(EventNotifier *);
 };
 
 typedef void EventNotifierHandler(EventNotifier *);
diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c
index 73c4046b58..652566634a 100644
--- a/util/event_notifier-posix.c
+++ b/util/event_notifier-posix.c
@@ -29,6 +29,7 @@ void event_notifier_init_fd(EventNotifier *e, int fd)
 {
     e->rfd = fd;
     e->wfd = fd;
+    e->cleanup = NULL;
 }
 #endif
 
@@ -65,6 +66,7 @@ int event_notifier_init(EventNotifier *e, int active)
         e->rfd = fds[0];
         e->wfd = fds[1];
     }
+    e->cleanup = NULL;
     if (active) {
         event_notifier_set(e);
     }
@@ -80,10 +82,11 @@ void event_notifier_cleanup(EventNotifier *e)
 {
     if (e->rfd != e->wfd) {
         close(e->rfd);
-        e->rfd = -1;
     }
     close(e->wfd);
+    e->rfd = -1;
     e->wfd = -1;
+    e->cleanup = NULL;
 }
 
 int event_notifier_get_fd(const EventNotifier *e)
diff --git a/util/event_notifier-win32.c b/util/event_notifier-win32.c
index 62c53b0a99..eff86701ad 100644
--- a/util/event_notifier-win32.c
+++ b/util/event_notifier-win32.c
@@ -19,6 +19,7 @@ int event_notifier_init(EventNotifier *e, int active)
 {
     e->event = CreateEvent(NULL, TRUE, FALSE, NULL);
     assert(e->event);
+    e->cleanup = NULL;
     return 0;
 }
 
@@ -26,6 +27,7 @@ void event_notifier_cleanup(EventNotifier *e)
 {
     CloseHandle(e->event);
     e->event = NULL;
+    e->cleanup = NULL;
 }
 
 HANDLE event_notifier_get_handle(EventNotifier *e)

From 4fe6d78b2e241f41208dfb07605aace4becfc747 Mon Sep 17 00:00:00 2001
From: Gal Hammer <ghammer@redhat.com>
Date: Sun, 14 Jan 2018 12:06:55 +0200
Subject: [PATCH 07/29] virtio: postpone the execution of
 event_notifier_cleanup function

Use the EventNotifier's cleanup callback function to execute the
event_notifier_cleanup function after kvm unregistered the eventfd.

This change supports running the virtio_bus_set_host_notifier
function inside a memory region transaction. Otherwise, a closed
fd is sent to kvm, which results in a failure.

Signed-off-by: Gal Hammer <ghammer@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 accel/kvm/kvm-all.c    |  4 ++++
 hw/virtio/virtio-bus.c | 19 +++++++++++--------
 2 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f290f487a5..071f4f57c0 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -812,6 +812,10 @@ static void kvm_mem_ioeventfd_del(MemoryListener *listener,
     if (r < 0) {
         abort();
     }
+
+    if (e->cleanup) {
+        e->cleanup(e);
+    }
 }
 
 static void kvm_io_ioeventfd_add(MemoryListener *listener,
diff --git a/hw/virtio/virtio-bus.c b/hw/virtio/virtio-bus.c
index 3042232daf..8106346927 100644
--- a/hw/virtio/virtio-bus.c
+++ b/hw/virtio/virtio-bus.c
@@ -256,6 +256,15 @@ bool virtio_bus_ioeventfd_enabled(VirtioBusState *bus)
     return k->ioeventfd_assign && k->ioeventfd_enabled(proxy);
 }
 
+static void virtio_bus_cleanup_event_notifier(EventNotifier *notifier)
+{
+    /* Test and clear notifier after disabling event,
+     * in case poll callback didn't have time to run.
+     */
+    virtio_queue_host_notifier_read(notifier);
+    event_notifier_cleanup(notifier);
+}
+
 /*
  * This function switches ioeventfd on/off in the device.
  * The caller must set or clear the handlers for the EventNotifier.
@@ -283,19 +292,13 @@ int virtio_bus_set_host_notifier(VirtioBusState *bus, int n, bool assign)
         r = k->ioeventfd_assign(proxy, notifier, n, true);
         if (r < 0) {
             error_report("%s: unable to assign ioeventfd: %d", __func__, r);
-            goto cleanup_event_notifier;
+            virtio_bus_cleanup_event_notifier(notifier);
         }
-        return 0;
     } else {
+        notifier->cleanup = virtio_bus_cleanup_event_notifier;
         k->ioeventfd_assign(proxy, notifier, n, false);
     }
 
-cleanup_event_notifier:
-    /* Test and clear notifier after disabling event,
-     * in case poll callback didn't have time to run.
-     */
-    virtio_queue_host_notifier_read(notifier);
-    event_notifier_cleanup(notifier);
     return r;
 }
 

From 6f0bb230722931d17fb284eee8efd40b9d653822 Mon Sep 17 00:00:00 2001
From: Gal Hammer <ghammer@redhat.com>
Date: Sun, 14 Jan 2018 12:06:56 +0200
Subject: [PATCH 08/29] virtio: improve virtio devices initialization time

The loading time of a VM is quite significant when its virtio
devices use a large number of virt-queues (e.g. a virtio-serial
device with max_ports=511). Most of the time is spent in the
creation of all the required event notifiers (ioeventfd and memory
regions).

This patch packs all the changes to the memory regions in a
single memory transaction.

Reported-by: Sitong Liu <siliu@redhat.com>
Reported-by: Xiaoling Gao <xiagao@redhat.com>
Signed-off-by: Gal Hammer <ghammer@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/virtio.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index d6002ee550..3ac3491bee 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2574,6 +2574,7 @@ static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev)
     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
     int n, r, err;
 
+    memory_region_transaction_begin();
     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
         VirtQueue *vq = &vdev->vq[n];
         if (!virtio_queue_get_num(vdev, n)) {
@@ -2596,6 +2597,7 @@ static int virtio_device_start_ioeventfd_impl(VirtIODevice *vdev)
         }
         event_notifier_set(&vq->host_notifier);
     }
+    memory_region_transaction_commit();
     return 0;
 
 assign_error:
@@ -2609,6 +2611,7 @@ assign_error:
         r = virtio_bus_set_host_notifier(qbus, n, false);
         assert(r >= 0);
     }
+    memory_region_transaction_commit();
     return err;
 }
 
@@ -2625,6 +2628,7 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
     VirtioBusState *qbus = VIRTIO_BUS(qdev_get_parent_bus(DEVICE(vdev)));
     int n, r;
 
+    memory_region_transaction_begin();
     for (n = 0; n < VIRTIO_QUEUE_MAX; n++) {
         VirtQueue *vq = &vdev->vq[n];
 
@@ -2635,6 +2639,7 @@ static void virtio_device_stop_ioeventfd_impl(VirtIODevice *vdev)
         r = virtio_bus_set_host_notifier(qbus, n, false);
         assert(r >= 0);
     }
+    memory_region_transaction_commit();
 }
 
 void virtio_device_stop_ioeventfd(VirtIODevice *vdev)

From 37e626cedae08288f73f2356530a0bd5f045c8b9 Mon Sep 17 00:00:00 2001
From: Yuval Shaia <yuval.shaia@oracle.com>
Date: Sun, 14 Jan 2018 11:01:43 +0200
Subject: [PATCH 09/29] pci/shpc: Move function to generic header file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function should be declared in a generic header file so that
it can be reused by other code.

Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci/shpc.c             | 13 ++-----------
 include/qemu/host-utils.h | 10 ++++++++++
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/hw/pci/shpc.c b/hw/pci/shpc.c
index 69fc14b218..a8462d48bb 100644
--- a/hw/pci/shpc.c
+++ b/hw/pci/shpc.c
@@ -1,6 +1,7 @@
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qemu-common.h"
+#include "qemu/host-utils.h"
 #include "qemu/range.h"
 #include "qemu/error-report.h"
 #include "hw/pci/shpc.h"
@@ -122,16 +123,6 @@
 #define SHPC_PCI_TO_IDX(pci_slot) ((pci_slot) - 1)
 #define SHPC_IDX_TO_PHYSICAL(slot) ((slot) + 1)
 
-static int roundup_pow_of_two(int x)
-{
-    x |= (x >> 1);
-    x |= (x >> 2);
-    x |= (x >> 4);
-    x |= (x >> 8);
-    x |= (x >> 16);
-    return x + 1;
-}
-
 static uint16_t shpc_get_status(SHPCDevice *shpc, int slot, uint16_t msk)
 {
     uint8_t *status = shpc->config + SHPC_SLOT_STATUS(slot);
@@ -656,7 +647,7 @@ int shpc_init(PCIDevice *d, PCIBus *sec_bus, MemoryRegion *bar,
 
 int shpc_bar_size(PCIDevice *d)
 {
-    return roundup_pow_of_two(SHPC_SLOT_REG(SHPC_MAX_SLOTS));
+    return pow2roundup32(SHPC_SLOT_REG(SHPC_MAX_SLOTS));
 }
 
 void shpc_cleanup(PCIDevice *d, MemoryRegion *bar)
diff --git a/include/qemu/host-utils.h b/include/qemu/host-utils.h
index 5ac621cf1f..38da849be9 100644
--- a/include/qemu/host-utils.h
+++ b/include/qemu/host-utils.h
@@ -400,6 +400,16 @@ static inline uint64_t pow2ceil(uint64_t value)
     return 0x8000000000000000ull >> (n - 1);
 }
 
+static inline uint32_t pow2roundup32(uint32_t x)
+{
+    x |= (x >> 1);
+    x |= (x >> 2);
+    x |= (x >> 4);
+    x |= (x >> 8);
+    x |= (x >> 16);
+    return x + 1;
+}
+
 /**
  * urshift - 128-bit Unsigned Right Shift.
  * @plow: in/out - lower 64-bit integer.

From 7de22778e1e25d974bad61fbd00757296e3e421e Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Fri, 12 Jan 2018 15:56:55 +0100
Subject: [PATCH 10/29] vhost-user: fix multiple queue specification

The number of queues supported by the slave is queried with
message VHOST_USER_GET_QUEUE_NUM, not with message
VHOST_USER_GET_PROTOCOL_FEATURES.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 docs/interop/vhost-user.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/interop/vhost-user.txt b/docs/interop/vhost-user.txt
index 0875ef4ec3..9fcf48d611 100644
--- a/docs/interop/vhost-user.txt
+++ b/docs/interop/vhost-user.txt
@@ -228,8 +228,8 @@ Multiple queue is treated as a protocol extension, hence the slave has to
 implement protocol features first. The multiple queues feature is supported
 only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set.
 
-The max number of queues the slave supports can be queried with message
-VHOST_USER_GET_PROTOCOL_FEATURES. Master should stop when the number of
+The max number of queue pairs the slave supports can be queried with message
+VHOST_USER_GET_QUEUE_NUM. Master should stop when the number of
 requested queues is bigger than that.
 
 As all queues share one connection, the master uses a unique index for each

From 92e5d85e8345a22e87eda940ffe0f6422eb45360 Mon Sep 17 00:00:00 2001
From: Prasad Singamsetty <prasad.singamsetty@oracle.com>
Date: Tue, 14 Nov 2017 18:13:49 -0500
Subject: [PATCH 11/29] intel-iommu: Redefine macros to enable supporting 48
 bit address width

The current implementation of Intel IOMMU code only supports a 39-bit
host/iova address width, so a number of macros use hard-coded values
based on that. This patch redefines them so they can be used with
variable address widths. It doesn't add any new functionality, but
enables adding support for a 48-bit address width.

Signed-off-by: Prasad Singamsetty <prasad.singamsety@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/intel_iommu.c          | 54 +++++++++++++++++++---------------
 hw/i386/intel_iommu_internal.h | 34 ++++++++++++++++-----
 include/hw/i386/intel_iommu.h  |  6 ++--
 3 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fe15d3ba84..fbcf43a622 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -523,7 +523,7 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
 
 static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
 {
-    return slpte & VTD_SL_PT_BASE_ADDR_MASK;
+    return slpte & VTD_SL_PT_BASE_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
 }
 
 /* Whether the pte indicates the address of the page frame */
@@ -624,19 +624,12 @@ static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
     return !(iova & ~(vtd_iova_limit(ce) - 1));
 }
 
-static const uint64_t vtd_paging_entry_rsvd_field[] = {
-    [0] = ~0ULL,
-    /* For not large page */
-    [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    /* For large page */
-    [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-};
+/*
+ * Rsvd field masks for spte:
+ *     Index [1] to [4] 4k pages
+ *     Index [5] to [8] large pages
+ */
+static uint64_t vtd_paging_entry_rsvd_field[9];
 
 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 {
@@ -874,7 +867,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
         return -VTD_FR_ROOT_ENTRY_P;
     }
 
-    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
+    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(VTD_HOST_ADDRESS_WIDTH))) {
         trace_vtd_re_invalid(re.rsvd, re.val);
         return -VTD_FR_ROOT_ENTRY_RSVD;
     }
@@ -891,7 +884,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     }
 
     if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
-        (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
+               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(VTD_HOST_ADDRESS_WIDTH))) {
         trace_vtd_ce_invalid(ce->hi, ce->lo);
         return -VTD_FR_CONTEXT_ENTRY_RSVD;
     }
@@ -1207,7 +1200,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
 {
     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
     s->root_extended = s->root & VTD_RTADDR_RTT;
-    s->root &= VTD_RTADDR_ADDR_MASK;
+    s->root &= VTD_RTADDR_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
 
     trace_vtd_reg_dmar_root(s->root, s->root_extended);
 }
@@ -1223,7 +1216,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
     uint64_t value = 0;
     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
-    s->intr_root = value & VTD_IRTA_ADDR_MASK;
+    s->intr_root = value & VTD_IRTA_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
     s->intr_eime = value & VTD_IRTA_EIME;
 
     /* Notify global invalidation */
@@ -1479,7 +1472,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
     trace_vtd_inv_qi_enable(en);
 
     if (en) {
-        s->iq = iqa_val & VTD_IQA_IQA_MASK;
+        s->iq = iqa_val & VTD_IQA_IQA_MASK(VTD_HOST_ADDRESS_WIDTH);
         /* 2^(x+8) entries */
         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
         s->qi_enabled = true;
@@ -2772,12 +2765,12 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
      * VT-d spec), otherwise we need to consider overflow of 64 bits.
      */
 
-    if (end > VTD_ADDRESS_SIZE) {
+    if (end > VTD_ADDRESS_SIZE(VTD_HOST_ADDRESS_WIDTH)) {
         /*
          * Don't need to unmap regions that is bigger than the whole
          * VT-d supported address space size
          */
-        end = VTD_ADDRESS_SIZE;
+        end = VTD_ADDRESS_SIZE(VTD_HOST_ADDRESS_WIDTH);
     }
 
     assert(start <= end);
@@ -2866,6 +2859,7 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
 static void vtd_init(IntelIOMMUState *s)
 {
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
+    uint8_t aw_bits = VTD_HOST_ADDRESS_WIDTH;
 
     memset(s->csr, 0, DMAR_REG_SIZE);
     memset(s->wmask, 0, DMAR_REG_SIZE);
@@ -2882,10 +2876,24 @@ static void vtd_init(IntelIOMMUState *s)
     s->qi_enabled = false;
     s->iq_last_desc_type = VTD_INV_DESC_NONE;
     s->next_frcd_reg = 0;
-    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
-             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
+             VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(VTD_HOST_ADDRESS_WIDTH);
     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
+    /*
+     * Rsvd field masks for spte
+     */
+    vtd_paging_entry_rsvd_field[0] = ~0ULL;
+    vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(aw_bits);
+
     if (x86_iommu->intr_supported) {
         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
         if (s->intr_eim == ON_OFF_AUTO_ON) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 0e73a65bf2..77e4a9833a 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -172,10 +172,10 @@
 
 /* RTADDR_REG */
 #define VTD_RTADDR_RTT              (1ULL << 11)
-#define VTD_RTADDR_ADDR_MASK        (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_RTADDR_ADDR_MASK(aw)    (VTD_HAW_MASK(aw) ^ 0xfffULL)
 
 /* IRTA_REG */
-#define VTD_IRTA_ADDR_MASK          (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IRTA_ADDR_MASK(aw)      (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IRTA_EIME               (1ULL << 11)
 #define VTD_IRTA_SIZE_MASK          (0xfULL)
 
@@ -198,8 +198,8 @@
 #define VTD_DOMAIN_ID_MASK          ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND                  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
 #define VTD_MGAW                    39  /* Maximum Guest Address Width */
-#define VTD_ADDRESS_SIZE            (1ULL << VTD_MGAW)
-#define VTD_CAP_MGAW                (((VTD_MGAW - 1) & 0x3fULL) << 16)
+#define VTD_ADDRESS_SIZE(aw)        (1ULL << (aw))
+#define VTD_CAP_MGAW(aw)            ((((aw) - 1) & 0x3fULL) << 16)
 #define VTD_MAMV                    18ULL
 #define VTD_CAP_MAMV                (VTD_MAMV << 48)
 #define VTD_CAP_PSI                 (1ULL << 39)
@@ -219,7 +219,7 @@
 #define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)
 
 /* IQA_REG */
-#define VTD_IQA_IQA_MASK            (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IQA_IQA_MASK(aw)        (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IQA_QS                  0x7ULL
 
 /* IQH_REG */
@@ -373,6 +373,24 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8
 
+/* Rsvd field masks for spte */
+#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
+        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
+        (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw) \
+        (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
+        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
     uint16_t domain_id;
@@ -403,7 +421,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_ROOT_ENTRY_CTP          (~0xfffULL)
 
 #define VTD_ROOT_ENTRY_NR           (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
-#define VTD_ROOT_ENTRY_RSVD         (0xffeULL | ~VTD_HAW_MASK)
+#define VTD_ROOT_ENTRY_RSVD(aw)     (0xffeULL | ~VTD_HAW_MASK(aw))
 
 /* Masks for struct VTDContextEntry */
 /* lo */
@@ -415,7 +433,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2)
 /* Second Level Page Translation Pointer*/
 #define VTD_CONTEXT_ENTRY_SLPTPTR   (~0xfffULL)
-#define VTD_CONTEXT_ENTRY_RSVD_LO   (0xff0ULL | ~VTD_HAW_MASK)
+#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw))
 /* hi */
 #define VTD_CONTEXT_ENTRY_AW        7ULL /* Adjusted guest-address-width */
 #define VTD_CONTEXT_ENTRY_DID(val)  (((val) >> 8) & VTD_DOMAIN_ID_MASK)
@@ -439,7 +457,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_RW_MASK              3ULL
 #define VTD_SL_R                    1ULL
 #define VTD_SL_W                    (1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK    (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK)
+#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
 #define VTD_SL_IGN_COM              0xbff0000000000000ULL
 
 #endif
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index ac15e6be14..372b06df45 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -46,8 +46,10 @@
 #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)
 
 #define DMAR_REG_SIZE               0x230
-#define VTD_HOST_ADDRESS_WIDTH      39
-#define VTD_HAW_MASK                ((1ULL << VTD_HOST_ADDRESS_WIDTH) - 1)
+#define VTD_HOST_AW_39BIT           39
+#define VTD_HOST_AW_48BIT           48
+#define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
+#define VTD_HAW_MASK(aw)            ((1ULL << (aw)) - 1)
 
 #define DMAR_REPORT_F_INTR          (1)
 

From 37f51384ae05bd50f83308339dbffa3e78404874 Mon Sep 17 00:00:00 2001
From: Prasad Singamsetty <prasad.singamsetty@oracle.com>
Date: Tue, 14 Nov 2017 18:13:50 -0500
Subject: [PATCH 12/29] intel-iommu: Extend address width to 48 bits

The current implementation of Intel IOMMU code only supports a 39-bit
iova address width. This patch provides a new parameter (x-aw-bits)
for intel-iommu to extend its address width to 48 bits while keeping
the default the same (39 bits). The reason for not changing the default
is to avoid potential compatibility problems with live migration of
intel-iommu enabled QEMU guests. The only valid values for the
'x-aw-bits' parameter are 39 and 48.

After enabling the larger address width (48), we should be able to map
larger iova addresses in the guest, for example in a QEMU guest that
is configured with a large amount of memory (>= 1TB). To check whether
48-bit address width is enabled, grep the guest dmesg output for the line:
"DMAR: Host address width 48".

Signed-off-by: Prasad Singamsetty <prasad.singamsety@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c           |   3 +-
 hw/i386/intel_iommu.c          | 101 +++++++++++++++++++--------------
 hw/i386/intel_iommu_internal.h |   9 +--
 include/hw/i386/intel_iommu.h  |   1 +
 4 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 18b939e469..6f38fb9d27 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2473,6 +2473,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
     AcpiDmarDeviceScope *scope = NULL;
     /* Root complex IOAPIC use one path[0] only */
     size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]);
+    IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu);
 
     assert(iommu);
     if (iommu->intr_supported) {
@@ -2480,7 +2481,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
     }
 
     dmar = acpi_data_push(table_data, sizeof(*dmar));
-    dmar->host_address_width = VTD_HOST_ADDRESS_WIDTH - 1;
+    dmar->host_address_width = intel_iommu->aw_bits - 1;
     dmar->flags = dmar_flags;
 
     /* DMAR Remapping Hardware Unit Definition structure */
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fbcf43a622..4e8642ea6a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -521,9 +521,9 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
     return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
 }
 
-static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
+static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
 {
-    return slpte & VTD_SL_PT_BASE_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
+    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
 }
 
 /* Whether the pte indicates the address of the page frame */
@@ -608,20 +608,21 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
     return true;
 }
 
-static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
+static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
 {
     uint32_t ce_agaw = vtd_ce_get_agaw(ce);
-    return 1ULL << MIN(ce_agaw, VTD_MGAW);
+    return 1ULL << MIN(ce_agaw, aw);
 }
 
 /* Return true if IOVA passes range check, otherwise false. */
-static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
+static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
+                                        uint8_t aw)
 {
     /*
      * Check if @iova is above 2^X-1, where X is the minimum of MGAW
      * in CAP_REG and AW in context-entry.
      */
-    return !(iova & ~(vtd_iova_limit(ce) - 1));
+    return !(iova & ~(vtd_iova_limit(ce, aw) - 1));
 }
 
 /*
@@ -669,7 +670,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
  */
 static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
                              uint64_t *slptep, uint32_t *slpte_level,
-                             bool *reads, bool *writes)
+                             bool *reads, bool *writes, uint8_t aw_bits)
 {
     dma_addr_t addr = vtd_ce_get_slpt_base(ce);
     uint32_t level = vtd_ce_get_level(ce);
@@ -677,7 +678,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
     uint64_t slpte;
     uint64_t access_right_check;
 
-    if (!vtd_iova_range_check(iova, ce)) {
+    if (!vtd_iova_range_check(iova, ce, aw_bits)) {
         trace_vtd_err_dmar_iova_overflow(iova);
         return -VTD_FR_ADDR_BEYOND_MGAW;
     }
@@ -714,7 +715,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
             *slpte_level = level;
             return 0;
         }
-        addr = vtd_get_slpte_addr(slpte);
+        addr = vtd_get_slpte_addr(slpte, aw_bits);
         level--;
     }
 }
@@ -732,11 +733,12 @@ typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
  * @read: whether parent level has read permission
  * @write: whether parent level has write permission
  * @notify_unmap: whether we should notify invalid entries
+ * @aw: maximum address width
  */
 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
                                uint64_t end, vtd_page_walk_hook hook_fn,
-                               void *private, uint32_t level,
-                               bool read, bool write, bool notify_unmap)
+                               void *private, uint32_t level, bool read,
+                               bool write, bool notify_unmap, uint8_t aw)
 {
     bool read_cur, write_cur, entry_valid;
     uint32_t offset;
@@ -783,7 +785,7 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
             entry.target_as = &address_space_memory;
             entry.iova = iova & subpage_mask;
             /* NOTE: this is only meaningful if entry_valid == true */
-            entry.translated_addr = vtd_get_slpte_addr(slpte);
+            entry.translated_addr = vtd_get_slpte_addr(slpte, aw);
             entry.addr_mask = ~subpage_mask;
             entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
             if (!entry_valid && !notify_unmap) {
@@ -803,10 +805,10 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
                 trace_vtd_page_walk_skip_perm(iova, iova_next);
                 goto next;
             }
-            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
+            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova,
                                       MIN(iova_next, end), hook_fn, private,
                                       level - 1, read_cur, write_cur,
-                                      notify_unmap);
+                                      notify_unmap, aw);
             if (ret < 0) {
                 return ret;
             }
@@ -827,25 +829,26 @@ next:
  * @end: IOVA range end address (start <= addr < end)
  * @hook_fn: the hook that to be called for each detected area
  * @private: private data for the hook function
+ * @aw: maximum address width
  */
 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
                          vtd_page_walk_hook hook_fn, void *private,
-                         bool notify_unmap)
+                         bool notify_unmap, uint8_t aw)
 {
     dma_addr_t addr = vtd_ce_get_slpt_base(ce);
     uint32_t level = vtd_ce_get_level(ce);
 
-    if (!vtd_iova_range_check(start, ce)) {
+    if (!vtd_iova_range_check(start, ce, aw)) {
         return -VTD_FR_ADDR_BEYOND_MGAW;
     }
 
-    if (!vtd_iova_range_check(end, ce)) {
+    if (!vtd_iova_range_check(end, ce, aw)) {
         /* Fix end so that it reaches the maximum */
-        end = vtd_iova_limit(ce);
+        end = vtd_iova_limit(ce, aw);
     }
 
     return vtd_page_walk_level(addr, start, end, hook_fn, private,
-                               level, true, true, notify_unmap);
+                               level, true, true, notify_unmap, aw);
 }
 
 /* Map a device to its corresponding domain (context-entry) */
@@ -867,7 +870,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
         return -VTD_FR_ROOT_ENTRY_P;
     }
 
-    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(VTD_HOST_ADDRESS_WIDTH))) {
+    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) {
         trace_vtd_re_invalid(re.rsvd, re.val);
         return -VTD_FR_ROOT_ENTRY_RSVD;
     }
@@ -884,7 +887,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
     }
 
     if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
-               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(VTD_HOST_ADDRESS_WIDTH))) {
+               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
         trace_vtd_ce_invalid(ce->hi, ce->lo);
         return -VTD_FR_CONTEXT_ENTRY_RSVD;
     }
@@ -1166,7 +1169,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
     }
 
     ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
-                               &reads, &writes);
+                               &reads, &writes, s->aw_bits);
     if (ret_fr) {
         ret_fr = -ret_fr;
         if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
@@ -1183,7 +1186,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
                      access_flags, level);
 out:
     entry->iova = addr & page_mask;
-    entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask;
+    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
     entry->addr_mask = ~page_mask;
     entry->perm = access_flags;
     return true;
@@ -1200,7 +1203,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
 {
     s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
     s->root_extended = s->root & VTD_RTADDR_RTT;
-    s->root &= VTD_RTADDR_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
+    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);
 
     trace_vtd_reg_dmar_root(s->root, s->root_extended);
 }
@@ -1216,7 +1219,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
     uint64_t value = 0;
     value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
     s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
-    s->intr_root = value & VTD_IRTA_ADDR_MASK(VTD_HOST_ADDRESS_WIDTH);
+    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
     s->intr_eime = value & VTD_IRTA_EIME;
 
     /* Notify global invalidation */
@@ -1392,7 +1395,7 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
         if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
             vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
                           vtd_page_invalidate_notify_hook,
-                          (void *)&vtd_as->iommu, true);
+                          (void *)&vtd_as->iommu, true, s->aw_bits);
         }
     }
 }
@@ -1472,7 +1475,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
     trace_vtd_inv_qi_enable(en);
 
     if (en) {
-        s->iq = iqa_val & VTD_IQA_IQA_MASK(VTD_HOST_ADDRESS_WIDTH);
+        s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
         /* 2^(x+8) entries */
         s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
         s->qi_enabled = true;
@@ -2403,6 +2406,8 @@ static Property vtd_properties[] = {
     DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
                             ON_OFF_AUTO_AUTO),
     DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+    DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits,
+                      VTD_HOST_ADDRESS_WIDTH),
     DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -2758,6 +2763,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
     hwaddr size;
     hwaddr start = n->start;
     hwaddr end = n->end;
+    IntelIOMMUState *s = as->iommu_state;
 
     /*
      * Note: all the codes in this function has a assumption that IOVA
@@ -2765,12 +2771,12 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
      * VT-d spec), otherwise we need to consider overflow of 64 bits.
      */
 
-    if (end > VTD_ADDRESS_SIZE(VTD_HOST_ADDRESS_WIDTH)) {
+    if (end > VTD_ADDRESS_SIZE(s->aw_bits)) {
         /*
          * Don't need to unmap regions that is bigger than the whole
          * VT-d supported address space size
          */
-        end = VTD_ADDRESS_SIZE(VTD_HOST_ADDRESS_WIDTH);
+        end = VTD_ADDRESS_SIZE(s->aw_bits);
     }
 
     assert(start <= end);
@@ -2782,9 +2788,9 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
          * suite the minimum available mask.
          */
         int n = 64 - clz64(size);
-        if (n > VTD_MGAW) {
+        if (n > s->aw_bits) {
             /* should not happen, but in case it happens, limit it */
-            n = VTD_MGAW;
+            n = s->aw_bits;
         }
         size = 1ULL << n;
     }
@@ -2844,7 +2850,8 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                                   PCI_FUNC(vtd_as->devfn),
                                   VTD_CONTEXT_ENTRY_DID(ce.hi),
                                   ce.hi, ce.lo);
-        vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false);
+        vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
+                      s->aw_bits);
     } else {
         trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                     PCI_FUNC(vtd_as->devfn));
@@ -2859,7 +2866,6 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
 static void vtd_init(IntelIOMMUState *s)
 {
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
-    uint8_t aw_bits = VTD_HOST_ADDRESS_WIDTH;
 
     memset(s->csr, 0, DMAR_REG_SIZE);
     memset(s->wmask, 0, DMAR_REG_SIZE);
@@ -2878,21 +2884,24 @@ static void vtd_init(IntelIOMMUState *s)
     s->next_frcd_reg = 0;
     s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
              VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
-             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(VTD_HOST_ADDRESS_WIDTH);
+             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
+    if (s->aw_bits == VTD_HOST_AW_48BIT) {
+        s->cap |= VTD_CAP_SAGAW_48bit;
+    }
     s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
     /*
      * Rsvd field masks for spte
      */
     vtd_paging_entry_rsvd_field[0] = ~0ULL;
-    vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(aw_bits);
-    vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(aw_bits);
+    vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
 
     if (x86_iommu->intr_supported) {
         s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
@@ -3029,6 +3038,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
         }
     }
 
+    /* Currently only address widths supported are 39 and 48 bits */
+    if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
+        (s->aw_bits != VTD_HOST_AW_48BIT)) {
+        error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
+                   VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
+        return false;
+    }
+
     return true;
 }
 
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 77e4a9833a..d084099ed9 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -131,7 +131,7 @@
 #define VTD_TLB_DID(val)            (((val) >> 32) & VTD_DOMAIN_ID_MASK)
 
 /* IVA_REG */
-#define VTD_IVA_ADDR(val)       ((val) & ~0xfffULL & ((1ULL << VTD_MGAW) - 1))
+#define VTD_IVA_ADDR(val)       ((val) & ~0xfffULL)
 #define VTD_IVA_AM(val)         ((val) & 0x3fULL)
 
 /* GCMD_REG */
@@ -197,7 +197,6 @@
 #define VTD_DOMAIN_ID_SHIFT         16  /* 16-bit domain id for 64K domains */
 #define VTD_DOMAIN_ID_MASK          ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND                  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
-#define VTD_MGAW                    39  /* Maximum Guest Address Width */
 #define VTD_ADDRESS_SIZE(aw)        (1ULL << (aw))
 #define VTD_CAP_MGAW(aw)            ((((aw) - 1) & 0x3fULL) << 16)
 #define VTD_MAMV                    18ULL
@@ -213,7 +212,6 @@
 #define VTD_CAP_SAGAW_39bit         (0x2ULL << VTD_CAP_SAGAW_SHIFT)
  /* 48-bit AGAW, 4-level page-table */
 #define VTD_CAP_SAGAW_48bit         (0x4ULL << VTD_CAP_SAGAW_SHIFT)
-#define VTD_CAP_SAGAW               VTD_CAP_SAGAW_39bit
 
 /* IQT_REG */
 #define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)
@@ -252,7 +250,7 @@
 #define VTD_FRCD_SID_MASK       0xffffULL
 #define VTD_FRCD_SID(val)       ((val) & VTD_FRCD_SID_MASK)
 /* For the low 64-bit of 128-bit */
-#define VTD_FRCD_FI(val)        ((val) & (((1ULL << VTD_MGAW) - 1) ^ 0xfffULL))
+#define VTD_FRCD_FI(val)        ((val) & ~0xfffULL)
 
 /* DMA Remapping Fault Conditions */
 typedef enum VTDFaultReason {
@@ -360,8 +358,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_IOTLB_DOMAIN       (2ULL << 4)
 #define VTD_INV_DESC_IOTLB_PAGE         (3ULL << 4)
 #define VTD_INV_DESC_IOTLB_DID(val)     (((val) >> 16) & VTD_DOMAIN_ID_MASK)
-#define VTD_INV_DESC_IOTLB_ADDR(val)    ((val) & ~0xfffULL & \
-                                         ((1ULL << VTD_MGAW) - 1))
+#define VTD_INV_DESC_IOTLB_ADDR(val)    ((val) & ~0xfffULL)
 #define VTD_INV_DESC_IOTLB_AM(val)      ((val) & 0x3fULL)
 #define VTD_INV_DESC_IOTLB_RSVD_LO      0xffffffff0000ff00ULL
 #define VTD_INV_DESC_IOTLB_RSVD_HI      0xf80ULL
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 372b06df45..45ec8919b6 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -304,6 +304,7 @@ struct IntelIOMMUState {
     bool intr_eime;                 /* Extended interrupt mode enabled */
     OnOffAuto intr_eim;             /* Toggle for EIM cabability */
     bool buggy_eim;                 /* Force buggy EIM unless eim=off */
+    uint8_t aw_bits;                /* Host/IOVA address width (in bits) */
 };
 
 /* Find the VTD Address space associated with the given bus pointer,

From fced4d00e68e7559c73746d963265f7fd0b6abf9 Mon Sep 17 00:00:00 2001
From: Marcel Apfelbaum <marcel@redhat.com>
Date: Wed, 10 Jan 2018 21:09:09 +0200
Subject: [PATCH 13/29] hw/pci-bridge: fix QEMU crash because of pcie-root-port

If we try to use more pcie_root_ports than available slots
and an IO hint is passed to the port, QEMU crashes because
we try to init the "IO hint" capability even if the device
is not created.
Fix it by checking for error before adding the capability,
so QEMU can fail gracefully.

Signed-off-by: Marcel Apfelbaum <marcel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/pci-bridge/gen_pcie_root_port.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/pci-bridge/gen_pcie_root_port.c b/hw/pci-bridge/gen_pcie_root_port.c
index ad4e6aa7ff..0e2f2e8bf1 100644
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@@ -74,8 +74,13 @@ static void gen_rp_realize(DeviceState *dev, Error **errp)
     PCIDevice *d = PCI_DEVICE(dev);
     GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d);
     PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(d);
+    Error *local_err = NULL;
 
-    rpc->parent_realize(dev, errp);
+    rpc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
 
     int rc = pci_bridge_qemu_reserve_cap_init(d, 0, grp->bus_reserve,
             grp->io_reserve, grp->mem_reserve, grp->pref32_reserve,

From d82c4f82e0233da642c36c40ef8da781fee689bc Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Thu, 14 Dec 2017 12:08:54 +0800
Subject: [PATCH 14/29] ACPI/unit-test: Add a testcase for RAM allocation in
 numa node

As QEMU supports memory-less nodes, it is possible that there is
no RAM in the first NUMA node (also called node0), e.g.:
  ... \
  -m 128,slots=3,maxmem=1G \
  -numa node -numa node,mem=128M \

However, this makes it hard for QEMU to build a known-to-work ACPI SRAT
table. Fixing it alone is not enough.

Add a test case for this situation to make sure the ACPI table is
correct for the guest.

Suggested-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/acpi-test-data/pc/DSDT.numamem  | Bin 0 -> 5150 bytes
 tests/acpi-test-data/pc/SRAT.numamem  | Bin 0 -> 224 bytes
 tests/acpi-test-data/q35/DSDT.numamem | Bin 0 -> 7834 bytes
 tests/acpi-test-data/q35/SRAT.numamem | Bin 0 -> 224 bytes
 tests/bios-tables-test.c              |  24 ++++++++++++++++++++++++
 5 files changed, 24 insertions(+)
 create mode 100644 tests/acpi-test-data/pc/DSDT.numamem
 create mode 100644 tests/acpi-test-data/pc/SRAT.numamem
 create mode 100644 tests/acpi-test-data/q35/DSDT.numamem
 create mode 100644 tests/acpi-test-data/q35/SRAT.numamem

diff --git a/tests/acpi-test-data/pc/DSDT.numamem b/tests/acpi-test-data/pc/DSDT.numamem
new file mode 100644
index 0000000000000000000000000000000000000000..224cfdd9e983e02dac5f4bf7e210eaa64cb0dc78
GIT binary patch
literal 5150
zcmb7I-EJG#5uUTVl$K{nX(_FLmDq$F*GSsf{PAB<pb)uBiL^v&&81uyE0jxFLFpiX
z7fuXB5rWbTpg6t@R2RMI1#R%!dW7~7@(AfGxM-}aurs?n6hmo&gaDUk&V2jL%sF%B
zjAS`h<AMRe-W1o(vXd^}w@nM@7y!_ie)Wp732w(Kt~8k@Da?JU1!H^^RpWP7abt!3
zqwW3O^FIH^V=W)HUg<BkTK?D1ye%NmlP#Zf8t98nr`UA63$|IwsLgW4N)*25vf3<I
zbx@ld6^rRrHN=$EvR!Hj5JM2E(T%d*s6qnZ`=2yW+^0Ip^Y=<MD{a@UX4<&fbQ%Tl
zX~y$us`aJEb+4EBZr_7_Pa6#S(3|;gzXgA5CE!2*i}j(;XVs=zcTy=nvlqvRksK6&
z9ndddC2z=Gw|maMpUnrpCS+jfik;1y67Ye(6dQw?O2sKOLmVzF;jU*)iF+K~{mq}R
z5(~WvP`(D!Yj&x|;5Nu+fd;Z!#2?+fcuf|DiOubPSZ|m}8ZMcJi$(sP<)>Dd4?gJ9
zBCr+q7#@Q(wF7SV)@soj!DZQq2dgp)G<eYi4;a3+4cqv;C}F&A_eynrWAx(e{N#%&
zk1c{uz=FDLnWIgd9(uIE&#tgH@*5eH74}wsiwm{51Rp2?cXoNrE{M%uwkmUJ9e9kd
z^9<t$T;iAUJI5FJfPcz=<g;8=o6A-yb%q<1hT|Uz1-JW$HM=qLRRQB{xb?K8UT+Gw
zyALmwQOr6|uCOoUFJx6+>;(Rl6mz|r6^j~UVFn5s+K?!kL-|k!bx{v!mWd`eLBpjH
z5AJ9rk8~&@kBU5cSv^Xkj%_*(ron5jVv3VsVh(Pk@nNOij#IjWM^SxE8Kse67Bi(g
zs3_K|b*AZ|f&{Zz+o+~pR$Mbz!MJRjr8|;)iKM~6Z<EsBVGn^`cdyWNAM_d?p<+{C
z<&^e@#hZ4$0sfrS@qB2#D#l;5c*p;;#U<GCcOSR5fXncpr|&;NP#llVT5Z*W?cRI)
z4_Gg?-{Q(>+JWtt$&pk*P-_1Hmqs(i;fy?*F=5;PYG;e<fCm?zZ3FnbEwuM+n}NUm
z%xl4uR^M}<zIRO~Z|SViX=8T%ZbrpNmL55J<W*z7au-*%7vJTbHXizTpKHd=%3ZS_
zM?gK3n&H~FQxFf5Z3mvy9FO6%($ikFG3GXn`!wx#*QPN{Oy=-FCa{1|c~Wt$oYZMP
z|LqLK<rrou$FGLp9WZ9YGe}O5dq%2hRUG(=#=H~E%+jfbKDiXgXPQpkVb9y47g){e
z6`CJfFHTs;{Qbq&vt!n?f%Po<H*P&^oNPVocB>S-IboekzruWO%zQ2|pPMkBi!h%X
znzts+;|1}HqtB0-&j;r76Xx>~=JRfMUBT#`N%Kl*UZfD+BDm$E>}tkTu-S$dt}(fa
zQ?$GK`mON7Gx_FG(YNcRkqJH(Bv;b3H`3)tpZlLa`Ahoo$DciV^7z5WpMlFvdu?@C
zev_Q9FgSn%mcSZ~NflI)1D<f7ULv6YX_GD-vr==IS+Cw>SP(J{6w|C)SZd}7B%4lY
zEsaR4&`2AJW~Ek9eV|FVTl{b{s8Z-l=wGs6+LcTun;{Rv`#ff(%*OJcq#oaI!=9PR
zWF8vjDXd|IxO-{ynj$VXijjP$N;El*#(eO4=l3TS<h{>lJ@?8n&Dv_;GpaF)=+_xy
zlT^rXBEM0dF&(p}Muow=R|Rd~!z&p}<s>PtaBoF97ErJgxLlKPTzuvq^9<2G^Ionh
zz4CFYU_Vc$;_`7Wgd^9<fmY+@y3<HR1$yFs9di~V_8*=Qm(QiiS%Q8tIf+YfqId4s
zs|uwRo-zDjhGJ@|eM&G)!fK(B(zpsL^<Jpenv~&`6kyCWF({29gZ9Hu@WkG)g0X-Z
zy<(S|rA!X@ENHtZyJ@5Kb%j*HSsx~P<|Y^)AvO!NHlrpjyivy{Y_k?P|1*Sc8sYJw
zyDDtqmN@_Cvm{8^(c))74{XT}6Nd?=;ylAv6F+ZJeV7=@NxTaI{`<`tfKX5O%wRzG
z41FXBG@@xz2ZMk86l(tc<MZDhy%Ww6Y%m!7Rlvhp=+G}UdZvO)5H@;AyJlw!xCCKA
zqWQ5(gpGTnz_l>0pc4IX@WTO9h~$o9m+gAEkUuBEr8uFg9p{4A@R}LeC%wOk=AC?V
zZc1V3Gmft}=TrrIN9+nwx<MLpUjO~kV!Pd|%YQVNLkI4bsvDR-IYLi(tjF{i)lR+S
zZw6gZ^--`_MJz}dhUR24*m2UID{l7#Cp#2ycO)>7Eo))VNrRji<Z0k`-*>XN@kM8&
zHvDr3Uy;F>3zCz%GBQAFg64>ppm?dEDTHEk8vzxpF#(N5LGywYual5<DiSJKrv-F6
z3VL0zVzVBys*zB^8W+%b6!eB*<pRnhp@J0?FLXeQg5DIY2?0$+LIrD5K$B6>TY_~)
zKxZPMf)(4v(AU{0=xxEO3#c9m6|5-%O+`UJ60CCqIu{8Qtn&go9|bK4R?Pe`EF%&s
zSWgM)sVHbsu$~ss(~(fYdPYFcL_tl#dR9QsMnVPaIRQNv1+@h0c>z5i2^Fjh0=f_d
zEeY0l1oWLqs9=3pK;MmmE(_KR0(v15Dp)TH=*5$uMtQ-yS8GVU#BLjJ-aV>y4+B9*
z0jpUkq)8B(B^0nCg_;ftq)G0!HN3X>94sqCNg`>aQ&7U<Fw~K)8zUVV!y`V_k+qv6
z9ht*}I@FQ9Tcn%aZDV!CTV&6{11Ds>M^HY}<889%;L#C!Y%Nee((5GSArSg>ARp<M
zNXB#)`c5Dp>4S(dF@+v4lRd{A(^J%#Vk14pZH?(Ea!i(yK27qNehFpV_L06rGU|<s
X)TZG7kLVu(w1s%rZLs0M;`09i9gxjE

literal 0
HcmV?d00001

diff --git a/tests/acpi-test-data/pc/SRAT.numamem b/tests/acpi-test-data/pc/SRAT.numamem
new file mode 100644
index 0000000000000000000000000000000000000000..dbc595d9cb85d3fcb5a4243153f42bb431c9de8f
GIT binary patch
literal 224
zcmWFzatwLEz`(%x#mV2<BUr&HBEUHqD8>jB1F=Cg2*ZH@DxXmUE`z~9*2k!U%mXRq
Yf~!ZCL8t>-1O^}2VG2>z!9?-X08OU~0RR91

literal 0
HcmV?d00001

diff --git a/tests/acpi-test-data/q35/DSDT.numamem b/tests/acpi-test-data/q35/DSDT.numamem
new file mode 100644
index 0000000000000000000000000000000000000000..8c9fa445b0119b6f67533cb968855b41fb9925d9
GIT binary patch
literal 7834
zcmb7JTW=f38J#7U)M`meOKN5LF2Q*SdPz*#X;QRkU~-qYN|Y#)juTJ-*KX3vae)xN
z07;Dik^<!7hXRS(peXu42l^j%|3Thrpih16Yf(f`VbAvsN1j<yK*R@f&+Iw#&6&NN
z<xbcMyMMhbg!PZ|W>D{}l;3UzA=)e<1Z~r=*RZ!mZ#OL0`c@`kTfMyuZ{w@%_&*KH
zH*3~kcA~GN=;3FPb$cN0eB=H3&h5b`pGS8DksjR{xGrsYYi_?&?)UZsw-q#sPQTs=
zv!!oc$LR-hE9vx0VOu!;n~l)&*Jt{hoxpx_PXytu)!|0!U?;!edcEequ79_D@y6G;
z9+sZ{>Yu-Ta?iCvxQ1T`zt;!!6+MpJ<!I=Ab#T}EnB=@~b-3`=0q>S=9_<-K)bcbd
zo2aYR!+T!Skz=hm^;V^;#%f`N7#rrJ-s(0*R)}xE&j`D>=Mi3btFjTUwAx{R#ecou
z>6XNRTA~3p-Tp)5deJcZ<v}D4qReP?D28)i9^4jxx}6pO{$J`)9J=$EiFPNScjrEx
z#nhv7VGHq*@NUe!O2eUP!}O&aol>8t3IkZiU|Nm#R>F<kY;SiRM=;^;x{2Q2IocM%
zX0wzO7Jkb3S#Nd21hXm$cb`~`U{yp=s%%(68DHxye0evPt|nDQs@B_orQHosR35Vo
zjuDOYth-pV24Wz*#1&!Hto!1=NTfd!z5O@V%Hc6Rne=CQK8`K2FcOilpn6xli{C2=
zIPLAf+}yl*ESz763mrFgMR-Jf6JCyqw(!r&8K?0_!!&03P&~Dd3wmy@W__6aFzGl~
zcDYc+#+<AwM=5K3){VQN2|1~oqvTOiI9ELGrb^vhi*@5B)Np7{PwXdglt4cY`f*x~
zpep=qAHUXMhmiNjcPu&tUhp5jTw3jJtFZQ9w~(=M_K5Y3wTh727PhORkfnAv0Z>yQ
z2~Zb>oShwwa4}2X7?BFZIk}H@pejg8^v~P`E5RKjQYvafuo6^O0+JG=VQ!L@Y6Nut
zaj#5GN+qZYl2TQ{c`v!#kIhO&wWU%G9#Hq0GJK|3si-NbRD%c9eWnecX;vy~+Vlyi
z`<yX+&KN#tOrL<dPsi}-7(N}-C!p>VcL;5!XAPgTrcXfKr)&6h4WF**6Hxb=F??nW
zpBd99pzafQD(!e?4WC)lC!p>#XZXw+K69o|K;7q@;d9RLIcNF=)P2qyKIaXe^QKQg
z-KS^x^bDV#=@U@*nKyjq4WD__C!p@LVE8N;J`1K#K;7qp;d8<8xnTMP)P4GfPv7wA
zn?3<`pNodiMZ@Q!=@U@*dB*U0#_)N@^a-f@Trzwv89tXxpMbj0vxd*JhR?I6Pe9%0
zIaZd&cfxb5ERSz?=cF=t0ZAE-x#tb#c|&>LR08VC3x@K7p}b%!0d?g?LwV6qUNn_}
zy7H2tyksaZnMy!ii6>9(o^Y9!>K=btD%Fh}kd!J(mW`UrM$KijCZMe8AV5o6fYv&V
z;{e4r3XcO60A(de0;r-0P|4#+vhcjaM3tBcNmY=PT7XKf3Q&zo0V=7iqyPoPXjFho
z?if)`m9-S0l6oAVfEYS+5ulPgMx<1eO93jWtfT-1R6Y@)lFLd2s79p#l~h(zfC4I?
z2vEsoB?45VQh-V-D=9z$l}`ky<gyY0s!=IGC6$#Fpn%FJ0#tHYi2&886rhsIN(xXw
z<r4uaxvWHhYE%kPNo6GkD4_C*0F_);B0x1N1*oL5k^&S^`9y$9E-MkB8kGW6Qdvm>
z3aES{KqZ%z2vChm0V=7iqyPm}J`tdj%Sr^OMx_9iR8~@e0xF*fP|0N_0#u_?fJ!PW
zDL?_0PXwssvJwHRQ7J$rm6a5rfXXKVRB~B~0M)1zppwc;3Q$1h69Fo@tVDonR0>c@
zWhDhDpz?_Tm0VULKs71_sHC!z0u)gBM1V>zD-oa?l>$^!SxEs3sC*(oC6|>5P>o6f
zDygib00mS&5ulRGN(87zr2v&wR#JchDxU~Y$z>%1RHIUWN-8TUKmnCc1gPY)5&^1F
zDL^Hal@y?W$|nL;a#@K0)u<GplFCX7P(bAq0ScrAD3BhYfO>!eY5}S#5uloq0#s8{
zfNDwvsHQ}KYDx-FO-TW&DG{KW5&^0yDL^$P1*oP(fNDwvsHUU<)sz&Vni2sDq>e5E
z6bMI*CzApc5QjlQQmVN{fC8zxMSudSxupOFRC7xK3K$Cid>;`ihS>xDvwBE>NS~(Z
zr_|f~#nmMJOQm-O^ftn3wYN%0+^sTw1@sj<0`Eq)DzsgtuNr;T9e<&*)#+Np*~jkg
z4qk%H9=aL-wZ>M^o&}*@%+I-*FVbR`UPj@c@)K%vQ}}KS9`CsGg0?V-Hr|Y^&)^xM
zr2dO%@j8WG);Jh;Cx6CW_+n1XCXsFe_iuqaF?#do<$DPl0!CBv^Zl^V5g*gGyUQ)}
z^rk}`(tsE*^*im3^<-x}1}abO<Lc#Db-Y&i!PU!J^)gp4V|;q`vOm3gxwqdW??J4(
zn>a=JKr0_``5;z4IH7znE+59r<8|2&b{}fxLoOf2%7-VE569&z@$&X5%2%}V6)s<i
zm9LyozA`Rfjh9cIqI^{=U*+=ESo!J+<*VcJwRrjTDazNh@-;4Bi<PgPP`)-UUyqlc
zIYs%pR=&>V>#_3n6Ux_7-lkhwyu2sNtD6TNE9f;FzqK^m{4`(k_@V21++F-KcJpR;
zc-ENcPPR?$ZsNKunBCz4WTHFSHnqEn>uka74o@W$-O0A8-A!CKzS$ifRVKQVZBx6O
zxDFQ0?(qCF(Vc9Y+TFx?ea7q#4>c3r$+oH8O`NAoW_NhfndnZoncaDtZ^#vbz}<fF
zuDsQ*bkv#X;sQke<bu}CmHGz{2fu&xyOsOzJ^0|!{olOzK=iDjwZ1k!BXh1>vz}+4
zXRRz=Z6kLOh?(bEywKl!i9kuL1oU1$Xl!(>pxu1S3M#)YTEY&3X6W?x3Pef<oyz(O
zm|>~Af>+6njfOK2)Y#iEzp~9M2pEeTU#=9i8pYyjkwzd_Us8TP=vLoJ4Bdkr@o7*j
z7VrA?bb#u!y_dGH+G;(3Jkr=lqrpzR8N~)78lFFh@1W5S-1(^Y^2DC?wZq77rpVFh
z3R&^4(8JXJ{rX(c2{xRQY*^7ZrOZKum5gX0;+<lKhc$b`5FZIlSDcQ?w+0g<1AAaa
zPuJ{WeX=V)p3}Q>laqacj=rWFZO#Upoo@Cdqf?L9#9-lgaCeHYft!f867~IfBc@>L
z=z5>5*>qar3d09yC{8WskWMh1gw0YnpK%?5|MT>u=V$tKhSNzQ{5dQHb~EHm?A(1H
z*X!F&;a|WRU29ePmEtuVGhVHtvzrdI=hGg(4#($t@pa+V=H)dY7awW)>W1t7?em;S
z;J-CKBl^N})KT^*YdL7QvRJ<m;ydg<${yn!7GH$;?`OCv$o=#@IU3RPBz+Qs_wdsy
z;Iq-_@89zd@$AhfKY5{%ZA2U41J}lfFfDQgHjK!n(p+n!RV=}XysJ>Vc00jxWw+#J
zWWe$5WF@_lwy;v>{b(DK&h>sFZCj{^?brGR$95Ag7IuJsMti8b_sbU()Z2fxdMl8(
z^4wOXc@xL&8nI(lyp)%so0hj>Z}VzZ)t0N2nNG2Y<vCi<?d|`#Q~bFEm`h6_UplKH
T7U*c>n%cO|8|hZC(G~v(I>Rw)

literal 0
HcmV?d00001

diff --git a/tests/acpi-test-data/q35/SRAT.numamem b/tests/acpi-test-data/q35/SRAT.numamem
new file mode 100644
index 0000000000000000000000000000000000000000..dbc595d9cb85d3fcb5a4243153f42bb431c9de8f
GIT binary patch
literal 224
zcmWFzatwLEz`(%x#mV2<BUr&HBEUHqD8>jB1F=Cg2*ZH@DxXmUE`z~9*2k!U%mXRq
Yf~!ZCL8t>-1O^}2VG2>z!9?-X08OU~0RR91

literal 0
HcmV?d00001

diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index e28e0c98cf..def81fc243 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -810,6 +810,28 @@ static void test_acpi_piix4_tcg_memhp(void)
     free_test_data(&data);
 }
 
+static void test_acpi_q35_tcg_numamem(void)
+{
+    test_data data;
+
+    memset(&data, 0, sizeof(data));
+    data.machine = MACHINE_Q35;
+    data.variant = ".numamem";
+    test_acpi_one(" -numa node -numa node,mem=128", &data);
+    free_test_data(&data);
+}
+
+static void test_acpi_piix4_tcg_numamem(void)
+{
+    test_data data;
+
+    memset(&data, 0, sizeof(data));
+    data.machine = MACHINE_PC;
+    data.variant = ".numamem";
+    test_acpi_one(" -numa node -numa node,mem=128", &data);
+    free_test_data(&data);
+}
+
 int main(int argc, char *argv[])
 {
     const char *arch = qtest_get_arch();
@@ -832,6 +854,8 @@ int main(int argc, char *argv[])
         qtest_add_func("acpi/q35/cpuhp", test_acpi_q35_tcg_cphp);
         qtest_add_func("acpi/piix4/memhp", test_acpi_piix4_tcg_memhp);
         qtest_add_func("acpi/q35/memhp", test_acpi_q35_tcg_memhp);
+        qtest_add_func("acpi/piix4/numamem", test_acpi_piix4_tcg_numamem);
+        qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem);
     }
     ret = g_test_run();
     boot_sector_cleanup(disk);

From 6cf6fe394acdf9b0025ae36ff75dfa8dc68d5bca Mon Sep 17 00:00:00 2001
From: Dou Liyang <douly.fnst@cn.fujitsu.com>
Date: Thu, 14 Dec 2017 12:08:55 +0800
Subject: [PATCH 15/29] hw/acpi-build: Make next_base easy to follow

It may be hard to read the assignment statement of "next_base", so

S/next_base += (1ULL << 32) - pcms->below_4g_mem_size;
 /next_base = mem_base + mem_len;

... for readability.

No functional change.

Signed-off-by: Dou Liyang <douly.fnst@cn.fujitsu.com>
Reviewed-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/acpi-build.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 6f38fb9d27..dc4b2b9ffe 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2394,7 +2394,7 @@ build_srat(GArray *table_data, BIOSLinker *linker, MachineState *machine)
             }
             mem_base = 1ULL << 32;
             mem_len = next_base - pcms->below_4g_mem_size;
-            next_base += (1ULL << 32) - pcms->below_4g_mem_size;
+            next_base = mem_base + mem_len;
         }
         numamem = acpi_data_push(table_data, sizeof *numamem);
         build_srat_memory(numamem, mem_base, mem_len, i - 1,

From 0d85e7d99a099b84181ee8c00b80ee6148866f9b Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Thu, 21 Dec 2017 22:21:21 +0100
Subject: [PATCH 16/29] vhost-user-test: fix features mask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VIRTIO_NET_F_MAC is a bit position, not a bit mask.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 tests/vhost-user-test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index e2c89ed376..43c6528644 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -177,7 +177,7 @@ static void init_virtio_dev(TestServer *s)
     qvirtio_set_driver(&dev->vdev);
 
     features = qvirtio_get_features(&dev->vdev);
-    features = features & VIRTIO_NET_F_MAC;
+    features = features & (1u << VIRTIO_NET_F_MAC);
     qvirtio_set_features(&dev->vdev, features);
 
     qvirtio_set_driver_ok(&dev->vdev);

From e364c7037c9448b88ef39849d2008825c45cc04c Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Thu, 21 Dec 2017 22:21:22 +0100
Subject: [PATCH 17/29] vhost-user-test: extract read-guest-mem test from main
 loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes the read-guest-mem test consistent with the other tests,
i.e. it creates the test server in the test function.

Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/vhost-user-test.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index 43c6528644..df567248ae 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -617,6 +617,28 @@ GSourceFuncs test_migrate_source_funcs = {
     .check = test_migrate_source_check,
 };
 
+static void test_read_guest_mem(void)
+{
+    TestServer *server = NULL;
+    char *qemu_cmd = NULL;
+    QTestState *s = NULL;
+
+    server = test_server_new("test");
+    test_server_listen(server);
+
+    qemu_cmd = GET_QEMU_CMD(server);
+
+    s = qtest_start(qemu_cmd);
+    g_free(qemu_cmd);
+
+    init_virtio_dev(server);
+
+    read_guest_mem(server);
+
+    qtest_quit(s);
+    test_server_free(server);
+}
+
 static void test_migrate(void)
 {
     TestServer *s = test_server_new("src");
@@ -919,10 +941,7 @@ static void test_multiqueue(void)
 
 int main(int argc, char **argv)
 {
-    QTestState *s = NULL;
-    TestServer *server = NULL;
     const char *hugefs;
-    char *qemu_cmd = NULL;
     int ret;
     char template[] = "/tmp/vhost-test-XXXXXX";
     GMainLoop *loop;
@@ -947,20 +966,11 @@ int main(int argc, char **argv)
         root = tmpfs;
     }
 
-    server = test_server_new("test");
-    test_server_listen(server);
-
     loop = g_main_loop_new(NULL, FALSE);
     /* run the main loop thread so the chardev may operate */
     thread = g_thread_new(NULL, thread_function, loop);
 
-    qemu_cmd = GET_QEMU_CMD(server);
-
-    s = qtest_start(qemu_cmd);
-    g_free(qemu_cmd);
-    init_virtio_dev(server);
-
-    qtest_add_data_func("/vhost-user/read-guest-mem", server, read_guest_mem);
+    qtest_add_func("/vhost-user/read-guest-mem", test_read_guest_mem);
     qtest_add_func("/vhost-user/migrate", test_migrate);
     qtest_add_func("/vhost-user/multiqueue", test_multiqueue);
 
@@ -978,12 +988,7 @@ int main(int argc, char **argv)
 
     ret = g_test_run();
 
-    if (s) {
-        qtest_quit(s);
-    }
-
     /* cleanup */
-    test_server_free(server);
 
     /* finish the helper thread and dispatch pending sources */
     g_main_loop_quit(loop);

From 026eb179733c569b499fb67dc7de6405760bc665 Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Thu, 21 Dec 2017 22:21:23 +0100
Subject: [PATCH 18/29] vhost-user-test: setup virtqueues in all tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Only the multiqueue test sets up the virtqueues.
This patch generalizes the setup of virtqueues for all tests.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 tests/vhost-user-test.c | 53 ++++++++++++++++++++++++++++++++---------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index df567248ae..969e3932cc 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -55,6 +55,7 @@
 /*********** FROM hw/virtio/vhost-user.c *************************************/
 
 #define VHOST_MEMORY_MAX_NREGIONS    8
+#define VHOST_MAX_VIRTQUEUES    0x100
 
 #define VHOST_USER_F_PROTOCOL_FEATURES 30
 #define VHOST_USER_PROTOCOL_F_MQ 0
@@ -141,6 +142,8 @@ enum {
 
 typedef struct TestServer {
     QPCIBus *bus;
+    QVirtioPCIDevice *dev;
+    QVirtQueue *vq[VHOST_MAX_VIRTQUEUES];
     gchar *socket_path;
     gchar *mig_path;
     gchar *chr_name;
@@ -155,6 +158,7 @@ typedef struct TestServer {
     bool test_fail;
     int test_flags;
     int queues;
+    QGuestAllocator *alloc;
 } TestServer;
 
 static const char *tmpfs;
@@ -162,26 +166,43 @@ static const char *root;
 
 static void init_virtio_dev(TestServer *s)
 {
-    QVirtioPCIDevice *dev;
     uint32_t features;
+    int i;
 
     s->bus = qpci_init_pc(NULL);
     g_assert_nonnull(s->bus);
 
-    dev = qvirtio_pci_device_find(s->bus, VIRTIO_ID_NET);
-    g_assert_nonnull(dev);
+    s->dev = qvirtio_pci_device_find(s->bus, VIRTIO_ID_NET);
+    g_assert_nonnull(s->dev);
 
-    qvirtio_pci_device_enable(dev);
-    qvirtio_reset(&dev->vdev);
-    qvirtio_set_acknowledge(&dev->vdev);
-    qvirtio_set_driver(&dev->vdev);
+    qvirtio_pci_device_enable(s->dev);
+    qvirtio_reset(&s->dev->vdev);
+    qvirtio_set_acknowledge(&s->dev->vdev);
+    qvirtio_set_driver(&s->dev->vdev);
 
-    features = qvirtio_get_features(&dev->vdev);
+    s->alloc = pc_alloc_init();
+
+    for (i = 0; i < s->queues * 2; i++) {
+        s->vq[i] = qvirtqueue_setup(&s->dev->vdev, s->alloc, i);
+    }
+
+    features = qvirtio_get_features(&s->dev->vdev);
     features = features & (1u << VIRTIO_NET_F_MAC);
-    qvirtio_set_features(&dev->vdev, features);
+    qvirtio_set_features(&s->dev->vdev, features);
 
-    qvirtio_set_driver_ok(&dev->vdev);
-    qvirtio_pci_device_free(dev);
+    qvirtio_set_driver_ok(&s->dev->vdev);
+}
+
+static void uninit_virtio_dev(TestServer *s)
+{
+    int i;
+
+    for (i = 0; i < s->queues * 2; i++) {
+        qvirtqueue_cleanup(s->dev->vdev.bus, s->vq[i], s->alloc);
+    }
+    pc_alloc_uninit(s->alloc);
+
+    qvirtio_pci_device_free(s->dev);
 }
 
 static void wait_for_fds(TestServer *s)
@@ -635,6 +656,8 @@ static void test_read_guest_mem(void)
 
     read_guest_mem(server);
 
+    uninit_virtio_dev(server);
+
     qtest_quit(s);
     test_server_free(server);
 }
@@ -711,6 +734,8 @@ static void test_migrate(void)
 
     read_guest_mem(dest);
 
+    uninit_virtio_dev(s);
+
     g_source_destroy(source);
     g_source_unref(source);
 
@@ -789,6 +814,8 @@ static void test_reconnect_subprocess(void)
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 
+    uninit_virtio_dev(s);
+
     qtest_end();
     test_server_free(s);
     return;
@@ -818,6 +845,8 @@ static void test_connect_fail_subprocess(void)
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 
+    uninit_virtio_dev(s);
+
     qtest_end();
     test_server_free(s);
 }
@@ -846,6 +875,8 @@ static void test_flags_mismatch_subprocess(void)
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 
+    uninit_virtio_dev(s);
+
     qtest_end();
     test_server_free(s);
 }

From d3b2a5d1e4a627395cdc8f40d05b12bf8ab58817 Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Thu, 21 Dec 2017 22:21:24 +0100
Subject: [PATCH 19/29] vhost-user-test: make features mask an
 init_virtio_dev() argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The goal is to generalize the use of [un]init_virtio_dev() to
all tests, which do not necessarily expose the same feature
set.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 tests/vhost-user-test.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index 969e3932cc..6a144e8d7e 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -164,7 +164,7 @@ typedef struct TestServer {
 static const char *tmpfs;
 static const char *root;
 
-static void init_virtio_dev(TestServer *s)
+static void init_virtio_dev(TestServer *s, uint32_t features_mask)
 {
     uint32_t features;
     int i;
@@ -187,7 +187,7 @@ static void init_virtio_dev(TestServer *s)
     }
 
     features = qvirtio_get_features(&s->dev->vdev);
-    features = features & (1u << VIRTIO_NET_F_MAC);
+    features = features & features_mask;
     qvirtio_set_features(&s->dev->vdev, features);
 
     qvirtio_set_driver_ok(&s->dev->vdev);
@@ -652,7 +652,7 @@ static void test_read_guest_mem(void)
     s = qtest_start(qemu_cmd);
     g_free(qemu_cmd);
 
-    init_virtio_dev(server);
+    init_virtio_dev(server, 1u << VIRTIO_NET_F_MAC);
 
     read_guest_mem(server);
 
@@ -681,7 +681,7 @@ static void test_migrate(void)
     from = qtest_start(cmd);
     g_free(cmd);
 
-    init_virtio_dev(s);
+    init_virtio_dev(s, 1u << VIRTIO_NET_F_MAC);
     wait_for_fds(s);
     size = get_log_size(s);
     g_assert_cmpint(size, ==, (2 * 1024 * 1024) / (VHOST_LOG_PAGE * 8));
@@ -803,7 +803,7 @@ static void test_reconnect_subprocess(void)
     qtest_start(cmd);
     g_free(cmd);
 
-    init_virtio_dev(s);
+    init_virtio_dev(s, 1u << VIRTIO_NET_F_MAC);
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 
@@ -841,7 +841,7 @@ static void test_connect_fail_subprocess(void)
     qtest_start(cmd);
     g_free(cmd);
 
-    init_virtio_dev(s);
+    init_virtio_dev(s, 1u << VIRTIO_NET_F_MAC);
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 
@@ -871,7 +871,7 @@ static void test_flags_mismatch_subprocess(void)
     qtest_start(cmd);
     g_free(cmd);
 
-    init_virtio_dev(s);
+    init_virtio_dev(s, 1u << VIRTIO_NET_F_MAC);
     wait_for_fds(s);
     wait_for_rings_started(s, 2);
 

From 459f5d29d27c26df194ccc0db7902d1e6fafd0c8 Mon Sep 17 00:00:00 2001
From: Maxime Coquelin <maxime.coquelin@redhat.com>
Date: Thu, 21 Dec 2017 22:21:25 +0100
Subject: [PATCH 20/29] vhost-user-test: use init_virtio_dev in multiqueue test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that init_virtio_dev() has been generalized to all cases,
use it in test_multiqueue() to avoid code duplication.

Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 tests/vhost-user-test.c | 65 +++++------------------------------------
 1 file changed, 8 insertions(+), 57 deletions(-)

diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index 6a144e8d7e..ec6ac9dc9e 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -892,79 +892,30 @@ static void test_flags_mismatch(void)
 
 #endif
 
-static QVirtioPCIDevice *virtio_net_pci_init(QPCIBus *bus, int slot)
-{
-    QVirtioPCIDevice *dev;
-
-    dev = qvirtio_pci_device_find(bus, VIRTIO_ID_NET);
-    g_assert(dev != NULL);
-    g_assert_cmphex(dev->vdev.device_type, ==, VIRTIO_ID_NET);
-
-    qvirtio_pci_device_enable(dev);
-    qvirtio_reset(&dev->vdev);
-    qvirtio_set_acknowledge(&dev->vdev);
-    qvirtio_set_driver(&dev->vdev);
-
-    return dev;
-}
-
-static void driver_init(QVirtioDevice *dev)
-{
-    uint32_t features;
-
-    features = qvirtio_get_features(dev);
-    features = features & ~(QVIRTIO_F_BAD_FEATURE |
-                            (1u << VIRTIO_RING_F_INDIRECT_DESC) |
-                            (1u << VIRTIO_RING_F_EVENT_IDX));
-    qvirtio_set_features(dev, features);
-
-    qvirtio_set_driver_ok(dev);
-}
-
-#define PCI_SLOT                0x04
-
 static void test_multiqueue(void)
 {
-    const int queues = 2;
     TestServer *s = test_server_new("mq");
-    QVirtioPCIDevice *dev;
-    QPCIBus *bus;
-    QVirtQueuePCI *vq[queues * 2];
-    QGuestAllocator *alloc;
     char *cmd;
-    int i;
-
-    s->queues = queues;
+    uint32_t features_mask = ~(QVIRTIO_F_BAD_FEATURE |
+                            (1u << VIRTIO_RING_F_INDIRECT_DESC) |
+                            (1u << VIRTIO_RING_F_EVENT_IDX));
+    s->queues = 2;
     test_server_listen(s);
 
     cmd = g_strdup_printf(QEMU_CMD_MEM QEMU_CMD_CHR QEMU_CMD_NETDEV ",queues=%d "
                           "-device virtio-net-pci,netdev=net0,mq=on,vectors=%d",
                           512, 512, root, s->chr_name,
                           s->socket_path, "", s->chr_name,
-                          queues, queues * 2 + 2);
+                          s->queues, s->queues * 2 + 2);
     qtest_start(cmd);
     g_free(cmd);
 
-    bus = qpci_init_pc(NULL);
-    dev = virtio_net_pci_init(bus, PCI_SLOT);
+    init_virtio_dev(s, features_mask);
 
-    alloc = pc_alloc_init();
-    for (i = 0; i < queues * 2; i++) {
-        vq[i] = (QVirtQueuePCI *)qvirtqueue_setup(&dev->vdev, alloc, i);
-    }
+    wait_for_rings_started(s, s->queues * 2);
 
-    driver_init(&dev->vdev);
-    wait_for_rings_started(s, queues * 2);
+    uninit_virtio_dev(s);
 
-    /* End test */
-    for (i = 0; i < queues * 2; i++) {
-        qvirtqueue_cleanup(dev->vdev.bus, &vq[i]->vq, alloc);
-    }
-    pc_alloc_uninit(alloc);
-    qvirtio_pci_device_disable(dev);
-    g_free(dev->pdev);
-    g_free(dev);
-    qpci_free_pc(bus);
     qtest_end();
 
     test_server_free(s);

From 29396ed9acfaee9936377ddece4b05452b417861 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Wed, 29 Nov 2017 13:33:12 +0100
Subject: [PATCH 21/29] x86_iommu: Move machine check to x86_iommu_realize()

Instead of having the same error checks in vtd_realize()
and amdvi_realize(), move that over to the generic
x86_iommu_realize().

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
 hw/i386/amd_iommu.c   | 13 ++-----------
 hw/i386/intel_iommu.c | 13 ++-----------
 hw/i386/x86-iommu.c   | 13 +++++++++++++
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index eeaf0e0aa8..63d46ff6ee 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1144,18 +1144,9 @@ static void amdvi_realize(DeviceState *dev, Error **err)
     AMDVIState *s = AMD_IOMMU_DEVICE(dev);
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
     MachineState *ms = MACHINE(qdev_get_machine());
-    MachineClass *mc = MACHINE_GET_CLASS(ms);
-    PCMachineState *pcms =
-        PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
-    PCIBus *bus;
+    PCMachineState *pcms = PC_MACHINE(ms);
+    PCIBus *bus = pcms->bus;
 
-    if (!pcms) {
-        error_setg(err, "Machine-type '%s' not supported by amd-iommu",
-                   mc->name);
-        return;
-    }
-
-    bus = pcms->bus;
     s->iotlb = g_hash_table_new_full(amdvi_uint64_hash,
                                      amdvi_uint64_equal, g_free, g_free);
 
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4e8642ea6a..2e841cde27 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3052,20 +3052,11 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
 static void vtd_realize(DeviceState *dev, Error **errp)
 {
     MachineState *ms = MACHINE(qdev_get_machine());
-    MachineClass *mc = MACHINE_GET_CLASS(ms);
-    PCMachineState *pcms =
-        PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
-    PCIBus *bus;
+    PCMachineState *pcms = PC_MACHINE(ms);
+    PCIBus *bus = pcms->bus;
     IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev);
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
 
-    if (!pcms) {
-        error_setg(errp, "Machine-type '%s' not supported by intel-iommu",
-                   mc->name);
-        return;
-    }
-
-    bus = pcms->bus;
     x86_iommu->type = TYPE_INTEL;
 
     if (!vtd_decide_config(s, errp)) {
diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index 293caf83ef..51de519ec3 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -21,6 +21,8 @@
 #include "hw/sysbus.h"
 #include "hw/boards.h"
 #include "hw/i386/x86-iommu.h"
+#include "hw/i386/pc.h"
+#include "qapi/error.h"
 #include "qemu/error-report.h"
 #include "trace.h"
 
@@ -80,7 +82,18 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp)
 {
     X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev);
     X86IOMMUClass *x86_class = X86_IOMMU_GET_CLASS(dev);
+    MachineState *ms = MACHINE(qdev_get_machine());
+    MachineClass *mc = MACHINE_GET_CLASS(ms);
+    PCMachineState *pcms =
+        PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
     QLIST_INIT(&x86_iommu->iec_notifiers);
+
+    if (!pcms) {
+        error_setg(errp, "Machine-type '%s' not supported by IOMMU",
+                   mc->name);
+        return;
+    }
+
     if (x86_class->realize) {
         x86_class->realize(dev, errp);
     }

From a0c167a18470831e359f0538c3cf67907808f13e Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <mgamal@redhat.com>
Date: Wed, 29 Nov 2017 13:33:13 +0100
Subject: [PATCH 22/29] x86_iommu: check if machine has PCI bus

Starting qemu with
qemu-system-x86_64 -S -M isapc -device {amd|intel}-iommu
leads to a segfault. The code assumes a PCI bus is present and
tries to access the bus structure without checking.

Since Intel VT-d and AMDVI should only work with PCI, add a
check for PCI bus and return error if not present.

Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Mohammed Gamal <mgamal@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
---
 hw/i386/x86-iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/x86-iommu.c b/hw/i386/x86-iommu.c
index 51de519ec3..8a01a2dd25 100644
--- a/hw/i386/x86-iommu.c
+++ b/hw/i386/x86-iommu.c
@@ -88,7 +88,7 @@ static void x86_iommu_realize(DeviceState *dev, Error **errp)
         PC_MACHINE(object_dynamic_cast(OBJECT(ms), TYPE_PC_MACHINE));
     QLIST_INIT(&x86_iommu->iec_notifiers);
 
-    if (!pcms) {
+    if (!pcms || !pcms->bus) {
         error_setg(errp, "Machine-type '%s' not supported by IOMMU",
                    mc->name);
         return;

From b24b9d947243c1108b958e544d48ec69d4dfbb2f Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Fri, 29 Dec 2017 16:16:38 +0100
Subject: [PATCH 23/29] tests: acpi: move tested tables array allocation
 outside of test_acpi_dsdt_table()

At best it's confusing that the array holding the list of tables to be
tested against reference tables is allocated within test_acpi_dsdt_table();
at worst it would just overwrite the list of tables if any were
added before test_acpi_dsdt_table().
Move array initialization to test_acpi_one() before we start
processing tables.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/bios-tables-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index def81fc243..a2f64c8e4a 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -234,12 +234,11 @@ static void test_acpi_dsdt_table(test_data *data)
     uint32_t addr = le32_to_cpu(data->fadt_table.dsdt);
 
     memset(&dsdt_table, 0, sizeof(dsdt_table));
-    data->tables = g_array_new(false, true, sizeof(AcpiSdtTable));
 
     test_dst_table(&dsdt_table, addr);
     ACPI_ASSERT_CMP(dsdt_table.header.signature, "DSDT");
 
-    /* Place DSDT first */
+    /* Since DSDT isn't in RSDT, add DSDT to ASL test tables list manually */
     g_array_append_val(data->tables, dsdt_table);
 }
 
@@ -636,6 +635,7 @@ static void test_acpi_one(const char *params, test_data *data)
 
     boot_sector_test();
 
+    data->tables = g_array_new(false, true, sizeof(AcpiSdtTable));
     test_acpi_rsdp_address(data);
     test_acpi_rsdp_table(data);
     test_acpi_rsdt_table(data);

From dabc7f216b76ed29b04d8f89f5664b69d5d09e9b Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Fri, 29 Dec 2017 16:16:39 +0100
Subject: [PATCH 24/29] tests: acpi: init table descriptor in test_dst_table()

Remove code duplication and make sure that the table descriptor
passed in for initialization is in the expected state.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/bios-tables-test.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index a2f64c8e4a..4b357cd6f3 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -214,6 +214,7 @@ static void test_dst_table(AcpiSdtTable *sdt_table, uint32_t addr)
 {
     uint8_t checksum;
 
+    memset(sdt_table, 0, sizeof(*sdt_table));
     ACPI_READ_TABLE_HEADER(&sdt_table->header, addr);
 
     sdt_table->aml_len = le32_to_cpu(sdt_table->header.length)
@@ -233,8 +234,6 @@ static void test_acpi_dsdt_table(test_data *data)
     AcpiSdtTable dsdt_table;
     uint32_t addr = le32_to_cpu(data->fadt_table.dsdt);
 
-    memset(&dsdt_table, 0, sizeof(dsdt_table));
-
     test_dst_table(&dsdt_table, addr);
     ACPI_ASSERT_CMP(dsdt_table.header.signature, "DSDT");
 
@@ -251,7 +250,6 @@ static void test_acpi_tables(test_data *data)
         AcpiSdtTable ssdt_table;
         uint32_t addr;
 
-        memset(&ssdt_table, 0, sizeof(ssdt_table));
         addr = le32_to_cpu(data->rsdt_tables_addr[i + 1]); /* fadt is first */
         test_dst_table(&ssdt_table, addr);
         g_array_append_val(data->tables, ssdt_table);

From 03010579835a17450693888f8b35a66817668d68 Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Fri, 29 Dec 2017 16:16:40 +0100
Subject: [PATCH 25/29] tests: acpi: rename test_acpi_tables()/test_dst_table()
 to reflect its usage

The main purpose of test_dst_table() is loading a table from QEMU
and checking that the checksum in its header matches the actual one;
rename it to reflect the main action it performs.

Likewise, the name test_acpi_tables() is too broad, since the function
only loads tables referenced by RSDT; rename it to reflect that.

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/bios-tables-test.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index 4b357cd6f3..1314ad8f37 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -210,7 +210,11 @@ static void test_acpi_facs_table(test_data *data)
     ACPI_ASSERT_CMP(facs_table->signature, "FACS");
 }
 
-static void test_dst_table(AcpiSdtTable *sdt_table, uint32_t addr)
+/** fetch_table
+ *   load ACPI table at @addr into table descriptor @sdt_table
+ *   and check that header checksum matches actual one.
+ */
+static void fetch_table(AcpiSdtTable *sdt_table, uint32_t addr)
 {
     uint8_t checksum;
 
@@ -234,14 +238,15 @@ static void test_acpi_dsdt_table(test_data *data)
     AcpiSdtTable dsdt_table;
     uint32_t addr = le32_to_cpu(data->fadt_table.dsdt);
 
-    test_dst_table(&dsdt_table, addr);
+    fetch_table(&dsdt_table, addr);
     ACPI_ASSERT_CMP(dsdt_table.header.signature, "DSDT");
 
     /* Since DSDT isn't in RSDT, add DSDT to ASL test tables list manually */
     g_array_append_val(data->tables, dsdt_table);
 }
 
-static void test_acpi_tables(test_data *data)
+/* Load all tables and add to test list directly RSDT referenced tables */
+static void fetch_rsdt_referenced_tables(test_data *data)
 {
     int tables_nr = data->rsdt_tables_nr - 1; /* fadt is first */
     int i;
@@ -251,7 +256,7 @@ static void test_acpi_tables(test_data *data)
         uint32_t addr;
 
         addr = le32_to_cpu(data->rsdt_tables_addr[i + 1]); /* fadt is first */
-        test_dst_table(&ssdt_table, addr);
+        fetch_table(&ssdt_table, addr);
         g_array_append_val(data->tables, ssdt_table);
     }
 }
@@ -640,7 +645,7 @@ static void test_acpi_one(const char *params, test_data *data)
     test_acpi_fadt_table(data);
     test_acpi_facs_table(data);
     test_acpi_dsdt_table(data);
-    test_acpi_tables(data);
+    fetch_rsdt_referenced_tables(data);
 
     if (iasl) {
         if (getenv(ACPI_REBUILD_EXPECTED_AML)) {

From ab20bbd29767a634bb11ca9f2115233f1e9e950c Mon Sep 17 00:00:00 2001
From: Igor Mammedov <imammedo@redhat.com>
Date: Fri, 29 Dec 2017 16:16:41 +0100
Subject: [PATCH 26/29] tests: acpi: add comments to
 fetch_rsdt_referenced_tables/data->tables usage

Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 tests/bios-tables-test.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c
index 1314ad8f37..b354aaafe6 100644
--- a/tests/bios-tables-test.c
+++ b/tests/bios-tables-test.c
@@ -257,6 +257,8 @@ static void fetch_rsdt_referenced_tables(test_data *data)
 
         addr = le32_to_cpu(data->rsdt_tables_addr[i + 1]); /* fadt is first */
         fetch_table(&ssdt_table, addr);
+
+        /* Add table to ASL test tables list */
         g_array_append_val(data->tables, ssdt_table);
     }
 }
@@ -427,6 +429,7 @@ try_again:
     return exp_tables;
 }
 
+/* test the list of tables in @data->tables against reference tables */
 static void test_acpi_asl(test_data *data)
 {
     int i;

From 24e34754eb78064abafd3f9a59bb35a0e2bcf406 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 8 Jan 2018 19:46:02 +0200
Subject: [PATCH 27/29] vhost-user: factor out msg head and payload

split header and payload into separate structures,
to enable easier handling of alignment issues.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost-user.c | 198 +++++++++++++++++++++--------------------
 1 file changed, 101 insertions(+), 97 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 8b946880fe..6ac3610996 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -112,7 +112,7 @@ static VhostUserConfig c __attribute__ ((unused));
                                    + sizeof(c.size) \
                                    + sizeof(c.flags))
 
-typedef struct VhostUserMsg {
+typedef struct {
     VhostUserRequest request;
 
 #define VHOST_USER_VERSION_MASK     (0x3)
@@ -120,7 +120,9 @@ typedef struct VhostUserMsg {
 #define VHOST_USER_NEED_REPLY_MASK  (0x1 << 3)
     uint32_t flags;
     uint32_t size; /* the following payload size */
-    union {
+} QEMU_PACKED VhostUserHeader;
+
+typedef union {
 #define VHOST_USER_VRING_IDX_MASK   (0xff)
 #define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
         uint64_t u64;
@@ -130,15 +132,17 @@ typedef struct VhostUserMsg {
         VhostUserLog log;
         struct vhost_iotlb_msg iotlb;
         VhostUserConfig config;
-    } payload;
+} VhostUserPayload;
+
+typedef struct VhostUserMsg {
+    VhostUserHeader hdr;
+    VhostUserPayload payload;
 } QEMU_PACKED VhostUserMsg;
 
 static VhostUserMsg m __attribute__ ((unused));
-#define VHOST_USER_HDR_SIZE (sizeof(m.request) \
-                            + sizeof(m.flags) \
-                            + sizeof(m.size))
+#define VHOST_USER_HDR_SIZE (sizeof(VhostUserHeader))
 
-#define VHOST_USER_PAYLOAD_SIZE (sizeof(m) - VHOST_USER_HDR_SIZE)
+#define VHOST_USER_PAYLOAD_SIZE (sizeof(VhostUserPayload))
 
 /* The version of the protocol we support */
 #define VHOST_USER_VERSION    (0x1)
@@ -163,33 +167,33 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
     r = qemu_chr_fe_read_all(chr, p, size);
     if (r != size) {
         error_report("Failed to read msg header. Read %d instead of %d."
-                     " Original request %d.", r, size, msg->request);
+                     " Original request %d.", r, size, msg->hdr.request);
         goto fail;
     }
 
     /* validate received flags */
-    if (msg->flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
+    if (msg->hdr.flags != (VHOST_USER_REPLY_MASK | VHOST_USER_VERSION)) {
         error_report("Failed to read msg header."
-                " Flags 0x%x instead of 0x%x.", msg->flags,
+                " Flags 0x%x instead of 0x%x.", msg->hdr.flags,
                 VHOST_USER_REPLY_MASK | VHOST_USER_VERSION);
         goto fail;
     }
 
     /* validate message size is sane */
-    if (msg->size > VHOST_USER_PAYLOAD_SIZE) {
+    if (msg->hdr.size > VHOST_USER_PAYLOAD_SIZE) {
         error_report("Failed to read msg header."
-                " Size %d exceeds the maximum %zu.", msg->size,
+                " Size %d exceeds the maximum %zu.", msg->hdr.size,
                 VHOST_USER_PAYLOAD_SIZE);
         goto fail;
     }
 
-    if (msg->size) {
+    if (msg->hdr.size) {
         p += VHOST_USER_HDR_SIZE;
-        size = msg->size;
+        size = msg->hdr.size;
         r = qemu_chr_fe_read_all(chr, p, size);
         if (r != size) {
             error_report("Failed to read msg payload."
-                         " Read %d instead of %d.", r, msg->size);
+                         " Read %d instead of %d.", r, msg->hdr.size);
             goto fail;
         }
     }
@@ -205,7 +209,7 @@ static int process_message_reply(struct vhost_dev *dev,
 {
     VhostUserMsg msg_reply;
 
-    if ((msg->flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
+    if ((msg->hdr.flags & VHOST_USER_NEED_REPLY_MASK) == 0) {
         return 0;
     }
 
@@ -213,10 +217,10 @@ static int process_message_reply(struct vhost_dev *dev,
         return -1;
     }
 
-    if (msg_reply.request != msg->request) {
+    if (msg_reply.hdr.request != msg->hdr.request) {
         error_report("Received unexpected msg type."
                      "Expected %d received %d",
-                     msg->request, msg_reply.request);
+                     msg->hdr.request, msg_reply.hdr.request);
         return -1;
     }
 
@@ -243,15 +247,15 @@ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
 {
     struct vhost_user *u = dev->opaque;
     CharBackend *chr = u->chr;
-    int ret, size = VHOST_USER_HDR_SIZE + msg->size;
+    int ret, size = VHOST_USER_HDR_SIZE + msg->hdr.size;
 
     /*
      * For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE,
      * we just need send it once in the first time. For later such
      * request, we just ignore it.
      */
-    if (vhost_user_one_time_request(msg->request) && dev->vq_index != 0) {
-        msg->flags &= ~VHOST_USER_NEED_REPLY_MASK;
+    if (vhost_user_one_time_request(msg->hdr.request) && dev->vq_index != 0) {
+        msg->hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
         return 0;
     }
 
@@ -278,11 +282,11 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
     bool shmfd = virtio_has_feature(dev->protocol_features,
                                     VHOST_USER_PROTOCOL_F_LOG_SHMFD);
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_LOG_BASE,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_LOG_BASE,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.log.mmap_size = log->size * sizeof(*(log->log)),
         .payload.log.mmap_offset = 0,
-        .size = sizeof(msg.payload.log),
+        .hdr.size = sizeof(msg.payload.log),
     };
 
     if (shmfd && log->fd != -1) {
@@ -294,15 +298,15 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
     }
 
     if (shmfd) {
-        msg.size = 0;
+        msg.hdr.size = 0;
         if (vhost_user_read(dev, &msg) < 0) {
             return -1;
         }
 
-        if (msg.request != VHOST_USER_SET_LOG_BASE) {
+        if (msg.hdr.request != VHOST_USER_SET_LOG_BASE) {
             error_report("Received unexpected msg type. "
                          "Expected %d received %d",
-                         VHOST_USER_SET_LOG_BASE, msg.request);
+                         VHOST_USER_SET_LOG_BASE, msg.hdr.request);
             return -1;
         }
     }
@@ -320,12 +324,12 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
                                               VHOST_USER_PROTOCOL_F_REPLY_ACK);
 
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_MEM_TABLE,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_MEM_TABLE,
+        .hdr.flags = VHOST_USER_VERSION,
     };
 
     if (reply_supported) {
-        msg.flags |= VHOST_USER_NEED_REPLY_MASK;
+        msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
     }
 
     for (i = 0; i < dev->mem->nregions; ++i) {
@@ -355,9 +359,9 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
         return -1;
     }
 
-    msg.size = sizeof(msg.payload.memory.nregions);
-    msg.size += sizeof(msg.payload.memory.padding);
-    msg.size += fd_num * sizeof(VhostUserMemoryRegion);
+    msg.hdr.size = sizeof(msg.payload.memory.nregions);
+    msg.hdr.size += sizeof(msg.payload.memory.padding);
+    msg.hdr.size += fd_num * sizeof(VhostUserMemoryRegion);
 
     if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
         return -1;
@@ -374,10 +378,10 @@ static int vhost_user_set_vring_addr(struct vhost_dev *dev,
                                      struct vhost_vring_addr *addr)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_VRING_ADDR,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_VRING_ADDR,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.addr = *addr,
-        .size = sizeof(msg.payload.addr),
+        .hdr.size = sizeof(msg.payload.addr),
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -393,10 +397,10 @@ static int vhost_user_set_vring_endian(struct vhost_dev *dev,
     bool cross_endian = virtio_has_feature(dev->protocol_features,
                                            VHOST_USER_PROTOCOL_F_CROSS_ENDIAN);
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_VRING_ENDIAN,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_VRING_ENDIAN,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.state = *ring,
-        .size = sizeof(msg.payload.state),
+        .hdr.size = sizeof(msg.payload.state),
     };
 
     if (!cross_endian) {
@@ -416,10 +420,10 @@ static int vhost_set_vring(struct vhost_dev *dev,
                            struct vhost_vring_state *ring)
 {
     VhostUserMsg msg = {
-        .request = request,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = request,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.state = *ring,
-        .size = sizeof(msg.payload.state),
+        .hdr.size = sizeof(msg.payload.state),
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -465,10 +469,10 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
                                      struct vhost_vring_state *ring)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_GET_VRING_BASE,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_GET_VRING_BASE,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.state = *ring,
-        .size = sizeof(msg.payload.state),
+        .hdr.size = sizeof(msg.payload.state),
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -479,13 +483,13 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev,
         return -1;
     }
 
-    if (msg.request != VHOST_USER_GET_VRING_BASE) {
+    if (msg.hdr.request != VHOST_USER_GET_VRING_BASE) {
         error_report("Received unexpected msg type. Expected %d received %d",
-                     VHOST_USER_GET_VRING_BASE, msg.request);
+                     VHOST_USER_GET_VRING_BASE, msg.hdr.request);
         return -1;
     }
 
-    if (msg.size != sizeof(msg.payload.state)) {
+    if (msg.hdr.size != sizeof(msg.payload.state)) {
         error_report("Received bad msg size.");
         return -1;
     }
@@ -502,10 +506,10 @@ static int vhost_set_vring_file(struct vhost_dev *dev,
     int fds[VHOST_MEMORY_MAX_NREGIONS];
     size_t fd_num = 0;
     VhostUserMsg msg = {
-        .request = request,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = request,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
-        .size = sizeof(msg.payload.u64),
+        .hdr.size = sizeof(msg.payload.u64),
     };
 
     if (ioeventfd_enabled() && file->fd > 0) {
@@ -536,10 +540,10 @@ static int vhost_user_set_vring_call(struct vhost_dev *dev,
 static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64)
 {
     VhostUserMsg msg = {
-        .request = request,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = request,
+        .hdr.flags = VHOST_USER_VERSION,
         .payload.u64 = u64,
-        .size = sizeof(msg.payload.u64),
+        .hdr.size = sizeof(msg.payload.u64),
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -564,8 +568,8 @@ static int vhost_user_set_protocol_features(struct vhost_dev *dev,
 static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
 {
     VhostUserMsg msg = {
-        .request = request,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = request,
+        .hdr.flags = VHOST_USER_VERSION,
     };
 
     if (vhost_user_one_time_request(request) && dev->vq_index != 0) {
@@ -580,13 +584,13 @@ static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
         return -1;
     }
 
-    if (msg.request != request) {
+    if (msg.hdr.request != request) {
         error_report("Received unexpected msg type. Expected %d received %d",
-                     request, msg.request);
+                     request, msg.hdr.request);
         return -1;
     }
 
-    if (msg.size != sizeof(msg.payload.u64)) {
+    if (msg.hdr.size != sizeof(msg.payload.u64)) {
         error_report("Received bad msg size.");
         return -1;
     }
@@ -604,8 +608,8 @@ static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
 static int vhost_user_set_owner(struct vhost_dev *dev)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_OWNER,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_OWNER,
+        .hdr.flags = VHOST_USER_VERSION,
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -618,8 +622,8 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
 static int vhost_user_reset_device(struct vhost_dev *dev)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_RESET_OWNER,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_RESET_OWNER,
+        .hdr.flags = VHOST_USER_VERSION,
     };
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -658,21 +662,21 @@ static void slave_read(void *opaque)
         goto err;
     }
 
-    if (msg.size > VHOST_USER_PAYLOAD_SIZE) {
+    if (msg.hdr.size > VHOST_USER_PAYLOAD_SIZE) {
         error_report("Failed to read msg header."
-                " Size %d exceeds the maximum %zu.", msg.size,
+                " Size %d exceeds the maximum %zu.", msg.hdr.size,
                 VHOST_USER_PAYLOAD_SIZE);
         goto err;
     }
 
     /* Read payload */
-    size = read(u->slave_fd, &msg.payload, msg.size);
-    if (size != msg.size) {
+    size = read(u->slave_fd, &msg.payload, msg.hdr.size);
+    if (size != msg.hdr.size) {
         error_report("Failed to read payload from slave.");
         goto err;
     }
 
-    switch (msg.request) {
+    switch (msg.hdr.request) {
     case VHOST_USER_SLAVE_IOTLB_MSG:
         ret = vhost_backend_handle_iotlb_msg(dev, &msg.payload.iotlb);
         break;
@@ -688,15 +692,15 @@ static void slave_read(void *opaque)
      * REPLY_ACK feature handling. Other reply types has to be managed
      * directly in their request handlers.
      */
-    if (msg.flags & VHOST_USER_NEED_REPLY_MASK) {
-        msg.flags &= ~VHOST_USER_NEED_REPLY_MASK;
-        msg.flags |= VHOST_USER_REPLY_MASK;
+    if (msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
+        msg.hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
+        msg.hdr.flags |= VHOST_USER_REPLY_MASK;
 
         msg.payload.u64 = !!ret;
-        msg.size = sizeof(msg.payload.u64);
+        msg.hdr.size = sizeof(msg.payload.u64);
 
-        size = write(u->slave_fd, &msg, VHOST_USER_HDR_SIZE + msg.size);
-        if (size != VHOST_USER_HDR_SIZE + msg.size) {
+        size = write(u->slave_fd, &msg, VHOST_USER_HDR_SIZE + msg.hdr.size);
+        if (size != VHOST_USER_HDR_SIZE + msg.hdr.size) {
             error_report("Failed to send msg reply to slave.");
             goto err;
         }
@@ -714,8 +718,8 @@ err:
 static int vhost_setup_slave_channel(struct vhost_dev *dev)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_SET_SLAVE_REQ_FD,
-        .flags = VHOST_USER_VERSION,
+        .hdr.request = VHOST_USER_SET_SLAVE_REQ_FD,
+        .hdr.flags = VHOST_USER_VERSION,
     };
     struct vhost_user *u = dev->opaque;
     int sv[2], ret = 0;
@@ -736,7 +740,7 @@ static int vhost_setup_slave_channel(struct vhost_dev *dev)
     qemu_set_fd_handler(u->slave_fd, slave_read, NULL, dev);
 
     if (reply_supported) {
-        msg.flags |= VHOST_USER_NEED_REPLY_MASK;
+        msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
     }
 
     ret = vhost_user_write(dev, &msg, &sv[1], 1);
@@ -881,10 +885,10 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
     /* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */
     if (virtio_has_feature(dev->protocol_features,
                            VHOST_USER_PROTOCOL_F_RARP)) {
-        msg.request = VHOST_USER_SEND_RARP;
-        msg.flags = VHOST_USER_VERSION;
+        msg.hdr.request = VHOST_USER_SEND_RARP;
+        msg.hdr.flags = VHOST_USER_VERSION;
         memcpy((char *)&msg.payload.u64, mac_addr, 6);
-        msg.size = sizeof(msg.payload.u64);
+        msg.hdr.size = sizeof(msg.payload.u64);
 
         return vhost_user_write(dev, &msg, NULL, 0);
     }
@@ -918,12 +922,12 @@ static int vhost_user_net_set_mtu(struct vhost_dev *dev, uint16_t mtu)
         return 0;
     }
 
-    msg.request = VHOST_USER_NET_SET_MTU;
+    msg.hdr.request = VHOST_USER_NET_SET_MTU;
     msg.payload.u64 = mtu;
-    msg.size = sizeof(msg.payload.u64);
-    msg.flags = VHOST_USER_VERSION;
+    msg.hdr.size = sizeof(msg.payload.u64);
+    msg.hdr.flags = VHOST_USER_VERSION;
     if (reply_supported) {
-        msg.flags |= VHOST_USER_NEED_REPLY_MASK;
+        msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
     }
 
     if (vhost_user_write(dev, &msg, NULL, 0) < 0) {
@@ -942,9 +946,9 @@ static int vhost_user_send_device_iotlb_msg(struct vhost_dev *dev,
                                             struct vhost_iotlb_msg *imsg)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_IOTLB_MSG,
-        .size = sizeof(msg.payload.iotlb),
-        .flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
+        .hdr.request = VHOST_USER_IOTLB_MSG,
+        .hdr.size = sizeof(msg.payload.iotlb),
+        .hdr.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY_MASK,
         .payload.iotlb = *imsg,
     };
 
@@ -965,9 +969,9 @@ static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
                                  uint32_t config_len)
 {
     VhostUserMsg msg = {
-        msg.request = VHOST_USER_GET_CONFIG,
-        msg.flags = VHOST_USER_VERSION,
-        msg.size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
+        .hdr.request = VHOST_USER_GET_CONFIG,
+        .hdr.flags = VHOST_USER_VERSION,
+        .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + config_len,
     };
 
     if (config_len > VHOST_USER_MAX_CONFIG_SIZE) {
@@ -984,13 +988,13 @@ static int vhost_user_get_config(struct vhost_dev *dev, uint8_t *config,
         return -1;
     }
 
-    if (msg.request != VHOST_USER_GET_CONFIG) {
+    if (msg.hdr.request != VHOST_USER_GET_CONFIG) {
         error_report("Received unexpected msg type. Expected %d received %d",
-                     VHOST_USER_GET_CONFIG, msg.request);
+                     VHOST_USER_GET_CONFIG, msg.hdr.request);
         return -1;
     }
 
-    if (msg.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
+    if (msg.hdr.size != VHOST_USER_CONFIG_HDR_SIZE + config_len) {
         error_report("Received bad msg size.");
         return -1;
     }
@@ -1008,13 +1012,13 @@ static int vhost_user_set_config(struct vhost_dev *dev, const uint8_t *data,
                                               VHOST_USER_PROTOCOL_F_REPLY_ACK);
 
     VhostUserMsg msg = {
-        msg.request = VHOST_USER_SET_CONFIG,
-        msg.flags = VHOST_USER_VERSION,
-        msg.size = VHOST_USER_CONFIG_HDR_SIZE + size,
+        .hdr.request = VHOST_USER_SET_CONFIG,
+        .hdr.flags = VHOST_USER_VERSION,
+        .hdr.size = VHOST_USER_CONFIG_HDR_SIZE + size,
     };
 
     if (reply_supported) {
-        msg.flags |= VHOST_USER_NEED_REPLY_MASK;
+        msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
     }
 
     if (size > VHOST_USER_MAX_CONFIG_SIZE) {

From 69aff030643c1616474458cf7e19fc15a5f8f462 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 8 Jan 2018 19:47:11 +0200
Subject: [PATCH 28/29] vhost-user: fix misaligned access to payload

We currently take a pointer to a misaligned field of a packed structure.
clang reports this as a build warning.
A fix is to keep the payload in a separate structure, and access it
from there using a vectored write.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost-user.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 6ac3610996..7930fd85fc 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -652,33 +652,34 @@ static void slave_read(void *opaque)
 {
     struct vhost_dev *dev = opaque;
     struct vhost_user *u = dev->opaque;
-    VhostUserMsg msg = { 0, };
+    VhostUserHeader hdr = { 0, };
+    VhostUserPayload payload = { 0, };
     int size, ret = 0;
 
     /* Read header */
-    size = read(u->slave_fd, &msg, VHOST_USER_HDR_SIZE);
+    size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE);
     if (size != VHOST_USER_HDR_SIZE) {
         error_report("Failed to read from slave.");
         goto err;
     }
 
-    if (msg.hdr.size > VHOST_USER_PAYLOAD_SIZE) {
+    if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
         error_report("Failed to read msg header."
-                " Size %d exceeds the maximum %zu.", msg.hdr.size,
+                " Size %d exceeds the maximum %zu.", hdr.size,
                 VHOST_USER_PAYLOAD_SIZE);
         goto err;
     }
 
     /* Read payload */
-    size = read(u->slave_fd, &msg.payload, msg.hdr.size);
-    if (size != msg.hdr.size) {
+    size = read(u->slave_fd, &payload, hdr.size);
+    if (size != hdr.size) {
         error_report("Failed to read payload from slave.");
         goto err;
     }
 
-    switch (msg.hdr.request) {
+    switch (hdr.request) {
     case VHOST_USER_SLAVE_IOTLB_MSG:
-        ret = vhost_backend_handle_iotlb_msg(dev, &msg.payload.iotlb);
+        ret = vhost_backend_handle_iotlb_msg(dev, &payload.iotlb);
         break;
     case VHOST_USER_SLAVE_CONFIG_CHANGE_MSG :
         ret = vhost_user_slave_handle_config_change(dev);
@@ -692,15 +693,23 @@ static void slave_read(void *opaque)
      * REPLY_ACK feature handling. Other reply types has to be managed
      * directly in their request handlers.
      */
-    if (msg.hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
-        msg.hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
-        msg.hdr.flags |= VHOST_USER_REPLY_MASK;
+    if (hdr.flags & VHOST_USER_NEED_REPLY_MASK) {
+        struct iovec iovec[2];
 
-        msg.payload.u64 = !!ret;
-        msg.hdr.size = sizeof(msg.payload.u64);
 
-        size = write(u->slave_fd, &msg, VHOST_USER_HDR_SIZE + msg.hdr.size);
-        if (size != VHOST_USER_HDR_SIZE + msg.hdr.size) {
+        hdr.flags &= ~VHOST_USER_NEED_REPLY_MASK;
+        hdr.flags |= VHOST_USER_REPLY_MASK;
+
+        payload.u64 = !!ret;
+        hdr.size = sizeof(payload.u64);
+
+        iovec[0].iov_base = &hdr;
+        iovec[0].iov_len = VHOST_USER_HDR_SIZE;
+        iovec[1].iov_base = &payload;
+        iovec[1].iov_len = hdr.size;
+
+        size = writev(u->slave_fd, iovec, ARRAY_SIZE(iovec));
+        if (size != VHOST_USER_HDR_SIZE + hdr.size) {
             error_report("Failed to send msg reply to slave.");
             goto err;
         }

From f4bf56fb78ed0e9f60fa1ed656c14ff4c494da5a Mon Sep 17 00:00:00 2001
From: Jay Zhou <jianjay.zhou@huawei.com>
Date: Fri, 12 Jan 2018 10:47:57 +0800
Subject: [PATCH 29/29] vhost: remove assertion to prevent crash

QEMU will assert on vhost-user backed virtio device hotplug if QEMU is
using more RAM regions than VHOST_MEMORY_MAX_NREGIONS (for example if
it were started with a lot of DIMM devices).

Fix it by returning an error instead of asserting, and let callers of
vhost_set_mem_table() handle the error condition gracefully.

Cc: qemu-stable@nongnu.org
Signed-off-by: Igor Mammedov <imammedo@redhat.com>
Signed-off-by: Jay Zhou <jianjay.zhou@huawei.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio/vhost-user.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 7930fd85fc..6eb97980ad 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -342,11 +342,14 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
                                      &offset);
         fd = memory_region_get_fd(mr);
         if (fd > 0) {
+            if (fd_num == VHOST_MEMORY_MAX_NREGIONS) {
+                error_report("Failed preparing vhost-user memory table msg");
+                return -1;
+            }
             msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
             msg.payload.memory.regions[fd_num].memory_size  = reg->memory_size;
             msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
             msg.payload.memory.regions[fd_num].mmap_offset = offset;
-            assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
             fds[fd_num++] = fd;
         }
     }