iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC

Introduce a new IOMMUFD_OBJ_VEVENTQ object for the vIOMMU Event Queue, which
provides user space (VMM) with another FD to read vIOMMU events.

Allow a vIOMMU object to allocate vEVENTQs, with the condition that each
vIOMMU can have only a single vEVENTQ per type.

Add iommufd_veventq_alloc() with iommufd_veventq_fops for the new ioctl.

Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Nicolin Chen, 2025-03-11 12:44:23 -07:00, committed by Jason Gunthorpe
commit e36ba5ab80 (parent 0507f337fc)
6 changed files with 384 additions and 1 deletion

drivers/iommu/iommufd/eventq.c

@@ -262,13 +262,148 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b
	return done == 0 ? rc : done;
}

/* IOMMUFD_OBJ_VEVENTQ Functions */

void iommufd_veventq_abort(struct iommufd_object *obj)
{
	struct iommufd_eventq *eventq =
		container_of(obj, struct iommufd_eventq, obj);
	struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
	struct iommufd_viommu *viommu = veventq->viommu;
	struct iommufd_vevent *cur, *next;

	lockdep_assert_held_write(&viommu->veventqs_rwsem);

	list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
		list_del(&cur->node);
		if (cur != &veventq->lost_events_header)
			kfree(cur);
	}

	refcount_dec(&viommu->obj.users);
	list_del(&veventq->node);
}

void iommufd_veventq_destroy(struct iommufd_object *obj)
{
	struct iommufd_veventq *veventq = eventq_to_veventq(
		container_of(obj, struct iommufd_eventq, obj));

	down_write(&veventq->viommu->veventqs_rwsem);
	iommufd_veventq_abort(obj);
	up_write(&veventq->viommu->veventqs_rwsem);
}

static struct iommufd_vevent *
iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
{
	struct iommufd_eventq *eventq = &veventq->common;
	struct list_head *list = &eventq->deliver;
	struct iommufd_vevent *vevent = NULL;

	spin_lock(&eventq->lock);
	if (!list_empty(list)) {
		struct iommufd_vevent *next;

		next = list_first_entry(list, struct iommufd_vevent, node);

		/* Make a copy of the lost_events_header for copy_to_user */
		if (next == &veventq->lost_events_header) {
			vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
			if (!vevent)
				goto out_unlock;
		}
		list_del(&next->node);
		if (vevent)
			memcpy(vevent, next, sizeof(*vevent));
		else
			vevent = next;
	}
out_unlock:
	spin_unlock(&eventq->lock);
	return vevent;
}

static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq,
					    struct iommufd_vevent *vevent)
{
	struct iommufd_eventq *eventq = &veventq->common;
	struct list_head *list = &eventq->deliver;

	spin_lock(&eventq->lock);
	if (vevent_for_lost_events_header(vevent)) {
		/* Remove the copy of the lost_events_header */
		kfree(vevent);
		vevent = NULL;

		/* An empty list needs the lost_events_header back */
		if (list_empty(list))
			vevent = &veventq->lost_events_header;
	}
	if (vevent)
		list_add(&vevent->node, list);
	spin_unlock(&eventq->lock);
}

static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf,
					 size_t count, loff_t *ppos)
{
	struct iommufd_eventq *eventq = filep->private_data;
	struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
	struct iommufd_vevent_header *hdr;
	struct iommufd_vevent *cur;
	size_t done = 0;
	int rc = 0;

	if (*ppos)
		return -ESPIPE;

	while ((cur = iommufd_veventq_deliver_fetch(veventq))) {
		/* Validate the remaining bytes against the header size */
		if (done >= count || sizeof(*hdr) > count - done) {
			iommufd_veventq_deliver_restore(veventq, cur);
			break;
		}
		hdr = &cur->header;

		/* For a normal vEVENT, validate against the full size */
		if (!vevent_for_lost_events_header(cur) &&
		    sizeof(*hdr) + cur->data_len > count - done) {
			iommufd_veventq_deliver_restore(veventq, cur);
			break;
		}

		if (copy_to_user(buf + done, hdr, sizeof(*hdr))) {
			iommufd_veventq_deliver_restore(veventq, cur);
			rc = -EFAULT;
			break;
		}
		done += sizeof(*hdr);

		if (cur->data_len &&
		    copy_to_user(buf + done, cur->event_data, cur->data_len)) {
			iommufd_veventq_deliver_restore(veventq, cur);
			rc = -EFAULT;
			break;
		}

		spin_lock(&eventq->lock);
		veventq->num_events--;
		spin_unlock(&eventq->lock);
		done += cur->data_len;
		kfree(cur);
	}

	return done == 0 ? rc : done;
}
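
The read side above defines the wire format that user space must parse. As a
rough illustration, a minimal user-space drain loop could look like the sketch
below; the `veventq_fd`, the buffer size, and the fixed per-type payload size
`data_len` are assumptions supplied by the caller, not part of this patch. Note
that the kernel never writes a partial vEVENT into one read(), so a complete
payload always follows a normal header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <linux/iommufd.h>

/* Sketch: drain one read() worth of vEVENTs. data_len is the fixed
 * payload size of this vEVENTQ's type, known to the caller.
 */
static void drain_veventq(int veventq_fd, size_t data_len)
{
	char buf[4096];
	ssize_t len = read(veventq_fd, buf, sizeof(buf));
	size_t pos = 0;

	while (len > 0 && pos + sizeof(struct iommufd_vevent_header) <= (size_t)len) {
		struct iommufd_vevent_header hdr;

		memcpy(&hdr, buf + pos, sizeof(hdr));
		pos += sizeof(hdr);

		if (hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) {
			/* A LOST_EVENTS header carries no payload */
			printf("events lost before sequence %u\n", hdr.sequence);
			continue;
		}
		/* Hand buf + pos (data_len bytes) to the vIOMMU model */
		pos += data_len;
	}
}

Gaps are detected by comparing hdr.sequence across adjacent normal events, as
described for struct iommufd_vevent_header in the uAPI header below.
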
/* Common Event Queue Functions */

static __poll_t iommufd_eventq_fops_poll(struct file *filep,
					 struct poll_table_struct *wait)
{
	struct iommufd_eventq *eventq = filep->private_data;
-	__poll_t pollflags = EPOLLOUT;
+	__poll_t pollflags = 0;
+
+	if (eventq->obj.type == IOMMUFD_OBJ_FAULT)
+		pollflags |= EPOLLOUT;

	poll_wait(filep, &eventq->wait_queue, wait);
	spin_lock(&eventq->lock);
@@ -388,3 +523,75 @@ int iommufd_fault_iopf_handler(struct iopf_group *group)
	return 0;
}

static const struct file_operations iommufd_veventq_fops =
	INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL);

int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd)
{
	struct iommu_veventq_alloc *cmd = ucmd->cmd;
	struct iommufd_veventq *veventq;
	struct iommufd_viommu *viommu;
	int fdno;
	int rc;

	if (cmd->flags || cmd->__reserved ||
	    cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT)
		return -EOPNOTSUPP;
	if (!cmd->veventq_depth)
		return -EINVAL;

	viommu = iommufd_get_viommu(ucmd, cmd->viommu_id);
	if (IS_ERR(viommu))
		return PTR_ERR(viommu);

	down_write(&viommu->veventqs_rwsem);

	if (iommufd_viommu_find_veventq(viommu, cmd->type)) {
		rc = -EEXIST;
		goto out_unlock_veventqs;
	}

	veventq = __iommufd_object_alloc(ucmd->ictx, veventq,
					 IOMMUFD_OBJ_VEVENTQ, common.obj);
	if (IS_ERR(veventq)) {
		rc = PTR_ERR(veventq);
		goto out_unlock_veventqs;
	}

	veventq->type = cmd->type;
	veventq->viommu = viommu;
	refcount_inc(&viommu->obj.users);
	veventq->depth = cmd->veventq_depth;
	list_add_tail(&veventq->node, &viommu->veventqs);
	veventq->lost_events_header.header.flags =
		IOMMU_VEVENTQ_FLAG_LOST_EVENTS;

	fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]",
				   ucmd->ictx, &iommufd_veventq_fops);
	if (fdno < 0) {
		rc = fdno;
		goto out_abort;
	}

	cmd->out_veventq_id = veventq->common.obj.id;
	cmd->out_veventq_fd = fdno;

	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
	if (rc)
		goto out_put_fdno;

	iommufd_object_finalize(ucmd->ictx, &veventq->common.obj);
	fd_install(fdno, veventq->common.filep);
	goto out_unlock_veventqs;

out_put_fdno:
	put_unused_fd(fdno);
	fput(veventq->common.filep);
out_abort:
	iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj);
out_unlock_veventqs:
	up_write(&viommu->veventqs_rwsem);
	iommufd_put_object(ucmd->ictx, &viommu->obj);
	return rc;
}

drivers/iommu/iommufd/iommufd_private.h

@@ -507,6 +507,74 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev);
void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt,
				  struct iommufd_attach_handle *handle);

/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */
struct iommufd_vevent {
	struct iommufd_vevent_header header;
	struct list_head node; /* for iommufd_eventq::deliver */
	ssize_t data_len;
	u64 event_data[] __counted_by(data_len);
};

#define vevent_for_lost_events_header(vevent) \
	(vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)

/*
 * An iommufd_veventq object represents an interface to deliver vIOMMU events to
 * user space. It is created and destroyed by user space and associated with a
 * vIOMMU object during allocation.
 */
struct iommufd_veventq {
	struct iommufd_eventq common;
	struct iommufd_viommu *viommu;
	struct list_head node; /* for iommufd_viommu::veventqs */
	struct iommufd_vevent lost_events_header;

	unsigned int type;
	unsigned int depth;

	/* Use common.lock for protection */
	u32 num_events;
	u32 sequence;
};

static inline struct iommufd_veventq *
eventq_to_veventq(struct iommufd_eventq *eventq)
{
	return container_of(eventq, struct iommufd_veventq, common);
}

static inline struct iommufd_veventq *
iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id)
{
	return container_of(iommufd_get_object(ucmd->ictx, id,
					       IOMMUFD_OBJ_VEVENTQ),
			    struct iommufd_veventq, common.obj);
}

int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd);
void iommufd_veventq_destroy(struct iommufd_object *obj);
void iommufd_veventq_abort(struct iommufd_object *obj);

static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq,
					  struct iommufd_vevent *vevent)
{
	struct iommufd_eventq *eventq = &veventq->common;

	lockdep_assert_held(&eventq->lock);

	/*
	 * Remove the lost_events_header and add the new node at the same time.
	 * Note the new node can be lost_events_header, for a sequence update.
	 */
	if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver))
		list_del(&veventq->lost_events_header.node);
	list_add_tail(&vevent->node, &eventq->deliver);

	vevent->header.sequence = veventq->sequence;
	veventq->sequence = (veventq->sequence + 1) & INT_MAX;

	wake_up_interruptible(&eventq->wait_queue);
}
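
iommufd_vevent_handler() is meant to be called by a producer that already
holds common.lock. This patch adds no in-tree caller, so the reporting helper
below is only a hypothetical sketch of how a vIOMMU driver-facing path might
use it, with overflow accounted through the embedded lost_events_header:

/* Hypothetical sketch only: report one vEVENT of data_len bytes. A real
 * caller would also hold viommu->veventqs_rwsem to look up the veventq.
 */
static int example_report_vevent(struct iommufd_veventq *veventq,
				 void *event_data, size_t data_len)
{
	struct iommufd_eventq *eventq = &veventq->common;
	struct iommufd_vevent *vevent;
	int rc = 0;

	spin_lock(&eventq->lock);
	if (veventq->num_events == veventq->depth) {
		/* Full: (re)queue the lost_events_header to mark the gap */
		vevent = &veventq->lost_events_header;
		goto out_set_header;
	}

	vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);
	if (!vevent) {
		rc = -ENOMEM;
		vevent = &veventq->lost_events_header;
		goto out_set_header;
	}

	memcpy(vevent->event_data, event_data, data_len);
	vevent->data_len = data_len;
	veventq->num_events++;
out_set_header:
	iommufd_vevent_handler(veventq, vevent);
	spin_unlock(&eventq->lock);
	return rc;
}

Either way the handler assigns the next sequence index, so a reader can detect
the overflow from the gap or from the LOST_EVENTS header at the queue tail.
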
static inline struct iommufd_viommu *
iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
{
@@ -515,6 +583,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id)
			    struct iommufd_viommu, obj);
}

static inline struct iommufd_veventq *
iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type)
{
	struct iommufd_veventq *veventq, *next;

	lockdep_assert_held(&viommu->veventqs_rwsem);

	list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) {
		if (veventq->type == type)
			return veventq;
	}
	return NULL;
}

int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd);
void iommufd_viommu_destroy(struct iommufd_object *obj);
int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);

drivers/iommu/iommufd/main.c

@@ -317,6 +317,7 @@ union ucmd_buffer {
	struct iommu_ioas_unmap unmap;
	struct iommu_option option;
	struct iommu_vdevice_alloc vdev;
	struct iommu_veventq_alloc veventq;
	struct iommu_vfio_ioas vfio_ioas;
	struct iommu_viommu_alloc viommu;
#ifdef CONFIG_IOMMUFD_TEST
@@ -372,6 +373,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
	IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64),
	IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl,
		 struct iommu_vdevice_alloc, virt_id),
	IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc,
		 struct iommu_veventq_alloc, out_veventq_fd),
	IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas,
		 __reserved),
	IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl,
@@ -514,6 +517,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
	[IOMMUFD_OBJ_VDEVICE] = {
		.destroy = iommufd_vdevice_destroy,
	},
	[IOMMUFD_OBJ_VEVENTQ] = {
		.destroy = iommufd_veventq_destroy,
		.abort = iommufd_veventq_abort,
	},
	[IOMMUFD_OBJ_VIOMMU] = {
		.destroy = iommufd_viommu_destroy,
	},

drivers/iommu/iommufd/viommu.c

@@ -59,6 +59,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
	viommu->ictx = ucmd->ictx;
	viommu->hwpt = hwpt_paging;
	refcount_inc(&viommu->hwpt->common.obj.users);
	INIT_LIST_HEAD(&viommu->veventqs);
	init_rwsem(&viommu->veventqs_rwsem);
	/*
	 * It is the most likely case that a physical IOMMU is unpluggable. A
	 * pluggable IOMMU instance (if exists) is responsible for refcounting

include/linux/iommufd.h

@@ -34,6 +34,7 @@ enum iommufd_object_type {
	IOMMUFD_OBJ_FAULT,
	IOMMUFD_OBJ_VIOMMU,
	IOMMUFD_OBJ_VDEVICE,
	IOMMUFD_OBJ_VEVENTQ,
#ifdef CONFIG_IOMMUFD_TEST
	IOMMUFD_OBJ_SELFTEST,
#endif
@@ -93,6 +94,8 @@ struct iommufd_viommu {
	const struct iommufd_viommu_ops *ops;

	struct xarray vdevs;
	struct list_head veventqs;
	struct rw_semaphore veventqs_rwsem;

	unsigned int type;
};

include/uapi/linux/iommufd.h

@@ -55,6 +55,7 @@ enum {
	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
};

/**
@@ -1014,4 +1015,85 @@ struct iommu_ioas_change_process {
#define IOMMU_IOAS_CHANGE_PROCESS \
	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)

/**
 * enum iommu_veventq_flag - flag for struct iommufd_vevent_header
 * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs
 */
enum iommu_veventq_flag {
	IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0),
};

/**
 * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
 * @flags: Combination of enum iommu_veventq_flag
 * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
 *            [0, INT_MAX] where the following index of INT_MAX is 0
 *
 * Each iommufd_vevent_header reports the sequence index of the vEVENT that
 * follows it:
 * -------------------------------------------------------------------------
 * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN |
 * -------------------------------------------------------------------------
 * This sequence index is expected to increase monotonically from one vEVENT
 * to the next. If two adjacent sequence indexes have a delta larger than 1,
 * delta - 1 vEVENTs have been lost, e.g. two lost vEVENTs:
 * -------------------------------------------------------------------------
 * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... |
 * -------------------------------------------------------------------------
 * If a vEVENT is lost at the tail of the vEVENTQ and there is no following
 * vEVENT providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS
 * header would be added to the tail, and no data would follow this header:
 * ---------------------------------------------------------------------------
 * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} |
 * ---------------------------------------------------------------------------
 */
struct iommufd_vevent_header {
	__u32 flags;
	__u32 sequence;
};
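
To make the delta rule concrete, a reader can recover the number of lost
events from two adjacent headers with the wrap-around arithmetic below; the
helper name is illustrative only, not part of the uAPI:

#include <limits.h>
#include <stdint.h>

/* Illustrative only: number of vEVENTs lost between two adjacent headers,
 * where sequence wraps from INT_MAX back to 0.
 */
static inline uint32_t vevent_lost_count(uint32_t prev_seq, uint32_t next_seq)
{
	/* delta == 1 means no loss; delta - 1 events were lost */
	return ((next_seq - prev_seq) & INT_MAX) - 1;
}

E.g. vevent_lost_count(3, 6) returns 2, matching the two-lost-vEVENTs diagram
above, and vevent_lost_count(INT_MAX, 0) returns 0 across the wrap.
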
/**
 * enum iommu_veventq_type - Virtual Event Queue Type
 * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
 */
enum iommu_veventq_type {
	IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
};

/**
 * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
 * @size: sizeof(struct iommu_veventq_alloc)
 * @flags: Must be 0
 * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
 * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
 * @veventq_depth: Maximum number of events in the vEVENTQ
 * @out_veventq_id: The ID of the new vEVENTQ
 * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
 *                  successfully returned fd after using it
 * @__reserved: Must be 0
 *
 * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
 * can have multiple FDs for different types, but is confined to one per @type.
 * User space should open @out_veventq_fd to read vEVENTs out of a vEVENTQ if
 * there are vEVENTs available. A vEVENTQ will lose events due to overflow if
 * the number of queued vEVENTs hits @veventq_depth.
 *
 * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
 * a type-specific data structure, in a normal case:
 * -------------------------------------------------------------
 * || header0 | data0 | header1 | data1 | ... | headerN | dataN ||
 * -------------------------------------------------------------
 * unless a tailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to
 * struct iommufd_vevent_header).
 */
struct iommu_veventq_alloc {
	__u32 size;
	__u32 flags;
	__u32 viommu_id;
	__u32 type;
	__u32 veventq_depth;
	__u32 out_veventq_id;
	__u32 out_veventq_fd;
	__u32 __reserved;
};
#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
#endif
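
As a usage note, a minimal user-space allocation sketch might look as follows;
the iommufd and viommu_id values and the driver-specific type are assumptions
from the caller's setup (IOMMU_VEVENTQ_TYPE_DEFAULT itself is rejected with
-EOPNOTSUPP):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/iommufd.h>

/* Sketch: allocate a vEVENTQ on an existing vIOMMU and return its fd */
static int veventq_alloc(int iommufd, uint32_t viommu_id, uint32_t type)
{
	struct iommu_veventq_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.type = type,        /* a driver-defined vEVENTQ type */
		.veventq_depth = 32, /* arbitrary example depth */
	};

	if (ioctl(iommufd, IOMMU_VEVENTQ_ALLOC, &cmd))
		return -1;
	/* cmd.out_veventq_id identifies the object; close the fd when done */
	return cmd.out_veventq_fd;
}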