Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd
Pull iommufd updates from Jason Gunthorpe:
"This brings three new iommufd capabilities:
- Dirty tracking for DMA.
AMD/ARM/Intel CPUs can now record if a DMA writes to a page in the
IOPTEs within the IO page table. This can be used to generate a
record of what memory is being dirtied by DMA activities during a
VM migration process. A VMM like qemu will combine the IOMMU dirty
bits with the CPU's dirty log to determine what memory to transfer.
VFIO already has a DMA dirty tracking framework that requires PCI
devices to implement tracking HW internally. The iommufd version
provides an alternative that the VMM can select, if available. The
two are designed to have very similar APIs.
- Userspace controlled attributes for hardware page tables
(HWPT/iommu_domain). There are currently a few generic attributes
for HWPTs (support dirty tracking, and parent of a nest). This is
an entry point for the userspace iommu driver to control the HW in
detail.
- Nested translation support for HWPTs. This is a 2D translation
scheme similar to the CPU where a DMA goes through a first stage to
determine an intermediate address which is then translated through a
second stage to a physical address.
Like for CPU translation the first stage table would exist in VM
controlled memory and the second stage is in the kernel and matches
the VM's guest to physical map.
As every IOMMU has a unique set of parameters to describe the S1 IO
page table and its associated parameters the userspace IOMMU driver
has to marshal the information into the correct format.
This is 1/3 of the feature: it allows creating the nested
translation and binding it to VFIO devices; however, the APIs to
support IOTLB and ATC invalidation of the stage 1 IO page table,
and forwarding of IO faults, are still in progress.
The series includes AMD and Intel support for dirty tracking, and
Intel support for nested translation.
Along the way are a number of internal items:
- New iommu core items: ops->domain_alloc_user(),
ops->set_dirty_tracking, ops->read_and_clear_dirty(),
IOMMU_DOMAIN_NESTED, and iommu_copy_struct_from_user
- UAF fix in iopt_area_split()
- Spelling fixes and some test suite improvement"
* tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/git/jgg/iommufd: (52 commits)
iommufd: Organize the mock domain alloc functions closer to Joerg's tree
iommufd/selftest: Fix page-size check in iommufd_test_dirty()
iommufd: Add iopt_area_alloc()
iommufd: Fix missing update of domains_itree after splitting iopt_area
iommu/vt-d: Disallow read-only mappings to nest parent domain
iommu/vt-d: Add nested domain allocation
iommu/vt-d: Set the nested domain to a device
iommu/vt-d: Make domain attach helpers to be extern
iommu/vt-d: Add helper to setup pasid nested translation
iommu/vt-d: Add helper for nested domain allocation
iommu/vt-d: Extend dmar_domain to support nested domain
iommufd: Add data structure for Intel VT-d stage-1 domain allocation
iommu/vt-d: Enhance capability check for nested parent domain allocation
iommufd/selftest: Add coverage for IOMMU_HWPT_ALLOC with nested HWPTs
iommufd/selftest: Add nested domain allocation for mock domain
iommu: Add iommu_copy_struct_from_user helper
iommufd: Add a nested HW pagetable object
iommu: Pass in parent domain with user_data to domain_alloc_user op
iommufd: Share iommufd_hwpt_alloc with IOMMUFD_OBJ_HWPT_NESTED
iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable
...
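The dirty-tracking capability described in the log above is driven from userspace through the new iommufd ioctls (IOMMU_HWPT_SET_DIRTY_TRACKING and IOMMU_HWPT_GET_DIRTY_BITMAP). The sketch below is purely illustrative and is not part of the mirrored source that follows: the helper functions are hypothetical, and the struct and field names are quoted from memory of include/uapi/linux/iommufd.h in this series and should be verified against the installed header.

/* Illustrative VMM-side use of the iommufd dirty tracking uAPI.
 * Field names below are assumptions taken from <linux/iommufd.h>
 * in this kernel series; check them against the real header.
 */
#include <linux/iommufd.h>
#include <sys/ioctl.h>
#include <stdint.h>

static int hwpt_dirty_start(int iommufd, uint32_t hwpt_id)
{
	struct iommu_hwpt_set_dirty_tracking cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_DIRTY_TRACKING_ENABLE,
		.hwpt_id = hwpt_id,
	};

	/* Turn on IOPTE dirty recording for the whole HWPT */
	return ioctl(iommufd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd);
}

static int hwpt_read_dirty(int iommufd, uint32_t hwpt_id, uint64_t iova,
			   uint64_t length, uint64_t page_size, void *bitmap)
{
	struct iommu_hwpt_get_dirty_bitmap cmd = {
		.size = sizeof(cmd),
		.hwpt_id = hwpt_id,
		.iova = iova,
		.length = length,
		.page_size = page_size,
		.data = (uintptr_t)bitmap,
	};

	/* Read-and-clear the IOMMU dirty bits for [iova, iova + length);
	 * the VMM then ORs this bitmap into the CPU dirty log before
	 * deciding which pages to resend. */
	return ioctl(iommufd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd);
}
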
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we may expect having the data always
	 * on first chunk
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from firmware perspective at least 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for stop_copy size that might grow by 10 percents */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

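/*
 * A typical userspace consumer of the save FD above treats it as a stream:
 * it poll()s for EPOLLIN and read()s sequentially, and during PRE_COPY it
 * may issue the VFIO_MIG_GET_PRECOPY_INFO ioctl (served by
 * mlx5vf_precopy_ioctl()) to learn how many initial/dirty bytes are still
 * pending. A read() failing with ENOMSG marks a temporary end of stream
 * while in PRE_COPY, whereas reaching MLX5_MIGF_STATE_COMPLETE means the
 * final STOP_COPY data has been delivered.
 */
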
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_LOAD_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

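/*
 * Layout of the device-state stream parsed above (derived from the header
 * fields used in mlx5vf_resume_read_header()): each record starts with a
 * struct mlx5_vf_migration_header carrying a little-endian record_size,
 * flags and tag, followed by record_size bytes of payload. FW_DATA records
 * carry the device image itself, STOP_COPY_SIZE records advertise the
 * expected stop-copy size, and unknown tags are skipped only when the
 * MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL flag is set.
 */
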
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header[0] = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf[0]);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

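/*
 * Typical flow driving the arcs above: userspace requests a migration state
 * change with the VFIO_DEVICE_FEATURE ioctl using
 * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE, e.g. RUNNING -> PRE_COPY to obtain
 * a save FD for early iteration, then STOP_COPY for the final image, and on
 * the destination RESUMING to obtain a write FD served by
 * mlx5vf_resume_write(). vfio_mig_get_next_state() splits such requests into
 * the single-step arcs handled by mlx5vf_pci_step_device_state_locked().
 */
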
static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");