io_uring: add support for kernel registered bvecs
Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO.

User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it.

Signed-off-by: Keith Busch <kbusch@kernel.org>
Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent 99fde895ff · commit 27cb27b6d5
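As context for the commit message above: the user-space half of this contract can be set up with liburing. A minimal sketch, assuming liburing's io_uring_queue_init() and io_uring_register_buffers_sparse() (both existing liburing APIs; the helper name, queue depth, and table size are illustrative, and this code is not part of the commit):

#include <liburing.h>

/* Illustrative helper: create a ring whose fixed buffer table starts empty,
 * leaving the slots for a kernel driver to fill via io_buffer_register_bvec().
 */
static int setup_kernel_buf_table(struct io_uring *ring)
{
	int ret;

	ret = io_uring_queue_init(8, ring, 0);
	if (ret)
		return ret;

	/* Register 8 empty slots; no user memory is pinned at this point. */
	return io_uring_register_buffers_sparse(ring, 8);
}

Once a driver has populated a slot, user space references it by index with the fixed-buffer opcodes; see the consumption sketch at the end of this page.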
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -4,6 +4,7 @@
 #include <uapi/linux/io_uring.h>
 #include <linux/io_uring_types.h>
+#include <linux/blk-mq.h>
 
 /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
 #define IORING_URING_CMD_CANCELABLE	(1U << 30)
 
@@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_uring_cmd *cmd)
 	return cmd_to_io_kiocb(cmd)->async_data;
 }
 
+int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
+			    void (*release)(void *), unsigned int index,
+			    unsigned int issue_flags);
+void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
+			       unsigned int issue_flags);
+
 #endif /* _LINUX_IO_URING_CMD_H */
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3940,6 +3940,9 @@ static int __init io_uring_init(void)
 
 	io_uring_optable_init();
 
+	/* imu->dir is u8 */
+	BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX);
+
 	/*
 	 * Allow user copy in the per-command field, which starts after the
 	 * file in io_kiocb and until the opcode field. The openat2 handling
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -9,6 +9,7 @@
 #include <linux/hugetlb.h>
 #include <linux/compat.h>
 #include <linux/io_uring.h>
+#include <linux/io_uring/cmd.h>
 
 #include <uapi/linux/io_uring.h>
 
@@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov)
 	return 0;
 }
 
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
+static void io_release_ubuf(void *priv)
 {
-	struct io_mapped_ubuf *imu = node->buf;
+	struct io_mapped_ubuf *imu = priv;
 	unsigned int i;
 
-	if (!refcount_dec_and_test(&imu->refs))
-		return;
 	for (i = 0; i < imu->nr_bvecs; i++)
 		unpin_user_page(imu->bvec[i].bv_page);
+}
+
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu)
+{
+	if (!refcount_dec_and_test(&imu->refs))
+		return;
+
 	if (imu->acct_pages)
 		io_unaccount_mem(ctx, imu->acct_pages);
+	imu->release(imu->priv);
 	kvfree(imu);
 }
 
@@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 		break;
 	case IORING_RSRC_BUFFER:
 		if (node->buf)
-			io_buffer_unmap(ctx, node);
+			io_buffer_unmap(ctx, node->buf);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	imu->len = iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
+	imu->release = io_release_ubuf;
+	imu->priv = imu;
+	imu->is_kbuf = false;
+	imu->dir = IO_IMU_DEST | IO_IMU_SOURCE;
 	if (coalesced)
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
@@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
+int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
+			    void (*release)(void *), unsigned int index,
+			    unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct req_iterator rq_iter;
+	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
+	struct bio_vec bv, *bvec;
+	u16 nr_bvecs;
+	int ret = 0;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	if (index >= data->nr) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	index = array_index_nospec(index, data->nr);
+
+	if (data->nodes[index]) {
+		ret = -EBUSY;
+		goto unlock;
+	}
+
+	node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
+	if (!node) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	nr_bvecs = blk_rq_nr_phys_segments(rq);
+	imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL);
+	if (!imu) {
+		kfree(node);
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	imu->ubuf = 0;
+	imu->len = blk_rq_bytes(rq);
+	imu->acct_pages = 0;
+	imu->folio_shift = PAGE_SHIFT;
+	imu->nr_bvecs = nr_bvecs;
+	refcount_set(&imu->refs, 1);
+	imu->release = release;
+	imu->priv = rq;
+	imu->is_kbuf = true;
+
+	if (op_is_write(req_op(rq)))
+		imu->dir = IO_IMU_SOURCE;
+	else
+		imu->dir = IO_IMU_DEST;
+
+	bvec = imu->bvec;
+	rq_for_each_bvec(bv, rq, rq_iter)
+		*bvec++ = bv;
+
+	node->buf = imu;
+	data->nodes[index] = node;
+unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(io_buffer_register_bvec);
+
+void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index,
+			       unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx;
+	struct io_rsrc_data *data = &ctx->buf_table;
+	struct io_rsrc_node *node;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	if (index >= data->nr)
+		goto unlock;
+	index = array_index_nospec(index, data->nr);
+
+	node = data->nodes[index];
+	if (!node || !node->buf->is_kbuf)
+		goto unlock;
+
+	io_put_rsrc_node(ctx, node);
+	data->nodes[index] = NULL;
+unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec);
+
 static int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
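To show how a driver would consume the two exports above, here is a hedged sketch (all my_drv_* names are hypothetical; blk_mq_end_request() is a real block-layer API; this code is not part of the commit): the ->uring_cmd() path donates a request's bvecs to the ring's buffer table, and the release callback, invoked from io_buffer_unmap() once the final reference drops, completes the request.

/* Hypothetical driver glue, not part of this commit. */
static void my_drv_buf_release(void *priv)
{
	struct request *rq = priv;

	/* Reached via imu->release() when the last buffer reference drops. */
	blk_mq_end_request(rq, BLK_STS_OK);
}

static int my_drv_register_rq(struct io_uring_cmd *cmd, struct request *rq,
			      unsigned int index, unsigned int issue_flags)
{
	/* On success, rq must stay alive until my_drv_buf_release() runs. */
	return io_buffer_register_bvec(cmd, rq, my_drv_buf_release, index,
				       issue_flags);
}

Note that io_buffer_unregister_bvec() only drops the table's reference; in-flight IO against the buffer holds its own node references, so release() fires only after the last user is done.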
@@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	/* not inside the mapped region */
 	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
 		return -EFAULT;
+	if (!(imu->dir & (1 << ddir)))
+		return -EFAULT;
 
 	/*
 	 * Might not be a start of buffer, set size appropriately
@@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	/*
 	 * Don't use iov_iter_advance() here, as it's really slow for
 	 * using the latter parts of a big fixed buffer - it iterates
-	 * over each segment manually. We can cheat a bit here, because
-	 * we know that:
+	 * over each segment manually. We can cheat a bit here for user
+	 * registered nodes, because we know that:
 	 *
 	 * 1) it's a BVEC iter, we set it up
 	 * 2) all bvecs are the same in size, except potentially the
@@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
 	 */
 	const struct bio_vec *bvec = imu->bvec;
 
+	/*
+	 * Kernel buffer bvecs, on the other hand, don't necessarily
+	 * have the size property of user registered ones, so we have
+	 * to use the slow iter advance.
+	 */
 	if (offset < bvec->bv_len) {
 		iter->iov_offset = offset;
+	} else if (imu->is_kbuf) {
+		iov_iter_advance(iter, offset);
 	} else {
 		unsigned long seg_skip;
 
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -20,6 +20,11 @@ struct io_rsrc_node {
 	};
 };
 
+enum {
+	IO_IMU_DEST	= 1 << ITER_DEST,
+	IO_IMU_SOURCE	= 1 << ITER_SOURCE,
+};
+
 struct io_mapped_ubuf {
 	u64		ubuf;
 	unsigned int	len;
@@ -27,6 +32,10 @@ struct io_mapped_ubuf {
 	unsigned int	folio_shift;
 	refcount_t	refs;
 	unsigned long	acct_pages;
+	void		(*release)(void *);
+	void		*priv;
+	bool		is_kbuf;
+	u8		dir;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
  */
 static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 {
+	struct io_kiocb *req = cmd_to_io_kiocb(rw);
 	struct kiocb *kiocb = &rw->kiocb;
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
@@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 		if ((kiocb->ki_flags & IOCB_NOWAIT) &&
 		    !(kiocb->ki_filp->f_flags & O_NONBLOCK))
 			return -EAGAIN;
+		if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf)
+			return -EFAULT;
 
 		ppos = io_kiocb_ppos(kiocb);
 
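Finally, a hedged sketch of the zero-copy consumption path the commit message promises (liburing; the fd, length, and slot index are illustrative, and this code is not part of the commit). Because a kernel-registered buffer has imu->ubuf == 0, the address passed to a fixed-buffer opcode acts as a byte offset into the request's data:

/* Illustrative: read from fd into the kernel-registered buffer in slot 0. */
static int read_into_kbuf(struct io_uring *ring, int fd, unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	/* addr 0 == offset 0 within the registered bvecs (imu->ubuf is 0) */
	io_uring_prep_read_fixed(sqe, fd, (void *)0, len, 0, /*buf_index=*/0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}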