author     Linus Torvalds <torvalds@linux-foundation.org>  2025-03-28 15:07:04 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-03-28 15:07:04 -0700
commit     eff5f16bfd87ae48c56751741af41a825d5d4618 (patch)
tree       9b1e58d1038902a754107b35621d428ba24f5165 /io_uring
parent     6df9d086ffcb6b0521872fef5f9f4dd1907abb9a (diff)
parent     6889ae1b4df1579bcdffef023e2ea9a982565dff (diff)
Merge tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
 "Final separate updates for io_uring. This started out as a series of
  cleanups and improvements for registered buffers, but as the last
  series of the io_uring changes for 6.15, it also collected a few fixes
  for the other branches on top:

   - Add support for vectored fixed/registered buffers. Previously only
     single segments were supported for commands; vectored variants are
     now supported as well. This series includes networking and file
     read/write support.

   - Small series unifying return codes across multishot and single-shot
     handling.

   - Small series cleaning up registered buffer importing.

   - Add support for vectored registered buffers for uring_cmd.

   - Fix for io-wq handling of command reissue.

   - Various little fixes and tweaks"

* tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux: (25 commits)
  io_uring/net: fix io_req_post_cqe abuse by send bundle
  io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc
  io_uring: move min_events sanitisation
  io_uring: rename "min" arg in io_iopoll_check()
  io_uring: open code __io_post_aux_cqe()
  io_uring: defer iowq cqe overflow via task_work
  io_uring: fix retry handling off iowq
  io_uring/net: only import send_zc buffer once
  io_uring/cmd: introduce io_uring_cmd_import_fixed_vec
  io_uring/cmd: add iovec cache for commands
  io_uring/cmd: don't expose entire cmd async data
  io_uring: rename the data cmd cache
  io_uring: rely on io_prep_reg_vec for iovec placement
  io_uring: introduce io_prep_reg_iovec()
  io_uring: unify STOP_MULTISHOT with IOU_OK
  io_uring: return -EAGAIN to continue multishot
  io_uring: cap cached iovec/bvec size
  io_uring/net: implement vectored reg bufs for zctx
  io_uring/net: convert to struct iou_vec
  io_uring/net: pull vec alloc out of msghdr import
  ...
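For context, a hedged userspace sketch (not part of this pull) of how the new IORING_OP_READV_FIXED opcode added by this series might be driven: the iovec array goes in sqe->addr, the segment count in sqe->len, and sqe->buf_index selects the registered buffer that every segment must fall within. It assumes 6.15+ uapi headers that define the opcode and uses only the generic liburing prep helper, since dedicated wrappers may not exist for this opcode yet.

/*
 * Hedged sketch: two-segment vectored read into a single registered
 * (fixed) buffer via IORING_OP_READV_FIXED. Error handling trimmed.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec reg_iov, iov[2];
	char *buf;
	int fd;

	fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0)
		return 1;

	/* One registered buffer; the vectored segments below must lie
	 * entirely inside it, per the import-time range checks. */
	if (posix_memalign((void **)&buf, 4096, 8192))
		return 1;
	reg_iov.iov_base = buf;
	reg_iov.iov_len = 8192;

	io_uring_queue_init(8, &ring, 0);
	io_uring_register_buffers(&ring, &reg_iov, 1);

	/* Two segments carved out of the single registered buffer. */
	iov[0].iov_base = buf;
	iov[0].iov_len = 16;
	iov[1].iov_base = buf + 4096;
	iov[1].iov_len = 16;

	sqe = io_uring_get_sqe(&ring);
	/* addr = iovec array, len = number of segments,
	 * buf_index = which registered buffer to import from. */
	io_uring_prep_rw(IORING_OP_READV_FIXED, sqe, fd, iov, 2, 0);
	sqe->buf_index = 0;

	io_uring_submit(&ring);
	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("readv_fixed returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}

Per io_vec_fill_bvec() in the rsrc.c hunk below, each segment must sit fully inside the registered buffer and the total transfer is capped at MAX_RW_COUNT.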
Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/alloc_cache.h    9
-rw-r--r--  io_uring/io_uring.c      65
-rw-r--r--  io_uring/io_uring.h      19
-rw-r--r--  io_uring/net.c          235
-rw-r--r--  io_uring/net.h            6
-rw-r--r--  io_uring/opdef.c         42
-rw-r--r--  io_uring/poll.c           5
-rw-r--r--  io_uring/rsrc.c         163
-rw-r--r--  io_uring/rsrc.h          24
-rw-r--r--  io_uring/rw.c            94
-rw-r--r--  io_uring/rw.h             6
-rw-r--r--  io_uring/uring_cmd.c     59
-rw-r--r--  io_uring/uring_cmd.h     17
13 files changed, 534 insertions, 210 deletions
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 7f68eff2e7f3..d33ce159ef33 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache,
void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp);
-static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr)
-{
- if (IS_ENABLED(CONFIG_KASAN)) {
- kfree(*iov);
- *iov = NULL;
- *nr = 0;
- }
-}
-
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
void *entry)
{
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f743581cc81b..3ba49c628337 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->apoll_cache, kfree);
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
- io_alloc_cache_free(&ctx->uring_cache, kfree);
+ io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free);
io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_rsrc_cache_free(ctx);
@@ -334,8 +334,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_async_rw),
offsetof(struct io_async_rw, clear));
- ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_uring_cmd_data), 0);
+ ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX,
+ sizeof(struct io_async_cmd),
+ sizeof(struct io_async_cmd));
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_kiocb), 0);
@@ -833,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
-static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
- u32 cflags)
+bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
+ io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
-
- return filled;
-}
-
-bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
-{
- bool filled;
-
- io_cq_lock(ctx);
- filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
io_cq_unlock_post(ctx);
return filled;
}
@@ -891,6 +882,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool completed = true;
/*
* All execution paths but io-wq use the deferred completions by
@@ -903,19 +895,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
* Handle special CQ sync cases via task_work. DEFER_TASKRUN requires
* the submitter task context, IOPOLL protects with uring_lock.
*/
- if (ctx->lockless_cq) {
+ if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) {
+defer_complete:
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req);
return;
}
io_cq_lock(ctx);
- if (!(req->flags & REQ_F_CQE_SKIP)) {
- if (!io_fill_cqe_req(ctx, req))
- io_req_cqe_overflow(req);
- }
+ if (!(req->flags & REQ_F_CQE_SKIP))
+ completed = io_fill_cqe_req(ctx, req);
io_cq_unlock_post(ctx);
+ if (!completed)
+ goto defer_complete;
+
/*
* We don't free the request here because we know it's called from
* io-wq only, which holds a reference, so it cannot be the last put.
@@ -1511,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
{
unsigned int nr_events = 0;
unsigned long check_cq;
+ min_events = min(min_events, ctx->cq_entries);
+
lockdep_assert_held(&ctx->uring_lock);
if (!io_allowed_run_tw(ctx))
@@ -1557,7 +1553,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
io_task_work_pending(ctx)) {
u32 tail = ctx->cached_cq_tail;
- (void) io_run_local_work_locked(ctx, min);
+ (void) io_run_local_work_locked(ctx, min_events);
if (task_work_pending(current) ||
wq_list_empty(&ctx->iopoll_list)) {
@@ -1570,7 +1566,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
wq_list_empty(&ctx->iopoll_list))
break;
}
- ret = io_do_iopoll(ctx, !min);
+ ret = io_do_iopoll(ctx, !min_events);
if (unlikely(ret < 0))
return ret;
@@ -1580,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
break;
nr_events += ret;
- } while (nr_events < min);
+ } while (nr_events < min_events);
return 0;
}
@@ -1791,10 +1787,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
- WARN_ON_ONCE(ret == IOU_OK);
-
- if (ret == IOU_ISSUE_SKIP_COMPLETE)
- ret = 0;
+ WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE);
return ret;
}
@@ -1847,7 +1840,7 @@ fail:
* Don't allow any multishot execution from io-wq. It's more restrictive
* than necessary and also cleaner.
*/
- if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) {
err = -EBADFD;
if (!io_file_can_poll(req))
goto fail;
@@ -1858,7 +1851,7 @@ fail:
goto fail;
return;
} else {
- req->flags &= ~REQ_F_APOLL_MULTISHOT;
+ req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT);
}
}
@@ -2549,6 +2542,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
ktime_t start_time;
int ret;
+ min_events = min_t(int, min_events, ctx->cq_entries);
+
if (!io_allowed_run_tw(ctx))
return -EEXIST;
if (io_local_work_pending(ctx))
@@ -3435,22 +3430,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
mutex_lock(&ctx->uring_lock);
iopoll_locked:
ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
+ if (likely(!ret2))
ret2 = io_iopoll_check(ctx, min_complete);
- }
mutex_unlock(&ctx->uring_lock);
} else {
struct ext_arg ext_arg = { .argsz = argsz };
ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
- if (likely(!ret2)) {
- min_complete = min(min_complete,
- ctx->cq_entries);
+ if (likely(!ret2))
ret2 = io_cqring_wait(ctx, min_complete, flags,
&ext_arg);
- }
}
if (!ret) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index b95dab77e32d..87f883130286 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -19,22 +19,25 @@
#endif
enum {
- IOU_OK = 0,
+ IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
+ IOU_COMPLETE = 0,
+
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
/*
+ * The request has more work to do and should be retried. io_uring will
+ * attempt to wait on the file for eligible opcodes, but otherwise
+ * it'll be handed to iowq for blocking execution. It works for normal
+ * requests as well as for the multi shot mode.
+ */
+ IOU_RETRY = -EAGAIN,
+
+ /*
* Requeue the task_work to restart operations on this request. The
* actual value isn't important, should just be not an otherwise
* valid error code, yet less than -MAX_ERRNO and valid internally.
*/
IOU_REQUEUE = -3072,
-
- /*
- * Intended only when both IO_URING_F_MULTISHOT is passed
- * to indicate to the poll runner that multishot should be
- * removed and the result is set on req->cqe.res.
- */
- IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_wait_queue {
diff --git a/io_uring/net.c b/io_uring/net.c
index 89cd45bacd7c..8944eb679024 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags)
static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg)
{
- if (kmsg->free_iov) {
- kfree(kmsg->free_iov);
- kmsg->free_iov_nr = 0;
- kmsg->free_iov = NULL;
- }
+ if (kmsg->vec.iovec)
+ io_vec_free(&kmsg->vec);
}
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
@@ -154,7 +151,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
}
/* Let normal cleanup path reap it if we fail adding to the cache */
- io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr);
+ io_alloc_cache_vec_kasan(&hdr->vec);
+ if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP)
+ io_vec_free(&hdr->vec);
+
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
@@ -171,7 +171,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
return NULL;
/* If the async data was cached, we might have an iov cached inside. */
- if (hdr->free_iov)
+ if (hdr->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
return hdr;
}
@@ -182,10 +182,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg
{
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
- kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs;
- if (kmsg->free_iov)
- kfree(kmsg->free_iov);
- kmsg->free_iov = iov;
+ io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs);
}
}
@@ -208,9 +205,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg
struct iovec *iov;
int ret, nr_segs;
- if (iomsg->free_iov) {
- nr_segs = iomsg->free_iov_nr;
- iov = iomsg->free_iov;
+ if (iomsg->vec.iovec) {
+ nr_segs = iomsg->vec.nr;
+ iov = iomsg->vec.iovec;
} else {
nr_segs = 1;
iov = &iomsg->fast_iov;
@@ -253,12 +250,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
-
- return 0;
}
-
- return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov,
- msg->msg_iovlen, ddir);
+ return 0;
}
static int io_copy_msghdr_from_user(struct user_msghdr *msg,
@@ -287,6 +280,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
struct user_msghdr __user *umsg = sr->umsg;
int ret;
+ iomsg->msg.msg_name = &iomsg->addr;
+ iomsg->msg.msg_iter.nr_segs = 0;
+
+ if (io_is_compat(req->ctx)) {
+ struct compat_msghdr cmsg;
+
+ ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr);
+ if (ret)
+ return ret;
+
+ memset(msg, 0, sizeof(*msg));
+ msg->msg_namelen = cmsg.msg_namelen;
+ msg->msg_controllen = cmsg.msg_controllen;
+ msg->msg_iov = compat_ptr(cmsg.msg_iov);
+ msg->msg_iovlen = cmsg.msg_iovlen;
+ return 0;
+ }
+
ret = io_copy_msghdr_from_user(msg, umsg);
if (unlikely(ret))
return ret;
@@ -310,10 +321,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
return -EFAULT;
sr->len = tmp_iov.iov_len;
}
- return 0;
}
-
- return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir);
+ return 0;
}
static int io_sendmsg_copy_hdr(struct io_kiocb *req,
@@ -323,19 +332,13 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
-
- if (io_is_compat(req->ctx)) {
- struct compat_msghdr cmsg;
-
- ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE,
- NULL);
- sr->msg_control = iomsg->msg.msg_control_user;
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
+ if (unlikely(ret))
return ret;
- }
- ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL);
+ if (!(req->flags & REQ_F_BUFFER_SELECT))
+ ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+ ITER_SOURCE);
/* save msg_control as sys_sendmsg() overwrites it */
sr->msg_control = iomsg->msg.msg_control_user;
return ret;
@@ -395,6 +398,27 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe
return io_sendmsg_copy_hdr(req, kmsg);
}
+static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+ struct io_async_msghdr *kmsg = req->async_data;
+ struct user_msghdr msg;
+ int ret;
+
+ if (!(sr->flags & IORING_RECVSEND_FIXED_BUF))
+ return io_sendmsg_setup(req, sqe);
+
+ sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+
+ ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL);
+ if (unlikely(ret))
+ return ret;
+ sr->msg_control = kmsg->msg.msg_control_user;
+ kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen;
+
+ return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen);
+}
+
#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -424,6 +448,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags |= MSG_WAITALL;
sr->buf_group = req->buf_index;
req->buf_list = NULL;
+ req->flags |= REQ_F_MULTISHOT;
}
if (io_is_compat(req->ctx))
@@ -461,7 +486,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
if (iter_is_ubuf(&kmsg->msg.msg_iter))
return 1;
- iov = kmsg->free_iov;
+ iov = kmsg->vec.iovec;
if (!iov)
iov = &kmsg->fast_iov;
@@ -577,9 +602,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.nr_iovs = 1,
};
- if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
- arg.iovs = kmsg->free_iov;
+ if (kmsg->vec.iovec) {
+ arg.nr_iovs = kmsg->vec.nr;
+ arg.iovs = kmsg->vec.iovec;
arg.mode = KBUF_MODE_FREE;
}
@@ -592,9 +617,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
if (unlikely(ret < 0))
return ret;
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
- kmsg->free_iov_nr = ret;
- kmsg->free_iov = arg.iovs;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
sr->len = arg.out_len;
@@ -709,23 +734,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct user_msghdr msg;
int ret;
- iomsg->msg.msg_name = &iomsg->addr;
- iomsg->msg.msg_iter.nr_segs = 0;
-
- if (io_is_compat(req->ctx)) {
- struct compat_msghdr cmsg;
-
- ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST,
- &iomsg->uaddr);
- memset(&msg, 0, sizeof(msg));
- msg.msg_namelen = cmsg.msg_namelen;
- msg.msg_controllen = cmsg.msg_controllen;
- } else {
- ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
- }
-
+ ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr);
if (unlikely(ret))
return ret;
+
+ if (!(req->flags & REQ_F_BUFFER_SELECT)) {
+ ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen,
+ ITER_DEST);
+ if (unlikely(ret))
+ return ret;
+ }
return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
msg.msg_controllen);
}
@@ -863,8 +881,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
*/
if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
- int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
-
+ *ret = IOU_RETRY;
io_mshot_prep_retry(req, kmsg);
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) {
@@ -872,23 +889,16 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
return false;
/* mshot retries exceeded, force a requeue */
sr->nr_multishot_loops = 0;
- mshot_retry_ret = IOU_REQUEUE;
+ if (issue_flags & IO_URING_F_MULTISHOT)
+ *ret = IOU_REQUEUE;
}
- if (issue_flags & IO_URING_F_MULTISHOT)
- *ret = mshot_retry_ret;
- else
- *ret = -EAGAIN;
return true;
}
/* Finish the request / stop multishot. */
finish:
io_req_set_res(req, *ret, cflags);
-
- if (issue_flags & IO_URING_F_MULTISHOT)
- *ret = IOU_STOP_MULTISHOT;
- else
- *ret = IOU_OK;
+ *ret = IOU_COMPLETE;
io_req_msg_cleanup(req, issue_flags);
return true;
}
@@ -1035,16 +1045,15 @@ retry_multishot:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- if (issue_flags & IO_URING_F_MULTISHOT) {
+ if (issue_flags & IO_URING_F_MULTISHOT)
io_kbuf_recycle(req, issue_flags);
- return IOU_ISSUE_SKIP_COMPLETE;
- }
- return -EAGAIN;
+
+ return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->done_io += ret;
req->flags |= REQ_F_BL_NO_RECYCLE;
- return -EAGAIN;
+ return IOU_RETRY;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -1085,9 +1094,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.mode = KBUF_MODE_EXPAND,
};
- if (kmsg->free_iov) {
- arg.nr_iovs = kmsg->free_iov_nr;
- arg.iovs = kmsg->free_iov;
+ if (kmsg->vec.iovec) {
+ arg.nr_iovs = kmsg->vec.nr;
+ arg.iovs = kmsg->vec.iovec;
arg.mode |= KBUF_MODE_FREE;
}
@@ -1106,9 +1115,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
- kmsg->free_iov_nr = ret;
- kmsg->free_iov = arg.iovs;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP;
}
} else {
@@ -1172,12 +1181,10 @@ retry_multishot:
ret = sock_recvmsg(sock, &kmsg->msg, flags);
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
- if (issue_flags & IO_URING_F_MULTISHOT) {
+ if (issue_flags & IO_URING_F_MULTISHOT)
io_kbuf_recycle(req, issue_flags);
- return IOU_ISSUE_SKIP_COMPLETE;
- }
- return -EAGAIN;
+ return IOU_RETRY;
}
if (ret > 0 && io_net_retry(sock, flags)) {
sr->len -= ret;
@@ -1260,9 +1267,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
if (len && zc->len == 0) {
io_req_set_res(req, 0, 0);
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_STOP_MULTISHOT;
- return IOU_OK;
+ return IOU_COMPLETE;
}
if (unlikely(ret <= 0) && ret != -EAGAIN) {
if (ret == -ERESTARTSYS)
@@ -1272,15 +1277,9 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
-
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_STOP_MULTISHOT;
- return IOU_OK;
+ return IOU_COMPLETE;
}
-
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_ISSUE_SKIP_COMPLETE;
- return -EAGAIN;
+ return IOU_RETRY;
}
void io_send_zc_cleanup(struct io_kiocb *req)
@@ -1339,8 +1338,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (req->opcode != IORING_OP_SEND_ZC) {
if (unlikely(sqe->addr2 || sqe->file_index))
return -EINVAL;
- if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
- return -EINVAL;
}
zc->len = READ_ONCE(sqe->len);
@@ -1354,9 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(!io_msg_alloc_async(req)))
return -ENOMEM;
- if (req->opcode != IORING_OP_SENDMSG_ZC)
+ if (req->opcode == IORING_OP_SEND_ZC) {
+ req->flags |= REQ_F_IMPORT_BUFFER;
return io_send_setup(req, sqe);
- return io_sendmsg_setup(req, sqe);
+ }
+ return io_sendmsg_zc_setup(req, sqe);
}
static int io_sg_from_iter_iovec(struct sk_buff *skb,
@@ -1454,7 +1453,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
(zc->flags & IORING_RECVSEND_POLL_FIRST))
return -EAGAIN;
- if (!zc->done_io) {
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
ret = io_send_zc_import(req, issue_flags);
if (unlikely(ret))
return ret;
@@ -1513,6 +1513,20 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
unsigned flags;
int ret, min_ret = 0;
+ kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
+
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs;
+ int ret;
+
+ ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req,
+ &kmsg->vec, uvec_segs, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ kmsg->msg.sg_from_iter = io_sg_from_iter;
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
+ }
+
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
@@ -1531,7 +1545,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
kmsg->msg.msg_control_user = sr->msg_control;
kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
- kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (unlikely(ret < min_ret)) {
@@ -1646,16 +1659,9 @@ retry:
put_unused_fd(fd);
ret = PTR_ERR(file);
if (ret == -EAGAIN && force_nonblock &&
- !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
- /*
- * if it's multishot and polled, we don't need to
- * return EAGAIN to arm the poll infra since it
- * has already been done
- */
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_ISSUE_SKIP_COMPLETE;
- return ret;
- }
+ !(accept->iou_flags & IORING_ACCEPT_DONTWAIT))
+ return IOU_RETRY;
+
if (ret == -ERESTARTSYS)
ret = -EINTR;
} else if (!fixed) {
@@ -1674,17 +1680,13 @@ retry:
io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1)
goto retry;
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_ISSUE_SKIP_COMPLETE;
- return -EAGAIN;
+ return IOU_RETRY;
}
io_req_set_res(req, ret, cflags);
if (ret < 0)
req_set_fail(req);
- if (!(issue_flags & IO_URING_F_MULTISHOT))
- return IOU_OK;
- return IOU_STOP_MULTISHOT;
+ return IOU_COMPLETE;
}
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1876,8 +1878,7 @@ void io_netmsg_cache_free(const void *entry)
{
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
- if (kmsg->free_iov)
- io_netmsg_iovec_free(kmsg);
+ io_vec_free(&kmsg->vec);
kfree(kmsg);
}
#endif
diff --git a/io_uring/net.h b/io_uring/net.h
index b804c2b36e60..43e5ce5416b7 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -2,12 +2,12 @@
#include <linux/net.h>
#include <linux/uio.h>
+#include <linux/io_uring_types.h>
struct io_async_msghdr {
#if defined(CONFIG_NET)
- struct iovec *free_iov;
- /* points to an allocated iov, if NULL we use fast_iov instead */
- int free_iov_nr;
+ struct iou_vec vec;
+
struct_group(clear,
int namelen;
struct iovec fast_iov;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index db77df513d55..489384c0438b 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -416,7 +416,7 @@ const struct io_issue_def io_issue_defs[] = {
.plug = 1,
.iopoll = 1,
.iopoll_queue = 1,
- .async_size = sizeof(struct io_uring_cmd_data),
+ .async_size = sizeof(struct io_async_cmd),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
},
@@ -540,6 +540,35 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_READV_FIXED] = {
+ .needs_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollin = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .vectored = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_readv_fixed,
+ .issue = io_read,
+ },
+ [IORING_OP_WRITEV_FIXED] = {
+ .needs_file = 1,
+ .hash_reg_file = 1,
+ .unbound_nonreg_file = 1,
+ .pollout = 1,
+ .plug = 1,
+ .audit_skip = 1,
+ .ioprio = 1,
+ .iopoll = 1,
+ .iopoll_queue = 1,
+ .vectored = 1,
+ .async_size = sizeof(struct io_async_rw),
+ .prep = io_prep_writev_fixed,
+ .issue = io_write,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -726,6 +755,7 @@ const struct io_cold_def io_cold_defs[] = {
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
+ .cleanup = io_uring_cmd_cleanup,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
@@ -775,6 +805,16 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_EPOLL_WAIT] = {
.name = "EPOLL_WAIT",
},
+ [IORING_OP_READV_FIXED] = {
+ .name = "READV_FIXED",
+ .cleanup = io_readv_writev_cleanup,
+ .fail = io_rw_fail,
+ },
+ [IORING_OP_WRITEV_FIXED] = {
+ .name = "WRITEV_FIXED",
+ .cleanup = io_readv_writev_cleanup,
+ .fail = io_rw_fail,
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 176854882ba6..8eb744eb9f4c 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -289,11 +289,12 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
}
} else {
int ret = io_poll_issue(req, tw);
- if (ret == IOU_STOP_MULTISHOT)
+
+ if (ret == IOU_COMPLETE)
return IOU_POLL_REMOVE_POLL_USE_RES;
else if (ret == IOU_REQUEUE)
return IOU_POLL_REQUEUE;
- if (ret < 0)
+ if (ret != IOU_RETRY && ret < 0)
return ret;
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a59563fbb4ad..3f195e24777e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1262,3 +1262,166 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
fput(file);
return ret;
}
+
+void io_vec_free(struct iou_vec *iv)
+{
+ if (!iv->iovec)
+ return;
+ kfree(iv->iovec);
+ iv->iovec = NULL;
+ iv->nr = 0;
+}
+
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries)
+{
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ struct iovec *iov;
+
+ iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);
+ if (!iov)
+ return -ENOMEM;
+
+ io_vec_free(iv);
+ iv->iovec = iov;
+ iv->nr = nr_entries;
+ return 0;
+}
+
+static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
+ struct io_mapped_ubuf *imu,
+ struct iovec *iovec, unsigned nr_iovs,
+ struct iou_vec *vec)
+{
+ unsigned long folio_size = 1 << imu->folio_shift;
+ unsigned long folio_mask = folio_size - 1;
+ u64 folio_addr = imu->ubuf & ~folio_mask;
+ struct bio_vec *res_bvec = vec->bvec;
+ size_t total_len = 0;
+ unsigned bvec_idx = 0;
+ unsigned iov_idx;
+
+ for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) {
+ size_t iov_len = iovec[iov_idx].iov_len;
+ u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base;
+ struct bio_vec *src_bvec;
+ size_t offset;
+ u64 buf_end;
+
+ if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end)))
+ return -EFAULT;
+ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
+ return -EFAULT;
+ if (unlikely(!iov_len))
+ return -EFAULT;
+ if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
+ return -EOVERFLOW;
+
+ /* by using folio address it also accounts for bvec offset */
+ offset = buf_addr - folio_addr;
+ src_bvec = imu->bvec + (offset >> imu->folio_shift);
+ offset &= folio_mask;
+
+ for (; iov_len; offset = 0, bvec_idx++, src_bvec++) {
+ size_t seg_size = min_t(size_t, iov_len,
+ folio_size - offset);
+
+ bvec_set_page(&res_bvec[bvec_idx],
+ src_bvec->bv_page, seg_size, offset);
+ iov_len -= seg_size;
+ }
+ }
+ if (total_len > MAX_RW_COUNT)
+ return -EINVAL;
+
+ iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len);
+ return 0;
+}
+
+static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs,
+ struct io_mapped_ubuf *imu)
+{
+ unsigned shift = imu->folio_shift;
+ size_t max_segs = 0;
+ unsigned i;
+
+ for (i = 0; i < nr_iovs; i++)
+ max_segs += (iov[i].iov_len >> shift) + 2;
+ return max_segs;
+}
+
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+ struct io_kiocb *req, struct iou_vec *vec,
+ unsigned nr_iovs, unsigned issue_flags)
+{
+ struct io_rsrc_node *node;
+ struct io_mapped_ubuf *imu;
+ unsigned iovec_off;
+ struct iovec *iov;
+ unsigned nr_segs;
+
+ node = io_find_buf_node(req, issue_flags);
+ if (!node)
+ return -EFAULT;
+ imu = node->buf;
+ if (imu->is_kbuf)
+ return -EOPNOTSUPP;
+ if (!(imu->dir & (1 << ddir)))
+ return -EFAULT;
+
+ iovec_off = vec->nr - nr_iovs;
+ iov = vec->iovec + iovec_off;
+ nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu);
+
+ if (sizeof(struct bio_vec) > sizeof(struct iovec)) {
+ size_t bvec_bytes;
+
+ bvec_bytes = nr_segs * sizeof(struct bio_vec);
+ nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov);
+ nr_segs += nr_iovs;
+ }
+
+ if (nr_segs > vec->nr) {
+ struct iou_vec tmp_vec = {};
+ int ret;
+
+ ret = io_vec_realloc(&tmp_vec, nr_segs);
+ if (ret)
+ return ret;
+
+ iovec_off = tmp_vec.nr - nr_iovs;
+ memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs);
+ io_vec_free(vec);
+
+ *vec = tmp_vec;
+ iov = vec->iovec + iovec_off;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+
+ return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec);
+}
+
+int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
+ const struct iovec __user *uvec, size_t uvec_segs)
+{
+ struct iovec *iov;
+ int iovec_off, ret;
+ void *res;
+
+ if (uvec_segs > iv->nr) {
+ ret = io_vec_realloc(iv, uvec_segs);
+ if (ret)
+ return ret;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+
+ /* pad iovec to the right */
+ iovec_off = iv->nr - uvec_segs;
+ iov = iv->iovec + iovec_off;
+ res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov,
+ io_is_compat(req->ctx));
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ req->flags |= REQ_F_IMPORT_BUFFER;
+ return 0;
+}
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 284e300e63fb..b52242852ff3 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -5,6 +5,8 @@
#include <linux/io_uring_types.h>
#include <linux/lockdep.h>
+#define IO_VEC_CACHE_SOFT_CAP 256
+
enum {
IORING_RSRC_FILE = 0,
IORING_RSRC_BUFFER = 1,
@@ -61,6 +63,11 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
u64 buf_addr, size_t len, int ddir,
unsigned issue_flags);
+int io_import_reg_vec(int ddir, struct iov_iter *iter,
+ struct io_kiocb *req, struct iou_vec *vec,
+ unsigned nr_iovs, unsigned issue_flags);
+int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv,
+ const struct iovec __user *uvec, size_t uvec_segs);
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
@@ -145,4 +152,21 @@ static inline void __io_unaccount_mem(struct user_struct *user,
atomic_long_sub(nr_pages, &user->locked_vm);
}
+void io_vec_free(struct iou_vec *iv);
+int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries);
+
+static inline void io_vec_reset_iovec(struct iou_vec *iv,
+ struct iovec *iovec, unsigned nr)
+{
+ io_vec_free(iv);
+ iv->iovec = iovec;
+ iv->nr = nr;
+}
+
+static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv)
+{
+ if (IS_ENABLED(CONFIG_KASAN))
+ io_vec_free(iv);
+}
+
#endif
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 7c2f5f70a2c5..039e063f7091 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
int ret, nr_segs;
struct iovec *iov;
- if (io->free_iovec) {
- nr_segs = io->free_iov_nr;
- iov = io->free_iovec;
+ if (io->vec.iovec) {
+ nr_segs = io->vec.nr;
+ iov = io->vec.iovec;
} else {
nr_segs = 1;
iov = &io->fast_iov;
@@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req,
return ret;
if (iov) {
req->flags |= REQ_F_NEED_CLEANUP;
- io->free_iov_nr = io->iter.nr_segs;
- kfree(io->free_iovec);
- io->free_iovec = iov;
+ io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs);
}
return 0;
}
@@ -151,7 +149,10 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
return;
- io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr);
+ io_alloc_cache_vec_kasan(&rw->vec);
+ if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP)
+ io_vec_free(&rw->vec);
+
if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
@@ -201,7 +202,7 @@ static int io_rw_alloc_async(struct io_kiocb *req)
rw = io_uring_alloc_async_data(&ctx->rw_cache, req);
if (!rw)
return -ENOMEM;
- if (rw->free_iovec)
+ if (rw->vec.iovec)
req->flags |= REQ_F_NEED_CLEANUP;
rw->bytes_done = 0;
return 0;
@@ -383,6 +384,53 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return __io_prep_rw(req, sqe, ITER_SOURCE);
}
+static int io_rw_import_reg_vec(struct io_kiocb *req,
+ struct io_async_rw *io,
+ int ddir, unsigned int issue_flags)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ unsigned uvec_segs = rw->len;
+ int ret;
+
+ ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec,
+ uvec_segs, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ iov_iter_save_state(&io->iter, &io->iter_state);
+ req->flags &= ~REQ_F_IMPORT_BUFFER;
+ return 0;
+}
+
+static int io_rw_prep_reg_vec(struct io_kiocb *req)
+{
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io = req->async_data;
+ const struct iovec __user *uvec;
+
+ uvec = u64_to_user_ptr(rw->addr);
+ return io_prep_reg_iovec(req, &io->vec, uvec, rw->len);
+}
+
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = __io_prep_rw(req, sqe, ITER_DEST);
+ if (unlikely(ret))
+ return ret;
+ return io_rw_prep_reg_vec(req);
+}
+
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ int ret;
+
+ ret = __io_prep_rw(req, sqe, ITER_SOURCE);
+ if (unlikely(ret))
+ return ret;
+ return io_rw_prep_reg_vec(req);
+}
+
/*
* Multishot read is prepared just like a normal read/write request, only
* difference is that we set the MULTISHOT flag.
@@ -856,7 +904,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret;
loff_t *ppos;
- if (io_do_buffer_select(req)) {
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ } else if (io_do_buffer_select(req)) {
ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags);
if (unlikely(ret < 0))
return ret;
@@ -995,9 +1047,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
*/
if (io_kbuf_recycle(req, issue_flags))
rw->len = 0;
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_ISSUE_SKIP_COMPLETE;
- return -EAGAIN;
+ return IOU_RETRY;
} else if (ret <= 0) {
io_kbuf_recycle(req, issue_flags);
if (ret < 0)
@@ -1015,16 +1065,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
rw->len = 0; /* similarly to above, reset len to 0 */
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
- if (issue_flags & IO_URING_F_MULTISHOT) {
+ if (issue_flags & IO_URING_F_MULTISHOT)
/*
* Force retry, as we might have more data to
* be read and otherwise it won't get retried
* until (if ever) another poll is triggered.
*/
io_poll_multishot_retry(req);
- return IOU_ISSUE_SKIP_COMPLETE;
- }
- return -EAGAIN;
+
+ return IOU_RETRY;
}
}
@@ -1034,9 +1083,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
*/
io_req_set_res(req, ret, cflags);
io_req_rw_cleanup(req, issue_flags);
- if (issue_flags & IO_URING_F_MULTISHOT)
- return IOU_STOP_MULTISHOT;
- return IOU_OK;
+ return IOU_COMPLETE;
}
static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
@@ -1067,6 +1114,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
ssize_t ret, ret2;
loff_t *ppos;
+ if (req->flags & REQ_F_IMPORT_BUFFER) {
+ ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags);
+ if (unlikely(ret))
+ return ret;
+ }
+
ret = io_rw_init_file(req, FMODE_WRITE, WRITE);
if (unlikely(ret))
return ret;
@@ -1326,7 +1379,6 @@ void io_rw_cache_free(const void *entry)
{
struct io_async_rw *rw = (struct io_async_rw *) entry;
- if (rw->free_iovec)
- kfree(rw->free_iovec);
+ io_vec_free(&rw->vec);
kfree(rw);
}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index bf121b81ebe8..81d6d9a8cf69 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -9,13 +9,13 @@ struct io_meta_state {
};
struct io_async_rw {
+ struct iou_vec vec;
size_t bytes_done;
- struct iovec *free_iovec;
+
struct_group(clear,
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
- int free_iov_nr;
/*
* wpq is for buffered io, while meta fields are used with
* direct io
@@ -32,6 +32,8 @@ struct io_async_rw {
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index de39b602aa82..f2cfc371f3d0 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -16,10 +16,19 @@
#include "rsrc.h"
#include "uring_cmd.h"
+void io_cmd_cache_free(const void *entry)
+{
+ struct io_async_cmd *ac = (struct io_async_cmd *)entry;
+
+ io_vec_free(&ac->vec);
+ kfree(ac);
+}
+
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- struct io_uring_cmd_data *cache = req->async_data;
+ struct io_async_cmd *ac = req->async_data;
+ struct io_uring_cmd_data *cache = &ac->data;
if (cache->op_data) {
kfree(cache->op_data);
@@ -28,13 +37,23 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
if (issue_flags & IO_URING_F_UNLOCKED)
return;
- if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
+
+ io_alloc_cache_vec_kasan(&ac->vec);
+ if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP)
+ io_vec_free(&ac->vec);
+
+ if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) {
ioucmd->sqe = NULL;
req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
+ req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
}
}
+void io_uring_cmd_cleanup(struct io_kiocb *req)
+{
+ io_req_uring_cleanup(req, 0);
+}
+
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all)
{
@@ -169,12 +188,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
- struct io_uring_cmd_data *cache;
+ struct io_async_cmd *ac;
+
+ /* see io_uring_cmd_get_async_data() */
+ BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0);
- cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req);
- if (!cache)
+ ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req);
+ if (!ac)
return -ENOMEM;
- cache->op_data = NULL;
+ ac->data.op_data = NULL;
/*
* Unconditionally cache the SQE for now - this is only needed for
@@ -183,8 +205,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req,
* that it doesn't read in per-op data, play it safe and ensure that
* any SQE data is stable beyond prep. This can later get relaxed.
*/
- memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx));
- ioucmd->sqe = cache->sqes;
+ memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx));
+ ioucmd->sqe = ac->data.sqes;
return 0;
}
@@ -255,6 +277,25 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
+int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
+ const struct iovec __user *uvec,
+ size_t uvec_segs,
+ int ddir, struct iov_iter *iter,
+ unsigned issue_flags)
+{
+ struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ struct io_async_cmd *ac = req->async_data;
+ int ret;
+
+ ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs);
+ if (ret)
+ return ret;
+
+ return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs,
+ issue_flags);
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec);
+
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index f6837ee0955b..14e525255854 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,7 +1,24 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/io_uring/cmd.h>
+#include <linux/io_uring_types.h>
+
+struct io_async_cmd {
+ struct io_uring_cmd_data data;
+ struct iou_vec vec;
+};
+
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+void io_uring_cmd_cleanup(struct io_kiocb *req);
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
+
+void io_cmd_cache_free(const void *entry);
+
+int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
+ const struct iovec __user *uvec,
+ size_t uvec_segs,
+ int ddir, struct iov_iter *iter,
+ unsigned issue_flags);
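
As a closing illustration, a hedged kernel-side sketch of how a driver's ->uring_cmd() handler might consume the new io_uring_cmd_import_fixed_vec() helper exported above. The demo_cmd layout and the handler itself are hypothetical, not any real driver's ABI; only the helper's signature is taken from the patch.

/*
 * Hedged sketch: resolve user iovecs against the request's registered
 * buffer (sqe->buf_index) and build a bvec-backed iov_iter for I/O.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/uio.h>
#include <linux/io_uring/cmd.h>

struct demo_cmd {		/* hypothetical payload in the SQE cmd area */
	__u64 uvec;		/* user pointer to a struct iovec array */
	__u32 nr_segs;
	__u32 __pad;
};

static int demo_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	const struct demo_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct iov_iter iter;
	int ret;

	ret = io_uring_cmd_import_fixed_vec(ioucmd,
					     u64_to_user_ptr(cmd->uvec),
					     cmd->nr_segs, ITER_DEST, &iter,
					     issue_flags);
	if (ret)
		return ret;

	/* ... feed 'iter' into the device's copy/DMA path ... */
	return 0;
}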