author    Linus Torvalds <torvalds@linux-foundation.org>  2025-03-28 15:07:04 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-03-28 15:07:04 -0700
commit    eff5f16bfd87ae48c56751741af41a825d5d4618
tree      9b1e58d1038902a754107b35621d428ba24f5165 /io_uring
parent    6df9d086ffcb6b0521872fef5f9f4dd1907abb9a
parent    6889ae1b4df1579bcdffef023e2ea9a982565dff
Merge tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux
Pull more io_uring updates from Jens Axboe:
"Final separate updates for io_uring.
This started out as a series of cleanups and improvements
for registered buffers, but as the last series of the io_uring changes
for 6.15, it also collected a few fixes for the other branches on top:
- Add support for vectored fixed/registered buffers.
Previously only single segments were supported for commands;
now vectored variants are supported as well. This series includes
networking and file read/write support.
- Small series unifying return codes across multi and single shot.
- Small series cleaning up registered buffer importing.
- Adding support for vectored registered buffers for uring_cmd.
- Fix for io-wq handling of command reissue.
- Various little fixes and tweaks"
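For reference, a minimal userspace sketch of the new vectored fixed-buffer read path. It assumes 6.15 uapi headers (for IORING_OP_READV_FIXED) and a liburing build exposing io_uring_prep_rw(); `regbuf` and `buf_index` are assumed to come from an earlier io_uring_register_buffers() call, with the buffer at least 12 KiB. As the rsrc.c hunks below show, every iovec must lie inside the selected registered buffer or the import fails with -EFAULT.

```c
/* Hedged illustration, not part of the merged patches: submit an
 * IORING_OP_READV_FIXED request that reads into two slices of one
 * registered buffer. Error handling is trimmed for brevity. */
#include <liburing.h>

static int readv_fixed_example(struct io_uring *ring, int fd,
			       char *regbuf, unsigned int buf_index)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int res;
	/* Both segments point inside the buffer registered as buf_index;
	 * out-of-range segments are rejected with -EFAULT at import time. */
	struct iovec iov[2] = {
		{ .iov_base = regbuf,        .iov_len = 4096 },
		{ .iov_base = regbuf + 8192, .iov_len = 4096 },
	};

	io_uring_prep_rw(IORING_OP_READV_FIXED, sqe, fd, iov, 2, 0);
	sqe->buf_index = buf_index;	/* select the registered buffer */

	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	res = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return res;
}
```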
* tag 'for-6.15/io_uring-reg-vec-20250327' of git://git.kernel.dk/linux: (25 commits)
io_uring/net: fix io_req_post_cqe abuse by send bundle
io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc
io_uring: move min_events sanitisation
io_uring: rename "min" arg in io_iopoll_check()
io_uring: open code __io_post_aux_cqe()
io_uring: defer iowq cqe overflow via task_work
io_uring: fix retry handling off iowq
io_uring/net: only import send_zc buffer once
io_uring/cmd: introduce io_uring_cmd_import_fixed_vec
io_uring/cmd: add iovec cache for commands
io_uring/cmd: don't expose entire cmd async data
io_uring: rename the data cmd cache
io_uring: rely on io_prep_reg_vec for iovec placement
io_uring: introduce io_prep_reg_iovec()
io_uring: unify STOP_MULTISHOT with IOU_OK
io_uring: return -EAGAIN to continue multishot
io_uring: cap cached iovec/bvec size
io_uring/net: implement vectored reg bufs for zctx
io_uring/net: convert to struct iou_vec
io_uring/net: pull vec alloc out of msghdr import
...
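The uring_cmd side of the series exports io_uring_cmd_import_fixed_vec() (see the uring_cmd.c hunk below). As orientation only, here is a hedged sketch of how a driver's ->uring_cmd() handler might use it; the `my_cmd` payload layout, the field names, and the ITER_DEST direction are assumptions for illustration, not taken from any in-tree driver.

```c
/* Hedged sketch only: a hypothetical ->uring_cmd() handler importing a
 * user iovec array against the request's registered buffer. */
#include <linux/kernel.h>
#include <linux/uio.h>
#include <linux/io_uring/cmd.h>

struct my_cmd {			/* hypothetical command payload in the SQE */
	__u64 uvec_addr;	/* user pointer to a struct iovec array */
	__u32 nr_segs;
	__u32 pad;
};

static int my_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
	const struct my_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct iov_iter iter;
	int ret;

	/* Copies the iovecs into the per-request cache and resolves them
	 * into a bvec-backed iterator over the registered buffer. */
	ret = io_uring_cmd_import_fixed_vec(ioucmd,
					    u64_to_user_ptr(cmd->uvec_addr),
					    cmd->nr_segs, ITER_DEST, &iter,
					    issue_flags);
	if (ret)
		return ret;

	/* ... hand &iter to the device I/O path ... */
	return 0;
}
```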
Diffstat (limited to 'io_uring')
 io_uring/alloc_cache.h |   9
 io_uring/io_uring.c    |  65
 io_uring/io_uring.h    |  19
 io_uring/net.c         | 235
 io_uring/net.h         |   6
 io_uring/opdef.c       |  42
 io_uring/poll.c        |   5
 io_uring/rsrc.c        | 163
 io_uring/rsrc.h        |  24
 io_uring/rw.c          |  94
 io_uring/rw.h          |   6
 io_uring/uring_cmd.c   |  59
 io_uring/uring_cmd.h   |  17
 13 files changed, 534 insertions(+), 210 deletions(-)
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 7f68eff2e7f3..d33ce159ef33 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache, void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); -static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) -{ - if (IS_ENABLED(CONFIG_KASAN)) { - kfree(*iov); - *iov = NULL; - *nr = 0; - } -} - static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f743581cc81b..3ba49c628337 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); @@ -334,8 +334,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); - ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_uring_cmd_data), 0); + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); @@ -833,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - - return filled; -} - -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - bool filled; - - io_cq_lock(ctx); - filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } @@ -891,6 +882,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool completed = true; /* * All execution paths but io-wq use the deferred completions by @@ -903,19 +895,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->lockless_cq) { + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!io_fill_cqe_req(ctx, req)) - io_req_cqe_overflow(req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); + if (!completed) + goto defer_complete; + /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. 
@@ -1511,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) @@ -1557,7 +1553,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min); + (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1570,7 +1566,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); + ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; @@ -1580,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; nr_events += ret; - } while (nr_events < min); + } while (nr_events < min_events); return 0; } @@ -1791,10 +1787,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); - WARN_ON_ONCE(ret == IOU_OK); - - if (ret == IOU_ISSUE_SKIP_COMPLETE) - ret = 0; + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); return ret; } @@ -1847,7 +1840,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1858,7 +1851,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } @@ -2549,6 +2542,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, ktime_t start_time; int ret; + min_events = min_t(int, min_events, ctx->cq_entries); + if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) @@ -3435,22 +3430,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); - } } if (!ret) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b95dab77e32d..87f883130286 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -19,22 +19,25 @@ #endif enum { - IOU_OK = 0, + IOU_OK = 0, /* deprecated, use IOU_COMPLETE */ + IOU_COMPLETE = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* + * The request has more work to do and should be retried. io_uring will + * attempt to wait on the file for eligible opcodes, but otherwise + * it'll be handed to iowq for blocking execution. It works for normal + * requests as well as for the multi shot mode. + */ + IOU_RETRY = -EAGAIN, + + /* * Requeue the task_work to restart operations on this request. 
The * actual value isn't important, should just be not an otherwise * valid error code, yet less than -MAX_ERRNO and valid internally. */ IOU_REQUEUE = -3072, - - /* - * Intended only when both IO_URING_F_MULTISHOT is passed - * to indicate to the poll runner that multishot should be - * removed and the result is set on req->cqe.res. - */ - IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_wait_queue { diff --git a/io_uring/net.c b/io_uring/net.c index 89cd45bacd7c..8944eb679024 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags) static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) { - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov_nr = 0; - kmsg->free_iov = NULL; - } + if (kmsg->vec.iovec) + io_vec_free(&kmsg->vec); } static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) @@ -154,7 +151,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } /* Let normal cleanup path reap it if we fail adding to the cache */ - io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); + io_alloc_cache_vec_kasan(&hdr->vec); + if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&hdr->vec); + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { req->async_data = NULL; req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); @@ -171,7 +171,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) return NULL; /* If the async data was cached, we might have an iov cached inside. */ - if (hdr->free_iov) + if (hdr->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; return hdr; } @@ -182,10 +182,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg { if (iov) { req->flags |= REQ_F_NEED_CLEANUP; - kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; - if (kmsg->free_iov) - kfree(kmsg->free_iov); - kmsg->free_iov = iov; + io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs); } } @@ -208,9 +205,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; + if (iomsg->vec.iovec) { + nr_segs = iomsg->vec.nr; + iov = iomsg->vec.iovec; } else { nr_segs = 1; iov = &iomsg->fast_iov; @@ -253,12 +250,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return -EFAULT; sr->len = tmp_iov.iov_len; } - - return 0; } - - return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, - msg->msg_iovlen, ddir); + return 0; } static int io_copy_msghdr_from_user(struct user_msghdr *msg, @@ -287,6 +280,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr __user *umsg = sr->umsg; int ret; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; + + if (io_is_compat(req->ctx)) { + struct compat_msghdr cmsg; + + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr); + if (ret) + return ret; + + memset(msg, 0, sizeof(*msg)); + msg->msg_namelen = cmsg.msg_namelen; + msg->msg_controllen = cmsg.msg_controllen; + msg->msg_iov = compat_ptr(cmsg.msg_iov); + msg->msg_iovlen = cmsg.msg_iovlen; + return 0; + } + ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) return ret; @@ -310,10 +321,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, return -EFAULT; sr->len = tmp_iov.iov_len; } - return 0; } - - return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); 
+ return 0; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -323,19 +332,13 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; - - if (io_is_compat(req->ctx)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, - NULL); - sr->msg_control = iomsg->msg.msg_control_user; + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) return ret; - } - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); + if (!(req->flags & REQ_F_BUFFER_SELECT)) + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_SOURCE); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; @@ -395,6 +398,27 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_sendmsg_copy_hdr(req, kmsg); } +static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; + struct user_msghdr msg; + int ret; + + if (!(sr->flags & IORING_RECVSEND_FIXED_BUF)) + return io_sendmsg_setup(req, sqe); + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + sr->msg_control = kmsg->msg.msg_control_user; + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen); +} + #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -424,6 +448,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_WAITALL; sr->buf_group = req->buf_index; req->buf_list = NULL; + req->flags |= REQ_F_MULTISHOT; } if (io_is_compat(req->ctx)) @@ -461,7 +486,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) if (iter_is_ubuf(&kmsg->msg.msg_iter)) return 1; - iov = kmsg->free_iov; + iov = kmsg->vec.iovec; if (!iov) iov = &kmsg->fast_iov; @@ -577,9 +602,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .nr_iovs = 1, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode = KBUF_MODE_FREE; } @@ -592,9 +617,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, if (unlikely(ret < 0)) return ret; - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } sr->len = arg.out_len; @@ -709,23 +734,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; - - if (io_is_compat(req->ctx)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, - &iomsg->uaddr); - memset(&msg, 0, sizeof(msg)); - msg.msg_namelen = cmsg.msg_namelen; - msg.msg_controllen = cmsg.msg_controllen; - } else { - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); - } - + ret = io_msg_copy_hdr(req, 
iomsg, &msg, ITER_DEST, &iomsg->uaddr); if (unlikely(ret)) return ret; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) { + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_DEST); + if (unlikely(ret)) + return ret; + } return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } @@ -863,8 +881,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - + *ret = IOU_RETRY; io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { @@ -872,23 +889,16 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, return false; /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; - mshot_retry_ret = IOU_REQUEUE; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_REQUEUE; } - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = mshot_retry_ret; - else - *ret = -EAGAIN; return true; } /* Finish the request / stop multishot. */ finish: io_req_set_res(req, *ret, cflags); - - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_STOP_MULTISHOT; - else - *ret = IOU_OK; + *ret = IOU_COMPLETE; io_req_msg_cleanup(req, issue_flags); return true; } @@ -1035,16 +1045,15 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return IOU_RETRY; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1085,9 +1094,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .mode = KBUF_MODE_EXPAND, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode |= KBUF_MODE_FREE; } @@ -1106,9 +1115,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } } else { @@ -1172,12 +1181,10 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; @@ -1260,9 +1267,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) if (len && zc->len == 0) { io_req_set_res(req, 0, 0); - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } if (unlikely(ret <= 0) && ret != -EAGAIN) { if (ret == -ERESTARTSYS) @@ -1272,15 +1277,9 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - - if (issue_flags & 
IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } - - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } void io_send_zc_cleanup(struct io_kiocb *req) @@ -1339,8 +1338,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode != IORING_OP_SEND_ZC) { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; - if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) - return -EINVAL; } zc->len = READ_ONCE(sqe->len); @@ -1354,9 +1351,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; - if (req->opcode != IORING_OP_SENDMSG_ZC) + if (req->opcode == IORING_OP_SEND_ZC) { + req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); - return io_sendmsg_setup(req, sqe); + } + return io_sendmsg_zc_setup(req, sqe); } static int io_sg_from_iter_iovec(struct sk_buff *skb, @@ -1454,7 +1453,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) (zc->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!zc->done_io) { + if (req->flags & REQ_F_IMPORT_BUFFER) { + req->flags &= ~REQ_F_IMPORT_BUFFER; ret = io_send_zc_import(req, issue_flags); if (unlikely(ret)) return ret; @@ -1513,6 +1513,20 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + + if (req->flags & REQ_F_IMPORT_BUFFER) { + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + int ret; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req, + &kmsg->vec, uvec_segs, issue_flags); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; + req->flags &= ~REQ_F_IMPORT_BUFFER; + } + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -1531,7 +1545,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { @@ -1646,16 +1659,9 @@ retry: put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock && - !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return ret; - } + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) + return IOU_RETRY; + if (ret == -ERESTARTSYS) ret = -EINTR; } else if (!fixed) { @@ -1674,17 +1680,13 @@ retry: io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } io_req_set_res(req, ret, cflags); if (ret < 0) req_set_fail(req); - if (!(issue_flags & IO_URING_F_MULTISHOT)) - return IOU_OK; - return IOU_STOP_MULTISHOT; + return IOU_COMPLETE; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1876,8 +1878,7 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - if (kmsg->free_iov) - io_netmsg_iovec_free(kmsg); + io_vec_free(&kmsg->vec); kfree(kmsg); } #endif diff --git a/io_uring/net.h 
b/io_uring/net.h index b804c2b36e60..43e5ce5416b7 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -2,12 +2,12 @@ #include <linux/net.h> #include <linux/uio.h> +#include <linux/io_uring_types.h> struct io_async_msghdr { #if defined(CONFIG_NET) - struct iovec *free_iov; - /* points to an allocated iov, if NULL we use fast_iov instead */ - int free_iov_nr; + struct iou_vec vec; + struct_group(clear, int namelen; struct iovec fast_iov; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db77df513d55..489384c0438b 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -416,7 +416,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_uring_cmd_data), + .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -540,6 +540,35 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_READV_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv_fixed, + .issue = io_read, + }, + [IORING_OP_WRITEV_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev_fixed, + .issue = io_write, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -726,6 +755,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", @@ -775,6 +805,16 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_EPOLL_WAIT] = { .name = "EPOLL_WAIT", }, + [IORING_OP_READV_FIXED] = { + .name = "READV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV_FIXED] = { + .name = "WRITEV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/poll.c b/io_uring/poll.c index 176854882ba6..8eb744eb9f4c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -289,11 +289,12 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) } } else { int ret = io_poll_issue(req, tw); - if (ret == IOU_STOP_MULTISHOT) + + if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; - if (ret < 0) + if (ret != IOU_RETRY && ret < 0) return ret; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index a59563fbb4ad..3f195e24777e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1262,3 +1262,166 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) fput(file); return ret; } + +void io_vec_free(struct iou_vec *iv) +{ + if (!iv->iovec) + return; + kfree(iv->iovec); + iv->iovec = NULL; + iv->nr = 0; +} + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << 
imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + u64 folio_addr = imu->ubuf & ~folio_mask; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + u64 buf_end; + + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) + return -EFAULT; + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) + return -EFAULT; + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + /* by using folio address it also accounts for bvec offset */ + offset = buf_addr - folio_addr; + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + unsigned iovec_off; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (imu->is_kbuf) + return -EOPNOTSUPP; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iovec_off = vec->nr - nr_iovs; + iov = vec->iovec + iovec_off; + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (sizeof(struct bio_vec) > sizeof(struct iovec)) { + size_t bvec_bytes; + + bvec_bytes = nr_segs * sizeof(struct bio_vec); + nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); + nr_segs += nr_iovs; + } + + if (nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} + +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs) +{ + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > iv->nr) { + ret = io_vec_realloc(iv, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + + /* pad iovec to the right */ + iovec_off = iv->nr - uvec_segs; + iov = iv->iovec + iovec_off; + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 284e300e63fb..b52242852ff3 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -5,6 +5,8 @@ #include 
<linux/io_uring_types.h> #include <linux/lockdep.h> +#define IO_VEC_CACHE_SOFT_CAP 256 + enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, @@ -61,6 +63,11 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags); +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -145,4 +152,21 @@ static inline void __io_unaccount_mem(struct user_struct *user, atomic_long_sub(nr_pages, &user->locked_vm); } +void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); + +static inline void io_vec_reset_iovec(struct iou_vec *iv, + struct iovec *iovec, unsigned nr) +{ + io_vec_free(iv); + iv->iovec = iovec; + iv->nr = nr; +} + +static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) +{ + if (IS_ENABLED(CONFIG_KASAN)) + io_vec_free(iv); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 7c2f5f70a2c5..039e063f7091 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req, int ret, nr_segs; struct iovec *iov; - if (io->free_iovec) { - nr_segs = io->free_iov_nr; - iov = io->free_iovec; + if (io->vec.iovec) { + nr_segs = io->vec.nr; + iov = io->vec.iovec; } else { nr_segs = 1; iov = &io->fast_iov; @@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req, return ret; if (iov) { req->flags |= REQ_F_NEED_CLEANUP; - io->free_iov_nr = io->iter.nr_segs; - kfree(io->free_iovec); - io->free_iovec = iov; + io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs); } return 0; } @@ -151,7 +149,10 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; - io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); + io_alloc_cache_vec_kasan(&rw->vec); + if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&rw->vec); + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -201,7 +202,7 @@ static int io_rw_alloc_async(struct io_kiocb *req) rw = io_uring_alloc_async_data(&ctx->rw_cache, req); if (!rw) return -ENOMEM; - if (rw->free_iovec) + if (rw->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; return 0; @@ -383,6 +384,53 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_prep_rw(req, sqe, ITER_SOURCE); } +static int io_rw_import_reg_vec(struct io_kiocb *req, + struct io_async_rw *io, + int ddir, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + unsigned uvec_segs = rw->len; + int ret; + + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, + uvec_segs, issue_flags); + if (unlikely(ret)) + return ret; + iov_iter_save_state(&io->iter, &io->iter_state); + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; +} + +static int io_rw_prep_reg_vec(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io = req->async_data; + const struct iovec __user *uvec; + + uvec = u64_to_user_ptr(rw->addr); + return io_prep_reg_iovec(req, &io->vec, uvec, rw->len); +} + +int 
io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_DEST); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); +} + +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_SOURCE); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); +} + /* * Multishot read is prepared just like a normal read/write request, only * difference is that we set the MULTISHOT flag. @@ -856,7 +904,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret; loff_t *ppos; - if (io_do_buffer_select(req)) { + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); + if (unlikely(ret)) + return ret; + } else if (io_do_buffer_select(req)) { ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; @@ -995,9 +1047,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } else if (ret <= 0) { io_kbuf_recycle(req, issue_flags); if (ret < 0) @@ -1015,16 +1065,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) /* * Force retry, as we might have more data to * be read and otherwise it won't get retried * until (if ever) another poll is triggered. */ io_poll_multishot_retry(req); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } } @@ -1034,9 +1083,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ io_req_set_res(req, ret, cflags); io_req_rw_cleanup(req, issue_flags); - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) @@ -1067,6 +1114,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); + if (unlikely(ret)) + return ret; + } + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; @@ -1326,7 +1379,6 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; - if (rw->free_iovec) - kfree(rw->free_iovec); + io_vec_free(&rw->vec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index bf121b81ebe8..81d6d9a8cf69 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,13 +9,13 @@ struct io_meta_state { }; struct io_async_rw { + struct iou_vec vec; size_t bytes_done; - struct iovec *free_iovec; + struct_group(clear, struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; - int free_iov_nr; /* * wpq is for buffered io, while meta fields are used with * direct io @@ -32,6 +32,8 @@ struct io_async_rw { int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int 
io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index de39b602aa82..f2cfc371f3d0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,10 +16,19 @@ #include "rsrc.h" #include "uring_cmd.h" +void io_cmd_cache_free(const void *entry) +{ + struct io_async_cmd *ac = (struct io_async_cmd *)entry; + + io_vec_free(&ac->vec); + kfree(ac); +} + static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache = req->async_data; + struct io_async_cmd *ac = req->async_data; + struct io_uring_cmd_data *cache = &ac->data; if (cache->op_data) { kfree(cache->op_data); @@ -28,13 +37,23 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + + io_alloc_cache_vec_kasan(&ac->vec); + if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&ac->vec); + + if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } +void io_uring_cmd_cleanup(struct io_kiocb *req) +{ + io_req_uring_cleanup(req, 0); +} + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { @@ -169,12 +188,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache; + struct io_async_cmd *ac; + + /* see io_uring_cmd_get_async_data() */ + BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); - cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req); - if (!cache) + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); + if (!ac) return -ENOMEM; - cache->op_data = NULL; + ac->data.op_data = NULL; /* * Unconditionally cache the SQE for now - this is only needed for @@ -183,8 +205,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, * that it doesn't read in per-op data, play it safe and ensure that * any SQE data is stable beyond prep. This can later get relaxed. 
*/ - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; + memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->data.sqes; return 0; } @@ -255,6 +277,25 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + struct io_async_cmd *ac = req->async_data; + int ret; + + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); + if (ret) + return ret; + + return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs, + issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec); + void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index f6837ee0955b..14e525255854 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,7 +1,24 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring/cmd.h> +#include <linux/io_uring_types.h> + +struct io_async_cmd { + struct io_uring_cmd_data data; + struct iou_vec vec; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); + +void io_cmd_cache_free(const void *entry); + +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags); |
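A condensed, hedged illustration of the return-code unification visible in the io_uring.h, net.c and rw.c hunks above: issue handlers no longer branch on IO_URING_F_MULTISHOT to pick between -EAGAIN / IOU_ISSUE_SKIP_COMPLETE or IOU_OK / IOU_STOP_MULTISHOT; they return IOU_RETRY to keep going and IOU_COMPLETE to finish. The opcode and helper below are made up for the example.

```c
/* Hedged illustration, not an in-tree opcode: the post-series shape of
 * an issue handler using the unified return codes. */
static int io_example_issue(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_example_do_io(req, issue_flags);	/* hypothetical helper */

	if (ret == -EAGAIN)
		return IOU_RETRY;	/* core re-arms poll or punts to io-wq */

	io_req_set_res(req, ret, 0);
	return IOU_COMPLETE;		/* single shot done, multishot stopped */
}
```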