From e1d499590977a492ae120d9263bd55076aabd460 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:29 +0000 Subject: io_uring: introduce struct iou_vec I need a convenient way to pass around and work with an iovec+size pair, so put them into a structure and make use of it in rw.c Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d39fadafc9e9047b0a292e5be6db3cf2f48bb1f7.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++++++++ io_uring/rsrc.h | 16 ++++++++++++++++ io_uring/rw.c | 17 +++++++---------- io_uring/rw.h | 4 ++-- 4 files changed, 34 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 28783f1dde00..bac509f85c80 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1260,3 +1260,12 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) fput(file); return ret; } + +void io_vec_free(struct iou_vec *iv) +{ + if (!iv->iovec) + return; + kfree(iv->iovec); + iv->iovec = NULL; + iv->nr = 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 284e300e63fb..f35e1a07619a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -145,4 +145,20 @@ static inline void __io_unaccount_mem(struct user_struct *user, atomic_long_sub(nr_pages, &user->locked_vm); } +void io_vec_free(struct iou_vec *iv); + +static inline void io_vec_reset_iovec(struct iou_vec *iv, + struct iovec *iovec, unsigned nr) +{ + io_vec_free(iv); + iv->iovec = iovec; + iv->nr = nr; +} + +static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) +{ + if (IS_ENABLED(CONFIG_KASAN)) + io_vec_free(iv); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 5ee9f8949e8b..ad7f647d48e9 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -87,9 +87,9 @@ static int io_import_vec(int ddir, struct io_kiocb *req, int ret, nr_segs; struct iovec *iov; - if (io->free_iovec) { - nr_segs = io->free_iov_nr; - iov = io->free_iovec; + if (io->vec.iovec) { + nr_segs = io->vec.nr; + iov = io->vec.iovec; } else { nr_segs = 1; iov = &io->fast_iov; @@ -101,9 +101,7 @@ static int io_import_vec(int ddir, struct io_kiocb *req, return ret; if (iov) { req->flags |= REQ_F_NEED_CLEANUP; - io->free_iov_nr = io->iter.nr_segs; - kfree(io->free_iovec); - io->free_iovec = iov; + io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs); } return 0; } @@ -151,7 +149,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; - io_alloc_cache_kasan(&rw->free_iovec, &rw->free_iov_nr); + io_alloc_cache_vec_kasan(&rw->vec); if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -201,7 +199,7 @@ static int io_rw_alloc_async(struct io_kiocb *req) rw = io_uring_alloc_async_data(&ctx->rw_cache, req); if (!rw) return -ENOMEM; - if (rw->free_iovec) + if (rw->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; return 0; @@ -1327,7 +1325,6 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; - if (rw->free_iovec) - kfree(rw->free_iovec); + io_vec_free(&rw->vec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index bf121b81ebe8..529fd2f96a7f 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,13 +9,13 @@ struct io_meta_state { }; struct io_async_rw { + struct iou_vec vec; size_t bytes_done; - struct iovec *free_iovec; + struct_group(clear, struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; - int free_iov_nr;
/* * wpq is for buffered io, while meta fields are used with * direct io -- cgit From 9ef4cbbcb4ac3786a1a4164507511b76b2a572c5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:30 +0000 Subject: io_uring: add infra for importing vectored reg buffers Add io_import_reg_vec(), which will be responsible for importing vectored registered buffers. The function might reallocate the vector, but it'd try to do the conversion in place first, which is why it's required of the user to pad the iovec to the right border of the cache. Overlapping also depends on struct iovec being larger than bvec, which is not the case on e.g. 32 bit architectures. Don't try to complicate this case and make sure vectors never overlap, it'll be improved later. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/60bd246b1249476a6996407c1dbc38ef6febad14.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 5 +++ 2 files changed, 133 insertions(+) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index bac509f85c80..71fe47facd4c 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1269,3 +1269,131 @@ void io_vec_free(struct iou_vec *iv) iv->iovec = NULL; iv->nr = 0; } + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + u64 folio_addr = imu->ubuf & ~folio_mask; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + u64 buf_end; + + if (unlikely(check_add_overflow(buf_addr, (u64)iov_len, &buf_end))) + return -EFAULT; + if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) + return -EFAULT; + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + /* by using folio address it also accounts for bvec offset */ + offset = buf_addr - folio_addr; + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned iovec_off, + unsigned issue_flags) +{ + 
struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (imu->is_kbuf) + return -EOPNOTSUPP; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iov = vec->iovec + iovec_off; + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + + if (sizeof(struct bio_vec) > sizeof(struct iovec)) { + size_t bvec_bytes; + + bvec_bytes = nr_segs * sizeof(struct bio_vec); + nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); + nr_segs += nr_iovs; + } + + if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) || + nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f35e1a07619a..0d5c18296130 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -61,6 +61,10 @@ struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, u64 buf_addr, size_t len, int ddir, unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned iovec_off, + unsigned issue_flags); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -146,6 +150,7 @@ static inline void __io_unaccount_mem(struct user_struct *user, } void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); static inline void io_vec_reset_iovec(struct iou_vec *iv, struct iovec *iovec, unsigned nr) -- cgit From bdabba04bb1023e0327998b1eb0be096079bde65 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:31 +0000 Subject: io_uring/rw: implement vectored registered rw Implement registered buffer vectored reads with new opcodes IORING_OP_WRITEV_FIXED and IORING_OP_READV_FIXED. 
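As a minimal userspace sketch (not taken from the patch itself): judging from io_prep_readv_fixed() and io_rw_prep_reg_vec() below, sqe->addr is expected to point at the iovec array, sqe->len carries the number of iovecs, and sqe->buf_index selects the registered buffer. The helper name and exact field usage are assumptions for illustration; the matching uapi opcode definitions come from the parts of the series outside io_uring/ and are not shown in these diffs.

#include <string.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* Illustrative only: fill an SQE for a vectored fixed-buffer read. */
static void prep_readv_fixed(struct io_uring_sqe *sqe, int fd,
			     const struct iovec *iovs, unsigned nr_iovs,
			     unsigned buf_index, __u64 file_offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV_FIXED;
	sqe->fd = fd;
	/* each iovec must fall inside the registered buffer chosen below */
	sqe->addr = (unsigned long)iovs;
	sqe->len = nr_iovs;
	sqe->off = file_offset;
	sqe->buf_index = buf_index;
}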
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d7c89eb481e870f598edc91cc66ff4d1e4ae3788.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/opdef.c | 39 +++++++++++++++++++++++++++++++++++++++ io_uring/rw.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/rw.h | 2 ++ 3 files changed, 92 insertions(+) (limited to 'io_uring') diff --git a/io_uring/opdef.c b/io_uring/opdef.c index db77df513d55..7fd173197b1e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -540,6 +540,35 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_READV_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv_fixed, + .issue = io_read, + }, + [IORING_OP_WRITEV_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev_fixed, + .issue = io_write, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -775,6 +804,16 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_EPOLL_WAIT] = { .name = "EPOLL_WAIT", }, + [IORING_OP_READV_FIXED] = { + .name = "READV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV_FIXED] = { + .name = "WRITEV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/rw.c b/io_uring/rw.c index ad7f647d48e9..4c4229f41aaa 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -381,6 +381,57 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_prep_rw(req, sqe, ITER_SOURCE); } +static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io = req->async_data; + const struct iovec __user *uvec; + size_t uvec_segs = rw->len; + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > io->vec.nr) { + ret = io_vec_realloc(&io->vec, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + /* pad iovec to the right */ + iovec_off = io->vec.nr - uvec_segs; + iov = io->vec.iovec + iovec_off; + uvec = u64_to_user_ptr(rw->addr); + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, + uvec_segs, iovec_off, 0); + iov_iter_save_state(&io->iter, &io->iter_state); + return ret; +} + +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_DEST); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req, ITER_DEST); +} + +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_SOURCE); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req, ITER_SOURCE); +} + /* * Multishot read is prepared just like a normal read/write request, only * difference is that we set the MULTISHOT flag. 
diff --git a/io_uring/rw.h b/io_uring/rw.h index 529fd2f96a7f..81d6d9a8cf69 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -32,6 +32,8 @@ struct io_async_rw { int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- cgit From 835c4bdf95d5c71fd5b41f77f2343b695b4494aa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:32 +0000 Subject: io_uring/rw: defer reg buf vec import Import registered buffers for vectored reads and writes later at issue time as we now do for other fixed ops. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e8491c976e4ab83a4e3dc428e9fe7555e59583b8.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c4229f41aaa..e62f4ce34171 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -381,7 +381,25 @@ int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_prep_rw(req, sqe, ITER_SOURCE); } -static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir) +static int io_rw_import_reg_vec(struct io_kiocb *req, + struct io_async_rw *io, + int ddir, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + unsigned uvec_segs = rw->len; + unsigned iovec_off = io->vec.nr - uvec_segs; + int ret; + + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, + uvec_segs, iovec_off, issue_flags); + if (unlikely(ret)) + return ret; + iov_iter_save_state(&io->iter, &io->iter_state); + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; +} + +static int io_rw_prep_reg_vec(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; @@ -406,10 +424,8 @@ static int io_rw_prep_reg_vec(struct io_kiocb *req, int ddir) if (IS_ERR(res)) return PTR_ERR(res); - ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, - uvec_segs, iovec_off, 0); - iov_iter_save_state(&io->iter, &io->iter_state); - return ret; + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; } int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -419,7 +435,7 @@ int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; - return io_rw_prep_reg_vec(req, ITER_DEST); + return io_rw_prep_reg_vec(req); } int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -429,7 +445,7 @@ int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) ret = __io_prep_rw(req, sqe, ITER_SOURCE); if (unlikely(ret)) return ret; - return io_rw_prep_reg_vec(req, ITER_SOURCE); + return io_rw_prep_reg_vec(req); } /* @@ -906,7 +922,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret; loff_t *ppos; - if (io_do_buffer_select(req)) { + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); + if 
(unlikely(ret)) + return ret; + } else if (io_do_buffer_select(req)) { ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; @@ -1117,6 +1137,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); + if (unlikely(ret)) + return ret; + } + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; -- cgit From 17523a821d2276d8d1031467a3fb87e9c7321384 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:33 +0000 Subject: io_uring/net: combine msghdr copy Call the compat version from inside of io_msg_copy_hdr() and don't duplicate it in callers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/25795660f7b31f9273911c99f495d9c2b169ecda.1741362889.git.asml.silence@gmail.com [axboe: fixup msg pointer vs variable braino in io_msg_copy_hdr()] Signed-off-by: Jens Axboe --- io_uring/net.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 905d1ee01201..f64400aebd54 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -287,6 +287,24 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr __user *umsg = sr->umsg; int ret; + iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; + + if (io_is_compat(req->ctx)) { + struct compat_msghdr cmsg; + + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr); + if (ret) + return ret; + + memset(msg, 0, sizeof(*msg)); + msg->msg_namelen = cmsg.msg_namelen; + msg->msg_controllen = cmsg.msg_controllen; + msg->msg_iov = compat_ptr(cmsg.msg_iov); + msg->msg_iovlen = cmsg.msg_iovlen; + return 0; + } + ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) return ret; @@ -323,18 +341,6 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; - - if (io_is_compat(req->ctx)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, - NULL); - sr->msg_control = iomsg->msg.msg_control_user; - return ret; - } - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; @@ -710,21 +716,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - iomsg->msg.msg_iter.nr_segs = 0; - - if (io_is_compat(req->ctx)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, - &iomsg->uaddr); - memset(&msg, 0, sizeof(msg)); - msg.msg_namelen = cmsg.msg_namelen; - msg.msg_controllen = cmsg.msg_controllen; - } else { - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); - } - + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); if (unlikely(ret)) return ret; return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, -- cgit From 9fcb349f5ad1355332b7ca711251bc01639bd852 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:34 +0000 Subject: io_uring/net: pull vec alloc out of msghdr import I'll need more control over iovec management, move io_net_import_vec() out of io_msg_copy_hdr(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9600ea6300f620e65d39da481c22605ddc898850.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index f64400aebd54..0c46e6315289 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -253,12 +253,8 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return -EFAULT; sr->len = tmp_iov.iov_len; } - - return 0; } - - return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, - msg->msg_iovlen, ddir); + return 0; } static int io_copy_msghdr_from_user(struct user_msghdr *msg, @@ -328,10 +324,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, return -EFAULT; sr->len = tmp_iov.iov_len; } - return 0; } - - return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); + return 0; } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -342,6 +336,12 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, int ret; ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_SOURCE); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; @@ -719,6 +719,13 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); if (unlikely(ret)) return ret; + + if (!(req->flags & REQ_F_BUFFER_SELECT)) { + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_DEST); + if (unlikely(ret)) + return ret; + } return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } -- cgit From be7052a4b5a85367656352c614cd4449779ff36f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:35 +0000 Subject: io_uring/net: convert to struct iou_vec Convert net.c to use struct iou_vec. 
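For reference, struct iou_vec itself was added earlier in the series outside the io_uring/ directory (these diffs are limited to io_uring/), so its definition never appears here; judging from how it is used (iv->iovec, vec->bvec and iv->nr), it is assumed to look roughly like:

/* Assumed shape of struct iou_vec, reconstructed from its users in this
 * series; the real definition lives outside the io_uring/ directory. */
struct iou_vec {
	union {
		struct iovec	*iovec;
		struct bio_vec	*bvec;
	};
	unsigned		nr;	/* entries the current allocation can hold */
};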
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6437b57dabed44eca708c02e390529c7ed211c78.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 9 --------- io_uring/net.c | 51 ++++++++++++++++++++++---------------------------- io_uring/net.h | 6 +++--- 3 files changed, 25 insertions(+), 41 deletions(-) (limited to 'io_uring') diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 7f68eff2e7f3..d33ce159ef33 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,15 +16,6 @@ bool io_alloc_cache_init(struct io_alloc_cache *cache, void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); -static inline void io_alloc_cache_kasan(struct iovec **iov, int *nr) -{ - if (IS_ENABLED(CONFIG_KASAN)) { - kfree(*iov); - *iov = NULL; - *nr = 0; - } -} - static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { diff --git a/io_uring/net.c b/io_uring/net.c index 0c46e6315289..4825111185c3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -136,11 +136,8 @@ static bool io_net_retry(struct socket *sock, int flags) static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) { - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov_nr = 0; - kmsg->free_iov = NULL; - } + if (kmsg->vec.iovec) + io_vec_free(&kmsg->vec); } static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) @@ -154,7 +151,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } /* Let normal cleanup path reap it if we fail adding to the cache */ - io_alloc_cache_kasan(&hdr->free_iov, &hdr->free_iov_nr); + io_alloc_cache_vec_kasan(&hdr->vec); if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -171,7 +168,7 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) return NULL; /* If the async data was cached, we might have an iov cached inside. 
*/ - if (hdr->free_iov) + if (hdr->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; return hdr; } @@ -182,10 +179,7 @@ static void io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg { if (iov) { req->flags |= REQ_F_NEED_CLEANUP; - kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; - if (kmsg->free_iov) - kfree(kmsg->free_iov); - kmsg->free_iov = iov; + io_vec_reset_iovec(&kmsg->vec, iov, kmsg->msg.msg_iter.nr_segs); } } @@ -208,9 +202,9 @@ static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; + if (iomsg->vec.iovec) { + nr_segs = iomsg->vec.nr; + iov = iomsg->vec.iovec; } else { nr_segs = 1; iov = &iomsg->fast_iov; @@ -468,7 +462,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) if (iter_is_ubuf(&kmsg->msg.msg_iter)) return 1; - iov = kmsg->free_iov; + iov = kmsg->vec.iovec; if (!iov) iov = &kmsg->fast_iov; @@ -584,9 +578,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .nr_iovs = 1, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode = KBUF_MODE_FREE; } @@ -599,9 +593,9 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, if (unlikely(ret < 0)) return ret; - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } sr->len = arg.out_len; @@ -1085,9 +1079,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .mode = KBUF_MODE_EXPAND, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode |= KBUF_MODE_FREE; } @@ -1106,9 +1100,9 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; req->flags |= REQ_F_NEED_CLEANUP; } } else { @@ -1874,8 +1868,7 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - if (kmsg->free_iov) - io_netmsg_iovec_free(kmsg); + io_vec_free(&kmsg->vec); kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index b804c2b36e60..43e5ce5416b7 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -2,12 +2,12 @@ #include #include +#include struct io_async_msghdr { #if defined(CONFIG_NET) - struct iovec *free_iov; - /* points to an allocated iov, if NULL we use fast_iov instead */ - int free_iov_nr; + struct iou_vec vec; + struct_group(clear, int namelen; struct iovec fast_iov; -- cgit From 23371eac7d9a9bca5360cfb3eb3aa08648ee7246 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 16:00:36 +0000 Subject: io_uring/net: implement vectored reg bufs for zctx Add support for vectored registered buffers for send zc. 
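A rough userspace sketch (assumed field usage, not taken from the patch): with this change a zero-copy sendmsg can pass IORING_RECVSEND_FIXED_BUF together with an iovec array whose entries all point into the registered buffer selected by sqe->buf_index; the msghdr pointer goes in sqe->addr and the zc flags are assumed to be carried in sqe->ioprio, as with other send-zc requests. The helper name below is hypothetical.

#include <string.h>
#include <sys/socket.h>
#include <linux/io_uring.h>

/* Illustrative only: vectored zero-copy sendmsg over a registered buffer. */
static void prep_sendmsg_zc_fixed(struct io_uring_sqe *sqe, int sockfd,
				  struct msghdr *msg, unsigned buf_index)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDMSG_ZC;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long)msg;		/* msg->msg_iov holds the vector */
	sqe->ioprio = IORING_RECVSEND_FIXED_BUF; /* take the registered-buffer path */
	sqe->buf_index = buf_index;
}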
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e484052875f862d2dca99f0f8c04407c1d51a1c1.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 4825111185c3..40ecf421798f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -395,6 +395,44 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_sendmsg_copy_hdr(req, kmsg); } +static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; + struct user_msghdr msg; + int ret, iovec_off; + struct iovec *iov; + void *res; + + if (!(sr->flags & IORING_RECVSEND_FIXED_BUF)) + return io_sendmsg_setup(req, sqe); + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + sr->msg_control = kmsg->msg.msg_control_user; + + if (msg.msg_iovlen > kmsg->vec.nr || WARN_ON_ONCE(!kmsg->vec.iovec)) { + ret = io_vec_realloc(&kmsg->vec, msg.msg_iovlen); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + iovec_off = kmsg->vec.nr - msg.msg_iovlen; + iov = kmsg->vec.iovec + iovec_off; + + res = iovec_from_user(msg.msg_iov, msg.msg_iovlen, kmsg->vec.nr, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + req->flags |= REQ_F_IMPORT_BUFFER; + return ret; +} + #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1333,8 +1371,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->opcode != IORING_OP_SEND_ZC) { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; - if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) - return -EINVAL; } zc->len = READ_ONCE(sqe->len); @@ -1350,7 +1386,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG_ZC) return io_send_setup(req, sqe); - return io_sendmsg_setup(req, sqe); + return io_sendmsg_zc_setup(req, sqe); } static int io_sg_from_iter_iovec(struct sk_buff *skb, @@ -1506,6 +1542,22 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + + if (req->flags & REQ_F_IMPORT_BUFFER) { + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + unsigned iovec_off = kmsg->vec.nr - uvec_segs; + int ret; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req, + &kmsg->vec, uvec_segs, iovec_off, + issue_flags); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; + req->flags &= ~REQ_F_IMPORT_BUFFER; + } + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -1524,7 +1576,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { -- cgit From 0396ad3766ad4879b35c5401cee41bba64fe75d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 7 Mar 2025 
16:00:37 +0000 Subject: io_uring: cap cached iovec/bvec size Bvecs can be large, so put an arbitrary limit on the max vector size that can be cached. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/823055fa6628daa24bbc9cd77c2da87e9a1e1e32.1741362889.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 3 +++ io_uring/rsrc.h | 2 ++ io_uring/rw.c | 3 +++ 3 files changed, 8 insertions(+) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 40ecf421798f..34a28689ec99 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -152,6 +152,9 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) /* Let normal cleanup path reap it if we fail adding to the cache */ io_alloc_cache_vec_kasan(&hdr->vec); + if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&hdr->vec); + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; diff --git a/io_uring/rsrc.h index 0d5c18296130..b0097c06b577 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -5,6 +5,8 @@ #include #include +#define IO_VEC_CACHE_SOFT_CAP 256 + enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, diff --git a/io_uring/rw.c index e62f4ce34171..bf35599d1078 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -150,6 +150,9 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) return; io_alloc_cache_vec_kasan(&rw->vec); + if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&rw->vec); + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; -- cgit From 7a9dcb05f5501b07a2ef7d0ef743f4f17e9f3055 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Mar 2025 17:19:32 +0000 Subject: io_uring: return -EAGAIN to continue multishot Multishot errors can be mapped 1:1 to normal errors, but they are not identical. It leads to a peculiar situation where all multishot requests have to check in what context they're run and return different codes. Unify them, starting with the EAGAIN / IOU_ISSUE_SKIP_COMPLETE (EIOCBQUEUED) pair, which means that core io_uring still owns the request and it should be retried. In the multishot case it naturally just continues to poll; otherwise it might poll, use iowq or do any other kind of allowed blocking. Introduce IOU_RETRY aliased to -EAGAIN for that. Apart from obvious upsides, multishot can now also check for misuse of IOU_ISSUE_SKIP_COMPLETE.
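The practical effect on an opcode handler, sketched with made-up names (io_foo() and io_foo_try_once() are placeholders, not from the patch): instead of branching on IO_URING_F_MULTISHOT to pick between -EAGAIN and IOU_ISSUE_SKIP_COMPLETE, the handler simply returns IOU_RETRY and lets core io_uring decide whether to re-poll or punt to io-wq.

static int io_foo(struct io_kiocb *req, unsigned int issue_flags)
{
	int ret = io_foo_try_once(req, issue_flags);	/* placeholder helper */

	if (ret == -EAGAIN) {
		/*
		 * Core io_uring keeps ownership: multishot goes back to
		 * polling, other requests may poll or be punted to io-wq.
		 */
		return IOU_RETRY;
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}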
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/da117b79ce72ecc3ab488c744e29fae9ba54e23b.1741453534.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +--- io_uring/io_uring.h | 8 ++++++++ io_uring/net.c | 47 ++++++++++++++--------------------------------- io_uring/poll.c | 3 ++- io_uring/rw.c | 11 ++++------- 5 files changed, 29 insertions(+), 44 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fa342be39158..6499d8e4d3d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1791,10 +1791,8 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); WARN_ON_ONCE(ret == IOU_OK); - - if (ret == IOU_ISSUE_SKIP_COMPLETE) - ret = 0; return ret; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index daf0e3b740ee..3409740f6417 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -22,6 +22,14 @@ enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, + /* + * The request has more work to do and should be retried. io_uring will + * attempt to wait on the file for eligible opcodes, but otherwise + * it'll be handed to iowq for blocking execution. It works for normal + * requests as well as for the multi shot mode. + */ + IOU_RETRY = -EAGAIN, + /* * Requeue the task_work to restart operations on this request. The * actual value isn't important, should just be not an otherwise diff --git a/io_uring/net.c b/io_uring/net.c index 34a28689ec99..d9befb6fb8a7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -898,8 +898,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - + *ret = IOU_RETRY; io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { @@ -907,12 +906,9 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, return false; /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; - mshot_retry_ret = IOU_REQUEUE; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_REQUEUE; } - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = mshot_retry_ret; - else - *ret = -EAGAIN; return true; } @@ -1070,16 +1066,15 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return IOU_RETRY; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1207,12 +1202,10 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; @@ -1312,10 +1305,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) return IOU_STOP_MULTISHOT; return IOU_OK; } - - if (issue_flags & IO_URING_F_MULTISHOT) - return 
IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } void io_send_zc_cleanup(struct io_kiocb *req) @@ -1692,16 +1682,9 @@ retry: put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock && - !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return ret; - } + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) + return IOU_RETRY; + if (ret == -ERESTARTSYS) ret = -EINTR; } else if (!fixed) { @@ -1720,9 +1703,7 @@ retry: io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } io_req_set_res(req, ret, cflags); diff --git a/io_uring/poll.c b/io_uring/poll.c index 176854882ba6..52e3c3e923f4 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -289,11 +289,12 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) } } else { int ret = io_poll_issue(req, tw); + if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; - if (ret < 0) + if (ret != IOU_RETRY && ret < 0) return ret; } diff --git a/io_uring/rw.c b/io_uring/rw.c index bf35599d1078..9a9c636defad 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1068,9 +1068,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } else if (ret <= 0) { io_kbuf_recycle(req, issue_flags); if (ret < 0) @@ -1088,16 +1086,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) /* * Force retry, as we might have more data to * be read and otherwise it won't get retried * until (if ever) another poll is triggered. */ io_poll_multishot_retry(req); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } } -- cgit From 5027d02452c982bdc7b36205c66466ebd7e6ee17 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Mar 2025 17:19:33 +0000 Subject: io_uring: unify STOP_MULTISHOT with IOU_OK IOU_OK means that the request ownership is now handed back to core io_uring and it has to complete it using the result provided in req->cqe. Same is true for multishot and IOU_STOP_MULTISHOT. Rename it into IOU_COMPLETE to avoid confusion and use for both modes. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e6a5b2edb0eb9558acb1c8f1db38ac45fee95491.1741453534.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - io_uring/io_uring.h | 11 +++-------- io_uring/net.c | 19 ++++--------------- io_uring/poll.c | 2 +- io_uring/rw.c | 4 +--- 5 files changed, 9 insertions(+), 28 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6499d8e4d3d0..5ff30a7092ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1792,7 +1792,6 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); - WARN_ON_ONCE(ret == IOU_OK); return ret; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 3409740f6417..2308f39ed915 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -19,7 +19,9 @@ #endif enum { - IOU_OK = 0, + IOU_OK = 0, /* deprecated, use IOU_COMPLETE */ + IOU_COMPLETE = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* @@ -36,13 +38,6 @@ enum { * valid error code, yet less than -MAX_ERRNO and valid internally. */ IOU_REQUEUE = -3072, - - /* - * Intended only when both IO_URING_F_MULTISHOT is passed - * to indicate to the poll runner that multishot should be - * removed and the result is set on req->cqe.res. - */ - IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_wait_queue { diff --git a/io_uring/net.c b/io_uring/net.c index d9befb6fb8a7..9fa5c9570875 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -915,11 +915,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, /* Finish the request / stop multishot. */ finish: io_req_set_res(req, *ret, cflags); - - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_STOP_MULTISHOT; - else - *ret = IOU_OK; + *ret = IOU_COMPLETE; io_req_msg_cleanup(req, issue_flags); return true; } @@ -1288,9 +1284,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) if (len && zc->len == 0) { io_req_set_res(req, 0, 0); - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } if (unlikely(ret <= 0) && ret != -EAGAIN) { if (ret == -ERESTARTSYS) @@ -1300,10 +1294,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } return IOU_RETRY; } @@ -1709,9 +1700,7 @@ retry: io_req_set_res(req, ret, cflags); if (ret < 0) req_set_fail(req); - if (!(issue_flags & IO_URING_F_MULTISHOT)) - return IOU_OK; - return IOU_STOP_MULTISHOT; + return IOU_COMPLETE; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/poll.c b/io_uring/poll.c index 52e3c3e923f4..8eb744eb9f4c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -290,7 +290,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) } else { int ret = io_poll_issue(req, tw); - if (ret == IOU_STOP_MULTISHOT) + if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; diff --git a/io_uring/rw.c b/io_uring/rw.c index 9a9c636defad..50037313555f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1104,9 +1104,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ io_req_set_res(req, ret, cflags); io_req_rw_cleanup(req, issue_flags); - if (issue_flags & IO_URING_F_MULTISHOT) - return 
IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; } static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) -- cgit From d291fb65202051e996cd983b29dce3e390421bc6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Mar 2025 18:21:15 +0000 Subject: io_uring: introduce io_prep_reg_iovec() iovecs that are turned into registered buffers are imported in a special way with an offset, so that later we can do an in place translation. Add a helper function taking care of it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7de2ecb9ed5efc3c5cf320232236966da5ad4ccc.1741457480.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 23 +++-------------------- io_uring/rsrc.c | 26 ++++++++++++++++++++++++++ io_uring/rsrc.h | 2 ++ io_uring/rw.c | 21 +-------------------- 4 files changed, 32 insertions(+), 40 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 9fa5c9570875..6b8dbadf445f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -403,9 +403,7 @@ static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe * struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct user_msghdr msg; - int ret, iovec_off; - struct iovec *iov; - void *res; + int ret; if (!(sr->flags & IORING_RECVSEND_FIXED_BUF)) return io_sendmsg_setup(req, sqe); @@ -416,24 +414,9 @@ static int io_sendmsg_zc_setup(struct io_kiocb *req, const struct io_uring_sqe * if (unlikely(ret)) return ret; sr->msg_control = kmsg->msg.msg_control_user; - - if (msg.msg_iovlen > kmsg->vec.nr || WARN_ON_ONCE(!kmsg->vec.iovec)) { - ret = io_vec_realloc(&kmsg->vec, msg.msg_iovlen); - if (ret) - return ret; - req->flags |= REQ_F_NEED_CLEANUP; - } - iovec_off = kmsg->vec.nr - msg.msg_iovlen; - iov = kmsg->vec.iovec + iovec_off; - - res = iovec_from_user(msg.msg_iov, msg.msg_iovlen, kmsg->vec.nr, iov, - io_is_compat(req->ctx)); - if (IS_ERR(res)) - return PTR_ERR(res); - kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; - req->flags |= REQ_F_IMPORT_BUFFER; - return ret; + + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, msg.msg_iovlen); } #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 71fe47facd4c..0e413e910f3d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1397,3 +1397,29 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); } + +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs) +{ + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > iv->nr) { + ret = io_vec_realloc(iv, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + + /* pad iovec to the right */ + iovec_off = iv->nr - uvec_segs; + iov = iv->iovec + iovec_off; + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b0097c06b577..43f784915573 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -67,6 +67,8 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, unsigned nr_iovs, unsigned iovec_off, unsigned issue_flags); +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t 
uvec_segs); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index 50037313555f..4861b876f48e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -407,28 +407,9 @@ static int io_rw_prep_reg_vec(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_async_rw *io = req->async_data; const struct iovec __user *uvec; - size_t uvec_segs = rw->len; - struct iovec *iov; - int iovec_off, ret; - void *res; - if (uvec_segs > io->vec.nr) { - ret = io_vec_realloc(&io->vec, uvec_segs); - if (ret) - return ret; - req->flags |= REQ_F_NEED_CLEANUP; - } - /* pad iovec to the right */ - iovec_off = io->vec.nr - uvec_segs; - iov = io->vec.iovec + iovec_off; uvec = u64_to_user_ptr(rw->addr); - res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, - io_is_compat(req->ctx)); - if (IS_ERR(res)) - return PTR_ERR(res); - - req->flags |= REQ_F_IMPORT_BUFFER; - return 0; + return io_prep_reg_iovec(req, &io->vec, uvec, rw->len); } int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) -- cgit From 146acfd0f6494579996ae4168967cc5ada7d0e5a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 8 Mar 2025 18:21:16 +0000 Subject: io_uring: rely on io_prep_reg_vec for iovec placement All vectored reg buffer users should use io_import_reg_vec() for iovec imports, since iovec placement is the function's responsibility and callers shouldn't know much about it, drop the offset parameter from io_prep_reg_vec() and calculate it inside. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/08ed87ca4bbc06724373b6ce06f36b703fe60c4e.1741457480.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 4 +--- io_uring/rsrc.c | 8 ++++---- io_uring/rsrc.h | 3 +-- io_uring/rw.c | 3 +-- 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 6b8dbadf445f..1e36a72e4008 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1513,12 +1513,10 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (req->flags & REQ_F_IMPORT_BUFFER) { unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; - unsigned iovec_off = kmsg->vec.nr - uvec_segs; int ret; ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req, - &kmsg->vec, uvec_segs, iovec_off, - issue_flags); + &kmsg->vec, uvec_segs, issue_flags); if (unlikely(ret)) return ret; kmsg->msg.sg_from_iter = io_sg_from_iter; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0e413e910f3d..607b09bd8374 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1349,11 +1349,11 @@ static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, - unsigned nr_iovs, unsigned iovec_off, - unsigned issue_flags) + unsigned nr_iovs, unsigned issue_flags) { struct io_rsrc_node *node; struct io_mapped_ubuf *imu; + unsigned iovec_off; struct iovec *iov; unsigned nr_segs; @@ -1366,6 +1366,7 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, if (!(imu->dir & (1 << ddir))) return -EFAULT; + iovec_off = vec->nr - nr_iovs; iov = vec->iovec + iovec_off; nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); @@ -1377,8 +1378,7 @@ int io_import_reg_vec(int ddir, struct iov_iter *iter, nr_segs += nr_iovs; } - if (WARN_ON_ONCE(iovec_off + nr_iovs != vec->nr) || - nr_segs > vec->nr) { + if (nr_segs > vec->nr) { struct iou_vec tmp_vec = {}; int 
ret; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 43f784915573..b52242852ff3 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -65,8 +65,7 @@ int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, unsigned issue_flags); int io_import_reg_vec(int ddir, struct iov_iter *iter, struct io_kiocb *req, struct iou_vec *vec, - unsigned nr_iovs, unsigned iovec_off, - unsigned issue_flags); + unsigned nr_iovs, unsigned issue_flags); int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, const struct iovec __user *uvec, size_t uvec_segs); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4861b876f48e..246b22225919 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -390,11 +390,10 @@ static int io_rw_import_reg_vec(struct io_kiocb *req, { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned uvec_segs = rw->len; - unsigned iovec_off = io->vec.nr - uvec_segs; int ret; ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, - uvec_segs, iovec_off, issue_flags); + uvec_segs, issue_flags); if (unlikely(ret)) return ret; iov_iter_save_state(&io->iter, &io->iter_state); -- cgit From 575e7b0629d4bd485517c40ff20676180476f5f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Mar 2025 06:12:47 +0000 Subject: io_uring: rename the data cmd cache Pick a more descriptive name for the cmd async data cache. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250319061251.21452-2-sidong.yang@furiosa.ai Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/uring_cmd.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5ff30a7092ed..7f26ad334e30 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->cmd_cache, kfree); io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); @@ -334,7 +334,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); - ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_uring_cmd_data), 0); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index de39b602aa82..792bd54851b1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -28,7 +28,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -171,7 +171,7 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache; - cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req); + cache = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); if (!cache) return -ENOMEM; cache->op_data = NULL; 
-- cgit From 5f14404bfa245a156915ee44c827edc56655b067 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 19 Mar 2025 06:12:48 +0000 Subject: io_uring/cmd: don't expose entire cmd async data io_uring needs private bits in cmd's ->async_data, and they should never be exposed to drivers as it'd certainly be abused. Leave struct io_uring_cmd_data for the drivers but wrap it into a structure. It's a prep patch and doesn't do anything useful yet. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250319061251.21452-3-sidong.yang@furiosa.ai Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/opdef.c | 2 +- io_uring/uring_cmd.c | 18 +++++++++++------- io_uring/uring_cmd.h | 6 ++++++ 4 files changed, 19 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7f26ad334e30..5eb9be063a7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -335,7 +335,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_uring_cmd_data), 0); + sizeof(struct io_async_cmd), 0); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 7fd173197b1e..e4aa61a414fb 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -416,7 +416,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_uring_cmd_data), + .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 792bd54851b1..7c126ee497ea 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -19,7 +19,8 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache = req->async_data; + struct io_async_cmd *ac = req->async_data; + struct io_uring_cmd_data *cache = &ac->data; if (cache->op_data) { kfree(cache->op_data); @@ -169,12 +170,15 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_uring_cmd_data *cache; + struct io_async_cmd *ac; - cache = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); - if (!cache) + /* see io_uring_cmd_get_async_data() */ + BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); + + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); + if (!ac) return -ENOMEM; - cache->op_data = NULL; + ac->data.op_data = NULL; /* * Unconditionally cache the SQE for now - this is only needed for @@ -183,8 +187,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, * that it doesn't read in per-op data, play it safe and ensure that * any SQE data is stable beyond prep. This can later get relaxed. 
*/ - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = cache->sqes; + memcpy(ac->data.sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->data.sqes; return 0; } diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index f6837ee0955b..2ec3a8785534 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,5 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include + +struct io_async_cmd { + struct io_uring_cmd_data data; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -- cgit From 3a4689ac109f18f23ea0d0c1c79e055142796858 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Mar 2025 18:04:33 +0000 Subject: io_uring/cmd: add iovec cache for commands Add iou_vec to commands and wire caching for it, but don't expose it to users just yet. We need the vec cleared on initial alloc, but since we can't place it at the beginning at the moment, zero the entire async_data. It's cached, so the extra cost affects only the initial allocation, and it might not be a bad idea since we're exposing those bits to outside drivers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c0f2145b75791bc6106eb4e72add2cf6a2c72a7a.1742579999.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- io_uring/opdef.c | 1 + io_uring/uring_cmd.c | 20 +++++++++++++++++++- io_uring/uring_cmd.h | 5 +++++ 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5eb9be063a7c..e1128b9551aa 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->cmd_cache, kfree); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_rsrc_cache_free(ctx); @@ -335,7 +335,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_rw), offsetof(struct io_async_rw, clear)); ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_cmd), 0); + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e4aa61a414fb..489384c0438b 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -755,6 +755,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 7c126ee497ea..6a21cdaaf495 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,6 +16,14 @@ #include "rsrc.h" #include "uring_cmd.h" +void io_cmd_cache_free(const void *entry) +{ + struct io_async_cmd *ac = (struct io_async_cmd *)entry; + + io_vec_free(&ac->vec); + kfree(ac); +} + static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -29,13 +37,23 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags &
IO_URING_F_UNLOCKED) return; + + io_alloc_cache_vec_kasan(&ac->vec); + if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&ac->vec); + if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } +void io_uring_cmd_cleanup(struct io_kiocb *req) +{ + io_req_uring_cleanup(req, 0); +} + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 2ec3a8785534..b45ec7cffcd1 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,13 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include struct io_async_cmd { struct io_uring_cmd_data data; + struct iou_vec vec; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); + +void io_cmd_cache_free(const void *entry); -- cgit From ef490275297267d9461733ecd9b02bd3b798b3a4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 21 Mar 2025 18:04:34 +0000 Subject: io_uring/cmd: introduce io_uring_cmd_import_fixed_vec io_uring_cmd_import_fixed_vec() is a cmd helper around the vectored registered buffer import functions, which caches the memory under the hood. The lifetime of the vector, and hence the iterator, is bound to the request. Furthermore, the user is not allowed to call it multiple times for a single request. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/97487a80dec3fb8cf8aeedf1f9026ef6d503fe4b.1742579999.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 19 +++++++++++++++++++ io_uring/uring_cmd.h | 6 ++++++ 2 files changed, 25 insertions(+) (limited to 'io_uring') diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 6a21cdaaf495..f2cfc371f3d0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -277,6 +277,25 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + struct io_async_cmd *ac = req->async_data; + int ret; + + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); + if (ret) + return ret; + + return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs, + issue_flags); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec); + void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index b45ec7cffcd1..14e525255854 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -16,3 +16,9 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_cmd_cache_free(const void *entry); + +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags); -- cgit From 8e3100fcc5cbba03518b8b5c059624aba5c29d50 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 21 Mar 2025 12:48:17 -0600 Subject:
io_uring/net: only import send_zc buffer once io_send_zc() guards its call to io_send_zc_import() with if (!done_io) in an attempt to avoid calling it redundantly on the same req. However, if the initial non-blocking issue returns -EAGAIN, done_io will stay 0. This causes the subsequent issue to unnecessarily re-import the buffer. Add an explicit flag "imported" to io_sr_msg to track if its buffer has already been imported. Clear the flag in io_send_zc_prep(). Call io_send_zc_import() and set the flag in io_send_zc() if it is unset. Signed-off-by: Caleb Sander Mateos Fixes: 54cdcca05abd ("io_uring/net: switch io_send() and io_send_zc() to using io_async_msghdr") Link: https://lore.kernel.org/r/20250321184819.3847386-2-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/net.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 1e36a72e4008..030b76e6f6f8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -77,6 +77,7 @@ struct io_sr_msg { /* initialised and used only by !msg send variants */ u16 buf_group; bool retry; + bool imported; /* only for io_send_zc */ void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -1306,6 +1307,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->done_io = 0; zc->retry = false; + zc->imported = false; req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) @@ -1451,7 +1453,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) (zc->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!zc->done_io) { + if (!zc->imported) { + zc->imported = true; ret = io_send_zc_import(req, issue_flags); if (unlikely(ret)) return ret; -- cgit From 3f0cb8de56b9a5c052a9e43fa548856926059810 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Mar 2025 15:32:32 +0000 Subject: io_uring: fix retry handling off iowq io_req_complete_post() doesn't handle reissue and if called with a REQ_F_REISSUE request it might post extra unexpected completions. Fix it by pushing into flush_completion via task work. Fixes: d803d123948fe ("io_uring/rw: handle -EAGAIN retry at IO completion time") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/badb3d7e462881e7edbfcc2be6301090b07dbe53.1742829388.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e1128b9551aa..e6c462948273 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -904,7 +904,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->lockless_cq) { + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; -- cgit From 3afcb3b2e3a4ead24e3ab476576e87877d55ee22 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Mar 2025 15:32:33 +0000 Subject: io_uring: defer iowq cqe overflow via task_work Don't handle CQE overflows in io_req_complete_post() and defer it to flush_completions. It cuts some duplication, and I also want to limit the number of places directly overflowing completions. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9046410ac27e18f2baa6f7cdb363ec921cbc3b79.1742829388.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e6c462948273..1fcfe62cecd9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -892,6 +892,7 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool completed = true; /* * All execution paths but io-wq use the deferred completions by @@ -905,18 +906,20 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * the submitter task context, IOPOLL protects with uring_lock. */ if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!io_fill_cqe_req(ctx, req)) - io_req_cqe_overflow(req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); + if (!completed) + goto defer_complete; + /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. -- cgit From 4c76de42cb6971fc2cc7984ed974caffe6ea7fda Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Mar 2025 15:32:34 +0000 Subject: io_uring: open code __io_post_aux_cqe() There is no reason to keep __io_post_aux_cqe() separately from io_post_aux_cqe(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2c4c1f68d694deea25a212fc09bbb11f330cd82e.1742829388.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1fcfe62cecd9..df3685803ef7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -834,24 +834,14 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - - return filled; -} - -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - bool filled; - - io_cq_lock(ctx); - filled = __io_post_aux_cqe(ctx, user_data, res, cflags); io_cq_unlock_post(ctx); return filled; } -- cgit From d73acd7af3a329e8ebe5df1c738ae8c1d0a5f778 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Mar 2025 15:32:35 +0000 Subject: io_uring: rename "min" arg in io_iopoll_check() Don't name arguments "min", it shadows the namesake function. min_events is also more consistent. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f52ce9d88d3bca5732a218b0da14924aa6968909.1742829388.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index df3685803ef7..6022a00de95b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1505,7 +1505,7 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, long min_events) { unsigned int nr_events = 0; unsigned long check_cq; @@ -1551,7 +1551,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min); + (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1564,7 +1564,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); + ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; @@ -1574,7 +1574,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; nr_events += ret; - } while (nr_events < min); + } while (nr_events < min_events); return 0; } -- cgit From 816619782bdc70d7f33a8d0cda36d61414cec467 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Mar 2025 15:32:36 +0000 Subject: io_uring: move min_events sanitisation iopoll and normal waiting already duplicate min_completion truncation, so move them inside the corresponding routines. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/254adb289cc04638f25d746a7499260fa89a179e.1742829388.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6022a00de95b..4ea684a17d01 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1505,11 +1505,13 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min_events) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) @@ -2537,6 +2539,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, ktime_t start_time; int ret; + min_events = min_t(int, min_events, ctx->cq_entries); + if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) @@ -3420,22 +3424,16 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, mutex_lock(&ctx->uring_lock); iopoll_locked: ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); - } } 
if (!ret) { -- cgit From 73b6dacb1c6feae8ca4a6ff120848430aeb57fbd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 25 Mar 2025 08:39:42 -0600 Subject: io_uring/net: use REQ_F_IMPORT_BUFFER for send_zc Instead of a bool field in struct io_sr_msg, use REQ_F_IMPORT_BUFFER to track whether io_send_zc() has already imported the buffer. This flag already serves a similar purpose for sendmsg_zc and {read,write}v_fixed. Signed-off-by: Caleb Sander Mateos Suggested-by: Pavel Begunkov Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250325143943.1226467-1-csander@purestorage.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 030b76e6f6f8..c0275e7f034a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -77,7 +77,6 @@ struct io_sr_msg { /* initialised and used only by !msg send variants */ u16 buf_group; bool retry; - bool imported; /* only for io_send_zc */ void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -1307,7 +1306,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->done_io = 0; zc->retry = false; - zc->imported = false; req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) @@ -1353,8 +1351,10 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; - if (req->opcode != IORING_OP_SENDMSG_ZC) + if (req->opcode == IORING_OP_SEND_ZC) { + req->flags |= REQ_F_IMPORT_BUFFER; return io_send_setup(req, sqe); + } return io_sendmsg_zc_setup(req, sqe); } @@ -1453,8 +1453,8 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) (zc->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!zc->imported) { - zc->imported = true; + if (req->flags & REQ_F_IMPORT_BUFFER) { + req->flags &= ~REQ_F_IMPORT_BUFFER; ret = io_send_zc_import(req, issue_flags); if (unlikely(ret)) return ret; -- cgit From 6889ae1b4df1579bcdffef023e2ea9a982565dff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Mar 2025 09:57:27 +0000 Subject: io_uring/net: fix io_req_post_cqe abuse by send bundle [ 114.987980][ T5313] WARNING: CPU: 6 PID: 5313 at io_uring/io_uring.c:872 io_req_post_cqe+0x12e/0x4f0 [ 114.991597][ T5313] RIP: 0010:io_req_post_cqe+0x12e/0x4f0 [ 115.001880][ T5313] Call Trace: [ 115.002222][ T5313] [ 115.007813][ T5313] io_send+0x4fe/0x10f0 [ 115.009317][ T5313] io_issue_sqe+0x1a6/0x1740 [ 115.012094][ T5313] io_wq_submit_work+0x38b/0xed0 [ 115.013223][ T5313] io_worker_handle_work+0x62a/0x1600 [ 115.013876][ T5313] io_wq_worker+0x34f/0xdf0 As the comment states, io_req_post_cqe() should only be used by multishot requests, i.e. REQ_F_APOLL_MULTISHOT, which bundled sends are not. Add a flag signifying whether a request wants to post multiple CQEs. Eventually REQ_F_APOLL_MULTISHOT should imply the new flag, but that's left out for simplicity. 
Cc: stable@vger.kernel.org Fixes: a05d1f625c7aa ("io_uring/net: support bundles for send") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b611dbb54d1cd47a88681f5d38c84d0c02bc563.1743067183.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/net.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4ea684a17d01..4e362c8542a7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1840,7 +1840,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. */ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1851,7 +1851,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } diff --git a/io_uring/net.c b/io_uring/net.c index c0275e7f034a..616e953ef0ae 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -448,6 +448,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_WAITALL; sr->buf_group = req->buf_index; req->buf_list = NULL; + req->flags |= REQ_F_MULTISHOT; } if (io_is_compat(req->ctx)) -- cgit
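As a closing illustration of how the io_uring_cmd_import_fixed_vec() helper exported earlier in this series is meant to be consumed, below is a rough driver-side sketch. Everything driver-specific is hypothetical: the command layout (struct my_cmd), the handler name, and the ITER_DEST direction are invented for the example, and only the helper's signature is taken from the patch that adds it. As of these patches the prototype still lives in io_uring's internal uring_cmd.h, so the sketch assumes it is reachable from driver code (e.g. through linux/io_uring/cmd.h).

struct my_cmd {
	__u64	uvec;		/* user pointer to a struct iovec array */
	__u32	nr_segs;
	__u32	pad;
};

static int my_driver_uring_cmd(struct io_uring_cmd *ioucmd,
			       unsigned int issue_flags)
{
	const struct my_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
	struct iov_iter iter;
	int ret;

	/* resolve the registered-buffer iovec array; allowed at most once
	 * per request, and the io_uring core caches and frees the vector */
	ret = io_uring_cmd_import_fixed_vec(ioucmd,
					    u64_to_user_ptr(READ_ONCE(cmd->uvec)),
					    READ_ONCE(cmd->nr_segs),
					    ITER_DEST, &iter, issue_flags);
	if (ret)
		return ret;

	/* a real driver would start device I/O against &iter here and
	 * complete later via io_uring_cmd_done(); returning 0 simply
	 * finishes the command with a zero result */
	return 0;
}

Because the request owns the cached iou_vec, the driver never frees the imported iovec array itself, and the iterator must not be used past the request's lifetime.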