From 4ec703ec0c384a2199808c4eb2e9037236285a8d Mon Sep 17 00:00:00 2001
From: Alok Tiwari
Date: Sat, 18 Oct 2025 12:32:54 -0700
Subject: io_uring: fix incorrect unlikely() usage in io_waitid_prep()

The negation operator is incorrectly placed outside the unlikely() macro:

	if (!unlikely(iwa))

This inverts the compiler branch prediction hint, marking the NULL case
as likely instead of unlikely. The intent is to indicate that allocation
failures are rare, consistent with common kernel patterns. Move the
negation inside unlikely():

	if (unlikely(!iwa))

Fixes: 2b4fc4cd43f2 ("io_uring/waitid: setup async data in the prep handler")
Signed-off-by: Alok Tiwari
Reviewed-by: Gabriel Krisman Bertazi
Reviewed-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 io_uring/waitid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index f25110fb1b12..53532ae6256c 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -250,7 +250,7 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;

 	iwa = io_uring_alloc_async_data(NULL, req);
-	if (!unlikely(iwa))
+	if (unlikely(!iwa))
 		return -ENOMEM;

 	iwa->req = req;
--
cgit
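For illustration only, here is a minimal userspace sketch of the same branch-hint pattern, using __builtin_expect directly the way the kernel's likely()/unlikely() macros do. The alloc_foo()/setup_foo() helpers are hypothetical stand-ins, not io_uring code:

    #include <stdlib.h>

    #define my_unlikely(x)  __builtin_expect(!!(x), 0)

    /* hypothetical helper standing in for io_uring_alloc_async_data() */
    static void *alloc_foo(size_t len)
    {
        return malloc(len);
    }

    static int setup_foo(void **out, size_t len)
    {
        void *foo = alloc_foo(len);

        /* allocation failure is the cold path: the negation goes inside the hint */
        if (my_unlikely(!foo))
            return -1;
        *out = foo;
        return 0;
    }

    int main(void)
    {
        void *p = NULL;

        if (setup_foo(&p, 64))
            return 1;
        free(p);
        return 0;
    }

With the negation inside the hint, the compiler lays out the error return as the cold path, which is what the fix above restores.
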
From 8ac9b0d33e5c0a995338ee5f25fe1b6ff7d97f65 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 21 Oct 2025 07:16:08 -0600
Subject: io_uring/sqpoll: switch away from getrusage() for CPU accounting

getrusage() does a lot more than what the SQPOLL accounting needs, the
latter only cares about (and uses) the stime. Rather than do a full
RUSAGE_SELF summation, just query the used stime instead.

Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c |  8 ++++----
 io_uring/sqpoll.c | 32 ++++++++++++++++++--------------
 io_uring/sqpoll.h |  1 +
 3 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index ff3364531c77..294c75a8a3bd 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -59,7 +59,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 {
 	struct io_overflow_cqe *ocqe;
 	struct io_rings *r = ctx->rings;
-	struct rusage sq_usage;
 	unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
 	unsigned int sq_head = READ_ONCE(r->sq.head);
 	unsigned int sq_tail = READ_ONCE(r->sq.tail);
@@ -152,14 +151,15 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
 	 * thread termination.
 	 */
 	if (tsk) {
+		u64 usec;
+
 		get_task_struct(tsk);
 		rcu_read_unlock();
-		getrusage(tsk, RUSAGE_SELF, &sq_usage);
+		usec = io_sq_cpu_usec(tsk);
 		put_task_struct(tsk);
 		sq_pid = sq->task_pid;
 		sq_cpu = sq->sq_cpu;
-		sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
-				 + sq_usage.ru_stime.tv_usec);
+		sq_total_time = usec;
 		sq_work_time = sq->work_time;
 	} else {
 		rcu_read_unlock();
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index a3f11349ce06..2b816fdb9866 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include

 #include
@@ -169,6 +170,20 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 	return READ_ONCE(sqd->state);
 }

+u64 io_sq_cpu_usec(struct task_struct *tsk)
+{
+	u64 utime, stime;
+
+	task_cputime_adjusted(tsk, &utime, &stime);
+	do_div(stime, 1000);
+	return stime;
+}
+
+static void io_sq_update_worktime(struct io_sq_data *sqd, u64 usec)
+{
+	sqd->work_time += io_sq_cpu_usec(current) - usec;
+}
+
 static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 {
 	unsigned int to_submit;
@@ -255,26 +270,15 @@ static bool io_sq_tw_pending(struct llist_node *retry_list)
 	return retry_list || !llist_empty(&tctx->task_list);
 }

-static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
-{
-	struct rusage end;
-
-	getrusage(current, RUSAGE_SELF, &end);
-	end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
-	end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
-
-	sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
-}
-
 static int io_sq_thread(void *data)
 {
 	struct llist_node *retry_list = NULL;
 	struct io_sq_data *sqd = data;
 	struct io_ring_ctx *ctx;
-	struct rusage start;
 	unsigned long timeout = 0;
 	char buf[TASK_COMM_LEN] = {};
 	DEFINE_WAIT(wait);
+	u64 start;

 	/* offload context creation failed, just exit */
 	if (!current->io_uring) {
@@ -317,7 +321,7 @@ static int io_sq_thread(void *data)
 		}

 		cap_entries = !list_is_singular(&sqd->ctx_list);
-		getrusage(current, RUSAGE_SELF, &start);
+		start = io_sq_cpu_usec(current);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
 			int ret = __io_sq_thread(ctx, cap_entries);

@@ -333,7 +337,7 @@ static int io_sq_thread(void *data)

 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
-				io_sq_update_worktime(sqd, &start);
+				io_sq_update_worktime(sqd, start);
 				timeout = jiffies + sqd->sq_thread_idle;
 			}
 			if (unlikely(need_resched())) {
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index b83dcdec9765..fd2f6f29b516 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -29,6 +29,7 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
 void io_put_sq_data(struct io_sq_data *sqd);
 void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
 int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+u64 io_sq_cpu_usec(struct task_struct *tsk);

 static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
 {
--
cgit
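For a rough sense of the units involved: the removed getrusage() path summed ru_stime into microseconds, while io_sq_cpu_usec() takes the stime reported by task_cputime_adjusted() (nanoseconds in current kernels) and divides it down to microseconds, as the do_div() in the patch shows. A minimal userspace analogue of the old computation, for illustration only and not kernel code:

    #include <stdio.h>
    #include <sys/resource.h>

    /* system CPU time consumed so far, in microseconds (mirrors the removed getrusage() path) */
    static unsigned long long stime_usec(void)
    {
        struct rusage ru;

        if (getrusage(RUSAGE_SELF, &ru) != 0)
            return 0;
        return (unsigned long long)ru.ru_stime.tv_sec * 1000000ULL + ru.ru_stime.tv_usec;
    }

    int main(void)
    {
        unsigned long long start = stime_usec();

        /* ... do some work ... */
        printf("stime delta: %llu usec\n", stime_usec() - start);
        return 0;
    }
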
From a94e0657269c5b8e1a90b17aa2c048b3d276e16d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 21 Oct 2025 11:44:39 -0600
Subject: io_uring/sqpoll: be smarter on when to update the stime usage

The current approach is a bit naive, and hence calls the time querying
way too often. Only start the "doing work" timer when there's actual
work to do, and then use that information to terminate (and account) the
work time once done. This greatly reduces the frequency of these calls,
when they cannot have changed anyway.

Running a basic random reader that is setup to use SQPOLL, a profile
before this change shows these as the top cycle consumers:

+   32.60%  iou-sqp-1074  [kernel.kallsyms]  [k] thread_group_cputime_adjusted
+   19.97%  iou-sqp-1074  [kernel.kallsyms]  [k] thread_group_cputime
+   12.20%  io_uring      io_uring           [.] submitter_uring_fn
+    4.13%  iou-sqp-1074  [kernel.kallsyms]  [k] getrusage
+    2.45%  iou-sqp-1074  [kernel.kallsyms]  [k] io_submit_sqes
+    2.18%  iou-sqp-1074  [kernel.kallsyms]  [k] __pi_memset_generic
+    2.09%  iou-sqp-1074  [kernel.kallsyms]  [k] cputime_adjust

and after this change, top of profile looks as follows:

+   36.23%  io_uring     io_uring           [.] submitter_uring_fn
+   23.26%  iou-sqp-819  [kernel.kallsyms]  [k] io_sq_thread
+   10.14%  iou-sqp-819  [kernel.kallsyms]  [k] io_sq_tw
+    6.52%  iou-sqp-819  [kernel.kallsyms]  [k] tctx_task_work_run
+    4.82%  iou-sqp-819  [kernel.kallsyms]  [k] nvme_submit_cmds.part.0
+    2.91%  iou-sqp-819  [kernel.kallsyms]  [k] io_submit_sqes
[...]
     0.02%  iou-sqp-819  [kernel.kallsyms]  [k] cputime_adjust

where it's spending the cycles on things that actually matter.

Reported-by: Fengnan Chang
Cc: stable@vger.kernel.org
Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
Signed-off-by: Jens Axboe
---
 io_uring/sqpoll.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 2b816fdb9866..e22f072c7d5f 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -170,6 +170,11 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)
 	return READ_ONCE(sqd->state);
 }

+struct io_sq_time {
+	bool started;
+	u64 usec;
+};
+
 u64 io_sq_cpu_usec(struct task_struct *tsk)
 {
 	u64 utime, stime;
@@ -179,12 +184,24 @@ u64 io_sq_cpu_usec(struct task_struct *tsk)
 	return stime;
 }

-static void io_sq_update_worktime(struct io_sq_data *sqd, u64 usec)
+static void io_sq_update_worktime(struct io_sq_data *sqd, struct io_sq_time *ist)
+{
+	if (!ist->started)
+		return;
+	ist->started = false;
+	sqd->work_time += io_sq_cpu_usec(current) - ist->usec;
+}
+
+static void io_sq_start_worktime(struct io_sq_time *ist)
 {
-	sqd->work_time += io_sq_cpu_usec(current) - usec;
+	if (ist->started)
+		return;
+	ist->started = true;
+	ist->usec = io_sq_cpu_usec(current);
 }

-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
+static int __io_sq_thread(struct io_ring_ctx *ctx, struct io_sq_data *sqd,
+			  bool cap_entries, struct io_sq_time *ist)
 {
 	unsigned int to_submit;
 	int ret = 0;
@@ -197,6 +214,8 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 	if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
 		const struct cred *creds = NULL;

+		io_sq_start_worktime(ist);
+
 		if (ctx->sq_creds != current_cred())
 			creds = override_creds(ctx->sq_creds);

@@ -278,7 +297,6 @@ static int io_sq_thread(void *data)
 	unsigned long timeout = 0;
 	char buf[TASK_COMM_LEN] = {};
 	DEFINE_WAIT(wait);
-	u64 start;

 	/* offload context creation failed, just exit */
 	if (!current->io_uring) {
@@ -313,6 +331,7 @@ static int io_sq_thread(void *data)
 	mutex_lock(&sqd->lock);
 	while (1) {
 		bool cap_entries, sqt_spin = false;
+		struct io_sq_time ist = { };

 		if (io_sqd_events_pending(sqd) || signal_pending(current)) {
 			if (io_sqd_handle_event(sqd))
@@ -321,9 +340,8 @@ static int io_sq_thread(void *data)
 		}

 		cap_entries = !list_is_singular(&sqd->ctx_list);
-		start = io_sq_cpu_usec(current);
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
-			int ret = __io_sq_thread(ctx, cap_entries);
+			int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist);

 			if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
 				sqt_spin = true;
@@ -331,15 +349,18 @@ static int io_sq_thread(void *data)
 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;

-		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
-			if (io_napi(ctx))
+		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
+			if (io_napi(ctx)) {
+				io_sq_start_worktime(&ist);
 				io_napi_sqpoll_busy_poll(ctx);
+			}
+		}
+
+		io_sq_update_worktime(sqd, &ist);

 		if (sqt_spin || !time_after(jiffies, timeout)) {
-			if (sqt_spin) {
-				io_sq_update_worktime(sqd, start);
+			if (sqt_spin)
 				timeout = jiffies + sqd->sq_thread_idle;
-			}
 			if (unlikely(need_resched())) {
 				mutex_unlock(&sqd->lock);
 				cond_resched();
--
cgit
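The shape of the change is a start-lazily, account-once pattern: sample the clock only when work actually shows up in an iteration, and fold the delta into the running total a single time at the end. A standalone sketch of that pattern, assuming a monotonic clock rather than task stime, for illustration only:

    #include <stdbool.h>
    #include <time.h>

    struct work_time {
        bool started;
        unsigned long long usec;
    };

    static unsigned long long now_usec(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000;
    }

    /* sample the clock only the first time work shows up in this iteration */
    static void work_time_start(struct work_time *wt)
    {
        if (wt->started)
            return;
        wt->started = true;
        wt->usec = now_usec();
    }

    /* fold the delta into the running total once, and only if work was started */
    static void work_time_update(unsigned long long *total, struct work_time *wt)
    {
        if (!wt->started)
            return;
        wt->started = false;
        *total += now_usec() - wt->usec;
    }

    int main(void)
    {
        unsigned long long total = 0;
        struct work_time wt = { 0 };

        work_time_start(&wt);   /* only called when there is actual work */
        work_time_update(&total, &wt);
        return (int)total;
    }

Idle iterations never touch the clock at all, which is where the cputime_adjust overhead in the "before" profile disappears.
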
From 915651b7c9473fd23d0e56fe227a97eda483cf7c Mon Sep 17 00:00:00 2001
From: Ranganath V N
Date: Tue, 21 Oct 2025 22:59:30 +0530
Subject: io_uring: Fix code indentation error

Fix the indentation to ensure consistent code style, improve
readability, and resolve the following checkpatch errors:

ERROR: code indent should use tabs where possible
+                       return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);$

ERROR: code indent should use tabs where possible
+^I^I^I    struct io_big_cqe *big_cqe)$

Tested by running scripts/checkpatch.pl.

Signed-off-by: Ranganath V N
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 +-
 io_uring/net.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 820ef0527666..296667ba712c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -879,7 +879,7 @@ static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags)
 }

 static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
-			    struct io_big_cqe *big_cqe)
+				   struct io_big_cqe *big_cqe)
 {
 	struct io_overflow_cqe *ocqe;

diff --git a/io_uring/net.c b/io_uring/net.c
index f99b90c762fc..a95cc9ca2a4d 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -383,7 +383,7 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return 0;

 	if (sr->flags & IORING_SEND_VECTORIZED)
-	        return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
+		return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE);
 	return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
 }

--
cgit
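For reference, the style check cited above is typically run from the top of a kernel tree against the touched source files, along the lines of the following invocation (one plausible example, not taken from the commit):

    ./scripts/checkpatch.pl -f io_uring/io_uring.c io_uring/net.c
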
From 060aa0b0c26c9e88cfc1433fab3d0145700e8247 Mon Sep 17 00:00:00 2001
From: David Wei
Date: Tue, 21 Oct 2025 13:29:44 -0700
Subject: io_uring zcrx: add MAINTAINERS entry

Same as [1], but also with netdev@ as an additional mailing list.
io_uring zero copy receive is of particular interest to netdev
participants too, given its tight integration with netdev core. With
this updated entry, folks running get_maintainer.pl on patches that
touch io_uring/zcrx.* will know to send them to netdev@ as well.

Note that this doesn't mean all changes require explicit acks from
netdev; this is purely for wider visibility and for other contributors
to know where to send patches.

[1]: https://lore.kernel.org/io-uring/989528e611b51d71fb712691ebfb76d2059ba561.1755461246.git.asml.silence@gmail.com/

Signed-off-by: David Wei
Acked-by: Jakub Kicinski
Reviewed-by: Mina Almasry
[axboe: use correct io_uring tree URL]
Signed-off-by: Jens Axboe
---
 MAINTAINERS | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 46126ce2f968..2ed9efa2d2a5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13110,6 +13110,15 @@ F:	include/uapi/linux/io_uring.h
 F:	include/uapi/linux/io_uring/
 F:	io_uring/

+IO_URING ZCRX
+M:	Pavel Begunkov
+L:	io-uring@vger.kernel.org
+L:	netdev@vger.kernel.org
+T:	git https://github.com/isilence/linux.git zcrx/for-next
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux.git
+S:	Maintained
+F:	io_uring/zcrx.*
+
 IPMI SUBSYSTEM
 M:	Corey Minyard
 L:	openipmi-developer@lists.sourceforge.net (moderated for non-subscribers)
--
cgit

From c5efc6a0b3940381d67887302ddb87a5cf623685 Mon Sep 17 00:00:00 2001
From: Alok Tiwari
Date: Thu, 23 Oct 2025 04:55:24 -0700
Subject: io_uring: correct __must_hold annotation in io_install_fixed_file

The __must_hold annotation references &req->ctx->uring_lock, but req is
not in scope in io_install_fixed_file(). Update the annotation to
reference the correct ctx->uring_lock, improving code clarity.

Fixes: f110ed8498af ("io_uring: split out fixed file installation and removal")
Signed-off-by: Alok Tiwari
Signed-off-by: Jens Axboe
---
 io_uring/filetable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index a21660e3145a..794ef95df293 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -57,7 +57,7 @@ void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table)

 static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
				 u32 slot_index)
-	__must_hold(&req->ctx->uring_lock)
+	__must_hold(&ctx->uring_lock)
 {
 	struct io_rsrc_node *node;

--
cgit
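For context, __must_hold() is a sparse annotation stating that the named lock is held on entry and still held on return, which is why the symbol it names has to be in scope. A minimal sketch of the usage pattern, with a no-op stub standing in for the kernel macro and a hypothetical struct foo/foo_update(), for illustration only:

    #include <pthread.h>

    #ifndef __must_hold
    #define __must_hold(x)  /* expands to a sparse context attribute in the kernel, a no-op here */
    #endif

    struct foo {
        pthread_mutex_t lock;
        int value;
    };

    /* the annotation documents that the caller holds foo->lock across this call */
    static void foo_update(struct foo *foo, int value)
        __must_hold(&foo->lock)
    {
        foo->value = value;
    }

    int main(void)
    {
        struct foo f = { .lock = PTHREAD_MUTEX_INITIALIZER, .value = 0 };

        pthread_mutex_lock(&f.lock);
        foo_update(&f, 42);
        pthread_mutex_unlock(&f.lock);
        return f.value == 42 ? 0 : 1;
    }

Because the annotation only names identifiers visible at the function definition, referencing req there was meaningless to the checker; ctx->uring_lock is the lock actually in scope.
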
From 6f1cbf6d6fd13fc169dde14e865897924cdc4bbd Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 24 Oct 2025 09:34:59 +0800
Subject: io_uring: fix buffer auto-commit for multishot uring_cmd

Commit 620a50c92700 ("io_uring: uring_cmd: add multishot support") added
multishot uring_cmd support with explicit upfront buffer commit via
io_uring_mshot_cmd_post_cqe(). However, the buffer selection path in
io_ring_buffer_select() was auto-committing buffers for non-pollable
files, which conflicts with uring_cmd's explicit upfront commit model.
The auto-commit consumes the whole selected buffer immediately, and
causes the following buffer selection to fail.

Fix this by checking for uring_cmd, which handles buffer commit
explicitly, and skipping the auto-commit for it.

Cc: Caleb Sander Mateos
Fixes: 620a50c92700 ("io_uring: uring_cmd: add multishot support")
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 io_uring/kbuf.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index aad655e38672..a727e020fe03 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -155,6 +155,27 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 	return 1;
 }

+static bool io_should_commit(struct io_kiocb *req, unsigned int issue_flags)
+{
+	/*
+	 * If we came in unlocked, we have no choice but to consume the
+	 * buffer here, otherwise nothing ensures that the buffer won't
+	 * get used by others. This does mean it'll be pinned until the
+	 * IO completes, coming in unlocked means we're being called from
+	 * io-wq context and there may be further retries in async hybrid
+	 * mode. For the locked case, the caller must call commit when
+	 * the transfer completes (or if we get -EAGAIN and must poll of
+	 * retry).
+	 */
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return true;
+
+	/* uring_cmd commits kbuf upfront, no need to auto-commit */
+	if (!io_file_can_poll(req) && req->opcode != IORING_OP_URING_CMD)
+		return true;
+	return false;
+}
+
 static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
					       struct io_buffer_list *bl,
					       unsigned int issue_flags)
@@ -181,17 +202,7 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	sel.buf_list = bl;
 	sel.addr = u64_to_user_ptr(buf->addr);

-	if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
-		/*
-		 * If we came in unlocked, we have no choice but to consume the
-		 * buffer here, otherwise nothing ensures that the buffer won't
-		 * get used by others. This does mean it'll be pinned until the
-		 * IO completes, coming in unlocked means we're being called from
-		 * io-wq context and there may be further retries in async hybrid
-		 * mode. For the locked case, the caller must call commit when
-		 * the transfer completes (or if we get -EAGAIN and must poll of
-		 * retry).
-		 */
+	if (io_should_commit(req, issue_flags)) {
 		io_kbuf_commit(req, sel.buf_list, *len, 1);
 		sel.buf_list = NULL;
 	}
--
cgit
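The new helper reduces the decision to a small predicate. The standalone sketch below mirrors that logic with the io_uring internals stubbed out; the constants, struct and function names are hypothetical stand-ins, for illustration only:

    #include <stdbool.h>
    #include <stdio.h>

    #define F_UNLOCKED	(1u << 0)	/* stand-in for IO_URING_F_UNLOCKED */

    enum { OP_NOP, OP_READ, OP_URING_CMD };	/* stand-in opcodes, not the real uapi values */

    struct req_stub {
        int opcode;
        bool file_can_poll;
    };

    /* mirrors io_should_commit(): commit the buffer now, or let the op commit it later */
    static bool should_commit(const struct req_stub *req, unsigned int issue_flags)
    {
        if (issue_flags & F_UNLOCKED)
            return true;		/* unlocked: must consume the buffer here */
        if (!req->file_can_poll && req->opcode != OP_URING_CMD)
            return true;		/* non-pollable file and the op does not self-commit */
        return false;			/* pollable file or uring_cmd: commit is deferred */
    }

    int main(void)
    {
        struct req_stub cmd = { .opcode = OP_URING_CMD, .file_can_poll = false };

        printf("uring_cmd, locked: auto-commit=%d\n", should_commit(&cmd, 0));
        printf("uring_cmd, unlocked: auto-commit=%d\n", should_commit(&cmd, F_UNLOCKED));
        return 0;
    }

With the locked uring_cmd case returning false, the selected buffer stays available and the operation commits it explicitly when it posts its CQE, which is the behavior the fix restores.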