summaryrefslogtreecommitdiff
path: root/io_uring/io_uring.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 20:27:33 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-20 20:27:33 -0800
commita312e1706ce6c124f04ec85ddece240f3bb2a696 (patch)
treeba749bac4f345fc48c7c79d9a9c5713fe87af300 /io_uring/io_uring.c
parent1cbfb828e05171ca2dd77b5988d068e6872480fe (diff)
parent561e3a0c40dc7e3ab7b0b3647a2b89eca16215d9 (diff)
Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: "Not a lot in terms of features this time around, mostly just cleanups and code consolidation: - Support for PI meta data read/write via io_uring, with NVMe and SCSI covered - Cleanup the per-op structure caching, making it consistent across various command types - Consolidate the various user mapped features into a concept called regions, making the various users of that consistent - Various cleanups and fixes" * tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits) io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname io_uring: reuse io_should_terminate_tw() for cmds io_uring: Factor out a function to parse restrictions io_uring/rsrc: require cloned buffers to share accounting contexts io_uring: simplify the SQPOLL thread check when cancelling requests io_uring: expose read/write attribute capability io_uring/rw: don't gate retry on completion context io_uring/rw: handle -EAGAIN retry at IO completion time io_uring/rw: use io_rw_recycle() from cleanup path io_uring/rsrc: simplify the bvec iter count calculation io_uring: ensure io_queue_deferred() is out-of-line io_uring/rw: always clear ->bytes_done on io_async_rw setup io_uring/rw: use NULL for rw->free_iovec assigment io_uring/rw: don't mask in f_iocb_flags io_uring/msg_ring: Drop custom destructor io_uring: Move old async data allocation helper to header io_uring/rw: Allocate async data through helper io_uring/net: Allocate msghdr async data through helper io_uring/uring_cmd: Allocate async data through generic helper io_uring/poll: Allocate apoll with generic alloc_cache helper ...
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--io_uring/io_uring.c136
1 files changed, 60 insertions, 76 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4758f1ba902b..7bfbc7c22367 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,7 @@
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
- IO_REQ_CLEAN_FLAGS)
+ REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -143,7 +143,8 @@ struct io_defer_entry {
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all);
+ bool cancel_all,
+ bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
@@ -350,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
- mutex_init(&ctx->resize_lock);
+ mutex_init(&ctx->mmap_lock);
return ctx;
@@ -361,7 +362,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
@@ -550,8 +551,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
+ spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@@ -562,6 +564,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
io_req_task_queue(de->req);
kfree(de);
}
+ spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -570,11 +573,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active) {
- spin_lock(&ctx->completion_lock);
+ if (ctx->drain_active)
io_queue_deferred(ctx);
- spin_unlock(&ctx->completion_lock);
- }
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
}
@@ -1401,6 +1401,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_REISSUE) {
+ node = req->comp_list.next;
+ req->flags &= ~REQ_F_REISSUE;
+ io_queue_iowq(req);
+ continue;
+ }
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
@@ -1440,7 +1446,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP) &&
+ /*
+ * Requests marked with REQUEUE should not post a CQE, they
+ * will go through the io-wq retry machinery and post one
+ * later.
+ */
+ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
@@ -1640,19 +1651,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-bool io_alloc_async_data(struct io_kiocb *req)
-{
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
-
- WARN_ON_ONCE(!def->async_size);
- req->async_data = kmalloc(def->async_size, GFP_KERNEL);
- if (req->async_data) {
- req->flags |= REQ_F_ASYNC_DATA;
- return false;
- }
- return true;
-}
-
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
@@ -2631,36 +2629,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
- size);
-}
-
-static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
- size);
-}
-
static void io_rings_free(struct io_ring_ctx *ctx)
{
- if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
- true);
- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
- true);
- } else {
- io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
- ctx->n_ring_pages = 0;
- io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
- ctx->n_sqe_pages = 0;
- vunmap(ctx->rings);
- vunmap(ctx->sq_sqes);
- }
-
+ io_free_region(ctx, &ctx->sq_region);
+ io_free_region(ctx, &ctx->ring_region);
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
@@ -2732,7 +2704,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
io_free_region(ctx, &ctx->param_region);
@@ -2894,7 +2866,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
- while (io_uring_try_cancel_requests(ctx, NULL, true))
+ /* The SQPOLL thread never reaches this path */
+ while (io_uring_try_cancel_requests(ctx, NULL, true, false))
cond_resched();
if (ctx->sq_data) {
@@ -3062,7 +3035,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all)
+ bool cancel_all,
+ bool is_sqpoll_thread)
{
struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
enum io_wq_cancel cret;
@@ -3092,7 +3066,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
- (ctx->sq_data && ctx->sq_data->thread == current)) {
+ is_sqpoll_thread) {
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
@@ -3165,13 +3139,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
continue;
loop |= io_uring_try_cancel_requests(node->ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ false);
}
} else {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
loop |= io_uring_try_cancel_requests(ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ true);
}
if (loop) {
@@ -3233,6 +3209,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
end > ctx->cq_wait_size))
return ERR_PTR(-EFAULT);
+ offset = array_index_nospec(offset, ctx->cq_wait_size - size);
return ctx->cq_wait_arg + offset;
}
@@ -3477,9 +3454,10 @@ bool io_is_uring_fops(struct file *file)
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_uring_region_desc rd;
struct io_rings *rings;
size_t size, sq_array_offset;
- void *ptr;
+ int ret;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
@@ -3490,15 +3468,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
- else
- rings = io_rings_map(ctx, p->cq_off.user_addr, size);
-
- if (IS_ERR(rings))
- return PTR_ERR(rings);
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->cq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+ if (ret)
+ return ret;
+ ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
- ctx->rings = rings;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
@@ -3515,17 +3495,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
}
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
- else
- ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
-
- if (IS_ERR(ptr)) {
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->sq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
+ if (ret) {
io_rings_free(ctx);
- return PTR_ERR(ptr);
+ return ret;
}
-
- ctx->sq_sqes = ptr;
+ ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
return 0;
}
@@ -3733,7 +3714,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
+ IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
+ IORING_FEAT_RW_ATTR;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3894,6 +3876,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+ BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=