diff options
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/fdinfo.c | 52 | ||||
-rw-r--r-- | io_uring/io_uring.c | 82 | ||||
-rw-r--r-- | io_uring/kbuf.c | 2 | ||||
-rw-r--r-- | io_uring/memmap.c | 2 | ||||
-rw-r--r-- | io_uring/rsrc.c | 109 | ||||
-rw-r--r-- | io_uring/sqpoll.c | 2 | ||||
-rw-r--r-- | io_uring/uring_cmd.c | 5 | ||||
-rw-r--r-- | io_uring/zcrx.c | 56 | ||||
-rw-r--r-- | io_uring/zcrx.h | 6 |
9 files changed, 179 insertions, 137 deletions
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f60d0a9d505e..e0d6a59a89fa 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -86,13 +86,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, } #endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -106,7 +101,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -123,11 +117,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { @@ -176,15 +170,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; /* @@ -206,7 +192,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); - for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) { + for (i = 0; i < ctx->file_table.data.nr; i++) { struct file *f = NULL; if (ctx->file_table.data.nodes[i]) @@ -218,7 +204,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } } seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); - for (i = 0; has_lock && i < ctx->buf_table.nr; i++) { + for (i = 0; i < ctx->buf_table.nr; i++) { struct io_mapped_ubuf *buf = NULL; if (ctx->buf_table.nodes[i]) @@ -228,7 +214,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) else seq_printf(m, "%5u: <none>\n", i); } - if (has_lock && !xa_empty(&ctx->personalities)) { + if (!xa_empty(&ctx->personalities)) { unsigned long index; const struct cred *cred; @@ -238,7 +224,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } seq_puts(m, "PollList:\n"); - for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) { + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; @@ -247,9 +233,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -262,4 +245,23 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) spin_unlock(&ctx->completion_lock); napi_show_fdinfo(ctx, m); } + +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. + */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); + } +} #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c6209fe44cb1..541e65a1eebf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -448,24 +448,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; @@ -518,7 +500,6 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); @@ -543,8 +524,6 @@ static void io_queue_iowq(struct io_kiocb *req) trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) @@ -869,13 +848,26 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) struct io_ring_ctx *ctx = req->ctx; bool posted; + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. + */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } @@ -1078,21 +1070,22 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync) while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (sync && last_ctx != req->ctx) { + if (last_ctx != req->ctx) { if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } @@ -1718,15 +1711,22 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + static inline int __io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags, const struct io_issue_def *def) { const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1736,8 +1736,12 @@ static inline int __io_issue_sqe(struct io_kiocb *req, if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } return ret; } @@ -1763,7 +1767,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - io_arm_ltimeout(req); /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) @@ -1818,8 +1821,6 @@ void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: @@ -1935,15 +1936,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_defer_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1956,9 +1953,6 @@ static void io_queue_async(struct io_kiocb *req, int ret) case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 098109259671..953d5e742569 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -504,6 +504,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 76fcc79656b0..07f8a5cbd37e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -116,7 +116,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr) void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { - if (ifd.nr_folios == 1) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5e64a8bb30a4..f80a77c4973f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -175,6 +175,18 @@ void io_rsrc_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->imu_cache, kfree); } +static void io_clear_table_tags(struct io_rsrc_data *data) +{ + int i; + + for (i = 0; i < data->nr; i++) { + struct io_rsrc_node *node = data->nodes[i]; + + if (node) + node->tag = 0; + } +} + __cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) { @@ -583,6 +595,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: + io_clear_table_tags(&ctx->file_table.data); io_sqe_files_unregister(ctx); return ret; } @@ -902,8 +915,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, } ctx->buf_table = data; - if (ret) + if (ret) { + io_clear_table_tags(&ctx->buf_table); io_sqe_buffers_unregister(ctx); + } return ret; } @@ -1017,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len, return 0; } +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; + + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); + + if (count < imu->len) { + const struct bio_vec *bvec = iter->bvec; + + while (len > bvec->bv_len) { + len -= bvec->bv_len; + bvec++; + } + iter->nr_segs = 1 + bvec - iter->bvec; + } + return 0; +} + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { + const struct bio_vec *bvec; + size_t folio_mask; + unsigned nr_segs; size_t offset; int ret; @@ -1032,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, if (!(imu->dir & (1 << ddir))) return -EFAULT; - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here for user - * registered nodes, because we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; - - /* - * Kernel buffer bvecs, on the other hand, don't necessarily - * have the size property of user registered ones, so we have - * to use the slow iter advance. - */ - if (offset < bvec->bv_len) { - iter->count -= offset; - iter->iov_offset = offset; - } else if (imu->is_kbuf) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; + if (imu->is_kbuf) + return io_import_kbuf(ddir, iter, imu, len, offset); - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + */ + folio_mask = (1UL << imu->folio_shift) - 1; + bvec = imu->bvec; + if (offset >= bvec->bv_len) { + unsigned long seg_skip; - iter->bvec += seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); - } + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); + bvec += seg_skip; + offset &= folio_mask; } - + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; return 0; } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index d037cc68e9d3..03c699493b5a 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -20,7 +20,7 @@ #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -#define IORING_TW_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 32 enum { IO_SQ_THREAD_SHOULD_STOP = 0, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index a9ea7d29cdd9..430ed620ddfe 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -254,6 +254,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 80d4a6f71d29..fe86606b9f30 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,6 +26,11 @@ #include "zcrx.h" #include "rsrc.h" +static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) +{ + return pp->mp_priv; +} + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, @@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); + area->is_mapped = false; } static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; @@ -181,7 +193,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area) kvfree(area->nia.niovs); kvfree(area->user_refs); if (area->pages) { - unpin_user_pages(area->pages, area->nia.num_niovs); + unpin_user_pages(area->pages, area->nr_folios); kvfree(area->pages); } kfree(area); @@ -192,7 +204,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages; + int i, ret, nr_pages, nr_iovs; struct iovec iov; if (area_reg->flags || area_reg->rq_area_token) @@ -220,27 +232,28 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, area->pages = NULL; goto err; } - area->nia.num_niovs = nr_pages; + area->nr_folios = nr_iovs = nr_pages; + area->nia.num_niovs = nr_iovs; - area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) goto err; - area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), GFP_KERNEL | __GFP_ZERO); if (!area->freelist) goto err; - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_iovs; i++) area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]), + area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) goto err; - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < nr_iovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; niov->owner = &area->nia; @@ -248,7 +261,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, atomic_set(&area->user_refs[i], 0); } - area->free_count = nr_pages; + area->free_count = nr_iovs; area->ifq = ifq; /* we're only supporting one area per ifq for now */ area->area_id = 0; @@ -274,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) ifq->ctx = ctx; spin_lock_init(&ifq->lock); spin_lock_init(&ifq->rq_lock); + mutex_init(&ifq->dma_lock); return ifq; } @@ -323,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) put_device(ifq->dev); io_free_rbuf_ring(ifq); + mutex_destroy(&ifq->dma_lock); kfree(ifq); } @@ -353,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)) || + reg.__resv2 || reg.zcrx_id) return -EINVAL; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) return -EINVAL; @@ -393,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; get_device(ifq->dev); - ret = io_zcrx_map_area(ifq, ifq->area); - if (ret) - goto err; - mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); @@ -584,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); /* pp should already be ensuring that */ if (unlikely(pp->alloc.count)) @@ -616,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + int ret; if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -629,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp) if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + return ret; + percpu_ref_get(&ifq->ctx->refs); return 0; } static void io_pp_zc_destroy(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); struct io_zcrx_area *area = ifq->area; if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) @@ -664,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) struct io_zcrx_ifq *ifq = mp_priv; io_zcrx_drop_netdev(ifq); + if (ifq->area) + io_zcrx_unmap_area(ifq, ifq->area); + p->mp_ops = NULL; p->mp_priv = NULL; } @@ -790,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, niov = netmem_to_net_iov(frag->netmem); if (niov->pp->mp_ops != &io_uring_pp_zc_ops || - niov->pp->mp_priv != ifq) + io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 706cc7300780..f2bc811f022c 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -15,6 +15,7 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; struct page **pages; + unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; @@ -26,17 +27,18 @@ struct io_zcrx_ifq { struct io_ring_ctx *ctx; struct io_zcrx_area *area; + spinlock_t rq_lock ____cacheline_aligned_in_smp; struct io_uring *rq_ring; struct io_uring_zcrx_rqe *rqes; - u32 rq_entries; u32 cached_rq_head; - spinlock_t rq_lock; + u32 rq_entries; u32 if_rxq; struct device *dev; struct net_device *netdev; netdevice_tracker netdev_tracker; spinlock_t lock; + struct mutex dma_lock; }; #if defined(CONFIG_IO_URING_ZCRX) |