diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-04-18 09:13:52 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-04-18 09:13:52 -0700 |
commit | b1011b2b451c8b6d16be6b07f44f22a0a0dd7158 (patch) | |
tree | 643146e9111d7f8bd0f717bcbe5af3938be711f7 | |
parent | fc96b232f8e7c0a6c282f47726b2ff6a5fb341d2 (diff) | |
parent | f12ecf5e1c5eca48b8652e893afcdb730384a6aa (diff) |
Merge tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux
Pull io_uring fixes from Jens Axboe:
- Correctly cap iov_iter->nr_segs for imports of registered buffers,
both kbuf and normal ones.
Three cleanups to make it saner first, then two fixes for each of the
buffer types.
This fixes a performance regression where partial buffer usage
doesn't trim the tail number of segments, leading the block layer to
iterate the IOs to check if it needs splitting.
- Two patches tweaking the newly introduced zero-copy rx API, mostly to
keep the API consistent once we add multiple interface queues per
ring support in the 6.16 release.
- zc rx unmapping fix for a dead device
* tag 'io_uring-6.15-20250418' of git://git.kernel.dk/linux:
io_uring/zcrx: fix late dma unmap for a dead dev
io_uring/rsrc: ensure segments counts are correct on kbuf buffers
io_uring/rsrc: send exact nr_segs for fixed buffer
io_uring/rsrc: refactor io_import_fixed
io_uring/rsrc: separate kbuf offset adjustments
io_uring/rsrc: don't skip offset calculation
io_uring/zcrx: add pp to ifq conversion helper
io_uring/zcrx: return ifq id to the user
-rw-r--r-- | include/uapi/linux/io_uring.h | 4 | ||||
-rw-r--r-- | io_uring/rsrc.c | 92 | ||||
-rw-r--r-- | io_uring/zcrx.c | 37 | ||||
-rw-r--r-- | io_uring/zcrx.h | 1 |
4 files changed, 79 insertions, 55 deletions
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ed2beb4def3f..8f1fc12bac46 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1010,7 +1010,9 @@ struct io_uring_zcrx_ifq_reg { __u64 region_ptr; /* struct io_uring_region_desc * */ struct io_uring_zcrx_offsets offsets; - __u64 __resv[4]; + __u32 zcrx_id; + __u32 __resv2; + __u64 __resv[3]; }; #ifdef __cplusplus diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b36c8825550e..f80a77c4973f 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1032,10 +1032,33 @@ static int validate_fixed_range(u64 buf_addr, size_t len, return 0; } +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; + + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); + + if (count < imu->len) { + const struct bio_vec *bvec = iter->bvec; + + while (len > bvec->bv_len) { + len -= bvec->bv_len; + bvec++; + } + iter->nr_segs = 1 + bvec - iter->bvec; + } + return 0; +} + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { + const struct bio_vec *bvec; + size_t folio_mask; + unsigned nr_segs; size_t offset; int ret; @@ -1047,56 +1070,35 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, if (!(imu->dir & (1 << ddir))) return -EFAULT; - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here for user - * registered nodes, because we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; - - /* - * Kernel buffer bvecs, on the other hand, don't necessarily - * have the size property of user registered ones, so we have - * to use the slow iter advance. - */ - if (offset < bvec->bv_len) { - iter->count -= offset; - iter->iov_offset = offset; - } else if (imu->is_kbuf) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; + if (imu->is_kbuf) + return io_import_kbuf(ddir, iter, imu, len, offset); - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + */ + folio_mask = (1UL << imu->folio_shift) - 1; + bvec = imu->bvec; + if (offset >= bvec->bv_len) { + unsigned long seg_skip; - iter->bvec += seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); - } + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); + bvec += seg_skip; + offset &= folio_mask; } - + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; return 0; } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index 0f46e0404c04..fe86606b9f30 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,6 +26,11 @@ #include "zcrx.h" #include "rsrc.h" +static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) +{ + return pp->mp_priv; +} + #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, @@ -46,14 +51,21 @@ static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); + area->is_mapped = false; } static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) { int i; + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + for (i = 0; i < area->nia.num_niovs; i++) { struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; @@ -275,6 +287,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) ifq->ctx = ctx; spin_lock_init(&ifq->lock); spin_lock_init(&ifq->rq_lock); + mutex_init(&ifq->dma_lock); return ifq; } @@ -324,6 +337,7 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) put_device(ifq->dev); io_free_rbuf_ring(ifq); + mutex_destroy(&ifq->dma_lock); kfree(ifq); } @@ -354,7 +368,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) return -EFAULT; - if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)) || + reg.__resv2 || reg.zcrx_id) return -EINVAL; if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) return -EINVAL; @@ -394,10 +409,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, goto err; get_device(ifq->dev); - ret = io_zcrx_map_area(ifq, ifq->area); - if (ret) - goto err; - mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); @@ -585,7 +596,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); /* pp should already be ensuring that */ if (unlikely(pp->alloc.count)) @@ -617,7 +628,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) static int io_pp_zc_init(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + int ret; if (WARN_ON_ONCE(!ifq)) return -EINVAL; @@ -630,13 +642,17 @@ static int io_pp_zc_init(struct page_pool *pp) if (pp->p.dma_dir != DMA_FROM_DEVICE) return -EOPNOTSUPP; + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + return ret; + percpu_ref_get(&ifq->ctx->refs); return 0; } static void io_pp_zc_destroy(struct page_pool *pp) { - struct io_zcrx_ifq *ifq = pp->mp_priv; + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); struct io_zcrx_area *area = ifq->area; if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) @@ -665,6 +681,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) struct io_zcrx_ifq *ifq = mp_priv; io_zcrx_drop_netdev(ifq); + if (ifq->area) + io_zcrx_unmap_area(ifq, ifq->area); + p->mp_ops = NULL; p->mp_priv = NULL; } @@ -791,7 +810,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, niov = netmem_to_net_iov(frag->netmem); if (niov->pp->mp_ops != &io_uring_pp_zc_ops || - niov->pp->mp_priv != ifq) + io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index 47f1c0e8c197..f2bc811f022c 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -38,6 +38,7 @@ struct io_zcrx_ifq { struct net_device *netdev; netdevice_tracker netdev_tracker; spinlock_t lock; + struct mutex dma_lock; }; #if defined(CONFIG_IO_URING_ZCRX) |