 include/uapi/linux/io_uring.h |  34 ++++
 io_uring/net.c                |   7 +-
 io_uring/register.c           |   3 +
 io_uring/zcrx.c               | 326 ++++++++++++++++++++++++++++++----------
 io_uring/zcrx.h               |   8 ++
 5 files changed, 317 insertions(+), 61 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..deb772222b6d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
 	/* query various aspects of io_uring, see linux/io_uring/query.h */
 	IORING_REGISTER_QUERY			= 35,
 
+	/* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+	IORING_REGISTER_ZCRX_CTRL		= 36,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -1060,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
 	__u64	__resv2[2];
 };
 
+enum zcrx_reg_flags {
+	ZCRX_REG_IMPORT		= 1,
+};
+
 /*
  * Argument for IORING_REGISTER_ZCRX_IFQ
  */
@@ -1078,6 +1085,33 @@ struct io_uring_zcrx_ifq_reg {
 	__u64	__resv[3];
 };
 
+enum zcrx_ctrl_op {
+	ZCRX_CTRL_FLUSH_RQ,
+	ZCRX_CTRL_EXPORT,
+
+	__ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl_flush_rq {
+	__u64	__resv[6];
+};
+
+struct zcrx_ctrl_export {
+	__u32	zcrx_fd;
+	__u32	__resv1[11];
+};
+
+struct zcrx_ctrl {
+	__u32	zcrx_id;
+	__u32	op; /* see enum zcrx_ctrl_op */
+	__u64	__resv[2];
+
+	union {
+		struct zcrx_ctrl_export		zc_export;
+		struct zcrx_ctrl_flush_rq	zc_flush;
+	};
+};
+
 #ifdef __cplusplus
 }
 #endif
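The control interface takes a fixed-size struct zcrx_ctrl with all reserved bytes zeroed, and io_zcrx_ctrl() below rejects any non-zero nr_args. A minimal userspace sketch of issuing ZCRX_CTRL_FLUSH_RQ, assuming only the uapi additions above; the raw syscall is open-coded since liburing support is not part of this patch:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch only: flush the refill ring of one zcrx context. */
static int zcrx_ctrl_flush(int ring_fd, __u32 zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));	/* reserved fields must be zero */
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_FLUSH_RQ;

	/* nr_args must be 0 for IORING_REGISTER_ZCRX_CTRL */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
}
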
diff --git a/io_uring/net.c b/io_uring/net.c
index a95cc9ca2a4d..69f901fa3040 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -110,7 +110,6 @@ enum sr_retry_flags {
 
 struct io_recvzc {
 	struct file			*file;
-	unsigned			msg_flags;
 	u16				flags;
 	u32				len;
 	struct io_zcrx_ifq		*ifq;
@@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	zc->len = READ_ONCE(sqe->len);
 	zc->flags = READ_ONCE(sqe->ioprio);
-	zc->msg_flags = READ_ONCE(sqe->msg_flags);
-	if (zc->msg_flags)
+	if (READ_ONCE(sqe->msg_flags))
 		return -EINVAL;
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
 		return -EINVAL;
@@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
 		return -ENOTSOCK;
 
 	len = zc->len;
-	ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
-			   issue_flags, &zc->len);
+	ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
 
 	if (len && zc->len == 0) {
 		io_req_set_res(req, 0, 0);
diff --git a/io_uring/register.c b/io_uring/register.c
index 334a457da3f7..fc66a5364483 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	case IORING_REGISTER_QUERY:
 		ret = io_query(ctx, arg, nr_args);
 		break;
+	case IORING_REGISTER_ZCRX_CTRL:
+		ret = io_zcrx_ctrl(ctx, arg, nr_args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index c57ab332acbd..b99cf2c6670a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
 
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
@@ -170,7 +171,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages)
 		if (folio == last_folio)
 			continue;
 		last_folio = folio;
-		res += 1UL << folio_order(folio);
+		res += folio_nr_pages(folio);
 	}
 	return res;
 }
@@ -344,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 	atomic_inc(io_get_user_counter(niov));
}
 
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+	offsets->head = offsetof(struct io_uring, head);
+	offsets->tail = offsetof(struct io_uring, tail);
+	offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
 static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 				 struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
@@ -355,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	void *ptr;
 	int ret;
 
-	off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+	io_fill_zcrx_offsets(&reg->offsets);
+	off = reg->offsets.rqes;
 	size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
 	if (size > rd->size)
 		return -EINVAL;
@@ -371,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
-	reg->offsets.head = offsetof(struct io_uring, head);
-	reg->offsets.tail = offsetof(struct io_uring, tail);
-	reg->offsets.rqes = off;
 	return 0;
 }
 
@@ -482,6 +488,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	spin_lock_init(&ifq->rq_lock);
 	mutex_init(&ifq->pp_lock);
 	refcount_set(&ifq->refs, 1);
+	refcount_set(&ifq->user_refs, 1);
 	return ifq;
 }
 
@@ -543,6 +550,57 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
 		io_zcrx_ifq_free(ifq);
 }
 
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	spin_lock_bh(&area->freelist_lock);
+	area->freelist[area->free_count++] = net_iov_idx(niov);
+	spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+	netmem_ref netmem = net_iov_to_netmem(niov);
+
+	if (!niov->desc.pp) {
+		/* copy fallback allocated niovs */
+		io_zcrx_return_niov_freelist(niov);
+		return;
+	}
+	page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+	struct io_zcrx_area *area = ifq->area;
+	int i;
+
+	if (!area)
+		return;
+
+	/* Reclaim back all buffers given to the user space. */
+	for (i = 0; i < area->nia.num_niovs; i++) {
+		struct net_iov *niov = &area->nia.niovs[i];
+		int nr;
+
+		if (!atomic_read(io_get_user_counter(niov)))
+			continue;
+		nr = atomic_xchg(io_get_user_counter(niov), 0);
+		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+			io_zcrx_return_niov(niov);
+	}
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+	if (refcount_dec_and_test(&ifq->user_refs)) {
+		io_close_queue(ifq);
+		io_zcrx_scrub(ifq);
+	}
+	io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
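zcrx_unregister() splits teardown across the two counters: user_refs counts userspace-visible handles (the owning ring and, later in this patch, every exported file), while refs pins the object itself. Dropping the last user reference closes the queue and scrubs buffers still held by userspace; only the last plain reference frees the ifq. A freestanding sketch of the same pattern, with hypothetical obj_* names (the kernel code uses refcount_t, not C11 atomics):

#include <stdatomic.h>

struct obj {
	atomic_int refs;	/* pins the memory */
	atomic_int user_refs;	/* userspace-visible handles */
};

void obj_quiesce(struct obj *o);	/* close queue, reclaim buffers */
void obj_free(struct obj *o);		/* release the memory */

/* Mirror of zcrx_unregister(): every user handle also holds a plain
 * reference, so quiescing always happens before the final free. */
static void obj_put_user(struct obj *o)
{
	if (atomic_fetch_sub(&o->user_refs, 1) == 1)
		obj_quiesce(o);
	if (atomic_fetch_sub(&o->refs, 1) == 1)
		obj_free(o);
}
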
@@ -553,6 +611,112 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 	return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+	struct io_zcrx_ifq *ifq = file->private_data;
+
+	if (WARN_ON_ONCE(!ifq))
+		return -EFAULT;
+	zcrx_unregister(ifq);
+	return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+	.owner = THIS_MODULE,
+	.release = zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+		       struct zcrx_ctrl *ctrl, void __user *arg)
+{
+	struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+	struct file *file;
+	int fd = -1;
+
+	if (!mem_is_zero(ce, sizeof(*ce)))
+		return -EINVAL;
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	ce->zcrx_fd = fd;
+	if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+		put_unused_fd(fd);
+		return -EFAULT;
+	}
+
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+					 ifq, O_CLOEXEC, NULL);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		zcrx_unregister(ifq);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	return 0;
+}
+
+static int import_zcrx(struct io_ring_ctx *ctx,
+		       struct io_uring_zcrx_ifq_reg __user *arg,
+		       struct io_uring_zcrx_ifq_reg *reg)
+{
+	struct io_zcrx_ifq *ifq;
+	struct file *file;
+	int fd, ret;
+	u32 id;
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EINVAL;
+	if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+		return -EINVAL;
+	if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+		return -EINVAL;
+
+	fd = reg->if_idx;
+	CLASS(fd, f)(fd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	file = fd_file(f);
+	if (file->f_op != &zcrx_box_fops || !file->private_data)
+		return -EBADF;
+
+	ifq = file->private_data;
+	refcount_inc(&ifq->refs);
+	refcount_inc(&ifq->user_refs);
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto err;
+	}
+
+	reg->zcrx_id = id;
+	io_fill_zcrx_offsets(&reg->offsets);
+	if (copy_to_user(arg, reg, sizeof(*reg))) {
+		ret = -EFAULT;
+		goto err_xa_erase;
+	}
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err_xa_erase;
+	}
+
+	return 0;
+err_xa_erase:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+err:
+	zcrx_unregister(ifq);
+	return ret;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -578,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		return -EINVAL;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
-	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-		return -EFAULT;
 	if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
 	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
+	if (reg.flags & ZCRX_REG_IMPORT)
+		return import_zcrx(ctx, arg, &reg);
+	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+		return -EFAULT;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
 	if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
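Taken together, ZCRX_CTRL_EXPORT and ZCRX_REG_IMPORT let a second ring attach to an already configured queue: the owner wraps the ifq in an anonymous fd, hands it over (e.g. via SCM_RIGHTS), and the importer passes that fd in if_idx with every other registration field zeroed. A hedged sketch of the flow, reusing the includes and syscall style of the first example; error handling trimmed:

/* Sketch: share one zcrx ifq between two rings. The importing ring
 * must be created with IORING_SETUP_DEFER_TASKRUN and CQE32 or
 * CQE_MIXED, matching the checks in import_zcrx(). */
static int zcrx_share(int owner_fd, __u32 zcrx_id,
		      int importer_fd, __u32 *imported_id)
{
	struct io_uring_zcrx_ifq_reg reg;
	struct zcrx_ctrl ctrl;
	int ret;

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;
	ret = syscall(__NR_io_uring_register, owner_fd,
		      IORING_REGISTER_ZCRX_CTRL, &ctrl, 0);
	if (ret)
		return ret;

	memset(&reg, 0, sizeof(reg));
	reg.flags = ZCRX_REG_IMPORT;
	reg.if_idx = ctrl.zc_export.zcrx_fd;	/* an fd here, not an ifindex */
	ret = syscall(__NR_io_uring_register, importer_fd,
		      IORING_REGISTER_ZCRX_IFQ, &reg, 1);
	if (!ret)
		*imported_id = reg.zcrx_id;	/* id on the importing ring */

	close(ctrl.zc_export.zcrx_fd);	/* ifq stays pinned by the import */
	return ret;
}
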
@@ -683,48 +849,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
 	return &area->nia.niovs[niov_idx];
 }
 
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	spin_lock_bh(&area->freelist_lock);
-	area->freelist[area->free_count++] = net_iov_idx(niov);
-	spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-	netmem_ref netmem = net_iov_to_netmem(niov);
-
-	if (!niov->pp) {
-		/* copy fallback allocated niovs */
-		io_zcrx_return_niov_freelist(niov);
-		return;
-	}
-	page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-	struct io_zcrx_area *area = ifq->area;
-	int i;
-
-	if (!area)
-		return;
-
-	/* Reclaim back all buffers given to the user space. */
-	for (i = 0; i < area->nia.num_niovs; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		int nr;
-
-		if (!atomic_read(io_get_user_counter(niov)))
-			continue;
-		nr = atomic_xchg(io_get_user_counter(niov), 0);
-		if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-			io_zcrx_return_niov(niov);
-	}
-}
-
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
 	struct io_zcrx_ifq *ifq;
@@ -741,10 +865,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 		}
 		if (!ifq)
 			break;
-
-		io_close_queue(ifq);
-		io_zcrx_scrub(ifq);
-		io_put_zcrx_ifq(ifq);
+		zcrx_unregister(ifq);
 	}
 
 	xa_destroy(&ctx->zcrx_ctxs);
@@ -815,7 +936,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
 		if (!page_pool_unref_and_test(netmem))
 			continue;
 
-		if (unlikely(niov->pp != pp)) {
+		if (unlikely(niov->desc.pp != pp)) {
 			io_zcrx_return_niov(niov);
 			continue;
 		}
@@ -941,6 +1062,97 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
 	.uninstall		= io_pp_uninstall,
 };
 
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+			      struct io_zcrx_ifq *zcrx)
+{
+	unsigned int mask = zcrx->rq_entries - 1;
+	unsigned int i;
+
+	guard(spinlock_bh)(&zcrx->rq_lock);
+
+	nr = min(nr, io_zcrx_rqring_entries(zcrx));
+	for (i = 0; i < nr; i++) {
+		struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+		struct net_iov *niov;
+
+		if (!io_parse_rqe(rqe, zcrx, &niov))
+			break;
+		netmem_array[i] = net_iov_to_netmem(niov);
+	}
+
+	smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+	return i;
+}
+
+#define ZCRX_FLUSH_BATCH	32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+	unsigned i;
+
+	for (i = 0; i < nr; i++) {
+		netmem_ref netmem = netmems[i];
+		struct net_iov *niov = netmem_to_net_iov(netmem);
+
+		if (!io_zcrx_put_niov_uref(niov))
+			continue;
+		if (!page_pool_unref_and_test(netmem))
+			continue;
+		io_zcrx_return_niov(niov);
+	}
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+			 struct zcrx_ctrl *ctrl)
+{
+	struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+	netmem_ref netmems[ZCRX_FLUSH_BATCH];
+	unsigned total = 0;
+	unsigned nr;
+
+	if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+		return -EINVAL;
+
+	do {
+		nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+		zcrx_return_buffers(netmems, nr);
+		total += nr;
+
+		if (fatal_signal_pending(current))
+			break;
+		cond_resched();
+	} while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+	return 0;
+}
+
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+	struct zcrx_ctrl ctrl;
+	struct io_zcrx_ifq *zcrx;
+
+	if (nr_args)
+		return -EINVAL;
+	if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+		return -EFAULT;
+	if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+		return -EFAULT;
+
+	zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+	if (!zcrx)
+		return -ENXIO;
+
+	switch (ctrl.op) {
+	case ZCRX_CTRL_FLUSH_RQ:
+		return zcrx_flush_rq(ctx, zcrx, &ctrl);
+	case ZCRX_CTRL_EXPORT:
+		return zcrx_export(ctx, zcrx, &ctrl, arg);
+	}
+
+	return -EOPNOTSUPP;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
 			      struct io_zcrx_ifq *ifq, int off,
 			      int len)
 {
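ZCRX_CTRL_FLUSH_RQ drains only what userspace has already published: zcrx_parse_rq() consumes up to ZCRX_FLUSH_BATCH entries per pass under rq_lock, and the loop caps the total at rq_entries. On the user side, returning a buffer means writing an rqe and advancing the ring tail with a release store, roughly as below; the layout comes from reg.offsets, and the zcrx_rq wrapper is illustrative:

#include <stdatomic.h>

struct zcrx_rq {
	_Atomic __u32 *ktail;			/* region + reg.offsets.tail */
	struct io_uring_zcrx_rqe *rqes;		/* region + reg.offsets.rqes */
	__u32 mask;				/* rq_entries - 1 */
};

/* Publish one buffer back to the kernel, then reclaim it synchronously. */
static void zcrx_recycle(struct zcrx_rq *rq, int ring_fd,
			 __u32 zcrx_id, __u64 area_off, __u32 len)
{
	__u32 tail = atomic_load_explicit(rq->ktail, memory_order_relaxed);
	struct io_uring_zcrx_rqe *rqe = &rq->rqes[tail & rq->mask];

	rqe->off = area_off;	/* area id lives in the high bits of off */
	rqe->len = len;
	/* Order the rqe writes before the tail update the kernel reads. */
	atomic_store_explicit(rq->ktail, tail + 1, memory_order_release);

	zcrx_ctrl_flush(ring_fd, zcrx_id);	/* helper from the first sketch */
}
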
@@ -1082,13 +1294,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 			     const skb_frag_t *frag, int off, int len)
 {
 	struct net_iov *niov;
+	struct page_pool *pp;
 
 	if (unlikely(!skb_frag_is_net_iov(frag)))
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
 	niov = netmem_to_net_iov(frag->netmem);
-	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    io_pp_to_ifq(niov->pp) != ifq)
+	pp = niov->desc.pp;
+
+	if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
 		return -EFAULT;
 
 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index c9b9bfae0547..32ab95b2cb81 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -55,6 +55,8 @@ struct io_zcrx_ifq {
 	struct net_device		*netdev;
 	netdevice_tracker		netdev_tracker;
 	refcount_t			refs;
+	/* counts userspace facing users like io_uring */
+	refcount_t			user_refs;
 
 	/*
 	 * Page pool and net configuration lock, can be taken deeper in the
@@ -65,6 +67,7 @@ struct io_zcrx_ifq {
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg);
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +96,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 {
 	return NULL;
 }
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+			       void __user *arg, unsigned nr_arg)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
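import_zcrx() insists on CQE32 or CQE_MIXED because recvzc completions on the importing ring carry the buffer location in the extended CQE. Decoding follows the existing zcrx uapi (struct io_uring_zcrx_cqe and IORING_ZCRX_AREA_MASK, not shown in this diff); a consumer-side sketch:

/* Sketch: translate a recvzc big-CQE into an address inside the
 * mmap()ed data area registered by the exporting ring. */
static void *zcrx_cqe_to_addr(const struct io_uring_cqe *cqe, void *area_base)
{
	const struct io_uring_zcrx_cqe *rcqe =
		(const struct io_uring_zcrx_cqe *)cqe->big_cqe;
	__u64 off = rcqe->off & ~IORING_ZCRX_AREA_MASK;

	return (char *)area_base + off;
}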
