summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/uapi/linux/io_uring.h34
-rw-r--r--io_uring/net.c7
-rw-r--r--io_uring/register.c3
-rw-r--r--io_uring/zcrx.c326
-rw-r--r--io_uring/zcrx.h8
5 files changed, 317 insertions, 61 deletions
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e96080db3e4d..deb772222b6d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -697,6 +697,9 @@ enum io_uring_register_op {
/* query various aspects of io_uring, see linux/io_uring/query.h */
IORING_REGISTER_QUERY = 35,
+ /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
+ IORING_REGISTER_ZCRX_CTRL = 36,
+
/* this goes last */
IORING_REGISTER_LAST,
@@ -1060,6 +1063,10 @@ struct io_uring_zcrx_area_reg {
__u64 __resv2[2];
};
+enum zcrx_reg_flags {
+ ZCRX_REG_IMPORT = 1,
+};
+
/*
* Argument for IORING_REGISTER_ZCRX_IFQ
*/
@@ -1078,6 +1085,33 @@ struct io_uring_zcrx_ifq_reg {
__u64 __resv[3];
};
+enum zcrx_ctrl_op {
+ ZCRX_CTRL_FLUSH_RQ,
+ ZCRX_CTRL_EXPORT,
+
+ __ZCRX_CTRL_LAST,
+};
+
+struct zcrx_ctrl_flush_rq {
+ __u64 __resv[6];
+};
+
+struct zcrx_ctrl_export {
+ __u32 zcrx_fd;
+ __u32 __resv1[11];
+};
+
+struct zcrx_ctrl {
+ __u32 zcrx_id;
+ __u32 op; /* see enum zcrx_ctrl_op */
+ __u64 __resv[2];
+
+ union {
+ struct zcrx_ctrl_export zc_export;
+ struct zcrx_ctrl_flush_rq zc_flush;
+ };
+};
+
#ifdef __cplusplus
}
#endif
diff --git a/io_uring/net.c b/io_uring/net.c
index a95cc9ca2a4d..69f901fa3040 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -110,7 +110,6 @@ enum sr_retry_flags {
struct io_recvzc {
struct file *file;
- unsigned msg_flags;
u16 flags;
u32 len;
struct io_zcrx_ifq *ifq;
@@ -1253,8 +1252,7 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
zc->len = READ_ONCE(sqe->len);
zc->flags = READ_ONCE(sqe->ioprio);
- zc->msg_flags = READ_ONCE(sqe->msg_flags);
- if (zc->msg_flags)
+ if (READ_ONCE(sqe->msg_flags))
return -EINVAL;
if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT))
return -EINVAL;
@@ -1283,8 +1281,7 @@ int io_recvzc(struct io_kiocb *req, unsigned int issue_flags)
return -ENOTSOCK;
len = zc->len;
- ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT,
- issue_flags, &zc->len);
+ ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len);
if (len && zc->len == 0) {
io_req_set_res(req, 0, 0);
diff --git a/io_uring/register.c b/io_uring/register.c
index 334a457da3f7..fc66a5364483 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -815,6 +815,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
case IORING_REGISTER_QUERY:
ret = io_query(ctx, arg, nr_args);
break;
+ case IORING_REGISTER_ZCRX_CTRL:
+ ret = io_zcrx_ctrl(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index c57ab332acbd..b99cf2c6670a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
@@ -170,7 +171,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pag
if (folio == last_folio)
continue;
last_folio = folio;
- res += 1UL << folio_order(folio);
+ res += folio_nr_pages(folio);
}
return res;
}
@@ -344,6 +345,13 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+ offsets->head = offsetof(struct io_uring, head);
+ offsets->tail = offsetof(struct io_uring, tail);
+ offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
@@ -355,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
void *ptr;
int ret;
- off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+ io_fill_zcrx_offsets(&reg->offsets);
+ off = reg->offsets.rqes;
size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
if (size > rd->size)
return -EINVAL;
@@ -371,9 +380,6 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
- reg->offsets.head = offsetof(struct io_uring, head);
- reg->offsets.tail = offsetof(struct io_uring, tail);
- reg->offsets.rqes = off;
return 0;
}
@@ -482,6 +488,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
spin_lock_init(&ifq->rq_lock);
mutex_init(&ifq->pp_lock);
refcount_set(&ifq->refs, 1);
+ refcount_set(&ifq->user_refs, 1);
return ifq;
}
@@ -543,6 +550,57 @@ static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
io_zcrx_ifq_free(ifq);
}
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ spin_lock_bh(&area->freelist_lock);
+ area->freelist[area->free_count++] = net_iov_idx(niov);
+ spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ if (!niov->desc.pp) {
+ /* copy fallback allocated niovs */
+ io_zcrx_return_niov_freelist(niov);
+ return;
+ }
+ page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+ struct io_zcrx_area *area = ifq->area;
+ int i;
+
+ if (!area)
+ return;
+
+ /* Reclaim back all buffers given to the user space. */
+ for (i = 0; i < area->nia.num_niovs; i++) {
+ struct net_iov *niov = &area->nia.niovs[i];
+ int nr;
+
+ if (!atomic_read(io_get_user_counter(niov)))
+ continue;
+ nr = atomic_xchg(io_get_user_counter(niov), 0);
+ if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+ io_zcrx_return_niov(niov);
+ }
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+ if (refcount_dec_and_test(&ifq->user_refs)) {
+ io_close_queue(ifq);
+ io_zcrx_scrub(ifq);
+ }
+ io_put_zcrx_ifq(ifq);
+}
+
struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
unsigned int id)
{
@@ -553,6 +611,112 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
return ifq ? &ifq->region : NULL;
}
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+ struct io_zcrx_ifq *ifq = file->private_data;
+
+ if (WARN_ON_ONCE(!ifq))
+ return -EFAULT;
+ zcrx_unregister(ifq);
+ return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+ .owner = THIS_MODULE,
+ .release = zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+ struct zcrx_ctrl *ctrl, void __user *arg)
+{
+ struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+ struct file *file;
+ int fd = -1;
+
+ if (!mem_is_zero(ce, sizeof(*ce)))
+ return -EINVAL;
+ fd = get_unused_fd_flags(O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ ce->zcrx_fd = fd;
+ if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+ put_unused_fd(fd);
+ return -EFAULT;
+ }
+
+ refcount_inc(&ifq->refs);
+ refcount_inc(&ifq->user_refs);
+
+ file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+ ifq, O_CLOEXEC, NULL);
+ if (IS_ERR(file)) {
+ put_unused_fd(fd);
+ zcrx_unregister(ifq);
+ return PTR_ERR(file);
+ }
+
+ fd_install(fd, file);
+ return 0;
+}
+
+static int import_zcrx(struct io_ring_ctx *ctx,
+ struct io_uring_zcrx_ifq_reg __user *arg,
+ struct io_uring_zcrx_ifq_reg *reg)
+{
+ struct io_zcrx_ifq *ifq;
+ struct file *file;
+ int fd, ret;
+ u32 id;
+
+ if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+ return -EINVAL;
+ if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+ return -EINVAL;
+ if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+ return -EINVAL;
+
+ fd = reg->if_idx;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ file = fd_file(f);
+ if (file->f_op != &zcrx_box_fops || !file->private_data)
+ return -EBADF;
+
+ ifq = file->private_data;
+ refcount_inc(&ifq->refs);
+ refcount_inc(&ifq->user_refs);
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+ if (ret)
+ goto err;
+ }
+
+ reg->zcrx_id = id;
+ io_fill_zcrx_offsets(&reg->offsets);
+ if (copy_to_user(arg, reg, sizeof(*reg))) {
+ ret = -EFAULT;
+ goto err_xa_erase;
+ }
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ret = -ENOMEM;
+ if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+ goto err_xa_erase;
+ }
+
+ return 0;
+err_xa_erase:
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->zcrx_ctxs, id);
+err:
+ zcrx_unregister(ifq);
+ return ret;
+}
+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -578,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
return -EINVAL;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
- if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
- return -EFAULT;
if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
reg.__resv2 || reg.zcrx_id)
return -EINVAL;
+ if (reg.flags & ZCRX_REG_IMPORT)
+ return import_zcrx(ctx, arg, &reg);
+ if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+ return -EFAULT;
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
return -EINVAL;
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
@@ -683,48 +849,6 @@ static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
return &area->nia.niovs[niov_idx];
}
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- spin_lock_bh(&area->freelist_lock);
- area->freelist[area->free_count++] = net_iov_idx(niov);
- spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
- netmem_ref netmem = net_iov_to_netmem(niov);
-
- if (!niov->pp) {
- /* copy fallback allocated niovs */
- io_zcrx_return_niov_freelist(niov);
- return;
- }
- page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
- struct io_zcrx_area *area = ifq->area;
- int i;
-
- if (!area)
- return;
-
- /* Reclaim back all buffers given to the user space. */
- for (i = 0; i < area->nia.num_niovs; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- int nr;
-
- if (!atomic_read(io_get_user_counter(niov)))
- continue;
- nr = atomic_xchg(io_get_user_counter(niov), 0);
- if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
- io_zcrx_return_niov(niov);
- }
-}
-
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
struct io_zcrx_ifq *ifq;
@@ -741,10 +865,7 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
}
if (!ifq)
break;
-
- io_close_queue(ifq);
- io_zcrx_scrub(ifq);
- io_put_zcrx_ifq(ifq);
+ zcrx_unregister(ifq);
}
xa_destroy(&ctx->zcrx_ctxs);
@@ -815,7 +936,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
if (!page_pool_unref_and_test(netmem))
continue;
- if (unlikely(niov->pp != pp)) {
+ if (unlikely(niov->desc.pp != pp)) {
io_zcrx_return_niov(niov);
continue;
}
@@ -941,6 +1062,97 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
.uninstall = io_pp_uninstall,
};
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+ struct io_zcrx_ifq *zcrx)
+{
+ unsigned int mask = zcrx->rq_entries - 1;
+ unsigned int i;
+
+ guard(spinlock_bh)(&zcrx->rq_lock);
+
+ nr = min(nr, io_zcrx_rqring_entries(zcrx));
+ for (i = 0; i < nr; i++) {
+ struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+ struct net_iov *niov;
+
+ if (!io_parse_rqe(rqe, zcrx, &niov))
+ break;
+ netmem_array[i] = net_iov_to_netmem(niov);
+ }
+
+ smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+ return i;
+}
+
+#define ZCRX_FLUSH_BATCH 32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+ unsigned i;
+
+ for (i = 0; i < nr; i++) {
+ netmem_ref netmem = netmems[i];
+ struct net_iov *niov = netmem_to_net_iov(netmem);
+
+ if (!io_zcrx_put_niov_uref(niov))
+ continue;
+ if (!page_pool_unref_and_test(netmem))
+ continue;
+ io_zcrx_return_niov(niov);
+ }
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+ struct zcrx_ctrl *ctrl)
+{
+ struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+ netmem_ref netmems[ZCRX_FLUSH_BATCH];
+ unsigned total = 0;
+ unsigned nr;
+
+ if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+ return -EINVAL;
+
+ do {
+ nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+ zcrx_return_buffers(netmems, nr);
+ total += nr;
+
+ if (fatal_signal_pending(current))
+ break;
+ cond_resched();
+ } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+ return 0;
+}
+
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+ struct zcrx_ctrl ctrl;
+ struct io_zcrx_ifq *zcrx;
+
+ if (nr_args)
+ return -EINVAL;
+ if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+ return -EFAULT;
+ if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+ return -EFAULT;
+
+ zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+ if (!zcrx)
+ return -ENXIO;
+
+ switch (ctrl.op) {
+ case ZCRX_CTRL_FLUSH_RQ:
+ return zcrx_flush_rq(ctx, zcrx, &ctrl);
+ case ZCRX_CTRL_EXPORT:
+ return zcrx_export(ctx, zcrx, &ctrl, arg);
+ }
+
+ return -EOPNOTSUPP;
+}
+
static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
struct io_zcrx_ifq *ifq, int off, int len)
{
@@ -1082,13 +1294,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
const skb_frag_t *frag, int off, int len)
{
struct net_iov *niov;
+ struct page_pool *pp;
if (unlikely(!skb_frag_is_net_iov(frag)))
return io_zcrx_copy_frag(req, ifq, frag, off, len);
niov = netmem_to_net_iov(frag->netmem);
- if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
- io_pp_to_ifq(niov->pp) != ifq)
+ pp = niov->desc.pp;
+
+ if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
return -EFAULT;
if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index c9b9bfae0547..32ab95b2cb81 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -55,6 +55,8 @@ struct io_zcrx_ifq {
struct net_device *netdev;
netdevice_tracker netdev_tracker;
refcount_t refs;
+ /* counts userspace facing users like io_uring */
+ refcount_t user_refs;
/*
* Page pool and net configuration lock, can be taken deeper in the
@@ -65,6 +67,7 @@ struct io_zcrx_ifq {
};
#if defined(CONFIG_IO_URING_ZCRX)
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg);
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg);
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
@@ -93,6 +96,11 @@ static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ct
{
return NULL;
}
+static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned nr_arg)
+{
+ return -EOPNOTSUPP;
+}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);