diff options
-rw-r--r-- | Documentation/networking/index.rst | 1 | ||||
-rw-r--r-- | Documentation/networking/iou-zcrx.rst | 202 | ||||
-rw-r--r-- | Kconfig | 2 | ||||
-rw-r--r-- | include/linux/io_uring_types.h | 6 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 54 | ||||
-rw-r--r-- | io_uring/Kconfig | 10 | ||||
-rw-r--r-- | io_uring/Makefile | 1 | ||||
-rw-r--r-- | io_uring/io_uring.c | 7 | ||||
-rw-r--r-- | io_uring/io_uring.h | 10 | ||||
-rw-r--r-- | io_uring/memmap.c | 2 | ||||
-rw-r--r-- | io_uring/memmap.h | 1 | ||||
-rw-r--r-- | io_uring/net.c | 84 | ||||
-rw-r--r-- | io_uring/opdef.c | 16 | ||||
-rw-r--r-- | io_uring/register.c | 7 | ||||
-rw-r--r-- | io_uring/rsrc.c | 2 | ||||
-rw-r--r-- | io_uring/rsrc.h | 1 | ||||
-rw-r--r-- | io_uring/zcrx.c | 960 | ||||
-rw-r--r-- | io_uring/zcrx.h | 73 | ||||
-rw-r--r-- | tools/testing/selftests/drivers/net/hw/.gitignore | 2 | ||||
-rw-r--r-- | tools/testing/selftests/drivers/net/hw/Makefile | 5 | ||||
-rw-r--r-- | tools/testing/selftests/drivers/net/hw/iou-zcrx.c | 457 | ||||
-rwxr-xr-x | tools/testing/selftests/drivers/net/hw/iou-zcrx.py | 87 |
22 files changed, 1988 insertions, 2 deletions
diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 058193ed2eeb..c64133d309bf 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -63,6 +63,7 @@ Contents: gtp ila ioam6-sysctl + iou-zcrx ip_dynaddr ipsec ip-sysctl diff --git a/Documentation/networking/iou-zcrx.rst b/Documentation/networking/iou-zcrx.rst new file mode 100644 index 000000000000..0127319b30bb --- /dev/null +++ b/Documentation/networking/iou-zcrx.rst @@ -0,0 +1,202 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +io_uring zero copy Rx +===================== + +Introduction +============ + +io_uring zero copy Rx (ZC Rx) is a feature that removes kernel-to-user copy on +the network receive path, allowing packet data to be received directly into +userspace memory. This feature is different to TCP_ZEROCOPY_RECEIVE in that +there are no strict alignment requirements and no need to mmap()/munmap(). +Compared to kernel bypass solutions such as e.g. DPDK, the packet headers are +processed by the kernel TCP stack as normal. + +NIC HW Requirements +=================== + +Several NIC HW features are required for io_uring ZC Rx to work. For now the +kernel API does not configure the NIC and it must be done by the user. + +Header/data split +----------------- + +Required to split packets at the L4 boundary into a header and a payload. +Headers are received into kernel memory as normal and processed by the TCP +stack as normal. Payloads are received into userspace memory directly. + +Flow steering +------------- + +Specific HW Rx queues are configured for this feature, but modern NICs +typically distribute flows across all HW Rx queues. Flow steering is required +to ensure that only desired flows are directed towards HW queues that are +configured for io_uring ZC Rx. 
+ +RSS +--- + +In addition to flow steering above, RSS is required to steer all other non-zero +copy flows away from queues that are configured for io_uring ZC Rx. + +Usage +===== + +Setup NIC +--------- + +Must be done out of band for now. + +Ensure there are at least two queues:: + + ethtool -L eth0 combined 2 + +Enable header/data split:: + + ethtool -G eth0 tcp-data-split on + +Carve out half of the HW Rx queues for zero copy using RSS:: + + ethtool -X eth0 equal 1 + +Set up flow steering, bearing in mind that queues are 0-indexed:: + + ethtool -N eth0 flow-type tcp6 ... action 1 + +Setup io_uring +-------------- + +This section describes the low level io_uring kernel API. Please refer to +liburing documentation for how to use the higher level API. + +Create an io_uring instance with the following required setup flags:: + + IORING_SETUP_SINGLE_ISSUER + IORING_SETUP_DEFER_TASKRUN + IORING_SETUP_CQE32 + +Create memory area +------------------ + +Allocate userspace memory area for receiving zero copy data:: + + void *area_ptr = mmap(NULL, area_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, 0); + +Create refill ring +------------------ + +Allocate memory for a shared ringbuf used for returning consumed buffers:: + + void *ring_ptr = mmap(NULL, ring_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, 0); + +This refill ring consists of some space for the header, followed by an array of +``struct io_uring_zcrx_rqe``:: + + size_t rq_entries = 4096; + size_t ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe) + PAGE_SIZE; + /* align to page size */ + ring_size = (ring_size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); + +Register ZC Rx +-------------- + +Fill in registration structs:: + + struct io_uring_zcrx_area_reg area_reg = { + .addr = (__u64)(unsigned long)area_ptr, + .len = area_size, + .flags = 0, + }; + + struct io_uring_region_desc region_reg = { + .user_addr = (__u64)(unsigned long)ring_ptr, + .size = ring_size, + .flags = 
IORING_MEM_REGION_TYPE_USER, + }; + + struct io_uring_zcrx_ifq_reg reg = { + .if_idx = if_nametoindex("eth0"), + /* this is the HW queue with desired flow steered into it */ + .if_rxq = 1, + .rq_entries = rq_entries, + .area_ptr = (__u64)(unsigned long)&area_reg, + .region_ptr = (__u64)(unsigned long)&region_reg, + }; + +Register with kernel:: + + io_uring_register_ifq(ring, &reg); + +Map refill ring +--------------- + +The kernel fills in fields for the refill ring in the registration ``struct +io_uring_zcrx_ifq_reg``. Map it into userspace:: + + struct io_uring_zcrx_rq refill_ring; + + refill_ring.khead = (unsigned *)((char *)ring_ptr + reg.offsets.head); + refill_ring.ktail = (unsigned *)((char *)ring_ptr + reg.offsets.tail); + refill_ring.rqes = + (struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); + refill_ring.rq_tail = 0; + refill_ring.ring_ptr = ring_ptr; + +Receiving data +-------------- + +Prepare a zero copy recv request:: + + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, fd, NULL, 0, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + +Now, submit and wait:: + + io_uring_submit_and_wait(ring, 1); + +Finally, process completions:: + + struct io_uring_cqe *cqe; + unsigned int count = 0; + unsigned int head; + + io_uring_for_each_cqe(ring, head, cqe) { + struct io_uring_zcrx_cqe *rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + + unsigned long mask = (1ULL << IORING_ZCRX_AREA_SHIFT) - 1; + unsigned char *data = area_ptr + (rcqe->off & mask); + /* do something with the data */ + + count++; + } + io_uring_cq_advance(ring, count); + +Recycling buffers +----------------- + +Return buffers back to the kernel to be used again:: + + struct io_uring_zcrx_rqe *rqe; + unsigned mask = refill_ring.ring_entries - 1; + rqe = &refill_ring.rqes[refill_ring.rq_tail & mask]; + + unsigned long area_offset = rcqe->off & ~IORING_ZCRX_AREA_MASK; + rqe->off = area_offset | area_reg.rq_area_token; + rqe->len = cqe->res; + 
IO_URING_WRITE_ONCE(*refill_ring.ktail, ++refill_ring.rq_tail); + +Testing +======= + +See ``tools/testing/selftests/drivers/net/hw/iou-zcrx.c`` @@ -30,3 +30,5 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +source "io_uring/Kconfig" diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 72aac84dca93..35fc241c4672 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -40,6 +40,8 @@ enum io_uring_cmd_flags { IO_URING_F_TASK_DEAD = (1 << 13), }; +struct io_zcrx_ifq; + struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -384,6 +386,8 @@ struct io_ring_ctx { struct wait_queue_head poll_wq; struct io_restriction restrictions; + struct io_zcrx_ifq *ifq; + u32 pers_next; struct xarray personalities; @@ -436,6 +440,8 @@ struct io_ring_ctx { struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; + /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */ + struct io_mapped_region zcrx_region; }; /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0d6c83c8d1cf..6a446e338162 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -87,6 +87,7 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + __u32 zcrx_ifq_idx; __u32 optlen; struct { __u16 addr_len; @@ -278,6 +279,7 @@ enum io_uring_op { IORING_OP_FTRUNCATE, IORING_OP_BIND, IORING_OP_LISTEN, + IORING_OP_RECV_ZC, /* this goes last, obviously */ IORING_OP_LAST, @@ -641,7 +643,8 @@ enum io_uring_register_op { /* send MSG_RING without having a ring */ IORING_REGISTER_SEND_MSG_RING = 31, - /* 32 reserved for zc rx */ + /* register a netdev hw rx queue for zerocopy */ + IORING_REGISTER_ZCRX_IFQ = 32, /* resize CQ ring */ IORING_REGISTER_RESIZE_RINGS = 33, @@ -958,6 +961,55 @@ enum io_uring_socket_op { SOCKET_URING_OP_SETSOCKOPT, }; +/* Zero copy 
receive refill queue entry */ +struct io_uring_zcrx_rqe { + __u64 off; + __u32 len; + __u32 __pad; +}; + +struct io_uring_zcrx_cqe { + __u64 off; + __u64 __pad; +}; + +/* The bit from which area id is encoded into offsets */ +#define IORING_ZCRX_AREA_SHIFT 48 +#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) + +struct io_uring_zcrx_offsets { + __u32 head; + __u32 tail; + __u32 rqes; + __u32 __resv2; + __u64 __resv[2]; +}; + +struct io_uring_zcrx_area_reg { + __u64 addr; + __u64 len; + __u64 rq_area_token; + __u32 flags; + __u32 __resv1; + __u64 __resv2[2]; +}; + +/* + * Argument for IORING_REGISTER_ZCRX_IFQ + */ +struct io_uring_zcrx_ifq_reg { + __u32 if_idx; + __u32 if_rxq; + __u32 rq_entries; + __u32 flags; + + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ + __u64 region_ptr; /* struct io_uring_region_desc * */ + + struct io_uring_zcrx_offsets offsets; + __u64 __resv[4]; +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/Kconfig b/io_uring/Kconfig new file mode 100644 index 000000000000..9e2a4beba1ef --- /dev/null +++ b/io_uring/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL diff --git a/io_uring/Makefile b/io_uring/Makefile index d695b60dba4f..98e48339d84d 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ epoll.o statx.o timeout.o fdinfo.o \ cancel.o waitid.o register.o \ truncate.o memmap.o alloc_cache.o +obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 078475447264..f743581cc81b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include 
"memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -2732,6 +2733,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); io_sqe_buffers_unregister(ctx); io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_free_alloc_caches(ctx); @@ -2891,6 +2893,11 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (ctx->ifq) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zcrx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 372129e24372..b95dab77e32d 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -189,6 +189,16 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret return io_get_cqe_overflow(ctx, ret, false); } +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->cq_extra++; + ctx->submit_state.cq_flush = true; + return io_get_cqe(ctx, cqe_ret); +} + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 361134544427..76fcc79656b0 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -271,6 +271,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, return io_pbuf_get_region(ctx, bgid); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; + case IORING_MAP_OFF_ZCRX_REGION: + return &ctx->zcrx_region; } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index c898dcba2b4e..dad0aa5b1b45 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -2,6 +2,7 @@ #define IO_URING_MEMMAP_H #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL +#define 
IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/net.c b/io_uring/net.c index 3fc39af5159e..89cd45bacd7c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -16,6 +16,7 @@ #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -88,6 +89,14 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + unsigned msg_flags; + u16 flags; + u32 len; + struct io_zcrx_ifq *ifq; +}; + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -1199,6 +1208,81 @@ out_free: return ret; } +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || + sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + if (ifq_idx != 0) + return -EINVAL; + zc->ifq = req->ctx->ifq; + if (!zc->ifq) + return -EINVAL; + zc->len = READ_ONCE(sqe->len); + zc->flags = READ_ONCE(sqe->ioprio); + zc->msg_flags = READ_ONCE(sqe->msg_flags); + if (zc->msg_flags) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data completions are posted as aux CQEs. 
*/ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + unsigned int len; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + len = zc->len; + ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, + issue_flags, &zc->len); + if (len && zc->len == 0) { + io_req_set_res(req, 0, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret == IOU_REQUEUE) + return IOU_REQUEUE; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_STOP_MULTISHOT; + return IOU_OK; + } + + if (issue_flags & IO_URING_F_MULTISHOT) + return IOU_ISSUE_SKIP_COMPLETE; + return -EAGAIN; +} + void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 306fd9c48b44..9511262c513e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -37,6 +37,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -516,6 +517,18 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -745,6 +758,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + 
[IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/register.c b/io_uring/register.c index 9a4d2fbce4ae..cc23a4c205cd 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -30,6 +30,7 @@ #include "eventfd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx_ifq(ctx, arg); + break; case IORING_REGISTER_RESIZE_RINGS: ret = -EINVAL; if (!arg || nr_args != 1) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 5fff6ba2b7c0..a59563fbb4ad 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,7 +80,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_buffer_validate(struct iovec *iov) { unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f10a1252b3e9..284e300e63fb 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -76,6 +76,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_buffer_validate(struct iovec *iov); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 100644 index 000000000000..9c95b5b6ec4e --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/dma-map-ops.h> +#include <linux/mm.h> 
+#include <linux/nospec.h> +#include <linux/io_uring.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff_ref.h> + +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/netlink.h> +#include <net/netdev_rx_queue.h> +#include <net/tcp.h> +#include <net/rps.h> + +#include <trace/events/page_pool.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" +#include "rsrc.h" + +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + for (i = 0; i < nr_mapped; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + dma_addr_t dma; + + dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + net_mp_niov_set_dma_addr(niov, 0); + } +} + +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + if (area->is_mapped) + __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + int i; + + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + dma_addr_t dma; + + dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (dma_mapping_error(ifq->dev, dma)) + break; + if (net_mp_niov_set_dma_addr(niov, dma)) { + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + break; + } + } + + if (i != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, i); + return -EINVAL; + } + + area->is_mapped = true; + return 0; +} + +static void io_zcrx_sync_for_device(const struct page_pool *pool, + struct net_iov *niov) +{ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr; + + if 
(!dma_dev_need_sync(pool->p.dev)) + return; + + dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + PAGE_SIZE, pool->p.dma_dir); +#endif +} + +#define IO_RQ_MAX_ENTRIES 32768 + +#define IO_SKBS_PER_CALL_LIMIT 20 + +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; + unsigned nr_skbs; +}; + +static const struct memory_provider_ops io_uring_pp_zc_ops; + +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline atomic_t *io_get_user_counter(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return &area->user_refs[net_iov_idx(niov)]; +} + +static bool io_zcrx_put_niov_uref(struct net_iov *niov) +{ + atomic_t *uref = io_get_user_counter(niov); + + if (unlikely(!atomic_read(uref))) + return false; + atomic_dec(uref); + return true; +} + +static void io_zcrx_get_niov_uref(struct net_iov *niov) +{ + atomic_inc(io_get_user_counter(niov)); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->pages[net_iov_idx(niov)]; +} + +static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd) +{ + size_t off, size; + void *ptr; + int ret; + + off = sizeof(struct io_uring); + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, + IORING_MAP_OFF_ZCRX_REGION); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + return 0; +} + +static void 
io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + ifq->rq_ring = NULL; + ifq->rqes = NULL; +} + +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + io_zcrx_unmap_area(area->ifq, area); + + kvfree(area->freelist); + kvfree(area->nia.niovs); + kvfree(area->user_refs); + if (area->pages) { + unpin_user_pages(area->pages, area->nia.num_niovs); + kvfree(area->pages); + } + kfree(area); +} + +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + int i, ret, nr_pages; + struct iovec iov; + + if (area_reg->flags || area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + iov.iov_base = u64_to_user_ptr(area_reg->addr); + iov.iov_len = area_reg->len; + ret = io_buffer_validate(&iov); + if (ret) + return ret; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(area->pages)) { + ret = PTR_ERR(area->pages); + area->pages = NULL; + goto err; + } + area->nia.num_niovs = nr_pages; + + area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + for (i = 0; i < nr_pages; i++) + area->freelist[i] = i; + + area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->user_refs) + goto err; + + for (i = 0; i < nr_pages; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + + niov->owner = &area->nia; + area->freelist[i] = i; + atomic_set(&area->user_refs[i], 
0); + } + + area->free_count = nr_pages; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + +static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + ifq->ctx = ctx; + spin_lock_init(&ifq->lock); + spin_lock_init(&ifq->rq_lock); + return ifq; +} + +static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) +{ + spin_lock(&ifq->lock); + if (ifq->netdev) { + netdev_put(ifq->netdev, &ifq->netdev_tracker); + ifq->netdev = NULL; + } + spin_unlock(&ifq->lock); +} + +static void io_close_queue(struct io_zcrx_ifq *ifq) +{ + struct net_device *netdev; + netdevice_tracker netdev_tracker; + struct pp_memory_provider_params p = { + .mp_ops = &io_uring_pp_zc_ops, + .mp_priv = ifq, + }; + + if (ifq->if_rxq == -1) + return; + + spin_lock(&ifq->lock); + netdev = ifq->netdev; + netdev_tracker = ifq->netdev_tracker; + ifq->netdev = NULL; + spin_unlock(&ifq->lock); + + if (netdev) { + net_mp_close_rxq(netdev, ifq->if_rxq, &p); + netdev_put(netdev, &netdev_tracker); + } + ifq->if_rxq = -1; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_close_queue(ifq); + io_zcrx_drop_netdev(ifq); + + if (ifq->area) + io_zcrx_free_area(ifq->area); + if (ifq->dev) + put_device(ifq->dev); + + io_free_rbuf_ring(ifq); + kfree(ifq); +} + +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct pp_memory_provider_params mp_param = {}; + struct io_uring_zcrx_area_reg area; + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + int ret; + + /* + * 1. Interface queue allocation. + * 2. 
It can observe data destined for sockets of other tasks. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (ctx->ifq) + return -EBUSY; + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + + ret = io_allocate_rbuf_ring(ifq, &reg, &rd); + if (ret) + goto err; + + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + + ifq->rq_entries = reg.rq_entries; + + ret = -ENODEV; + ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, + &ifq->netdev_tracker, GFP_KERNEL); + if (!ifq->netdev) + goto err; + + ifq->dev = ifq->netdev->dev.parent; + ret = -EOPNOTSUPP; + if (!ifq->dev) + goto err; + get_device(ifq->dev); + + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + goto err; + + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); + if (ret) + goto err; + ifq->if_rxq = reg.if_rxq; + + reg.offsets.rqes = sizeof(struct io_uring); + reg.offsets.head = offsetof(struct io_uring, head); + reg.offsets.tail = offsetof(struct io_uring, tail); + + if (copy_to_user(arg, &reg, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || + copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + 
ret = -EFAULT; + goto err; + } + ctx->ifq = ifq; + return 0; +err: + io_zcrx_ifq_free(ifq); + return ret; +} + +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq = ctx->ifq; + + lockdep_assert_held(&ctx->uring_lock); + + if (!ifq) + return; + + ctx->ifq = NULL; + io_zcrx_ifq_free(ifq); +} + +static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; +} + +static void io_zcrx_return_niov_freelist(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + spin_lock_bh(&area->freelist_lock); + area->freelist[area->free_count++] = net_iov_idx(niov); + spin_unlock_bh(&area->freelist_lock); +} + +static void io_zcrx_return_niov(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + if (!niov->pp) { + /* copy fallback allocated niovs */ + io_zcrx_return_niov_freelist(niov); + return; + } + page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); +} + +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + int i; + + if (!area) + return; + + /* Reclaim back all buffers given to the user space. 
*/ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int nr; + + if (!atomic_read(io_get_user_counter(niov))) + continue; + nr = atomic_xchg(io_get_user_counter(niov), 0); + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) + io_zcrx_return_niov(niov); + } +} + +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); + + if (!ctx->ifq) + return; + io_zcrx_scrub(ctx->ifq); + io_close_queue(ctx->ifq); +} + +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +{ + u32 entries; + + entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; + return min(entries, ifq->rq_entries); +} + +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, + unsigned mask) +{ + unsigned int idx = ifq->cached_rq_head++ & mask; + + return &ifq->rqes[idx]; +} + +static void io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq) +{ + unsigned int mask = ifq->rq_entries - 1; + unsigned int entries; + netmem_ref netmem; + + spin_lock_bh(&ifq->rq_lock); + + entries = io_zcrx_rqring_entries(ifq); + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); + if (unlikely(!entries)) { + spin_unlock_bh(&ifq->rq_lock); + return; + } + + do { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_zcrx_area *area; + struct net_iov *niov; + unsigned niov_idx, area_idx; + + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; + + if (unlikely(rqe->__pad || area_idx)) + continue; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + continue; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + niov = &area->nia.niovs[niov_idx]; + if (!io_zcrx_put_niov_uref(niov)) + continue; + + netmem = net_iov_to_netmem(niov); + if (page_pool_unref_netmem(netmem, 1) != 0) + continue; + + if (unlikely(niov->pp != pp)) { + 
io_zcrx_return_niov(niov); + continue; + } + + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } while (--entries); + + smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); + spin_unlock_bh(&ifq->rq_lock); +} + +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + + spin_lock_bh(&area->freelist_lock); + while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + struct net_iov *niov = __io_zcrx_get_free_niov(area); + netmem_ref netmem = net_iov_to_netmem(niov); + + net_mp_niov_set_page_pool(pp, niov); + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } + spin_unlock_bh(&area->freelist_lock); +} + +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + + /* pp should already be ensuring that */ + if (unlikely(pp->alloc.count)) + goto out_return; + + io_zcrx_ring_refill(pp, ifq); + if (likely(pp->alloc.count)) + goto out_return; + + io_zcrx_refill_slow(pp, ifq); + if (!pp->alloc.count) + return 0; +out_return: + return pp->alloc.cache[--pp->alloc.count]; +} + +static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) +{ + struct net_iov *niov; + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + niov = netmem_to_net_iov(netmem); + net_mp_niov_clear_page_pool(niov); + io_zcrx_return_niov_freelist(niov); + return false; +} + +static int io_pp_zc_init(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = pp->mp_priv; + + if (WARN_ON_ONCE(!ifq)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) + return -EINVAL; + if (WARN_ON_ONCE(!pp->dma_map)) + return -EOPNOTSUPP; + if (pp->p.order != 0) + return -EOPNOTSUPP; + if (pp->p.dma_dir != DMA_FROM_DEVICE) + return -EOPNOTSUPP; + + percpu_ref_get(&ifq->ctx->refs); + return 0; +} + +static void io_pp_zc_destroy(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = 
pp->mp_priv; + struct io_zcrx_area *area = ifq->area; + + if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) + return; + percpu_ref_put(&ifq->ctx->refs); +} + +static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + struct nlattr *nest; + int type; + + type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; + nest = nla_nest_start(rsp, type); + if (!nest) + return -EMSGSIZE; + nla_nest_end(rsp, nest); + + return 0; +} + +static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) +{ + struct pp_memory_provider_params *p = &rxq->mp_params; + struct io_zcrx_ifq *ifq = mp_priv; + + io_zcrx_drop_netdev(ifq); + p->mp_ops = NULL; + p->mp_priv = NULL; +} + +static const struct memory_provider_ops io_uring_pp_zc_ops = { + .alloc_netmems = io_pp_zc_alloc_netmems, + .release_netmem = io_pp_zc_release_netmem, + .init = io_pp_zc_init, + .destroy = io_pp_zc_destroy, + .nl_fill = io_pp_nl_fill, + .uninstall = io_pp_uninstall, +}; + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) +{ + struct net_iov *niov = NULL; + + spin_lock_bh(&area->freelist_lock); + if (area->free_count) + niov = __io_zcrx_get_free_niov(area); + spin_unlock_bh(&area->freelist_lock); + + if (niov) + page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); + return niov; +} + +static 
ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + void *src_base, struct page *src_page, + unsigned int src_offset, size_t len) +{ + struct io_zcrx_area *area = ifq->area; + size_t copied = 0; + int ret = 0; + + while (len) { + size_t copy_size = min_t(size_t, PAGE_SIZE, len); + const int dst_off = 0; + struct net_iov *niov; + struct page *dst_page; + void *dst_addr; + + niov = io_zcrx_alloc_fallback(area); + if (!niov) { + ret = -ENOMEM; + break; + } + + dst_page = io_zcrx_iov_page(niov); + dst_addr = kmap_local_page(dst_page); + if (src_page) + src_base = kmap_local_page(src_page); + + memcpy(dst_addr, src_base + src_offset, copy_size); + + if (src_page) + kunmap_local(src_base); + kunmap_local(dst_addr); + + if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { + io_zcrx_return_niov(niov); + ret = -ENOSPC; + break; + } + + io_zcrx_get_niov_uref(niov); + src_offset += copy_size; + len -= copy_size; + copied += copy_size; + } + + return copied ? copied : ret; +} + +static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct page *page = skb_frag_page(frag); + u32 p_off, p_len, t, copied = 0; + int ret = 0; + + off += skb_frag_off(frag); + + skb_frag_foreach_page(frag, off, len, + page, p_off, p_len, t) { + ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len); + if (ret < 0) + return copied ? 
copied : ret; + copied += ret; + } + return copied; +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + + if (unlikely(!skb_frag_is_net_iov(frag))) + return io_zcrx_copy_frag(req, ifq, frag, off, len); + + niov = netmem_to_net_iov(frag->netmem); + if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + niov->pp->mp_priv != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) + return -ENOSPC; + + /* + * Prevent it from being recycled while user is accessing it. + * It has to be done before grabbing a user reference. + */ + page_pool_ref_netmem(net_iov_to_netmem(niov)); + io_zcrx_get_niov_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off = offset; + int i, copy, end, off; + int ret = 0; + + len = min_t(size_t, len, desc->count); + if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) + return -EAGAIN; + + if (unlikely(offset < skb_headlen(skb))) { + ssize_t copied; + size_t to_copy; + + to_copy = min_t(size_t, skb_headlen(skb) - offset, len); + copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL, + offset, to_copy); + if (copied < 0) { + ret = copied; + goto out; + } + offset += copied; + len -= copied; + if (!len) + goto out; + if (offset != skb_headlen(skb)) + goto out; + } + + start = skb_headlen(skb); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, 
frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + desc->count -= (offset - start_off); + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned issue_flags, unsigned int *outlen) +{ + unsigned int len = *outlen; + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = len ? len : UINT_MAX, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (len && ret > 0) + *outlen = len - ret; + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && + (issue_flags & IO_URING_F_MULTISHOT)) { + ret = IOU_REQUEUE; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. 
*/ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..706cc7300780 --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include <linux/io_uring_types.h> +#include <linux/socket.h> +#include <net/page_pool/types.h> +#include <net/net_trackers.h> + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + atomic_t *user_refs; + + bool is_mapped; + u16 area_id; + struct page **pages; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; +}; + +struct io_zcrx_ifq { + struct io_ring_ctx *ctx; + struct io_zcrx_area *area; + + struct io_uring *rq_ring; + struct io_uring_zcrx_rqe *rqes; + u32 rq_entries; + u32 cached_rq_head; + spinlock_t rq_lock; + + u32 if_rxq; + struct device *dev; + struct net_device *netdev; + netdevice_tracker netdev_tracker; + spinlock_t lock; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len); +#else +static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct 
io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + return -EOPNOTSUPP; +} +#endif + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + +#endif diff --git a/tools/testing/selftests/drivers/net/hw/.gitignore b/tools/testing/selftests/drivers/net/hw/.gitignore index e9fe6ede681a..6942bf575497 100644 --- a/tools/testing/selftests/drivers/net/hw/.gitignore +++ b/tools/testing/selftests/drivers/net/hw/.gitignore @@ -1 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +iou-zcrx ncdevmem diff --git a/tools/testing/selftests/drivers/net/hw/Makefile b/tools/testing/selftests/drivers/net/hw/Makefile index cde5814ff9a7..07cddb19ba35 100644 --- a/tools/testing/selftests/drivers/net/hw/Makefile +++ b/tools/testing/selftests/drivers/net/hw/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0+ OR MIT +TEST_GEN_FILES = iou-zcrx + TEST_PROGS = \ csum.py \ devlink_port_split.py \ @@ -10,6 +12,7 @@ TEST_PROGS = \ ethtool_rmon.sh \ hw_stats_l3.sh \ hw_stats_l3_gre.sh \ + iou-zcrx.py \ irq.py \ loopback.sh \ nic_link_layer.py \ @@ -44,3 +47,5 @@ YNL_GENS := ethtool netdev include ../../../net/ynl.mk include ../../../net/bpf.mk + +$(OUTPUT)/iou-zcrx: LDLIBS += -luring diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.c b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c new file mode 100644 index 000000000000..c26b4180eddd --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <assert.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <limits.h> +#include 
<stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <linux/errqueue.h> +#include <linux/if_packet.h> +#include <linux/ipv6.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <net/ethernet.h> +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> +#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/un.h> +#include <sys/wait.h> + +#include <liburing.h> + +#define PAGE_SIZE (4096) +#define AREA_SIZE (8192 * PAGE_SIZE) +#define SEND_SIZE (512 * 4096) +#define min(a, b) \ + ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + _a < _b ? _a : _b; \ + }) +#define min_t(t, a, b) \ + ({ \ + t _ta = (a); \ + t _tb = (b); \ + min(_ta, _tb); \ + }) + +#define ALIGN_UP(v, align) (((v) + (align) - 1) & ~((align) - 1)) + +static int cfg_server; +static int cfg_client; +static int cfg_port = 8000; +static int cfg_payload_len; +static const char *cfg_ifname; +static int cfg_queue_id = -1; +static bool cfg_oneshot; +static int cfg_oneshot_recvs; +static int cfg_send_size = SEND_SIZE; +static struct sockaddr_in6 cfg_addr; + +static char payload[SEND_SIZE] __attribute__((aligned(PAGE_SIZE))); +static void *area_ptr; +static void *ring_ptr; +static size_t ring_size; +static struct io_uring_zcrx_rq rq_ring; +static unsigned long area_token; +static int connfd; +static bool stop; +static size_t received; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static int parse_address(const char *str, int port, struct sockaddr_in6 *sin6) +{ + int ret; + + sin6->sin6_family = AF_INET6; + sin6->sin6_port = htons(port); + + ret = 
inet_pton(sin6->sin6_family, str, &sin6->sin6_addr); + if (ret != 1) { + /* fallback to plain IPv4 */ + ret = inet_pton(AF_INET, str, &sin6->sin6_addr.s6_addr32[3]); + if (ret != 1) + return -1; + + /* add ::ffff prefix */ + sin6->sin6_addr.s6_addr32[0] = 0; + sin6->sin6_addr.s6_addr32[1] = 0; + sin6->sin6_addr.s6_addr16[4] = 0; + sin6->sin6_addr.s6_addr16[5] = 0xffff; + } + + return 0; +} + +static inline size_t get_refill_ring_size(unsigned int rq_entries) +{ + size_t size; + + ring_size = rq_entries * sizeof(struct io_uring_zcrx_rqe); + /* add space for the header (head/tail/etc.) */ + ring_size += PAGE_SIZE; + return ALIGN_UP(ring_size, 4096); +} + +static void setup_zcrx(struct io_uring *ring) +{ + unsigned int ifindex; + unsigned int rq_entries = 4096; + int ret; + + ifindex = if_nametoindex(cfg_ifname); + if (!ifindex) + error(1, 0, "bad interface name: %s", cfg_ifname); + + area_ptr = mmap(NULL, + AREA_SIZE, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + if (area_ptr == MAP_FAILED) + error(1, 0, "mmap(): zero copy area"); + + ring_size = get_refill_ring_size(rq_entries); + ring_ptr = mmap(NULL, + ring_size, + PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, + 0, + 0); + + struct io_uring_region_desc region_reg = { + .size = ring_size, + .user_addr = (__u64)(unsigned long)ring_ptr, + .flags = IORING_MEM_REGION_TYPE_USER, + }; + + struct io_uring_zcrx_area_reg area_reg = { + .addr = (__u64)(unsigned long)area_ptr, + .len = AREA_SIZE, + .flags = 0, + }; + + struct io_uring_zcrx_ifq_reg reg = { + .if_idx = ifindex, + .if_rxq = cfg_queue_id, + .rq_entries = rq_entries, + .area_ptr = (__u64)(unsigned long)&area_reg, + .region_ptr = (__u64)(unsigned long)&region_reg, + }; + + ret = io_uring_register_ifq(ring, &reg); + if (ret) + error(1, 0, "io_uring_register_ifq(): %d", ret); + + rq_ring.khead = (unsigned int *)((char *)ring_ptr + reg.offsets.head); + rq_ring.ktail = (unsigned int *)((char *)ring_ptr + reg.offsets.tail); + rq_ring.rqes = 
(struct io_uring_zcrx_rqe *)((char *)ring_ptr + reg.offsets.rqes); + rq_ring.rq_tail = 0; + rq_ring.ring_entries = reg.rq_entries; + + area_token = area_reg.rq_area_token; +} + +static void add_accept(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_accept(sqe, sockfd, NULL, NULL, 0); + sqe->user_data = 1; +} + +static void add_recvzc(struct io_uring *ring, int sockfd) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, 0, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + sqe->user_data = 2; +} + +static void add_recvzc_oneshot(struct io_uring *ring, int sockfd, size_t len) +{ + struct io_uring_sqe *sqe; + + sqe = io_uring_get_sqe(ring); + + io_uring_prep_rw(IORING_OP_RECV_ZC, sqe, sockfd, NULL, len, 0); + sqe->ioprio |= IORING_RECV_MULTISHOT; + sqe->user_data = 2; +} + +static void process_accept(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + if (cqe->res < 0) + error(1, 0, "accept()"); + if (connfd) + error(1, 0, "Unexpected second connection"); + + connfd = cqe->res; + if (cfg_oneshot) + add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + else + add_recvzc(ring, connfd); +} + +static void process_recvzc(struct io_uring *ring, struct io_uring_cqe *cqe) +{ + unsigned rq_mask = rq_ring.ring_entries - 1; + struct io_uring_zcrx_cqe *rcqe; + struct io_uring_zcrx_rqe *rqe; + struct io_uring_sqe *sqe; + uint64_t mask; + char *data; + ssize_t n; + int i; + + if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs == 0) { + stop = true; + return; + } + + if (cqe->res < 0) + error(1, 0, "recvzc(): %d", cqe->res); + + if (cfg_oneshot) { + if (cqe->res == 0 && cqe->flags == 0 && cfg_oneshot_recvs) { + add_recvzc_oneshot(ring, connfd, PAGE_SIZE); + cfg_oneshot_recvs--; + } + } else if (!(cqe->flags & IORING_CQE_F_MORE)) { + add_recvzc(ring, connfd); + } + + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + + n = cqe->res; + mask = (1ULL << 
IORING_ZCRX_AREA_SHIFT) - 1; + data = (char *)area_ptr + (rcqe->off & mask); + + for (i = 0; i < n; i++) { + if (*(data + i) != payload[(received + i)]) + error(1, 0, "payload mismatch at ", i); + } + received += n; + + rqe = &rq_ring.rqes[(rq_ring.rq_tail & rq_mask)]; + rqe->off = (rcqe->off & ~IORING_ZCRX_AREA_MASK) | area_token; + rqe->len = cqe->res; + io_uring_smp_store_release(rq_ring.ktail, ++rq_ring.rq_tail); +} + +static void server_loop(struct io_uring *ring) +{ + struct io_uring_cqe *cqe; + unsigned int count = 0; + unsigned int head; + int i, ret; + + io_uring_submit_and_wait(ring, 1); + + io_uring_for_each_cqe(ring, head, cqe) { + if (cqe->user_data == 1) + process_accept(ring, cqe); + else if (cqe->user_data == 2) + process_recvzc(ring, cqe); + else + error(1, 0, "unknown cqe"); + count++; + } + io_uring_cq_advance(ring, count); +} + +static void run_server(void) +{ + unsigned int flags = 0; + struct io_uring ring; + int fd, enable, ret; + uint64_t tstop; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + error(1, 0, "socket()"); + + enable = 1; + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)); + if (ret < 0) + error(1, 0, "setsockopt(SO_REUSEADDR)"); + + ret = bind(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr)); + if (ret < 0) + error(1, 0, "bind()"); + + if (listen(fd, 1024) < 0) + error(1, 0, "listen()"); + + flags |= IORING_SETUP_COOP_TASKRUN; + flags |= IORING_SETUP_SINGLE_ISSUER; + flags |= IORING_SETUP_DEFER_TASKRUN; + flags |= IORING_SETUP_SUBMIT_ALL; + flags |= IORING_SETUP_CQE32; + + io_uring_queue_init(512, &ring, flags); + + setup_zcrx(&ring); + + add_accept(&ring, fd); + + tstop = gettimeofday_ms() + 5000; + while (!stop && gettimeofday_ms() < tstop) + server_loop(&ring); + + if (!stop) + error(1, 0, "test failed\n"); +} + +static void run_client(void) +{ + ssize_t to_send = cfg_send_size; + ssize_t sent = 0; + ssize_t chunk, res; + int fd; + + fd = socket(AF_INET6, SOCK_STREAM, 0); + if (fd == -1) + 
error(1, 0, "socket()"); + + if (connect(fd, (struct sockaddr *)&cfg_addr, sizeof(cfg_addr))) + error(1, 0, "connect()"); + + while (to_send) { + void *src = &payload[sent]; + + chunk = min_t(ssize_t, cfg_payload_len, to_send); + res = send(fd, src, chunk, 0); + if (res < 0) + error(1, 0, "send(): %d", sent); + sent += res; + to_send -= res; + } + + close(fd); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s (-4|-6) (-s|-c) -h<server_ip> -p<port> " + "-l<payload_size> -i<ifname> -q<rxq_id>", filepath); +} + +static void parse_opts(int argc, char **argv) +{ + const int max_payload_len = sizeof(payload) - + sizeof(struct ipv6hdr) - + sizeof(struct tcphdr) - + 40 /* max tcp options */; + struct sockaddr_in6 *addr6 = (void *) &cfg_addr; + char *addr = NULL; + int ret; + int c; + + if (argc <= 1) + usage(argv[0]); + cfg_payload_len = max_payload_len; + + while ((c = getopt(argc, argv, "sch:p:l:i:q:o:z:")) != -1) { + switch (c) { + case 's': + if (cfg_client) + error(1, 0, "Pass one of -s or -c"); + cfg_server = 1; + break; + case 'c': + if (cfg_server) + error(1, 0, "Pass one of -s or -c"); + cfg_client = 1; + break; + case 'h': + addr = optarg; + break; + case 'p': + cfg_port = strtoul(optarg, NULL, 0); + break; + case 'l': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'i': + cfg_ifname = optarg; + break; + case 'q': + cfg_queue_id = strtoul(optarg, NULL, 0); + break; + case 'o': { + cfg_oneshot = true; + cfg_oneshot_recvs = strtoul(optarg, NULL, 0); + break; + } + case 'z': + cfg_send_size = strtoul(optarg, NULL, 0); + break; + } + } + + if (cfg_server && addr) + error(1, 0, "Receiver cannot have -h specified"); + + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + addr6->sin6_addr = in6addr_any; + if (addr) { + ret = parse_address(addr, cfg_port, addr6); + if (ret) + error(1, 0, "receiver address parse error: %s", addr); + } + + if (cfg_payload_len > max_payload_len) + 
error(1, 0, "-l: payload exceeds max (%d)", max_payload_len); +} + +int main(int argc, char **argv) +{ + const char *cfg_test = argv[argc - 1]; + int i; + + parse_opts(argc, argv); + + for (i = 0; i < SEND_SIZE; i++) + payload[i] = 'a' + (i % 26); + + if (cfg_server) + run_server(); + else if (cfg_client) + run_client(); + + return 0; +} diff --git a/tools/testing/selftests/drivers/net/hw/iou-zcrx.py b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py new file mode 100755 index 000000000000..d301d9b356f7 --- /dev/null +++ b/tools/testing/selftests/drivers/net/hw/iou-zcrx.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import re +from os import path +from lib.py import ksft_run, ksft_exit +from lib.py import NetDrvEpEnv +from lib.py import bkg, cmd, ethtool, wait_port_listen + + +def _get_rx_ring_entries(cfg): + output = ethtool(f"-g {cfg.ifname}", host=cfg.remote).stdout + values = re.findall(r'RX:\s+(\d+)', output) + return int(values[1]) + + +def _get_combined_channels(cfg): + output = ethtool(f"-l {cfg.ifname}", host=cfg.remote).stdout + values = re.findall(r'Combined:\s+(\d+)', output) + return int(values[1]) + + +def _set_flow_rule(cfg, chan): + output = ethtool(f"-N {cfg.ifname} flow-type tcp6 dst-port 9999 action {chan}", host=cfg.remote).stdout + values = re.search(r'ID (\d+)', output).group(1) + return int(values) + + +def test_zcrx(cfg) -> None: + cfg.require_v6() + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + rx_ring = _get_rx_ring_entries(cfg) + + try: + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1}" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 12840" + with bkg(rx_cmd, host=cfg.remote, 
exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + finally: + ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} default", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + + +def test_zcrx_oneshot(cfg) -> None: + cfg.require_v6() + + combined_chans = _get_combined_channels(cfg) + if combined_chans < 2: + raise KsftSkipEx('at least 2 combined channels required') + rx_ring = _get_rx_ring_entries(cfg) + + try: + ethtool(f"-G {cfg.ifname} rx 64", host=cfg.remote) + ethtool(f"-X {cfg.ifname} equal {combined_chans - 1}", host=cfg.remote) + flow_rule_id = _set_flow_rule(cfg, combined_chans - 1) + + rx_cmd = f"{cfg.bin_remote} -s -p 9999 -i {cfg.ifname} -q {combined_chans - 1} -o 4" + tx_cmd = f"{cfg.bin_local} -c -h {cfg.remote_v6} -p 9999 -l 4096 -z 16384" + with bkg(rx_cmd, host=cfg.remote, exit_wait=True): + wait_port_listen(9999, proto="tcp", host=cfg.remote) + cmd(tx_cmd) + finally: + ethtool(f"-N {cfg.ifname} delete {flow_rule_id}", host=cfg.remote) + ethtool(f"-X {cfg.ifname} default", host=cfg.remote) + ethtool(f"-G {cfg.ifname} rx {rx_ring}", host=cfg.remote) + + +def main() -> None: + with NetDrvEpEnv(__file__) as cfg: + cfg.bin_local = path.abspath(path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx") + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) + + ksft_run(globs=globals(), case_pfx={"test_"}, args=(cfg, )) + ksft_exit() + + +if __name__ == "__main__": + main() |