From ab6005f3912fff07330297aba08922d2456dcede Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 4 Apr 2025 15:46:34 +0100
Subject: io_uring: don't post tag CQEs on file/buffer registration failure

Buffer / file table registration is all or nothing: if it fails, all
resources we might have partially registered are dropped and the table
is killed. If that happens, it doesn't make sense to post any rsrc tag
CQEs. That would be confusing to the application, which should not need
to handle that case.

Cc: stable@vger.kernel.org
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Fixes: 7029acd8a9503 ("io_uring/rsrc: get rid of per-ring io_rsrc_node list")
Link: https://lore.kernel.org/r/c514446a8dcb0197cddd5d4ba8f6511da081cf1f.1743777957.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/rsrc.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5e64a8bb30a4..b36c8825550e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -175,6 +175,18 @@ void io_rsrc_cache_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->imu_cache, kfree);
 }
 
+static void io_clear_table_tags(struct io_rsrc_data *data)
+{
+	int i;
+
+	for (i = 0; i < data->nr; i++) {
+		struct io_rsrc_node *node = data->nodes[i];
+
+		if (node)
+			node->tag = 0;
+	}
+}
+
 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
 			      struct io_rsrc_data *data)
 {
@@ -583,6 +595,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
 	return 0;
 fail:
+	io_clear_table_tags(&ctx->file_table.data);
 	io_sqe_files_unregister(ctx);
 	return ret;
 }
@@ -902,8 +915,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	ctx->buf_table = data;
-	if (ret)
+	if (ret) {
+		io_clear_table_tags(&ctx->buf_table);
 		io_sqe_buffers_unregister(ctx);
+	}
 	return ret;
 }
 
-- 
cgit 


From 9b58440a5b2fe78102ce1e9e03946645558d0f55 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 5 Apr 2025 11:17:49 +0100
Subject: io_uring/zcrx: put refill data into separate cache line

Refill queue lock and other bits are only used from the allocation path
on the rx softirq side, but they share a cache line with other fields
like ctx that are also used in the "syscall" path, which causes cache
bouncing when softirq runs on a different CPU.

Separate them into different cache lines. The first one now contains
constant fields used by both contexts, followed by a line responsible
for refill queue data.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6d1f598e27d623c07fc49d6baee13089a9b1216c.1743848241.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/zcrx.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 706cc7300780..b59c560d5d84 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -26,11 +26,11 @@ struct io_zcrx_ifq {
 	struct io_ring_ctx		*ctx;
 	struct io_zcrx_area		*area;
 
+	spinlock_t			rq_lock ____cacheline_aligned_in_smp;
 	struct io_uring			*rq_ring;
 	struct io_uring_zcrx_rqe	*rqes;
-	u32				rq_entries;
 	u32				cached_rq_head;
-	spinlock_t			rq_lock;
+	u32				rq_entries;
 
 	u32				if_rxq;
 	struct device			*dev;
-- 
cgit 


From 5a17131a5dbd0ebca655bfb65fe3fe643ccc27f3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 5 Apr 2025 11:18:29 +0100
Subject: io_uring/zcrx: separate niov number from pages

A preparation patch that separates the number of pages / folios from
the number of niovs. They will not match in the future to support huge
pages, improved dma mapping and/or larger chunk sizes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0780ac966ee84200385737f45bb0f2ada052392b.1743848231.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/zcrx.c | 19 ++++++++++---------
 io_uring/zcrx.h |  1 +
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 80d4a6f71d29..0f46e0404c04 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -181,7 +181,7 @@ static void io_zcrx_free_area(struct io_zcrx_area *area)
 	kvfree(area->nia.niovs);
 	kvfree(area->user_refs);
 	if (area->pages) {
-		unpin_user_pages(area->pages, area->nia.num_niovs);
+		unpin_user_pages(area->pages, area->nr_folios);
 		kvfree(area->pages);
 	}
 	kfree(area);
@@ -192,7 +192,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 			       struct io_uring_zcrx_area_reg *area_reg)
 {
 	struct io_zcrx_area *area;
-	int i, ret, nr_pages;
+	int i, ret, nr_pages, nr_iovs;
 	struct iovec iov;
 
 	if (area_reg->flags || area_reg->rq_area_token)
@@ -220,27 +220,28 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 		area->pages = NULL;
 		goto err;
 	}
-	area->nia.num_niovs = nr_pages;
+	area->nr_folios = nr_iovs = nr_pages;
+	area->nia.num_niovs = nr_iovs;
 
-	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
+	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
 					 GFP_KERNEL | __GFP_ZERO);
 	if (!area->nia.niovs)
 		goto err;
 
-	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
+	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
 					GFP_KERNEL | __GFP_ZERO);
 	if (!area->freelist)
 		goto err;
 
-	for (i = 0; i < nr_pages; i++)
+	for (i = 0; i < nr_iovs; i++)
 		area->freelist[i] = i;
 
-	area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
 					GFP_KERNEL | __GFP_ZERO);
 	if (!area->user_refs)
 		goto err;
 
-	for (i = 0; i < nr_pages; i++) {
+	for (i = 0; i < nr_iovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];
 
 		niov->owner = &area->nia;
@@ -248,7 +249,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 		atomic_set(&area->user_refs[i], 0);
 	}
 
-	area->free_count = nr_pages;
+	area->free_count = nr_iovs;
 	area->ifq = ifq;
 	/* we're only supporting one area per ifq for now */
 	area->area_id = 0;
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index b59c560d5d84..47f1c0e8c197 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -15,6 +15,7 @@ struct io_zcrx_area {
 	bool			is_mapped;
 	u16			area_id;
 	struct page		**pages;
+	unsigned long		nr_folios;
 
 	/* freelist */
 	spinlock_t		freelist_lock ____cacheline_aligned_in_smp;
-- 
cgit 


From cf960726eb65e8d0bfecbcce6cf95f47b1ffa6cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 7 Apr 2025 07:51:23 -0600
Subject: io_uring/kbuf: reject zero sized provided buffers

This isn't fixing a real issue, but there's also zero point in going
through group and buffer setup when the buffers are going to be
rejected as soon as an attempt is made to use them.

Cc: stable@vger.kernel.org
Reported-by: syzbot+58928048fd1416f1457c@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 io_uring/kbuf.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 098109259671..953d5e742569 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -504,6 +504,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	p->nbufs = tmp;
 	p->addr = READ_ONCE(sqe->addr);
 	p->len = READ_ONCE(sqe->len);
+	if (!p->len)
+		return -EINVAL;
 
 	if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs,
 				&size))
-- 
cgit