Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/Makefile     |   6
-rw-r--r--  io_uring/advise.c     |   4
-rw-r--r--  io_uring/cancel.c     |   2
-rw-r--r--  io_uring/cmd_net.c    |  83
-rw-r--r--  io_uring/epoll.c      |   4
-rw-r--r--  io_uring/eventfd.c    |  66
-rw-r--r--  io_uring/eventfd.h    |   3
-rw-r--r--  io_uring/fdinfo.c     |  86
-rw-r--r--  io_uring/fs.c         |  10
-rw-r--r--  io_uring/futex.c      |  10
-rw-r--r--  io_uring/io-wq.c      |  65
-rw-r--r--  io_uring/io-wq.h      |   5
-rw-r--r--  io_uring/io_uring.c   | 288
-rw-r--r--  io_uring/io_uring.h   |   4
-rw-r--r--  io_uring/kbuf.c       | 148
-rw-r--r--  io_uring/kbuf.h       |   8
-rw-r--r--  io_uring/memmap.c     |  13
-rw-r--r--  io_uring/memmap.h     |   4
-rw-r--r--  io_uring/msg_ring.c   |   2
-rw-r--r--  io_uring/net.c        |  76
-rw-r--r--  io_uring/nop.c        |   2
-rw-r--r--  io_uring/notif.c      |   1
-rw-r--r--  io_uring/opdef.c      |  11
-rw-r--r--  io_uring/openclose.c  | 139
-rw-r--r--  io_uring/openclose.h  |   3
-rw-r--r--  io_uring/poll.c       |   4
-rw-r--r--  io_uring/rsrc.c       |  91
-rw-r--r--  io_uring/rsrc.h       |  28
-rw-r--r--  io_uring/rw.c         |   8
-rw-r--r--  io_uring/rw.h         |   2
-rw-r--r--  io_uring/splice.c     |   4
-rw-r--r--  io_uring/statx.c      |   2
-rw-r--r--  io_uring/sync.c       |   6
-rw-r--r--  io_uring/tctx.c       |   2
-rw-r--r--  io_uring/timeout.c    |  13
-rw-r--r--  io_uring/timeout.h    |  13
-rw-r--r--  io_uring/truncate.c   |   2
-rw-r--r--  io_uring/uring_cmd.c  |  96
-rw-r--r--  io_uring/uring_cmd.h  |   6
-rw-r--r--  io_uring/waitid.c     |   2
-rw-r--r--  io_uring/xattr.c      |   8
-rw-r--r--  io_uring/zcrx.c       | 375
-rw-r--r--  io_uring/zcrx.h       |  26
43 files changed, 991 insertions, 740 deletions
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 3e28a741ca15..d97c6b51d584 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -7,11 +7,11 @@ GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
- tctx.o filetable.o rw.o net.o poll.o \
+ tctx.o filetable.o rw.o poll.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
- statx.o timeout.o fdinfo.o cancel.o \
+ statx.o timeout.o cancel.o \
waitid.o register.o truncate.o \
memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
@@ -19,3 +19,5 @@ obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_EPOLL) += epoll.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
+obj-$(CONFIG_NET) += net.o cmd_net.o
+obj-$(CONFIG_PROC_FS) += fdinfo.o
diff --git a/io_uring/advise.c b/io_uring/advise.c
index cb7b881665e5..0073f74e3658 100644
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
#else
return -EOPNOTSUPP;
#endif
@@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index 0870060bac7c..6d57602304df 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -229,7 +229,7 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_sync_cancel(struct io_uring_task *tctx,
diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c
new file mode 100644
index 000000000000..e99170c7d41a
--- /dev/null
+++ b/io_uring/cmd_net.c
@@ -0,0 +1,83 @@
+#include <asm/ioctls.h>
+#include <linux/io_uring/net.h>
+#include <net/sock.h>
+
+#include "uring_cmd.h"
+
+static inline int io_uring_cmd_getsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optlen, optname, level, err;
+ void __user *optval;
+
+ level = READ_ONCE(sqe->level);
+ if (level != SOL_SOCKET)
+ return -EOPNOTSUPP;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+
+ err = do_sock_getsockopt(sock, compat, level, optname,
+ USER_SOCKPTR(optval),
+ KERNEL_SOCKPTR(&optlen));
+ if (err)
+ return err;
+
+ /* On success, return optlen */
+ return optlen;
+}
+
+static inline int io_uring_cmd_setsockopt(struct socket *sock,
+ struct io_uring_cmd *cmd,
+ unsigned int issue_flags)
+{
+ const struct io_uring_sqe *sqe = cmd->sqe;
+ bool compat = !!(issue_flags & IO_URING_F_COMPAT);
+ int optname, optlen, level;
+ void __user *optval;
+ sockptr_t optval_s;
+
+ optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
+ optname = READ_ONCE(sqe->optname);
+ optlen = READ_ONCE(sqe->optlen);
+ level = READ_ONCE(sqe->level);
+ optval_s = USER_SOCKPTR(optval);
+
+ return do_sock_setsockopt(sock, compat, level, optname, optval_s,
+ optlen);
+}
+
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct socket *sock = cmd->file->private_data;
+ struct sock *sk = sock->sk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ int ret, arg = 0;
+
+ if (!prot || !prot->ioctl)
+ return -EOPNOTSUPP;
+
+ switch (cmd->cmd_op) {
+ case SOCKET_URING_OP_SIOCINQ:
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_GETSOCKOPT:
+ return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
+ case SOCKET_URING_OP_SETSOCKOPT:
+ return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
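
(Illustrative userspace sketch, not part of the patch.) The new io_uring_cmd_sock() handler above services SOCKET_URING_OP_* commands issued through IORING_OP_URING_CMD on a socket fd; only SOL_SOCKET is accepted for get/setsockopt, and on success the getsockopt path returns the option length in the CQE result. A minimal caller, assuming liburing's io_uring_prep_cmd_sock() helper:

/* Hypothetical example -- assumes liburing provides io_uring_prep_cmd_sock(). */
#include <liburing.h>
#include <sys/socket.h>
#include <stdio.h>

static int query_rcvbuf(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int val = 0, res;

	/* SOL_SOCKET is the only level the kernel-side handler accepts */
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_GETSOCKOPT, sockfd,
			       SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
	io_uring_submit(ring);
	io_uring_wait_cqe(ring, &cqe);
	res = cqe->res;		/* optlen on success, -errno on failure */
	io_uring_cqe_seen(ring, cqe);
	if (res >= 0)
		printf("SO_RCVBUF=%d (optlen %d)\n", val, res);
	return res;
}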
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 6d2c48ba1923..8d4610246ba0 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index 100d5da94cb9..78f8ab7db104 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
io_eventfd_put(ev_fd);
}
-static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
-{
- if (put_ref)
- io_eventfd_put(ev_fd);
- rcu_read_unlock();
-}
-
/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
@@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
- * an async worker. If ev_fd isn't valid, obviously return false.
+ * an async worker.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
- if (ev_fd)
- return !ev_fd->eventfd_async || io_wq_current_is_worker();
- return false;
+ return !ev_fd->eventfd_async || io_wq_current_is_worker();
}
-/*
- * On success, returns with an ev_fd reference grabbed and the RCU read
- * lock held.
- */
-static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
+ bool skip = false;
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return NULL;
-
- rcu_read_lock();
+ return;
- /*
- * rcu_dereference ctx->io_ev_fd once and use it for both for checking
- * and eventfd_signal
- */
+ guard(rcu)();
ev_fd = rcu_dereference(ctx->io_ev_fd);
-
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
- if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
- return ev_fd;
-
- rcu_read_unlock();
- return NULL;
-}
-
-void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd)
- io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
-}
-
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
- struct io_ev_fd *ev_fd;
-
- ev_fd = io_eventfd_grab(ctx);
- if (ev_fd) {
- bool skip, put_ref = true;
+ if (!ev_fd)
+ return;
+ if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs))
+ return;
+ if (cqe_event) {
/*
* Eventfd should only get triggered when at least one event
* has been posted. Some applications rely on the eventfd
@@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
-
- if (!skip)
- put_ref = __io_eventfd_signal(ev_fd);
-
- io_eventfd_release(ev_fd, put_ref);
}
+
+ if (skip || __io_eventfd_signal(ev_fd))
+ io_eventfd_put(ev_fd);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h
index d394f49c6321..e2f1985c2cf9 100644
--- a/io_uring/eventfd.h
+++ b/io_uring/eventfd.h
@@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
-void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
-void io_eventfd_signal(struct io_ring_ctx *ctx);
+void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event);
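
(Background sketch, not part of the patch.) The eventfd changes above fold io_eventfd_flush_signal() into io_eventfd_signal(ctx, cqe_event); the userspace side is unchanged: register an eventfd with the ring and read an 8-byte counter to learn that completions were posted.

/* Minimal sketch: pair an eventfd with a ring for completion notification. */
#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>
#include <stdint.h>

static int setup_cq_eventfd(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* the kernel signals efd whenever new CQEs are made visible */
	if (io_uring_register_eventfd(ring, efd)) {
		close(efd);
		return -1;
	}
	return efd;
}

static void wait_for_cqes(int efd)
{
	uint64_t count;

	/* blocks until signalled; count accumulates the number of signals */
	if (read(efd, &count, sizeof(count)) != sizeof(count))
		count = 0;
}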
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 9414ca6d101c..e9355276ab5d 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -15,37 +15,6 @@
#include "cancel.h"
#include "rsrc.h"
-#ifdef CONFIG_PROC_FS
-static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
- const struct cred *cred)
-{
- struct user_namespace *uns = seq_user_ns(m);
- struct group_info *gi;
- kernel_cap_t cap;
- int g;
-
- seq_printf(m, "%5d\n", id);
- seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid));
- seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid));
- seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid));
- seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid));
- seq_puts(m, "\n\tGroups:\t");
- gi = cred->group_info;
- for (g = 0; g < gi->ngroups; g++) {
- seq_put_decimal_ull(m, g ? " " : "",
- from_kgid_munged(uns, gi->gid[g]));
- }
- seq_puts(m, "\n\tCapEff:\t");
- cap = cred->cap_effective;
- seq_put_hex_ll(m, NULL, cap.val, 16);
- seq_putc(m, '\n');
- return 0;
-}
-
#ifdef CONFIG_NET_RX_BUSY_POLL
static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m,
@@ -86,13 +55,8 @@ static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
}
#endif
-/*
- * Caller holds a reference to the file already, we don't need to do
- * anything else to get an extra reference.
- */
-__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
+static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
- struct io_ring_ctx *ctx = file->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
struct rusage sq_usage;
@@ -106,7 +70,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
u64 sq_total_time = 0, sq_work_time = 0;
- bool has_lock;
unsigned int i;
if (ctx->flags & IORING_SETUP_CQE32)
@@ -176,15 +139,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "\n");
}
- /*
- * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
- * since fdinfo case grabs it in the opposite direction of normal use
- * cases. If we fail to get the lock, we just don't iterate any
- * structures that could be going away outside the io_uring mutex.
- */
- has_lock = mutex_trylock(&ctx->uring_lock);
-
- if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sq = ctx->sq_data;
/*
@@ -206,7 +161,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
- for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
+ for (i = 0; i < ctx->file_table.data.nr; i++) {
struct file *f = NULL;
if (ctx->file_table.data.nodes[i])
@@ -218,7 +173,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
}
}
seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
- for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+ for (i = 0; i < ctx->buf_table.nr; i++) {
struct io_mapped_ubuf *buf = NULL;
if (ctx->buf_table.nodes[i])
@@ -228,17 +183,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
else
seq_printf(m, "%5u: <none>\n", i);
}
- if (has_lock && !xa_empty(&ctx->personalities)) {
- unsigned long index;
- const struct cred *cred;
-
- seq_printf(m, "Personalities:\n");
- xa_for_each(&ctx->personalities, index, cred)
- io_uring_show_cred(m, index, cred);
- }
seq_puts(m, "PollList:\n");
- for (i = 0; has_lock && i < (1U << ctx->cancel_table.hash_bits); i++) {
+ for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) {
struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
struct io_kiocb *req;
@@ -247,9 +194,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
task_work_pending(req->tctx->task));
}
- if (has_lock)
- mutex_unlock(&ctx->uring_lock);
-
seq_puts(m, "CqOverflowList:\n");
spin_lock(&ctx->completion_lock);
list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) {
@@ -262,4 +206,22 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
spin_unlock(&ctx->completion_lock);
napi_show_fdinfo(ctx, m);
}
-#endif
+
+/*
+ * Caller holds a reference to the file already, we don't need to do
+ * anything else to get an extra reference.
+ */
+__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
+{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ /*
+ * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+ * since fdinfo case grabs it in the opposite direction of normal use
+ * cases.
+ */
+ if (mutex_trylock(&ctx->uring_lock)) {
+ __io_uring_show_fdinfo(ctx, m);
+ mutex_unlock(&ctx->uring_lock);
+ }
+}
diff --git a/io_uring/fs.c b/io_uring/fs.c
index eccea851dd5a..37079a414eab 100644
--- a/io_uring/fs.c
+++ b/io_uring/fs.c
@@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_renameat_cleanup(struct io_kiocb *req)
@@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_unlinkat_cleanup(struct io_kiocb *req)
@@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_mkdirat_cleanup(struct io_kiocb *req)
@@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
req->flags &= ~REQ_F_NEED_CLEANUP;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_link_cleanup(struct io_kiocb *req)
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 0ea4820cd8ff..fa374afbaa51 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -234,7 +234,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
kfree(futexv);
req->async_data = NULL;
req->flags &= ~REQ_F_ASYNC_DATA;
- return IOU_OK;
+ return IOU_COMPLETE;
}
/*
@@ -273,7 +273,6 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
struct io_ring_ctx *ctx = req->ctx;
struct io_futex_data *ifd = NULL;
- struct futex_hash_bucket *hb;
int ret;
if (!iof->futex_mask) {
@@ -295,12 +294,11 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
ifd->req = req;
ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags,
- &ifd->q, &hb);
+ &ifd->q, NULL, NULL);
if (!ret) {
hlist_add_head(&req->hash_node, &ctx->futex_list);
io_ring_submit_unlock(ctx, issue_flags);
- futex_queue(&ifd->q, hb, NULL);
return IOU_ISSUE_SKIP_COMPLETE;
}
@@ -311,7 +309,7 @@ done:
req_set_fail(req);
io_req_set_res(req, ret, 0);
kfree(ifd);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
@@ -328,5 +326,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 04a75d666195..cd1fcb115739 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -114,9 +114,6 @@ enum {
struct io_wq {
unsigned long state;
- free_work_fn *free_work;
- io_wq_work_fn *do_work;
-
struct io_wq_hash *hash;
atomic_t worker_refs;
@@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq,
static void create_worker_cb(struct callback_head *cb);
static void io_wq_cancel_tw_create(struct io_wq *wq);
+static inline unsigned int __io_get_work_hash(unsigned int work_flags)
+{
+ return work_flags >> IO_WQ_HASH_SHIFT;
+}
+
+static inline unsigned int io_get_work_hash(struct io_wq_work *work)
+{
+ return __io_get_work_hash(atomic_read(&work->flags));
+}
+
static bool io_worker_get(struct io_worker *worker)
{
return refcount_inc_not_zero(&worker->ref);
@@ -412,6 +419,30 @@ fail:
return false;
}
+/* Defer if current and next work are both hashed to the same chain */
+static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct)
+{
+ unsigned int hash, work_flags;
+ struct io_wq_work *next;
+
+ lockdep_assert_held(&acct->lock);
+
+ work_flags = atomic_read(&work->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+
+ /* should not happen, io_acct_run_queue() said we had work */
+ if (wq_list_empty(&acct->work_list))
+ return true;
+
+ hash = __io_get_work_hash(work_flags);
+ next = container_of(acct->work_list.first, struct io_wq_work, list);
+ work_flags = atomic_read(&next->flags);
+ if (!__io_wq_is_hashed(work_flags))
+ return false;
+ return hash == __io_get_work_hash(work_flags);
+}
+
static void io_wq_dec_running(struct io_worker *worker)
{
struct io_wq_acct *acct = io_wq_get_acct(worker);
@@ -422,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker)
if (!atomic_dec_and_test(&acct->nr_running))
return;
+ if (!worker->cur_work)
+ return;
if (!io_acct_run_queue(acct))
return;
+ if (io_wq_hash_defer(worker->cur_work, acct)) {
+ raw_spin_unlock(&acct->lock);
+ return;
+ }
raw_spin_unlock(&acct->lock);
atomic_inc(&acct->nr_running);
@@ -457,16 +494,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker)
}
}
-static inline unsigned int __io_get_work_hash(unsigned int work_flags)
-{
- return work_flags >> IO_WQ_HASH_SHIFT;
-}
-
-static inline unsigned int io_get_work_hash(struct io_wq_work *work)
-{
- return __io_get_work_hash(atomic_read(&work->flags));
-}
-
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
{
bool ret = false;
@@ -612,10 +639,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
if (do_kill &&
(work_flags & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
+ io_wq_submit_work(work);
io_assign_current_work(worker, NULL);
- linked = wq->free_work(work);
+ linked = io_wq_free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -934,8 +961,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
- wq->do_work(work);
- work = wq->free_work(work);
+ io_wq_submit_work(work);
+ work = io_wq_free_work(work);
} while (work);
}
@@ -1195,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
int ret, i;
struct io_wq *wq;
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
- return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(!bounded))
return ERR_PTR(-EINVAL);
@@ -1206,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
refcount_inc(&data->hash->refs);
wq->hash = data->hash;
- wq->free_work = data->free_work;
- wq->do_work = data->do_work;
ret = -ENOMEM;
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index d4fb2940e435..774abab54732 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,9 +21,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work *);
-
struct io_wq_hash {
refcount_t refs;
unsigned long map;
@@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash)
struct io_wq_data {
struct io_wq_hash *hash;
struct task_struct *task;
- io_wq_work_fn *do_work;
- free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 541e65a1eebf..c7a9cecf528e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
struct io_defer_entry {
struct list_head list;
struct io_kiocb *req;
- u32 seq;
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
@@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->tctx_list);
ctx->submit_state.free_list.next = NULL;
INIT_HLIST_HEAD(&ctx->waitid_list);
+ xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
#ifdef CONFIG_FUTEX
INIT_HLIST_HEAD(&ctx->futex_list);
#endif
@@ -380,25 +381,6 @@ err:
return NULL;
}
-static void io_account_cq_overflow(struct io_ring_ctx *ctx)
-{
- struct io_rings *r = ctx->rings;
-
- WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
- ctx->cq_extra--;
-}
-
-static bool req_need_defer(struct io_kiocb *req, u32 seq)
-{
- if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
- struct io_ring_ctx *ctx = req->ctx;
-
- return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
- }
-
- return false;
-}
-
static void io_clean_op(struct io_kiocb *req)
{
if (unlikely(req->flags & REQ_F_BUFFER_SELECTED))
@@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
+static unsigned io_linked_nr(struct io_kiocb *req)
+{
+ struct io_kiocb *tmp;
+ unsigned nr = 0;
+
+ io_for_each_link(tmp, req)
+ nr++;
+ return nr;
+}
+
static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
- spin_lock(&ctx->completion_lock);
+ bool drain_seen = false, first = true;
+
+ lockdep_assert_held(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
+
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
- if (req_need_defer(de->req, de->seq))
- break;
+ drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
+ if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+ return;
+
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue(de->req);
kfree(de);
+ first = false;
}
- spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
if (ctx->has_evfd)
- io_eventfd_flush_signal(ctx);
+ io_eventfd_signal(ctx, true);
}
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
@@ -636,6 +633,7 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
* to care for a non-real case.
*/
if (need_resched()) {
+ ctx->cqe_sentinel = ctx->cqe_cached;
io_cq_unlock_post(ctx);
mutex_unlock(&ctx->uring_lock);
cond_resched();
@@ -700,27 +698,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
}
-static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags, u64 extra1, u64 extra2)
+static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
+ struct io_overflow_cqe *ocqe)
{
- struct io_overflow_cqe *ocqe;
- size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
-
lockdep_assert_held(&ctx->completion_lock);
- if (is_cqe32)
- ocq_size += sizeof(struct io_uring_cqe);
-
- ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
- trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
+ struct io_rings *r = ctx->rings;
+
/*
* If we're in ring overflow flush mode, or in task cancel mode,
* or cannot allocate an overflow entry, then we need to drop it
* on the floor.
*/
- io_account_cq_overflow(ctx);
+ WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false;
}
@@ -729,23 +720,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags);
}
- ocqe->cqe.user_data = user_data;
- ocqe->cqe.res = res;
- ocqe->cqe.flags = cflags;
- if (is_cqe32) {
- ocqe->cqe.big_cqe[0] = extra1;
- ocqe->cqe.big_cqe[1] = extra2;
- }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
-static void io_req_cqe_overflow(struct io_kiocb *req)
+static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe, gfp_t gfp)
{
- io_cqring_event_overflow(req->ctx, req->cqe.user_data,
- req->cqe.res, req->cqe.flags,
- req->big_cqe.extra1, req->big_cqe.extra2);
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
+ struct io_overflow_cqe *ocqe;
+ size_t ocq_size = sizeof(struct io_overflow_cqe);
+ bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+
+ if (is_cqe32)
+ ocq_size += sizeof(struct io_uring_cqe);
+
+ ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
+ trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
+ if (ocqe) {
+ ocqe->cqe.user_data = cqe->user_data;
+ ocqe->cqe.res = cqe->res;
+ ocqe->cqe.flags = cqe->flags;
+ if (is_cqe32 && big_cqe) {
+ ocqe->cqe.big_cqe[0] = big_cqe->extra1;
+ ocqe->cqe.big_cqe[1] = big_cqe->extra2;
+ }
+ }
+ if (big_cqe)
+ big_cqe->extra1 = big_cqe->extra2 = 0;
+ return ocqe;
}
/*
@@ -790,13 +793,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
{
struct io_uring_cqe *cqe;
- ctx->cq_extra++;
-
- /*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
- */
if (likely(io_get_cqe(ctx, &cqe))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
@@ -813,14 +809,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
+static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags)
+{
+ return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags };
+}
+
+static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL);
+ spin_lock(&ctx->completion_lock);
+ io_cqring_add_overflow(ctx, ocqe);
+ spin_unlock(&ctx->completion_lock);
+}
+
+static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx,
+ struct io_cqe *cqe,
+ struct io_big_cqe *big_cqe)
+{
+ struct io_overflow_cqe *ocqe;
+
+ ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC);
+ return io_cqring_add_overflow(ctx, ocqe);
+}
+
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
- if (!filled)
- filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
+ if (unlikely(!filled)) {
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ filled = io_cqe_overflow_locked(ctx, &cqe, NULL);
+ }
io_cq_unlock_post(ctx);
return filled;
}
@@ -831,10 +856,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert(ctx->lockless_cq);
+
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
- spin_lock(&ctx->completion_lock);
- io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
- spin_unlock(&ctx->completion_lock);
+ struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
+
+ io_cqe_overflow(ctx, &cqe, NULL);
}
ctx->submit_state.cq_flush = true;
}
@@ -924,22 +952,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
}
/*
- * Don't initialise the fields below on every allocation, but do that in
- * advance and keep them valid across allocations.
- */
-static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
-{
- req->ctx = ctx;
- req->buf_node = NULL;
- req->file_node = NULL;
- req->link = NULL;
- req->async_data = NULL;
- /* not necessary, but safer to zero */
- memset(&req->cqe, 0, sizeof(req->cqe));
- memset(&req->big_cqe, 0, sizeof(req->big_cqe));
-}
-
-/*
* A request might get retired back into the request caches even before opcode
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
* Because of that, io_alloc_req() should be called only under ->uring_lock
@@ -948,7 +960,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
- gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
+ gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO;
void *reqs[IO_REQ_ALLOC_BATCH];
int ret;
@@ -966,10 +978,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
}
percpu_ref_get_many(&ctx->refs, ret);
+ ctx->nr_req_allocated += ret;
+
while (ret--) {
struct io_kiocb *req = reqs[ret];
- io_preinit_req(req, ctx);
io_req_add_to_cache(req, ctx);
}
return true;
@@ -1191,7 +1204,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (ctx->has_evfd)
- io_eventfd_signal(ctx);
+ io_eventfd_signal(ctx, false);
}
nr_wait = atomic_read(&ctx->cq_wait_nr);
@@ -1383,6 +1396,16 @@ void io_queue_next(struct io_kiocb *req)
io_req_task_queue(nxt);
}
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
+{
+ if (req->file_node) {
+ io_put_rsrc_node(req->ctx, req->file_node);
+ req->file_node = NULL;
+ }
+ if (req->flags & REQ_F_BUF_NODE)
+ io_put_rsrc_node(req->ctx, req->buf_node);
+}
+
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
@@ -1443,13 +1466,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
*/
if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
- if (ctx->lockless_cq) {
- spin_lock(&ctx->completion_lock);
- io_req_cqe_overflow(req);
- spin_unlock(&ctx->completion_lock);
- } else {
- io_req_cqe_overflow(req);
- }
+ if (ctx->lockless_cq)
+ io_cqe_overflow(ctx, &req->cqe, &req->big_cqe);
+ else
+ io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe);
}
}
__io_cq_unlock_post(ctx);
@@ -1458,6 +1478,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
+
+ if (unlikely(ctx->drain_active))
+ io_queue_deferred(ctx);
+
ctx->submit_state.cq_flush = false;
}
@@ -1645,56 +1669,28 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-static u32 io_get_sequence(struct io_kiocb *req)
-{
- u32 seq = req->ctx->cached_sq_head;
- struct io_kiocb *cur;
-
- /* need original cached_sq_head, but it was increased for each req */
- io_for_each_link(cur, req)
- seq--;
- return seq;
-}
-
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{
struct io_ring_ctx *ctx = req->ctx;
+ bool drain = req->flags & IOSQE_IO_DRAIN;
struct io_defer_entry *de;
- int ret;
- u32 seq = io_get_sequence(req);
-
- /* Still need defer if there is pending req in defer list. */
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
-queue:
- ctx->drain_active = false;
- io_req_task_queue(req);
- return;
- }
- spin_unlock(&ctx->completion_lock);
- io_prep_async_link(req);
- de = kmalloc(sizeof(*de), GFP_KERNEL);
+ de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
if (!de) {
- ret = -ENOMEM;
- io_req_defer_failed(req, ret);
+ io_req_defer_failed(req, -ENOMEM);
return;
}
- spin_lock(&ctx->completion_lock);
- if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
- spin_unlock(&ctx->completion_lock);
- kfree(de);
- goto queue;
- }
-
+ io_prep_async_link(req);
trace_io_uring_defer(req);
de->req = req;
- de->seq = seq;
+
+ ctx->nr_drained += io_linked_nr(req);
list_add_tail(&de->list, &ctx->defer_list);
- spin_unlock(&ctx->completion_lock);
+ io_queue_deferred(ctx);
+ if (!drain && list_empty(&ctx->defer_list))
+ ctx->drain_active = false;
}
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -1756,7 +1752,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_issue_sqe(req, issue_flags, def);
- if (ret == IOU_OK) {
+ if (ret == IOU_COMPLETE) {
if (issue_flags & IO_URING_F_COMPLETE_DEFER)
io_req_complete_defer(req);
else
@@ -1815,7 +1811,7 @@ void io_wq_submit_work(struct io_wq_work *work)
bool needs_poll = false;
int ret = 0, err = -ECANCELED;
- /* one will be dropped by ->io_wq_free_work() after returning to io-wq */
+ /* one will be dropped by io_wq_free_work() after returning to io-wq */
if (!(req->flags & REQ_F_REFCOUNT))
__io_req_set_refcount(req, 2);
else
@@ -1913,7 +1909,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
if (node) {
- io_req_assign_rsrc_node(&req->file_node, node);
+ node->refs++;
+ req->file_node = node;
req->flags |= io_slot_flags(node);
file = io_slot_file(node);
}
@@ -2046,7 +2043,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
int personality;
u8 opcode;
- /* req is partially pre-initialised, see io_preinit_req() */
+ req->ctx = ctx;
req->opcode = opcode = READ_ONCE(sqe->opcode);
/* same numerical values with corresponding REQ_F_*, safe to copy */
sqe_flags = READ_ONCE(sqe->flags);
@@ -2277,10 +2274,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
(!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
head = READ_ONCE(ctx->sq_array[head]);
if (unlikely(head >= ctx->sq_entries)) {
- /* drop invalid entries */
- spin_lock(&ctx->completion_lock);
- ctx->cq_extra--;
- spin_unlock(&ctx->completion_lock);
WRITE_ONCE(ctx->rings->sq_dropped,
READ_ONCE(ctx->rings->sq_dropped) + 1);
return false;
@@ -2698,21 +2691,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
return off;
}
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
{
struct io_kiocb *req;
int nr = 0;
- mutex_lock(&ctx->uring_lock);
-
while (!io_req_cache_empty(ctx)) {
req = io_extract_req(ctx);
kmem_cache_free(req_cachep, req);
nr++;
}
- if (nr)
+ if (nr) {
+ ctx->nr_req_allocated -= nr;
percpu_ref_put_many(&ctx->refs, nr);
- mutex_unlock(&ctx->uring_lock);
+ }
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+ guard(mutex)(&ctx->uring_lock);
+ __io_req_caches_free(ctx);
}
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -2748,6 +2746,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
percpu_ref_exit(&ctx->refs);
free_uid(ctx->user);
io_req_caches_free(ctx);
+
+ WARN_ON_ONCE(ctx->nr_req_allocated);
+
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
io_napi_free(ctx);
@@ -2882,7 +2883,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
io_cqring_overflow_kill(ctx);
mutex_unlock(&ctx->uring_lock);
}
- if (ctx->ifq) {
+ if (!xa_empty(&ctx->zcrx_ctxs)) {
mutex_lock(&ctx->uring_lock);
io_shutdown_zcrx_ifqs(ctx);
mutex_unlock(&ctx->uring_lock);
@@ -3014,20 +3015,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct io_defer_entry *de;
LIST_HEAD(list);
- spin_lock(&ctx->completion_lock);
list_for_each_entry_reverse(de, &ctx->defer_list, list) {
if (io_match_task_safe(de->req, tctx, cancel_all)) {
list_cut_position(&list, &ctx->defer_list, &de->list);
break;
}
}
- spin_unlock(&ctx->completion_lock);
if (list_empty(&list))
return false;
while (!list_empty(&list)) {
de = list_first_entry(&list, struct io_defer_entry, list);
list_del_init(&de->list);
+ ctx->nr_drained -= io_linked_nr(de->req);
io_req_task_queue_fail(de->req, -ECANCELED);
kfree(de);
}
@@ -3102,8 +3102,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
io_allowed_defer_tw_run(ctx))
ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
- ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
mutex_lock(&ctx->uring_lock);
+ ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
ret |= io_poll_remove_all(ctx, tctx, cancel_all);
ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
ret |= io_futex_remove_all(ctx, tctx, cancel_all);
@@ -3913,6 +3913,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+ BUILD_BUG_SQE_ELEM(44, __u8, write_stream);
+ BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
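
(Refresher sketch, not part of the patch.) The drain rework above replaces the per-request CQ sequence bookkeeping (io_get_sequence()/req_need_defer()) with nr_drained/nr_req_allocated counters checked from io_queue_deferred(); the userspace contract for IOSQE_IO_DRAIN is unchanged:

/* IOSQE_IO_DRAIN: this SQE only starts once all earlier SQEs have completed. */
#include <liburing.h>

static void submit_fsync_barrier(struct io_uring *ring, int fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_fsync(sqe, fd, 0);
	/* acts as an ordering barrier against previously submitted requests */
	io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
	io_uring_submit(ring);
}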
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e4050b2d0821..0ea7a435d1de 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -19,7 +19,6 @@
#endif
enum {
- IOU_OK = 0, /* deprecated, use IOU_COMPLETE */
IOU_COMPLETE = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@@ -196,7 +195,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
{
io_lockdep_assert_cq_locked(ctx);
- ctx->cq_extra++;
ctx->submit_state.cq_flush = true;
return io_get_cqe(ctx, cqe_ret);
}
@@ -414,7 +412,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ if (unlikely(ctx->off_timeout_used ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 953d5e742569..8cce3ebd813f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req)
{
if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED)))
return;
- req->buf_index = req->kbuf->bgid;
req->flags &= ~REQ_F_BUFFER_SELECTED;
kfree(req->kbuf);
req->kbuf = NULL;
@@ -110,7 +109,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
bl = io_buffer_get_list(ctx, buf->bgid);
list_add(&buf->list, &bl->buf_list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- req->buf_index = buf->bgid;
io_ring_submit_unlock(ctx, issue_flags);
return true;
@@ -193,7 +191,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags)
+ unsigned buf_group, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_buffer_list *bl;
@@ -201,7 +199,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
io_ring_submit_lock(req->ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, buf_group);
if (likely(bl)) {
if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
@@ -302,7 +300,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
int ret = -ENOENT;
io_ring_submit_lock(ctx, issue_flags);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
goto out_unlock;
@@ -335,7 +333,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
lockdep_assert_held(&ctx->uring_lock);
- bl = io_buffer_get_list(ctx, req->buf_index);
+ bl = io_buffer_get_list(ctx, arg->buf_group);
if (unlikely(!bl))
return -ENOENT;
@@ -355,10 +353,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
struct io_buffer_list *bl = req->buf_list;
bool ret = true;
- if (bl) {
+ if (bl)
ret = io_kbuf_commit(req, bl, len, nr);
- req->buf_index = bl->bgid;
- }
+
req->flags &= ~REQ_F_BUFFER_RING;
return ret;
}
@@ -379,45 +376,33 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs)
return ret;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx,
- struct io_buffer_list *bl, unsigned nbufs)
+static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl,
+ unsigned long nbufs)
{
- unsigned i = 0;
-
- /* shouldn't happen */
- if (!nbufs)
- return 0;
-
- if (bl->flags & IOBL_BUF_RING) {
- i = bl->buf_ring->tail - bl->head;
- io_free_region(ctx, &bl->region);
- /* make sure it's seen as empty */
- INIT_LIST_HEAD(&bl->buf_list);
- bl->flags &= ~IOBL_BUF_RING;
- return i;
- }
+ unsigned long i = 0;
+ struct io_buffer *nxt;
/* protects io_buffers_cache */
lockdep_assert_held(&ctx->uring_lock);
+ WARN_ON_ONCE(bl->flags & IOBL_BUF_RING);
- while (!list_empty(&bl->buf_list)) {
- struct io_buffer *nxt;
-
+ for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) {
nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
kfree(nxt);
-
- if (++i == nbufs)
- return i;
cond_resched();
}
-
return i;
}
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- __io_remove_buffers(ctx, bl, -1U);
+ if (bl->flags & IOBL_BUF_RING)
+ io_free_region(ctx, &bl->region);
+ else
+ io_remove_buffers_legacy(ctx, bl, -1U);
+
kfree(bl);
}
@@ -465,30 +450,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return 0;
}
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
-{
- struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
-
- ret = -ENOENT;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (bl) {
- ret = -EINVAL;
- /* can't use provide/remove buffers command on mapped buffers */
- if (!(bl->flags & IOBL_BUF_RING))
- ret = __io_remove_buffers(ctx, bl, p->nbufs);
- }
- io_ring_submit_unlock(ctx, issue_flags);
- if (ret < 0)
- req_set_fail(req);
- io_req_set_res(req, ret, 0);
- return IOU_OK;
-}
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
unsigned long size, tmp_check;
@@ -512,8 +473,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EOVERFLOW;
if (check_add_overflow((unsigned long)p->addr, size, &tmp_check))
return -EOVERFLOW;
-
- size = (unsigned long)p->len * p->nbufs;
if (!access_ok(u64_to_user_ptr(p->addr), size))
return -EFAULT;
@@ -552,49 +511,56 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
return i ? 0 : -ENOMEM;
}
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
+static int __io_manage_buffers_legacy(struct io_kiocb *req,
+ struct io_buffer_list *bl)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
- struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer_list *bl;
- int ret = 0;
-
- io_ring_submit_lock(ctx, issue_flags);
+ int ret;
- bl = io_buffer_get_list(ctx, p->bgid);
- if (unlikely(!bl)) {
+ if (!bl) {
+ if (req->opcode != IORING_OP_PROVIDE_BUFFERS)
+ return -ENOENT;
bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
- if (!bl) {
- ret = -ENOMEM;
- goto err;
- }
+ if (!bl)
+ return -ENOMEM;
+
INIT_LIST_HEAD(&bl->buf_list);
- ret = io_buffer_add_list(ctx, bl, p->bgid);
+ ret = io_buffer_add_list(req->ctx, bl, p->bgid);
if (ret) {
kfree(bl);
- goto err;
+ return ret;
}
}
- /* can't add buffers via this command for a mapped buffer ring */
- if (bl->flags & IOBL_BUF_RING) {
- ret = -EINVAL;
- goto err;
- }
+ /* can't use provide/remove buffers command on mapped buffers */
+ if (bl->flags & IOBL_BUF_RING)
+ return -EINVAL;
+ if (req->opcode == IORING_OP_PROVIDE_BUFFERS)
+ return io_add_buffers(req->ctx, p, bl);
+ return io_remove_buffers_legacy(req->ctx, bl, p->nbufs);
+}
+
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ int ret;
- ret = io_add_buffers(ctx, p, bl);
-err:
+ io_ring_submit_lock(ctx, issue_flags);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ ret = __io_manage_buffers_legacy(req, bl);
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
- struct io_buffer_list *bl, *free_bl = NULL;
+ struct io_buffer_list *bl;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
@@ -605,8 +571,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
-
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
@@ -624,7 +589,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
io_destroy_bl(ctx, bl);
}
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
if (!bl)
return -ENOMEM;
@@ -669,7 +634,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return 0;
fail:
io_free_region(ctx, &bl->region);
- kfree(free_bl);
+ kfree(bl);
return ret;
}
@@ -682,9 +647,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
- if (reg.resv[0] || reg.resv[1] || reg.resv[2])
- return -EINVAL;
- if (reg.flags)
+ if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
@@ -704,14 +667,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_status buf_status;
struct io_buffer_list *bl;
- int i;
if (copy_from_user(&buf_status, arg, sizeof(buf_status)))
return -EFAULT;
-
- for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++)
- if (buf_status.resv[i])
- return -EINVAL;
+ if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv)))
+ return -EINVAL;
bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl)
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 2ec0b983ce24..4d2c209d1a41 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,20 +55,19 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
+ unsigned buf_group;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
- unsigned int issue_flags);
+ unsigned buf_group, unsigned int issue_flags);
int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
unsigned int issue_flags);
int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
void io_destroy_buffers(struct io_ring_ctx *ctx);
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags);
-
int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
+int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags);
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
@@ -94,7 +93,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
- req->buf_index = req->buf_list->bgid;
req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
return true;
}
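
(Illustrative sketch, not part of the patch.) With the kbuf changes, IORING_OP_PROVIDE_BUFFERS and IORING_OP_REMOVE_BUFFERS both issue through io_manage_buffers_legacy(), and buffer selection now passes the group id explicitly instead of reading req->buf_index. From userspace the classic provided-buffer flow is unchanged:

/* Sketch: legacy provided buffers plus a buffer-select recv. */
#include <liburing.h>

#define BGID	7
#define NBUFS	8
#define BUFLEN	4096

static int provide_group(struct io_uring *ring, void *base)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	/* hand NBUFS buffers of BUFLEN bytes to group BGID, buffer ids 0.. */
	io_uring_prep_provide_buffers(sqe, base, BUFLEN, NBUFS, BGID, 0);
	return io_uring_submit(ring);
}

static void queue_recv_select(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_recv(sqe, sockfd, NULL, BUFLEN, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_BUFFER_SELECT);
	sqe->buf_group = BGID;	/* kernel picks a free buffer from this group */
}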
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 76fcc79656b0..725dc0bec24c 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -13,6 +13,7 @@
#include "memmap.h"
#include "kbuf.h"
#include "rsrc.h"
+#include "zcrx.h"
static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
size_t size, gfp_t gfp)
@@ -116,7 +117,7 @@ static int io_region_init_ptr(struct io_mapped_region *mr)
void *ptr;
if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
- if (ifd.nr_folios == 1) {
+ if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) {
mr->ptr = page_address(mr->pages[0]);
return 0;
}
@@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
loff_t pgoff)
{
loff_t offset = pgoff << PAGE_SHIFT;
- unsigned int bgid;
+ unsigned int id;
+
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
@@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
case IORING_OFF_SQES:
return &ctx->sq_region;
case IORING_OFF_PBUF_RING:
- bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- return io_pbuf_get_region(ctx, bgid);
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+ return io_pbuf_get_region(ctx, id);
case IORING_MAP_OFF_PARAM_REGION:
return &ctx->param_region;
case IORING_MAP_OFF_ZCRX_REGION:
- return &ctx->zcrx_region;
+ id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT;
+ return io_zcrx_get_region(ctx, id);
}
return NULL;
}
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index dad0aa5b1b45..08419684e4bc 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -4,7 +4,9 @@
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
+#define IORING_OFF_ZCRX_SHIFT 16
+
+struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 50a958e9c921..71400d6cefc8 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -328,7 +328,7 @@ done:
req_set_fail(req);
}
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
diff --git a/io_uring/net.c b/io_uring/net.c
index 24040bc3916a..d13f3e8f6c72 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -18,7 +18,6 @@
#include "rsrc.h"
#include "zcrx.h"
-#if defined(CONFIG_NET)
struct io_shutdown {
struct file *file;
int how;
@@ -129,7 +128,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
ret = __sys_shutdown_sock(sock, shutdown->how);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static bool io_net_retry(struct socket *sock, int flags)
@@ -190,7 +189,6 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
sr->done_io = 0;
sr->retry = false;
sr->len = 0; /* get from the provided buffer */
- req->buf_index = sr->buf_group;
}
static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg,
@@ -359,15 +357,13 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
kmsg->msg.msg_name = &kmsg->addr;
kmsg->msg.msg_namelen = addr_len;
}
- if (sr->flags & IORING_RECVSEND_FIXED_BUF)
+ if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
+ req->flags |= REQ_F_IMPORT_BUFFER;
return 0;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret < 0))
- return ret;
}
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter);
}
static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,13 +405,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
if (sr->msg_flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ sr->buf_group = req->buf_index;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
if (req->opcode == IORING_OP_SENDMSG)
return -EINVAL;
- if (!(req->flags & REQ_F_BUFFER_SELECT))
- return -EINVAL;
sr->msg_flags |= MSG_WAITALL;
- sr->buf_group = req->buf_index;
req->buf_list = NULL;
req->flags |= REQ_F_MULTISHOT;
}
@@ -507,7 +502,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
/* Otherwise stop bundle and use the current result. */
finish:
io_req_set_res(req, *ret, cflags);
- *ret = IOU_OK;
+ *ret = IOU_COMPLETE;
return true;
}
@@ -558,7 +553,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
else if (sr->done_io)
ret = sr->done_io;
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
@@ -571,6 +566,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags,
.iovs = &kmsg->fast_iov,
.max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -723,7 +719,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr *kmsg;
- int ret;
kmsg = io_msg_alloc_async(req);
if (unlikely(!kmsg))
@@ -739,13 +734,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
kmsg->msg.msg_iocb = NULL;
kmsg->msg.msg_ubuf = NULL;
- if (!io_do_buffer_select(req)) {
- ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
- &kmsg->msg.msg_iter);
- if (unlikely(ret))
- return ret;
- }
- return 0;
+ if (req->flags & REQ_F_BUFFER_SELECT)
+ return 0;
+ return import_ubuf(ITER_DEST, sr->buf, sr->len,
+ &kmsg->msg.msg_iter);
}
return io_recvmsg_copy_hdr(req, kmsg);
@@ -827,18 +819,24 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
+ size_t this_ret = *ret - sr->done_io;
+
+ cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
if (sr->retry)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
- /* if more is available, retry and append to this one */
- if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) {
+ /*
+ * If more is available AND it was a full transfer, retry and
+ * append to this one
+ */
+ if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 &&
+ !iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
- sr->done_io += *ret;
+ sr->done_io += this_ret;
sr->retry = true;
return false;
}
@@ -985,7 +983,7 @@ retry_multishot:
void __user *buf;
size_t len = sr->len;
- buf = io_buffer_select(req, &len, issue_flags);
+ buf = io_buffer_select(req, &len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
@@ -1063,6 +1061,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
.iovs = &kmsg->fast_iov,
.nr_iovs = 1,
.mode = KBUF_MODE_EXPAND,
+ .buf_group = sr->buf_group,
};
if (kmsg->vec.iovec) {
@@ -1095,7 +1094,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
void __user *buf;
*len = sr->len;
- buf = io_buffer_select(req, len, issue_flags);
+ buf = io_buffer_select(req, len, sr->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
sr->buf = buf;
@@ -1191,16 +1190,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc);
unsigned ifq_idx;
- if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr ||
- sqe->addr3))
+ if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3))
return -EINVAL;
ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
- if (ifq_idx != 0)
- return -EINVAL;
- zc->ifq = req->ctx->ifq;
+ zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
if (!zc->ifq)
return -EINVAL;
+
zc->len = READ_ONCE(sqe->len);
zc->flags = READ_ONCE(sqe->ioprio);
zc->msg_flags = READ_ONCE(sqe->msg_flags);
@@ -1321,8 +1318,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -ENOMEM;
if (req->opcode == IORING_OP_SEND_ZC) {
- if (zc->flags & IORING_RECVSEND_FIXED_BUF)
- req->flags |= REQ_F_IMPORT_BUFFER;
ret = io_send_setup(req, sqe);
} else {
if (unlikely(sqe->addr2 || sqe->file_index))
@@ -1470,7 +1465,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
@@ -1541,7 +1536,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_sendrecv_fail(struct io_kiocb *req)
@@ -1705,7 +1700,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags)
sock->file_slot);
}
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1772,7 +1767,7 @@ out:
req_set_fail(req);
io_req_msg_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -1846,4 +1841,3 @@ void io_netmsg_cache_free(const void *entry)
io_vec_free(&kmsg->vec);
kfree(kmsg);
}
-#endif
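For reference, a toy model of the bundle-recv retry condition adjusted in io_recv_finish() above: buffer accounting and the retry decision now use only the bytes received in this round (this_ret = *ret - sr->done_io), and a retry additionally requires the iterator to have been fully consumed. The sketch below is userspace C, not kernel code, and omits the one-retry latch (!sr->retry):

/*
 * Toy model of the retry decision in io_recv_finish(). 'total' mirrors
 * *ret (cumulative bytes across retries), 'done_io' what previous rounds
 * already consumed, 'iter_left' the bytes still unfilled in the selected
 * buffers and 'inq' the socket backlog (msg_inq).
 */
#include <stddef.h>

static int bundle_should_retry(size_t total, size_t done_io,
			       size_t iter_left, size_t inq)
{
	size_t this_ret = total - done_io;	/* bytes from this round only */

	/* retry only if this round filled its buffers and more is queued */
	return this_ret > 0 && iter_left == 0 && inq > 0;
}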
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 28f06285fdc2..6ac2de761fd3 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -68,5 +68,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, nop->result, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 7bd92538dccb..9a6f6e92d742 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL;
+ notif->ctx = ctx;
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 489384c0438b..6e0882b051f9 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -333,13 +333,13 @@ const struct io_issue_def io_issue_defs[] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_provide_buffers_prep,
- .issue = io_provide_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.prep = io_remove_buffers_prep,
- .issue = io_remove_buffers,
+ .issue = io_manage_buffers_legacy,
},
[IORING_OP_TEE] = {
.needs_file = 1,
@@ -569,6 +569,10 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_prep_writev_fixed,
.issue = io_write,
},
+ [IORING_OP_PIPE] = {
+ .prep = io_pipe_prep,
+ .issue = io_pipe,
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -815,6 +819,9 @@ const struct io_cold_def io_cold_defs[] = {
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
+ [IORING_OP_PIPE] = {
+ .name = "PIPE",
+ },
};
const char *io_uring_get_opcode(u8 opcode)
diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3357dfa14ca..83e36ad4e31b 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -6,6 +6,8 @@
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/namei.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/watch_queue.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
@@ -169,7 +171,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_openat(struct io_kiocb *req, unsigned int issue_flags)
@@ -257,7 +259,7 @@ err:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -300,5 +302,136 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
+}
+
+struct io_pipe {
+ struct file *file;
+ int __user *fds;
+ int flags;
+ int file_slot;
+ unsigned long nofile;
+};
+
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+
+ if (sqe->fd || sqe->off || sqe->addr3)
+ return -EINVAL;
+
+ p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ p->flags = READ_ONCE(sqe->pipe_flags);
+ if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
+ return -EINVAL;
+
+ p->file_slot = READ_ONCE(sqe->file_index);
+ p->nofile = rlimit(RLIMIT_NOFILE);
+ return 0;
+}
+
+static int io_pipe_fixed(struct io_kiocb *req, struct file **files,
+ unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, fds[2] = { -1, -1 };
+ int slot = p->file_slot;
+
+ if (p->flags & O_CLOEXEC)
+ return -EINVAL;
+
+ io_ring_submit_lock(ctx, issue_flags);
+
+ ret = __io_fixed_fd_install(ctx, files[0], slot);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+ files[0] = NULL;
+
+ /*
+ * If a specific slot is given, the next one will be used for
+ * the write side.
+ */
+ if (slot != IORING_FILE_INDEX_ALLOC)
+ slot++;
+
+ ret = __io_fixed_fd_install(ctx, files[1], slot);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+ files[1] = NULL;
+
+ io_ring_submit_unlock(ctx, issue_flags);
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds)))
+ return 0;
+
+ ret = -EFAULT;
+ io_ring_submit_lock(ctx, issue_flags);
+err:
+ if (fds[0] != -1)
+ io_fixed_fd_remove(ctx, fds[0]);
+ if (fds[1] != -1)
+ io_fixed_fd_remove(ctx, fds[1]);
+ io_ring_submit_unlock(ctx, issue_flags);
+ return ret;
+}
+
+static int io_pipe_fd(struct io_kiocb *req, struct file **files)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ int ret, fds[2] = { -1, -1 };
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[0] = ret;
+
+ ret = __get_unused_fd_flags(p->flags, p->nofile);
+ if (ret < 0)
+ goto err;
+ fds[1] = ret;
+
+ if (!copy_to_user(p->fds, fds, sizeof(fds))) {
+ fd_install(fds[0], files[0]);
+ fd_install(fds[1], files[1]);
+ return 0;
+ }
+ ret = -EFAULT;
+err:
+ if (fds[0] != -1)
+ put_unused_fd(fds[0]);
+ if (fds[1] != -1)
+ put_unused_fd(fds[1]);
+ return ret;
+}
+
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe);
+ struct file *files[2];
+ int ret;
+
+ ret = create_pipe_files(files, p->flags);
+ if (ret)
+ return ret;
+ files[0]->f_mode |= FMODE_NOWAIT;
+ files[1]->f_mode |= FMODE_NOWAIT;
+
+ if (!!p->file_slot)
+ ret = io_pipe_fixed(req, files, issue_flags);
+ else
+ ret = io_pipe_fd(req, files);
+
+ io_req_set_res(req, ret, 0);
+ if (!ret)
+ return IOU_COMPLETE;
+
+ req_set_fail(req);
+ if (files[0])
+ fput(files[0]);
+ if (files[1])
+ fput(files[1]);
+ return ret;
}
diff --git a/io_uring/openclose.h b/io_uring/openclose.h
index 8a93c98ad0ad..4ca2a9935abc 100644
--- a/io_uring/openclose.h
+++ b/io_uring/openclose.h
@@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags);
int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_close(struct io_kiocb *req, unsigned int issue_flags);
+int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_pipe(struct io_kiocb *req, unsigned int issue_flags);
+
int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags);
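For illustration, a minimal userspace sketch of driving the new IORING_OP_PIPE request added above. This is not part of the patch; it assumes liburing plus the matching uapi additions for the opcode and the sqe->pipe_flags field, which live outside this directory's diff:

/*
 * Sketch only: create a pipe via io_uring, roughly equivalent to pipe2().
 * On success the two fds are written back to 'fds' by the kernel.
 */
#include <liburing.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <string.h>

static int uring_pipe(struct io_uring *ring, int fds[2])
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_PIPE;		/* new opcode (assumed uapi name) */
	sqe->addr = (uintptr_t)fds;		/* int[2] the resulting fds land in */
	sqe->pipe_flags = O_CLOEXEC;		/* O_NONBLOCK/O_DIRECT/O_NOTIFICATION_PIPE also accepted */
	/* a nonzero sqe->file_index asks for fixed-file slots instead (O_CLOEXEC not allowed there) */

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				/* 0 on success, -errno on failure */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}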
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8eb744eb9f4c..0526062e2f81 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -893,7 +893,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags);
if (ret > 0) {
io_req_set_res(req, ipt.result_mask, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
return ret ?: IOU_ISSUE_SKIP_COMPLETE;
}
@@ -948,5 +948,5 @@ out:
}
/* complete update request, we're done with it */
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index f80a77c4973f..c592ceace97d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-int io_buffer_validate(struct iovec *iov)
+int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+ unsigned long tmp, base = (unsigned long)uaddr;
+ unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
+ /* arbitrary limit, but we need something */
+ if (ulen > SZ_1G || !ulen)
+ return -EFAULT;
+ if (check_add_overflow(base, acct_len, &tmp))
+ return -EOVERFLOW;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
+ return io_validate_user_buf_range((unsigned long)iov->iov_base,
+ iov->iov_len);
}
static void io_release_ubuf(void *priv)
@@ -497,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -685,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
- int nr_pages_left = *nr_pages, i, j;
- int nr_folios = data->nr_folios;
+ unsigned nr_pages_left = *nr_pages;
+ unsigned nr_folios = data->nr_folios;
+ unsigned i, j;
/* Store head pages only */
- new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
- GFP_KERNEL);
+ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
if (!new_array)
return false;
- new_array[0] = compound_head(page_array[0]);
- /*
- * The pages are bound to the folio, it doesn't
- * actually unpin them but drops all but one reference,
- * which is usually put down by io_buffer_unmap().
- * Note, needs a better helper.
- */
- if (data->nr_pages_head > 1)
- unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
-
- j = data->nr_pages_head;
- nr_pages_left -= data->nr_pages_head;
- for (i = 1; i < nr_folios; i++) {
- unsigned int nr_unpin;
-
- new_array[i] = page_array[j];
- nr_unpin = min_t(unsigned int, nr_pages_left - 1,
- data->nr_pages_mid - 1);
- if (nr_unpin)
- unpin_user_pages(&page_array[j+1], nr_unpin);
- j += data->nr_pages_mid;
- nr_pages_left -= data->nr_pages_mid;
+ for (i = 0, j = 0; i < nr_folios; i++) {
+ struct page *p = compound_head(page_array[j]);
+ struct folio *folio = page_folio(p);
+ unsigned int nr;
+
+ WARN_ON_ONCE(i > 0 && p != page_array[j]);
+
+ nr = i ? data->nr_pages_mid : data->nr_pages_head;
+ nr = min(nr, nr_pages_left);
+ /* Drop all but one ref, the entire folio will remain pinned. */
+ if (nr > 1)
+ unpin_user_folio(folio, nr - 1);
+ j += nr;
+ nr_pages_left -= nr;
+ new_array[i] = p;
}
+
+ WARN_ON_ONCE(j != *nr_pages);
+
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
@@ -1062,8 +1061,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
size_t offset;
int ret;
- if (WARN_ON_ONCE(!imu))
- return -EFAULT;
ret = validate_fixed_range(buf_addr, len, imu);
if (unlikely(ret))
return ret;
@@ -1110,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
if (req->flags & REQ_F_BUF_NODE)
return req->buf_node;
+ req->flags |= REQ_F_BUF_NODE;
io_ring_submit_lock(ctx, issue_flags);
node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
- if (node)
- io_req_assign_buf_node(req, node);
+ if (node) {
+ node->refs++;
+ req->buf_node = node;
+ io_ring_submit_unlock(ctx, issue_flags);
+ return node;
+ }
+ req->flags &= ~REQ_F_BUF_NODE;
io_ring_submit_unlock(ctx, issue_flags);
- return node;
+ return NULL;
}
int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
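The open-coded iovec checks removed above are now funneled through io_validate_user_buf_range(), so other callers (zcrx below) can reuse the same bounds checking. A standalone illustration of the equivalent check in userspace C, with __builtin_add_overflow standing in for the kernel's check_add_overflow() and local stand-ins for PAGE_SIZE/SZ_1G:

/* Illustrative only, not kernel code. */
#include <stdint.h>

#define PAGE_SIZE	4096UL
#define SZ_1G		(1UL << 30)

static int validate_user_buf_range(uint64_t uaddr, uint64_t ulen)
{
	unsigned long base = (unsigned long)uaddr;
	unsigned long acct_len = (ulen + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
	unsigned long end;

	if (ulen > SZ_1G || !ulen)		/* arbitrary 1G cap, reject empty */
		return -1;			/* -EFAULT in the kernel */
	if (__builtin_add_overflow(base, acct_len, &end))
		return -2;			/* -EOVERFLOW in the kernel */
	return 0;
}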
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index b52242852ff3..0d2138f16322 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -83,7 +83,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
-int io_buffer_validate(struct iovec *iov);
+int io_validate_user_buf_range(u64 uaddr, u64 ulen);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
@@ -115,32 +115,6 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
return true;
}
-static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
-{
- if (req->file_node) {
- io_put_rsrc_node(req->ctx, req->file_node);
- req->file_node = NULL;
- }
- if (req->flags & REQ_F_BUF_NODE) {
- io_put_rsrc_node(req->ctx, req->buf_node);
- req->buf_node = NULL;
- }
-}
-
-static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
- struct io_rsrc_node *node)
-{
- node->refs++;
- *dst_node = node;
-}
-
-static inline void io_req_assign_buf_node(struct io_kiocb *req,
- struct io_rsrc_node *node)
-{
- io_req_assign_rsrc_node(&req->buf_node, node);
- req->flags |= REQ_F_BUF_NODE;
-}
-
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 039e063f7091..710d8cd53ebb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req,
return io_import_vec(ddir, req, io, buf, sqe_len);
if (io_do_buffer_select(req)) {
- buf = io_buffer_select(req, &sqe_len, issue_flags);
+ buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags);
if (!buf)
return -ENOBUFS;
rw->addr = (unsigned long) buf;
@@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ struct io_async_rw *io;
unsigned ioprio;
u64 attr_type_mask;
int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
+ io = req->async_data;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
/* used for fixed read/write too - just read unconditionally */
req->buf_index = READ_ONCE(sqe->buf_index);
+ io->buf_group = req->buf_index;
ioprio = READ_ONCE(sqe->ioprio);
if (ioprio) {
@@ -276,6 +279,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
rw->kiocb.dio_complete = NULL;
rw->kiocb.ki_flags = 0;
+ rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream);
if (req->ctx->flags & IORING_SETUP_IOPOLL)
rw->kiocb.ki_complete = io_complete_rw_iopoll;
@@ -657,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
io_req_io_end(req);
io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
- return IOU_OK;
+ return IOU_COMPLETE;
} else {
io_rw_done(req, ret);
}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 81d6d9a8cf69..129a53fe5482 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -16,6 +16,8 @@ struct io_async_rw {
struct iov_iter iter;
struct iov_iter_state iter_state;
struct iovec fast_iov;
+ unsigned buf_group;
+
/*
* wpq is for buffered io, while meta fields are used with
* direct io
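The new buf_group field records the buffer group id in the async data at prep time, so later imports no longer re-read req->buf_index. From userspace the selection is unchanged; for context, a hedged liburing sketch of a provided-buffer read picking group 'bgid' (buffer registration, e.g. via io_uring_register_buf_ring(), is assumed to have happened already):

/* Sketch only: issue a read that picks a buffer from group 'bgid'. */
#include <liburing.h>

static void prep_buf_select_read(struct io_uring *ring, int fd,
				 unsigned int len, unsigned short bgid)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_read(sqe, fd, NULL, len, 0);	/* no addr: kernel picks a buffer */
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = bgid;				/* group id, carried in req->buf_index */
}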
diff --git a/io_uring/splice.c b/io_uring/splice.c
index 7b89bd84d486..35ce4e60b495 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -103,7 +103,7 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -144,5 +144,5 @@ done:
if (ret != sp->len)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/statx.c b/io_uring/statx.c
index 6bc4651700a2..5111e9befbfe 100644
--- a/io_uring/statx.c
+++ b/io_uring/statx.c
@@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
void io_statx_cleanup(struct io_kiocb *req)
diff --git a/io_uring/sync.c b/io_uring/sync.c
index 255f68c37e55..cea2d381ffd2 100644
--- a/io_uring/sync.c
+++ b/io_uring/sync.c
@@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
if (ret >= 0)
fsnotify_modify(req->file);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index adc6e42c14df..5b66755579c0 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
data.hash = hash;
data.task = task;
- data.free_work = io_wq_free_work;
- data.do_work = io_wq_submit_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 2a107665230b..7f13bfa9f2b6 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -35,6 +35,9 @@ struct io_timeout_rem {
bool ltimeout;
};
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link);
+
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
@@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx;
raw_spin_lock_irq(&ctx->timeout_lock);
- link = io_disarm_linked_timeout(req);
+ if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT)
+ link = __io_disarm_linked_timeout(req, req->link);
+
raw_spin_unlock_irq(&ctx->timeout_lock);
if (link)
io_req_queue_tw_complete(link, -ECANCELED);
@@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req)
io_fail_links(req);
}
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link)
+static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
+ struct io_kiocb *link)
__must_hold(&req->ctx->completion_lock)
__must_hold(&req->ctx->timeout_lock)
{
@@ -500,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_timeout_prep(struct io_kiocb *req,
diff --git a/io_uring/timeout.h b/io_uring/timeout.h
index e91b32448dcf..2b7c9ad72992 100644
--- a/io_uring/timeout.h
+++ b/io_uring/timeout.h
@@ -8,19 +8,6 @@ struct io_timeout_data {
u32 flags;
};
-struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req,
- struct io_kiocb *link);
-
-static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req)
-{
- struct io_kiocb *link = req->link;
-
- if (link && link->opcode == IORING_OP_LINK_TIMEOUT)
- return __io_disarm_linked_timeout(req, link);
-
- return NULL;
-}
-
__cold void io_flush_timeouts(struct io_ring_ctx *ctx);
struct io_cancel_data;
int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd);
diff --git a/io_uring/truncate.c b/io_uring/truncate.c
index 62ee73d34d72..487baf23b44e 100644
--- a/io_uring/truncate.c
+++ b/io_uring/truncate.c
@@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags)
ret = do_ftruncate(req->file, ft->len, 1);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index a9ea7d29cdd9..929cad6ee326 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,13 +3,10 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/io_uring/cmd.h>
-#include <linux/io_uring/net.h>
#include <linux/security.h>
#include <linux/nospec.h>
-#include <net/sock.h>
#include <uapi/linux/io_uring.h>
-#include <asm/ioctls.h>
#include "io_uring.h"
#include "alloc_cache.h"
@@ -254,6 +251,11 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
return -EOPNOTSUPP;
issue_flags |= IO_URING_F_IOPOLL;
req->iopoll_completed = 0;
+ if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+ /* make sure every req only blocks once */
+ req->flags &= ~REQ_F_IOPOLL_STATE;
+ req->iopoll_start = ktime_get_ns();
+ }
}
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
@@ -263,7 +265,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
req_set_fail(req);
io_req_uring_cleanup(req, issue_flags);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -273,6 +275,9 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
+ if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
+ return -EINVAL;
+
return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
@@ -287,6 +292,9 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
struct io_async_cmd *ac = req->async_data;
int ret;
+ if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED)))
+ return -EINVAL;
+
ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs);
if (ret)
return ret;
@@ -302,83 +310,3 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
io_req_queue_iowq(req);
}
-
-static inline int io_uring_cmd_getsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optlen, optname, level, err;
- void __user *optval;
-
- level = READ_ONCE(sqe->level);
- if (level != SOL_SOCKET)
- return -EOPNOTSUPP;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
-
- err = do_sock_getsockopt(sock, compat, level, optname,
- USER_SOCKPTR(optval),
- KERNEL_SOCKPTR(&optlen));
- if (err)
- return err;
-
- /* On success, return optlen */
- return optlen;
-}
-
-static inline int io_uring_cmd_setsockopt(struct socket *sock,
- struct io_uring_cmd *cmd,
- unsigned int issue_flags)
-{
- const struct io_uring_sqe *sqe = cmd->sqe;
- bool compat = !!(issue_flags & IO_URING_F_COMPAT);
- int optname, optlen, level;
- void __user *optval;
- sockptr_t optval_s;
-
- optval = u64_to_user_ptr(READ_ONCE(sqe->optval));
- optname = READ_ONCE(sqe->optname);
- optlen = READ_ONCE(sqe->optlen);
- level = READ_ONCE(sqe->level);
- optval_s = USER_SOCKPTR(optval);
-
- return do_sock_setsockopt(sock, compat, level, optname, optval_s,
- optlen);
-}
-
-#if defined(CONFIG_NET)
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
-{
- struct socket *sock = cmd->file->private_data;
- struct sock *sk = sock->sk;
- struct proto *prot = READ_ONCE(sk->sk_prot);
- int ret, arg = 0;
-
- if (!prot || !prot->ioctl)
- return -EOPNOTSUPP;
-
- switch (cmd->cmd_op) {
- case SOCKET_URING_OP_SIOCINQ:
- ret = prot->ioctl(sk, SIOCINQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_SIOCOUTQ:
- ret = prot->ioctl(sk, SIOCOUTQ, &arg);
- if (ret)
- return ret;
- return arg;
- case SOCKET_URING_OP_GETSOCKOPT:
- return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
- case SOCKET_URING_OP_SETSOCKOPT:
- return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
- default:
- return -EOPNOTSUPP;
- }
-}
-EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
-#endif
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index b04686b6b5d2..e6a5142c890e 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -17,9 +17,3 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
struct io_uring_task *tctx, bool cancel_all);
void io_cmd_cache_free(const void *entry);
-
-int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
- const struct iovec __user *uvec,
- size_t uvec_segs,
- int ddir, struct iov_iter *iter,
- unsigned issue_flags);
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 54e69984cd8a..e07a94694397 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -323,5 +323,5 @@ done:
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index de5064fcae8a..322b94ff9e4b 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_getxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
static int __io_setxattr_prep(struct io_kiocb *req,
@@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = file_setxattr(req->file, &ix->ctx);
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
@@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx);
ix->filename = NULL;
io_xattr_finish(req, ret);
- return IOU_OK;
+ return IOU_COMPLETE;
}
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index fe86606b9f30..1513431587a7 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,29 +26,207 @@
#include "zcrx.h"
#include "rsrc.h"
+#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
return pp->mp_priv;
}
-#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
- struct io_zcrx_area *area, int nr_mapped)
+ return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return area->mem.pages[net_iov_idx(niov)];
+}
+
+static void io_release_dmabuf(struct io_zcrx_mem *mem)
+{
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return;
+
+ if (mem->sgt)
+ dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
+ DMA_FROM_DEVICE);
+ if (mem->attach)
+ dma_buf_detach(mem->dmabuf, mem->attach);
+ if (mem->dmabuf)
+ dma_buf_put(mem->dmabuf);
+
+ mem->sgt = NULL;
+ mem->attach = NULL;
+ mem->dmabuf = NULL;
+}
+
+static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ unsigned long off = (unsigned long)area_reg->addr;
+ unsigned long len = (unsigned long)area_reg->len;
+ unsigned long total_size = 0;
+ struct scatterlist *sg;
+ int dmabuf_fd = area_reg->dmabuf_fd;
+ int i, ret;
+
+ if (WARN_ON_ONCE(!ifq->dev))
+ return -EFAULT;
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ mem->is_dmabuf = true;
+ mem->dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(mem->dmabuf)) {
+ ret = PTR_ERR(mem->dmabuf);
+ mem->dmabuf = NULL;
+ goto err;
+ }
+
+ mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
+ if (IS_ERR(mem->attach)) {
+ ret = PTR_ERR(mem->attach);
+ mem->attach = NULL;
+ goto err;
+ }
+
+ mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
+ if (IS_ERR(mem->sgt)) {
+ ret = PTR_ERR(mem->sgt);
+ mem->sgt = NULL;
+ goto err;
+ }
+
+ for_each_sgtable_dma_sg(mem->sgt, sg, i)
+ total_size += sg_dma_len(sg);
+
+ if (total_size < off + len)
+ return -EINVAL;
+
+ mem->dmabuf_offset = off;
+ mem->size = len;
+ return 0;
+err:
+ io_release_dmabuf(mem);
+ return ret;
+}
+
+static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned long off = area->mem.dmabuf_offset;
+ struct scatterlist *sg;
+ unsigned i, niov_idx = 0;
+
+ if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+ return -EINVAL;
+
+ for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
+ dma_addr_t dma = sg_dma_address(sg);
+ unsigned long sg_len = sg_dma_len(sg);
+ unsigned long sg_off = min(sg_len, off);
+
+ off -= sg_off;
+ sg_len -= sg_off;
+ dma += sg_off;
+
+ while (sg_len && niov_idx < area->nia.num_niovs) {
+ struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+ if (net_mp_niov_set_dma_addr(niov, dma))
+ return 0;
+ sg_len -= PAGE_SIZE;
+ dma += PAGE_SIZE;
+ niov_idx++;
+ }
+ }
+ return niov_idx;
+}
+
+static int io_import_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ struct page **pages;
+ int nr_pages;
+
+ if (area_reg->dmabuf_fd)
+ return -EINVAL;
+ if (!area_reg->addr)
+ return -EFAULT;
+ pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+ &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ mem->pages = pages;
+ mem->nr_folios = nr_pages;
+ mem->size = area_reg->len;
+ return 0;
+}
+
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+ if (mem->is_dmabuf) {
+ io_release_dmabuf(mem);
+ return;
+ }
+ if (mem->pages) {
+ unpin_user_pages(mem->pages, mem->nr_folios);
+ kvfree(mem->pages);
+ }
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ int ret;
+
+ ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+ if (ret)
+ return ret;
+ if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
+ return io_import_dmabuf(ifq, mem, area_reg);
+ return io_import_umem(ifq, mem, area_reg);
+}
+
+static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
{
int i;
for (i = 0; i < nr_mapped; i++) {
- struct net_iov *niov = &area->nia.niovs[i];
- dma_addr_t dma;
+ netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
+ dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);
- dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE,
DMA_FROM_DEVICE, IO_DMA_ATTR);
- net_mp_niov_set_dma_addr(niov, 0);
}
}
+static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_area *area, int nr_mapped)
+{
+ int i;
+
+ if (area->mem.is_dmabuf)
+ io_release_dmabuf(&area->mem);
+ else
+ io_zcrx_unmap_umem(ifq, area, nr_mapped);
+
+ for (i = 0; i < area->nia.num_niovs; i++)
+ net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+}
+
static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
guard(mutex)(&ifq->dma_lock);
@@ -58,20 +236,16 @@ static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *are
area->is_mapped = false;
}
-static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
{
int i;
- guard(mutex)(&ifq->dma_lock);
- if (area->is_mapped)
- return 0;
-
for (i = 0; i < area->nia.num_niovs; i++) {
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
- dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
+ dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
if (dma_mapping_error(ifq->dev, dma))
break;
if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -80,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
break;
}
}
+ return i;
+}
+
+static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+ unsigned nr;
+
+ guard(mutex)(&ifq->dma_lock);
+ if (area->is_mapped)
+ return 0;
+
+ if (area->mem.is_dmabuf)
+ nr = io_zcrx_map_area_dmabuf(ifq, area);
+ else
+ nr = io_zcrx_map_area_umem(ifq, area);
- if (i != area->nia.num_niovs) {
- __io_zcrx_unmap_area(ifq, area, i);
+ if (nr != area->nia.num_niovs) {
+ __io_zcrx_unmap_area(ifq, area, nr);
return -EINVAL;
}
@@ -118,13 +307,6 @@ struct io_zcrx_args {
static const struct memory_provider_ops io_uring_pp_zc_ops;
-static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
-{
- struct net_iov_area *owner = net_iov_owner(niov);
-
- return container_of(owner, struct io_zcrx_area, nia);
-}
-
static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -147,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- return area->pages[net_iov_idx(niov)];
-}
-
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
- struct io_uring_region_desc *rd)
+ struct io_uring_region_desc *rd,
+ u32 id)
{
+ u64 mmap_offset;
size_t off, size;
void *ptr;
int ret;
@@ -167,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
if (size > rd->size)
return -EINVAL;
- ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
- IORING_MAP_OFF_ZCRX_REGION);
+ mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
+ mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+
+ ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
if (ret < 0)
return ret;
- ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+ ptr = io_region_get_ptr(&ifq->region);
ifq->rq_ring = (struct io_uring *)ptr;
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
return 0;
@@ -180,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
{
- io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+ io_free_region(ifq->ctx, &ifq->region);
ifq->rq_ring = NULL;
ifq->rqes = NULL;
}
@@ -188,53 +367,44 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
io_zcrx_unmap_area(area->ifq, area);
+ io_release_area_mem(&area->mem);
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
- if (area->pages) {
- unpin_user_pages(area->pages, area->nr_folios);
- kvfree(area->pages);
- }
kfree(area);
}
+#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF)
+
static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area **res,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
- int i, ret, nr_pages, nr_iovs;
- struct iovec iov;
+ unsigned nr_iovs;
+ int i, ret;
- if (area_reg->flags || area_reg->rq_area_token)
+ if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
return -EINVAL;
- if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+ if (area_reg->rq_area_token)
return -EINVAL;
- if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ if (area_reg->__resv2[0] || area_reg->__resv2[1])
return -EINVAL;
- iov.iov_base = u64_to_user_ptr(area_reg->addr);
- iov.iov_len = area_reg->len;
- ret = io_buffer_validate(&iov);
- if (ret)
- return ret;
-
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
- area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
- &nr_pages);
- if (IS_ERR(area->pages)) {
- ret = PTR_ERR(area->pages);
- area->pages = NULL;
+ ret = io_import_area(ifq, &area->mem, area_reg);
+ if (ret)
goto err;
- }
- area->nr_folios = nr_iovs = nr_pages;
+
+ nr_iovs = area->mem.size >> PAGE_SHIFT;
area->nia.num_niovs = nr_iovs;
+ ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
@@ -245,9 +415,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
if (!area->freelist)
goto err;
- for (i = 0; i < nr_iovs; i++)
- area->freelist[i] = i;
-
area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->user_refs)
@@ -259,6 +426,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
niov->owner = &area->nia;
area->freelist[i] = i;
atomic_set(&area->user_refs[i], 0);
+ niov->type = NET_IOV_IOURING;
}
area->free_count = nr_iovs;
@@ -341,6 +509,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
kfree(ifq);
}
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
+
+ lockdep_assert_held(&ctx->mmap_lock);
+
+ return ifq ? &ifq->region : NULL;
+}
+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
{
@@ -350,6 +528,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_region_desc rd;
struct io_zcrx_ifq *ifq;
int ret;
+ u32 id;
/*
* 1. Interface queue allocation.
@@ -362,8 +541,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
ctx->flags & IORING_SETUP_CQE32))
return -EINVAL;
- if (ctx->ifq)
- return -EBUSY;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
@@ -386,29 +563,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ifq = io_zcrx_ifq_alloc(ctx);
if (!ifq)
return -ENOMEM;
+ ifq->rq_entries = reg.rq_entries;
- ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
- if (ret)
- goto err;
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* preallocate id */
+ ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+ if (ret)
+ goto ifq_free;
+ }
- ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
if (ret)
goto err;
- ifq->rq_entries = reg.rq_entries;
-
- ret = -ENODEV;
ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
&ifq->netdev_tracker, GFP_KERNEL);
- if (!ifq->netdev)
+ if (!ifq->netdev) {
+ ret = -ENODEV;
goto err;
+ }
ifq->dev = ifq->netdev->dev.parent;
- ret = -EOPNOTSUPP;
- if (!ifq->dev)
+ if (!ifq->dev) {
+ ret = -EOPNOTSUPP;
goto err;
+ }
get_device(ifq->dev);
+ ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+ if (ret)
+ goto err;
+
mp_param.mp_ops = &io_uring_pp_zc_ops;
mp_param.mp_priv = ifq;
ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
@@ -419,6 +604,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
reg.offsets.rqes = sizeof(struct io_uring);
reg.offsets.head = offsetof(struct io_uring, head);
reg.offsets.tail = offsetof(struct io_uring, tail);
+ reg.zcrx_id = id;
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ /* publish ifq */
+ ret = -ENOMEM;
+ if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+ goto err;
+ }
if (copy_to_user(arg, &reg, sizeof(reg)) ||
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -426,24 +619,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
ret = -EFAULT;
goto err;
}
- ctx->ifq = ifq;
return 0;
err:
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ifq_free:
io_zcrx_ifq_free(ifq);
return ret;
}
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
{
- struct io_zcrx_ifq *ifq = ctx->ifq;
+ struct io_zcrx_ifq *ifq;
+ unsigned long id;
lockdep_assert_held(&ctx->uring_lock);
- if (!ifq)
- return;
+ while (1) {
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+ if (ifq)
+ xa_erase(&ctx->zcrx_ctxs, id);
+ }
+ if (!ifq)
+ break;
+ io_zcrx_ifq_free(ifq);
+ }
- ctx->ifq = NULL;
- io_zcrx_ifq_free(ifq);
+ xa_destroy(&ctx->zcrx_ctxs);
}
static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -500,12 +703,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
{
+ struct io_zcrx_ifq *ifq;
+ unsigned long index;
+
lockdep_assert_held(&ctx->uring_lock);
- if (!ctx->ifq)
- return;
- io_zcrx_scrub(ctx->ifq);
- io_close_queue(ctx->ifq);
+ xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
+ io_zcrx_scrub(ifq);
+ io_close_queue(ifq);
+ }
}
static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
@@ -742,6 +948,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
size_t copied = 0;
int ret = 0;
+ if (area->mem.is_dmabuf)
+ return -EFAULT;
+
while (len) {
size_t copy_size = min_t(size_t, PAGE_SIZE, len);
const int dst_off = 0;
@@ -809,7 +1018,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
return io_zcrx_copy_frag(req, ifq, frag, off, len);
niov = netmem_to_net_iov(frag->netmem);
- if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+ if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
io_pp_to_ifq(niov->pp) != ifq)
return -EFAULT;
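With ifqs now stored in ctx->zcrx_ctxs, registration hands back the allocated id in reg.zcrx_id, and that id is what RECV_ZC requests pass in sqe->zcrx_ifq_idx (see the net.c hunk above). Below is a rough userspace sketch of the registration call; field names follow the uapi structs used in this file, except area_ptr, which is an assumption, and all sizing and error handling is elided:

/* Sketch of registering one zcrx interface queue and reading back its id. */
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

static int register_zcrx(int ring_fd, unsigned ifindex, unsigned rxq,
			 void *area_base, size_t area_len,
			 struct io_uring_region_desc *rd, unsigned *id_out)
{
	struct io_uring_zcrx_area_reg area = {
		.addr = (uintptr_t)area_base,
		.len  = area_len,
		/* .flags = IORING_ZCRX_AREA_DMABUF plus .dmabuf_fd for dmabuf-backed areas */
	};
	struct io_uring_zcrx_ifq_reg reg = {
		.if_idx = ifindex,
		.if_rxq = rxq,
		.rq_entries = 4096,
		.area_ptr = (uintptr_t)&area,	/* assumed field name */
		.region_ptr = (uintptr_t)rd,
	};
	int ret;

	ret = syscall(__NR_io_uring_register, ring_fd,
		      IORING_REGISTER_ZCRX_IFQ, &reg, 1);
	if (ret)
		return ret;
	*id_out = reg.zcrx_id;	/* pass this as sqe->zcrx_ifq_idx on RECV_ZC */
	return 0;
}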
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index f2bc811f022c..2f5e26389f22 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -3,10 +3,24 @@
#define IOU_ZC_RX_H
#include <linux/io_uring_types.h>
+#include <linux/dma-buf.h>
#include <linux/socket.h>
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
+struct io_zcrx_mem {
+ unsigned long size;
+ bool is_dmabuf;
+
+ struct page **pages;
+ unsigned long nr_folios;
+
+ struct dma_buf_attachment *attach;
+ struct dma_buf *dmabuf;
+ struct sg_table *sgt;
+ unsigned long dmabuf_offset;
+};
+
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
@@ -14,13 +28,13 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
- struct page **pages;
- unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
u32 free_count;
u32 *freelist;
+
+ struct io_zcrx_mem mem;
};
struct io_zcrx_ifq {
@@ -39,6 +53,7 @@ struct io_zcrx_ifq {
netdevice_tracker netdev_tracker;
spinlock_t lock;
struct mutex dma_lock;
+ struct io_mapped_region region;
};
#if defined(CONFIG_IO_URING_ZCRX)
@@ -49,6 +64,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
struct socket *sock, unsigned int flags,
unsigned issue_flags, unsigned int *len);
+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id);
#else
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
struct io_uring_zcrx_ifq_reg __user *arg)
@@ -67,6 +84,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
{
return -EOPNOTSUPP;
}
+static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+ unsigned int id)
+{
+ return NULL;
+}
#endif
int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);