summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2025-03-07 09:07:19 -0700
committerJens Axboe <axboe@kernel.dk>2025-03-07 09:07:19 -0700
commit6e3da40ed6f3508dcf34b1539496bcccef7015ef (patch)
tree0eeb5608dc15f91a09bf0d88499572a05294eb4b
parent78b6f6e9bf3960c5ee3368415a11babb754b9a19 (diff)
parent19f7e942732766aec8a51d217ab5fb4a7fe3bb0d (diff)
Merge branch 'for-6.15/io_uring-epoll-wait' into for-6.15/io_uring-reg-vec
* for-6.15/io_uring-epoll-wait: io_uring/epoll: add support for IORING_OP_EPOLL_WAIT io_uring/epoll: remove CONFIG_EPOLL guards eventpoll: add epoll_sendevents() helper eventpoll: abstract out ep_try_send_events() helper eventpoll: abstract out parameter sanity checking
-rw-r--r--fs/eventpoll.c87
-rw-r--r--include/linux/eventpoll.h4
-rw-r--r--include/uapi/linux/io_uring.h1
-rw-r--r--io_uring/Makefile9
-rw-r--r--io_uring/epoll.c35
-rw-r--r--io_uring/epoll.h2
-rw-r--r--io_uring/opdef.c14
7 files changed, 122 insertions, 30 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7c0980db77b3..94b87aaad0f6 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
+static int ep_try_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
+{
+ int res;
+
+ /*
+ * Try to transfer events to user space. In case we get 0 events and
+ * there's still timeout left over, we go trying again in search of
+ * more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+ return res;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2031,17 +2047,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
while (1) {
if (eavail) {
- /*
- * Try to transfer events to user space. In case we get
- * 0 events and there's still timeout left over, we go
- * trying again in search of more luck.
- */
- res = ep_send_events(ep, events, maxevents);
- if (res) {
- if (res > 0)
- ep_suspend_napi_irqs(ep);
+ res = ep_try_send_events(ep, events, maxevents);
+ if (res)
return res;
- }
}
if (timed_out)
@@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+ int maxevents)
+{
+ /* The maximum number of event must be greater than zero */
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+ return -EINVAL;
+
+ /* Verify that the area passed by the user is writeable */
+ if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ /*
+ * We have to check that the file structure underneath the fd
+ * the user passed to us _is_ an eventpoll file.
+ */
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents)
+{
+ struct eventpoll *ep;
+ int ret;
+
+ ret = ep_check_params(file, events, maxevents);
+ if (unlikely(ret))
+ return ret;
+
+ ep = file->private_data;
+ /*
+ * Racy call, but that's ok - it should get retried based on
+ * poll readiness anyway.
+ */
+ if (ep_events_available(ep))
+ return ep_try_send_events(ep, events, maxevents);
+ return 0;
+}
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
@@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
struct eventpoll *ep;
-
- /* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
- return -EINVAL;
-
- /* Verify that the area passed by the user is writeable */
- if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
- return -EFAULT;
+ int ret;
/* Get the "struct file *" for the eventpoll file */
CLASS(fd, f)(epfd);
if (fd_empty(f))
return -EBADF;
- /*
- * We have to check that the file structure underneath the fd
- * the user passed to us _is_ an eventpoll file.
- */
- if (!is_file_epoll(fd_file(f)))
- return -EINVAL;
+ ret = ep_check_params(fd_file(f), events, maxevents);
+ if (unlikely(ret))
+ return ret;
/*
* At this point it is safe to assume that the "private_data" contains
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 0c0d00fcd131..ccb478eb174b 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
+/* Copy ready events to userspace */
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents);
+
/*
* This is called from inside fs/file_table.c:__fput() to unlink files
* from the eventpoll interface. We need to have this facility to cleanup
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1e02e94bc26d..3d99bf9bbf61 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -280,6 +280,7 @@ enum io_uring_op {
IORING_OP_BIND,
IORING_OP_LISTEN,
IORING_OP_RECV_ZC,
+ IORING_OP_EPOLL_WAIT,
/* this goes last, obviously */
IORING_OP_LAST,
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 98e48339d84d..3e28a741ca15 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
- epoll.o statx.o timeout.o fdinfo.o \
- cancel.o waitid.o register.o \
- truncate.o memmap.o alloc_cache.o
+ statx.o timeout.o fdinfo.o cancel.o \
+ waitid.o register.o truncate.o \
+ memmap.o alloc_cache.o
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
-obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
+obj-$(CONFIG_EPOLL) += epoll.o
+obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
diff --git a/io_uring/epoll.c b/io_uring/epoll.c
index 89bff2068a19..6d2c48ba1923 100644
--- a/io_uring/epoll.c
+++ b/io_uring/epoll.c
@@ -12,7 +12,6 @@
#include "io_uring.h"
#include "epoll.h"
-#if defined(CONFIG_EPOLL)
struct io_epoll {
struct file *file;
int epfd;
@@ -21,6 +20,12 @@ struct io_epoll {
struct epoll_event event;
};
+struct io_epoll_wait {
+ struct file *file;
+ int maxevents;
+ struct epoll_event __user *events;
+};
+
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll);
@@ -58,4 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
-#endif
+
+int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+ struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait);
+
+ if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
+ return -EINVAL;
+
+ iew->maxevents = READ_ONCE(sqe->len);
+ iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ return 0;
+}
+
+int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait);
+ int ret;
+
+ ret = epoll_sendevents(req->file, iew->events, iew->maxevents);
+ if (ret == 0)
+ return -EAGAIN;
+ if (ret < 0)
+ req_set_fail(req);
+
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
diff --git a/io_uring/epoll.h b/io_uring/epoll.h
index 870cce11ba98..4111997c360b 100644
--- a/io_uring/epoll.h
+++ b/io_uring/epoll.h
@@ -3,4 +3,6 @@
#if defined(CONFIG_EPOLL)
int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags);
+int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags);
#endif
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 9511262c513e..db77df513d55 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -529,6 +529,17 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_eopnotsupp_prep,
#endif
},
+ [IORING_OP_EPOLL_WAIT] = {
+ .needs_file = 1,
+ .audit_skip = 1,
+ .pollin = 1,
+#if defined(CONFIG_EPOLL)
+ .prep = io_epoll_wait_prep,
+ .issue = io_epoll_wait,
+#else
+ .prep = io_eopnotsupp_prep,
+#endif
+ },
};
const struct io_cold_def io_cold_defs[] = {
@@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_RECV_ZC] = {
.name = "RECV_ZC",
},
+ [IORING_OP_EPOLL_WAIT] = {
+ .name = "EPOLL_WAIT",
+ },
};
const char *io_uring_get_opcode(u8 opcode)