diff options
-rw-r--r-- | fs/eventpoll.c | 87 | ||||
-rw-r--r-- | include/linux/eventpoll.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/io_uring.h | 1 | ||||
-rw-r--r-- | io_uring/Makefile | 9 | ||||
-rw-r--r-- | io_uring/epoll.c | 35 | ||||
-rw-r--r-- | io_uring/epoll.h | 2 | ||||
-rw-r--r-- | io_uring/opdef.c | 14 |
7 files changed, 122 insertions, 30 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..94b87aaad0f6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1980,6 +1980,22 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, return ret; } +static int ep_try_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + int res; + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. + */ + res = ep_send_events(ep, events, maxevents); + if (res > 0) + ep_suspend_napi_irqs(ep); + return res; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2031,17 +2047,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, while (1) { if (eavail) { - /* - * Try to transfer events to user space. In case we get - * 0 events and there's still timeout left over, we go - * trying again in search of more luck. - */ - res = ep_send_events(ep, events, maxevents); - if (res) { - if (res > 0) - ep_suspend_napi_irqs(ep); + res = ep_try_send_events(ep, events, maxevents); + if (res) return res; - } } if (timed_out) @@ -2445,6 +2453,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return do_epoll_ctl(epfd, op, fd, &epds, false); } +static int ep_check_params(struct file *file, struct epoll_event __user *evs, + int maxevents) +{ + /* The maximum number of event must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + if (!is_file_epoll(file)) + return -EINVAL; + + return 0; +} + +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents) +{ + struct eventpoll *ep; + int ret; + + ret = ep_check_params(file, events, maxevents); + if (unlikely(ret)) + return ret; + + ep = file->private_data; + /* + * Racy call, but that's ok - it should get retried based on + * poll readiness anyway. + */ + if (ep_events_available(ep)) + return ep_try_send_events(ep, events, maxevents); + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). @@ -2453,26 +2502,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; - - /* The maximum number of event must be greater than zero */ - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ - if (!access_ok(events, maxevents * sizeof(struct epoll_event))) - return -EFAULT; + int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; - /* - * We have to check that the file structure underneath the fd - * the user passed to us _is_ an eventpoll file. - */ - if (!is_file_epoll(fd_file(f))) - return -EINVAL; + ret = ep_check_params(fd_file(f), events, maxevents); + if (unlikely(ret)) + return ret; /* * At this point it is safe to assume that the "private_data" contains diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 0c0d00fcd131..ccb478eb174b 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); +/* Copy ready events to userspace */ +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents); + /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 1e02e94bc26d..3d99bf9bbf61 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -280,6 +280,7 @@ enum io_uring_op { IORING_OP_BIND, IORING_OP_LISTEN, IORING_OP_RECV_ZC, + IORING_OP_EPOLL_WAIT, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/Makefile b/io_uring/Makefile index 98e48339d84d..3e28a741ca15 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -11,10 +11,11 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - epoll.o statx.o timeout.o fdinfo.o \ - cancel.o waitid.o register.o \ - truncate.o memmap.o alloc_cache.o + statx.o timeout.o fdinfo.o cancel.o \ + waitid.o register.o truncate.o \ + memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o -obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_EPOLL) += epoll.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 89bff2068a19..6d2c48ba1923 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -12,7 +12,6 @@ #include "io_uring.h" #include "epoll.h" -#if defined(CONFIG_EPOLL) struct io_epoll { struct file *file; int epfd; @@ -21,6 +20,12 @@ struct io_epoll { struct epoll_event event; }; +struct io_epoll_wait { + struct file *file; + int maxevents; + struct epoll_event __user *events; +}; + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll); @@ -58,4 +63,30 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } -#endif + +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + iew->maxevents = READ_ONCE(sqe->len); + iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr)); + return 0; +} + +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + int ret; + + ret = epoll_sendevents(req->file, iew->events, iew->maxevents); + if (ret == 0) + return -EAGAIN; + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/epoll.h b/io_uring/epoll.h index 870cce11ba98..4111997c360b 100644 --- a/io_uring/epoll.h +++ b/io_uring/epoll.h @@ -3,4 +3,6 @@ #if defined(CONFIG_EPOLL) int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags); #endif diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9511262c513e..db77df513d55 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -529,6 +529,17 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_EPOLL_WAIT] = { + .needs_file = 1, + .audit_skip = 1, + .pollin = 1, +#if defined(CONFIG_EPOLL) + .prep = io_epoll_wait_prep, + .issue = io_epoll_wait, +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const struct io_cold_def io_cold_defs[] = { @@ -761,6 +772,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECV_ZC] = { .name = "RECV_ZC", }, + [IORING_OP_EPOLL_WAIT] = { + .name = "EPOLL_WAIT", + }, }; const char *io_uring_get_opcode(u8 opcode) |