Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/io_uring.c  55
-rw-r--r--  io_uring/io_uring.h   1
-rw-r--r--  io_uring/kbuf.c      56
-rw-r--r--  io_uring/napi.c       4
-rw-r--r--  io_uring/net.c        5
-rw-r--r--  io_uring/rsrc.c       4
-rw-r--r--  io_uring/rw.c         7
-rw-r--r--  io_uring/timeout.c   13
8 files changed, 77 insertions, 68 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4e362c8542a7..3ba49c628337 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2444,7 +2444,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
goto out_wake;
}
- iowq->t.function = io_cqring_timer_wakeup;
+ hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup);
hrtimer_set_expires(timer, iowq->timeout);
return HRTIMER_RESTART;
out_wake:
@@ -2479,8 +2479,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0;
}
+struct ext_arg {
+ size_t argsz;
+ struct timespec64 ts;
+ const sigset_t __user *sig;
+ ktime_t min_time;
+ bool ts_set;
+ bool iowait;
+};
+
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
int ret = 0;
@@ -2490,7 +2500,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
* can take into account that the task is waiting for IO - turns out
* to be important for low QD IO.
*/
- if (current_pending_io())
+ if (ext_arg->iowait && current_pending_io())
current->in_iowait = 1;
if (iowq->timeout != KTIME_MAX || iowq->min_timeout)
ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time);
@@ -2503,6 +2513,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
/* If this returns > 0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
+ struct ext_arg *ext_arg,
ktime_t start_time)
{
if (unlikely(READ_ONCE(ctx->check_cq)))
@@ -2516,17 +2527,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (unlikely(io_should_wake(iowq)))
return 0;
- return __io_cqring_wait_schedule(ctx, iowq, start_time);
+ return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time);
}
-struct ext_arg {
- size_t argsz;
- struct timespec64 ts;
- const sigset_t __user *sig;
- ktime_t min_time;
- bool ts_set;
-};
-
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -2606,7 +2609,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
TASK_INTERRUPTIBLE);
}
- ret = io_cqring_wait_schedule(ctx, &iowq, start_time);
+ ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time);
__set_current_state(TASK_RUNNING);
atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
@@ -3263,6 +3266,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
const struct io_uring_getevents_arg __user *uarg = argp;
struct io_uring_getevents_arg arg;
+ ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT);
+
/*
* If EXT_ARG isn't set, then we have no timespec and the argp pointer
* is just a pointer to the sigset_t.
@@ -3340,7 +3345,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING |
IORING_ENTER_ABS_TIMER |
- IORING_ENTER_EXT_ARG_REG)))
+ IORING_ENTER_EXT_ARG_REG |
+ IORING_ENTER_NO_IOWAIT)))
return -EINVAL;
/*
@@ -3748,7 +3754,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
- IORING_FEAT_RW_ATTR;
+ IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3822,29 +3828,36 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
return io_uring_create(entries, &p, params);
}
-static inline bool io_uring_allowed(void)
+static inline int io_uring_allowed(void)
{
int disabled = READ_ONCE(sysctl_io_uring_disabled);
kgid_t io_uring_group;
if (disabled == 2)
- return false;
+ return -EPERM;
if (disabled == 0 || capable(CAP_SYS_ADMIN))
- return true;
+ goto allowed_lsm;
io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
if (!gid_valid(io_uring_group))
- return false;
+ return -EPERM;
- return in_group_p(io_uring_group);
+ if (!in_group_p(io_uring_group))
+ return -EPERM;
+
+allowed_lsm:
+ return security_uring_allowed();
}
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
struct io_uring_params __user *, params)
{
- if (!io_uring_allowed())
- return -EPERM;
+ int ret;
+
+ ret = io_uring_allowed();
+ if (ret)
+ return ret;
return io_uring_setup(entries, params);
}
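
The io_uring.c hunks above plumb a new IORING_ENTER_NO_IOWAIT flag through io_uring_enter() so waiters can skip iowait accounting, and advertise support via IORING_FEAT_NO_IOWAIT. Below is a minimal userspace sketch of how the pair might be used together, assuming a uapi <linux/io_uring.h> that already carries both definitions; the raw-syscall wrappers are illustrative only, not part of the patch.

/*
 * Minimal sketch: probe IORING_FEAT_NO_IOWAIT and wait for completions
 * without the sleeping task being accounted as iowait.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

int wait_no_iowait(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0)
		return -1;

	/* Only pass the new enter flag if the kernel advertises it. */
	unsigned int enter_flags = IORING_ENTER_GETEVENTS;
	if (p.features & IORING_FEAT_NO_IOWAIT)
		enter_flags |= IORING_ENTER_NO_IOWAIT;

	/* Blocks until a CQE is posted; the sleep is not counted as iowait. */
	syscall(__NR_io_uring_enter, fd, 0, 1, enter_flags, NULL, 0);
	close(fd);
	return 0;
}
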
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 2308f39ed915..87f883130286 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -91,7 +91,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags);
-bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 3478be6d02ab..098109259671 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -32,6 +32,25 @@ struct io_provide_buf {
__u16 bid;
};
+static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
+{
+ while (len) {
+ struct io_uring_buf *buf;
+ u32 this_len;
+
+ buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
+ this_len = min_t(int, len, buf->len);
+ buf->len -= this_len;
+ if (buf->len) {
+ buf->addr += this_len;
+ return false;
+ }
+ bl->head++;
+ len -= this_len;
+ }
+ return true;
+}
+
bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr)
{
@@ -42,20 +61,8 @@ bool io_kbuf_commit(struct io_kiocb *req,
if (unlikely(len < 0))
return true;
-
- if (bl->flags & IOBL_INC) {
- struct io_uring_buf *buf;
-
- buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
- if (WARN_ON_ONCE(len > buf->len))
- len = buf->len;
- buf->len -= len;
- if (buf->len) {
- buf->addr += len;
- return false;
- }
- }
-
+ if (bl->flags & IOBL_INC)
+ return io_kbuf_inc_commit(bl, len);
bl->head += nr;
return true;
}
@@ -226,25 +233,14 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) {
u32 len = READ_ONCE(buf->len);
+ size_t needed;
if (unlikely(!len))
return -ENOBUFS;
- /*
- * Limit incremental buffers to 1 segment. No point trying
- * to peek ahead and map more than we need, when the buffers
- * themselves should be large when setup with
- * IOU_PBUF_RING_INC.
- */
- if (bl->flags & IOBL_INC) {
- nr_avail = 1;
- } else {
- size_t needed;
-
- needed = (arg->max_len + len - 1) / len;
- needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
- if (nr_avail > needed)
- nr_avail = needed;
- }
+ needed = (arg->max_len + len - 1) / len;
+ needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
+ if (nr_avail > needed)
+ nr_avail = needed;
}
/*
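
The kbuf.c change above splits the IOBL_INC path into io_kbuf_inc_commit() and allows one commit to consume several ring entries, instead of clamping the length to a single buffer and limiting incremental rings to one peeked segment. The toy program below is an illustrative simulation of that loop, not kernel code: a partially consumed entry stays at the ring head with its addr advanced, while fully consumed entries bump the head.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_buf { uint64_t addr; uint32_t len; };

static bool toy_inc_commit(struct toy_buf *ring, uint16_t *head, int len)
{
	while (len) {
		struct toy_buf *buf = &ring[*head];
		uint32_t this_len = (uint32_t)len < buf->len ? (uint32_t)len : buf->len;

		buf->len -= this_len;
		if (buf->len) {
			/* Partial consumption: keep the entry, advance its start. */
			buf->addr += this_len;
			return false;
		}
		/* Entry fully consumed: move on to the next ring entry. */
		(*head)++;
		len -= this_len;
	}
	return true;
}

int main(void)
{
	struct toy_buf ring[2] = { { 0x1000, 64 }, { 0x2000, 64 } };
	uint16_t head = 0;

	/* 96 bytes consume all of entry 0 and half of entry 1. */
	assert(!toy_inc_commit(ring, &head, 96));
	assert(head == 1 && ring[1].addr == 0x2020 && ring[1].len == 32);
	return 0;
}
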
diff --git a/io_uring/napi.c b/io_uring/napi.c
index b1ade3fda30f..4a10de03e426 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -44,7 +44,7 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
struct io_napi_entry *e;
/* Non-NAPI IDs can be rejected. */
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return -EINVAL;
hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
@@ -87,7 +87,7 @@ static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
struct io_napi_entry *e;
/* Non-NAPI IDs can be rejected. */
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return -EINVAL;
hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
diff --git a/io_uring/net.c b/io_uring/net.c
index 616e953ef0ae..8944eb679024 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -157,7 +157,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
req->async_data = NULL;
- req->flags &= ~REQ_F_ASYNC_DATA;
+ req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP);
}
}
@@ -464,7 +464,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_req_msg_cleanup(struct io_kiocb *req,
unsigned int issue_flags)
{
- req->flags &= ~REQ_F_NEED_CLEANUP;
io_netmsg_recycle(req, issue_flags);
}
@@ -1499,6 +1498,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(zc->notif);
+ zc->notif = NULL;
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
@@ -1572,6 +1572,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
*/
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
io_notif_flush(sr->notif);
+ sr->notif = NULL;
io_req_msg_cleanup(req, 0);
}
io_req_set_res(req, ret, IORING_CQE_F_MORE);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 607b09bd8374..3f195e24777e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1024,7 +1024,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);
+ iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
/*
@@ -1051,6 +1051,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
* to use the slow iter advance.
*/
if (offset < bvec->bv_len) {
+ iter->count -= offset;
iter->iov_offset = offset;
} else if (imu->is_kbuf) {
iov_iter_advance(iter, offset);
@@ -1063,6 +1064,7 @@ static int io_import_fixed(int ddir, struct iov_iter *iter,
iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
+ iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
}
}
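
The rsrc.c hunks adjust the fixed-buffer iterator accounting: the iov_iter is now created over offset + len bytes and the skipped bytes are subtracted back out, so iter->count ends up exactly len whether the offset lands in the first bvec or requires skipping whole folio-sized segments. The standalone check below walks the multi-segment path with assumed example values (4K folios, a full-folio first bvec); it is a worked illustration, not code from the patch.

#include <assert.h>

int main(void)
{
	unsigned folio_shift = 12;		/* 4096-byte folios */
	unsigned first_bv_len = 4096;		/* bv_len of imu->bvec[0] */
	unsigned long offset = 9000;		/* buf_addr - imu->ubuf */
	unsigned long len = 1000;		/* requested length */

	/* iov_iter_bvec(iter, ddir, imu->bvec, nr_bvecs, offset + len) */
	unsigned long count = offset + len;			/* 10000 */

	/* offset >= first bvec: skip whole segments, then trim the count. */
	offset -= first_bv_len;					/* 4904 */
	unsigned long seg_skip = 1 + (offset >> folio_shift);	/* 2 */
	count -= first_bv_len + offset;				/* 10000 - 9000 */
	unsigned long iov_offset = offset & ((1UL << folio_shift) - 1);

	assert(seg_skip == 2);
	assert(count == len);		/* iterator now covers exactly len bytes */
	assert(iov_offset == 808);	/* 9000 = 4096 + 4096 + 808 */
	return 0;
}
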
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 246b22225919..039e063f7091 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -599,11 +599,10 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ if (res == -EAGAIN && io_rw_should_reissue(req))
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
- return;
- }
- req->cqe.res = res;
+ else
+ req->cqe.res = res;
}
/* order with io_iopoll_complete() checking ->iopoll_completed */
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index fec6ec7beb62..2a107665230b 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -407,8 +407,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
io = req->async_data;
if (hrtimer_try_to_cancel(&io->timer) == -1)
return -EALREADY;
- hrtimer_init(&io->timer, io_timeout_get_clock(io), mode);
- io->timer.function = io_link_timeout_fn;
+ hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode);
hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode);
return 0;
}
@@ -430,8 +429,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
data->ts = *ts;
list_add_tail(&timeout->list, &ctx->timeout_list);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), mode);
- data->timer.function = io_timeout_fn;
+ hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode);
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode);
return 0;
}
@@ -557,7 +555,6 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL;
data->mode = io_translate_timeout_mode(flags);
- hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode);
if (is_timeout_link) {
struct io_submit_link *link = &req->ctx->submit_state.link;
@@ -568,6 +565,10 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL;
timeout->head = link->last;
link->last->flags |= REQ_F_ARM_LTIMEOUT;
+ hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data),
+ data->mode);
+ } else {
+ hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode);
}
return 0;
}
@@ -627,7 +628,6 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
}
add:
list_add(&timeout->list, entry);
- data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
raw_spin_unlock_irq(&ctx->timeout_lock);
return IOU_ISSUE_SKIP_COMPLETE;
@@ -646,7 +646,6 @@ void io_queue_linked_timeout(struct io_kiocb *req)
if (timeout->head) {
struct io_timeout_data *data = req->async_data;
- data->timer.function = io_link_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
data->mode);
list_add_tail(&timeout->list, &ctx->ltimeout_list);
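
The timeout.c hunks convert open-coded hrtimer_init() plus a later ->function assignment into hrtimer_setup(), with __io_timeout_prep() now choosing io_timeout_fn or io_link_timeout_fn at prep time. The fragment below is a minimal sketch of that pattern in isolation, not taken from the patch; the demo_ names are placeholders.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/types.h>

static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
{
	return HRTIMER_NORESTART;
}

static void demo_arm_timer(struct hrtimer *timer, u64 ns)
{
	/*
	 * Old style:
	 *	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	 *	timer->function = demo_timer_fn;
	 * New style binds the callback at init time:
	 */
	hrtimer_setup(timer, demo_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hrtimer_start(timer, ns_to_ktime(ns), HRTIMER_MODE_REL);
}
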