Diffstat (limited to 'io_uring')
 io_uring/fdinfo.c   | 12
 io_uring/io-wq.c    |  4
 io_uring/io_uring.c | 10
 io_uring/io_uring.h |  2
 io_uring/kbuf.c     |  6
 io_uring/kbuf.h     |  3
 io_uring/net.c      | 36
 io_uring/opdef.c    |  1
 io_uring/register.c |  7
 io_uring/rsrc.c     | 34
 io_uring/rsrc.h     |  1
 io_uring/sqpoll.c   | 49
 io_uring/sqpoll.h   |  8
 io_uring/zcrx.c     |  6
 14 files changed, 122 insertions(+), 57 deletions(-)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index e9355276ab5d..9798d6fb4ec7 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -141,18 +141,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
if (ctx->flags & IORING_SETUP_SQPOLL) {
struct io_sq_data *sq = ctx->sq_data;
+ struct task_struct *tsk;
+ rcu_read_lock();
+ tsk = rcu_dereference(sq->thread);
/*
* sq->thread might be NULL if we raced with the sqpoll
* thread termination.
*/
- if (sq->thread) {
+ if (tsk) {
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ getrusage(tsk, RUSAGE_SELF, &sq_usage);
+ put_task_struct(tsk);
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
- getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ sq_usage.ru_stime.tv_usec);
sq_work_time = sq->work_time;
+ } else {
+ rcu_read_unlock();
}
}
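
A minimal sketch of the read-side pattern the fdinfo hunk above relies on (the write side of the sqd->thread RCU conversion is in sqpoll.c/sqpoll.h further down). This is not part of the patch; the helper name is hypothetical and only APIs already visible in the hunk are used: the task pointer is sampled under rcu_read_lock(), pinned with get_task_struct() before the read-side critical section ends, and only then handed to the potentially sleeping getrusage().

/* Hypothetical helper, for illustration only. */
static bool sqpoll_task_rusage(struct io_sq_data *sq, struct rusage *usage)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = rcu_dereference(sq->thread);	/* NULL if the sqpoll thread exited */
	if (!tsk) {
		rcu_read_unlock();
		return false;
	}
	get_task_struct(tsk);			/* pin before leaving the RCU section */
	rcu_read_unlock();

	getrusage(tsk, RUSAGE_SELF, usage);	/* may sleep; safe, we hold a reference */
	put_task_struct(tsk);
	return true;
}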
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index cd1fcb115739..be91edf34f01 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1259,8 +1259,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
atomic_set(&wq->worker_refs, 1);
init_completion(&wq->worker_done);
ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
- if (ret)
+ if (ret) {
+ put_task_struct(wq->task);
goto err;
+ }
return wq;
err:
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index cf759c172083..73648d26a622 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1523,6 +1523,9 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
}
}
mutex_unlock(&ctx->uring_lock);
+
+ if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ io_move_task_work_from_local(ctx);
}
static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events)
@@ -1663,11 +1666,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
io_req_flags_t io_file_get_flags(struct file *file)
{
+ struct inode *inode = file_inode(file);
io_req_flags_t res = 0;
BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1);
- if (S_ISREG(file_inode(file)->i_mode))
+ if (S_ISREG(inode->i_mode) && !(inode->i_flags & S_ANON_INODE))
res |= REQ_F_ISREG;
if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
res |= REQ_F_SUPPORT_NOWAIT;
@@ -2906,7 +2910,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
struct task_struct *tsk;
io_sq_thread_park(sqd);
- tsk = sqd->thread;
+ tsk = sqpoll_task_locked(sqd);
if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
io_wq_cancel_cb(tsk->io_uring->io_wq,
io_cancel_ctx_cb, ctx, true);
@@ -3142,7 +3146,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
s64 inflight;
DEFINE_WAIT(wait);
- WARN_ON_ONCE(sqd && sqd->thread != current);
+ WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current);
if (!current->io_uring)
return;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index d59c12277d58..66c1ca73f55e 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -98,8 +98,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *coun
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
-int io_uring_alloc_task_context(struct task_struct *task,
- struct io_ring_ctx *ctx);
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 2ea65f3cef72..f2d2cc319faa 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -270,8 +270,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
/* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
- if (!(bl->flags & IOBL_INC))
+ if (!(bl->flags & IOBL_INC)) {
+ arg->partial_map = 1;
+ if (iov != arg->iovs)
+ break;
buf->len = len;
+ }
}
iov->iov_base = u64_to_user_ptr(buf->addr);
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 5d83c7adc739..723d0361898e 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -58,7 +58,8 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
- unsigned buf_group;
+ unsigned short buf_group;
+ unsigned short partial_map;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
diff --git a/io_uring/net.c b/io_uring/net.c
index e16633fd6630..43a43522f406 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -75,12 +75,17 @@ struct io_sr_msg {
u16 flags;
/* initialised and used only by !msg send variants */
u16 buf_group;
- bool retry;
+ unsigned short retry_flags;
void __user *msg_control;
/* used only for send zerocopy */
struct io_kiocb *notif;
};
+enum sr_retry_flags {
+ IO_SR_MSG_RETRY = 1,
+ IO_SR_MSG_PARTIAL_MAP = 2,
+};
+
/*
* Number of times we'll try and do receives if there's more data. If we
* exceed this limit, then add us to the back of the queue and retry from
@@ -187,7 +192,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
req->flags &= ~REQ_F_BL_EMPTY;
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
sr->len = 0; /* get from the provided buffer */
}
@@ -397,7 +402,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~SENDMSG_FLAGS)
@@ -751,7 +756,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
@@ -821,9 +826,9 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
if (sr->flags & IORING_RECVSEND_BUNDLE) {
size_t this_ret = *ret - sr->done_io;
- cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret),
+ cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
- if (sr->retry)
+ if (sr->retry_flags & IO_SR_MSG_RETRY)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
@@ -832,12 +837,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
* If more is available AND it was a full transfer, retry and
* append to this one
*/
- if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+ if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
!iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
sr->done_io += this_ret;
- sr->retry = true;
+ sr->retry_flags |= IO_SR_MSG_RETRY;
return false;
}
} else {
@@ -1077,6 +1082,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
if (unlikely(ret < 0))
return ret;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ if (arg.partial_map)
+ sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;
+
/* special case 1 vec, can be a fast path */
if (ret == 1) {
sr->buf = arg.iovs[0].iov_base;
@@ -1085,11 +1098,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
- kmsg->vec.nr = ret;
- kmsg->vec.iovec = arg.iovs;
- req->flags |= REQ_F_NEED_CLEANUP;
- }
} else {
void __user *buf;
@@ -1275,7 +1283,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
zc->done_io = 0;
- zc->retry = false;
+ zc->retry_flags = 0;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 6e0882b051f9..6de6229207a8 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .hash_reg_file = 1,
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
diff --git a/io_uring/register.c b/io_uring/register.c
index cc23a4c205cd..a59589249fce 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
if (ctx->flags & IORING_SETUP_SQPOLL) {
sqd = ctx->sq_data;
if (sqd) {
+ struct task_struct *tsk;
+
/*
* Observe the correct sqd->lock -> ctx->uring_lock
* ordering. Fine to drop uring_lock here, we hold
@@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
mutex_unlock(&ctx->uring_lock);
mutex_lock(&sqd->lock);
mutex_lock(&ctx->uring_lock);
- if (sqd->thread)
- tctx = sqd->thread->io_uring;
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk)
+ tctx = tsk->io_uring;
}
} else {
tctx = current->io_uring;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index c592ceace97d..f2b31fb68992 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -112,8 +112,11 @@ static void io_release_ubuf(void *priv)
struct io_mapped_ubuf *imu = priv;
unsigned int i;
- for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
+ for (i = 0; i < imu->nr_bvecs; i++) {
+ struct folio *folio = page_folio(imu->bvec[i].bv_page);
+
+ unpin_user_folio(folio, 1);
+ }
}
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
@@ -731,6 +734,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
data->nr_pages_mid = folio_nr_pages(folio);
data->folio_shift = folio_shift(folio);
+ data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
/*
* Check if pages are contiguous inside a folio, and all folios have
@@ -809,10 +813,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
imu->nr_bvecs = nr_pages;
ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
- if (ret) {
- unpin_user_pages(pages, nr_pages);
+ if (ret)
goto done;
- }
size = iov->iov_len;
/* store original address for later verification */
@@ -826,7 +828,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
- off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
+
+ off = (unsigned long)iov->iov_base & ~PAGE_MASK;
+ if (coalesced)
+ off += data.first_folio_page_idx << PAGE_SHIFT;
+
node->buf = imu;
ret = 0;
@@ -842,6 +848,10 @@ done:
if (ret) {
if (imu)
io_free_imu(ctx, imu);
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ unpin_user_folio(page_folio(pages[i]), 1);
+ }
io_cache_free(&ctx->node_cache, node);
node = ERR_PTR(ret);
}
@@ -1177,6 +1187,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
return -EINVAL;
if (check_add_overflow(arg->nr, arg->dst_off, &nbufs))
return -EOVERFLOW;
+ if (nbufs > IORING_MAX_REG_BUFFERS)
+ return -EINVAL;
ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr));
if (ret)
@@ -1327,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
{
unsigned long folio_size = 1 << imu->folio_shift;
unsigned long folio_mask = folio_size - 1;
- u64 folio_addr = imu->ubuf & ~folio_mask;
struct bio_vec *res_bvec = vec->bvec;
size_t total_len = 0;
unsigned bvec_idx = 0;
@@ -1349,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
return -EOVERFLOW;
- /* by using folio address it also accounts for bvec offset */
- offset = buf_addr - folio_addr;
+ offset = buf_addr - imu->ubuf;
+ /*
+ * Only the first bvec can have non zero bv_offset, account it
+ * here and work with full folios below.
+ */
+ offset += imu->bvec[0].bv_offset;
+
src_bvec = imu->bvec + (offset >> imu->folio_shift);
offset &= folio_mask;
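
The two offset changes in rsrc.c above belong together: at registration time the stored offset becomes the in-page offset plus, for coalesced buffers, the page index of the user address within its first folio; at import time io_vec_fill_bvec() adds bvec[0].bv_offset (per the new comment, only the first bvec can have a non-zero offset) and then works in whole folios. A worked example with made-up values, not taken from the patch, just to show the index/offset split:

#include <stdio.h>

int main(void)
{
	/* Made-up example: 2 MiB folios, registered buffer starts 3 pages
	 * (12288 bytes) into its first folio, request begins 3 MiB into
	 * the buffer. Mirrors the reworked io_vec_fill_bvec() arithmetic.
	 */
	unsigned int folio_shift = 21;
	unsigned long folio_mask = (1UL << folio_shift) - 1;
	unsigned long offset = (3UL << 20) + 12288;	/* (buf_addr - imu->ubuf) + bvec[0].bv_offset */

	printf("source bvec index: %lu\n", offset >> folio_shift);	/* 1 */
	printf("offset in folio:   %lu\n", offset & folio_mask);	/* 1060864 */
	return 0;
}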
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 0d2138f16322..25e7e998dcfd 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,6 +49,7 @@ struct io_imu_folio_data {
unsigned int nr_pages_mid;
unsigned int folio_shift;
unsigned int nr_folios;
+ unsigned long first_folio_page_idx;
};
bool io_rsrc_cache_init(struct io_ring_ctx *ctx);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 03c699493b5a..a3f11349ce06 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -16,6 +16,7 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
+#include "tctx.h"
#include "napi.h"
#include "sqpoll.h"
@@ -30,7 +31,7 @@ enum {
void io_sq_thread_unpark(struct io_sq_data *sqd)
__releases(&sqd->lock)
{
- WARN_ON_ONCE(sqd->thread == current);
+ WARN_ON_ONCE(sqpoll_task_locked(sqd) == current);
/*
* Do the dance but not conditional clear_bit() because it'd race with
@@ -46,24 +47,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd)
void io_sq_thread_park(struct io_sq_data *sqd)
__acquires(&sqd->lock)
{
- WARN_ON_ONCE(data_race(sqd->thread) == current);
+ struct task_struct *tsk;
atomic_inc(&sqd->park_pending);
set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
+
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk) {
+ WARN_ON_ONCE(tsk == current);
+ wake_up_process(tsk);
+ }
}
void io_sq_thread_stop(struct io_sq_data *sqd)
{
- WARN_ON_ONCE(sqd->thread == current);
+ struct task_struct *tsk;
+
WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
mutex_lock(&sqd->lock);
- if (sqd->thread)
- wake_up_process(sqd->thread);
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk) {
+ WARN_ON_ONCE(tsk == current);
+ wake_up_process(tsk);
+ }
mutex_unlock(&sqd->lock);
wait_for_completion(&sqd->exited);
}
@@ -270,7 +279,8 @@ static int io_sq_thread(void *data)
/* offload context creation failed, just exit */
if (!current->io_uring) {
mutex_lock(&sqd->lock);
- sqd->thread = NULL;
+ rcu_assign_pointer(sqd->thread, NULL);
+ put_task_struct(current);
mutex_unlock(&sqd->lock);
goto err_out;
}
@@ -379,7 +389,8 @@ static int io_sq_thread(void *data)
io_sq_tw(&retry_list, UINT_MAX);
io_uring_cancel_generic(true, sqd);
- sqd->thread = NULL;
+ rcu_assign_pointer(sqd->thread, NULL);
+ put_task_struct(current);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
io_run_task_work();
@@ -409,7 +420,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
- struct task_struct *task_to_put = NULL;
int ret;
/* Retain compatibility with failing for an invalid attach attempt */
@@ -484,8 +494,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
goto err_sqpoll;
}
- sqd->thread = tsk;
- task_to_put = get_task_struct(tsk);
+ mutex_lock(&sqd->lock);
+ rcu_assign_pointer(sqd->thread, tsk);
+ mutex_unlock(&sqd->lock);
+
+ get_task_struct(tsk);
ret = io_uring_alloc_task_context(tsk, ctx);
wake_up_new_task(tsk);
if (ret)
@@ -495,16 +508,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
ret = -EINVAL;
goto err;
}
-
- if (task_to_put)
- put_task_struct(task_to_put);
return 0;
err_sqpoll:
complete(&ctx->sq_data->exited);
err:
io_sq_thread_finish(ctx);
- if (task_to_put)
- put_task_struct(task_to_put);
return ret;
}
@@ -515,10 +523,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
int ret = -EINVAL;
if (sqd) {
+ struct task_struct *tsk;
+
io_sq_thread_park(sqd);
/* Don't set affinity for a dying thread */
- if (sqd->thread)
- ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
+ tsk = sqpoll_task_locked(sqd);
+ if (tsk)
+ ret = io_wq_cpu_affinity(tsk->io_uring, mask);
io_sq_thread_unpark(sqd);
}
diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
index 4171666b1cf4..b83dcdec9765 100644
--- a/io_uring/sqpoll.h
+++ b/io_uring/sqpoll.h
@@ -8,7 +8,7 @@ struct io_sq_data {
/* ctx's that are using this sqd */
struct list_head ctx_list;
- struct task_struct *thread;
+ struct task_struct __rcu *thread;
struct wait_queue_head wait;
unsigned sq_thread_idle;
@@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
+
+static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
+{
+ return rcu_dereference_protected(sqd->thread,
+ lockdep_is_held(&sqd->lock));
+}
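
sqpoll_task_locked() is the lock-side counterpart of the rcu_dereference() in fdinfo.c: it may only be called with sqd->lock held, which the lockdep_is_held() annotation enforces. Callers in this patch take the lock via io_sq_thread_park(), which also keeps the thread from clearing the pointer and dropping its reference, since the thread does both under the same lock. A hypothetical caller, for illustration only and not part of the patch:

static pid_t example_sqpoll_pid(struct io_sq_data *sqd)
{
	struct task_struct *tsk;
	pid_t pid = -1;

	io_sq_thread_park(sqd);			/* acquires sqd->lock */
	tsk = sqpoll_task_locked(sqd);		/* NULL once the thread has exited */
	if (tsk)
		pid = task_pid_nr(tsk);		/* stable while sqd->lock is held */
	io_sq_thread_unpark(sqd);		/* releases sqd->lock */
	return pid;
}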
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 797247a34cb7..085eeed8cd50 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -106,8 +106,10 @@ static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
for_each_sgtable_dma_sg(mem->sgt, sg, i)
total_size += sg_dma_len(sg);
- if (total_size < off + len)
- return -EINVAL;
+ if (total_size < off + len) {
+ ret = -EINVAL;
+ goto err;
+ }
mem->dmabuf_offset = off;
mem->size = len;