58 files changed, 3883 insertions, 872 deletions
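
Before the file-by-file changes, a rough idea of how the new ublk zero-copy uapi is meant to be driven from a userspace ublk server may help. The sketch below is illustrative only and is not lifted from the selftests added by this series: liburing is assumed, ublk_fd/backing_fd/q_id/tag/buf_idx/off/len are placeholders, the buffer-table slot is assumed to have been created up front (e.g. with io_uring_register_buffers_sparse()), and treating the fixed-read addr argument as an offset into the registered request buffer is an assumption rather than something stated in this diff.

	#include <string.h>
	#include <liburing.h>
	#include <linux/ublk_cmd.h>

	/* Fill an IORING_OP_URING_CMD SQE carrying a ublksrv_io_cmd in sqe->cmd. */
	static void ublk_sqe_prep_io_cmd(struct io_uring_sqe *sqe, int ublk_fd,
					 __u32 cmd_op, __u16 q_id, __u16 tag,
					 __u64 addr)
	{
		struct ublksrv_io_cmd *cmd;

		memset(sqe, 0, sizeof(*sqe));
		sqe->opcode = IORING_OP_URING_CMD;
		sqe->fd = ublk_fd;
		sqe->cmd_op = cmd_op;
		cmd = (struct ublksrv_io_cmd *)sqe->cmd;
		cmd->q_id = q_id;
		cmd->tag = tag;
		cmd->addr = addr;	/* buffer-table index for (UN)REGISTER_IO_BUF */
	}

	/* Serve one ublk request with zero copy: register its pages as a fixed
	 * buffer, use them for backing-file I/O, then unregister again. */
	static void queue_zero_copy_read(struct io_uring *ring, int ublk_fd,
					 int backing_fd, __u16 q_id, __u16 tag,
					 unsigned buf_idx, __u64 off, __u32 len)
	{
		struct io_uring_sqe *sqe;

		sqe = io_uring_get_sqe(ring);
		ublk_sqe_prep_io_cmd(sqe, ublk_fd, UBLK_U_IO_REGISTER_IO_BUF,
				     q_id, tag, buf_idx);
		sqe->flags |= IOSQE_IO_LINK;

		/* Backing-file read into the registered request buffer; the
		 * addr argument is assumed to act as an offset into it here. */
		sqe = io_uring_get_sqe(ring);
		io_uring_prep_read_fixed(sqe, backing_fd, 0, len, off, buf_idx);
		sqe->flags |= IOSQE_IO_LINK;

		sqe = io_uring_get_sqe(ring);
		ublk_sqe_prep_io_cmd(sqe, ublk_fd, UBLK_U_IO_UNREGISTER_IO_BUF,
				     q_id, tag, buf_idx);

		io_uring_submit(ring);
	}

As the driver changes below show, UBLK_U_IO_REGISTER_IO_BUF takes a reference on the ublk request and exposes its bvecs through the chosen buffer-table index, and UBLK_U_IO_UNREGISTER_IO_BUF drops that reference again, which is why the two commands bracket the backing-file I/O.
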
diff --git a/MAINTAINERS b/MAINTAINERS index a0791caff632..ea62d303c56b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24397,6 +24397,7 @@ S: Maintained F: Documentation/block/ublk.rst F: drivers/block/ublk_drv.c F: include/uapi/linux/ublk_cmd.h +F: tools/testing/selftests/ublk/ UBSAN M: Kees Cook <kees@kernel.org> diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ca9a67b5b537..6d7b26ab434f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -51,6 +51,9 @@ /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -196,12 +199,14 @@ struct ublk_params_header { static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_COPY; + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) @@ -581,7 +586,7 @@ static void ublk_apply_params(struct ublk_device *ub) static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & UBLK_F_USER_COPY; + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -1747,6 +1752,42 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, io_uring_cmd_mark_cancelable(cmd, issue_flags); } +static void ublk_io_release(void *priv) +{ + struct request *rq = priv; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + ublk_put_req_ref(ubq, rq); +} + +static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_queue *ubq, unsigned int tag, + unsigned int index, unsigned int issue_flags) +{ + struct ublk_device *ub = cmd->file->private_data; + struct request *req; + int ret; + + req = __ublk_check_and_get_req(ub, ubq, tag, 0); + if (!req) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) { + ublk_put_req_ref(ubq, req); + return ret; + } + + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + unsigned int index, unsigned int issue_flags) +{ + return io_buffer_unregister_bvec(cmd, index, issue_flags); +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1798,6 +1839,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ret = -EINVAL; switch (_IOC_NR(cmd_op)) { + case UBLK_IO_REGISTER_IO_BUF: + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags); + case UBLK_IO_UNREGISTER_IO_BUF: + return ublk_unregister_io_buf(cmd, ub_cmd->addr, issue_flags); case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -2459,7 +2504,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * buffer by pwrite() to ublk char device, which can't be * used 
for unprivileged device */ - if (info.flags & UBLK_F_USER_COPY) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; } @@ -2527,9 +2572,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_free_dev_number; } - /* We are not ready to support zero copy */ - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; - ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); @@ -2863,7 +2905,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; - u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + u64 features = UBLK_F_ALL; if (header->len != UBLK_FEATURES_LEN || !header->addr) return -EINVAL; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 24e2c702da7a..ecf136489044 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -114,7 +114,8 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - struct io_uring_cmd *ioucmd, unsigned int flags) + struct io_uring_cmd *ioucmd, unsigned int flags, + unsigned int iou_issue_flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -146,7 +147,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, goto out; } ret = io_uring_cmd_import_fixed(ubuffer, bufflen, - rq_data_dir(req), &iter, ioucmd); + rq_data_dir(req), &iter, ioucmd, + iou_issue_flags); if (ret < 0) goto out; ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); @@ -198,7 +200,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, NULL, flags); + meta_len, NULL, flags, 0); if (ret) return ret; } @@ -514,10 +516,10 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return PTR_ERR(req); req->timeout = d.timeout_ms ? msecs_to_jiffies(d.timeout_ms) : 0; - if (d.addr && d.data_len) { + if (d.data_len) { ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, ioucmd, vec); + d.metadata_len, ioucmd, vec, issue_flags); if (ret) return ret; } diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index abd0c8bd950b..598cacda4aa3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -4,6 +4,7 @@ #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> +#include <linux/blk-mq.h> /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) @@ -39,7 +40,9 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags); /* * Completes the request, i.e. 
posts an io_uring CQE and deallocates @ioucmd @@ -66,8 +69,10 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) +static inline int +io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } @@ -123,4 +128,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3def525a1da3..72aac84dca93 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -292,6 +292,8 @@ struct io_ring_ctx { struct io_file_table file_table; struct io_rsrc_data buf_table; + struct io_alloc_cache node_cache; + struct io_alloc_cache imu_cache; struct io_submit_state submit_state; @@ -360,7 +362,6 @@ struct io_ring_ctx { spinlock_t completion_lock; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct hlist_head waitid_list; @@ -379,8 +380,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct list_head io_buffers_cache; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; @@ -439,8 +438,15 @@ struct io_ring_ctx { struct io_mapped_region param_region; }; +/* + * Token indicating function is called in task work context: + * ctx->uring_lock is held and any completions generated will be flushed. + * ONLY core io_uring.c should instantiate this struct. + */ struct io_tw_state { }; +/* Alias to use in code that doesn't instantiate struct io_tw_state */ +typedef struct io_tw_state io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, @@ -566,7 +572,7 @@ enum { REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); struct io_task_work { struct llist_node node; @@ -601,7 +607,11 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz) io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ ((cmd_type *)&(req)->cmd) \ ) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) +{ + return ptr; +} struct io_kiocb { union { diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 050fa8eb2e8f..0d6c83c8d1cf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -541,6 +541,7 @@ struct io_cqring_offsets { #define IORING_ENTER_REGISTERED_RING (1U << 4) #define IORING_ENTER_ABS_TIMER (1U << 5) #define IORING_ENTER_EXT_ARG_REG (1U << 6) +#define IORING_ENTER_NO_IOWAIT (1U << 7) /* * Passed in for io_uring_setup(2). 
Copied back with updated info on success @@ -578,6 +579,7 @@ struct io_uring_params { #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_MIN_TIMEOUT (1U << 15) #define IORING_FEAT_RW_ATTR (1U << 16) +#define IORING_FEAT_NO_IOWAIT (1U << 17) /* * io_uring_register(2) opcodes and arguments diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a8bc98bb69fc..74246c926b55 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -94,6 +94,10 @@ _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) #define UBLK_U_IO_NEED_GET_DATA \ _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 0dd17d8ba93a..7f68eff2e7f3 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -68,4 +68,10 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) return io_cache_alloc_new(cache, gfp); } +static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) +{ + if (!io_alloc_cache_put(cache, obj)) + kfree(obj); +} + #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 484193567839..0870060bac7c 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -341,3 +341,45 @@ out: fput(file); return ret; } + +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_match_task_safe(req, tctx, cancel_all)) + continue; + hlist_del_init(&req->hash_node); + if (cancel(req)) + found = true; + } + + return found; +} + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_cancel_req_match(req, cd)) + continue; + if (cancel(req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + return nr ?: -ENOENT; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index bbfea2cd00ea..43e9bb74e9d1 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -24,6 +24,14 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)); + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)); + static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { if (req->cancel_seq_set && sequence == req->work.cancel_seq) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index dd8eeec97acf..a21660e3145a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ 
-68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) return -ENOMEM; diff --git a/io_uring/futex.c b/io_uring/futex.c index 43e2143255f5..0ea4820cd8ff 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -44,30 +44,28 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { req->async_data = NULL; hlist_del_init(&req->hash_node); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } -static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { - struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; - io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) - kfree(ifd); - __io_futex_complete(req, ts); + io_tw_lock(ctx, tw); + io_cache_free(&ctx->futex_cache, req->async_data); + __io_futex_complete(req, tw); } -static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; @@ -79,7 +77,7 @@ static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } static bool io_futexv_claim(struct io_futex *iof) @@ -90,7 +88,7 @@ static bool io_futexv_claim(struct io_futex *iof) return true; } -static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_futex_cancel(struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { @@ -116,49 +114,13 @@ static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_futex_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); } bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_futex_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, 
cancel_all, __io_futex_cancel); } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 91019b4d0308..04a75d666195 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -30,7 +30,6 @@ enum { IO_WORKER_F_UP = 0, /* up and active */ IO_WORKER_F_RUNNING = 1, /* account as running */ IO_WORKER_F_FREE = 2, /* worker on free list */ - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -46,12 +45,12 @@ enum { */ struct io_worker { refcount_t ref; - int create_index; unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; + struct io_wq_acct *acct; struct io_wq_work *cur_work; raw_spinlock_t lock; @@ -77,10 +76,27 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) struct io_wq_acct { + /** + * Protects access to the worker lists. + */ + raw_spinlock_t workers_lock; + unsigned nr_workers; unsigned max_workers; - int index; atomic_t nr_running; + + /** + * The list of free workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct hlist_nulls_head free_list; + + /** + * The list of all workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct list_head all_list; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; @@ -112,12 +128,6 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - /* lock protects access to elements below */ - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - struct wait_queue_entry wait; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; @@ -135,7 +145,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, int index); +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); static void io_wq_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, @@ -160,14 +170,14 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) } static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, - struct io_wq_work *work) + unsigned int work_flags) { - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); + return worker->acct; } static void io_worker_ref_put(struct io_wq *wq) @@ -192,9 +202,9 @@ static void io_worker_cancel_cb(struct io_worker *worker) struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -213,6 +223,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { struct io_wq *wq = worker->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -226,11 +237,11 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (test_bit(IO_WORKER_F_FREE, &worker->flags)) 
hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_wq_dec_running(worker); /* * this worker is a goner, clear ->worker_private to avoid any @@ -269,8 +280,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wq_activate_free_worker(struct io_wq *wq, - struct io_wq_acct *acct) +static bool io_acct_activate_free_worker(struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -281,13 +291,9 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wq_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } /* * If the worker is already running, it's either already * starting work or finishing work. In either case, if it does @@ -314,16 +320,16 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); - return create_io_worker(wq, acct->index); + return create_io_worker(wq, acct); } static void io_wq_inc_running(struct io_worker *worker) @@ -343,16 +349,16 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; - acct = &wq->acct[worker->create_index]; - raw_spin_lock(&wq->lock); + acct = worker->acct; + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); if (do_create) { - create_io_worker(wq, worker->create_index); + create_io_worker(wq, acct); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -384,7 +390,6 @@ static bool io_queue_worker_create(struct io_worker *worker, atomic_inc(&wq->worker_refs); init_task_work(&worker->create_work, func); - worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { /* * EXIT may have been set after checking it above, check after @@ -430,31 +435,36 @@ static void io_wq_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) +static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) { if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { clear_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } } /* * No work, worker going to sleep. Move to freelist. 
*/ -static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) - __must_hold(wq->lock) +static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) + __must_hold(acct->workers_lock) { if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { set_bit(IO_WORKER_F_FREE, &worker->flags); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); } } +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + static inline unsigned int io_get_work_hash(struct io_wq_work *work) { - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; + return __io_get_work_hash(atomic_read(&work->flags)); } static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) @@ -475,26 +485,27 @@ static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) } static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, - struct io_worker *worker) + struct io_wq *wq) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { + unsigned int work_flags; unsigned int hash; work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) { wq_list_del(&acct->work_list, node, prev); return work; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); /* all items with this hash lie in [work, tail] */ tail = wq->hash_tail[hash]; @@ -564,7 +575,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(acct, worker); + work = io_get_next_work(acct, wq); if (work) { /* * Make sure cancelation can find this, even before @@ -583,7 +594,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (!work) break; - __io_worker_busy(wq, worker); + __io_worker_busy(acct, worker); io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -591,12 +602,15 @@ static void io_worker_handle_work(struct io_wq_acct *acct, /* handle a whole dependent link */ do { struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); + unsigned int work_flags = atomic_read(&work->flags); + unsigned int hash = __io_wq_is_hashed(work_flags) + ? __io_get_work_hash(work_flags) + : -1U; next_hashed = wq_next_work(work); if (do_kill && - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -654,20 +668,20 @@ static int io_wq_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(acct, worker); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. 
*/ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wq, worker); - raw_spin_unlock(&wq->lock); + __io_worker_idle(acct, worker); + raw_spin_unlock(&acct->workers_lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -728,18 +742,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk) io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wq->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); - list_add_tail_rcu(&worker->all_list, &wq->all_list); + raw_spin_lock(&acct->workers_lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); + list_add_tail_rcu(&worker->all_list, &acct->all_list); set_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); wake_up_new_task(tsk); } @@ -787,20 +801,20 @@ static void create_worker_cont(struct callback_head *cb) struct io_worker *worker; struct task_struct *tsk; struct io_wq *wq; + struct io_wq_acct *acct; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wq = worker->wq; + acct = io_wq_get_acct(worker); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { - struct io_wq_acct *acct = io_wq_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -808,11 +822,11 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } io_worker_ref_put(wq); kfree(worker); @@ -834,9 +848,8 @@ static void io_workqueue_create(struct work_struct *work) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) { - struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; @@ -846,24 +859,22 @@ static bool create_io_worker(struct io_wq *wq, int index) if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); worker->wq = wq; + worker->acct = acct; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); - if (index == IO_WQ_ACCT_BOUND) - set_bit(IO_WORKER_F_BOUND, &worker->flags); - tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 
kfree(worker); goto fail; @@ -879,14 +890,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wq *wq, - bool (*func)(struct io_worker *, void *), - void *data) +static bool io_acct_for_each_worker(struct io_wq_acct *acct, + bool (*func)(struct io_worker *, void *), + void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -900,6 +911,18 @@ static bool io_wq_for_each_worker(struct io_wq *wq, return ret; } +static bool io_wq_for_each_worker(struct io_wq *wq, + bool (*func)(struct io_worker *, void *), + void *data) +{ + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) + return false; + } + + return true; +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { __set_notify_signal(worker->task); @@ -916,19 +939,19 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) } while (work); } -static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, + struct io_wq_work *work, unsigned int work_flags) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; - if (!io_wq_is_hashed(work)) { + if (!__io_wq_is_hashed(work_flags)) { append: wq_list_add_tail(&work->list, &acct->work_list); return; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); tail = wq->hash_tail[hash]; wq->hash_tail[hash] = work; if (!tail) @@ -944,8 +967,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int work_flags = atomic_read(&work->flags); + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -964,12 +987,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } raw_spin_lock(&acct->lock); - io_wq_insert_work(wq, work); + io_wq_insert_work(wq, acct, work, work_flags); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); rcu_read_lock(); - do_create = !io_wq_activate_free_worker(wq, acct); + do_create = !io_acct_activate_free_worker(acct); rcu_read_unlock(); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || @@ -980,12 +1003,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) if (likely(did_create)) return; - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); /* fatal condition, failed to create the first worker */ io_acct_cancel_pending_work(wq, acct, &match); @@ -1034,10 +1057,10 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static inline void io_wq_remove_pending(struct io_wq *wq, + struct io_wq_acct *acct, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -1064,7 +1087,7 @@ static bool 
io_acct_cancel_pending_work(struct io_wq *wq, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wq_remove_pending(wq, work, prev); + io_wq_remove_pending(wq, acct, work, prev); raw_spin_unlock(&acct->lock); io_run_cancel(work, wq); match->nr_pending++; @@ -1092,11 +1115,22 @@ retry: } } +static void io_acct_cancel_running_work(struct io_wq_acct *acct, + struct io_cb_cancel_data *match) +{ + raw_spin_lock(&acct->workers_lock); + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); + raw_spin_unlock(&acct->workers_lock); +} + static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); + + for (int i = 0; i < IO_WQ_ACCT_NR; i++) + io_acct_cancel_running_work(&wq->acct[i], match); + rcu_read_unlock(); } @@ -1119,16 +1153,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wq->lock, to ensure that + * Do both of these while holding the acct->workers_lock, to ensure that * we'll find a work item regardless of state. */ io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wq->lock); io_wq_cancel_running_work(wq, &match); - raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1152,7 +1184,7 @@ static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wq_activate_free_worker(wq, acct); + io_acct_activate_free_worker(acct); } rcu_read_unlock(); return 1; @@ -1190,16 +1222,16 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; - acct->index = i; atomic_set(&acct->nr_running, 0); + + raw_spin_lock_init(&acct->workers_lock); + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); + INIT_LIST_HEAD(&acct->all_list); + INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - raw_spin_lock_init(&wq->lock); - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); - INIT_LIST_HEAD(&wq->all_list); - wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); @@ -1385,14 +1417,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; + raw_spin_lock(&acct->workers_lock); prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; + raw_spin_unlock(&acct->workers_lock); } - raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index b3b004a7b625..d4fb2940e435 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -54,9 +54,14 @@ int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); bool io_wq_worker_stopped(void); +static inline bool __io_wq_is_hashed(unsigned int work_flags) +{ + return work_flags & IO_WQ_WORK_HASHED; +} + static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; + return __io_wq_is_hashed(atomic_read(&work->flags)); } typedef bool 
(work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 01d75e5c47aa..078475447264 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -110,11 +110,13 @@ #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -131,7 +133,6 @@ struct io_defer_entry { /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) /* * No waiters. It's larger than any valid value of the tw counter @@ -254,7 +255,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); + req->io_task_work.func(req, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -282,6 +283,17 @@ static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_alloc_cache_free(&ctx->msg_cache, kfree); + io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -313,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, @@ -328,6 +339,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); @@ -338,7 +350,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->completion_lock); raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -360,12 +371,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - 
io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -393,11 +399,8 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_kbuf_drop(req); - spin_unlock(&req->ctx->completion_lock); - } + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) + io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; @@ -542,7 +545,7 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) { io_queue_iowq(req); } @@ -899,7 +902,7 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if (ctx->lockless_cq) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; @@ -1021,7 +1024,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; @@ -1051,24 +1054,24 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, &ts); + req, ts); node = next; (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); return node; } @@ -1157,7 +1160,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, * We don't know how many reuqests is there in the link and whether * they can even be queued lazily, fall back to non-lazy. 
*/ - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) flags &= ~IOU_F_TWQ_LAZY_WAKE; guard(rcu)(); @@ -1276,7 +1279,7 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, } static int __io_run_local_work_loop(struct llist_node **node, - struct io_tw_state *ts, + io_tw_token_t tw, int events) { int ret = 0; @@ -1287,7 +1290,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + req, tw); *node = next; if (++ret >= events) break; @@ -1296,7 +1299,7 @@ static int __io_run_local_work_loop(struct llist_node **node, return ret; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, int min_events, int max_events) { struct llist_node *node; @@ -1309,7 +1312,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: min_events -= ret; - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) goto retry_done; @@ -1318,7 +1321,7 @@ again: * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret += __io_run_local_work_loop(&node, ts, max_events - ret); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); ctx->retry_llist.first = node; loops++; @@ -1340,7 +1343,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events, + return __io_run_local_work(ctx, ts, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } @@ -1351,20 +1354,20 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events, max_events); + ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) @@ -1419,8 +1422,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) - kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -1582,7 +1584,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) { io_req_complete_defer(req); } @@ -1719,15 +1721,13 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +static inline int 
__io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, def, issue_flags))) - return -EBADF; - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); @@ -1742,6 +1742,19 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); + return ret; +} + +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; + + ret = __io_issue_sqe(req, issue_flags, def); + if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); @@ -1762,11 +1775,26 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + IO_URING_F_COMPLETE_DEFER; + int ret; + + io_tw_lock(req->ctx, tw); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_OK); + + if (ret == IOU_ISSUE_SKIP_COMPLETE) + ret = 0; + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) @@ -1996,9 +2024,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; @@ -2062,7 +2089,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return io_init_fail_req(req, -EOPNOTSUPP); - io_init_req_drain(req); + io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { @@ -2458,8 +2485,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; } +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; @@ -2469,7 +2506,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. 
*/ - if (current_pending_io()) + if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); @@ -2482,6 +2519,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) @@ -2495,17 +2533,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (unlikely(io_should_wake(iowq))) return 0; - return __io_cqring_wait_schedule(ctx, iowq, start_time); + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } -struct ext_arg { - size_t argsz; - struct timespec64 ts; - const sigset_t __user *sig; - ktime_t min_time; - bool ts_set; -}; - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2583,7 +2613,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -2704,12 +2734,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, kfree); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); @@ -3239,6 +3264,8 @@ static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. 
@@ -3316,7 +3343,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING | IORING_ENTER_ABS_TIMER | - IORING_ENTER_EXT_ARG_REG))) + IORING_ENTER_EXT_ARG_REG | + IORING_ENTER_NO_IOWAIT))) return -EINVAL; /* @@ -3537,6 +3565,44 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } +static int io_uring_sanitise_params(struct io_uring_params *p) +{ + unsigned flags = p->flags; + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + } + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + return 0; +} + int io_uring_fill_params(unsigned entries, struct io_uring_params *p) { if (!entries) @@ -3547,10 +3613,6 @@ int io_uring_fill_params(unsigned entries, struct io_uring_params *p) entries = IORING_MAX_ENTRIES; } - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) - && !(p->flags & IORING_SETUP_NO_MMAP)) - return -EINVAL; - /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3612,6 +3674,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct file *file; int ret; + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + ret = io_uring_fill_params(entries, p); if (unlikely(ret)) return ret; @@ -3659,37 +3725,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, * For SQPOLL, we just need a wakeup, always. For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* HYBRID_IOPOLL only valid with IOPOLL */ - if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) == - IORING_SETUP_HYBRID_IOPOLL) - goto err; - - /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. This implies that there is only one submitter, so enforce - * that. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } /* * This is just grabbed for accounting purposes. 
When a process exits, @@ -3719,7 +3758,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | - IORING_FEAT_RW_ATTR; + IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3915,6 +3954,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling @@ -3925,10 +3967,9 @@ static int __init io_uring_init(void) req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); - io_buf_cachep = KMEM_CACHE(io_buffer, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ab619e63ef39..372129e24372 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -88,11 +88,10 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned flags); -bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); @@ -104,7 +103,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -147,6 +146,11 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #endif } +static inline bool io_is_compat(struct io_ring_ctx *ctx) +{ + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); +} + static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); @@ -376,7 +380,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || io_local_work_pending(ctx); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) { lockdep_assert_held(&ctx->uring_lock); } @@ -418,7 +422,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; -extern 
struct kmem_cache *io_buf_cachep; static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8e72de7712ac..098109259671 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -20,7 +20,8 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -struct kmem_cache *io_buf_cachep; +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] struct io_provide_buf { struct file *file; @@ -31,6 +32,41 @@ struct io_provide_buf { __u16 bid; }; +static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) +{ + while (len) { + struct io_uring_buf *buf; + u32 this_len; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + this_len = min_t(int, len, buf->len); + buf->len -= this_len; + if (buf->len) { + buf->addr += this_len; + return false; + } + bl->head++; + len -= this_len; + } + return true; +} + +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + if (bl->flags & IOBL_INC) + return io_kbuf_inc_commit(bl, len); + bl->head += nr; + return true; +} + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { @@ -52,6 +88,16 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +void io_kbuf_drop_legacy(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + req->buf_index = req->kbuf->bgid; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; +} + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -70,33 +116,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return true; } -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) -{ - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); - } -} - static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { @@ -214,25 +233,14 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); + size_t needed; if (unlikely(!len)) return -ENOBUFS; - /* - * Limit incremental buffers to 1 segment. No point trying - * to peek ahead and map more than we need, when the buffers - * themselves should be large when setup with - * IOU_PBUF_RING_INC. 
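The incremental-commit helper consolidated above is easiest to follow in isolation. Below is a minimal userspace model of the same loop, a sketch only: the mapped ring is stood in by a plain array plus mask, and the names are illustrative rather than kernel ones.

#include <stdbool.h>

/* Toy model of io_kbuf_inc_commit(): consume 'len' bytes from the ring,
 * retiring head entries only when they are fully used. Returns true when
 * the last buffer touched was consumed completely. */
struct toy_buf {
	unsigned long long addr;
	unsigned int len;
};

static bool toy_inc_commit(struct toy_buf *bufs, unsigned int *head,
			   unsigned int mask, int len)
{
	while (len) {
		struct toy_buf *buf = &bufs[*head & mask];
		unsigned int this_len = (unsigned int)len < buf->len ?
					(unsigned int)len : buf->len;

		buf->len -= this_len;
		if (buf->len) {
			/* partial use: shrink in place, keep the same head */
			buf->addr += this_len;
			return false;
		}
		(*head)++;
		len -= this_len;
	}
	return true;
}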
- */ - if (bl->flags & IOBL_INC) { - nr_avail = 1; - } else { - size_t needed; - - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; - } + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; } /* @@ -342,6 +350,35 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) +{ + struct io_buffer_list *bl = req->buf_list; + bool ret = true; + + if (bl) { + ret = io_kbuf_commit(req, bl, len, nr); + req->buf_index = bl->bgid; + } + req->flags &= ~REQ_F_BUFFER_RING; + return ret; +} + +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) +{ + unsigned int ret; + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { + io_kbuf_drop_legacy(req); + return ret; + } + + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + return ret; +} + static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { @@ -367,7 +404,9 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *nxt; nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_move(&nxt->list, &ctx->io_buffers_cache); + list_del(&nxt->list); + kfree(nxt); + if (++i == nbufs) return i; cond_resched(); @@ -385,8 +424,6 @@ static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; - struct list_head *item, *tmp; - struct io_buffer *buf; while (1) { unsigned long index = 0; @@ -400,19 +437,6 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) break; io_put_bl(ctx, bl); } - - /* - * Move deferred locked entries to cache before pruning - */ - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { - buf = list_entry(item, struct io_buffer, list); - kmem_cache_free(io_buf_cachep, buf); - } } static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) @@ -501,53 +525,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -#define IO_BUFFER_ALLOC_BATCH 64 - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; - int allocated; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * batch of buffer entries and add those to our freelist. 
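With the IOBL_INC special case dropped above, every buffer ring sizes the peek the same way: a ceiling division of the requested length by the per-buffer length, clamped by PEEK_MAX_IMPORT. A self-contained sketch of that arithmetic, with illustrative names:

#include <stddef.h>

/* How many ring buffers of 'buf_len' bytes are needed to cover 'max_len'
 * bytes, capped at 'cap' (mirrors the min_not_zero() clamp above). */
static size_t toy_bufs_needed(size_t max_len, unsigned int buf_len, size_t cap)
{
	size_t needed = (max_len + buf_len - 1) / buf_len;

	if (!needed || needed > cap)
		return cap;
	return needed;
}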
- */ - - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, - ARRAY_SIZE(bufs), (void **) bufs); - if (unlikely(!allocated)) { - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); - if (!bufs[0]) - return -ENOMEM; - allocated = 1; - } - - while (allocated) - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); - - return 0; -} - static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, struct io_buffer_list *bl) { @@ -556,12 +533,11 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); + + list_add_tail(&buf->list, &bl->buf_list); buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index bd80c44c5af1..2ec0b983ce24 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -74,9 +74,12 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); - bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_drop_legacy(struct io_kiocb *req); + +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr); struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, unsigned int bgid); @@ -116,100 +119,19 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -/* Mapped buffer ring, return io_uring_buf from head */ -#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] - -static inline bool io_kbuf_commit(struct io_kiocb *req, - struct io_buffer_list *bl, int len, int nr) -{ - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) - return true; - - req->flags &= ~REQ_F_BUFFERS_COMMIT; - - if (unlikely(len < 0)) - return true; - - if (bl->flags & IOBL_INC) { - struct io_uring_buf *buf; - - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); - if (WARN_ON_ONCE(len > buf->len)) - len = buf->len; - buf->len -= len; - if (buf->len) { - buf->addr += len; - return false; - } - } - - bl->head += nr; - return true; -} - -static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) -{ - struct io_buffer_list *bl = req->buf_list; - bool ret = true; - - if (bl) { - ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } - req->flags &= ~REQ_F_BUFFER_RING; - return ret; -} - -static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, - struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, len, 1); - } else { - req->buf_index = req->kbuf->bgid; - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } -} - -static inline void io_kbuf_drop(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; 
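For context, the legacy provided-buffer interface whose kernel-side caching is removed above (each struct io_buffer is now a plain kmalloc/kfree) is fed from userspace roughly as follows. This is a hedged liburing sketch, not part of this diff:

#include <liburing.h>

/* Hand 'nr' equally sized buffers in group 'bgid' to the kernel; a later
 * request with IOSQE_BUFFER_SELECT and the same group may consume them. */
static int toy_provide_buffers(struct io_uring *ring, void *base,
			       unsigned int buf_len, unsigned int nr, int bgid)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	io_uring_prep_provide_buffers(sqe, base, buf_len, nr, bgid, 0);
	return io_uring_submit(ring);
}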
- - /* len == 0 is fine here, non-ring will always drop all of it */ - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); -} - -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) -{ - unsigned int ret; - - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) - return 0; - - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) { - if (!__io_put_kbuf_ring(req, len, nbufs)) - ret |= IORING_CQE_F_BUF_MORE; - } else { - __io_put_kbuf(req, len, issue_flags); - } - return ret; -} - static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - return __io_put_kbufs(req, len, 1, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, 1); } static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, int nbufs, unsigned issue_flags) { - return __io_put_kbufs(req, len, nbufs, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, nbufs); } #endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7e6f68e911f1..0bbcbbcdebfd 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -71,7 +71,7 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/net.c b/io_uring/net.c index 50e8a3ccc9de..3fc39af5159e 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -75,7 +75,7 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - u16 buf_index; + bool retry; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; @@ -187,17 +187,15 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; + sr->retry = false; sr->len = 0; /* get from the provided buffer */ req->buf_index = sr->buf_group; } -#ifdef CONFIG_COMPAT -static int io_compat_msg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg, - struct compat_msghdr *msg, int ddir) +static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, + const struct iovec __user *uiov, unsigned uvec_seg, + int ddir) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct compat_iovec __user *uiov; struct iovec *iov; int ret, nr_segs; @@ -205,103 +203,108 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, nr_segs = iomsg->free_iov_nr; iov = iomsg->free_iov; } else { - iov = &iomsg->fast_iov; nr_segs = 1; + iov = &iomsg->fast_iov; } + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); + if (unlikely(ret < 0)) + return ret; + io_net_vec_assign(req, iomsg, iov); + return 0; +} + +static int io_compat_msg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + struct compat_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct compat_iovec __user *uiov; + int ret; + if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + uiov = compat_ptr(msg->msg_iov); if 
(req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; + sr->len = 0; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + struct compat_iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; + sr->len = tmp_iov.iov_len; } return 0; } - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - nr_segs, &iov, &iomsg->msg.msg_iter, true); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); - return 0; + return io_net_import_vec(req, iomsg, (struct iovec __user *)uiov, + msg->msg_iovlen, ddir); } -#endif -static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, - struct user_msghdr *msg, int ddir) +static int io_copy_msghdr_from_user(struct user_msghdr *msg, + struct user_msghdr __user *umsg) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr __user *umsg = sr->umsg; - struct iovec *iov; - int ret, nr_segs; - - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } - if (!user_access_begin(umsg, sizeof(*umsg))) return -EFAULT; - - ret = -EFAULT; unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); + user_access_end(); + return 0; +ua_end: + user_access_end(); + return -EFAULT; +} + +static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, + struct user_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct user_msghdr __user *umsg = sr->umsg; + int ret; + + ret = io_copy_msghdr_from_user(msg, umsg); + if (unlikely(ret)) + return ret; + msg->msg_flags = 0; + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; + sr->len = 0; } else if (msg->msg_iovlen > 1) { - ret = -EINVAL; - goto ua_end; + return -EINVAL; } else { struct iovec __user *uiov = msg->msg_iov; + struct iovec tmp_iov; - /* we only need the length for provided buffers */ - if (!access_ok(&uiov->iov_len, sizeof(uiov->iov_len))) - goto ua_end; - unsafe_get_user(iov->iov_len, &uiov->iov_len, ua_end); - sr->len = iov->iov_len; + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) + return -EFAULT; + sr->len = tmp_iov.iov_len; } - ret = 0; -ua_end: - user_access_end(); - return ret; + return 0; } - user_access_end(); - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, - &iov, &iomsg->msg.msg_iter, false); - if (unlikely(ret < 0)) - return ret; - - io_net_vec_assign(req, iomsg, iov); - return 0; + return io_net_import_vec(req, iomsg, msg->msg_iov, msg->msg_iovlen, ddir); } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -314,26 +317,16 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT - if 
(unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); - if (unlikely(ret)) - return ret; - - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE, + NULL); sr->msg_control = iomsg->msg.msg_control_user; return ret; } -#endif - - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); - if (unlikely(ret)) - return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE, NULL); /* save msg_control as sys_sendmsg() overwrites it */ sr->msg_control = iomsg->msg.msg_control_user; return ret; @@ -387,14 +380,10 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; - int ret; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ret = io_sendmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_sendmsg_copy_hdr(req, kmsg); } #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) @@ -404,6 +393,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; + sr->retry = false; if (req->opcode != IORING_OP_SEND) { if (sqe->addr2 || sqe->file_index) @@ -427,10 +417,9 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->buf_list = NULL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG) @@ -714,31 +703,20 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); - if (unlikely(ret)) - return ret; - - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, - cmsg.msg_controllen); + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST, + &iomsg->uaddr); + memset(&msg, 0, sizeof(msg)); + msg.msg_namelen = cmsg.msg_namelen; + msg.msg_controllen = cmsg.msg_controllen; + } else { + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); } -#endif - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); if (unlikely(ret)) return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } @@ -772,10 +750,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) return 0; } - ret = io_recvmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_recvmsg_copy_hdr(req, kmsg); } #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ @@ -786,6 +761,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; + sr->retry = false; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -826,14 +802,16 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe 
*sqe) return -EINVAL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + sr->nr_multishot_loops = 0; return io_recvmsg_prep_setup(req); } +/* bits to clear in old and inherit in new cflags on bundle retry */ +#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) + /* * Finishes io_recv and io_recvmsg. * @@ -853,9 +831,19 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (sr->flags & IORING_RECVSEND_BUNDLE) { cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); + if (sr->retry) + cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; + /* if more is available, retry and append to this one */ + if (!sr->retry && kmsg->msg.msg_inq > 0 && *ret > 0) { + req->cqe.flags = cflags & ~CQE_F_MASK; + sr->len = kmsg->msg.msg_inq; + sr->done_io += *ret; + sr->retry = true; + return false; + } } else { cflags |= io_put_kbuf(req, *ret, issue_flags); } @@ -1234,6 +1222,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *notif; zc->done_io = 0; + zc->retry = false; req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) @@ -1272,14 +1261,13 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; - zc->buf_index = READ_ONCE(sqe->buf_index); + req->buf_index = READ_ONCE(sqe->buf_index); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) zc->msg_flags |= MSG_CMSG_COMPAT; -#endif + if (unlikely(!io_msg_alloc_async(req))) return -ENOMEM; if (req->opcode != IORING_OP_SENDMSG_ZC) @@ -1344,24 +1332,10 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) int ret; if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index); - if (node) { - io_req_assign_buf_node(sr->notif, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (unlikely(ret)) - return ret; - - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, - node->buf, (u64)(uintptr_t)sr->buf, - sr->len); + sr->notif->buf_index = req->buf_index; + ret = io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); if (unlikely(ret)) return ret; kmsg->msg.sg_from_iter = io_sg_from_iter; @@ -1600,7 +1574,6 @@ retry: } if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; @@ -1613,14 +1586,8 @@ retry: if (!arg.is_empty) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, cflags); - return IOU_OK; - } - - if (ret < 0) - return ret; - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; if (issue_flags & IO_URING_F_MULTISHOT) @@ -1629,6 +1596,10 @@ retry: } io_req_set_res(req, ret, cflags); + if (ret < 0) + req_set_fail(req); + if (!(issue_flags & IO_URING_F_MULTISHOT)) 
+ return IOU_OK; return IOU_STOP_MULTISHOT; } diff --git a/io_uring/nop.c b/io_uring/nop.c index 5e5196df650a..28f06285fdc2 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -16,7 +16,6 @@ struct io_nop { struct file *file; int result; int fd; - int buffer; unsigned int flags; }; @@ -40,9 +39,7 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) else nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) - nop->buffer = READ_ONCE(sqe->buf_index); - else - nop->buffer = -1; + req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -64,17 +61,8 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags) } } if (nop->flags & IORING_NOP_FIXED_BUFFER) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - - ret = -EFAULT; - io_ring_submit_lock(ctx, issue_flags); - node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer); - if (node) { - io_req_assign_buf_node(req, node); - ret = 0; - } - io_ring_submit_unlock(ctx, issue_flags); + if (!io_find_buf_node(req, issue_flags)) + ret = -EFAULT; } done: if (ret < 0) diff --git a/io_uring/notif.c b/io_uring/notif.c index ee3a33510b3c..7bd92538dccb 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,7 +11,7 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) { struct io_notif_data *nd = io_notif_to_data(notif); @@ -29,7 +29,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) } nd = nd->next; - io_req_task_complete(notif, ts); + io_req_task_complete(notif, tw); } while (nd); } diff --git a/io_uring/opdef.c b/io_uring/opdef.c index e8baef4e5146..306fd9c48b44 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -104,7 +104,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, - .issue = io_read, + .issue = io_read_fixed, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -118,7 +118,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, - .issue = io_write, + .issue = io_write_fixed, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 14456436ff74..719a52104abe 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -7,6 +7,12 @@ struct io_issue_def { unsigned needs_file : 1; /* should block plug */ unsigned plug : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -15,14 +21,8 @@ struct io_issue_def { unsigned pollin : 1; unsigned pollout : 1; unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; /* skip auditing */ unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ diff --git a/io_uring/poll.c b/io_uring/poll.c index bb1c0cd4f809..176854882ba6 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -220,7 +220,7 @@ static inline void io_poll_execute(struct io_kiocb *req, int res) * req->cqe.res. 
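The accept completion path reworked just above keeps multishot mode alive by posting CQEs flagged IORING_CQE_F_MORE until the request terminates. From userspace it is armed once and then drained from the CQ ring; a hedged liburing sketch, not part of this series:

#include <liburing.h>

static int toy_arm_multishot_accept(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	/* one SQE, many CQEs: each completion carries an accepted fd */
	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	sqe->user_data = 1;
	return io_uring_submit(ring);
}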
IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ -static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) +static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; @@ -288,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, ts); + int ret = io_poll_issue(req, tw); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) @@ -311,11 +311,11 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; - ret = io_poll_check_events(req, ts); + ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { io_kbuf_recycle(req, 0); return; @@ -335,7 +335,7 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -343,14 +343,14 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } else { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 04ede93113dc..27e2db2ed4ae 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> + #define IO_POLL_ALLOC_CACHE_MAX 32 enum { @@ -43,4 +45,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index af39b69eb4fd..5fff6ba2b7c0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,6 +9,7 @@ #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> @@ -32,6 +33,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +#define IO_CACHED_BVECS_SEGS 32 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -101,36 +104,79 @@ static int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static void io_release_ubuf(void *priv) { + struct io_mapped_ubuf *imu = priv; unsigned int i; - if (node->buf) { - struct io_mapped_ubuf *imu = node->buf; + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); +} + +static struct io_mapped_ubuf 
*io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) +{ + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), + GFP_KERNEL); +} - if (!refcount_dec_and_test(&imu->refs)) - return; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) + io_cache_free(&ctx->imu_cache, imu); + else kvfree(imu); - } } -struct io_rsrc_node *io_rsrc_node_alloc(int type) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (!refcount_dec_and_test(&imu->refs)) + return; + + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); + io_free_imu(ctx, imu); +} + +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { struct io_rsrc_node *node; - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); if (node) { node->type = type; node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } return node; } -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) +{ + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; +} + +void io_rsrc_cache_free(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kfree); +} + +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { if (!data->nr) return; @@ -203,7 +249,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); @@ -449,19 +495,17 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) switch (node->type) { case IORING_RSRC_FILE: - if (io_slot_file(node)) - fput(io_slot_file(node)); + fput(io_slot_file(node)); break; case IORING_RSRC_BUFFER: - if (node->buf) - io_buffer_unmap(ctx, node); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); break; } - kfree(node); + io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -523,7 +567,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; } ret = -ENOMEM; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { fput(file); goto fail; @@ -728,10 +772,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (!iov->iov_base) return NULL; - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); - node->buf = NULL; ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -748,10 +791,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } - imu = kvmalloc(struct_size(imu, bvec, 
nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); @@ -762,8 +806,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -781,9 +828,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } done: if (ret) { - kvfree(imu); - if (node) - io_put_rsrc_node(ctx, node); + if (imu) + io_free_imu(ctx, imu); + io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } kvfree(pages); @@ -860,7 +907,102 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -int io_import_fixed(int ddir, struct iov_iter *iter, +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = io_alloc_imu(ctx, nr_bvecs); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + imu->dir = 1 << rq_data_dir(rq); + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node) { + ret = -EINVAL; + goto unlock; + } + if (!node->buf->is_kbuf) { + ret = -EBUSY; + goto unlock; + } + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + +static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) { @@ -874,20 +1016,22 @@ int io_import_fixed(int ddir, struct iov_iter *iter, /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + 
imu->len))) return -EFAULT; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; /* * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the @@ -901,8 +1045,16 @@ int io_import_fixed(int ddir, struct iov_iter *iter, */ const struct bio_vec *bvec = imu->bvec; + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ if (offset < bvec->bv_len) { + iter->count -= offset; iter->iov_offset = offset; + } else if (imu->is_kbuf) { + iov_iter_advance(iter, offset); } else { unsigned long seg_skip; @@ -912,6 +1064,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec += seg_skip; iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } @@ -919,6 +1072,35 @@ int io_import_fixed(int ddir, struct iov_iter *iter, return 0; } +inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + if (req->flags & REQ_F_BUF_NODE) + return req->buf_node; + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (node) + io_req_assign_buf_node(req, node); + io_ring_submit_unlock(ctx, issue_flags); + return node; +} + +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); +} + /* Lock two rings at once. The rings must be different! 
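The offset handling above relies on a property only user-registered buffers have (every middle bvec spans exactly one folio), which is why kernel bvec buffers take the slower iov_iter_advance() path instead. A small standalone model of the fast path's arithmetic, with illustrative names:

/* Split 'offset' into whole-segment skips plus an in-segment offset. Only
 * valid when every middle segment is exactly (1 << folio_shift) bytes,
 * the assumption the fast path above documents. */
static void toy_fast_forward(unsigned long offset, unsigned int folio_shift,
			     unsigned long *seg_skip, unsigned long *in_seg)
{
	*seg_skip = offset >> folio_shift;
	*in_seg = offset & ((1UL << folio_shift) - 1);
}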
*/ static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) { @@ -1002,7 +1184,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (!src_node) { dst_node = NULL; } else { - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; goto out_free; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 89ea0135a1a0..f10a1252b3e9 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,6 +2,7 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H +#include <linux/io_uring_types.h> #include <linux/lockdep.h> enum { @@ -20,6 +21,11 @@ struct io_rsrc_node { }; }; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -27,6 +33,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + bool is_kbuf; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; @@ -39,14 +49,18 @@ struct io_imu_folio_data { unsigned int nr_folios; }; -struct io_rsrc_node *io_rsrc_node_alloc(int type); +bool io_rsrc_cache_init(struct io_ring_ctx *ctx); +void io_rsrc_cache_free(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len); +struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags); +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); @@ -77,7 +91,7 @@ static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); - if (node && !--node->refs) + if (!--node->refs) io_free_rsrc_node(ctx, node); } diff --git a/io_uring/rw.c b/io_uring/rw.c index e5528cebcd06..7c2f5f70a2c5 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -49,24 +49,16 @@ static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) return false; } -#ifdef CONFIG_COMPAT static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { - struct compat_iovec __user *uiov; - compat_ssize_t clen; + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); + struct compat_iovec iov; - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + if (copy_from_user(&iov, uiov, sizeof(iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - - rw->len = clen; + rw->len = iov.iov_len; return 0; } -#endif static int io_iov_buffer_select_prep(struct io_kiocb *req) { @@ -77,10 +69,8 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) if (rw->len != 1) return -EINVAL; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_iov_compat_buffer_select_prep(rw); -#endif uiov = u64_to_user_ptr(rw->addr); if (copy_from_user(&iov, uiov, sizeof(*uiov))) @@ -89,41 +79,24 @@ static int io_iov_buffer_select_prep(struct 
io_kiocb *req) return 0; } -static int __io_import_iovec(int ddir, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int io_import_vec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + const struct iovec __user *uvec, + size_t uvec_segs) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + int ret, nr_segs; struct iovec *iov; - void __user *buf; - int nr_segs, ret; - size_t sqe_len; - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - return import_ubuf(ddir, buf, sqe_len, &io->iter); - } if (io->free_iovec) { nr_segs = io->free_iov_nr; iov = io->free_iovec; } else { - iov = &io->fast_iov; nr_segs = 1; + iov = &io->fast_iov; } - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, - req->ctx->compat); + + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, + io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { @@ -135,13 +108,35 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, return 0; } -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *buf = u64_to_user_ptr(rw->addr); + size_t sqe_len = rw->len; + + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) + return io_import_vec(ddir, req, io, buf, sqe_len); + + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + return import_ubuf(ddir, buf, sqe_len, &io->iter); +} + +static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) { int ret; - ret = __io_import_iovec(rw, req, io, issue_flags); + ret = __io_import_rw_buffer(rw, req, io, issue_flags); if (unlikely(ret < 0)) return ret; @@ -212,20 +207,6 @@ static int io_rw_alloc_async(struct io_kiocb *req) return 0; } -static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) -{ - struct io_async_rw *rw; - - if (io_rw_alloc_async(req)) - return -ENOMEM; - - if (!do_import || io_do_buffer_select(req)) - return 0; - - rw = req->async_data; - return io_import_iovec(ddir, req, rw, 0); -} - static inline void io_meta_save_state(struct io_async_rw *io) { io->meta_state.seed = io->meta.seed; @@ -267,14 +248,17 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, return ret; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int ddir, bool do_import) +static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; u64 attr_type_mask; int ret; + if (io_rw_alloc_async(req)) + return -ENOMEM; + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); @@ -300,10 +284,6 @@ static int io_prep_rw(struct io_kiocb *req, 
const struct io_uring_sqe *sqe, rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - ret = io_prep_rw_setup(req, ddir, do_import); - - if (unlikely(ret)) - return ret; attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { @@ -314,31 +294,50 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, return -EINVAL; attr_ptr = READ_ONCE(sqe->attr_ptr); - ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); } - return ret; + return 0; +} + +static int io_rw_do_import(struct io_kiocb *req, int ddir) +{ + if (io_do_buffer_select(req)) + return 0; + + return io_import_rw_buffer(ddir, req, req->async_data, 0); +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ddir); + if (unlikely(ret)) + return ret; + + return io_rw_do_import(req, ddir); } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_DEST, true); + return io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_SOURCE, true); + return io_prep_rw(req, sqe, ITER_SOURCE); } static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe, ddir, do_import); + ret = io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; - if (do_import) + if (!(req->flags & REQ_F_BUFFER_SELECT)) return 0; /* @@ -358,38 +357,30 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rwv(req, sqe, ITER_SOURCE); } -static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - struct io_async_rw *io; + struct io_async_rw *io = req->async_data; int ret; - ret = io_prep_rw(req, sqe, ddir, false); - if (unlikely(ret)) - return ret; - - node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (!node) - return -EFAULT; - io_req_assign_buf_node(req, node); + if (io->bytes_done) + return 0; - io = req->async_data; - ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len); + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, + issue_flags); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_DEST); + return __io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); + return __io_prep_rw(req, sqe, ITER_SOURCE); } /* @@ -405,7 +396,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe, ITER_DEST, false); + ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; @@ -519,7 +510,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct 
io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -536,7 +527,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -637,6 +628,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -652,6 +644,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); @@ -863,7 +857,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) loff_t *ppos; if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); + ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; } @@ -1154,6 +1148,28 @@ ret_eagain: } } +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); + if (unlikely(ret)) + return ret; + + return io_read(req, issue_flags); +} + +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + return io_write(req, issue_flags); +} + void io_rw_fail(struct io_kiocb *req) { int res; diff --git a/io_uring/rw.h b/io_uring/rw.h index eaa59bd64870..bf121b81ebe8 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> #include <linux/pagemap.h> struct io_meta_state { @@ -37,9 +38,11 @@ int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/splice.c b/io_uring/splice.c index 5b84f1630611..7b89bd84d486 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -51,7 +51,8 @@ void io_splice_cleanup(struct io_kiocb *req) { struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); - io_put_rsrc_node(req->ctx, sp->rsrc_node); + if (sp->rsrc_node) + io_put_rsrc_node(req->ctx, sp->rsrc_node); } static struct file *io_splice_get_file(struct io_kiocb *req, diff --git a/io_uring/timeout.c b/io_uring/timeout.c index c5fb817b1e28..2a107665230b 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -65,7 +65,7 @@ static inline bool io_timeout_finish(struct io_timeout 
*timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; @@ -82,7 +82,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) } } - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -154,9 +154,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) +static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) { - io_tw_lock(link->ctx, ts); + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -165,7 +165,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, ts); + io_req_task_complete(link, tw); link = nxt; } } @@ -312,7 +312,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; @@ -330,11 +330,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e6701b7aa147..de39b602aa82 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -102,7 +102,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) +static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; @@ -199,21 +199,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *node; - u16 index = READ_ONCE(sqe->buf_index); - - node = io_rsrc_node_lookup(&ctx->buf_table, index); - if (unlikely(!node)) - return -EFAULT; - /* - * Pi node upfront, prior to io_uring_cmd_import_fixed() - * being called. This prevents destruction of the mapped buffer - * we'll need at actual import time. 
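With the prep-time node pinning removed here, a uring_cmd's fixed buffer is only resolved when the driver imports it, and the reworked io_uring_cmd_import_fixed() a little further down takes issue_flags for that lookup. A hedged sketch of a driver-side caller; the wrapper function itself is hypothetical:

static int toy_cmd_import(struct io_uring_cmd *ioucmd, u64 ubuf, u32 len,
			  int ddir, unsigned int issue_flags)
{
	struct iov_iter iter;
	int ret;

	/* resolves req->buf_index to a registered buffer at issue time */
	ret = io_uring_cmd_import_fixed(ubuf, len, ddir, &iter, ioucmd,
					issue_flags);
	if (ret)
		return ret;
	/* ...hand &iter to the driver's actual submission path... */
	return 0;
}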
- */ - io_req_assign_buf_node(req, node); - } + if (ioucmd->flags & IORING_URING_CMD_FIXED) + req->buf_index = READ_ONCE(sqe->buf_index); + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return io_uring_cmd_prep_setup(req, sqe); @@ -237,7 +225,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) issue_flags |= IO_URING_F_SQE128; if (ctx->flags & IORING_SETUP_CQE32) issue_flags |= IO_URING_F_CQE32; - if (ctx->compat) + if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) @@ -257,16 +245,13 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - struct io_rsrc_node *node = req->buf_node; - - /* Must have had rsrc_node assigned at prep time */ - if (node) - return io_import_fixed(rw, iter, node->buf, ubuf, len); - return -EFAULT; + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 15a7daf3ff4f..54e69984cd8a 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -42,7 +42,6 @@ static void io_waitid_free(struct io_kiocb *req) req->flags &= ~REQ_F_ASYNC_DATA; } -#ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; @@ -67,7 +66,6 @@ Efault: ret = false; goto done; } -#endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { @@ -77,10 +75,8 @@ static bool io_waitid_copy_si(struct io_kiocb *req, int signo) if (!iw->infop) return true; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); -#endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return false; @@ -132,7 +128,7 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) io_req_set_res(req, ret, 0); } -static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; @@ -158,49 +154,13 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_waitid_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } bool io_waitid_remove_all(struct io_ring_ctx *ctx, 
struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, tctx, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_waitid_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) @@ -221,13 +181,13 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); @@ -257,7 +217,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 2ebaf5e6942e..2694344274bf 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -114,6 +114,7 @@ endif TARGETS += tmpfs TARGETS += tpm2 TARGETS += tty +TARGETS += ublk TARGETS += uevent TARGETS += user_events TARGETS += vDSO diff --git a/tools/testing/selftests/ublk/.gitignore b/tools/testing/selftests/ublk/.gitignore new file mode 100644 index 000000000000..8b2871ea7751 --- /dev/null +++ b/tools/testing/selftests/ublk/.gitignore @@ -0,0 +1,3 @@ +kublk +/tools +*-verify.state diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile new file mode 100644 index 000000000000..7817afe29005 --- /dev/null +++ b/tools/testing/selftests/ublk/Makefile @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0 + +CFLAGS += -O3 -Wl,-no-as-needed -Wall -I $(top_srcdir) +LDLIBS += -lpthread -lm -luring + +TEST_PROGS := test_generic_01.sh + +TEST_PROGS += test_null_01.sh +TEST_PROGS += test_null_02.sh +TEST_PROGS += test_loop_01.sh +TEST_PROGS += test_loop_02.sh +TEST_PROGS += test_loop_03.sh +TEST_PROGS += test_loop_04.sh +TEST_PROGS += test_stripe_01.sh +TEST_PROGS += test_stripe_02.sh + +TEST_PROGS += test_stress_01.sh +TEST_PROGS += test_stress_02.sh + +TEST_GEN_PROGS_EXTENDED = kublk + +include ../lib.mk + +$(TEST_GEN_PROGS_EXTENDED): kublk.c null.c file_backed.c common.c stripe.c + +check: + shellcheck -x -f gcc *.sh diff --git a/tools/testing/selftests/ublk/common.c b/tools/testing/selftests/ublk/common.c new file mode 100644 index 000000000000..01580a6f8519 --- /dev/null +++ b/tools/testing/selftests/ublk/common.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +void backing_file_tgt_deinit(struct ublk_dev *dev) +{ + int i; + + for (i = 1; i < dev->nr_fds; i++) { + fsync(dev->fds[i]); + close(dev->fds[i]); + } +} + +int backing_file_tgt_init(struct ublk_dev *dev) +{ + int fd, i; + + assert(dev->nr_fds == 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + char *file = dev->tgt.backing_file[i]; + unsigned long bytes; + struct stat st; + + ublk_dbg(UBLK_DBG_DEV, "%s: file %d: %s\n", __func__, i, file); + + fd = open(file, O_RDWR | O_DIRECT); + if (fd < 0) { + ublk_err("%s: backing file %s can't be opened: 
%s\n", + __func__, file, strerror(errno)); + return -EBADF; + } + + if (fstat(fd, &st) < 0) { + close(fd); + return -EBADF; + } + + if (S_ISREG(st.st_mode)) + bytes = st.st_size; + else if (S_ISBLK(st.st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) + return -1; + } else { + return -EINVAL; + } + + dev->tgt.backing_file_size[i] = bytes; + dev->fds[dev->nr_fds] = fd; + dev->nr_fds += 1; + } + + return 0; +} diff --git a/tools/testing/selftests/ublk/config b/tools/testing/selftests/ublk/config new file mode 100644 index 000000000000..592b0ba4d661 --- /dev/null +++ b/tools/testing/selftests/ublk/config @@ -0,0 +1 @@ +CONFIG_BLK_DEV_UBLK=m diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c new file mode 100644 index 000000000000..6f34eabfae97 --- /dev/null +++ b/tools/testing/selftests/ublk/file_backed.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +static enum io_uring_op ublk_to_uring_op(const struct ublksrv_io_desc *iod, int zc) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return zc ? IORING_OP_READ_FIXED : IORING_OP_READ; + else if (ublk_op == UBLK_IO_OP_WRITE) + return zc ? IORING_OP_WRITE_FIXED : IORING_OP_WRITE; + assert(0); +} + +static int loop_queue_flush_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[1]; + + ublk_queue_alloc_sqes(q, sqe, 1); + io_uring_prep_fsync(sqe[0], 1 /*fds[1]*/, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; +} + +static int loop_queue_tgt_rw_io(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + int zc = ublk_queue_use_zc(q); + enum io_uring_op op = ublk_to_uring_op(iod, zc); + struct io_uring_sqe *sqe[3]; + + if (!zc) { + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) + return -ENOMEM; + + io_uring_prep_rw(op, sqe[0], 1 /*fds[1]*/, + (void *)iod->addr, + iod->nr_sectors << 9, + iod->start_sector << 9); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, 1); + return 1; + } + + ublk_queue_alloc_sqes(q, sqe, 3); + + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + + io_uring_prep_rw(op, sqe[1], 1 /*fds[1]*/, 0, + iod->nr_sectors << 9, + iod->start_sector << 9); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); + + return 2; +} + +static int loop_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + int ret; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + ret = loop_queue_flush_io(q, iod, tag); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + ret = -ENOTSUP; + break; + case UBLK_IO_OP_READ: + case UBLK_IO_OP_WRITE: + ret = loop_queue_tgt_rw_io(q, iod, tag); + break; + default: + ret = -EINVAL; + break; + } + + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x 
%llx %u\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9); + return ret; +} + +static int ublk_loop_queue_io(struct ublk_queue *q, int tag) +{ + int queued = loop_queue_tgt_io(q, tag); + + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +static void ublk_loop_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } + + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); +} + +static int ublk_loop_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + unsigned long long bytes; + int ret; + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DMA_ALIGN, + .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + .dma = { + .alignment = 511, + }, + }; + + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + if (dev->tgt.nr_backing_files != 1) + return -EINVAL; + + bytes = dev->tgt.backing_file_size[0]; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + + return 0; +} + +const struct ublk_tgt_ops loop_tgt_ops = { + .name = "loop", + .init_tgt = ublk_loop_tgt_init, + .deinit_tgt = backing_file_tgt_deinit, + .queue_io = ublk_loop_queue_io, + .tgt_io_done = ublk_loop_io_done, +}; diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c new file mode 100644 index 000000000000..05147b53c361 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.c @@ -0,0 +1,1138 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Description: uring_cmd based ublk + */ + +#include "kublk.h" + +unsigned int ublk_dbg_mask = UBLK_LOG; +static const struct ublk_tgt_ops *tgt_ops_list[] = { + &null_tgt_ops, + &loop_tgt_ops, + &stripe_tgt_ops, +}; + +static const struct ublk_tgt_ops *ublk_find_tgt(const char *name) +{ + const struct ublk_tgt_ops *ops; + int i; + + if (name == NULL) + return NULL; + + for (i = 0; sizeof(tgt_ops_list) / sizeof(ops); i++) + if (strcmp(tgt_ops_list[i]->name, name) == 0) + return tgt_ops_list[i]; + return NULL; +} + +static inline int ublk_setup_ring(struct io_uring *r, int depth, + int cq_depth, unsigned flags) +{ + struct io_uring_params p; + + memset(&p, 0, sizeof(p)); + p.flags = flags | IORING_SETUP_CQSIZE; + p.cq_entries = cq_depth; + + return io_uring_queue_init_params(depth, r, &p); +} + +static void ublk_ctrl_init_cmd(struct ublk_dev *dev, + struct io_uring_sqe *sqe, + struct ublk_ctrl_cmd_data *data) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublksrv_ctrl_cmd *cmd = (struct ublksrv_ctrl_cmd *)ublk_get_sqe_cmd(sqe); + + sqe->fd = dev->ctrl_fd; + sqe->opcode = IORING_OP_URING_CMD; + sqe->ioprio = 0; + + if (data->flags & CTRL_CMD_HAS_BUF) { + cmd->addr = data->addr; + cmd->len = data->len; + } + + if (data->flags & CTRL_CMD_HAS_DATA) + cmd->data[0] = data->data[0]; + + cmd->dev_id = info->dev_id; + cmd->queue_id = -1; + + ublk_set_sqe_cmd_op(sqe, data->cmd_op); + + io_uring_sqe_set_data(sqe, cmd); +} 
+ +static int __ublk_ctrl_cmd(struct ublk_dev *dev, + struct ublk_ctrl_cmd_data *data) +{ + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + int ret = -EINVAL; + + sqe = io_uring_get_sqe(&dev->ring); + if (!sqe) { + ublk_err("%s: can't get sqe ret %d\n", __func__, ret); + return ret; + } + + ublk_ctrl_init_cmd(dev, sqe, data); + + ret = io_uring_submit(&dev->ring); + if (ret < 0) { + ublk_err("uring submit ret %d\n", ret); + return ret; + } + + ret = io_uring_wait_cqe(&dev->ring, &cqe); + if (ret < 0) { + ublk_err("wait cqe: %s\n", strerror(-ret)); + return ret; + } + io_uring_cqe_seen(&dev->ring, cqe); + + return cqe->res; +} + +static int ublk_ctrl_stop_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_STOP_DEV, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_start_dev(struct ublk_dev *dev, + int daemon_pid) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_START_DEV, + .flags = CTRL_CMD_HAS_DATA, + }; + + dev->dev_info.ublksrv_pid = data.data[0] = daemon_pid; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_add_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_ADD_DEV, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_del_dev(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_DEL_DEV, + .flags = 0, + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_info(struct ublk_dev *dev) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_DEV_INFO, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) &dev->dev_info, + .len = sizeof(struct ublksrv_ctrl_dev_info), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_set_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_SET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) params, + .len = sizeof(*params), + }; + params->len = sizeof(*params); + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_params(struct ublk_dev *dev, + struct ublk_params *params) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_CMD_GET_PARAMS, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64)params, + .len = sizeof(*params), + }; + + params->len = sizeof(*params); + + return __ublk_ctrl_cmd(dev, &data); +} + +static int ublk_ctrl_get_features(struct ublk_dev *dev, + __u64 *features) +{ + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_GET_FEATURES, + .flags = CTRL_CMD_HAS_BUF, + .addr = (__u64) (uintptr_t) features, + .len = sizeof(*features), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +static const char *ublk_dev_state_desc(struct ublk_dev *dev) +{ + switch (dev->dev_info.state) { + case UBLK_S_DEV_DEAD: + return "DEAD"; + case UBLK_S_DEV_LIVE: + return "LIVE"; + case UBLK_S_DEV_QUIESCED: + return "QUIESCED"; + default: + return "UNKNOWN"; + }; +} + +static void ublk_ctrl_dump(struct ublk_dev *dev) +{ + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + struct ublk_params p; + int ret; + + ret = ublk_ctrl_get_params(dev, &p); + if (ret < 0) { + ublk_err("failed to get params %m\n"); + return; + } + + ublk_log("dev id %d: nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", + info->dev_id, info->nr_hw_queues, info->queue_depth, + 1 << p.basic.logical_bs_shift, 
p.basic.dev_sectors); + ublk_log("\tmax rq size %d daemon pid %d flags 0x%llx state %s\n", + info->max_io_buf_bytes, info->ublksrv_pid, info->flags, + ublk_dev_state_desc(dev)); + fflush(stdout); +} + +static void ublk_ctrl_deinit(struct ublk_dev *dev) +{ + close(dev->ctrl_fd); + free(dev); +} + +static struct ublk_dev *ublk_ctrl_init(void) +{ + struct ublk_dev *dev = (struct ublk_dev *)calloc(1, sizeof(*dev)); + struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + int ret; + + dev->ctrl_fd = open(CTRL_DEV, O_RDWR); + if (dev->ctrl_fd < 0) { + free(dev); + return NULL; + } + + info->max_io_buf_bytes = UBLK_IO_MAX_BYTES; + + ret = ublk_setup_ring(&dev->ring, UBLK_CTRL_RING_DEPTH, + UBLK_CTRL_RING_DEPTH, IORING_SETUP_SQE128); + if (ret < 0) { + ublk_err("queue_init: %s\n", strerror(-ret)); + free(dev); + return NULL; + } + dev->nr_fds = 1; + + return dev; +} + +static int __ublk_queue_cmd_buf_sz(unsigned depth) +{ + int size = depth * sizeof(struct ublksrv_io_desc); + unsigned int page_sz = getpagesize(); + + return round_up(size, page_sz); +} + +static int ublk_queue_max_cmd_buf_sz(void) +{ + return __ublk_queue_cmd_buf_sz(UBLK_MAX_QUEUE_DEPTH); +} + +static int ublk_queue_cmd_buf_sz(struct ublk_queue *q) +{ + return __ublk_queue_cmd_buf_sz(q->q_depth); +} + +static void ublk_queue_deinit(struct ublk_queue *q) +{ + int i; + int nr_ios = q->q_depth; + + io_uring_unregister_buffers(&q->ring); + + io_uring_unregister_ring_fd(&q->ring); + + if (q->ring.ring_fd > 0) { + io_uring_unregister_files(&q->ring); + close(q->ring.ring_fd); + q->ring.ring_fd = -1; + } + + if (q->io_cmd_buf) + munmap(q->io_cmd_buf, ublk_queue_cmd_buf_sz(q)); + + for (i = 0; i < nr_ios; i++) + free(q->ios[i].buf_addr); +} + +static int ublk_queue_init(struct ublk_queue *q) +{ + struct ublk_dev *dev = q->dev; + int depth = dev->dev_info.queue_depth; + int i, ret = -1; + int cmd_buf_size, io_buf_size; + unsigned long off; + int ring_depth = dev->tgt.sq_depth, cq_depth = dev->tgt.cq_depth; + + q->tgt_ops = dev->tgt.ops; + q->state = 0; + q->q_depth = depth; + q->cmd_inflight = 0; + q->tid = gettid(); + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + q->state |= UBLKSRV_NO_BUF; + q->state |= UBLKSRV_ZC; + } + + cmd_buf_size = ublk_queue_cmd_buf_sz(q); + off = UBLKSRV_CMD_BUF_OFFSET + q->q_id * ublk_queue_max_cmd_buf_sz(); + q->io_cmd_buf = (char *)mmap(0, cmd_buf_size, PROT_READ, + MAP_SHARED | MAP_POPULATE, dev->fds[0], off); + if (q->io_cmd_buf == MAP_FAILED) { + ublk_err("ublk dev %d queue %d map io_cmd_buf failed %m\n", + q->dev->dev_info.dev_id, q->q_id); + goto fail; + } + + io_buf_size = dev->dev_info.max_io_buf_bytes; + for (i = 0; i < q->q_depth; i++) { + q->ios[i].buf_addr = NULL; + q->ios[i].flags = UBLKSRV_NEED_FETCH_RQ | UBLKSRV_IO_FREE; + + if (q->state & UBLKSRV_NO_BUF) + continue; + + if (posix_memalign((void **)&q->ios[i].buf_addr, + getpagesize(), io_buf_size)) { + ublk_err("ublk dev %d queue %d io %d posix_memalign failed %m\n", + dev->dev_info.dev_id, q->q_id, i); + goto fail; + } + } + + ret = ublk_setup_ring(&q->ring, ring_depth, cq_depth, + IORING_SETUP_COOP_TASKRUN); + if (ret < 0) { + ublk_err("ublk dev %d queue %d setup io_uring failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + if (dev->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY) { + ret = io_uring_register_buffers_sparse(&q->ring, q->q_depth); + if (ret) { + ublk_err("ublk dev %d queue %d register spare buffers failed %d", + dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + } + + 
io_uring_register_ring_fd(&q->ring); + + ret = io_uring_register_files(&q->ring, dev->fds, dev->nr_fds); + if (ret) { + ublk_err("ublk dev %d queue %d register files failed %d\n", + q->dev->dev_info.dev_id, q->q_id, ret); + goto fail; + } + + return 0; + fail: + ublk_queue_deinit(q); + ublk_err("ublk dev %d queue %d failed\n", + dev->dev_info.dev_id, q->q_id); + return -ENOMEM; +} + +#define WAIT_USEC 100000 +#define MAX_WAIT_USEC (3 * 1000000) +static int ublk_dev_prep(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + int dev_id = dev->dev_info.dev_id; + unsigned int wait_usec = 0; + int ret = 0, fd = -1; + char buf[64]; + + snprintf(buf, 64, "%s%d", UBLKC_DEV, dev_id); + + while (wait_usec < MAX_WAIT_USEC) { + fd = open(buf, O_RDWR); + if (fd >= 0) + break; + usleep(WAIT_USEC); + wait_usec += WAIT_USEC; + } + if (fd < 0) { + ublk_err("can't open %s %s\n", buf, strerror(errno)); + return -1; + } + + dev->fds[0] = fd; + if (dev->tgt.ops->init_tgt) + ret = dev->tgt.ops->init_tgt(ctx, dev); + if (ret) + close(dev->fds[0]); + return ret; +} + +static void ublk_dev_unprep(struct ublk_dev *dev) +{ + if (dev->tgt.ops->deinit_tgt) + dev->tgt.ops->deinit_tgt(dev); + close(dev->fds[0]); +} + +int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag) +{ + struct ublksrv_io_cmd *cmd; + struct io_uring_sqe *sqe[1]; + unsigned int cmd_op = 0; + __u64 user_data; + + /* only freed io can be issued */ + if (!(io->flags & UBLKSRV_IO_FREE)) + return 0; + + /* we issue because we need either fetching or committing */ + if (!(io->flags & + (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP))) + return 0; + + if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP) + cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ; + else if (io->flags & UBLKSRV_NEED_FETCH_RQ) + cmd_op = UBLK_U_IO_FETCH_REQ; + + if (io_uring_sq_space_left(&q->ring) < 1) + io_uring_submit(&q->ring); + + ublk_queue_alloc_sqes(q, sqe, 1); + if (!sqe[0]) { + ublk_err("%s: run out of sqe %d, tag %d\n", + __func__, q->q_id, tag); + return -1; + } + + cmd = (struct ublksrv_io_cmd *)ublk_get_sqe_cmd(sqe[0]); + + if (cmd_op == UBLK_U_IO_COMMIT_AND_FETCH_REQ) + cmd->result = io->result; + + /* These fields should be written once, never change */ + ublk_set_sqe_cmd_op(sqe[0], cmd_op); + sqe[0]->fd = 0; /* dev->fds[0] */ + sqe[0]->opcode = IORING_OP_URING_CMD; + sqe[0]->flags = IOSQE_FIXED_FILE; + sqe[0]->rw_flags = 0; + cmd->tag = tag; + cmd->q_id = q->q_id; + if (!(q->state & UBLKSRV_NO_BUF)) + cmd->addr = (__u64) (uintptr_t) io->buf_addr; + else + cmd->addr = 0; + + user_data = build_user_data(tag, _IOC_NR(cmd_op), 0, 0); + io_uring_sqe_set_data64(sqe[0], user_data); + + io->flags = 0; + + q->cmd_inflight += 1; + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: (qid %d tag %u cmd_op %u) iof %x stopping %d\n", + __func__, q->q_id, tag, cmd_op, + io->flags, !!(q->state & UBLKSRV_QUEUE_STOPPING)); + return 1; +} + +static void ublk_submit_fetch_commands(struct ublk_queue *q) +{ + int i = 0; + + for (i = 0; i < q->q_depth; i++) + ublk_queue_io_cmd(q, &q->ios[i], i); +} + +static int ublk_queue_is_idle(struct ublk_queue *q) +{ + return !io_uring_sq_ready(&q->ring) && !q->io_inflight; +} + +static int ublk_queue_is_done(struct ublk_queue *q) +{ + return (q->state & UBLKSRV_QUEUE_STOPPING) && ublk_queue_is_idle(q); +} + +static inline void ublksrv_handle_tgt_cqe(struct ublk_queue *q, + struct io_uring_cqe *cqe) +{ + unsigned tag = user_data_to_tag(cqe->user_data); + + if (cqe->res < 0 && cqe->res != -EAGAIN) + ublk_err("%s: failed tgt io: res %d qid %u tag %u, cmd_op 
%u\n", + __func__, cqe->res, q->q_id, + user_data_to_tag(cqe->user_data), + user_data_to_op(cqe->user_data)); + + if (q->tgt_ops->tgt_io_done) + q->tgt_ops->tgt_io_done(q, tag, cqe); +} + +static void ublk_handle_cqe(struct io_uring *r, + struct io_uring_cqe *cqe, void *data) +{ + struct ublk_queue *q = container_of(r, struct ublk_queue, ring); + unsigned tag = user_data_to_tag(cqe->user_data); + unsigned cmd_op = user_data_to_op(cqe->user_data); + int fetch = (cqe->res != UBLK_IO_RES_ABORT) && + !(q->state & UBLKSRV_QUEUE_STOPPING); + struct ublk_io *io; + + if (cqe->res < 0 && cqe->res != -ENODEV) + ublk_err("%s: res %d userdata %llx queue state %x\n", __func__, + cqe->res, cqe->user_data, q->state); + + ublk_dbg(UBLK_DBG_IO_CMD, "%s: res %d (qid %d tag %u cmd_op %u target %d/%d) stopping %d\n", + __func__, cqe->res, q->q_id, tag, cmd_op, + is_target_io(cqe->user_data), + user_data_to_tgt_data(cqe->user_data), + (q->state & UBLKSRV_QUEUE_STOPPING)); + + /* Don't retrieve io in case of target io */ + if (is_target_io(cqe->user_data)) { + ublksrv_handle_tgt_cqe(q, cqe); + return; + } + + io = &q->ios[tag]; + q->cmd_inflight--; + + if (!fetch) { + q->state |= UBLKSRV_QUEUE_STOPPING; + io->flags &= ~UBLKSRV_NEED_FETCH_RQ; + } + + if (cqe->res == UBLK_IO_RES_OK) { + assert(tag < q->q_depth); + if (q->tgt_ops->queue_io) + q->tgt_ops->queue_io(q, tag); + } else { + /* + * COMMIT_REQ will be completed immediately since no fetching + * piggyback is required. + * + * Marking IO_FREE only, then this io won't be issued since + * we only issue io with (UBLKSRV_IO_FREE | UBLKSRV_NEED_*) + * + * */ + io->flags = UBLKSRV_IO_FREE; + } +} + +static int ublk_reap_events_uring(struct io_uring *r) +{ + struct io_uring_cqe *cqe; + unsigned head; + int count = 0; + + io_uring_for_each_cqe(r, head, cqe) { + ublk_handle_cqe(r, cqe, NULL); + count += 1; + } + io_uring_cq_advance(r, count); + + return count; +} + +static int ublk_process_io(struct ublk_queue *q) +{ + int ret, reapped; + + ublk_dbg(UBLK_DBG_QUEUE, "dev%d-q%d: to_submit %d inflight cmd %u stopping %d\n", + q->dev->dev_info.dev_id, + q->q_id, io_uring_sq_ready(&q->ring), + q->cmd_inflight, + (q->state & UBLKSRV_QUEUE_STOPPING)); + + if (ublk_queue_is_done(q)) + return -ENODEV; + + ret = io_uring_submit_and_wait(&q->ring, 1); + reapped = ublk_reap_events_uring(&q->ring); + + ublk_dbg(UBLK_DBG_QUEUE, "submit result %d, reapped %d stop %d idle %d\n", + ret, reapped, (q->state & UBLKSRV_QUEUE_STOPPING), + (q->state & UBLKSRV_QUEUE_IDLE)); + + return reapped; +} + +static void *ublk_io_handler_fn(void *data) +{ + struct ublk_queue *q = data; + int dev_id = q->dev->dev_info.dev_id; + int ret; + + ret = ublk_queue_init(q); + if (ret) { + ublk_err("ublk dev %d queue %d init queue failed\n", + dev_id, q->q_id); + return NULL; + } + ublk_dbg(UBLK_DBG_QUEUE, "tid %d: ublk dev %d queue %d started\n", + q->tid, dev_id, q->q_id); + + /* submit all io commands to ublk driver */ + ublk_submit_fetch_commands(q); + do { + if (ublk_process_io(q) < 0) + break; + } while (1); + + ublk_dbg(UBLK_DBG_QUEUE, "ublk dev %d queue %d exited\n", dev_id, q->q_id); + ublk_queue_deinit(q); + return NULL; +} + +static void ublk_set_parameters(struct ublk_dev *dev) +{ + int ret; + + ret = ublk_ctrl_set_params(dev, &dev->tgt.params); + if (ret) + ublk_err("dev %d set basic parameter failed %d\n", + dev->dev_info.dev_id, ret); +} + +static int ublk_send_dev_event(const struct dev_ctx *ctx, int dev_id) +{ + uint64_t id; + int evtfd = ctx->_evtfd; + + if (evtfd < 0) + return -EBADF; + + 
if (dev_id >= 0) + id = dev_id + 1; + else + id = ERROR_EVTFD_DEVID; + + if (write(evtfd, &id, sizeof(id)) != sizeof(id)) + return -EINVAL; + + return 0; +} + + +static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + int ret, i; + void *thread_ret; + const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + + ublk_dbg(UBLK_DBG_DEV, "%s enter\n", __func__); + + ret = ublk_dev_prep(ctx, dev); + if (ret) + return ret; + + for (i = 0; i < dinfo->nr_hw_queues; i++) { + dev->q[i].dev = dev; + dev->q[i].q_id = i; + pthread_create(&dev->q[i].thread, NULL, + ublk_io_handler_fn, + &dev->q[i]); + } + + /* everything is fine now, start us */ + ublk_set_parameters(dev); + ret = ublk_ctrl_start_dev(dev, getpid()); + if (ret < 0) { + ublk_err("%s: ublk_ctrl_start_dev failed: %d\n", __func__, ret); + goto fail; + } + + ublk_ctrl_get_info(dev); + if (ctx->fg) + ublk_ctrl_dump(dev); + else + ublk_send_dev_event(ctx, dev->dev_info.dev_id); + + /* wait until we are terminated */ + for (i = 0; i < dinfo->nr_hw_queues; i++) + pthread_join(dev->q[i].thread, &thread_ret); + fail: + ublk_dev_unprep(dev); + ublk_dbg(UBLK_DBG_DEV, "%s exit\n", __func__); + + return ret; +} + +static int wait_ublk_dev(const char *path, int evt_mask, unsigned timeout) +{ +#define EV_SIZE (sizeof(struct inotify_event)) +#define EV_BUF_LEN (128 * (EV_SIZE + 16)) + struct pollfd pfd; + int fd, wd; + int ret = -EINVAL; + const char *dev_name = basename(path); + + fd = inotify_init(); + if (fd < 0) { + ublk_dbg(UBLK_DBG_DEV, "%s: inotify init failed\n", __func__); + return fd; + } + + wd = inotify_add_watch(fd, "/dev", evt_mask); + if (wd == -1) { + ublk_dbg(UBLK_DBG_DEV, "%s: add watch for /dev failed\n", __func__); + goto fail; + } + + pfd.fd = fd; + pfd.events = POLL_IN; + while (1) { + int i = 0; + char buffer[EV_BUF_LEN]; + ret = poll(&pfd, 1, 1000 * timeout); + + if (ret == -1) { + ublk_err("%s: poll inotify failed: %d\n", __func__, ret); + goto rm_watch; + } else if (ret == 0) { + ublk_err("%s: poll inotify timeout\n", __func__); + ret = -ETIMEDOUT; + goto rm_watch; + } + + ret = read(fd, buffer, EV_BUF_LEN); + if (ret < 0) { + ublk_err("%s: read inotify fd failed\n", __func__); + goto rm_watch; + } + + while (i < ret) { + struct inotify_event *event = (struct inotify_event *)&buffer[i]; + + ublk_dbg(UBLK_DBG_DEV, "%s: inotify event %x %s\n", + __func__, event->mask, event->name); + if (event->mask & evt_mask) { + if (!strcmp(event->name, dev_name)) { + ret = 0; + goto rm_watch; + } + } + i += EV_SIZE + event->len; + } + } +rm_watch: + inotify_rm_watch(fd, wd); +fail: + close(fd); + return ret; +} + +static int ublk_stop_io_daemon(const struct ublk_dev *dev) +{ + int daemon_pid = dev->dev_info.ublksrv_pid; + int dev_id = dev->dev_info.dev_id; + char ublkc[64]; + int ret = 0; + + if (daemon_pid < 0) + return 0; + + /* daemon may be dead already */ + if (kill(daemon_pid, 0) < 0) + goto wait; + + snprintf(ublkc, sizeof(ublkc), "/dev/%s%d", "ublkc", dev_id); + + /* ublk char device may be gone already */ + if (access(ublkc, F_OK) != 0) + goto wait; + + /* Wait until ublk char device is closed, when the daemon is shutdown */ + ret = wait_ublk_dev(ublkc, IN_CLOSE, 10); + /* double check and since it may be closed before starting inotify */ + if (ret == -ETIMEDOUT) + ret = kill(daemon_pid, 0) < 0; +wait: + waitpid(daemon_pid, NULL, 0); + ublk_dbg(UBLK_DBG_DEV, "%s: pid %d dev_id %d ret %d\n", + __func__, daemon_pid, dev_id, ret); + + return ret; +} + +static int __cmd_dev_add(const struct dev_ctx *ctx) +{ + 
unsigned nr_queues = ctx->nr_hw_queues; + const char *tgt_type = ctx->tgt_type; + unsigned depth = ctx->queue_depth; + __u64 features; + const struct ublk_tgt_ops *ops; + struct ublksrv_ctrl_dev_info *info; + struct ublk_dev *dev; + int dev_id = ctx->dev_id; + int ret, i; + + ops = ublk_find_tgt(tgt_type); + if (!ops) { + ublk_err("%s: no such tgt type, type %s\n", + __func__, tgt_type); + return -ENODEV; + } + + if (nr_queues > UBLK_MAX_QUEUES || depth > UBLK_QUEUE_DEPTH) { + ublk_err("%s: invalid nr_queues or depth queues %u depth %u\n", + __func__, nr_queues, depth); + return -EINVAL; + } + + dev = ublk_ctrl_init(); + if (!dev) { + ublk_err("%s: can't alloc dev id %d, type %s\n", + __func__, dev_id, tgt_type); + return -ENOMEM; + } + + /* kernel doesn't support get_features */ + ret = ublk_ctrl_get_features(dev, &features); + if (ret < 0) + return -EINVAL; + + if (!(features & UBLK_F_CMD_IOCTL_ENCODE)) + return -ENOTSUP; + + info = &dev->dev_info; + info->dev_id = ctx->dev_id; + info->nr_hw_queues = nr_queues; + info->queue_depth = depth; + info->flags = ctx->flags; + dev->tgt.ops = ops; + dev->tgt.sq_depth = depth; + dev->tgt.cq_depth = depth; + + for (i = 0; i < MAX_BACK_FILES; i++) { + if (ctx->files[i]) { + strcpy(dev->tgt.backing_file[i], ctx->files[i]); + dev->tgt.nr_backing_files++; + } + } + + ret = ublk_ctrl_add_dev(dev); + if (ret < 0) { + ublk_err("%s: can't add dev id %d, type %s ret %d\n", + __func__, dev_id, tgt_type, ret); + goto fail; + } + + ret = ublk_start_daemon(ctx, dev); + ublk_dbg(UBLK_DBG_DEV, "%s: daemon exit %d\b", ret); + if (ret < 0) + ublk_ctrl_del_dev(dev); + +fail: + if (ret < 0) + ublk_send_dev_event(ctx, -1); + ublk_ctrl_deinit(dev); + return ret; +} + +static int __cmd_dev_list(struct dev_ctx *ctx); + +static int cmd_dev_add(struct dev_ctx *ctx) +{ + int res; + + if (ctx->fg) + goto run; + + ctx->_evtfd = eventfd(0, 0); + if (ctx->_evtfd < 0) { + ublk_err("%s: failed to create eventfd %s\n", __func__, strerror(errno)); + exit(-1); + } + + setsid(); + res = fork(); + if (res == 0) { +run: + res = __cmd_dev_add(ctx); + return res; + } else if (res > 0) { + uint64_t id; + + res = read(ctx->_evtfd, &id, sizeof(id)); + close(ctx->_evtfd); + if (res == sizeof(id) && id != ERROR_EVTFD_DEVID) { + ctx->dev_id = id - 1; + return __cmd_dev_list(ctx); + } + exit(EXIT_FAILURE); + } else { + return res; + } +} + +static int __cmd_dev_del(struct dev_ctx *ctx) +{ + int number = ctx->dev_id; + struct ublk_dev *dev; + int ret; + + dev = ublk_ctrl_init(); + dev->dev_info.dev_id = number; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) + goto fail; + + ret = ublk_ctrl_stop_dev(dev); + if (ret < 0) + ublk_err("%s: stop dev %d failed ret %d\n", __func__, number, ret); + + ret = ublk_stop_io_daemon(dev); + if (ret < 0) + ublk_err("%s: stop daemon id %d dev %d, ret %d\n", + __func__, dev->dev_info.ublksrv_pid, number, ret); + ublk_ctrl_del_dev(dev); +fail: + ublk_ctrl_deinit(dev); + + return (ret >= 0) ? 
0 : ret; +} + +static int cmd_dev_del(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_del(ctx); + + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_del(ctx); + } + return 0; +} + +static int __cmd_dev_list(struct dev_ctx *ctx) +{ + struct ublk_dev *dev = ublk_ctrl_init(); + int ret; + + if (!dev) + return -ENODEV; + + dev->dev_info.dev_id = ctx->dev_id; + + ret = ublk_ctrl_get_info(dev); + if (ret < 0) { + if (ctx->logging) + ublk_err("%s: can't get dev info from %d: %d\n", + __func__, ctx->dev_id, ret); + } else { + ublk_ctrl_dump(dev); + } + + ublk_ctrl_deinit(dev); + + return ret; +} + +static int cmd_dev_list(struct dev_ctx *ctx) +{ + int i; + + if (ctx->dev_id >= 0 || !ctx->all) + return __cmd_dev_list(ctx); + + ctx->logging = false; + for (i = 0; i < 255; i++) { + ctx->dev_id = i; + __cmd_dev_list(ctx); + } + return 0; +} + +static int cmd_dev_get_features(void) +{ +#define const_ilog2(x) (63 - __builtin_clzll(x)) + static const char *feat_map[] = { + [const_ilog2(UBLK_F_SUPPORT_ZERO_COPY)] = "ZERO_COPY", + [const_ilog2(UBLK_F_URING_CMD_COMP_IN_TASK)] = "COMP_IN_TASK", + [const_ilog2(UBLK_F_NEED_GET_DATA)] = "GET_DATA", + [const_ilog2(UBLK_F_USER_RECOVERY)] = "USER_RECOVERY", + [const_ilog2(UBLK_F_USER_RECOVERY_REISSUE)] = "RECOVERY_REISSUE", + [const_ilog2(UBLK_F_UNPRIVILEGED_DEV)] = "UNPRIVILEGED_DEV", + [const_ilog2(UBLK_F_CMD_IOCTL_ENCODE)] = "CMD_IOCTL_ENCODE", + [const_ilog2(UBLK_F_USER_COPY)] = "USER_COPY", + [const_ilog2(UBLK_F_ZONED)] = "ZONED", + [const_ilog2(UBLK_F_USER_RECOVERY_FAIL_IO)] = "RECOVERY_FAIL_IO", + }; + struct ublk_dev *dev; + __u64 features = 0; + int ret; + + dev = ublk_ctrl_init(); + if (!dev) { + fprintf(stderr, "ublksrv_ctrl_init failed id\n"); + return -EOPNOTSUPP; + } + + ret = ublk_ctrl_get_features(dev, &features); + if (!ret) { + int i; + + printf("ublk_drv features: 0x%llx\n", features); + + for (i = 0; i < sizeof(features) * 8; i++) { + const char *feat; + + if (!((1ULL << i) & features)) + continue; + if (i < sizeof(feat_map) / sizeof(feat_map[0])) + feat = feat_map[i]; + else + feat = "unknown"; + printf("\t%-20s: 0x%llx\n", feat, 1ULL << i); + } + } + + return ret; +} + +static int cmd_dev_help(char *exe) +{ + printf("%s add -t [null|loop] [-q nr_queues] [-d depth] [-n dev_id] [backfile1] [backfile2] ...\n", exe); + printf("\t default: nr_queues=2(max 4), depth=128(max 128), dev_id=-1(auto allocation)\n"); + printf("%s del [-n dev_id] -a \n", exe); + printf("\t -a delete all devices -n delete specified device\n"); + printf("%s list [-n dev_id] -a \n", exe); + printf("\t -a list all devices, -n list specified device, default -a \n"); + printf("%s features\n", exe); + return 0; +} + +int main(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "all", 0, NULL, 'a' }, + { "type", 1, NULL, 't' }, + { "number", 1, NULL, 'n' }, + { "queues", 1, NULL, 'q' }, + { "depth", 1, NULL, 'd' }, + { "debug_mask", 1, NULL, 0 }, + { "quiet", 0, NULL, 0 }, + { "zero_copy", 0, NULL, 'z' }, + { "foreground", 0, NULL, 0 }, + { "chunk_size", 1, NULL, 0 }, + { 0, 0, 0, 0 } + }; + int option_idx, opt; + const char *cmd = argv[1]; + struct dev_ctx ctx = { + .queue_depth = 128, + .nr_hw_queues = 2, + .dev_id = -1, + .tgt_type = "unknown", + .chunk_size = 65536, /* def chunk size is 64K */ + }; + int ret = -EINVAL, i; + + if (argc == 1) + return ret; + + optind = 2; + while ((opt = getopt_long(argc, argv, "t:n:d:q:az", + longopts, &option_idx)) != -1) { + switch (opt) { + case 'a': + ctx.all = 1; 
+ break; + case 'n': + ctx.dev_id = strtol(optarg, NULL, 10); + break; + case 't': + if (strlen(optarg) < sizeof(ctx.tgt_type)) + strcpy(ctx.tgt_type, optarg); + break; + case 'q': + ctx.nr_hw_queues = strtol(optarg, NULL, 10); + break; + case 'd': + ctx.queue_depth = strtol(optarg, NULL, 10); + break; + case 'z': + ctx.flags |= UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_USER_COPY; + break; + case 0: + if (!strcmp(longopts[option_idx].name, "debug_mask")) + ublk_dbg_mask = strtol(optarg, NULL, 16); + if (!strcmp(longopts[option_idx].name, "quiet")) + ublk_dbg_mask = 0; + if (!strcmp(longopts[option_idx].name, "foreground")) + ctx.fg = 1; + if (!strcmp(longopts[option_idx].name, "chunk_size")) + ctx.chunk_size = strtol(optarg, NULL, 10); + } + } + + i = optind; + while (i < argc && ctx.nr_files < MAX_BACK_FILES) { + ctx.files[ctx.nr_files++] = argv[i++]; + } + + if (!strcmp(cmd, "add")) + ret = cmd_dev_add(&ctx); + else if (!strcmp(cmd, "del")) + ret = cmd_dev_del(&ctx); + else if (!strcmp(cmd, "list")) { + ctx.all = 1; + ret = cmd_dev_list(&ctx); + } else if (!strcmp(cmd, "help")) + ret = cmd_dev_help(argv[0]); + else if (!strcmp(cmd, "features")) + ret = cmd_dev_get_features(); + else + cmd_dev_help(argv[0]); + + return ret; +} diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h new file mode 100644 index 000000000000..f31a5c4d4143 --- /dev/null +++ b/tools/testing/selftests/ublk/kublk.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef KUBLK_INTERNAL_H +#define KUBLK_INTERNAL_H + +#include <unistd.h> +#include <stdlib.h> +#include <assert.h> +#include <stdio.h> +#include <stdarg.h> +#include <string.h> +#include <pthread.h> +#include <getopt.h> +#include <limits.h> +#include <poll.h> +#include <fcntl.h> +#include <sys/syscall.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/inotify.h> +#include <sys/wait.h> +#include <sys/eventfd.h> +#include <sys/uio.h> +#include <liburing.h> +#include <linux/ublk_cmd.h> +#include "ublk_dep.h" + +#define __maybe_unused __attribute__((unused)) +#define MAX_BACK_FILES 4 +#ifndef min +#define min(a, b) ((a) < (b) ? 
(a) : (b)) +#endif + +/****************** part 1: libublk ********************/ + +#define CTRL_DEV "/dev/ublk-control" +#define UBLKC_DEV "/dev/ublkc" +#define UBLKB_DEV "/dev/ublkb" +#define UBLK_CTRL_RING_DEPTH 32 +#define ERROR_EVTFD_DEVID -2 + +/* queue idle timeout */ +#define UBLKSRV_IO_IDLE_SECS 20 + +#define UBLK_IO_MAX_BYTES (1 << 20) +#define UBLK_MAX_QUEUES 4 +#define UBLK_QUEUE_DEPTH 128 + +#define UBLK_DBG_DEV (1U << 0) +#define UBLK_DBG_QUEUE (1U << 1) +#define UBLK_DBG_IO_CMD (1U << 2) +#define UBLK_DBG_IO (1U << 3) +#define UBLK_DBG_CTRL_CMD (1U << 4) +#define UBLK_LOG (1U << 5) + +struct ublk_dev; +struct ublk_queue; + +struct dev_ctx { + char tgt_type[16]; + unsigned long flags; + unsigned nr_hw_queues; + unsigned queue_depth; + int dev_id; + int nr_files; + char *files[MAX_BACK_FILES]; + unsigned int logging:1; + unsigned int all:1; + unsigned int fg:1; + + /* stripe */ + unsigned int chunk_size; + + int _evtfd; +}; + +struct ublk_ctrl_cmd_data { + __u32 cmd_op; +#define CTRL_CMD_HAS_DATA 1 +#define CTRL_CMD_HAS_BUF 2 + __u32 flags; + + __u64 data[2]; + __u64 addr; + __u32 len; +}; + +struct ublk_io { + char *buf_addr; + +#define UBLKSRV_NEED_FETCH_RQ (1UL << 0) +#define UBLKSRV_NEED_COMMIT_RQ_COMP (1UL << 1) +#define UBLKSRV_IO_FREE (1UL << 2) + unsigned short flags; + unsigned short refs; /* used by target code only */ + + int result; + + unsigned short tgt_ios; + void *private_data; +}; + +struct ublk_tgt_ops { + const char *name; + int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); + void (*deinit_tgt)(struct ublk_dev *); + + int (*queue_io)(struct ublk_queue *, int tag); + void (*tgt_io_done)(struct ublk_queue *, + int tag, const struct io_uring_cqe *); +}; + +struct ublk_tgt { + unsigned long dev_size; + unsigned int sq_depth; + unsigned int cq_depth; + const struct ublk_tgt_ops *ops; + struct ublk_params params; + + int nr_backing_files; + unsigned long backing_file_size[MAX_BACK_FILES]; + char backing_file[MAX_BACK_FILES][PATH_MAX]; +}; + +struct ublk_queue { + int q_id; + int q_depth; + unsigned int cmd_inflight; + unsigned int io_inflight; + struct ublk_dev *dev; + const struct ublk_tgt_ops *tgt_ops; + char *io_cmd_buf; + struct io_uring ring; + struct ublk_io ios[UBLK_QUEUE_DEPTH]; +#define UBLKSRV_QUEUE_STOPPING (1U << 0) +#define UBLKSRV_QUEUE_IDLE (1U << 1) +#define UBLKSRV_NO_BUF (1U << 2) +#define UBLKSRV_ZC (1U << 3) + unsigned state; + pid_t tid; + pthread_t thread; +}; + +struct ublk_dev { + struct ublk_tgt tgt; + struct ublksrv_ctrl_dev_info dev_info; + struct ublk_queue q[UBLK_MAX_QUEUES]; + + int fds[MAX_BACK_FILES + 1]; /* fds[0] points to /dev/ublkcN */ + int nr_fds; + int ctrl_fd; + struct io_uring ring; + + void *private_data; +}; + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + unsigned long __mptr = (unsigned long)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); }) +#endif + +#define round_up(val, rnd) \ + (((val) + ((rnd) - 1)) & ~((rnd) - 1)) + + +extern unsigned int ublk_dbg_mask; +extern int ublk_queue_io_cmd(struct ublk_queue *q, struct ublk_io *io, unsigned tag); + +static inline int is_target_io(__u64 user_data) +{ + return (user_data & (1ULL << 63)) != 0; +} + +static inline __u64 build_user_data(unsigned tag, unsigned op, + unsigned tgt_data, unsigned is_target_io) +{ + assert(!(tag >> 16) && !(op >> 8) && !(tgt_data >> 16)); + + return tag | (op << 16) | (tgt_data << 24) | (__u64)is_target_io << 63; +} 
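The selftest daemon multiplexes driver commands and target I/O on one ring, so the 64-bit user_data carries everything a single CQE handler needs: the tag in bits 0-15, the command/op number in bits 16-23, per-target data in bits 24-39, and bit 63 marking target I/O. Below is a minimal standalone sketch (not part of the patch) that exercises the same bit layout as build_user_data() and the user_data_to_*() decoders; the explicit 64-bit cast before the tgt_data shift and the sample values are additions of this sketch, not taken from the selftest code.

/*
 * Standalone sketch: round-trip of the kublk user_data layout.
 * The (uint64_t) cast before the tgt_data shift is a defensive choice here;
 * the selftests only ever pass small tgt_data values.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t pack(unsigned tag, unsigned op, unsigned tgt_data, int is_tgt)
{
	return tag | (op << 16) | ((uint64_t)tgt_data << 24) |
		((uint64_t)!!is_tgt << 63);
}

int main(void)
{
	uint64_t d = pack(42, 0x21, 3, 1);

	assert((d & 0xffff) == 42);          /* user_data_to_tag()      */
	assert(((d >> 16) & 0xff) == 0x21);  /* user_data_to_op()       */
	assert(((d >> 24) & 0xffff) == 3);   /* user_data_to_tgt_data() */
	assert(d >> 63);                     /* is_target_io()          */
	printf("user_data=0x%llx\n", (unsigned long long)d);
	return 0;
}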
+ +static inline unsigned int user_data_to_tag(__u64 user_data) +{ + return user_data & 0xffff; +} + +static inline unsigned int user_data_to_op(__u64 user_data) +{ + return (user_data >> 16) & 0xff; +} + +static inline unsigned int user_data_to_tgt_data(__u64 user_data) +{ + return (user_data >> 24) & 0xffff; +} + +static inline unsigned short ublk_cmd_op_nr(unsigned int op) +{ + return _IOC_NR(op); +} + +static inline void ublk_err(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); +} + +static inline void ublk_log(const char *fmt, ...) +{ + if (ublk_dbg_mask & UBLK_LOG) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline void ublk_dbg(int level, const char *fmt, ...) +{ + if (level & ublk_dbg_mask) { + va_list ap; + + va_start(ap, fmt); + vfprintf(stdout, fmt, ap); + } +} + +static inline int ublk_queue_alloc_sqes(struct ublk_queue *q, + struct io_uring_sqe *sqes[], int nr_sqes) +{ + unsigned left = io_uring_sq_space_left(&q->ring); + int i; + + if (left < nr_sqes) + io_uring_submit(&q->ring); + + for (i = 0; i < nr_sqes; i++) { + sqes[i] = io_uring_get_sqe(&q->ring); + if (!sqes[i]) + return i; + } + + return nr_sqes; +} + +static inline void io_uring_prep_buf_register(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_REGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void io_uring_prep_buf_unregister(struct io_uring_sqe *sqe, + int dev_fd, int tag, int q_id, __u64 index) +{ + struct ublksrv_io_cmd *cmd = (struct ublksrv_io_cmd *)sqe->cmd; + + io_uring_prep_read(sqe, dev_fd, 0, 0, 0); + sqe->opcode = IORING_OP_URING_CMD; + sqe->flags |= IOSQE_FIXED_FILE; + sqe->cmd_op = UBLK_U_IO_UNREGISTER_IO_BUF; + + cmd->tag = tag; + cmd->addr = index; + cmd->q_id = q_id; +} + +static inline void *ublk_get_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return (void *)&sqe->cmd; +} + +static inline void ublk_set_io_res(struct ublk_queue *q, int tag, int res) +{ + q->ios[tag].result = res; +} + +static inline int ublk_get_io_res(const struct ublk_queue *q, unsigned tag) +{ + return q->ios[tag].result; +} + +static inline void ublk_mark_io_done(struct ublk_io *io, int res) +{ + io->flags |= (UBLKSRV_NEED_COMMIT_RQ_COMP | UBLKSRV_IO_FREE); + io->result = res; +} + +static inline const struct ublksrv_io_desc *ublk_get_iod(const struct ublk_queue *q, int tag) +{ + return (struct ublksrv_io_desc *)&(q->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline void ublk_set_sqe_cmd_op(struct io_uring_sqe *sqe, __u32 cmd_op) +{ + __u32 *addr = (__u32 *)&sqe->off; + + addr[0] = cmd_op; + addr[1] = 0; +} + +static inline struct ublk_io *ublk_get_io(struct ublk_queue *q, unsigned tag) +{ + return &q->ios[tag]; +} + +static inline int ublk_complete_io(struct ublk_queue *q, unsigned tag, int res) +{ + struct ublk_io *io = &q->ios[tag]; + + ublk_mark_io_done(io, res); + + return ublk_queue_io_cmd(q, io, tag); +} + +static inline void ublk_queued_tgt_io(struct ublk_queue *q, unsigned tag, int queued) +{ + if (queued < 0) + ublk_complete_io(q, tag, queued); + else { + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight += queued; + io->tgt_ios = queued; + io->result = 0; + } +} + +static inline int ublk_completed_tgt_io(struct ublk_queue *q, 
unsigned tag) +{ + struct ublk_io *io = ublk_get_io(q, tag); + + q->io_inflight--; + + return --io->tgt_ios == 0; +} + +static inline int ublk_queue_use_zc(const struct ublk_queue *q) +{ + return q->state & UBLKSRV_ZC; +} + +extern const struct ublk_tgt_ops null_tgt_ops; +extern const struct ublk_tgt_ops loop_tgt_ops; +extern const struct ublk_tgt_ops stripe_tgt_ops; + +void backing_file_tgt_deinit(struct ublk_dev *dev); +int backing_file_tgt_init(struct ublk_dev *dev); + +static inline unsigned int ilog2(unsigned int x) +{ + if (x == 0) + return 0; + return (sizeof(x) * 8 - 1) - __builtin_clz(x); +} +#endif diff --git a/tools/testing/selftests/ublk/null.c b/tools/testing/selftests/ublk/null.c new file mode 100644 index 000000000000..899875ff50fe --- /dev/null +++ b/tools/testing/selftests/ublk/null.c @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "kublk.h" + +#ifndef IORING_NOP_INJECT_RESULT +#define IORING_NOP_INJECT_RESULT (1U << 0) +#endif + +#ifndef IORING_NOP_FIXED_BUFFER +#define IORING_NOP_FIXED_BUFFER (1U << 3) +#endif + +static int ublk_null_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; + unsigned long dev_size = 250UL << 30; + + dev->tgt.dev_size = dev_size; + dev->tgt.params = (struct ublk_params) { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = info->max_io_buf_bytes >> 9, + .dev_sectors = dev_size >> 9, + }, + }; + + if (info->flags & UBLK_F_SUPPORT_ZERO_COPY) + dev->tgt.sq_depth = dev->tgt.cq_depth = 2 * info->queue_depth; + return 0; +} + +static int null_queue_zc_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + struct io_uring_sqe *sqe[3]; + + ublk_queue_alloc_sqes(q, sqe, 3); + + io_uring_prep_buf_register(sqe[0], 0, tag, q->q_id, tag); + sqe[0]->user_data = build_user_data(tag, + ublk_cmd_op_nr(sqe[0]->cmd_op), 0, 1); + sqe[0]->flags |= IOSQE_CQE_SKIP_SUCCESS | IOSQE_IO_HARDLINK; + + io_uring_prep_nop(sqe[1]); + sqe[1]->buf_index = tag; + sqe[1]->flags |= IOSQE_FIXED_FILE | IOSQE_IO_HARDLINK; + sqe[1]->rw_flags = IORING_NOP_FIXED_BUFFER | IORING_NOP_INJECT_RESULT; + sqe[1]->len = iod->nr_sectors << 9; /* injected result */ + sqe[1]->user_data = build_user_data(tag, ublk_op, 0, 1); + + io_uring_prep_buf_unregister(sqe[2], 0, tag, q->q_id, tag); + sqe[2]->user_data = build_user_data(tag, ublk_cmd_op_nr(sqe[2]->cmd_op), 0, 1); + + // buf register is marked as IOSQE_CQE_SKIP_SUCCESS + return 2; +} + +static void ublk_null_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + + if (cqe->res < 0 || op != ublk_cmd_op_nr(UBLK_U_IO_UNREGISTER_IO_BUF)) { + if (!io->result) + io->result = cqe->res; + if (cqe->res < 0) + ublk_err("%s: io failed op %x user_data %lx\n", + __func__, op, cqe->user_data); + } + + /* buffer register op is IOSQE_CQE_SKIP_SUCCESS */ + if (op == ublk_cmd_op_nr(UBLK_U_IO_REGISTER_IO_BUF)) + io->tgt_ios += 1; + + if (ublk_completed_tgt_io(q, tag)) + ublk_complete_io(q, tag, io->result); +} + +static int ublk_null_queue_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + int zc = ublk_queue_use_zc(q); + int queued; + + if (!zc) { + ublk_complete_io(q, tag, iod->nr_sectors << 9); + return 0; + } + + 
queued = null_queue_zc_io(q, tag); + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +const struct ublk_tgt_ops null_tgt_ops = { + .name = "null", + .init_tgt = ublk_null_tgt_init, + .queue_io = ublk_null_queue_io, + .tgt_io_done = ublk_null_io_done, +}; diff --git a/tools/testing/selftests/ublk/stripe.c b/tools/testing/selftests/ublk/stripe.c new file mode 100644 index 000000000000..98c564b12f3c --- /dev/null +++ b/tools/testing/selftests/ublk/stripe.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "kublk.h" + +#define NR_STRIPE MAX_BACK_FILES + +struct stripe_conf { + unsigned nr_files; + unsigned shift; +}; + +struct stripe { + loff_t start; + unsigned nr_sects; + int seq; + + struct iovec *vec; + unsigned nr_vec; + unsigned cap; +}; + +struct stripe_array { + struct stripe s[NR_STRIPE]; + unsigned nr; + struct iovec _vec[]; +}; + +static inline const struct stripe_conf *get_chunk_shift(const struct ublk_queue *q) +{ + return (struct stripe_conf *)q->dev->private_data; +} + +static inline unsigned calculate_nr_vec(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + const unsigned shift = conf->shift - 9; + const unsigned unit_sects = conf->nr_files << shift; + loff_t start = iod->start_sector; + loff_t end = start + iod->nr_sectors; + + return (end / unit_sects) - (start / unit_sects) + 1; +} + +static struct stripe_array *alloc_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod) +{ + unsigned nr_vecs = calculate_nr_vec(conf, iod); + unsigned total = nr_vecs * conf->nr_files; + struct stripe_array *s; + int i; + + s = malloc(sizeof(*s) + total * sizeof(struct iovec)); + + s->nr = 0; + for (i = 0; i < conf->nr_files; i++) { + struct stripe *t = &s->s[i]; + + t->nr_vec = 0; + t->vec = &s->_vec[i * nr_vecs]; + t->nr_sects = 0; + t->cap = nr_vecs; + } + + return s; +} + +static void free_stripe_array(struct stripe_array *s) +{ + free(s); +} + +static void calculate_stripe_array(const struct stripe_conf *conf, + const struct ublksrv_io_desc *iod, struct stripe_array *s) +{ + const unsigned shift = conf->shift - 9; + const unsigned chunk_sects = 1 << shift; + const unsigned unit_sects = conf->nr_files << shift; + off64_t start = iod->start_sector; + off64_t end = start + iod->nr_sectors; + unsigned long done = 0; + unsigned idx = 0; + + while (start < end) { + unsigned nr_sects = chunk_sects - (start & (chunk_sects - 1)); + loff_t unit_off = (start / unit_sects) * unit_sects; + unsigned seq = (start - unit_off) >> shift; + struct stripe *this = &s->s[idx]; + loff_t stripe_off = (unit_off / conf->nr_files) + + (start & (chunk_sects - 1)); + + if (nr_sects > end - start) + nr_sects = end - start; + if (this->nr_sects == 0) { + this->nr_sects = nr_sects; + this->start = stripe_off; + this->seq = seq; + s->nr += 1; + } else { + assert(seq == this->seq); + assert(this->start + this->nr_sects == stripe_off); + this->nr_sects += nr_sects; + } + + assert(this->nr_vec < this->cap); + this->vec[this->nr_vec].iov_base = (void *)(iod->addr + done); + this->vec[this->nr_vec++].iov_len = nr_sects << 9; + + start += nr_sects; + done += nr_sects << 9; + idx = (idx + 1) % conf->nr_files; + } +} + +static inline enum io_uring_op stripe_to_uring_op(const struct ublksrv_io_desc *iod) +{ + unsigned ublk_op = ublksrv_get_op(iod); + + if (ublk_op == UBLK_IO_OP_READ) + return IORING_OP_READV; + else if (ublk_op == UBLK_IO_OP_WRITE) + return IORING_OP_WRITEV; + assert(0); +} + +static int stripe_queue_tgt_rw_io(struct ublk_queue *q, const 
struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + enum io_uring_op op = stripe_to_uring_op(iod); + struct io_uring_sqe *sqe[NR_STRIPE]; + struct stripe_array *s = alloc_stripe_array(conf, iod); + struct ublk_io *io = ublk_get_io(q, tag); + int i; + + io->private_data = s; + calculate_stripe_array(conf, iod, s); + + ublk_queue_alloc_sqes(q, sqe, s->nr); + for (i = 0; i < s->nr; i++) { + struct stripe *t = &s->s[i]; + + io_uring_prep_rw(op, sqe[i], + t->seq + 1, + (void *)t->vec, + t->nr_vec, + t->start << 9); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + /* bit63 marks us as tgt io */ + sqe[i]->user_data = build_user_data(tag, ublksrv_get_op(iod), i, 1); + } + return s->nr; +} + +static int handle_flush(struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) +{ + const struct stripe_conf *conf = get_chunk_shift(q); + struct io_uring_sqe *sqe[NR_STRIPE]; + int i; + + ublk_queue_alloc_sqes(q, sqe, conf->nr_files); + for (i = 0; i < conf->nr_files; i++) { + io_uring_prep_fsync(sqe[i], i + 1, IORING_FSYNC_DATASYNC); + io_uring_sqe_set_flags(sqe[i], IOSQE_FIXED_FILE); + sqe[i]->user_data = build_user_data(tag, UBLK_IO_OP_FLUSH, 0, 1); + } + return conf->nr_files; +} + +static int stripe_queue_tgt_io(struct ublk_queue *q, int tag) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned ublk_op = ublksrv_get_op(iod); + int ret = 0; + + switch (ublk_op) { + case UBLK_IO_OP_FLUSH: + ret = handle_flush(q, iod, tag); + break; + case UBLK_IO_OP_WRITE_ZEROES: + case UBLK_IO_OP_DISCARD: + ret = -ENOTSUP; + break; + case UBLK_IO_OP_READ: + case UBLK_IO_OP_WRITE: + ret = stripe_queue_tgt_rw_io(q, iod, tag); + break; + default: + ret = -EINVAL; + break; + } + ublk_dbg(UBLK_DBG_IO, "%s: tag %d ublk io %x %llx %u ret %d\n", __func__, tag, + iod->op_flags, iod->start_sector, iod->nr_sectors << 9, ret); + return ret; +} + +static int ublk_stripe_queue_io(struct ublk_queue *q, int tag) +{ + int queued = stripe_queue_tgt_io(q, tag); + + ublk_queued_tgt_io(q, tag, queued); + return 0; +} + +static void ublk_stripe_io_done(struct ublk_queue *q, int tag, + const struct io_uring_cqe *cqe) +{ + const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); + unsigned op = user_data_to_op(cqe->user_data); + struct ublk_io *io = ublk_get_io(q, tag); + int res = cqe->res; + + if (res < 0) { + if (!io->result) + io->result = res; + ublk_err("%s: io failure %d tag %u\n", __func__, res, tag); + } + + /* fail short READ/WRITE simply */ + if (op == UBLK_IO_OP_READ || op == UBLK_IO_OP_WRITE) { + unsigned seq = user_data_to_tgt_data(cqe->user_data); + struct stripe_array *s = io->private_data; + + if (res < s->s[seq].vec->iov_len) + io->result = -EIO; + } + + if (ublk_completed_tgt_io(q, tag)) { + int res = io->result; + + if (!res) + res = iod->nr_sectors << 9; + + ublk_complete_io(q, tag, res); + + free_stripe_array(io->private_data); + io->private_data = NULL; + } +} + +static int ublk_stripe_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) +{ + struct ublk_params p = { + .types = UBLK_PARAM_TYPE_BASIC, + .basic = { + .attrs = UBLK_ATTR_VOLATILE_CACHE, + .logical_bs_shift = 9, + .physical_bs_shift = 12, + .io_opt_shift = 12, + .io_min_shift = 9, + .max_sectors = dev->dev_info.max_io_buf_bytes >> 9, + }, + }; + unsigned chunk_size = ctx->chunk_size; + struct stripe_conf *conf; + unsigned chunk_shift; + loff_t bytes = 0; + int ret, i; + + if ((chunk_size & (chunk_size - 1)) || !chunk_size) { + ublk_err("invalid chunk size %u\n", 
chunk_size); + return -EINVAL; + } + + if (chunk_size < 4096 || chunk_size > 512 * 1024) { + ublk_err("invalid chunk size %u\n", chunk_size); + return -EINVAL; + } + + chunk_shift = ilog2(chunk_size); + + ret = backing_file_tgt_init(dev); + if (ret) + return ret; + + if (!dev->tgt.nr_backing_files || dev->tgt.nr_backing_files > NR_STRIPE) + return -EINVAL; + + assert(dev->nr_fds == dev->tgt.nr_backing_files + 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) + dev->tgt.backing_file_size[i] &= ~((1 << chunk_shift) - 1); + + for (i = 0; i < dev->tgt.nr_backing_files; i++) { + unsigned long size = dev->tgt.backing_file_size[i]; + + if (size != dev->tgt.backing_file_size[0]) + return -EINVAL; + bytes += size; + } + + conf = malloc(sizeof(*conf)); + conf->shift = chunk_shift; + conf->nr_files = dev->tgt.nr_backing_files; + + dev->private_data = conf; + dev->tgt.dev_size = bytes; + p.basic.dev_sectors = bytes >> 9; + dev->tgt.params = p; + dev->tgt.sq_depth = dev->dev_info.queue_depth * conf->nr_files; + dev->tgt.cq_depth = dev->dev_info.queue_depth * conf->nr_files; + + printf("%s: shift %u files %u\n", __func__, conf->shift, conf->nr_files); + + return 0; +} + +static void ublk_stripe_tgt_deinit(struct ublk_dev *dev) +{ + free(dev->private_data); + backing_file_tgt_deinit(dev); +} + +const struct ublk_tgt_ops stripe_tgt_ops = { + .name = "stripe", + .init_tgt = ublk_stripe_tgt_init, + .deinit_tgt = ublk_stripe_tgt_deinit, + .queue_io = ublk_stripe_queue_io, + .tgt_io_done = ublk_stripe_io_done, +}; diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh new file mode 100755 index 000000000000..75f54ac6b1c4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_common.sh @@ -0,0 +1,246 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +UBLK_SKIP_CODE=4 + +_have_program() { + if command -v "$1" >/dev/null 2>&1; then + return 0 + fi + return 1 +} + +_get_disk_dev_t() { + local dev_id=$1 + local dev + local major + local minor + + dev=/dev/ublkb"${dev_id}" + major=$(stat -c '%Hr' "$dev") + minor=$(stat -c '%Lr' "$dev") + + echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) )) +} + +_create_backfile() { + local my_size=$1 + local my_file + + my_file=$(mktemp ublk_file_"${my_size}"_XXXXX) + truncate -s "${my_size}" "${my_file}" + echo "$my_file" +} + +_remove_backfile() { + local file=$1 + + [ -f "$file" ] && rm -f "$file" +} + +_create_tmp_dir() { + local my_file; + + my_file=$(mktemp -d ublk_dir_XXXXX) + echo "$my_file" +} + +_remove_tmp_dir() { + local dir=$1 + + [ -d "$dir" ] && rmdir "$dir" +} + +_mkfs_mount_test() +{ + local dev=$1 + local err_code=0 + local mnt_dir; + + mnt_dir=$(_create_tmp_dir) + mkfs.ext4 -F "$dev" > /dev/null 2>&1 + err_code=$? + if [ $err_code -ne 0 ]; then + return $err_code + fi + + mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1 + umount "$dev" + err_code=$? 
+ _remove_tmp_dir "$mnt_dir" + if [ $err_code -ne 0 ]; then + return $err_code + fi +} + +_check_root() { + local ksft_skip=4 + + if [ $UID != 0 ]; then + echo please run this as root >&2 + exit $ksft_skip + fi +} + +_remove_ublk_devices() { + ${UBLK_PROG} del -a + modprobe -r ublk_drv > /dev/null 2>&1 +} + +_get_ublk_dev_state() { + ${UBLK_PROG} list -n "$1" | grep "state" | awk '{print $11}' +} + +_get_ublk_daemon_pid() { + ${UBLK_PROG} list -n "$1" | grep "pid" | awk '{print $7}' +} + +_prep_test() { + _check_root + local type=$1 + shift 1 + modprobe ublk_drv > /dev/null 2>&1 + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" +} + +_remove_test_files() +{ + local files=$* + + for file in ${files}; do + [ -f "${file}" ] && rm -f "${file}" + done +} + +_show_result() +{ + if [ "$UBLK_TEST_SHOW_RESULT" -ne 0 ]; then + if [ "$2" -eq 0 ]; then + echo "$1 : [PASS]" + elif [ "$2" -eq 4 ]; then + echo "$1 : [SKIP]" + else + echo "$1 : [FAIL]" + fi + fi + [ "$2" -ne 0 ] && exit "$2" + return 0 +} + +# don't call from sub-shell, otherwise can't exit +_check_add_dev() +{ + local tid=$1 + local code=$2 + shift 2 + if [ "${code}" -ne 0 ]; then + _remove_test_files "$@" + _show_result "${tid}" "${code}" + fi +} + +_cleanup_test() { + "${UBLK_PROG}" del -a + rm -f "$UBLK_TMP" +} + +_have_feature() +{ + if $UBLK_PROG "features" | grep "$1" > /dev/null 2>&1; then + return 0 + fi + return 1 +} + +_add_ublk_dev() { + local kublk_temp; + local dev_id; + + if [ ! -c /dev/ublk-control ]; then + return ${UBLK_SKIP_CODE} + fi + if echo "$@" | grep -q "\-z"; then + if ! _have_feature "ZERO_COPY"; then + return ${UBLK_SKIP_CODE} + fi + fi + + kublk_temp=$(mktemp /tmp/kublk-XXXXXX) + if ! "${UBLK_PROG}" add "$@" > "${kublk_temp}" 2>&1; then + echo "fail to add ublk dev $*" + rm -f "${kublk_temp}" + return 255 + fi + + dev_id=$(grep "dev id" "${kublk_temp}" | awk -F '[ :]' '{print $3}') + udevadm settle + rm -f "${kublk_temp}" + echo "${dev_id}" +} + +# kill the ublk daemon and return ublk device state +__ublk_kill_daemon() +{ + local dev_id=$1 + local exp_state=$2 + local daemon_pid + local state + + daemon_pid=$(_get_ublk_daemon_pid "${dev_id}") + state=$(_get_ublk_dev_state "${dev_id}") + + for ((j=0;j<50;j++)); do + [ "$state" == "$exp_state" ] && break + kill -9 "$daemon_pid" > /dev/null 2>&1 + sleep 1 + state=$(_get_ublk_dev_state "${dev_id}") + done + echo "$state" +} + +__remove_ublk_dev_return() { + local dev_id=$1 + + ${UBLK_PROG} del -n "${dev_id}" + local res=$? + udevadm settle + return ${res} +} + +__run_io_and_remove() +{ + local dev_id=$1 + local size=$2 + local kill_server=$3 + + fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio \ + --rw=readwrite --iodepth=64 --size="${size}" --numjobs=4 \ + --runtime=20 --time_based > /dev/null 2>&1 & + sleep 2 + if [ "${kill_server}" = "yes" ]; then + local state + state=$(__ublk_kill_daemon "${dev_id}" "DEAD") + if [ "$state" != "DEAD" ]; then + echo "device isn't dead($state) after killing daemon" + return 255 + fi + fi + if ! 
__remove_ublk_dev_return "${dev_id}"; then + echo "delete dev ${dev_id} failed" + return 255 + fi + wait +} + +_ublk_test_top_dir() +{ + cd "$(dirname "$0")" && pwd +} + +UBLK_TMP=$(mktemp ublk_test_XXXXX) +UBLK_PROG=$(_ublk_test_top_dir)/kublk +UBLK_TEST_QUIET=1 +UBLK_TEST_SHOW_RESULT=1 +export UBLK_PROG +export UBLK_TEST_QUIET +export UBLK_TEST_SHOW_RESULT diff --git a/tools/testing/selftests/ublk/test_generic_01.sh b/tools/testing/selftests/ublk/test_generic_01.sh new file mode 100755 index 000000000000..9227a208ba53 --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_01.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="generic_01" +ERR_CODE=0 + +if ! _have_program bpftrace; then + exit "$UBLK_SKIP_CODE" +fi + +_prep_test "null" "sequential io order" + +dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? + +dev_t=$(_get_disk_dev_t "$dev_id") +bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & +btrace_pid=$! +sleep 2 + +if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then + _cleanup_test "null" + exit "$UBLK_SKIP_CODE" +fi + +# run fio over this ublk disk +fio --name=write_seq \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? +kill "$btrace_pid" +wait +if grep -q "io_out_of_order" "$UBLK_TMP"; then + cat "$UBLK_TMP" + ERR_CODE=255 +fi +_cleanup_test "null" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_01.sh b/tools/testing/selftests/ublk/test_loop_01.sh new file mode 100755 index 000000000000..c882d2a08e13 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_01.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_01" +ERR_CODE=0 + +_prep_test "loop" "write and verify test" + +backfile_0=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=16 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_02.sh b/tools/testing/selftests/ublk/test_loop_02.sh new file mode 100755 index 000000000000..03863d825e07 --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_02.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_02" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_03.sh b/tools/testing/selftests/ublk/test_loop_03.sh new file mode 100755 index 000000000000..269c96787d7d --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_03.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_03" +ERR_CODE=0 + +_prep_test "loop" "write and verify over zero copy" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=64 \ + --rw=write \ + --size=256M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_loop_04.sh b/tools/testing/selftests/ublk/test_loop_04.sh new file mode 100755 index 000000000000..1435422c38ec --- /dev/null +++ b/tools/testing/selftests/ublk/test_loop_04.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="loop_04" +ERR_CODE=0 + +_prep_test "loop" "mkfs & mount & umount with zero copy" + +backfile_0=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t loop -z "$backfile_0") +_check_add_dev $TID $? "$backfile_0" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "loop" + +_remove_backfile "$backfile_0" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_01.sh b/tools/testing/selftests/ublk/test_null_01.sh new file mode 100755 index 000000000000..a34203f72668 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_01.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="null_01" +ERR_CODE=0 + +_prep_test "null" "basic IO test" + +dev_id=$(_add_ublk_dev -t null) +_check_add_dev $TID $? + +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "null" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_null_02.sh b/tools/testing/selftests/ublk/test_null_02.sh new file mode 100755 index 000000000000..5633ca876655 --- /dev/null +++ b/tools/testing/selftests/ublk/test_null_02.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="null_02" +ERR_CODE=0 + +_prep_test "null" "basic IO test with zero copy" + +dev_id=$(_add_ublk_dev -t null -z) +_check_add_dev $TID $? + +# run fio over the two disks +fio --name=job1 --filename=/dev/ublkb"${dev_id}" --ioengine=libaio --rw=readwrite --iodepth=32 --size=256M > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "null" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_01.sh b/tools/testing/selftests/ublk/test_stress_01.sh new file mode 100755 index 000000000000..7177f6c57bc5 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_01.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_01" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_remove() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs. remove device(ublk add $*)" + if ! 
__run_io_and_remove "${DEV_ID}" "${size}" "no"; then + echo "/dev/ublkc${DEV_ID} isn't removed" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and remove device" + +ublk_io_and_remove 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_remove 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_remove 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stress_02.sh b/tools/testing/selftests/ublk/test_stress_02.sh new file mode 100755 index 000000000000..2a8e60579a06 --- /dev/null +++ b/tools/testing/selftests/ublk/test_stress_02.sh @@ -0,0 +1,47 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh +TID="stress_02" +ERR_CODE=0 +DEV_ID=-1 + +ublk_io_and_kill_daemon() +{ + local size=$1 + shift 1 + local backfile="" + if echo "$@" | grep -q "loop"; then + backfile=${*: -1} + fi + DEV_ID=$(_add_ublk_dev "$@") + _check_add_dev $TID $? "${backfile}" + + [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "run ublk IO vs kill ublk server(ublk add $*)" + if ! __run_io_and_remove "${DEV_ID}" "${size}" "yes"; then + echo "/dev/ublkc${DEV_ID} isn't removed res ${res}" + _remove_backfile "${backfile}" + exit 255 + fi +} + +_prep_test "stress" "run IO and kill ublk server" + +ublk_io_and_kill_daemon 8G -t null +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +BACK_FILE=$(_create_backfile 256M) +ublk_io_and_kill_daemon 256M -t loop "${BACK_FILE}" +ERR_CODE=$? +if [ ${ERR_CODE} -ne 0 ]; then + _show_result $TID $ERR_CODE +fi + +ublk_io_and_kill_daemon 256M -t loop -z "${BACK_FILE}" +ERR_CODE=$? +_cleanup_test "stress" +_remove_backfile "${BACK_FILE}" +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_01.sh b/tools/testing/selftests/ublk/test_stripe_01.sh new file mode 100755 index 000000000000..c01f3dc325ab --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_01.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_01" +ERR_CODE=0 + +_prep_test "stripe" "write and verify test" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) + +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "${backfile_0}" + +# run fio over the ublk disk +fio --name=write_and_verify \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=libaio --iodepth=32 \ + --rw=write \ + --size=512M \ + --direct=1 \ + --verify=crc32c \ + --do_verify=1 \ + --bs=4k > /dev/null 2>&1 +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/test_stripe_02.sh b/tools/testing/selftests/ublk/test_stripe_02.sh new file mode 100755 index 000000000000..e8a45fa82dde --- /dev/null +++ b/tools/testing/selftests/ublk/test_stripe_02.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. 
"$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +TID="stripe_02" +ERR_CODE=0 + +_prep_test "stripe" "mkfs & mount & umount" + +backfile_0=$(_create_backfile 256M) +backfile_1=$(_create_backfile 256M) +dev_id=$(_add_ublk_dev -t stripe "$backfile_0" "$backfile_1") +_check_add_dev $TID $? "$backfile_0" "$backfile_1" + +_mkfs_mount_test /dev/ublkb"${dev_id}" +ERR_CODE=$? + +_cleanup_test "stripe" + +_remove_backfile "$backfile_0" +_remove_backfile "$backfile_1" + +_show_result $TID $ERR_CODE diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt b/tools/testing/selftests/ublk/trace/seq_io.bt new file mode 100644 index 000000000000..272ac54c9d5f --- /dev/null +++ b/tools/testing/selftests/ublk/trace/seq_io.bt @@ -0,0 +1,25 @@ +/* + $1: dev_t + $2: RWBS + $3: strlen($2) +*/ +BEGIN { + @last_rw[$1, str($2)] = 0; +} +tracepoint:block:block_rq_complete +{ + $dev = $1; + if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { + $last = @last_rw[$dev, str($2)]; + if ((uint64)args.sector != $last) { + printf("io_out_of_order: exp %llu actual %llu\n", + args.sector, $last); + } + @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); + } + @ios = count(); +} + +END { + clear(@last_rw); +} diff --git a/tools/testing/selftests/ublk/ublk_dep.h b/tools/testing/selftests/ublk/ublk_dep.h new file mode 100644 index 000000000000..f68fa7eac939 --- /dev/null +++ b/tools/testing/selftests/ublk/ublk_dep.h @@ -0,0 +1,18 @@ +#ifndef UBLK_DEP_H +#define UBLK_DEP_H + +#ifndef UBLK_U_IO_REGISTER_IO_BUF +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) +#endif + +#ifndef UBLK_F_USER_RECOVERY_FAIL_IO +#define UBLK_F_USER_RECOVERY_FAIL_IO (1ULL << 9) +#endif + +#ifndef UBLK_F_ZONED +#define UBLK_F_ZONED (1ULL << 8) +#endif +#endif |