32 files changed, 845 insertions, 776 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 2a4bd6611692..5d81ad9a3d20 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
 static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
 {
-	unsigned short nr_vecs = bip->bip_max_vcnt - 1;
-	struct bio_vec *copy = &bip->bip_vec[1];
-	size_t bytes = bip->bip_iter.bi_size;
-	struct iov_iter iter;
+	unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
+	struct bio_vec *orig_bvecs = &bip->bip_vec[1];
+	struct bio_vec *bounce_bvec = &bip->bip_vec[0];
+	size_t bytes = bounce_bvec->bv_len;
+	struct iov_iter orig_iter;
 	int ret;
 
-	iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
-	ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+	iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
+	ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
 	WARN_ON_ONCE(ret != bytes);
 
-	bio_integrity_unpin_bvec(copy, nr_vecs, true);
+	bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
 }
 
 /**
@@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
 	return nr_bvecs;
 }
 
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
 	struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
 	struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+	size_t offset, bytes = iter->count;
 	unsigned int direction, nr_bvecs;
-	struct iov_iter iter;
 	int ret, nr_vecs;
-	size_t offset;
 	bool copy;
 
 	if (bio_integrity(bio))
@@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
 	else
 		direction = ITER_SOURCE;
 
-	iov_iter_ubuf(&iter, direction, ubuf, bytes);
-	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+	nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
 	if (nr_vecs > BIO_MAX_VECS)
 		return -E2BIG;
 	if (nr_vecs > UIO_FASTIOV) {
@@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
 		pages = NULL;
 	}
 
-	copy = !iov_iter_is_aligned(&iter, align, align);
-	ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+	copy = !iov_iter_is_aligned(iter, align, align);
+	ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
 	if (unlikely(ret < 0))
 		goto free_bvec;
 
@@ -365,6 +364,55 @@ free_bvec:
 	return ret;
 }
 
+static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
+{
+	struct bio_integrity_payload *bip = bio_integrity(bio);
+
+	if (meta->flags & IO_INTEGRITY_CHK_GUARD)
+		bip->bip_flags |= BIP_CHECK_GUARD;
+	if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
+		bip->bip_flags |= BIP_CHECK_APPTAG;
+	if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
+		bip->bip_flags |= BIP_CHECK_REFTAG;
+
+	bip->app_tag = meta->app_tag;
+}
+
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
+{
+	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+	unsigned int integrity_bytes;
+	int ret;
+	struct iov_iter it;
+
+	if (!bi)
+		return -EINVAL;
+	/*
+	 * original meta iterator can be bigger.
+	 * process integrity info corresponding to current data buffer only.
+	 */
+	it = meta->iter;
+	integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
+	if (it.count < integrity_bytes)
+		return -EINVAL;
+
+	/* should fit into two bytes */
+	BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
+
+	if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
+		return -EINVAL;
+
+	it.count = integrity_bytes;
+	ret = bio_integrity_map_user(bio, &it);
+	if (!ret) {
+		bio_uio_meta_to_bip(bio, meta);
+		bip_set_seed(bio_integrity(bio), meta->seed);
+		iov_iter_advance(&meta->iter, integrity_bytes);
+		meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
+	}
+	return ret;
+}
+
 /**
  * bio_integrity_prep - Prepare bio for integrity I/O
  * @bio:	bio to prepare
@@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
 	if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
 		bip->bip_flags |= BIP_IP_CHECKSUM;
+	/* describe what tags to check in payload */
+	if (bi->csum_type)
+		bip->bip_flags |= BIP_CHECK_GUARD;
+	if (bi->flags & BLK_INTEGRITY_REF_TAG)
+		bip->bip_flags |= BIP_CHECK_REFTAG;
 
 	if (bio_integrity_add_page(bio, virt_to_page(buf), len,
 				   offset_in_page(buf)) < len) {
 		printk(KERN_ERR "could not attach integrity payload\n");
@@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 
 	bip->bip_vec = bip_src->bip_vec;
 	bip->bip_iter = bip_src->bip_iter;
-	bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
+	bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
+	bip->app_tag = bip_src->app_tag;
 
 	return 0;
 }
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 013469faa5e7..a1678f0a9f81 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
 			      ssize_t bytes)
 {
-	int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
+	int ret;
+	struct iov_iter iter;
+	unsigned int direction;
 
+	if (op_is_write(req_op(rq)))
+		direction = ITER_DEST;
+	else
+		direction = ITER_SOURCE;
+	iov_iter_ubuf(&iter, direction, ubuf, bytes);
+	ret = bio_integrity_map_user(rq->bio, &iter);
 	if (ret)
 		return ret;
 
diff --git a/block/fops.c b/block/fops.c
index 13a67940d040..6d5c4fc5a216 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
 	struct bio bio;
 	ssize_t ret;
 
+	WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
@@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
 {
 	struct blkdev_dio *dio = bio->bi_private;
 	bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+	bool is_sync = dio->flags & DIO_IS_SYNC;
 
 	if (bio->bi_status && !dio->bio.bi_status)
 		dio->bio.bi_status = bio->bi_status;
 
+	if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+		bio_integrity_unmap_user(bio);
+
 	if (atomic_dec_and_test(&dio->ref)) {
-		if (!(dio->flags & DIO_IS_SYNC)) {
+		if (!is_sync) {
 			struct kiocb *iocb = dio->iocb;
 			ssize_t ret;
 
@@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			 * a retry of this from blocking context.
 			 */
 			if (unlikely(iov_iter_count(iter))) {
-				bio_release_pages(bio, false);
-				bio_clear_flag(bio, BIO_REFFED);
-				bio_put(bio);
-				blk_finish_plug(&plug);
-				return -EAGAIN;
+				ret = -EAGAIN;
+				goto fail;
 			}
 			bio->bi_opf |= REQ_NOWAIT;
 		}
+		if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+			ret = bio_integrity_map_iter(bio, iocb->private);
+			if (unlikely(ret))
+				goto fail;
+		}
 
 		if (is_read) {
 			if (dio->flags & DIO_SHOULD_DIRTY)
@@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	bio_put(&dio->bio);
 	return ret;
+fail:
+	bio_release_pages(bio, false);
+	bio_clear_flag(bio, BIO_REFFED);
+	bio_put(bio);
+	blk_finish_plug(&plug);
+	return ret;
 }
 
 static void blkdev_bio_end_io_async(struct bio *bio)
@@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
 		ret = blk_status_to_errno(bio->bi_status);
 	}
 
+	if (iocb->ki_flags & IOCB_HAS_METADATA)
+		bio_integrity_unmap_user(bio);
+
 	iocb->ki_complete(iocb, ret);
 
 	if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		bio_iov_bvec_set(bio, iter);
 	} else {
 		ret = bio_iov_iter_get_pages(bio, iter);
-		if (unlikely(ret)) {
-			bio_put(bio);
-			return ret;
-		}
+		if (unlikely(ret))
+			goto out_bio_put;
 	}
 	dio->size = bio->bi_iter.bi_size;
 
@@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		task_io_account_write(bio->bi_iter.bi_size);
 	}
 
+	if (iocb->ki_flags & IOCB_HAS_METADATA) {
+		ret = bio_integrity_map_iter(bio, iocb->private);
+		WRITE_ONCE(iocb->private, NULL);
+		if (unlikely(ret))
+			goto out_bio_put;
+	}
+
 	if (iocb->ki_flags & IOCB_ATOMIC)
 		bio->bi_opf |= REQ_ATOMIC;
 
@@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 		submit_bio(bio);
 	}
 	return -EIOCBQUEUED;
+
+out_bio_put:
+	bio_put(bio);
+	return ret;
 }
 
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2147069775c6..76b615d4d5b9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	return BLK_STS_OK;
 }
 
+static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
+{
+	cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
+	cmnd->rw.lbatm = cpu_to_le16(0xffff);
+}
+
 static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
 			     struct request *req)
 {
@@ -1017,18 +1023,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 			control |= NVME_RW_PRINFO_PRACT;
 		}
 
-		switch (ns->head->pi_type) {
-		case NVME_NS_DPS_PI_TYPE3:
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
 			control |= NVME_RW_PRINFO_PRCHK_GUARD;
-			break;
-		case NVME_NS_DPS_PI_TYPE1:
-		case NVME_NS_DPS_PI_TYPE2:
-			control |= NVME_RW_PRINFO_PRCHK_GUARD |
-					NVME_RW_PRINFO_PRCHK_REF;
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
+			control |= NVME_RW_PRINFO_PRCHK_REF;
 			if (op == nvme_cmd_zone_append)
 				control |= NVME_RW_APPEND_PIREMAP;
 			nvme_set_ref_tag(ns, cmnd, req);
-			break;
+		}
+		if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
+			control |= NVME_RW_PRINFO_PRCHK_APP;
+			nvme_set_app_tag(req, cmnd);
 		}
 	}
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a48c4d5edfa3..950d8c9fb884 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -809,14 +809,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
 		if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
 			scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
-		if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+		if (bio_integrity_flagged(bio, BIP_CHECK_GUARD))
 			scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
 	}
 
 	if (dif != T10_PI_TYPE3_PROTECTION) {	/* DIX/DIF Type 0, 1, 2 */
 		scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
 
-		if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+		if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG))
 			scmd->prot_flags |= SCSI_PROT_REF_CHECK;
 	}
 
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index dbf0f74c1529..802f52e38efd 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -7,10 +7,12 @@ enum bip_flags {
 	BIP_BLOCK_INTEGRITY	= 1 << 0, /* block layer owns integrity data */
 	BIP_MAPPED_INTEGRITY	= 1 << 1, /* ref tag has been remapped */
-	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
-	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
-	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
-	BIP_COPY_USER		= 1 << 5, /* Kernel bounce buffer in use */
+	BIP_DISK_NOCHECK	= 1 << 2, /* disable disk integrity checking */
+	BIP_IP_CHECKSUM		= 1 << 3, /* IP checksum */
+	BIP_COPY_USER		= 1 << 4, /* Kernel bounce buffer in use */
+	BIP_CHECK_GUARD		= 1 << 5, /* guard check */
+	BIP_CHECK_REFTAG	= 1 << 6, /* reftag check */
+	BIP_CHECK_APPTAG	= 1 << 7, /* apptag check */
 };
 
 struct bio_integrity_payload {
@@ -21,6 +23,7 @@ struct bio_integrity_payload {
 	unsigned short		bip_vcnt;	/* # of integrity bio_vecs */
 	unsigned short		bip_max_vcnt;	/* integrity bio_vec slots */
 	unsigned short		bip_flags;	/* control flags */
+	u16			app_tag;	/* application tag value */
 
 	struct bvec_iter	bio_iter;	/* for rewinding parent bio */
 
@@ -30,6 +33,9 @@ struct bio_integrity_payload {
 	struct bio_vec		bip_inline_vecs[];/* embedded bvec array */
 };
 
+#define BIP_CLONE_FLAGS	(BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \
+			 BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 
 #define bip_for_each_vec(bvl, bip, iter)	\
@@ -72,7 +78,8 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
 						  unsigned int nr);
 int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
 			   unsigned int offset);
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter);
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta);
 void bio_integrity_unmap_user(struct bio *bio);
 bool bio_integrity_prep(struct bio *bio);
 void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
@@ -98,8 +105,12 @@ static inline void bioset_integrity_free(struct bio_set *bs)
 {
 }
 
-static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
-					 ssize_t len)
+static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
+{
+	return -EINVAL;
+}
+
+static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b8447f53200..a4af70367f8a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -349,6 +349,7 @@ struct readahead_control;
 #define IOCB_DIO_CALLER_COMP	(1 << 22)
 /* kiocb is a read or write operation submitted by fs/aio.c. */
 #define IOCB_AIO_RW		(1 << 23)
+#define IOCB_HAS_METADATA	(1 << 24)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fd4cdb0860a2..623d8e798a11 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -78,8 +78,9 @@ struct io_hash_table {
 
 struct io_mapped_region {
 	struct page		**pages;
-	void			*vmap_ptr;
-	size_t			nr_pages;
+	void			*ptr;
+	unsigned		nr_pages;
+	unsigned		flags;
 };
 
 /*
@@ -293,6 +294,11 @@ struct io_ring_ctx {
 
 		struct io_submit_state	submit_state;
 
+		/*
+		 * Modifications are protected by ->uring_lock and ->mmap_lock.
+		 * The flags, buf_pages and buf_nr_pages fields should be stable
+		 * once published.
+		 */
 		struct xarray		io_bl_xa;
 
 		struct io_hash_table	cancel_table;
@@ -424,17 +430,10 @@ struct io_ring_ctx {
 	 * side will need to grab this lock, to prevent either side from
 	 * being run concurrently with the other.
	 */
-	struct mutex			resize_lock;
-
-	/*
-	 * If IORING_SETUP_NO_MMAP is used, then the below holds
-	 * the gup'ed pages for the two rings, and the sqes.
-	 */
-	unsigned short			n_ring_pages;
-	unsigned short			n_sqe_pages;
-	struct page			**ring_pages;
-	struct page			**sqe_pages;
+	struct mutex			mmap_lock;
+
+	struct io_mapped_region		sq_region;
+	struct io_mapped_region		ring_region;
 
 	/* used for optimised request parameter and wait argument passing */
 	struct io_mapped_region		param_region;
 };
@@ -481,6 +480,7 @@ enum {
 	REQ_F_BL_NO_RECYCLE_BIT,
 	REQ_F_BUFFERS_COMMIT_BIT,
 	REQ_F_BUF_NODE_BIT,
+	REQ_F_HAS_METADATA_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -561,6 +561,8 @@ enum {
 	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
 	/* buf node is valid */
 	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
+	/* request has read/write metadata assigned */
+	REQ_F_HAS_METADATA	= IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 853f9de5aa05..8ada84e85447 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -82,6 +82,15 @@ struct iov_iter {
 	};
 };
 
+typedef __u16 uio_meta_flags_t;
+
+struct uio_meta {
+	uio_meta_flags_t	flags;
+	u16			app_tag;
+	u64			seed;
+	struct iov_iter		iter;
+};
+
 static inline const struct iovec *iter_iov(const struct iov_iter *iter)
 {
 	if (iter->iter_type == ITER_UBUF)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 56a4f93a08f4..2bbe00cf1248 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -40,6 +40,15 @@
 #define BLOCK_SIZE_BITS 10
 #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
 
+/* flags for integrity meta */
+#define IO_INTEGRITY_CHK_GUARD		(1U << 0) /* enforce guard check */
+#define IO_INTEGRITY_CHK_REFTAG		(1U << 1) /* enforce ref check */
+#define IO_INTEGRITY_CHK_APPTAG		(1U << 2) /* enforce app check */
+
+#define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \
+				  IO_INTEGRITY_CHK_REFTAG | \
+				  IO_INTEGRITY_CHK_APPTAG)
+
 #define SEEK_SET	0	/* seek relative to beginning of file */
 #define SEEK_CUR	1	/* seek relative to current file position */
 #define SEEK_END	2	/* seek relative to end of file */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..e11c82638527 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@ struct io_uring_sqe {
 			__u64	addr3;
 			__u64	__pad2[1];
 		};
+		struct {
+			__u64	attr_ptr; /* pointer to attribute information */
+			__u64	attr_type_mask; /* bit mask of attributes */
+		};
 		__u64	optval;
 		/*
 		 * If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +111,18 @@ struct io_uring_sqe {
 	};
 };
 
+/* sqe->attr_type_mask flags */
+#define IORING_RW_ATTR_FLAG_PI	(1U << 0)
+/* PI attribute information */
+struct io_uring_attr_pi {
+	__u16	flags;
+	__u16	app_tag;
+	__u32	len;
+	__u64	addr;
+	__u64	seed;
+	__u64	rsvd;
+};
+
 /*
  * If sqe->file_index is set to this for opcodes that instantiate a new
  * direct descriptor (like openat/openat2/accept), then io_uring will allocate
@@ -561,6 +577,7 @@ struct io_uring_params {
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
 #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
 #define IORING_FEAT_MIN_TIMEOUT		(1U << 15)
+#define IORING_FEAT_RW_ATTR		(1U << 16)
 
 /*
  * io_uring_register(2) opcodes and arguments
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index b7a38a2069cf..a3a8cfec32ce 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
 	return NULL;
 }
 
+static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp,
+				   void (*init_once)(void *obj))
+{
+	if (unlikely(!cache->nr_cached)) {
+		void *obj = kmalloc(cache->elem_size, gfp);
+
+		if (obj && init_once)
+			init_once(obj);
+		return obj;
+	}
+	return io_alloc_cache_get(cache);
+}
+
 /* returns false if the cache was initialized properly */
 static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
 				       unsigned max_nr, size_t size)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b214e5a407b5..f60d0a9d505e 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -211,10 +211,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 		if (ctx->file_table.data.nodes[i])
 			f = io_slot_file(ctx->file_table.data.nodes[i]);
-		if (f)
-			seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
-		else
-			seq_printf(m, "%5u: <none>\n", i);
+		if (f) {
+			seq_printf(m, "%5u: ", i);
+			seq_file_path(m, f, " \t\n\\");
+			seq_puts(m, "\n");
+		}
 	}
 	seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
 	for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
diff --git a/io_uring/futex.c b/io_uring/futex.c
index e29662f039e1..30139cc150f2 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
 	io_req_task_work_add(req);
 }
 
-static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
-{
-	struct io_futex_data *ifd;
-
-	ifd = io_alloc_cache_get(&ctx->futex_cache);
-	if (ifd)
-		return ifd;
-
-	return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
-}
-
 int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
@@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	io_ring_submit_lock(ctx, issue_flags);
-	ifd = io_alloc_ifd(ctx);
+	ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL);
 	if (!ifd) {
 		ret = -ENOMEM;
 		goto done_unlock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4758f1ba902b..7bfbc7c22367 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,7 @@
 				REQ_F_ASYNC_DATA)
 
 #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
-				 IO_REQ_CLEAN_FLAGS)
+				 REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
 
 #define IO_TCTX_REFS_CACHE_NR	(1U << 10)
 
@@ -143,7 +143,8 @@ struct io_defer_entry {
 
 static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 struct io_uring_task *tctx,
-					 bool cancel_all);
+					 bool cancel_all,
+					 bool is_sqpoll_thread);
 
 static void io_queue_sqe(struct io_kiocb *req);
 
@@ -350,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	io_napi_init(ctx);
-	mutex_init(&ctx->resize_lock);
+	mutex_init(&ctx->mmap_lock);
 
 	return ctx;
 
@@ -361,7 +362,7 @@ err:
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_alloc_cache_free(&ctx->uring_cache, kfree);
-	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+	io_alloc_cache_free(&ctx->msg_cache, kfree);
 	io_futex_cache_free(ctx);
 	kvfree(ctx->cancel_table.hbs);
 	xa_destroy(&ctx->io_bl_xa);
@@ -550,8 +551,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
 	io_req_task_work_add(req);
 }
 
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
 {
+	spin_lock(&ctx->completion_lock);
 	while (!list_empty(&ctx->defer_list)) {
 		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 						struct io_defer_entry, list);
@@ -562,6 +564,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 		io_req_task_queue(de->req);
 		kfree(de);
 	}
+	spin_unlock(&ctx->completion_lock);
 }
 
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -570,11 +573,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 		io_poll_wq_wake(ctx);
 	if (ctx->off_timeout_used)
 		io_flush_timeouts(ctx);
-	if (ctx->drain_active) {
-		spin_lock(&ctx->completion_lock);
+	if (ctx->drain_active)
 		io_queue_deferred(ctx);
-		spin_unlock(&ctx->completion_lock);
-	}
 	if (ctx->has_evfd)
 		io_eventfd_flush_signal(ctx);
 }
@@ -1401,6 +1401,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 						    comp_list);
 
 		if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+			if (req->flags & REQ_F_REISSUE) {
+				node = req->comp_list.next;
+				req->flags &= ~REQ_F_REISSUE;
+				io_queue_iowq(req);
+				continue;
+			}
 			if (req->flags & REQ_F_REFCOUNT) {
 				node = req->comp_list.next;
 				if (!req_ref_put_and_test(req))
@@ -1440,7 +1446,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 					    comp_list);
 
-		if (!(req->flags & REQ_F_CQE_SKIP) &&
+		/*
+		 * Requests marked with REQUEUE should not post a CQE, they
+		 * will go through the io-wq retry machinery and post one
+		 * later.
+		 */
+		if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
 		    unlikely(!io_fill_cqe_req(ctx, req))) {
 			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
@@ -1640,19 +1651,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
 	return res;
 }
 
-bool io_alloc_async_data(struct io_kiocb *req)
-{
-	const struct io_issue_def *def = &io_issue_defs[req->opcode];
-
-	WARN_ON_ONCE(!def->async_size);
-	req->async_data = kmalloc(def->async_size, GFP_KERNEL);
-	if (req->async_data) {
-		req->flags |= REQ_F_ASYNC_DATA;
-		return false;
-	}
-	return true;
-}
-
 static u32 io_get_sequence(struct io_kiocb *req)
 {
 	u32 seq = req->ctx->cached_sq_head;
@@ -2631,36 +2629,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
-			  size_t size)
-{
-	return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
-				size);
-}
-
-static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
-			 size_t size)
-{
-	return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
-				size);
-}
-
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
-	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
-				true);
-		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
-				true);
-	} else {
-		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
-		ctx->n_ring_pages = 0;
-		io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
-		ctx->n_sqe_pages = 0;
-		vunmap(ctx->rings);
-		vunmap(ctx->sq_sqes);
-	}
-
+	io_free_region(ctx, &ctx->sq_region);
+	io_free_region(ctx, &ctx->ring_region);
 	ctx->rings = NULL;
 	ctx->sq_sqes = NULL;
 }
@@ -2732,7 +2704,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_alloc_cache_free(&ctx->uring_cache, kfree);
-	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+	io_alloc_cache_free(&ctx->msg_cache, kfree);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	io_free_region(ctx, &ctx->param_region);
@@ -2894,7 +2866,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
 			io_move_task_work_from_local(ctx);
 
-		while (io_uring_try_cancel_requests(ctx, NULL, true))
+		/* The SQPOLL thread never reaches this path */
+		while (io_uring_try_cancel_requests(ctx, NULL, true, false))
 			cond_resched();
 
 		if (ctx->sq_data) {
@@ -3062,7 +3035,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
 
 static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 						struct io_uring_task *tctx,
-						bool cancel_all)
+						bool cancel_all,
+						bool is_sqpoll_thread)
 {
 	struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
 	enum io_wq_cancel cret;
@@ -3092,7 +3066,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 
 	/* SQPOLL thread does its own polling */
 	if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
-	    (ctx->sq_data && ctx->sq_data->thread == current)) {
+	    is_sqpoll_thread) {
 		while (!wq_list_empty(&ctx->iopoll_list)) {
 			io_iopoll_try_reap_events(ctx);
 			ret = true;
@@ -3165,13 +3139,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 				continue;
 			loop |= io_uring_try_cancel_requests(node->ctx,
 							current->io_uring,
-							cancel_all);
+							cancel_all,
+							false);
 		}
 	} else {
 		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
 			loop |= io_uring_try_cancel_requests(ctx,
 							current->io_uring,
-							cancel_all);
+							cancel_all,
+							true);
 	}
 
 	if (loop) {
@@ -3233,6 +3209,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 		     end > ctx->cq_wait_size))
 		return ERR_PTR(-EFAULT);
 
+	offset = array_index_nospec(offset, ctx->cq_wait_size - size);
 	return ctx->cq_wait_arg + offset;
 }
 
@@ -3477,9 +3454,10 @@ bool io_is_uring_fops(struct file *file)
 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 					 struct io_uring_params *p)
 {
+	struct io_uring_region_desc rd;
 	struct io_rings *rings;
 	size_t size, sq_array_offset;
-	void *ptr;
+	int ret;
 
 	/* make sure these are sane, as we already accounted them */
 	ctx->sq_entries = p->sq_entries;
@@ -3490,15 +3468,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
-	else
-		rings = io_rings_map(ctx, p->cq_off.user_addr, size);
-
-	if (IS_ERR(rings))
-		return PTR_ERR(rings);
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (ctx->flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p->cq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+	if (ret)
+		return ret;
+	ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
 
-	ctx->rings = rings;
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
 	rings->sq_ring_mask = p->sq_entries - 1;
@@ -3515,17 +3495,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 		return -EOVERFLOW;
 	}
 
-	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
-		ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
-	else
-		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
-
-	if (IS_ERR(ptr)) {
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (ctx->flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p->sq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
+	if (ret) {
 		io_rings_free(ctx);
-		return PTR_ERR(ptr);
+		return ret;
 	}
-
-	ctx->sq_sqes = ptr;
+	ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
 	return 0;
 }
 
@@ -3733,7 +3714,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
 			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
-			IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
+			IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
+			IORING_FEAT_RW_ATTR;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
@@ -3894,6 +3876,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+	BUILD_BUG_SQE_ELEM(48, __u64,  attr_ptr);
+	BUILD_BUG_SQE_ELEM(56, __u64,  attr_type_mask);
 	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 492cbbf2c23b..f65e3f3ede51 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -8,9 +8,11 @@
 #include <linux/poll.h>
 #include <linux/io_uring_types.h>
 #include <uapi/linux/eventpoll.h>
+#include "alloc_cache.h"
 #include "io-wq.h"
 #include "slist.h"
 #include "filetable.h"
+#include "opdef.h"
 
 #ifndef CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -223,6 +225,27 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
 	req->cqe.flags = cflags;
 }
 
+static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
+					      struct io_kiocb *req,
+					      void (*init_once)(void *obj))
+{
+	req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
+	if (req->async_data)
+		req->flags |= REQ_F_ASYNC_DATA;
+	return req->async_data;
+}
+
+static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req)
+{
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
+
+	WARN_ON_ONCE(!def->async_size);
+	req->async_data = kmalloc(def->async_size, GFP_KERNEL);
+	if (req->async_data)
+		req->flags |= REQ_F_ASYNC_DATA;
+	return req->async_data;
+}
+
 static inline bool req_has_async_data(struct io_kiocb *req)
 {
 	return req->flags & REQ_F_ASYNC_DATA;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index eec5eb7de843..04bf493eecae 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
 	/*
 	 * Store buffer group ID and finally mark the list as visible.
 	 * The normal lookup doesn't care about the visibility as we're
-	 * always under the ->uring_lock, but the RCU lookup from mmap does.
+	 * always under the ->uring_lock, but lookups from mmap do.
 	 */
 	bl->bgid = bgid;
-	atomic_set(&bl->refs, 1);
+	guard(mutex)(&ctx->mmap_lock);
 	return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
 }
 
@@ -353,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (bl->flags & IOBL_BUF_RING) {
 		i = bl->buf_ring->tail - bl->head;
-		if (bl->buf_nr_pages) {
-			int j;
-
-			if (!(bl->flags & IOBL_MMAP)) {
-				for (j = 0; j < bl->buf_nr_pages; j++)
-					unpin_user_page(bl->buf_pages[j]);
-			}
-			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
-					&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
-			bl->flags &= ~IOBL_MMAP;
-		}
+		io_free_region(ctx, &bl->region);
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
 		bl->flags &= ~IOBL_BUF_RING;
@@ -386,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	return i;
 }
 
-void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
 {
-	if (atomic_dec_and_test(&bl->refs)) {
-		__io_remove_buffers(ctx, bl, -1U);
-		kfree_rcu(bl, rcu);
-	}
+	__io_remove_buffers(ctx, bl, -1U);
+	kfree(bl);
 }
 
 void io_destroy_buffers(struct io_ring_ctx *ctx)
@@ -399,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 	struct io_buffer_list *bl;
 	struct list_head *item, *tmp;
 	struct io_buffer *buf;
-	unsigned long index;
 
-	xa_for_each(&ctx->io_bl_xa, index, bl) {
-		xa_erase(&ctx->io_bl_xa, bl->bgid);
+	while (1) {
+		unsigned long index = 0;
+
+		scoped_guard(mutex, &ctx->mmap_lock) {
+			bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
+			if (bl)
+				xa_erase(&ctx->io_bl_xa, bl->bgid);
+		}
+		if (!bl)
+			break;
 		io_put_bl(ctx, bl);
 	}
 
@@ -591,11 +586,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		INIT_LIST_HEAD(&bl->buf_list);
 		ret = io_buffer_add_list(ctx, bl, p->bgid);
 		if (ret) {
-			/*
-			 * Doesn't need rcu free as it was never visible, but
-			 * let's keep it consistent throughout.
-			 */
-			kfree_rcu(bl, rcu);
+			kfree(bl);
 			goto err;
 		}
 	}
@@ -615,75 +606,14 @@ err:
 	return IOU_OK;
 }
 
-static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
-			    struct io_buffer_list *bl)
-{
-	struct io_uring_buf_ring *br = NULL;
-	struct page **pages;
-	int nr_pages, ret;
-
-	pages = io_pin_pages(reg->ring_addr,
-			     flex_array_size(br, bufs, reg->ring_entries),
-			     &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
-	br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (!br) {
-		ret = -ENOMEM;
-		goto error_unpin;
-	}
-
-#ifdef SHM_COLOUR
-	/*
-	 * On platforms that have specific aliasing requirements, SHM_COLOUR
-	 * is set and we must guarantee that the kernel and user side align
-	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
-	 * the application mmap's the provided ring buffer. Fail the request
-	 * if we, by chance, don't end up with aligned addresses. The app
-	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
-	 * this transparently.
-	 */
-	if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
-		ret = -EINVAL;
-		goto error_unpin;
-	}
-#endif
-	bl->buf_pages = pages;
-	bl->buf_nr_pages = nr_pages;
-	bl->buf_ring = br;
-	bl->flags |= IOBL_BUF_RING;
-	bl->flags &= ~IOBL_MMAP;
-	return 0;
-error_unpin:
-	unpin_user_pages(pages, nr_pages);
-	kvfree(pages);
-	vunmap(br);
-	return ret;
-}
-
-static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
-			      struct io_uring_buf_reg *reg,
-			      struct io_buffer_list *bl)
-{
-	size_t ring_size;
-
-	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-
-	bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
-	if (IS_ERR(bl->buf_ring)) {
-		bl->buf_ring = NULL;
-		return -ENOMEM;
-	}
-
-	bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
-	return 0;
-}
-
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 {
 	struct io_uring_buf_reg reg;
 	struct io_buffer_list *bl, *free_bl = NULL;
+	struct io_uring_region_desc rd;
+	struct io_uring_buf_ring *br;
+	unsigned long mmap_offset;
+	unsigned long ring_size;
 	int ret;
 
 	lockdep_assert_held(&ctx->uring_lock);
@@ -695,19 +625,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
 		return -EINVAL;
-	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
-		if (!reg.ring_addr)
-			return -EFAULT;
-		if (reg.ring_addr & ~PAGE_MASK)
-			return -EINVAL;
-	} else {
-		if (reg.ring_addr)
-			return -EINVAL;
-	}
-
 	if (!is_power_of_2(reg.ring_entries))
 		return -EINVAL;
-
 	/* cannot disambiguate full vs empty due to head/tail size */
 	if (reg.ring_entries >= 65536)
 		return -EINVAL;
@@ -723,22 +642,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 			return -ENOMEM;
 	}
 
-	if (!(reg.flags & IOU_PBUF_RING_MMAP))
-		ret = io_pin_pbuf_ring(&reg, bl);
-	else
-		ret = io_alloc_pbuf_ring(ctx, &reg, bl);
+	mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
+	ring_size = flex_array_size(br, bufs, reg.ring_entries);
 
-	if (!ret) {
-		bl->nr_entries = reg.ring_entries;
-		bl->mask = reg.ring_entries - 1;
-		if (reg.flags & IOU_PBUF_RING_INC)
-			bl->flags |= IOBL_INC;
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(ring_size);
+	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+		rd.user_addr = reg.ring_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+	if (ret)
+		goto fail;
+	br = io_region_get_ptr(&bl->region);
 
+#ifdef SHM_COLOUR
+	/*
+	 * On platforms that have specific aliasing requirements, SHM_COLOUR
+	 * is set and we must guarantee that the kernel and user side align
+	 * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
+	 * the application mmap's the provided ring buffer. Fail the request
+	 * if we, by chance, don't end up with aligned addresses. The app
+	 * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+	 * this transparently.
+	 */
+	if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
+	    ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
+		ret = -EINVAL;
+		goto fail;
 	}
+#endif
 
-	kfree_rcu(free_bl, rcu);
+	bl->nr_entries = reg.ring_entries;
+	bl->mask = reg.ring_entries - 1;
+	bl->flags |= IOBL_BUF_RING;
+	bl->buf_ring = br;
+	if (reg.flags & IOU_PBUF_RING_INC)
+		bl->flags |= IOBL_INC;
+	io_buffer_add_list(ctx, bl, reg.bgid);
+	return 0;
+fail:
+	io_free_region(ctx, &bl->region);
+	kfree(free_bl);
 	return ret;
 }
 
@@ -762,7 +707,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!(bl->flags & IOBL_BUF_RING))
 		return -EINVAL;
 
-	xa_erase(&ctx->io_bl_xa, bl->bgid);
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->io_bl_xa, bl->bgid);
+
 	io_put_bl(ctx, bl);
 	return 0;
 }
@@ -793,50 +740,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 	return 0;
 }
 
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-				      unsigned long bgid)
-{
-	struct io_buffer_list *bl;
-	bool ret;
-
-	/*
-	 * We have to be a bit careful here - we're inside mmap and cannot grab
-	 * the uring_lock. This means the buffer_list could be simultaneously
-	 * going away, if someone is trying to be sneaky. Look it up under rcu
-	 * so we know it's not going away, and attempt to grab a reference to
-	 * it. If the ref is already zero, then fail the mapping. If successful,
-	 * the caller will call io_put_bl() to drop the the reference at at the
-	 * end. This may then safely free the buffer_list (and drop the pages)
-	 * at that point, vm_insert_pages() would've already grabbed the
-	 * necessary vma references.
-	 */
-	rcu_read_lock();
-	bl = xa_load(&ctx->io_bl_xa, bgid);
-	/* must be a mmap'able buffer ring and have pages */
-	ret = false;
-	if (bl && bl->flags & IOBL_MMAP)
-		ret = atomic_inc_not_zero(&bl->refs);
-	rcu_read_unlock();
-
-	if (ret)
-		return bl;
-
-	return ERR_PTR(-EINVAL);
-}
-
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+					    unsigned int bgid)
 {
-	struct io_ring_ctx *ctx = file->private_data;
-	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
 	struct io_buffer_list *bl;
-	int bgid, ret;
 
-	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-	bl = io_pbuf_get_bl(ctx, bgid);
-	if (IS_ERR(bl))
-		return PTR_ERR(bl);
+	lockdep_assert_held(&ctx->mmap_lock);
 
-	ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
-	io_put_bl(ctx, bl);
-	return ret;
+	bl = xa_load(&ctx->io_bl_xa, bgid);
+	if (!bl || !(bl->flags & IOBL_BUF_RING))
+		return NULL;
+	return &bl->region;
 }
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..bd80c44c5af1 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -3,15 +3,13 @@
 #define IOU_KBUF_H
 
 #include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
 
 enum {
 	/* ring mapped provided buffers */
 	IOBL_BUF_RING	= 1,
-	/* ring mapped provided buffers, but mmap'ed by application */
-	IOBL_MMAP	= 2,
 	/* buffers are consumed incrementally rather than always fully */
-	IOBL_INC	= 4,
-
+	IOBL_INC	= 2,
 };
 
 struct io_buffer_list {
@@ -21,11 +19,7 @@ struct io_buffer_list {
 	 */
 	union {
 		struct list_head buf_list;
-		struct {
-			struct page **buf_pages;
-			struct io_uring_buf_ring *buf_ring;
-		};
-		struct rcu_head rcu;
+		struct io_uring_buf_ring *buf_ring;
 	};
 	__u16 bgid;
@@ -37,7 +31,7 @@ struct io_buffer_list {
 
 	__u16 flags;
 
-	atomic_t refs;
+	struct io_mapped_region region;
 };
 
 struct io_buffer {
@@ -84,10 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 
-void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
-				      unsigned long bgid);
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+					    unsigned int bgid);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 57de9bccbf50..dda846190fbd 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -36,102 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 	return page_address(page);
 }
 
-static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
-				 gfp_t gfp)
-{
-	void *ret;
-	int i;
-
-	for (i = 0; i < nr_pages; i++) {
-		pages[i] = alloc_page(gfp);
-		if (!pages[i])
-			goto err;
-	}
-
-	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (ret)
-		return ret;
-err:
-	while (i--)
-		put_page(pages[i]);
-	return ERR_PTR(-ENOMEM);
-}
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-		   size_t size)
-{
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
-	struct page **pages;
-	int nr_pages;
-	void *ret;
-
-	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
-	if (!IS_ERR(ret))
-		goto done;
-	if (nr_pages == 1)
-		goto fail;
-
-	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
-	if (!IS_ERR(ret)) {
-done:
-		*out_pages = pages;
-		*npages = nr_pages;
-		return ret;
-	}
-fail:
-	kvfree(pages);
-	*out_pages = NULL;
-	*npages = 0;
-	return ret;
-}
-
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-		    bool put_pages)
-{
-	bool do_vunmap = false;
-
-	if (!ptr)
-		return;
-
-	if (put_pages && *npages) {
-		struct page **to_free = *pages;
-		int i;
-
-		/*
-		 * Only did vmap for the non-compound multiple page case.
-		 * For the compound page, we just need to put the head.
-		 */
-		if (PageCompound(to_free[0]))
-			*npages = 1;
-		else if (*npages > 1)
-			do_vunmap = true;
-		for (i = 0; i < *npages; i++)
-			put_page(to_free[i]);
-	}
-	if (do_vunmap)
-		vunmap(ptr);
-	kvfree(*pages);
-	*pages = NULL;
-	*npages = 0;
-}
-
-void io_pages_free(struct page ***pages, int npages)
-{
-	struct page **page_array = *pages;
-
-	if (!page_array)
-		return;
-
-	unpin_user_pages(page_array, npages);
-	kvfree(page_array);
-	*pages = NULL;
-}
-
 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 {
 	unsigned long start, end, nr_pages;
@@ -174,64 +78,127 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
 	return ERR_PTR(ret);
 }
 
-void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
-		     unsigned long uaddr, size_t size)
+enum {
+	/* memory was vmap'ed for the kernel, freeing the region vunmap's it */
+	IO_REGION_F_VMAP			= 1,
+	/* memory is provided by user and pinned by the kernel */
+	IO_REGION_F_USER_PROVIDED		= 2,
+	/* only the first page in the array is ref'ed */
+	IO_REGION_F_SINGLE_REF			= 4,
+};
+
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
 {
-	struct page **page_array;
-	unsigned int nr_pages;
-	void *page_addr;
+	if (mr->pages) {
+		long nr_refs = mr->nr_pages;
 
-	*npages = 0;
+		if (mr->flags & IO_REGION_F_SINGLE_REF)
+			nr_refs = 1;
 
-	if (uaddr & (PAGE_SIZE - 1) || !size)
-		return ERR_PTR(-EINVAL);
+		if (mr->flags & IO_REGION_F_USER_PROVIDED)
+			unpin_user_pages(mr->pages, nr_refs);
+		else
+			release_pages(mr->pages, nr_refs);
 
-	nr_pages = 0;
-	page_array = io_pin_pages(uaddr, size, &nr_pages);
-	if (IS_ERR(page_array))
-		return page_array;
+		kvfree(mr->pages);
+	}
+	if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
+		vunmap(mr->ptr);
+	if (mr->nr_pages && ctx->user)
+		__io_unaccount_mem(ctx->user, mr->nr_pages);
 
-	page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (page_addr) {
-		*pages = page_array;
-		*npages = nr_pages;
-		return page_addr;
+	memset(mr, 0, sizeof(*mr));
+}
+
+static int io_region_init_ptr(struct io_mapped_region *mr)
+{
+	struct io_imu_folio_data ifd;
+	void *ptr;
+
+	if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
+		if (ifd.nr_folios == 1) {
+			mr->ptr = page_address(mr->pages[0]);
+			return 0;
+		}
 	}
+	ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
 
-	io_pages_free(&page_array, nr_pages);
-	return ERR_PTR(-ENOMEM);
+	mr->ptr = ptr;
+	mr->flags |= IO_REGION_F_VMAP;
+	return 0;
 }
 
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+static int io_region_pin_pages(struct io_ring_ctx *ctx,
+			       struct io_mapped_region *mr,
+			       struct io_uring_region_desc *reg)
 {
-	if (mr->pages) {
-		unpin_user_pages(mr->pages, mr->nr_pages);
-		kvfree(mr->pages);
+	unsigned long size = mr->nr_pages << PAGE_SHIFT;
+	struct page **pages;
+	int nr_pages;
+
+	pages = io_pin_pages(reg->user_addr, size, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
+		return -EFAULT;
+
+	mr->pages = pages;
+	mr->flags |= IO_REGION_F_USER_PROVIDED;
+	return 0;
+}
+
+static int io_region_allocate_pages(struct io_ring_ctx *ctx,
+				    struct io_mapped_region *mr,
+				    struct io_uring_region_desc *reg,
+				    unsigned long mmap_offset)
+{
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+	unsigned long size = mr->nr_pages << PAGE_SHIFT;
+	unsigned long nr_allocated;
+	struct page **pages;
+	void *p;
+
+	pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
+	if (!pages)
+		return -ENOMEM;
+
+	p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
+	if (!IS_ERR(p)) {
+		mr->flags |= IO_REGION_F_SINGLE_REF;
+		goto done;
 	}
-	if (mr->vmap_ptr)
-		vunmap(mr->vmap_ptr);
-	if (mr->nr_pages && ctx->user)
-		__io_unaccount_mem(ctx->user, mr->nr_pages);
 
-	memset(mr, 0, sizeof(*mr));
+	nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE,
+						   mr->nr_pages, pages);
+	if (nr_allocated != mr->nr_pages) {
+		if (nr_allocated)
+			release_pages(pages, nr_allocated);
+		kvfree(pages);
+		return -ENOMEM;
+	}
+done:
+	reg->mmap_offset = mmap_offset;
+	mr->pages = pages;
+	return 0;
 }
 
 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
-		     struct io_uring_region_desc *reg)
+		     struct io_uring_region_desc *reg,
+		     unsigned long mmap_offset)
 {
-	int pages_accounted = 0;
-	struct page **pages;
 	int nr_pages, ret;
-	void *vptr;
 	u64 end;
 
-	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+	if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
 		return -EFAULT;
 	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
 		return -EINVAL;
-	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+	if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
 		return -EINVAL;
-	if (!reg->user_addr)
+	/* user_addr should be set IFF it's a user memory backed region */
+	if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
 		return -EFAULT;
 	if (!reg->size || reg->mmap_offset || reg->id)
 		return -EINVAL;
@@ -242,94 +209,120 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
 	if (check_add_overflow(reg->user_addr, reg->size, &end))
 		return -EOVERFLOW;
 
-	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-
+	nr_pages = reg->size >> PAGE_SHIFT;
 	if (ctx->user) {
 		ret = __io_account_mem(ctx->user, nr_pages);
 		if (ret)
-			goto out_free;
-		pages_accounted = nr_pages;
+			return ret;
 	}
+	mr->nr_pages = nr_pages;
 
-	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
-	if (!vptr) {
-		ret = -ENOMEM;
+	if (reg->flags & IORING_MEM_REGION_TYPE_USER)
+		ret = io_region_pin_pages(ctx, mr, reg);
+	else
+		ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
+	if (ret)
 		goto out_free;
-	}
 
-	mr->pages = pages;
-	mr->vmap_ptr = vptr;
-	mr->nr_pages = nr_pages;
+	ret = io_region_init_ptr(mr);
+	if (ret)
+		goto out_free;
 	return 0;
 out_free:
-	if (pages_accounted)
-		__io_unaccount_mem(ctx->user, pages_accounted);
-	io_pages_free(&pages, nr_pages);
+	io_free_region(ctx, mr);
 	return ret;
 }
 
-static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
-					    size_t sz)
+int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+			       struct io_uring_region_desc *reg,
+			       unsigned long mmap_offset)
+{
+	struct io_mapped_region tmp_mr;
+	int ret;
+
+	memcpy(&tmp_mr, mr, sizeof(tmp_mr));
+	ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset);
+	if (ret)
+		return ret;
+
+	/*
+	 * Once published mmap can find it without holding only the ->mmap_lock
+	 * and not ->uring_lock.
+	 */
+	guard(mutex)(&ctx->mmap_lock);
+	memcpy(mr, &tmp_mr, sizeof(tmp_mr));
+	return 0;
+}
+
+static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
+						   loff_t pgoff)
 {
-	struct io_ring_ctx *ctx = file->private_data;
 	loff_t offset = pgoff << PAGE_SHIFT;
+	unsigned int bgid;
 
-	switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
+	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
-		/* Don't allow mmap if the ring was setup without it */
-		if (ctx->flags & IORING_SETUP_NO_MMAP)
-			return ERR_PTR(-EINVAL);
-		if (!ctx->rings)
-			return ERR_PTR(-EFAULT);
-		return ctx->rings;
+		return &ctx->ring_region;
 	case IORING_OFF_SQES:
-		/* Don't allow mmap if the ring was setup without it */
-		if (ctx->flags & IORING_SETUP_NO_MMAP)
-			return ERR_PTR(-EINVAL);
-		if (!ctx->sq_sqes)
-			return ERR_PTR(-EFAULT);
-		return ctx->sq_sqes;
-	case IORING_OFF_PBUF_RING: {
-		struct io_buffer_list *bl;
-		unsigned int bgid;
-		void *ptr;
-
+		return &ctx->sq_region;
+	case IORING_OFF_PBUF_RING:
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		bl = io_pbuf_get_bl(ctx, bgid);
-		if (IS_ERR(bl))
-			return bl;
-		ptr = bl->buf_ring;
-		io_put_bl(ctx, bl);
-		return ptr;
-	}
+		return io_pbuf_get_region(ctx, bgid);
+	case IORING_MAP_OFF_PARAM_REGION:
+		return &ctx->param_region;
 	}
+	return NULL;
+}
 
-	return ERR_PTR(-EINVAL);
+static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
+				     struct io_mapped_region *mr)
+{
+	lockdep_assert_held(&ctx->mmap_lock);
+
+	if (!io_region_is_set(mr))
+		return ERR_PTR(-EINVAL);
+	if (mr->flags & IO_REGION_F_USER_PROVIDED)
+		return ERR_PTR(-EINVAL);
+
+	return io_region_get_ptr(mr);
 }
 
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-			struct page **pages, int npages)
+static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
+					    size_t sz)
 {
-	unsigned long nr_pages = npages;
+	struct io_ring_ctx *ctx = file->private_data;
+	struct io_mapped_region *region;
 
-	vm_flags_set(vma, VM_DONTEXPAND);
-	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+	region = io_mmap_get_region(ctx, pgoff);
+	if (!region)
+		return ERR_PTR(-EINVAL);
+	return io_region_validate_mmap(ctx, region);
 }
 
 #ifdef CONFIG_MMU
 
+static int io_region_mmap(struct io_ring_ctx *ctx,
+			  struct io_mapped_region *mr,
+			  struct vm_area_struct *vma,
+			  unsigned max_pages)
+{
+	unsigned long nr_pages = min(mr->nr_pages, max_pages);
+
+	vm_flags_set(vma, VM_DONTEXPAND);
+	return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
+}
+
 __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
 	long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned int npages;
+	unsigned int page_limit = UINT_MAX;
+	struct io_mapped_region *region;
 	void *ptr;
 
-	guard(mutex)(&ctx->resize_lock);
+	guard(mutex)(&ctx->mmap_lock);
 
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 	if (IS_ERR(ptr))
@@ -338,16 +331,12 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
-		npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
-		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
-	case IORING_OFF_SQES:
-		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
-						ctx->n_sqe_pages);
-	case IORING_OFF_PBUF_RING:
-		return io_pbuf_mmap(file, vma);
+		page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		break;
 	}
 
-	return -EINVAL;
+	region = io_mmap_get_region(ctx, vma->vm_pgoff);
+	return io_region_mmap(ctx, region, vma, page_limit);
 }
 
 unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -365,7 +354,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr)
 		return -EINVAL;
 
-	guard(mutex)(&ctx->resize_lock);
+	guard(mutex)(&ctx->mmap_lock);
 
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
@@ -415,7 +404,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 	struct io_ring_ctx *ctx = file->private_data;
 	void *ptr;
 
-	guard(mutex)(&ctx->resize_lock);
+	guard(mutex)(&ctx->mmap_lock);
 
 	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 	if (IS_ERR(ptr))
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f361a635b6c7..c898dcba2b4e 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -1,18 +1,9 @@
 #ifndef IO_URING_MEMMAP_H
 #define IO_URING_MEMMAP_H
 
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-void io_pages_free(struct page ***pages, int npages);
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
-			struct page **pages, int npages);
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-		   size_t size);
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
-		    bool put_pages);
+#define IORING_MAP_OFF_PARAM_REGION		0x20000000ULL
 
-void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
-		     unsigned long uaddr, size_t size);
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
 
 #ifndef CONFIG_MMU
 unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
@@ -24,11 +15,17 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
 void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
-		     struct io_uring_region_desc *reg);
+		     struct io_uring_region_desc *reg,
+		     unsigned long mmap_offset);
+
+int io_create_region_mmap_safe(struct io_ring_ctx *ctx,
+			       struct io_mapped_region *mr,
+			       struct io_uring_region_desc *reg,
+			       unsigned long mmap_offset);
 
 static inline void *io_region_get_ptr(struct io_mapped_region *mr)
 {
-	return mr->vmap_ptr;
+	return mr->ptr;
 }
 
 static inline bool io_region_is_set(struct io_mapped_region *mr)
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 333c220d322a..bd3cd78d2dba 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
 	return __io_msg_ring_data(fd_file(f)->private_data,
 				  &io_msg, IO_URING_F_UNLOCKED);
 }
-
-void io_msg_cache_free(const void *entry)
-{
-	struct io_kiocb *req = (struct io_kiocb *) entry;
-
-	kmem_cache_free(req_cachep, req);
-}
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 38e7f8f0c944..32236d2fb778 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
 void io_msg_ring_cleanup(struct io_kiocb *req);
-void io_msg_cache_free(const void *entry);
diff --git a/io_uring/net.c b/io_uring/net.c
index c6cd38cc5dc4..85f55fbc25c9 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
 	}
 }
 
+static void io_msg_async_data_init(void *obj)
+{
+	struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj;
+
+	hdr->free_iov = NULL;
+	hdr->free_iov_nr = 0;
+}
+
 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_async_msghdr *hdr;
 
-	hdr = io_alloc_cache_get(&ctx->netmsg_cache);
-	if (hdr) {
-		if (hdr->free_iov) {
-			kasan_mempool_unpoison_object(hdr->free_iov,
-				hdr->free_iov_nr * sizeof(struct iovec));
-			req->flags |= REQ_F_NEED_CLEANUP;
-		}
-		req->flags |= REQ_F_ASYNC_DATA;
-		req->async_data = hdr;
-		return hdr;
-	}
+	hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req,
+					io_msg_async_data_init);
+	if (!hdr)
+		return NULL;
 
-	if (!io_alloc_async_data(req)) {
-		hdr = req->async_data;
-		hdr->free_iov_nr = 0;
-		hdr->free_iov = NULL;
-		return hdr;
+	/* If the async data was cached, we might have an iov cached inside. */
+	if (hdr->free_iov) {
+		kasan_mempool_unpoison_object(hdr->free_iov,
+				hdr->free_iov_nr * sizeof(struct iovec));
+		req->flags |= REQ_F_NEED_CLEANUP;
 	}
-	return NULL;
+	return hdr;
 }
 
 /* assign new iovec to kmsg, if we need to */
diff --git a/io_uring/poll.c b/io_uring/poll.c
index bced9edd5233..cc01c40b43d3 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 	if (req->flags & REQ_F_POLLED) {
 		apoll = req->apoll;
 		kfree(apoll->double_poll);
-	} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
-		apoll = io_alloc_cache_get(&ctx->apoll_cache);
-		if (!apoll)
-			goto alloc_apoll;
-		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
-alloc_apoll:
-		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
-		if (unlikely(!apoll))
+		if (!(issue_flags & IO_URING_F_UNLOCKED))
+			apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL);
+		else
+			apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+		if (!apoll)
 			return NULL;
 		apoll->poll.retries = APOLL_MAX_RETRY;
 	}
diff --git a/io_uring/register.c b/io_uring/register.c
index 371aec87e078..05025047d1da 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -104,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx)
 	return id;
 }
 
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
-					   void __user *arg, unsigned int nr_args)
+static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
+					struct io_restriction *restrictions)
 {
 	struct io_uring_restriction *res;
 	size_t size;
 	int i, ret;
 
-	/* Restrictions allowed only if rings started disabled */
-	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
-		return -EBADFD;
-
-	/* We allow only a single restrictions registration */
-	if (ctx->restrictions.registered)
-		return -EBUSY;
-
 	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
 		return -EINVAL;
 
@@ -130,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = 0;
+	ret = -EINVAL;
 
 	for (i = 0; i < nr_args; i++) {
 		switch (res[i].opcode) {
 		case IORING_RESTRICTION_REGISTER_OP:
-			if (res[i].register_op >= IORING_REGISTER_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].register_op,
-				  ctx->restrictions.register_op);
+			if (res[i].register_op >= IORING_REGISTER_LAST)
+				goto err;
+			__set_bit(res[i].register_op, restrictions->register_op);
 			break;
 		case IORING_RESTRICTION_SQE_OP:
-			if (res[i].sqe_op >= IORING_OP_LAST) {
-				ret = -EINVAL;
-				goto out;
-			}
-
-			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+			if (res[i].sqe_op >= IORING_OP_LAST)
+				goto err;
+			__set_bit(res[i].sqe_op, restrictions->sqe_op);
restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + restrictions->sqe_flags_required = res[i].sqe_flags; break; default: - ret = -EINVAL; - goto out; + goto err; } } -out: + ret = 0; + +err: + kfree(res); + return ret; +} + +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) +{ + int ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; - - kfree(res); return ret; } @@ -367,28 +369,19 @@ static int io_register_clock(struct io_ring_ctx *ctx, * either mapping or freeing. */ struct io_ring_ctx_rings { - unsigned short n_ring_pages; - unsigned short n_sqe_pages; - struct page **ring_pages; - struct page **sqe_pages; - struct io_uring_sqe *sq_sqes; struct io_rings *rings; + struct io_uring_sqe *sq_sqes; + + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; }; -static void io_register_free_rings(struct io_uring_params *p, +static void io_register_free_rings(struct io_ring_ctx *ctx, + struct io_uring_params *p, struct io_ring_ctx_rings *r) { - if (!(p->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, - true); - io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, - true); - } else { - io_pages_free(&r->ring_pages, r->n_ring_pages); - io_pages_free(&r->sqe_pages, r->n_sqe_pages); - vunmap(r->rings); - vunmap(r->sq_sqes); - } + io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -403,11 +396,11 @@ static void io_register_free_rings(struct io_uring_params *p, static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; unsigned i, tail, old_head; struct io_uring_params p; - void *ptr; int ret; /* for single issuer, must be owner resizing */ @@ -441,13 +434,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (size == SIZE_MAX) return -EOVERFLOW; - if (!(p.flags & IORING_SETUP_NO_MMAP)) - n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); - else - n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, - p.cq_off.user_addr, size); - if (IS_ERR(n.rings)) - return PTR_ERR(n.rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); /* * At this point n.rings is shared with userspace, just like o.rings @@ -463,7 +461,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) WRITE_ONCE(n.rings->cq_ring_entries, 
p.cq_entries); if (copy_to_user(arg, &p, sizeof(p))) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EFAULT; } @@ -472,20 +470,22 @@ else size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); if (size == SIZE_MAX) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EOVERFLOW; } - if (!(p.flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); - else - ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, - p.sq_off.user_addr, - size); - if (IS_ERR(ptr)) { - io_register_free_rings(&p, &n); - return PTR_ERR(ptr); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; } + n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread @@ -497,15 +497,15 @@ } /* - * We'll do the swap. Grab the ctx->resize_lock, which will exclude + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude * any new mmap's on the ring fd. Clear out existing mappings to prevent * mmap from seeing them, as we'll unmap them. Any attempt to mmap * existing rings beyond this point will fail. Not that it could proceed * at this point anyway, as the io_uring mmap side needs to grab the - * ctx->resize_lock as well. Likewise, hold the completion lock over the + * ctx->mmap_lock as well. Likewise, hold the completion lock over the * duration of the actual swap. */ - mutex_lock(&ctx->resize_lock); + mutex_lock(&ctx->mmap_lock); spin_lock(&ctx->completion_lock); o.rings = ctx->rings; ctx->rings = NULL; @@ -516,7 +516,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation.
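Both the old and new rings are sized to powers of two, so the copy loop below can walk the unwrapped head..tail counter range and mask each index into its ring. A minimal sketch of that idiom, with illustrative names (copy_sqes and its parameters are not the kernel's):

    /* sketch: copy live entries between two power-of-two rings */
    static void copy_sqes(const struct io_uring_sqe *src, unsigned int src_entries,
                          struct io_uring_sqe *dst, unsigned int dst_entries,
                          unsigned int head, unsigned int tail)
    {
            unsigned int i;

            /* the caller has already checked that tail - head <= dst_entries */
            for (i = head; i != tail; i++)
                    dst[i & (dst_entries - 1)] = src[i & (src_entries - 1)];
    }

The WRITE_ONCE() changes below follow the same reasoning: the new ring must publish exactly the head and tail values the copy used, rather than re-reading counters that userspace can still scribble over.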
*/ - n.sq_sqes = ptr; tail = READ_ONCE(o.rings->sq.tail); old_head = READ_ONCE(o.rings->sq.head); if (tail - old_head > p.sq_entries) @@ -527,8 +526,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) n.sq_sqes[dst_head] = o.sq_sqes[src_head]; } - WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head)); - WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail)); + WRITE_ONCE(n.rings->sq.head, old_head); + WRITE_ONCE(n.rings->sq.tail, tail); tail = READ_ONCE(o.rings->cq.tail); old_head = READ_ONCE(o.rings->cq.head); @@ -547,8 +546,8 @@ overflow: n.rings->cqes[dst_head] = o.rings->cqes[src_head]; } - WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head)); - WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail)); + WRITE_ONCE(n.rings->cq.head, old_head); + WRITE_ONCE(n.rings->cq.tail, tail); /* invalidate cached cqe refill */ ctx->cqe_cached = ctx->cqe_sentinel = NULL; @@ -566,16 +565,14 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; - swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, n_sqe_pages); - swap_old(ctx, o, n, ring_pages); - swap_old(ctx, o, n, sqe_pages); + swap_old(ctx, o, n, ring_region); + swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: spin_unlock(&ctx->completion_lock); - mutex_unlock(&ctx->resize_lock); - io_register_free_rings(&p, to_free); + mutex_unlock(&ctx->mmap_lock); + io_register_free_rings(ctx, &p, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); @@ -598,7 +595,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) rd_uptr = u64_to_user_ptr(reg.region_uptr); if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; - if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) return -EINVAL; @@ -613,7 +609,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; - ret = io_create_region(ctx, &ctx->param_region, &rd); + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, + IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 69937d0c94f9..e32ac5853391 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data, int nr_folios) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; int nr_pages_left = *nr_pages, i, j; + int nr_folios = data->nr_folios; /* Store head pages only */ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), @@ -667,27 +668,21 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, return true; } -static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) { - struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; - if (*nr_pages <= 1) - return false; - data->nr_pages_mid = folio_nr_pages(folio); - if (data->nr_pages_mid == 1) - return false; - data->folio_shift = folio_shift(folio); + /* * Check if pages are contiguous inside a folio, and all
folios have * the same page count except for the head and tail. */ - for (i = 1; i < *nr_pages; i++) { + for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; @@ -715,7 +710,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, if (nr_folios == 1) data->nr_pages_head = count; - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); + data->nr_folios = nr_folios; + return true; } static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, @@ -729,7 +725,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; - bool coalesced; + bool coalesced = false; if (!iov->iov_base) return NULL; @@ -749,7 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) @@ -883,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); if (offset) { /* @@ -905,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; @@ -916,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec += seg_skip; iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } @@ -931,6 +928,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx int i, ret, off, nr; unsigned int nbufs; + /* + * Accounting state is shared between the two rings; that only works if + * both rings are accounted towards the same counters. 
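As an aside on the folio coalescing rework above: collapsing pages to their folio heads is what keeps registered hugepage buffers cheap to describe. A rough sketch of the payoff, assuming 4 KiB base pages and one 2 MiB folio (illustrative values only):

    /* sketch: bvec entries needed for a pinned range at a given granularity */
    static unsigned int bvecs_needed(size_t len, unsigned int shift)
    {
            return (len + (1UL << shift) - 1) >> shift;
    }

With those assumptions, bvecs_needed(SZ_2M, PAGE_SHIFT) is 512 entries, while bvecs_needed(SZ_2M, 21) is a single entry once io_check_coalesce_buffer() reports one 2 MiB folio (folio_shift == 21) and io_coalesce_buffer() collapses the page array.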
+ */ + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) + return -EINVAL; + /* if offsets are given, must have nr specified too */ if (!arg->nr && (arg->dst_off || arg->src_off)) return -EINVAL; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7a4668deaa1a..c8b093584461 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,6 +40,7 @@ struct io_imu_folio_data { /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; + unsigned int nr_folios; }; struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); @@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data); + static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) { diff --git a/io_uring/rw.c b/io_uring/rw.c index 29bb3010f9c0..a9a2733be842 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -202,45 +202,40 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * mean that the underlying data can be gone at any time. But that * should be fixed separately, and then this check could be killed. */ - if (!(req->flags & REQ_F_REFCOUNT)) { + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; io_rw_recycle(req, issue_flags); } } +static void io_rw_async_data_init(void *obj) +{ + struct io_async_rw *rw = (struct io_async_rw *)obj; + + rw->free_iovec = NULL; + rw->bytes_done = 0; +} + static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; - rw = io_alloc_cache_get(&ctx->rw_cache); - if (rw) { - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = rw; - goto done; - } - - if (!io_alloc_async_data(req)) { - rw = req->async_data; - rw->free_iovec = NULL; - rw->free_iov_nr = 0; -done: - rw->bytes_done = 0; - return 0; + rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init); + if (!rw) + return -ENOMEM; + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; } - - return -ENOMEM; + rw->bytes_done = 0; + return 0; } static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) { struct io_async_rw *rw; - int ret; if (io_rw_alloc_async(req)) return -ENOMEM; @@ -249,12 +244,48 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; rw = req->async_data; - ret = io_import_iovec(ddir, req, rw, 0); + return io_import_iovec(ddir, req, rw, 0); +} + +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} + +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } +} + +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) +{ + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; + int ret; + + if
(copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; + + if (pi_attr.rsvd) + return -EINVAL; + + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; - - iov_iter_save_state(&rw->iter, &rw->iter_state); - return 0; + req->flags |= REQ_F_HAS_METADATA; + io_meta_save_state(io); + return ret; } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -262,6 +293,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; + u64 attr_type_mask; int ret; rw->kiocb.ki_pos = READ_ONCE(sqe->off); @@ -279,11 +311,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(ret)) + return ret; + + attr_type_mask = READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return ret; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -385,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - io_rw_iovec_free(req->async_data); + lockdep_assert_held(&req->ctx->uring_lock); + io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -405,17 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; + struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) @@ -430,23 +475,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). 
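For context on io_prep_rw_pi() above, the userspace contract is small: the SQE carries an attribute mask and a pointer to a PI descriptor. A hedged sketch of a submitter, assuming the struct io_uring_attr_pi fields read above and the IO_INTEGRITY_CHK_* flags from the block side of the series (pi_buf, pi_len and lba are placeholders):

    /* sketch: attach T10 PI metadata to a read or write SQE */
    struct io_uring_attr_pi pi = {
            .flags   = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
            .len     = pi_len,                      /* size of the PI buffer */
            .addr    = (__u64)(uintptr_t)pi_buf,    /* user PI buffer */
            .seed    = lba,                         /* initial reference tag */
            .app_tag = 0,
    };

    sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;   /* only PI is defined so far */
    sqe->attr_ptr = (__u64)(uintptr_t)&pi;

Since pi_attr.rsvd must be zero, zero-initializing the descriptor as above is the safe pattern.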
- */ - if (!same_thread_group(req->tctx->task, current) || !in_task()) - return false; + + io_meta_restore(io, &rw->kiocb); + iov_iter_restore(&io->iter, &io->iter_state); return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -473,22 +509,16 @@ static void io_req_io_end(struct io_kiocb *req) } } -static bool __io_complete_rw_common(struct io_kiocb *req, long res) +static void __io_complete_rw_common(struct io_kiocb *req, long res) { - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - /* - * Reissue will start accounting again, finish the - * current cycle. - */ - io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return true; - } + if (res == req->cqe.res) + return; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + } else { req_set_fail(req); req->cqe.res = res; } - return false; } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) @@ -531,8 +561,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - if (__io_complete_rw_common(req, res)) - return; + __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; @@ -594,26 +623,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { - if (!__io_complete_rw_common(req, ret)) { - /* - * Safe to call io_end from here as we're inline - * from the submission path. - */ - io_req_io_end(req); - io_req_set_res(req, final_ret, - io_put_kbuf(req, ret, issue_flags)); - io_req_rw_cleanup(req, issue_flags); - return IOU_OK; - } + __io_complete_rw_common(req, ret); + /* + * Safe to call io_end from here as we're inline + * from the submission path. + */ + io_req_io_end(req); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_rw_cleanup(req, issue_flags); + return IOU_OK; } else { io_rw_done(&rw->kiocb, ret); } - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); - return -EAGAIN; - } return IOU_ISSUE_SKIP_COMPLETE; } @@ -736,8 +758,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. + */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ @@ -828,6 +853,19 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } + if (req->flags & REQ_F_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. 
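The union above is only safe because a request never needs wpq (buffered retry state) and the metadata fields at the same time, which is why anything that is not O_DIRECT is rejected below. The meta save/restore helpers extend the long-standing data-iterator rewind discipline to the PI iterator; a generic sketch of that idiom (issue_io() is a placeholder):

    /* sketch: rewindable iterators across -EAGAIN retries */
    static ssize_t issue_with_retry(struct iov_iter *iter)
    {
            struct iov_iter_state state;
            ssize_t ret;

            iov_iter_save_state(iter, &state);      /* snapshot before issuing */
            ret = issue_io(iter);                   /* may partially consume iter */
            if (ret == -EAGAIN) {
                    iov_iter_restore(iter, &state); /* rewind before retrying */
                    ret = issue_io(iter);
            }
            return ret;
    }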
+ */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; + kiocb->private = &io->meta; + } + return 0; } @@ -876,8 +914,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EOPNOTSUPP && force_nonblock) ret = -EAGAIN; - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -902,6 +939,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -1087,11 +1125,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. @@ -1127,6 +1160,7 @@ done: } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..2d7656bd268d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,6 +2,11 @@ #include <linux/pagemap.h> +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; +}; + struct io_async_rw { size_t bytes_done; struct iov_iter iter; @@ -9,7 +14,14 @@ struct io_async_rw { struct iovec fast_iov; struct iovec *free_iovec; int free_iov_nr; - struct wait_page_queue wpq; + /* wpq is for buffered io, while meta fields are used with direct io */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index e9cec9e4dc2f..2bd7e0a317bb 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -544,10 +544,9 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (io_alloc_async_data(req)) + data = io_uring_alloc_async_data_nocache(req); + if (!data) return -ENOMEM; - - data = req->async_data; data->req = req; data->flags = flags; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index ce7726a04883..fc94c465a985 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,26 +16,6 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_cmd_data *cache; - - cache = io_alloc_cache_get(&ctx->uring_cache); - if (cache) { - cache->op_data = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = cache; - return cache; - } - if (!io_alloc_async_data(req)) { - cache = req->async_data; - cache->op_data = NULL; - return cache; - } - return NULL; -} - static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -130,7 +110,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; - if (current->flags & (PF_EXITING | PF_KTHREAD)) + if (io_should_terminate_tw()) flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deferred list
completion */ @@ -188,14 +168,22 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); +static void io_uring_cmd_init_once(void *obj) +{ + struct io_uring_cmd_data *data = obj; + + data->op_data = NULL; +} + static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd_data *cache; - cache = io_uring_async_get(req); - if (unlikely(!cache)) + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, + io_uring_cmd_init_once); + if (!cache) return -ENOMEM; if (!(req->flags & REQ_F_FORCE_ASYNC)) { diff --git a/io_uring/waitid.c b/io_uring/waitid.c index daef5dd644f0..6778c0ee76c4 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) struct io_waitid_async *iwa; int ret; - if (io_alloc_async_data(req)) + iwa = io_uring_alloc_async_data_nocache(req); + if (!iwa) return -ENOMEM; - - iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,
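Across net.c, rw.c, uring_cmd.c, timeout.c and waitid.c, the series converges on two allocation helpers: io_uring_alloc_async_data(), which backs the allocation with a per-ctx cache and runs its init callback only for freshly allocated objects (so recycled state such as a cached iovec survives, as the net.c and rw.c hunks rely on), and io_uring_alloc_async_data_nocache() for opcodes that gain nothing from caching. A hedged sketch of a caller, with hypothetical names (my_async, my_cache, my_init_once):

    /* sketch: cache-backed per-request async data with one-time init */
    struct my_async {
            void *cached_buf;       /* survives cache recycling */
            size_t bytes_done;      /* per-request state, reset every time */
    };

    static void my_init_once(void *obj)
    {
            struct my_async *a = obj;

            a->cached_buf = NULL;   /* only runs when freshly allocated */
    }

    static struct my_async *my_get_async(struct io_kiocb *req)
    {
            struct my_async *a;

            a = io_uring_alloc_async_data(&req->ctx->my_cache, req, my_init_once);
            if (!a)
                    return NULL;
            /* a->cached_buf may carry over from a recycled object */
            a->bytes_done = 0;
            return a;
    }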