-rw-r--r--	block/bio-integrity.c	84
-rw-r--r--	block/blk-integrity.c	10
-rw-r--r--	block/fops.c	45
-rw-r--r--	drivers/nvme/host/core.c	21
-rw-r--r--	drivers/scsi/sd.c	4
-rw-r--r--	include/linux/bio-integrity.h	25
-rw-r--r--	include/linux/fs.h	1
-rw-r--r--	include/linux/io_uring_types.h	26
-rw-r--r--	include/linux/uio.h	9
-rw-r--r--	include/uapi/linux/fs.h	9
-rw-r--r--	include/uapi/linux/io_uring.h	17
-rw-r--r--	io_uring/alloc_cache.h	13
-rw-r--r--	io_uring/fdinfo.c	9
-rw-r--r--	io_uring/futex.c	13
-rw-r--r--	io_uring/io_uring.c	136
-rw-r--r--	io_uring/io_uring.h	23
-rw-r--r--	io_uring/kbuf.c	226
-rw-r--r--	io_uring/kbuf.h	20
-rw-r--r--	io_uring/memmap.c	375
-rw-r--r--	io_uring/memmap.h	23
-rw-r--r--	io_uring/msg_ring.c	7
-rw-r--r--	io_uring/msg_ring.h	1
-rw-r--r--	io_uring/net.c	35
-rw-r--r--	io_uring/poll.c	13
-rw-r--r--	io_uring/register.c	163
-rw-r--r--	io_uring/rsrc.c	40
-rw-r--r--	io_uring/rsrc.h	4
-rw-r--r--	io_uring/rw.c	212
-rw-r--r--	io_uring/rw.h	14
-rw-r--r--	io_uring/timeout.c	5
-rw-r--r--	io_uring/uring_cmd.c	34
-rw-r--r--	io_uring/waitid.c	4
32 files changed, 845 insertions, 776 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 2a4bd6611692..5d81ad9a3d20 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
{
- unsigned short nr_vecs = bip->bip_max_vcnt - 1;
- struct bio_vec *copy = &bip->bip_vec[1];
- size_t bytes = bip->bip_iter.bi_size;
- struct iov_iter iter;
+ unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *orig_bvecs = &bip->bip_vec[1];
+ struct bio_vec *bounce_bvec = &bip->bip_vec[0];
+ size_t bytes = bounce_bvec->bv_len;
+ struct iov_iter orig_iter;
int ret;
- iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
- ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter);
WARN_ON_ONCE(ret != bytes);
- bio_integrity_unpin_bvec(copy, nr_vecs, true);
+ bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true);
}
/**
@@ -301,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
return nr_bvecs;
}
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits);
struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ size_t offset, bytes = iter->count;
unsigned int direction, nr_bvecs;
- struct iov_iter iter;
int ret, nr_vecs;
- size_t offset;
bool copy;
if (bio_integrity(bio))
@@ -323,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
else
direction = ITER_SOURCE;
- iov_iter_ubuf(&iter, direction, ubuf, bytes);
- nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1);
if (nr_vecs > BIO_MAX_VECS)
return -E2BIG;
if (nr_vecs > UIO_FASTIOV) {
@@ -334,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes)
pages = NULL;
}
- copy = !iov_iter_is_aligned(&iter, align, align);
- ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ copy = !iov_iter_is_aligned(iter, align, align);
+ ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset);
if (unlikely(ret < 0))
goto free_bvec;
@@ -365,6 +364,55 @@ free_bvec:
return ret;
}
+static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta)
+{
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+
+ if (meta->flags & IO_INTEGRITY_CHK_GUARD)
+ bip->bip_flags |= BIP_CHECK_GUARD;
+ if (meta->flags & IO_INTEGRITY_CHK_APPTAG)
+ bip->bip_flags |= BIP_CHECK_APPTAG;
+ if (meta->flags & IO_INTEGRITY_CHK_REFTAG)
+ bip->bip_flags |= BIP_CHECK_REFTAG;
+
+ bip->app_tag = meta->app_tag;
+}
+
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
+{
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
+ unsigned int integrity_bytes;
+ int ret;
+ struct iov_iter it;
+
+ if (!bi)
+ return -EINVAL;
+ /*
+ * The original meta iterator can be bigger; process only the
+ * integrity info corresponding to the current data buffer.
+ */
+ it = meta->iter;
+ integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio));
+ if (it.count < integrity_bytes)
+ return -EINVAL;
+
+ /* should fit into two bytes */
+ BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16));
+
+ if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS))
+ return -EINVAL;
+
+ it.count = integrity_bytes;
+ ret = bio_integrity_map_user(bio, &it);
+ if (!ret) {
+ bio_uio_meta_to_bip(bio, meta);
+ bip_set_seed(bio_integrity(bio), meta->seed);
+ iov_iter_advance(&meta->iter, integrity_bytes);
+ meta->seed += bio_integrity_intervals(bi, bio_sectors(bio));
+ }
+ return ret;
+}
+
/**
* bio_integrity_prep - Prepare bio for integrity I/O
* @bio: bio to prepare
@@ -435,6 +483,11 @@ bool bio_integrity_prep(struct bio *bio)
if (bi->csum_type == BLK_INTEGRITY_CSUM_IP)
bip->bip_flags |= BIP_IP_CHECKSUM;
+ /* describe what tags to check in payload */
+ if (bi->csum_type)
+ bip->bip_flags |= BIP_CHECK_GUARD;
+ if (bi->flags & BLK_INTEGRITY_REF_TAG)
+ bip->bip_flags |= BIP_CHECK_REFTAG;
if (bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)) < len) {
printk(KERN_ERR "could not attach integrity payload\n");
@@ -559,7 +612,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
bip->bip_vec = bip_src->bip_vec;
bip->bip_iter = bip_src->bip_iter;
- bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
+ bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS;
+ bip->app_tag = bip_src->app_tag;
return 0;
}
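
To make the per-bio contract of bio_integrity_map_iter() concrete, here is an illustrative walk-through (not part of the patch; the 512-byte interval and 8-byte tuple geometry is an assumption):

	/*
	 * Assumed geometry: 512B protection interval, 8B PI tuple.
	 * A 16KB write split into two 8KB bios calls
	 * bio_integrity_map_iter() once per bio. Each 8KB bio spans
	 * 16 intervals, so each call maps 16 * 8 = 128 bytes from
	 * meta->iter, advances meta->iter by those 128 bytes, and
	 * bumps meta->seed by 16, keeping reference tags contiguous
	 * across the split.
	 */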
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 013469faa5e7..a1678f0a9f81 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf,
ssize_t bytes)
{
- int ret = bio_integrity_map_user(rq->bio, ubuf, bytes);
+ int ret;
+ struct iov_iter iter;
+ unsigned int direction;
+ if (op_is_write(req_op(rq)))
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ ret = bio_integrity_map_user(rq->bio, &iter);
if (ret)
return ret;
diff --git a/block/fops.c b/block/fops.c
index 13a67940d040..6d5c4fc5a216 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA);
if (nr_pages <= DIO_INLINE_BIO_VECS)
vecs = inline_vecs;
else {
@@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+ bool is_sync = dio->flags & DIO_IS_SYNC;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
+ if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA))
+ bio_integrity_unmap_user(bio);
+
if (atomic_dec_and_test(&dio->ref)) {
- if (!(dio->flags & DIO_IS_SYNC)) {
+ if (!is_sync) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
@@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* a retry of this from blocking context.
*/
if (unlikely(iov_iter_count(iter))) {
- bio_release_pages(bio, false);
- bio_clear_flag(bio, BIO_REFFED);
- bio_put(bio);
- blk_finish_plug(&plug);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto fail;
}
bio->bi_opf |= REQ_NOWAIT;
}
+ if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ if (unlikely(ret))
+ goto fail;
+ }
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
@@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio_put(&dio->bio);
return ret;
+fail:
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
@@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
ret = blk_status_to_errno(bio->bi_status);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA)
+ bio_integrity_unmap_user(bio);
+
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
@@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
- if (unlikely(ret)) {
- bio_put(bio);
- return ret;
- }
+ if (unlikely(ret))
+ goto out_bio_put;
}
dio->size = bio->bi_iter.bi_size;
@@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_HAS_METADATA) {
+ ret = bio_integrity_map_iter(bio, iocb->private);
+ WRITE_ONCE(iocb->private, NULL);
+ if (unlikely(ret))
+ goto out_bio_put;
+ }
+
if (iocb->ki_flags & IOCB_ATOMIC)
bio->bi_opf |= REQ_ATOMIC;
@@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
submit_bio(bio);
}
return -EIOCBQUEUED;
+
+out_bio_put:
+ bio_put(bio);
+ return ret;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
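
For context, a hedged sketch of how a submitter (io_uring's read/write path being the in-tree user) hands metadata to the block fops above; the lba/pi_buf/pi_len names are placeholders and error handling is abbreviated:

	struct uio_meta meta = {
		.flags   = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
		.app_tag = 0,
		.seed    = lba,		/* placeholder: starting reftag seed */
	};

	/* pi_buf/pi_len: placeholder user PI buffer and its length */
	ret = import_ubuf(ITER_SOURCE, pi_buf, pi_len, &meta.iter);
	if (ret)
		return ret;
	iocb->ki_flags |= IOCB_HAS_METADATA;
	iocb->private = &meta;	/* consumed by __blkdev_direct_IO*() above */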
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2147069775c6..76b615d4d5b9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
return BLK_STS_OK;
}
+static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd)
+{
+ cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag);
+ cmnd->rw.lbatm = cpu_to_le16(0xffff);
+}
+
static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
struct request *req)
{
@@ -1017,18 +1023,17 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
control |= NVME_RW_PRINFO_PRACT;
}
- switch (ns->head->pi_type) {
- case NVME_NS_DPS_PI_TYPE3:
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD))
control |= NVME_RW_PRINFO_PRCHK_GUARD;
- break;
- case NVME_NS_DPS_PI_TYPE1:
- case NVME_NS_DPS_PI_TYPE2:
- control |= NVME_RW_PRINFO_PRCHK_GUARD |
- NVME_RW_PRINFO_PRCHK_REF;
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) {
+ control |= NVME_RW_PRINFO_PRCHK_REF;
if (op == nvme_cmd_zone_append)
control |= NVME_RW_APPEND_PIREMAP;
nvme_set_ref_tag(ns, cmnd, req);
- break;
+ }
+ if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) {
+ control |= NVME_RW_PRINFO_PRCHK_APP;
+ nvme_set_app_tag(req, cmnd);
}
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a48c4d5edfa3..950d8c9fb884 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -809,14 +809,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd,
if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM))
scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM;
- if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+ if (bio_integrity_flagged(bio, BIP_CHECK_GUARD))
scmd->prot_flags |= SCSI_PROT_GUARD_CHECK;
}
if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */
scmd->prot_flags |= SCSI_PROT_REF_INCREMENT;
- if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false)
+ if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG))
scmd->prot_flags |= SCSI_PROT_REF_CHECK;
}
diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h
index dbf0f74c1529..802f52e38efd 100644
--- a/include/linux/bio-integrity.h
+++ b/include/linux/bio-integrity.h
@@ -7,10 +7,12 @@
enum bip_flags {
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
- BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */
- BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */
- BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */
- BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */
+ BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */
+ BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */
+ BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */
+ BIP_CHECK_GUARD = 1 << 5, /* guard check */
+ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */
+ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */
};
struct bio_integrity_payload {
@@ -21,6 +23,7 @@ struct bio_integrity_payload {
unsigned short bip_vcnt; /* # of integrity bio_vecs */
unsigned short bip_max_vcnt; /* integrity bio_vec slots */
unsigned short bip_flags; /* control flags */
+ u16 app_tag; /* application tag value */
struct bvec_iter bio_iter; /* for rewinding parent bio */
@@ -30,6 +33,9 @@ struct bio_integrity_payload {
struct bio_vec bip_inline_vecs[];/* embedded bvec array */
};
+#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \
+ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG)
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
#define bip_for_each_vec(bvl, bip, iter) \
@@ -72,7 +78,8 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp,
unsigned int nr);
int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len,
unsigned int offset);
-int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len);
+int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter);
+int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta);
void bio_integrity_unmap_user(struct bio *bio);
bool bio_integrity_prep(struct bio *bio);
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done);
@@ -98,8 +105,12 @@ static inline void bioset_integrity_free(struct bio_set *bs)
{
}
-static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
- ssize_t len)
+static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter)
+{
+ return -EINVAL;
+}
+
+static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta)
{
return -EINVAL;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b8447f53200..a4af70367f8a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -349,6 +349,7 @@ struct readahead_control;
#define IOCB_DIO_CALLER_COMP (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW (1 << 23)
+#define IOCB_HAS_METADATA (1 << 24)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fd4cdb0860a2..623d8e798a11 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -78,8 +78,9 @@ struct io_hash_table {
struct io_mapped_region {
struct page **pages;
- void *vmap_ptr;
- size_t nr_pages;
+ void *ptr;
+ unsigned nr_pages;
+ unsigned flags;
};
/*
@@ -293,6 +294,11 @@ struct io_ring_ctx {
struct io_submit_state submit_state;
+ /*
+ * Modifications are protected by ->uring_lock and ->mmap_lock.
+ * The flags, buf_pages and buf_nr_pages fields should be stable
+ * once published.
+ */
struct xarray io_bl_xa;
struct io_hash_table cancel_table;
@@ -424,17 +430,10 @@ struct io_ring_ctx {
* side will need to grab this lock, to prevent either side from
* being run concurrently with the other.
*/
- struct mutex resize_lock;
-
- /*
- * If IORING_SETUP_NO_MMAP is used, then the below holds
- * the gup'ed pages for the two rings, and the sqes.
- */
- unsigned short n_ring_pages;
- unsigned short n_sqe_pages;
- struct page **ring_pages;
- struct page **sqe_pages;
+ struct mutex mmap_lock;
+ struct io_mapped_region sq_region;
+ struct io_mapped_region ring_region;
/* used for optimised request parameter and wait argument passing */
struct io_mapped_region param_region;
};
@@ -481,6 +480,7 @@ enum {
REQ_F_BL_NO_RECYCLE_BIT,
REQ_F_BUFFERS_COMMIT_BIT,
REQ_F_BUF_NODE_BIT,
+ REQ_F_HAS_METADATA_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
@@ -561,6 +561,8 @@ enum {
REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
/* buf node is valid */
REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
+ /* request has read/write metadata assigned */
+ REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 853f9de5aa05..8ada84e85447 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -82,6 +82,15 @@ struct iov_iter {
};
};
+typedef __u16 uio_meta_flags_t;
+
+struct uio_meta {
+ uio_meta_flags_t flags;
+ u16 app_tag;
+ u64 seed;
+ struct iov_iter iter;
+};
+
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
if (iter->iter_type == ITER_UBUF)
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 56a4f93a08f4..2bbe00cf1248 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -40,6 +40,15 @@
#define BLOCK_SIZE_BITS 10
#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+/* flags for integrity meta */
+#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
+#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */
+#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */
+
+#define IO_INTEGRITY_VALID_FLAGS (IO_INTEGRITY_CHK_GUARD | \
+ IO_INTEGRITY_CHK_REFTAG | \
+ IO_INTEGRITY_CHK_APPTAG)
+
#define SEEK_SET 0 /* seek relative to beginning of file */
#define SEEK_CUR 1 /* seek relative to current file position */
#define SEEK_END 2 /* seek relative to end of file */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index aac9a4f8fa9a..e11c82638527 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -98,6 +98,10 @@ struct io_uring_sqe {
__u64 addr3;
__u64 __pad2[1];
};
+ struct {
+ __u64 attr_ptr; /* pointer to attribute information */
+ __u64 attr_type_mask; /* bit mask of attributes */
+ };
__u64 optval;
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
@@ -107,6 +111,18 @@ struct io_uring_sqe {
};
};
+/* sqe->attr_type_mask flags */
+#define IORING_RW_ATTR_FLAG_PI (1U << 0)
+/* PI attribute information */
+struct io_uring_attr_pi {
+ __u16 flags;
+ __u16 app_tag;
+ __u32 len;
+ __u64 addr;
+ __u64 seed;
+ __u64 rsvd;
+};
+
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
@@ -561,6 +577,7 @@ struct io_uring_params {
#define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
+#define IORING_FEAT_RW_ATTR (1U << 16)
/*
* io_uring_register(2) opcodes and arguments
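
A sketch of userspace usage of the new PI attribute (illustrative, not from this patch; liburing-style SQE preparation and the pi_buf/slba names are assumptions):

	struct io_uring_attr_pi pi = {
		.flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
		.addr  = (__u64)(uintptr_t)pi_buf,	/* assumed user PI buffer */
		.len   = pi_buf_len,
		.seed  = slba,				/* assumed starting reftag seed */
	};

	io_uring_prep_write(sqe, fd, data_buf, data_len, offset);
	sqe->attr_ptr = (__u64)(uintptr_t)&pi;
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;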
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index b7a38a2069cf..a3a8cfec32ce 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
return NULL;
}
+static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp,
+ void (*init_once)(void *obj))
+{
+ if (unlikely(!cache->nr_cached)) {
+ void *obj = kmalloc(cache->elem_size, gfp);
+
+ if (obj && init_once)
+ init_once(obj);
+ return obj;
+ }
+ return io_alloc_cache_get(cache);
+}
+
/* returns false if the cache was initialized properly */
static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index b214e5a407b5..f60d0a9d505e 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -211,10 +211,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
if (ctx->file_table.data.nodes[i])
f = io_slot_file(ctx->file_table.data.nodes[i]);
- if (f)
- seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname);
- else
- seq_printf(m, "%5u: <none>\n", i);
+ if (f) {
+ seq_printf(m, "%5u: ", i);
+ seq_file_path(m, f, " \t\n\\");
+ seq_puts(m, "\n");
+ }
}
seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
diff --git a/io_uring/futex.c b/io_uring/futex.c
index e29662f039e1..30139cc150f2 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
io_req_task_work_add(req);
}
-static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
-{
- struct io_futex_data *ifd;
-
- ifd = io_alloc_cache_get(&ctx->futex_cache);
- if (ifd)
- return ifd;
-
- return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
-}
-
int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex);
@@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags)
}
io_ring_submit_lock(ctx, issue_flags);
- ifd = io_alloc_ifd(ctx);
+ ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL);
if (!ifd) {
ret = -ENOMEM;
goto done_unlock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4758f1ba902b..7bfbc7c22367 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,7 @@
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
- IO_REQ_CLEAN_FLAGS)
+ REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -143,7 +143,8 @@ struct io_defer_entry {
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all);
+ bool cancel_all,
+ bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
@@ -350,7 +351,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
- mutex_init(&ctx->resize_lock);
+ mutex_init(&ctx->mmap_lock);
return ctx;
@@ -361,7 +362,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
@@ -550,8 +551,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
+ spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@@ -562,6 +564,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
io_req_task_queue(de->req);
kfree(de);
}
+ spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -570,11 +573,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active) {
- spin_lock(&ctx->completion_lock);
+ if (ctx->drain_active)
io_queue_deferred(ctx);
- spin_unlock(&ctx->completion_lock);
- }
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
}
@@ -1401,6 +1401,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_REISSUE) {
+ node = req->comp_list.next;
+ req->flags &= ~REQ_F_REISSUE;
+ io_queue_iowq(req);
+ continue;
+ }
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
@@ -1440,7 +1446,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP) &&
+ /*
+ * Requests marked with REQ_F_REISSUE should not post a CQE, they
+ * will go through the io-wq retry machinery and post one
+ * later.
+ */
+ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
@@ -1640,19 +1651,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-bool io_alloc_async_data(struct io_kiocb *req)
-{
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
-
- WARN_ON_ONCE(!def->async_size);
- req->async_data = kmalloc(def->async_size, GFP_KERNEL);
- if (req->async_data) {
- req->flags |= REQ_F_ASYNC_DATA;
- return false;
- }
- return true;
-}
-
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
@@ -2631,36 +2629,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
- size);
-}
-
-static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
- size);
-}
-
static void io_rings_free(struct io_ring_ctx *ctx)
{
- if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
- true);
- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
- true);
- } else {
- io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
- ctx->n_ring_pages = 0;
- io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
- ctx->n_sqe_pages = 0;
- vunmap(ctx->rings);
- vunmap(ctx->sq_sqes);
- }
-
+ io_free_region(ctx, &ctx->sq_region);
+ io_free_region(ctx, &ctx->ring_region);
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
@@ -2732,7 +2704,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
io_free_region(ctx, &ctx->param_region);
@@ -2894,7 +2866,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
- while (io_uring_try_cancel_requests(ctx, NULL, true))
+ /* The SQPOLL thread never reaches this path */
+ while (io_uring_try_cancel_requests(ctx, NULL, true, false))
cond_resched();
if (ctx->sq_data) {
@@ -3062,7 +3035,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all)
+ bool cancel_all,
+ bool is_sqpoll_thread)
{
struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
enum io_wq_cancel cret;
@@ -3092,7 +3066,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
- (ctx->sq_data && ctx->sq_data->thread == current)) {
+ is_sqpoll_thread) {
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
@@ -3165,13 +3139,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
continue;
loop |= io_uring_try_cancel_requests(node->ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ false);
}
} else {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
loop |= io_uring_try_cancel_requests(ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ true);
}
if (loop) {
@@ -3233,6 +3209,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
end > ctx->cq_wait_size))
return ERR_PTR(-EFAULT);
+ offset = array_index_nospec(offset, ctx->cq_wait_size - size);
return ctx->cq_wait_arg + offset;
}
@@ -3477,9 +3454,10 @@ bool io_is_uring_fops(struct file *file)
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_uring_region_desc rd;
struct io_rings *rings;
size_t size, sq_array_offset;
- void *ptr;
+ int ret;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
@@ -3490,15 +3468,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
- else
- rings = io_rings_map(ctx, p->cq_off.user_addr, size);
-
- if (IS_ERR(rings))
- return PTR_ERR(rings);
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->cq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+ if (ret)
+ return ret;
+ ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
- ctx->rings = rings;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
@@ -3515,17 +3495,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
}
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
- else
- ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
-
- if (IS_ERR(ptr)) {
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->sq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
+ if (ret) {
io_rings_free(ctx);
- return PTR_ERR(ptr);
+ return ret;
}
-
- ctx->sq_sqes = ptr;
+ ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
return 0;
}
@@ -3733,7 +3714,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
+ IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
+ IORING_FEAT_RW_ATTR;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3894,6 +3876,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+ BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 492cbbf2c23b..f65e3f3ede51 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -8,9 +8,11 @@
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
+#include "alloc_cache.h"
#include "io-wq.h"
#include "slist.h"
#include "filetable.h"
+#include "opdef.h"
#ifndef CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
@@ -223,6 +225,27 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
req->cqe.flags = cflags;
}
+static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
+ struct io_kiocb *req,
+ void (*init_once)(void *obj))
+{
+ req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
+ if (req->async_data)
+ req->flags |= REQ_F_ASYNC_DATA;
+ return req->async_data;
+}
+
+static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req)
+{
+ const struct io_issue_def *def = &io_issue_defs[req->opcode];
+
+ WARN_ON_ONCE(!def->async_size);
+ req->async_data = kmalloc(def->async_size, GFP_KERNEL);
+ if (req->async_data)
+ req->flags |= REQ_F_ASYNC_DATA;
+ return req->async_data;
+}
+
static inline bool req_has_async_data(struct io_kiocb *req)
{
return req->flags & REQ_F_ASYNC_DATA;
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index eec5eb7de843..04bf493eecae 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
/*
* Store buffer group ID and finally mark the list as visible.
* The normal lookup doesn't care about the visibility as we're
- * always under the ->uring_lock, but the RCU lookup from mmap does.
+ * always under the ->uring_lock, but lookups from mmap do.
*/
bl->bgid = bgid;
- atomic_set(&bl->refs, 1);
+ guard(mutex)(&ctx->mmap_lock);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
@@ -353,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head;
- if (bl->buf_nr_pages) {
- int j;
-
- if (!(bl->flags & IOBL_MMAP)) {
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- }
- io_pages_unmap(bl->buf_ring, &bl->buf_pages,
- &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
- bl->flags &= ~IOBL_MMAP;
- }
+ io_free_region(ctx, &bl->region);
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->flags &= ~IOBL_BUF_RING;
@@ -386,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
-void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- if (atomic_dec_and_test(&bl->refs)) {
- __io_remove_buffers(ctx, bl, -1U);
- kfree_rcu(bl, rcu);
- }
+ __io_remove_buffers(ctx, bl, -1U);
+ kfree(bl);
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
@@ -399,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
struct io_buffer_list *bl;
struct list_head *item, *tmp;
struct io_buffer *buf;
- unsigned long index;
- xa_for_each(&ctx->io_bl_xa, index, bl) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
+ while (1) {
+ unsigned long index = 0;
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
+ if (bl)
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ }
+ if (!bl)
+ break;
io_put_bl(ctx, bl);
}
@@ -591,11 +586,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
- /*
- * Doesn't need rcu free as it was never visible, but
- * let's keep it consistent throughout.
- */
- kfree_rcu(bl, rcu);
+ kfree(bl);
goto err;
}
}
@@ -615,75 +606,14 @@ err:
return IOU_OK;
}
-static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
- struct io_buffer_list *bl)
-{
- struct io_uring_buf_ring *br = NULL;
- struct page **pages;
- int nr_pages, ret;
-
- pages = io_pin_pages(reg->ring_addr,
- flex_array_size(br, bufs, reg->ring_entries),
- &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (!br) {
- ret = -ENOMEM;
- goto error_unpin;
- }
-
-#ifdef SHM_COLOUR
- /*
- * On platforms that have specific aliasing requirements, SHM_COLOUR
- * is set and we must guarantee that the kernel and user side align
- * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
- * the application mmap's the provided ring buffer. Fail the request
- * if we, by chance, don't end up with aligned addresses. The app
- * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
- * this transparently.
- */
- if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
- ret = -EINVAL;
- goto error_unpin;
- }
-#endif
- bl->buf_pages = pages;
- bl->buf_nr_pages = nr_pages;
- bl->buf_ring = br;
- bl->flags |= IOBL_BUF_RING;
- bl->flags &= ~IOBL_MMAP;
- return 0;
-error_unpin:
- unpin_user_pages(pages, nr_pages);
- kvfree(pages);
- vunmap(br);
- return ret;
-}
-
-static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
- struct io_uring_buf_reg *reg,
- struct io_buffer_list *bl)
-{
- size_t ring_size;
-
- ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-
- bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
- if (IS_ERR(bl->buf_ring)) {
- bl->buf_ring = NULL;
- return -ENOMEM;
- }
-
- bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
- return 0;
-}
-
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
+ struct io_uring_region_desc rd;
+ struct io_uring_buf_ring *br;
+ unsigned long mmap_offset;
+ unsigned long ring_size;
int ret;
lockdep_assert_held(&ctx->uring_lock);
@@ -695,19 +625,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
- if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
- if (!reg.ring_addr)
- return -EFAULT;
- if (reg.ring_addr & ~PAGE_MASK)
- return -EINVAL;
- } else {
- if (reg.ring_addr)
- return -EINVAL;
- }
-
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
-
/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
@@ -723,22 +642,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM;
}
- if (!(reg.flags & IOU_PBUF_RING_MMAP))
- ret = io_pin_pbuf_ring(&reg, bl);
- else
- ret = io_alloc_pbuf_ring(ctx, &reg, bl);
+ mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
+ ring_size = flex_array_size(br, bufs, reg.ring_entries);
- if (!ret) {
- bl->nr_entries = reg.ring_entries;
- bl->mask = reg.ring_entries - 1;
- if (reg.flags & IOU_PBUF_RING_INC)
- bl->flags |= IOBL_INC;
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(ring_size);
+ if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+ rd.user_addr = reg.ring_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+ if (ret)
+ goto fail;
+ br = io_region_get_ptr(&bl->region);
- io_buffer_add_list(ctx, bl, reg.bgid);
- return 0;
+#ifdef SHM_COLOUR
+ /*
+ * On platforms that have specific aliasing requirements, SHM_COLOUR
+ * is set and we must guarantee that the kernel and user side align
+ * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
+ * the application mmap's the provided ring buffer. Fail the request
+ * if we, by chance, don't end up with aligned addresses. The app
+ * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+ * this transparently.
+ */
+ if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
+ ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
+ ret = -EINVAL;
+ goto fail;
}
+#endif
- kfree_rcu(free_bl, rcu);
+ bl->nr_entries = reg.ring_entries;
+ bl->mask = reg.ring_entries - 1;
+ bl->flags |= IOBL_BUF_RING;
+ bl->buf_ring = br;
+ if (reg.flags & IOU_PBUF_RING_INC)
+ bl->flags |= IOBL_INC;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+fail:
+ io_free_region(ctx, &bl->region);
+ kfree(free_bl);
return ret;
}
@@ -762,7 +707,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
- xa_erase(&ctx->io_bl_xa, bl->bgid);
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+
io_put_bl(ctx, bl);
return 0;
}
@@ -793,50 +740,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
- unsigned long bgid)
-{
- struct io_buffer_list *bl;
- bool ret;
-
- /*
- * We have to be a bit careful here - we're inside mmap and cannot grab
- * the uring_lock. This means the buffer_list could be simultaneously
- * going away, if someone is trying to be sneaky. Look it up under rcu
- * so we know it's not going away, and attempt to grab a reference to
- * it. If the ref is already zero, then fail the mapping. If successful,
- * the caller will call io_put_bl() to drop the reference at the
- * end. This may then safely free the buffer_list (and drop the pages)
- * at that point, vm_insert_pages() would've already grabbed the
- * necessary vma references.
- */
- rcu_read_lock();
- bl = xa_load(&ctx->io_bl_xa, bgid);
- /* must be a mmap'able buffer ring and have pages */
- ret = false;
- if (bl && bl->flags & IOBL_MMAP)
- ret = atomic_inc_not_zero(&bl->refs);
- rcu_read_unlock();
-
- if (ret)
- return bl;
-
- return ERR_PTR(-EINVAL);
-}
-
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
struct io_buffer_list *bl;
- int bgid, ret;
- bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- bl = io_pbuf_get_bl(ctx, bgid);
- if (IS_ERR(bl))
- return PTR_ERR(bl);
+ lockdep_assert_held(&ctx->mmap_lock);
- ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
- io_put_bl(ctx, bl);
- return ret;
+ bl = xa_load(&ctx->io_bl_xa, bgid);
+ if (!bl || !(bl->flags & IOBL_BUF_RING))
+ return NULL;
+ return &bl->region;
}
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..bd80c44c5af1 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -3,15 +3,13 @@
#define IOU_KBUF_H
#include <uapi/linux/io_uring.h>
+#include <linux/io_uring_types.h>
enum {
/* ring mapped provided buffers */
IOBL_BUF_RING = 1,
- /* ring mapped provided buffers, but mmap'ed by application */
- IOBL_MMAP = 2,
/* buffers are consumed incrementally rather than always fully */
- IOBL_INC = 4,
-
+ IOBL_INC = 2,
};
struct io_buffer_list {
@@ -21,11 +19,7 @@ struct io_buffer_list {
*/
union {
struct list_head buf_list;
- struct {
- struct page **buf_pages;
- struct io_uring_buf_ring *buf_ring;
- };
- struct rcu_head rcu;
+ struct io_uring_buf_ring *buf_ring;
};
__u16 bgid;
@@ -37,7 +31,7 @@ struct io_buffer_list {
__u16 flags;
- atomic_t refs;
+ struct io_mapped_region region;
};
struct io_buffer {
@@ -84,10 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
-void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
- unsigned long bgid);
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+ unsigned int bgid);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 57de9bccbf50..dda846190fbd 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -36,102 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
return page_address(page);
}
-static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
- gfp_t gfp)
-{
- void *ret;
- int i;
-
- for (i = 0; i < nr_pages; i++) {
- pages[i] = alloc_page(gfp);
- if (!pages[i])
- goto err;
- }
-
- ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (ret)
- return ret;
-err:
- while (i--)
- put_page(pages[i]);
- return ERR_PTR(-ENOMEM);
-}
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size)
-{
- gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
- struct page **pages;
- int nr_pages;
- void *ret;
-
- nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
- if (!IS_ERR(ret))
- goto done;
- if (nr_pages == 1)
- goto fail;
-
- ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
- if (!IS_ERR(ret)) {
-done:
- *out_pages = pages;
- *npages = nr_pages;
- return ret;
- }
-fail:
- kvfree(pages);
- *out_pages = NULL;
- *npages = 0;
- return ret;
-}
-
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
- bool put_pages)
-{
- bool do_vunmap = false;
-
- if (!ptr)
- return;
-
- if (put_pages && *npages) {
- struct page **to_free = *pages;
- int i;
-
- /*
- * Only did vmap for the non-compound multiple page case.
- * For the compound page, we just need to put the head.
- */
- if (PageCompound(to_free[0]))
- *npages = 1;
- else if (*npages > 1)
- do_vunmap = true;
- for (i = 0; i < *npages; i++)
- put_page(to_free[i]);
- }
- if (do_vunmap)
- vunmap(ptr);
- kvfree(*pages);
- *pages = NULL;
- *npages = 0;
-}
-
-void io_pages_free(struct page ***pages, int npages)
-{
- struct page **page_array = *pages;
-
- if (!page_array)
- return;
-
- unpin_user_pages(page_array, npages);
- kvfree(page_array);
- *pages = NULL;
-}
-
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{
unsigned long start, end, nr_pages;
@@ -174,64 +78,127 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
return ERR_PTR(ret);
}
-void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
- unsigned long uaddr, size_t size)
+enum {
+ /* memory was vmap'ed for the kernel, freeing the region vunmap's it */
+ IO_REGION_F_VMAP = 1,
+ /* memory is provided by user and pinned by the kernel */
+ IO_REGION_F_USER_PROVIDED = 2,
+ /* only the first page in the array is ref'ed */
+ IO_REGION_F_SINGLE_REF = 4,
+};
+
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
{
- struct page **page_array;
- unsigned int nr_pages;
- void *page_addr;
+ if (mr->pages) {
+ long nr_refs = mr->nr_pages;
- *npages = 0;
+ if (mr->flags & IO_REGION_F_SINGLE_REF)
+ nr_refs = 1;
- if (uaddr & (PAGE_SIZE - 1) || !size)
- return ERR_PTR(-EINVAL);
+ if (mr->flags & IO_REGION_F_USER_PROVIDED)
+ unpin_user_pages(mr->pages, nr_refs);
+ else
+ release_pages(mr->pages, nr_refs);
- nr_pages = 0;
- page_array = io_pin_pages(uaddr, size, &nr_pages);
- if (IS_ERR(page_array))
- return page_array;
+ kvfree(mr->pages);
+ }
+ if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr)
+ vunmap(mr->ptr);
+ if (mr->nr_pages && ctx->user)
+ __io_unaccount_mem(ctx->user, mr->nr_pages);
- page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL);
- if (page_addr) {
- *pages = page_array;
- *npages = nr_pages;
- return page_addr;
+ memset(mr, 0, sizeof(*mr));
+}
+
+static int io_region_init_ptr(struct io_mapped_region *mr)
+{
+ struct io_imu_folio_data ifd;
+ void *ptr;
+
+ if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) {
+ if (ifd.nr_folios == 1) {
+ mr->ptr = page_address(mr->pages[0]);
+ return 0;
+ }
}
+ ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
- io_pages_free(&page_array, nr_pages);
- return ERR_PTR(-ENOMEM);
+ mr->ptr = ptr;
+ mr->flags |= IO_REGION_F_VMAP;
+ return 0;
}
-void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+static int io_region_pin_pages(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg)
{
- if (mr->pages) {
- unpin_user_pages(mr->pages, mr->nr_pages);
- kvfree(mr->pages);
+ unsigned long size = mr->nr_pages << PAGE_SHIFT;
+ struct page **pages;
+ int nr_pages;
+
+ pages = io_pin_pages(reg->user_addr, size, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ if (WARN_ON_ONCE(nr_pages != mr->nr_pages))
+ return -EFAULT;
+
+ mr->pages = pages;
+ mr->flags |= IO_REGION_F_USER_PROVIDED;
+ return 0;
+}
+
+static int io_region_allocate_pages(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg,
+ unsigned long mmap_offset)
+{
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
+ unsigned long size = mr->nr_pages << PAGE_SHIFT;
+ unsigned long nr_allocated;
+ struct page **pages;
+ void *p;
+
+ pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
+ if (!pages)
+ return -ENOMEM;
+
+ p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp);
+ if (!IS_ERR(p)) {
+ mr->flags |= IO_REGION_F_SINGLE_REF;
+ goto done;
}
- if (mr->vmap_ptr)
- vunmap(mr->vmap_ptr);
- if (mr->nr_pages && ctx->user)
- __io_unaccount_mem(ctx->user, mr->nr_pages);
- memset(mr, 0, sizeof(*mr));
+ nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE,
+ mr->nr_pages, pages);
+ if (nr_allocated != mr->nr_pages) {
+ if (nr_allocated)
+ release_pages(pages, nr_allocated);
+ kvfree(pages);
+ return -ENOMEM;
+ }
+done:
+ reg->mmap_offset = mmap_offset;
+ mr->pages = pages;
+ return 0;
}
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
- struct io_uring_region_desc *reg)
+ struct io_uring_region_desc *reg,
+ unsigned long mmap_offset)
{
- int pages_accounted = 0;
- struct page **pages;
int nr_pages, ret;
- void *vptr;
u64 end;
- if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+ if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
return -EFAULT;
if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
return -EINVAL;
- if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+ if (reg->flags & ~IORING_MEM_REGION_TYPE_USER)
return -EINVAL;
- if (!reg->user_addr)
+ /* user_addr should be set IFF it's a user memory backed region */
+ if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr)
return -EFAULT;
if (!reg->size || reg->mmap_offset || reg->id)
return -EINVAL;
@@ -242,94 +209,120 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
if (check_add_overflow(reg->user_addr, reg->size, &end))
return -EOVERFLOW;
- pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
+ nr_pages = reg->size >> PAGE_SHIFT;
if (ctx->user) {
ret = __io_account_mem(ctx->user, nr_pages);
if (ret)
- goto out_free;
- pages_accounted = nr_pages;
+ return ret;
}
+ mr->nr_pages = nr_pages;
- vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (!vptr) {
- ret = -ENOMEM;
+ if (reg->flags & IORING_MEM_REGION_TYPE_USER)
+ ret = io_region_pin_pages(ctx, mr, reg);
+ else
+ ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset);
+ if (ret)
goto out_free;
- }
- mr->pages = pages;
- mr->vmap_ptr = vptr;
- mr->nr_pages = nr_pages;
+ ret = io_region_init_ptr(mr);
+ if (ret)
+ goto out_free;
return 0;
out_free:
- if (pages_accounted)
- __io_unaccount_mem(ctx->user, pages_accounted);
- io_pages_free(&pages, nr_pages);
+ io_free_region(ctx, mr);
return ret;
}
-static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
- size_t sz)
+int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg,
+ unsigned long mmap_offset)
+{
+ struct io_mapped_region tmp_mr;
+ int ret;
+
+ memcpy(&tmp_mr, mr, sizeof(tmp_mr));
+ ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset);
+ if (ret)
+ return ret;
+
+ /*
+ * Once published, mmap can find it while holding only ->mmap_lock
+ * and not ->uring_lock.
+ */
+ guard(mutex)(&ctx->mmap_lock);
+ memcpy(mr, &tmp_mr, sizeof(tmp_mr));
+ return 0;
+}
+
+static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
+ loff_t pgoff)
{
- struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT;
+ unsigned int bgid;
- switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
+ switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
- /* Don't allow mmap if the ring was setup without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- if (!ctx->rings)
- return ERR_PTR(-EFAULT);
- return ctx->rings;
+ return &ctx->ring_region;
case IORING_OFF_SQES:
- /* Don't allow mmap if the ring was setup without it */
- if (ctx->flags & IORING_SETUP_NO_MMAP)
- return ERR_PTR(-EINVAL);
- if (!ctx->sq_sqes)
- return ERR_PTR(-EFAULT);
- return ctx->sq_sqes;
- case IORING_OFF_PBUF_RING: {
- struct io_buffer_list *bl;
- unsigned int bgid;
- void *ptr;
-
+ return &ctx->sq_region;
+ case IORING_OFF_PBUF_RING:
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- bl = io_pbuf_get_bl(ctx, bgid);
- if (IS_ERR(bl))
- return bl;
- ptr = bl->buf_ring;
- io_put_bl(ctx, bl);
- return ptr;
- }
+ return io_pbuf_get_region(ctx, bgid);
+ case IORING_MAP_OFF_PARAM_REGION:
+ return &ctx->param_region;
}
+ return NULL;
+}
- return ERR_PTR(-EINVAL);
+static void *io_region_validate_mmap(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr)
+{
+ lockdep_assert_held(&ctx->mmap_lock);
+
+ if (!io_region_is_set(mr))
+ return ERR_PTR(-EINVAL);
+ if (mr->flags & IO_REGION_F_USER_PROVIDED)
+ return ERR_PTR(-EINVAL);
+
+ return io_region_get_ptr(mr);
}
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
- struct page **pages, int npages)
+static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
+ size_t sz)
{
- unsigned long nr_pages = npages;
+ struct io_ring_ctx *ctx = file->private_data;
+ struct io_mapped_region *region;
- vm_flags_set(vma, VM_DONTEXPAND);
- return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+ region = io_mmap_get_region(ctx, pgoff);
+ if (!region)
+ return ERR_PTR(-EINVAL);
+ return io_region_validate_mmap(ctx, region);
}
#ifdef CONFIG_MMU
+static int io_region_mmap(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ struct vm_area_struct *vma,
+ unsigned max_pages)
+{
+ unsigned long nr_pages = min(mr->nr_pages, max_pages);
+
+ vm_flags_set(vma, VM_DONTEXPAND);
+ return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages);
+}
+
__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT;
- unsigned int npages;
+ unsigned int page_limit = UINT_MAX;
+ struct io_mapped_region *region;
void *ptr;
- guard(mutex)(&ctx->resize_lock);
+ guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
@@ -338,16 +331,12 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
switch (offset & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING:
- npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT);
- return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages);
- case IORING_OFF_SQES:
- return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
- ctx->n_sqe_pages);
- case IORING_OFF_PBUF_RING:
- return io_pbuf_mmap(file, vma);
+ page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ break;
}
- return -EINVAL;
+ region = io_mmap_get_region(ctx, vma->vm_pgoff);
+ return io_region_mmap(ctx, region, vma, page_limit);
}
unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -365,7 +354,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr)
return -EINVAL;
- guard(mutex)(&ctx->resize_lock);
+ guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
if (IS_ERR(ptr))
@@ -415,7 +404,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
struct io_ring_ctx *ctx = file->private_data;
void *ptr;
- guard(mutex)(&ctx->resize_lock);
+ guard(mutex)(&ctx->mmap_lock);
ptr = io_uring_validate_mmap_request(file, pgoff, len);
if (IS_ERR(ptr))
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index f361a635b6c7..c898dcba2b4e 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -1,18 +1,9 @@
#ifndef IO_URING_MEMMAP_H
#define IO_URING_MEMMAP_H
-struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
-void io_pages_free(struct page ***pages, int npages);
-int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
- struct page **pages, int npages);
-
-void *io_pages_map(struct page ***out_pages, unsigned short *npages,
- size_t size);
-void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
- bool put_pages);
+#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
-void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
- unsigned long uaddr, size_t size);
+struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
#ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file);
@@ -24,11 +15,17 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
- struct io_uring_region_desc *reg);
+ struct io_uring_region_desc *reg,
+ unsigned long mmap_offset);
+
+int io_create_region_mmap_safe(struct io_ring_ctx *ctx,
+ struct io_mapped_region *mr,
+ struct io_uring_region_desc *reg,
+ unsigned long mmap_offset);
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
{
- return mr->vmap_ptr;
+ return mr->ptr;
}
static inline bool io_region_is_set(struct io_mapped_region *mr)
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 333c220d322a..bd3cd78d2dba 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe)
return __io_msg_ring_data(fd_file(f)->private_data,
&io_msg, IO_URING_F_UNLOCKED);
}
-
-void io_msg_cache_free(const void *entry)
-{
- struct io_kiocb *req = (struct io_kiocb *) entry;
-
- kmem_cache_free(req_cachep, req);
-}
diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h
index 38e7f8f0c944..32236d2fb778 100644
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe);
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req);
-void io_msg_cache_free(const void *entry);
diff --git a/io_uring/net.c b/io_uring/net.c
index c6cd38cc5dc4..85f55fbc25c9 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
}
}
+static void io_msg_async_data_init(void *obj)
+{
+ struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj;
+
+ hdr->free_iov = NULL;
+ hdr->free_iov_nr = 0;
+}
+
static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_async_msghdr *hdr;
- hdr = io_alloc_cache_get(&ctx->netmsg_cache);
- if (hdr) {
- if (hdr->free_iov) {
- kasan_mempool_unpoison_object(hdr->free_iov,
- hdr->free_iov_nr * sizeof(struct iovec));
- req->flags |= REQ_F_NEED_CLEANUP;
- }
- req->flags |= REQ_F_ASYNC_DATA;
- req->async_data = hdr;
- return hdr;
- }
+ hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req,
+ io_msg_async_data_init);
+ if (!hdr)
+ return NULL;
- if (!io_alloc_async_data(req)) {
- hdr = req->async_data;
- hdr->free_iov_nr = 0;
- hdr->free_iov = NULL;
- return hdr;
+ /* If the async data was cached, we might have an iov cached inside. */
+ if (hdr->free_iov) {
+ kasan_mempool_unpoison_object(hdr->free_iov,
+ hdr->free_iov_nr * sizeof(struct iovec));
+ req->flags |= REQ_F_NEED_CLEANUP;
}
- return NULL;
+ return hdr;
}
/* assign new iovec to kmsg, if we need to */
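The io_uring_alloc_async_data() helper replacing the open-coded dance above is declared in io_uring.h (also touched by this series); its body is not part of this hunk. A plausible shape, inferred from the call sites here and from the io_cache_alloc() call in poll.c below:

static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
					      struct io_kiocb *req,
					      void (*init_once)(void *obj))
{
	/* cache first; init_once only runs for cold (freshly allocated) objects */
	req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
	if (req->async_data)
		req->flags |= REQ_F_ASYNC_DATA;
	return req->async_data;
}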
diff --git a/io_uring/poll.c b/io_uring/poll.c
index bced9edd5233..cc01c40b43d3 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
if (req->flags & REQ_F_POLLED) {
apoll = req->apoll;
kfree(apoll->double_poll);
- } else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
- apoll = io_alloc_cache_get(&ctx->apoll_cache);
- if (!apoll)
- goto alloc_apoll;
- apoll->poll.retries = APOLL_MAX_RETRY;
} else {
-alloc_apoll:
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
+ if (!(issue_flags & IO_URING_F_UNLOCKED))
+ apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL);
+ else
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (!apoll)
return NULL;
apoll->poll.retries = APOLL_MAX_RETRY;
}
diff --git a/io_uring/register.c b/io_uring/register.c
index 371aec87e078..05025047d1da 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -104,21 +104,13 @@ static int io_register_personality(struct io_ring_ctx *ctx)
return id;
}
-static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
- void __user *arg, unsigned int nr_args)
+static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
+ struct io_restriction *restrictions)
{
struct io_uring_restriction *res;
size_t size;
int i, ret;
- /* Restrictions allowed only if rings started disabled */
- if (!(ctx->flags & IORING_SETUP_R_DISABLED))
- return -EBADFD;
-
- /* We allow only a single restrictions registration */
- if (ctx->restrictions.registered)
- return -EBUSY;
-
if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
return -EINVAL;
@@ -130,47 +122,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
if (IS_ERR(res))
return PTR_ERR(res);
- ret = 0;
+ ret = -EINVAL;
for (i = 0; i < nr_args; i++) {
switch (res[i].opcode) {
case IORING_RESTRICTION_REGISTER_OP:
- if (res[i].register_op >= IORING_REGISTER_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].register_op,
- ctx->restrictions.register_op);
+ if (res[i].register_op >= IORING_REGISTER_LAST)
+ goto err;
+ __set_bit(res[i].register_op, restrictions->register_op);
break;
case IORING_RESTRICTION_SQE_OP:
- if (res[i].sqe_op >= IORING_OP_LAST) {
- ret = -EINVAL;
- goto out;
- }
-
- __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
+ if (res[i].sqe_op >= IORING_OP_LAST)
+ goto err;
+ __set_bit(res[i].sqe_op, restrictions->sqe_op);
break;
case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
- ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
+ restrictions->sqe_flags_allowed = res[i].sqe_flags;
break;
case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
- ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
+ restrictions->sqe_flags_required = res[i].sqe_flags;
break;
default:
- ret = -EINVAL;
- goto out;
+ goto err;
}
}
-out:
+ ret = 0;
+
+err:
+ kfree(res);
+ return ret;
+}
+
+static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
+ void __user *arg, unsigned int nr_args)
+{
+ int ret;
+
+ /* Restrictions allowed only if rings started disabled */
+ if (!(ctx->flags & IORING_SETUP_R_DISABLED))
+ return -EBADFD;
+
+ /* We allow only a single restrictions registration */
+ if (ctx->restrictions.registered)
+ return -EBUSY;
+
+ ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
/* Reset all restrictions if an error happened */
if (ret != 0)
memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
else
ctx->restrictions.registered = true;
-
- kfree(res);
return ret;
}
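The userspace contract is untouched by this refactor: restrictions can only be registered once, on a ring created with IORING_SETUP_R_DISABLED, before the ring is enabled. A sketch using the raw register syscall (liburing wraps the same thing as io_uring_register_restrictions()):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/* ring_fd was created with IORING_SETUP_R_DISABLED */
static int restrict_to_rw(int ring_fd)
{
	struct io_uring_restriction res[] = {
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_READ  },
		{ .opcode = IORING_RESTRICTION_SQE_OP, .sqe_op = IORING_OP_WRITE },
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESTRICTIONS, res, 2) < 0)
		return -1;
	/* a second registration would fail with -EBUSY; now enable the ring */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_ENABLE_RINGS, NULL, 0);
}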
@@ -367,28 +369,19 @@ static int io_register_clock(struct io_ring_ctx *ctx,
* either mapping or freeing.
*/
struct io_ring_ctx_rings {
- unsigned short n_ring_pages;
- unsigned short n_sqe_pages;
- struct page **ring_pages;
- struct page **sqe_pages;
- struct io_uring_sqe *sq_sqes;
struct io_rings *rings;
+ struct io_uring_sqe *sq_sqes;
+
+ struct io_mapped_region sq_region;
+ struct io_mapped_region ring_region;
};
-static void io_register_free_rings(struct io_uring_params *p,
+static void io_register_free_rings(struct io_ring_ctx *ctx,
+ struct io_uring_params *p,
struct io_ring_ctx_rings *r)
{
- if (!(p->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
- true);
- io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
- true);
- } else {
- io_pages_free(&r->ring_pages, r->n_ring_pages);
- io_pages_free(&r->sqe_pages, r->n_sqe_pages);
- vunmap(r->rings);
- vunmap(r->sq_sqes);
- }
+ io_free_region(ctx, &r->sq_region);
+ io_free_region(ctx, &r->ring_region);
}
#define swap_old(ctx, o, n, field) \
@@ -403,11 +396,11 @@ static void io_register_free_rings(struct io_uring_params *p,
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
+ struct io_uring_region_desc rd;
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
size_t size, sq_array_offset;
unsigned i, tail, old_head;
struct io_uring_params p;
- void *ptr;
int ret;
/* for single issuer, must be owner resizing */
@@ -441,13 +434,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (size == SIZE_MAX)
return -EOVERFLOW;
- if (!(p.flags & IORING_SETUP_NO_MMAP))
- n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
- else
- n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
- p.cq_off.user_addr, size);
- if (IS_ERR(n.rings))
- return PTR_ERR(n.rings);
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (p.flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p.cq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+ if (ret) {
+ io_register_free_rings(ctx, &p, &n);
+ return ret;
+ }
+ n.rings = io_region_get_ptr(&n.ring_region);
/*
* At this point n.rings is shared with userspace, just like o.rings
@@ -463,7 +461,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
if (copy_to_user(arg, &p, sizeof(p))) {
- io_register_free_rings(&p, &n);
+ io_register_free_rings(ctx, &p, &n);
return -EFAULT;
}
@@ -472,20 +470,22 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
else
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
if (size == SIZE_MAX) {
- io_register_free_rings(&p, &n);
+ io_register_free_rings(ctx, &p, &n);
return -EOVERFLOW;
}
- if (!(p.flags & IORING_SETUP_NO_MMAP))
- ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
- else
- ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
- p.sq_off.user_addr,
- size);
- if (IS_ERR(ptr)) {
- io_register_free_rings(&p, &n);
- return PTR_ERR(ptr);
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (p.flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p.sq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
+ if (ret) {
+ io_register_free_rings(ctx, &p, &n);
+ return ret;
}
+ n.sq_sqes = io_region_get_ptr(&n.sq_region);
/*
* If using SQPOLL, park the thread
@@ -497,15 +497,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
}
/*
- * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+ * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
* existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
- * ctx->resize_lock as well. Likewise, hold the completion lock over the
+ * ctx->mmap_lock as well. Likewise, hold the completion lock over the
* duration of the actual swap.
*/
- mutex_lock(&ctx->resize_lock);
+ mutex_lock(&ctx->mmap_lock);
spin_lock(&ctx->completion_lock);
o.rings = ctx->rings;
ctx->rings = NULL;
@@ -516,7 +516,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
* Now copy SQ and CQ entries, if any. If either of the destination
* rings can't hold what is already there, then fail the operation.
*/
- n.sq_sqes = ptr;
tail = READ_ONCE(o.rings->sq.tail);
old_head = READ_ONCE(o.rings->sq.head);
if (tail - old_head > p.sq_entries)
@@ -527,8 +526,8 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
}
- WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
- WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
+ WRITE_ONCE(n.rings->sq.head, old_head);
+ WRITE_ONCE(n.rings->sq.tail, tail);
tail = READ_ONCE(o.rings->cq.tail);
old_head = READ_ONCE(o.rings->cq.head);
@@ -547,8 +546,8 @@ overflow:
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
}
- WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
- WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
+ WRITE_ONCE(n.rings->cq.head, old_head);
+ WRITE_ONCE(n.rings->cq.tail, tail);
/* invalidate cached cqe refill */
ctx->cqe_cached = ctx->cqe_sentinel = NULL;
@@ -566,16 +565,14 @@ overflow:
ctx->rings = n.rings;
ctx->sq_sqes = n.sq_sqes;
- swap_old(ctx, o, n, n_ring_pages);
- swap_old(ctx, o, n, n_sqe_pages);
- swap_old(ctx, o, n, ring_pages);
- swap_old(ctx, o, n, sqe_pages);
+ swap_old(ctx, o, n, ring_region);
+ swap_old(ctx, o, n, sq_region);
to_free = &o;
ret = 0;
out:
spin_unlock(&ctx->completion_lock);
- mutex_unlock(&ctx->resize_lock);
- io_register_free_rings(&p, to_free);
+ mutex_unlock(&ctx->mmap_lock);
+ io_register_free_rings(ctx, &p, to_free);
if (ctx->sq_data)
io_sq_thread_unpark(ctx->sq_data);
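For orientation, the userspace trigger for this whole path is the resize-rings register opcode (liburing: io_uring_resize_rings()). A sketch under the default, kernel-allocated setup; the kernel consumes the entry counts and, for IORING_SETUP_NO_MMAP rings, the user_addr fields:

	struct io_uring_params p = {
		.sq_entries = 256,
		.cq_entries = 512,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_RESIZE_RINGS, &p, 1) < 0)
		return -1;
	/*
	 * The old mappings were invalidated under ctx->mmap_lock; userspace
	 * must re-mmap() the ring fd at IORING_OFF_SQ_RING/IORING_OFF_SQES
	 * using the offsets returned in p.
	 */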
@@ -598,7 +595,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
rd_uptr = u64_to_user_ptr(reg.region_uptr);
if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
return -EFAULT;
-
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
@@ -613,7 +609,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
!(ctx->flags & IORING_SETUP_R_DISABLED))
return -EINVAL;
- ret = io_create_region(ctx, &ctx->param_region, &rd);
+ ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
+ IORING_MAP_OFF_PARAM_REGION);
if (ret)
return ret;
if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 69937d0c94f9..e32ac5853391 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
- struct io_imu_folio_data *data, int nr_folios)
+static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
+ int nr_folios = data->nr_folios;
	/* Store head pages only */
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
@@ -667,27 +668,21 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
return true;
}
-static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
- struct io_imu_folio_data *data)
+bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
+ struct io_imu_folio_data *data)
{
- struct page **page_array = *pages;
struct folio *folio = page_folio(page_array[0]);
unsigned int count = 1, nr_folios = 1;
int i;
- if (*nr_pages <= 1)
- return false;
-
data->nr_pages_mid = folio_nr_pages(folio);
- if (data->nr_pages_mid == 1)
- return false;
-
data->folio_shift = folio_shift(folio);
+
/*
* Check if pages are contiguous inside a folio, and all folios have
* the same page count except for the head and tail.
*/
- for (i = 1; i < *nr_pages; i++) {
+ for (i = 1; i < nr_pages; i++) {
if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++;
@@ -715,7 +710,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
if (nr_folios == 1)
data->nr_pages_head = count;
- return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+ data->nr_folios = nr_folios;
+ return true;
}
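A worked example of the layout data io_check_coalesce_buffer() now hands back, assuming a folio-aligned 4 MB buffer backed by two 2 MB folios on a 4 KB base-page system:

	/*
	 *   nr_pages      = 1024	pinned base pages
	 *   nr_folios     = 2
	 *   folio_shift   = 21		2 MB folios
	 *   nr_pages_mid  = 512	base pages per full folio
	 *   nr_pages_head = 512	head folio fully included here
	 *
	 * The check only fills this in; the caller decides whether
	 * io_coalesce_buffer() is worthwhile. Here it would shrink the page
	 * array from 1024 entries to 2 folio head pages, so the imu needs
	 * 2 bvecs instead of 1024.
	 */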
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
@@ -729,7 +725,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
size_t size;
int ret, nr_pages, i;
struct io_imu_folio_data data;
- bool coalesced;
+ bool coalesced = false;
if (!iov->iov_base)
return NULL;
@@ -749,7 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
}
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
- coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
+ if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
+ if (data.nr_pages_mid != 1)
+ coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
+ }
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
@@ -883,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
+ iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);
if (offset) {
/*
@@ -905,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
- iter->count -= offset;
iter->iov_offset = offset;
} else {
unsigned long seg_skip;
@@ -916,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
}
}
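The two hunks above fix how fixed-buffer iterators are seeded: the iterator now starts with exactly len bytes and only the bvec position is advanced, instead of starting with offset + len and correcting iter->count afterwards. A worked example, assuming 4 KB bvecs (folio_shift == 12):

	/*
	 * buf_addr = imu->ubuf + 6144, len = 1024  =>  offset = 6144
	 *
	 * Old: iov_iter_bvec(..., offset + len = 7168), then subtract the
	 *      skipped bytes from iter->count again (the removed lines above).
	 * New: iov_iter_bvec(..., len = 1024), then adjust position only:
	 *      seg_skip         = 6144 >> 12 = 1	(whole bvecs skipped)
	 *      iter->iov_offset = 6144 & 4095 = 2048
	 *
	 * Both yield 1024 bytes starting 2048 bytes into the second bvec,
	 * but the new form never has to touch iter->count.
	 */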
@@ -931,6 +928,13 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
int i, ret, off, nr;
unsigned int nbufs;
+ /*
+ * Accounting state is shared between the two rings; that only works if
+ * both rings are accounted towards the same counters.
+ */
+ if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account)
+ return -EINVAL;
+
/* if offsets are given, must have nr specified too */
if (!arg->nr && (arg->dst_off || arg->src_off))
return -EINVAL;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 7a4668deaa1a..c8b093584461 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -40,6 +40,7 @@ struct io_imu_folio_data {
/* For non-head/tail folios, has to be fully included */
unsigned int nr_pages_mid;
unsigned int folio_shift;
+ unsigned int nr_folios;
};
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
@@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
+bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
+ struct io_imu_folio_data *data);
+
static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
int index)
{
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 29bb3010f9c0..a9a2733be842 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -202,45 +202,40 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
* mean that the underlying data can be gone at any time. But that
	 * should be fixed separately, and then this check could be killed.
*/
- if (!(req->flags & REQ_F_REFCOUNT)) {
+ if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) {
req->flags &= ~REQ_F_NEED_CLEANUP;
io_rw_recycle(req, issue_flags);
}
}
+static void io_rw_async_data_init(void *obj)
+{
+ struct io_async_rw *rw = (struct io_async_rw *)obj;
+
+ rw->free_iovec = NULL;
+ rw->bytes_done = 0;
+}
+
static int io_rw_alloc_async(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_async_rw *rw;
- rw = io_alloc_cache_get(&ctx->rw_cache);
- if (rw) {
- if (rw->free_iovec) {
- kasan_mempool_unpoison_object(rw->free_iovec,
- rw->free_iov_nr * sizeof(struct iovec));
- req->flags |= REQ_F_NEED_CLEANUP;
- }
- req->flags |= REQ_F_ASYNC_DATA;
- req->async_data = rw;
- goto done;
- }
-
- if (!io_alloc_async_data(req)) {
- rw = req->async_data;
- rw->free_iovec = NULL;
- rw->free_iov_nr = 0;
-done:
- rw->bytes_done = 0;
- return 0;
+ rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init);
+ if (!rw)
+ return -ENOMEM;
+ if (rw->free_iovec) {
+ kasan_mempool_unpoison_object(rw->free_iovec,
+ rw->free_iov_nr * sizeof(struct iovec));
+ req->flags |= REQ_F_NEED_CLEANUP;
}
-
- return -ENOMEM;
+ rw->bytes_done = 0;
+ return 0;
}
static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
{
struct io_async_rw *rw;
- int ret;
if (io_rw_alloc_async(req))
return -ENOMEM;
@@ -249,12 +244,48 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
return 0;
rw = req->async_data;
- ret = io_import_iovec(ddir, req, rw, 0);
+ return io_import_iovec(ddir, req, rw, 0);
+}
+
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+ io->meta_state.seed = io->meta.seed;
+ iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb)
+{
+ if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+ io->meta.seed = io->meta_state.seed;
+ iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+ }
+}
+
+static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir,
+ u64 attr_ptr, u64 attr_type_mask)
+{
+ struct io_uring_attr_pi pi_attr;
+ struct io_async_rw *io;
+ int ret;
+
+ if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr),
+ sizeof(pi_attr)))
+ return -EFAULT;
+
+ if (pi_attr.rsvd)
+ return -EINVAL;
+
+ io = req->async_data;
+ io->meta.flags = pi_attr.flags;
+ io->meta.app_tag = pi_attr.app_tag;
+ io->meta.seed = pi_attr.seed;
+ ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr),
+ pi_attr.len, &io->meta.iter);
if (unlikely(ret < 0))
return ret;
-
- iov_iter_save_state(&rw->iter, &rw->iter_state);
- return 0;
+ req->flags |= REQ_F_HAS_METADATA;
+ io_meta_save_state(io);
+ return ret;
}
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -262,6 +293,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned ioprio;
+ u64 attr_type_mask;
int ret;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +311,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
+ rw->kiocb.ki_flags = 0;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
- return io_prep_rw_setup(req, ddir, do_import);
+ ret = io_prep_rw_setup(req, ddir, do_import);
+
+ if (unlikely(ret))
+ return ret;
+
+ attr_type_mask = READ_ONCE(sqe->attr_type_mask);
+ if (attr_type_mask) {
+ u64 attr_ptr;
+
+ /* only PI attribute is supported currently */
+ if (attr_type_mask != IORING_RW_ATTR_FLAG_PI)
+ return -EINVAL;
+
+ attr_ptr = READ_ONCE(sqe->attr_ptr);
+ ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask);
+ }
+ return ret;
}
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
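On the wire, the PI attribute hangs off two new SQE fields from this series, attr_type_mask and attr_ptr. A userspace sketch of a write carrying protection information (constant names per the uapi headers in this series; md_buf, md_len and lba are placeholders):

	struct io_uring_attr_pi pi = {
		.flags   = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG,
		.len     = md_len,			/* bytes of PI tuples */
		.addr    = (__u64)(uintptr_t)md_buf,	/* metadata buffer */
		.seed    = lba,				/* initial reference tag */
	};

	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;
	sqe->attr_ptr       = (__u64)(uintptr_t)&pi;
	/*
	 * The data file must be opened O_DIRECT: io_rw_init_file() below
	 * rejects buffered IO because the meta state shares a union with wpq.
	 */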
@@ -385,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
void io_readv_writev_cleanup(struct io_kiocb *req)
{
- io_rw_iovec_free(req->async_data);
+ lockdep_assert_held(&req->ctx->uring_lock);
+ io_rw_recycle(req, 0);
}
static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
@@ -405,17 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
return NULL;
}
-#ifdef CONFIG_BLOCK
-static void io_resubmit_prep(struct io_kiocb *req)
-{
- struct io_async_rw *io = req->async_data;
-
- iov_iter_restore(&io->iter, &io->iter_state);
-}
-
static bool io_rw_should_reissue(struct io_kiocb *req)
{
+#ifdef CONFIG_BLOCK
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
umode_t mode = file_inode(req->file)->i_mode;
+ struct io_async_rw *io = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
if (!S_ISBLK(mode) && !S_ISREG(mode))
@@ -430,23 +475,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
*/
if (percpu_ref_is_dying(&ctx->refs))
return false;
- /*
- * Play it safe and assume not safe to re-import and reissue if we're
- * not in the original thread group (or in task context).
- */
- if (!same_thread_group(req->tctx->task, current) || !in_task())
- return false;
+
+ io_meta_restore(io, &rw->kiocb);
+ iov_iter_restore(&io->iter, &io->iter_state);
return true;
-}
#else
-static void io_resubmit_prep(struct io_kiocb *req)
-{
-}
-static bool io_rw_should_reissue(struct io_kiocb *req)
-{
return false;
-}
#endif
+}
static void io_req_end_write(struct io_kiocb *req)
{
@@ -473,22 +509,16 @@ static void io_req_io_end(struct io_kiocb *req)
}
}
-static bool __io_complete_rw_common(struct io_kiocb *req, long res)
+static void __io_complete_rw_common(struct io_kiocb *req, long res)
{
- if (unlikely(res != req->cqe.res)) {
- if (res == -EAGAIN && io_rw_should_reissue(req)) {
- /*
- * Reissue will start accounting again, finish the
- * current cycle.
- */
- io_req_io_end(req);
- req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
- return true;
- }
+ if (res == req->cqe.res)
+ return;
+ if (res == -EAGAIN && io_rw_should_reissue(req)) {
+ req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
+ } else {
req_set_fail(req);
req->cqe.res = res;
}
- return false;
}
static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
@@ -531,8 +561,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
- if (__io_complete_rw_common(req, res))
- return;
+ __io_complete_rw_common(req, res);
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
req->io_task_work.func = io_req_rw_complete;
@@ -594,26 +623,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
- if (!__io_complete_rw_common(req, ret)) {
- /*
- * Safe to call io_end from here as we're inline
- * from the submission path.
- */
- io_req_io_end(req);
- io_req_set_res(req, final_ret,
- io_put_kbuf(req, ret, issue_flags));
- io_req_rw_cleanup(req, issue_flags);
- return IOU_OK;
- }
+ __io_complete_rw_common(req, ret);
+ /*
+ * Safe to call io_end from here as we're inline
+ * from the submission path.
+ */
+ io_req_io_end(req);
+ io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags));
+ io_req_rw_cleanup(req, issue_flags);
+ return IOU_OK;
} else {
io_rw_done(&rw->kiocb, ret);
}
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- io_resubmit_prep(req);
- return -EAGAIN;
- }
return IOU_ISSUE_SKIP_COMPLETE;
}
@@ -736,8 +758,11 @@ static bool io_rw_should_retry(struct io_kiocb *req)
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct kiocb *kiocb = &rw->kiocb;
- /* never retry for NOWAIT, we just complete with -EAGAIN */
- if (req->flags & REQ_F_NOWAIT)
+ /*
+	 * Never retry for NOWAIT or a request with metadata; we just complete
+ * with -EAGAIN.
+ */
+ if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA))
return false;
/* Only for buffered IO */
@@ -828,6 +853,19 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_complete = io_complete_rw;
}
+ if (req->flags & REQ_F_HAS_METADATA) {
+ struct io_async_rw *io = req->async_data;
+
+ /*
+	 * In io_async_rw the meta fields share a union with wpq, which is
+	 * used for buffered IO, so fail buffered requests here.
+ */
+ if (!(req->file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+ kiocb->ki_flags |= IOCB_HAS_METADATA;
+ kiocb->private = &io->meta;
+ }
+
return 0;
}
@@ -876,8 +914,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EOPNOTSUPP && force_nonblock)
ret = -EAGAIN;
- if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
- req->flags &= ~REQ_F_REISSUE;
+ if (ret == -EAGAIN) {
/* If we can poll, just do that. */
if (io_file_can_poll(req))
return -EAGAIN;
@@ -902,6 +939,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
+ io_meta_restore(io, kiocb);
do {
/*
@@ -1087,11 +1125,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
else
ret2 = -EINVAL;
- if (req->flags & REQ_F_REISSUE) {
- req->flags &= ~REQ_F_REISSUE;
- ret2 = -EAGAIN;
- }
-
/*
* Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
* retry them without IOCB_NOWAIT.
@@ -1127,6 +1160,7 @@ done:
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
+ io_meta_restore(io, kiocb);
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
#include <linux/pagemap.h>
+struct io_meta_state {
+ u32 seed;
+ struct iov_iter_state iter_meta;
+};
+
struct io_async_rw {
size_t bytes_done;
struct iov_iter iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
struct iovec fast_iov;
struct iovec *free_iovec;
int free_iov_nr;
- struct wait_page_queue wpq;
+ /* wpq is for buffered io, while meta fields are used with direct io */
+ union {
+ struct wait_page_queue wpq;
+ struct {
+ struct uio_meta meta;
+ struct io_meta_state meta_state;
+ };
+ };
};
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index e9cec9e4dc2f..2bd7e0a317bb 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -544,10 +544,9 @@ static int __io_timeout_prep(struct io_kiocb *req,
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
- if (io_alloc_async_data(req))
+ data = io_uring_alloc_async_data_nocache(req);
+ if (!data)
return -ENOMEM;
-
- data = req->async_data;
data->req = req;
data->flags = flags;
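io_uring_alloc_async_data_nocache() is the cache-less sibling of the helper used in net.c and rw.c; it also lands in io_uring.h and is not shown in this diff. Its presumed shape, inferred from the call sites here and in waitid.c:

static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req)
{
	const struct io_issue_def *def = &io_issue_defs[req->opcode];

	/* plain allocation sized from the opcode's async_size, no per-ctx cache */
	WARN_ON_ONCE(!def->async_size);
	req->async_data = kmalloc(def->async_size, GFP_KERNEL);
	if (req->async_data)
		req->flags |= REQ_F_ASYNC_DATA;
	return req->async_data;
}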
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ce7726a04883..fc94c465a985 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -16,26 +16,6 @@
#include "rsrc.h"
#include "uring_cmd.h"
-static struct io_uring_cmd_data *io_uring_async_get(struct io_kiocb *req)
-{
- struct io_ring_ctx *ctx = req->ctx;
- struct io_uring_cmd_data *cache;
-
- cache = io_alloc_cache_get(&ctx->uring_cache);
- if (cache) {
- cache->op_data = NULL;
- req->flags |= REQ_F_ASYNC_DATA;
- req->async_data = cache;
- return cache;
- }
- if (!io_alloc_async_data(req)) {
- cache = req->async_data;
- cache->op_data = NULL;
- return cache;
- }
- return NULL;
-}
-
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
@@ -130,7 +110,7 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
- if (current->flags & (PF_EXITING | PF_KTHREAD))
+ if (io_should_terminate_tw())
flags |= IO_URING_F_TASK_DEAD;
	/* task_work executor checks the deferred list for completions */
@@ -188,14 +168,22 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
+static void io_uring_cmd_init_once(void *obj)
+{
+ struct io_uring_cmd_data *data = obj;
+
+ data->op_data = NULL;
+}
+
static int io_uring_cmd_prep_setup(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
struct io_uring_cmd_data *cache;
- cache = io_uring_async_get(req);
- if (unlikely(!cache))
+ cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
+ io_uring_cmd_init_once);
+ if (!cache)
return -ENOMEM;
if (!(req->flags & REQ_F_FORCE_ASYNC)) {
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index daef5dd644f0..6778c0ee76c4 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
struct io_waitid_async *iwa;
int ret;
- if (io_alloc_async_data(req))
+ iwa = io_uring_alloc_async_data_nocache(req);
+ if (!iwa)
return -ENOMEM;
- iwa = req->async_data;
iwa->req = req;
ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info,