summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/Kconfig19
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/brd.c225
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rnbd/rnbd-srv.c7
-rw-r--r--drivers/block/ublk_drv.c569
-rw-r--r--drivers/block/virtio_blk.c4
-rw-r--r--drivers/block/zloop.c1385
8 files changed, 1892 insertions, 320 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index e48b24be45ee..0f70e2374e7f 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -407,4 +407,23 @@ config BLKDEV_UBLK_LEGACY_OPCODES
source "drivers/block/rnbd/Kconfig"
+config BLK_DEV_ZONED_LOOP
+ tristate "Zoned loopback device support"
+ depends on BLK_DEV_ZONED
+ help
+ Saying Y here will allow you to create a zoned block device using
+ regular files for zones (one file per zone). This is useful to test
+ file systems, device mapper and applications that support zoned block
+ devices. No user utility is needed to create a zoned loop device; a
+ zoned loop device can be created (or re-started) using a command
+ like:
+
+ echo "add id=0,zone_size_mb=256,capacity_mb=16384,conv_zones=11" > \
+ /dev/zloop-control
+
+ See Documentation/admin-guide/blockdev/zoned_loop.rst for usage
+ details.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1105a2d4fdcb..097707aca725 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/
obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/
obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o
+obj-$(CONFIG_BLK_DEV_ZONED_LOOP) += zloop.o
swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 292f127cae0a..b1be6c510372 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -54,32 +54,33 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
/*
* Insert a new page for a given sector, if one does not already exist.
*/
-static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
+ blk_opf_t opf)
+ __releases(rcu)
+ __acquires(rcu)
{
- pgoff_t idx = sector >> PAGE_SECTORS_SHIFT;
- struct page *page;
- int ret = 0;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- return 0;
+ gfp_t gfp = (opf & REQ_NOWAIT) ? GFP_NOWAIT : GFP_NOIO;
+ struct page *page, *ret;
+ rcu_read_unlock();
page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
+ rcu_read_lock();
if (!page)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
xa_lock(&brd->brd_pages);
- ret = __xa_insert(&brd->brd_pages, idx, page, gfp);
- if (!ret)
- brd->brd_nr_pages++;
- xa_unlock(&brd->brd_pages);
-
- if (ret < 0) {
+ ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
+ page, gfp);
+ if (ret) {
+ xa_unlock(&brd->brd_pages);
__free_page(page);
- if (ret == -EBUSY)
- ret = 0;
+ if (xa_is_err(ret))
+ return ERR_PTR(xa_err(ret));
+ return ret;
}
- return ret;
+ brd->brd_nr_pages++;
+ xa_unlock(&brd->brd_pages);
+ return page;
}
/*
@@ -100,143 +101,77 @@ static void brd_free_pages(struct brd_device *brd)
}
/*
- * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ * Process a single segment. The segment is capped to not cross page boundaries
+ * in both the bio and the brd backing memory.
*/
-static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
- gfp_t gfp)
-{
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
- int ret;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- ret = brd_insert_page(brd, sector, gfp);
- if (ret)
- return ret;
- if (copy < n) {
- sector += copy >> SECTOR_SHIFT;
- ret = brd_insert_page(brd, sector, gfp);
- }
- return ret;
-}
-
-/*
- * Copy n bytes from src to the brd starting at sector. Does not sleep.
- */
-static void copy_to_brd(struct brd_device *brd, const void *src,
- sector_t sector, size_t n)
+static bool brd_rw_bvec(struct brd_device *brd, struct bio *bio)
{
+ struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+ sector_t sector = bio->bi_iter.bi_sector;
+ u32 offset = (sector & (PAGE_SECTORS - 1)) << SECTOR_SHIFT;
+ blk_opf_t opf = bio->bi_opf;
struct page *page;
- void *dst;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
+ void *kaddr;
- copy = min_t(size_t, n, PAGE_SIZE - offset);
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst + offset, src, copy);
- kunmap_atomic(dst);
-
- if (copy < n) {
- src += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- BUG_ON(!page);
-
- dst = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(dst);
- }
-}
+ bv.bv_len = min_t(u32, bv.bv_len, PAGE_SIZE - offset);
-/*
- * Copy n bytes to dst from the brd starting at sector. Does not sleep.
- */
-static void copy_from_brd(void *dst, struct brd_device *brd,
- sector_t sector, size_t n)
-{
- struct page *page;
- void *src;
- unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
- size_t copy;
-
- copy = min_t(size_t, n, PAGE_SIZE - offset);
+ rcu_read_lock();
page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src + offset, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
-
- if (copy < n) {
- dst += copy;
- sector += copy >> SECTOR_SHIFT;
- copy = n - copy;
- page = brd_lookup_page(brd, sector);
- if (page) {
- src = kmap_atomic(page);
- memcpy(dst, src, copy);
- kunmap_atomic(src);
- } else
- memset(dst, 0, copy);
+ if (!page && op_is_write(opf)) {
+ page = brd_insert_page(brd, sector, opf);
+ if (IS_ERR(page))
+ goto out_error;
}
-}
-
-/*
- * Process a single bvec of a bio.
- */
-static int brd_do_bvec(struct brd_device *brd, struct page *page,
- unsigned int len, unsigned int off, blk_opf_t opf,
- sector_t sector)
-{
- void *mem;
- int err = 0;
+ kaddr = bvec_kmap_local(&bv);
if (op_is_write(opf)) {
- /*
- * Must use NOIO because we don't want to recurse back into the
- * block or filesystem layers from page reclaim.
- */
- gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
-
- err = copy_to_brd_setup(brd, sector, len, gfp);
- if (err)
- goto out;
- }
-
- mem = kmap_atomic(page);
- if (!op_is_write(opf)) {
- copy_from_brd(mem + off, brd, sector, len);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, kaddr, bv.bv_len);
} else {
- flush_dcache_page(page);
- copy_to_brd(brd, mem + off, sector, len);
+ if (page)
+ memcpy_from_page(kaddr, page, offset, bv.bv_len);
+ else
+ memset(kaddr, 0, bv.bv_len);
}
- kunmap_atomic(mem);
+ kunmap_local(kaddr);
+ rcu_read_unlock();
+
+ bio_advance_iter_single(bio, &bio->bi_iter, bv.bv_len);
+ return true;
+
+out_error:
+ rcu_read_unlock();
+ if (PTR_ERR(page) == -ENOMEM && (opf & REQ_NOWAIT))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ return false;
+}
-out:
- return err;
+static void brd_free_one_page(struct rcu_head *head)
+{
+ struct page *page = container_of(head, struct page, rcu_head);
+
+ __free_page(page);
}
static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
{
- sector_t aligned_sector = (sector + PAGE_SECTORS) & ~PAGE_SECTORS;
+ sector_t aligned_sector = round_up(sector, PAGE_SECTORS);
+ sector_t aligned_end = round_down(
+ sector + (size >> SECTOR_SHIFT), PAGE_SECTORS);
struct page *page;
- size -= (aligned_sector - sector) * SECTOR_SIZE;
+ if (aligned_end <= aligned_sector)
+ return;
+
xa_lock(&brd->brd_pages);
- while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) {
+ while (aligned_sector < aligned_end && aligned_sector < rd_size * 2) {
page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT);
if (page) {
- __free_page(page);
+ call_rcu(&page->rcu_head, brd_free_one_page);
brd->brd_nr_pages--;
}
aligned_sector += PAGE_SECTORS;
- size -= PAGE_SIZE;
}
xa_unlock(&brd->brd_pages);
}
@@ -244,36 +179,18 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size)
static void brd_submit_bio(struct bio *bio)
{
struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
- sector_t sector = bio->bi_iter.bi_sector;
- struct bio_vec bvec;
- struct bvec_iter iter;
if (unlikely(op_is_discard(bio->bi_opf))) {
- brd_do_discard(brd, sector, bio->bi_iter.bi_size);
+ brd_do_discard(brd, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size);
bio_endio(bio);
return;
}
- bio_for_each_segment(bvec, bio, iter) {
- unsigned int len = bvec.bv_len;
- int err;
-
- /* Don't support un-aligned buffer */
- WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
- (len & (SECTOR_SIZE - 1)));
-
- err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
- bio->bi_opf, sector);
- if (err) {
- if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
- bio_wouldblock_error(bio);
- return;
- }
- bio_io_error(bio);
+ do {
+ if (!brd_rw_bvec(brd, bio))
return;
- }
- sector += len >> SECTOR_SHIFT;
- }
+ } while (bio->bi_iter.bi_size);
bio_endio(bio);
}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 65b96c083b3c..d5cc7bd2875c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -725,7 +725,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
scmd = blk_mq_rq_to_pdu(rq);
if (cgc->buflen) {
- ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+ ret = blk_rq_map_kern(rq, cgc->buffer, cgc->buflen,
GFP_NOIO);
if (ret)
goto out;
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
index 2ee6e9bd4e28..2df8941a6b14 100644
--- a/drivers/block/rnbd/rnbd-srv.c
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -147,12 +147,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
bio = bio_alloc(file_bdev(sess_dev->bdev_file), 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
- if (bio_add_page(bio, virt_to_page(data), datalen,
- offset_in_page(data)) != datalen) {
- rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n");
- err = -EINVAL;
- goto bio_put;
- }
+ bio_add_virt_nofail(bio, data, datalen);
bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw));
if (bio_has_data(bio) &&
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index dc104c025cd5..6f51072776f1 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -50,6 +50,8 @@
/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
+#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
+#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -64,7 +66,10 @@
| UBLK_F_CMD_IOCTL_ENCODE \
| UBLK_F_USER_COPY \
| UBLK_F_ZONED \
- | UBLK_F_USER_RECOVERY_FAIL_IO)
+ | UBLK_F_USER_RECOVERY_FAIL_IO \
+ | UBLK_F_UPDATE_SIZE \
+ | UBLK_F_AUTO_BUF_REG \
+ | UBLK_F_QUIESCE)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -77,7 +82,11 @@
UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)
struct ublk_rq_data {
- struct kref ref;
+ refcount_t ref;
+
+ /* for auto-unregister buffer in case of UBLK_F_AUTO_BUF_REG */
+ u16 buf_index;
+ void *buf_ctx_handle;
};
struct ublk_uring_cmd_pdu {
@@ -99,6 +108,9 @@ struct ublk_uring_cmd_pdu {
* setup in ublk uring_cmd handler
*/
struct ublk_queue *ubq;
+
+ struct ublk_auto_buf_reg buf;
+
u16 tag;
};
@@ -131,6 +143,14 @@ struct ublk_uring_cmd_pdu {
*/
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+/*
+ * request buffer is registered automatically, so we have to unregister it
+ * before completing this request.
+ *
+ * io_uring will unregister the buffer automatically for us on exit.
+ */
+#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
+
/* atomic RW with ubq->cancel_lock */
#define UBLK_IO_FLAG_CANCELED 0x80000000
@@ -140,7 +160,12 @@ struct ublk_io {
unsigned int flags;
int res;
- struct io_uring_cmd *cmd;
+ union {
+ /* valid if UBLK_IO_FLAG_ACTIVE is set */
+ struct io_uring_cmd *cmd;
+ /* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
+ struct request *req;
+ };
};
struct ublk_queue {
@@ -198,13 +223,19 @@ struct ublk_params_header {
__u32 types;
};
+static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
const struct ublk_queue *ubq, int tag, size_t offset);
static inline unsigned int ublk_req_build_flags(struct request *req);
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag);
+
+static inline struct ublksrv_io_desc *
+ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
+{
+ return &ubq->io_cmd_buf[tag];
+}
+
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_ZONED;
@@ -356,8 +387,7 @@ static int ublk_report_zones(struct gendisk *disk, sector_t sector,
if (ret)
goto free_req;
- ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
- GFP_KERNEL);
+ ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
if (ret)
goto erase_desc;
@@ -477,7 +507,6 @@ static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
#endif
static inline void __ublk_complete_rq(struct request *req);
-static void ublk_complete_rq(struct kref *ref);
static dev_t ublk_chr_devt;
static const struct class ublk_chr_class = {
@@ -609,6 +638,11 @@ static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
}
+static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_AUTO_BUF_REG;
+}
+
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_USER_COPY;
@@ -616,7 +650,8 @@ static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
{
- return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq);
+ return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
+ !ublk_support_auto_buf_reg(ubq);
}
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
@@ -627,8 +662,13 @@ static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
*
* for zero copy, request buffer need to be registered to io_uring
* buffer table, so reference is needed
+ *
+ * For auto buffer register, ublk server still may issue
+ * UBLK_IO_COMMIT_AND_FETCH_REQ before one registered buffer is used up,
+ * so reference is required too.
*/
- return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq);
+ return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
+ ublk_support_auto_buf_reg(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -637,7 +677,7 @@ static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_init(&data->ref);
+ refcount_set(&data->ref, 1);
}
}
@@ -647,7 +687,7 @@ static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- return kref_get_unless_zero(&data->ref);
+ return refcount_inc_not_zero(&data->ref);
}
return true;
@@ -659,7 +699,8 @@ static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
if (ublk_need_req_ref(ubq)) {
struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
- kref_put(&data->ref, ublk_complete_rq);
+ if (refcount_dec_and_test(&data->ref))
+ __ublk_complete_rq(req);
} else {
__ublk_complete_rq(req);
}
@@ -695,12 +736,6 @@ static inline bool ublk_rq_has_data(const struct request *rq)
return bio_has_data(rq->bio);
}
-static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
- int tag)
-{
- return &ubq->io_cmd_buf[tag];
-}
-
static inline struct ublksrv_io_desc *
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
{
@@ -1117,18 +1152,12 @@ exit:
blk_mq_end_request(req, res);
}
-static void ublk_complete_rq(struct kref *ref)
+static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
+ int res, unsigned issue_flags)
{
- struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
- ref);
- struct request *req = blk_mq_rq_from_pdu(data);
+ /* read cmd first because req will overwrite it */
+ struct io_uring_cmd *cmd = io->cmd;
- __ublk_complete_rq(req);
-}
-
-static void ubq_complete_io_cmd(struct ublk_io *io, int res,
- unsigned issue_flags)
-{
/* mark this cmd owned by ublksrv */
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
@@ -1138,8 +1167,10 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res,
*/
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+ io->req = req;
+
/* tell ublksrv one io request is coming */
- io_uring_cmd_done(io->cmd, res, 0, issue_flags);
+ io_uring_cmd_done(cmd, res, 0, issue_flags);
}
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1154,16 +1185,91 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq,
blk_mq_end_request(rq, BLK_STS_IOERR);
}
+static void ublk_auto_buf_reg_fallback(struct request *req)
+{
+ const struct ublk_queue *ubq = req->mq_hctx->driver_data;
+ struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
+ refcount_set(&data->ref, 1);
+}
+
+static bool ublk_auto_buf_reg(struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(io->cmd);
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+ int ret;
+
+ ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
+ pdu->buf.index, issue_flags);
+ if (ret) {
+ if (pdu->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
+ ublk_auto_buf_reg_fallback(req);
+ return true;
+ }
+ blk_mq_end_request(req, BLK_STS_IOERR);
+ return false;
+ }
+ /* one extra reference is dropped by ublk_io_release */
+ refcount_set(&data->ref, 2);
+
+ data->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
+ /* store buffer index in request payload */
+ data->buf_index = pdu->buf.index;
+ io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
+ return true;
+}
+
+static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
+ struct request *req, struct ublk_io *io,
+ unsigned int issue_flags)
+{
+ if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
+ return ublk_auto_buf_reg(req, io, issue_flags);
+
+ ublk_init_req_ref(ubq, req);
+ return true;
+}
+
+static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
+ struct ublk_io *io)
+{
+ unsigned mapped_bytes = ublk_map_io(ubq, req, io);
+
+ /* partially mapped, update io descriptor */
+ if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+ /*
+ * Nothing mapped, retry until we succeed.
+ *
+ * We may never succeed in mapping any bytes here because
+ * of OOM. TODO: reserve one buffer with single page pinned
+ * for providing forward progress guarantee.
+ */
+ if (unlikely(!mapped_bytes)) {
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q,
+ UBLK_REQUEUE_DELAY_MS);
+ return false;
+ }
+
+ ublk_get_iod(ubq, req->tag)->nr_sectors =
+ mapped_bytes >> 9;
+ }
+
+ return true;
+}
+
static void ublk_dispatch_req(struct ublk_queue *ubq,
struct request *req,
unsigned int issue_flags)
{
int tag = req->tag;
struct ublk_io *io = &ubq->ios[tag];
- unsigned int mapped_bytes;
- pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+ pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
ublk_get_iod(ubq, req->tag)->addr);
/*
@@ -1183,54 +1289,22 @@ static void ublk_dispatch_req(struct ublk_queue *ubq,
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
/*
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
- * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+ * so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
* and notify it.
*/
- if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
- io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
- pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
- __func__, io->cmd->cmd_op, ubq->q_id,
- req->tag, io->flags);
- ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
- return;
- }
- /*
- * We have handled UBLK_IO_NEED_GET_DATA command,
- * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
- * do the copy work.
- */
- io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
- /* update iod->addr because ublksrv may have passed a new io buffer */
- ublk_get_iod(ubq, req->tag)->addr = io->addr;
- pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
- __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
- ublk_get_iod(ubq, req->tag)->addr);
+ io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+ pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
+ __func__, ubq->q_id, req->tag, io->flags);
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
+ issue_flags);
+ return;
}
- mapped_bytes = ublk_map_io(ubq, req, io);
-
- /* partially mapped, update io descriptor */
- if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
- /*
- * Nothing mapped, retry until we succeed.
- *
- * We may never succeed in mapping any bytes here because
- * of OOM. TODO: reserve one buffer with single page pinned
- * for providing forward progress guarantee.
- */
- if (unlikely(!mapped_bytes)) {
- blk_mq_requeue_request(req, false);
- blk_mq_delay_kick_requeue_list(req->q,
- UBLK_REQUEUE_DELAY_MS);
- return;
- }
-
- ublk_get_iod(ubq, req->tag)->nr_sectors =
- mapped_bytes >> 9;
- }
+ if (!ublk_start_io(ubq, req, io))
+ return;
- ublk_init_req_ref(ubq, req);
- ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
+ if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
+ ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
}
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
@@ -1590,30 +1664,6 @@ static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
-static void ublk_commit_completion(struct ublk_device *ub,
- const struct ublksrv_io_cmd *ub_cmd)
-{
- u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
- struct ublk_queue *ubq = ublk_get_queue(ub, qid);
- struct ublk_io *io = &ubq->ios[tag];
- struct request *req;
-
- /* now this cmd slot is owned by nbd driver */
- io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
- io->res = ub_cmd->result;
-
- /* find the io request and complete */
- req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
- if (WARN_ON_ONCE(unlikely(!req)))
- return;
-
- if (req_op(req) == REQ_OP_ZONE_APPEND)
- req->__sector = ub_cmd->zone_append_lba;
-
- if (likely(!blk_should_fake_timeout(req->q)))
- ublk_put_req_ref(ubq, req);
-}
-
static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
struct request *req)
{
@@ -1642,17 +1692,8 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
- if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
- struct request *rq;
-
- /*
- * Either we fail the request or ublk_rq_task_work_cb
- * will do it
- */
- rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
- if (rq && blk_mq_request_started(rq))
- __ublk_fail_req(ubq, io, rq);
- }
+ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+ __ublk_fail_req(ubq, io, io->req);
}
}
@@ -1940,6 +1981,20 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
io_uring_cmd_mark_cancelable(cmd, issue_flags);
}
+static inline int ublk_set_auto_buf_reg(struct io_uring_cmd *cmd)
+{
+ struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+ pdu->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
+
+ if (pdu->buf.reserved0 || pdu->buf.reserved1)
+ return -EINVAL;
+
+ if (pdu->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
+ return -EINVAL;
+ return 0;
+}
+
static void ublk_io_release(void *priv)
{
struct request *rq = priv;
@@ -1953,16 +2008,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
unsigned int index, unsigned int issue_flags)
{
struct ublk_device *ub = cmd->file->private_data;
- const struct ublk_io *io = &ubq->ios[tag];
struct request *req;
int ret;
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
req = __ublk_check_and_get_req(ub, ubq, tag, 0);
if (!req)
return -EINVAL;
@@ -1978,17 +2029,12 @@ static int ublk_register_io_buf(struct io_uring_cmd *cmd,
}
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
- const struct ublk_queue *ubq, unsigned int tag,
+ const struct ublk_queue *ubq,
unsigned int index, unsigned int issue_flags)
{
- const struct ublk_io *io = &ubq->ios[tag];
-
if (!ublk_support_zero_copy(ubq))
return -EINVAL;
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- return -EINVAL;
-
return io_buffer_unregister_bvec(cmd, index, issue_flags);
}
@@ -2031,6 +2077,12 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_queue *ubq,
goto out;
}
+ if (ublk_support_auto_buf_reg(ubq)) {
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ goto out;
+ }
+
ublk_fill_io_cmd(io, cmd, buf_addr);
ublk_mark_io_ready(ub, ubq);
out:
@@ -2038,6 +2090,90 @@ out:
return ret;
}
+static int ublk_commit_and_fetch(const struct ublk_queue *ubq,
+ struct ublk_io *io, struct io_uring_cmd *cmd,
+ const struct ublksrv_io_cmd *ub_cmd,
+ unsigned int issue_flags)
+{
+ struct request *req = io->req;
+
+ if (ublk_need_map_io(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ return -EINVAL;
+ } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+ /*
+ * User copy requires addr to be unset when command is
+ * not zone append
+ */
+ return -EINVAL;
+ }
+
+ if (ublk_support_auto_buf_reg(ubq)) {
+ int ret;
+
+ /*
+ * `UBLK_F_AUTO_BUF_REG` only works iff `UBLK_IO_FETCH_REQ`
+ * and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
+ * `io_ring_ctx`.
+ *
+ * If this uring_cmd's io_ring_ctx isn't the same as the
+ * one used for registering the buffer, it is the ublk server's
+ * responsibility to unregister the buffer, otherwise
+ * this ublk request gets stuck.
+ */
+ if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ if (data->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
+ io_buffer_unregister_bvec(cmd, data->buf_index,
+ issue_flags);
+ io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
+ }
+
+ ret = ublk_set_auto_buf_reg(cmd);
+ if (ret)
+ return ret;
+ }
+
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
+
+ /* now this cmd slot is owned by ublk driver */
+ io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+ io->res = ub_cmd->result;
+
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ req->__sector = ub_cmd->zone_append_lba;
+
+ if (likely(!blk_should_fake_timeout(req->q)))
+ ublk_put_req_ref(ubq, req);
+
+ return 0;
+}
+
+static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io)
+{
+ struct request *req = io->req;
+
+ /*
+ * We have handled UBLK_IO_NEED_GET_DATA command,
+ * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+ * do the copy work.
+ */
+ io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+ /* update iod->addr because ublksrv may have passed a new io buffer */
+ ublk_get_iod(ubq, req->tag)->addr = io->addr;
+ pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
+ __func__, ubq->q_id, req->tag, io->flags,
+ ublk_get_iod(ubq, req->tag)->addr);
+
+ return ublk_start_io(ubq, req, io);
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -2048,7 +2184,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
u32 cmd_op = cmd->cmd_op;
unsigned tag = ub_cmd->tag;
int ret = -EINVAL;
- struct request *req;
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
__func__, cmd->cmd_op, ub_cmd->q_id, tag,
@@ -2058,9 +2193,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
ubq = ublk_get_queue(ub, ub_cmd->q_id);
- if (!ubq || ub_cmd->q_id != ubq->q_id)
- goto out;
-
if (ubq->ubq_daemon && ubq->ubq_daemon != current)
goto out;
@@ -2075,6 +2207,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
}
+ /* only UBLK_IO_FETCH_REQ is allowed if io is not OWNED_BY_SRV */
+ if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) &&
+ _IOC_NR(cmd_op) != UBLK_IO_FETCH_REQ)
+ goto out;
+
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
@@ -2092,45 +2229,23 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_IO_REGISTER_IO_BUF:
return ublk_register_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
case UBLK_IO_UNREGISTER_IO_BUF:
- return ublk_unregister_io_buf(cmd, ubq, tag, ub_cmd->addr, issue_flags);
+ return ublk_unregister_io_buf(cmd, ubq, ub_cmd->addr, issue_flags);
case UBLK_IO_FETCH_REQ:
ret = ublk_fetch(cmd, ubq, io, ub_cmd->addr);
if (ret)
goto out;
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
-
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
-
- if (ublk_need_map_io(ubq)) {
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if
- * NEED GET DATA is not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
- req_op(req) == REQ_OP_READ))
- goto out;
- } else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
- /*
- * User copy requires addr to be unset when command is
- * not zone append
- */
- ret = -EINVAL;
+ ret = ublk_commit_and_fetch(ubq, io, cmd, ub_cmd, issue_flags);
+ if (ret)
goto out;
- }
-
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
- if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
- goto out;
- ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
- req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- ublk_dispatch_req(ubq, req, issue_flags);
- return -EIOCBQUEUED;
+ io->addr = ub_cmd->addr;
+ if (!ublk_get_data(ubq, io))
+ return -EIOCBQUEUED;
+
+ return UBLK_IO_RES_OK;
default:
goto out;
}
@@ -2728,6 +2843,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
return -EINVAL;
}
+ if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
+ pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
+ return -EINVAL;
+ }
+
/*
* unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE still may hang error handling, so can't
@@ -2744,8 +2864,11 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
* For USER_COPY, we depends on userspace to fill request
* buffer by pwrite() to ublk char device, which can't be
* used for unprivileged device
+ *
+ * Same with zero copy or auto buffer register.
*/
- if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
return -EINVAL;
}
@@ -2803,7 +2926,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
UBLK_F_URING_CMD_COMP_IN_TASK;
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
- if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY))
+ if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
+ UBLK_F_AUTO_BUF_REG))
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
/*
@@ -3106,6 +3230,127 @@ static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
return 0;
}
+/*
+ * Handle UBLK_CMD_UPDATE_SIZE: store the new device size taken from
+ * header->data[0] in the basic parameters and propagate it to the gendisk
+ * with set_capacity_and_notify().
+ *
+ * The update is skipped when no gendisk is attached to @ub, as
+ * set_capacity_and_notify() needs a valid disk to operate on.
+ */
+static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
+{
+	struct ublk_param_basic *p = &ub->params.basic;
+	u64 new_size = header->data[0];
+
+	mutex_lock(&ub->mutex);
+	/*
+	 * ub_disk may not be set (ublk_ctrl_quiesce_dev() also has to cope
+	 * with a missing disk): never hand a NULL disk to the block layer.
+	 */
+	if (ub->ub_disk) {
+		p->dev_sectors = new_size;
+		set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
+	}
+	mutex_unlock(&ub->mutex);
+}
+
+/* Iteration context passed to ublk_count_busy_req() through its data pointer */
+struct count_busy {
+ const struct ublk_queue *ubq; /* only requests of this queue are counted */
+ unsigned int nr_busy; /* number of matching requests seen so far */
+};
+
+/*
+ * Tag set iterator callback: bump the counter in @data (a struct count_busy)
+ * for every request that has not been started and whose hardware queue
+ * belongs to the queue being inspected. Always returns true.
+ */
+static bool ublk_count_busy_req(struct request *rq, void *data)
+{
+	struct count_busy *ctx = data;
+
+	if (blk_mq_request_started(rq))
+		return true;
+	if (rq->mq_hctx->driver_data != ctx->ubq)
+		return true;
+
+	ctx->nr_busy += 1;
+	return true;
+}
+
+/*
+ * Tell whether @ubq still has at least one idle IO slot, i.e. whether fewer
+ * requests than the queue depth were counted by ublk_count_busy_req().
+ * uring_cmd is guaranteed to be active if the associated request is idle.
+ */
+static bool ubq_has_idle_io(const struct ublk_queue *ubq)
+{
+	struct count_busy ctx = { .ubq = ubq };
+
+	blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &ctx);
+
+	return ctx.nr_busy < ubq->q_depth;
+}
+
+/*
+ * Wait until each hw queue has at least one idle IO.
+ *
+ * Polls every UBLK_REQUEUE_DELAY_MS milliseconds. Returns 0 once every queue
+ * has an idle IO, -EINTR if a signal is pending for the caller and -EBUSY if
+ * @timeout_ms elapsed first.
+ */
+static int ublk_wait_for_idle_io(struct ublk_device *ub,
+				 unsigned int timeout_ms)
+{
+	unsigned int waited_ms = 0;
+
+	while (waited_ms < timeout_ms && !signal_pending(current)) {
+		unsigned int nr_ready = 0;
+		int q;
+
+		for (q = 0; q < ub->dev_info.nr_hw_queues; q++) {
+			if (ubq_has_idle_io(ublk_get_queue(ub, q)))
+				nr_ready++;
+		}
+
+		/*
+		 * Each queue needs at least one active command for
+		 * notifying ublk server
+		 */
+		if (nr_ready == ub->dev_info.nr_hw_queues)
+			break;
+
+		msleep(UBLK_REQUEUE_DELAY_MS);
+		waited_ms += UBLK_REQUEUE_DELAY_MS;
+	}
+
+	if (signal_pending(current))
+		return -EINTR;
+	if (waited_ms >= timeout_ms)
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * Handle UBLK_CMD_QUIESCE_DEV: mark all queues of a live device as canceling,
+ * wait until every queue has an idle IO and then cancel the pending
+ * uring_cmds.
+ *
+ * header->data[0] is the wait timeout in milliseconds, zero meaning "wait
+ * forever". Returns -EOPNOTSUPP if the device was not created with
+ * UBLK_F_QUIESCE, -ENODEV if there is no disk or the device is dead, 0 if the
+ * device is not live (nothing to do) or was quiesced, or the error returned
+ * by ublk_wait_for_idle_io().
+ */
+static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
+				 const struct ublksrv_ctrl_cmd *header)
+{
+	/* zero means wait forever */
+	u64 timeout_ms = header->data[0];
+	struct gendisk *disk;
+	int i, ret = -ENODEV;
+
+	if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
+		return -EOPNOTSUPP;
+
+	mutex_lock(&ub->mutex);
+	disk = ublk_get_disk(ub);
+	if (!disk)
+		goto unlock;
+	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+		goto put_disk;
+
+	ret = 0;
+	/* already in expected state */
+	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+		goto put_disk;
+
+	/* Mark all queues as canceling */
+	blk_mq_quiesce_queue(disk->queue);
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		ubq->canceling = true;
+	}
+	blk_mq_unquiesce_queue(disk->queue);
+
+	/*
+	 * ublk_wait_for_idle_io() takes an unsigned int: clamp the user
+	 * provided 64-bit value instead of letting it be silently truncated
+	 * (e.g. 1ULL << 32 would otherwise become an immediate timeout).
+	 */
+	if (!timeout_ms || timeout_ms > UINT_MAX)
+		timeout_ms = UINT_MAX;
+	ret = ublk_wait_for_idle_io(ub, timeout_ms);
+
+put_disk:
+	ublk_put_disk(disk);
+unlock:
+	mutex_unlock(&ub->mutex);
+
+	/* Cancel pending uring_cmd */
+	if (!ret)
+		ublk_cancel_dev(ub);
+	return ret;
+}
+
/*
* All control commands are sent via /dev/ublk-control, so we have to check
* the destination device's permission
@@ -3191,6 +3436,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_SET_PARAMS:
case UBLK_CMD_START_USER_RECOVERY:
case UBLK_CMD_END_USER_RECOVERY:
+ case UBLK_CMD_UPDATE_SIZE:
+ case UBLK_CMD_QUIESCE_DEV:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -3282,6 +3529,13 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(ub, header);
break;
+ case UBLK_CMD_UPDATE_SIZE:
+ ublk_ctrl_set_size(ub, header);
+ ret = 0;
+ break;
+ case UBLK_CMD_QUIESCE_DEV:
+ ret = ublk_ctrl_quiesce_dev(ub, header);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -3315,6 +3569,7 @@ static int __init ublk_init(void)
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+ BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
init_waitqueue_head(&ublk_idr_wq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 7cffea01d868..30bca8cb7106 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -571,7 +571,7 @@ static int virtblk_submit_zone_report(struct virtio_blk *vblk,
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_ZONE_REPORT);
vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, sector);
- err = blk_rq_map_kern(q, req, report_buf, report_len, GFP_KERNEL);
+ err = blk_rq_map_kern(req, report_buf, report_len, GFP_KERNEL);
if (err)
goto out;
@@ -817,7 +817,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
vbr->out_hdr.sector = 0;
- err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+ err = blk_rq_map_kern(req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c
new file mode 100644
index 000000000000..553b1a713ab9
--- /dev/null
+++ b/drivers/block/zloop.c
@@ -0,0 +1,1385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2025, Christoph Hellwig.
+ * Copyright (c) 2025, Western Digital Corporation or its affiliates.
+ *
+ * Zoned Loop Device driver - exports a zoned block device using one file per
+ * zone as backing storage.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/blkzoned.h>
+#include <linux/pagemap.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/mutex.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+
+/*
+ * Options for adding (and removing) a device.
+ *
+ * Each option is a distinct bit: the option parser ORs the token of every
+ * option it sees into zloop_options.mask. ZLOOP_OPT_ERR (0) terminates the
+ * token table.
+ */
+enum {
+ ZLOOP_OPT_ERR = 0,
+ ZLOOP_OPT_ID = (1 << 0),
+ ZLOOP_OPT_CAPACITY = (1 << 1),
+ ZLOOP_OPT_ZONE_SIZE = (1 << 2),
+ ZLOOP_OPT_ZONE_CAPACITY = (1 << 3),
+ ZLOOP_OPT_NR_CONV_ZONES = (1 << 4),
+ ZLOOP_OPT_BASE_DIR = (1 << 5),
+ ZLOOP_OPT_NR_QUEUES = (1 << 6),
+ ZLOOP_OPT_QUEUE_DEPTH = (1 << 7),
+ ZLOOP_OPT_BUFFERED_IO = (1 << 8),
+};
+
+/*
+ * Option name patterns handed to match_token() by zloop_parse_options().
+ * Sizes and capacities are given in MB on the command line.
+ */
+static const match_table_t zloop_opt_tokens = {
+ { ZLOOP_OPT_ID, "id=%d" },
+ { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" },
+ { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" },
+ { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" },
+ { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" },
+ { ZLOOP_OPT_BASE_DIR, "base_dir=%s" },
+ { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" },
+ { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" },
+ { ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
+ { ZLOOP_OPT_ERR, NULL }
+};
+
+/* Default values for the "add" operation. */
+/* A negative ID lets zloop_ctl_add() allocate any free ID. */
+#define ZLOOP_DEF_ID -1
+/* 256 MB, expressed in sectors. */
+#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
+#define ZLOOP_DEF_NR_ZONES 64
+#define ZLOOP_DEF_NR_CONV_ZONES 8
+#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
+#define ZLOOP_DEF_NR_QUEUES 1
+#define ZLOOP_DEF_QUEUE_DEPTH 128
+#define ZLOOP_DEF_BUFFERED_IO false
+
+/* Arbitrary limit on the zone size (16GB). */
+#define ZLOOP_MAX_ZONE_SIZE_MB 16384
+
+/* Parsed options of an "add" or "remove" command. */
+struct zloop_options {
+ unsigned int mask; /* OR of the ZLOOP_OPT_* tokens that were specified */
+ int id; /* device ID, negative to let the driver pick one */
+ sector_t capacity; /* device capacity, in sectors */
+ sector_t zone_size; /* zone size, in sectors */
+ sector_t zone_capacity; /* zone capacity, in sectors; 0 means zone_size */
+ unsigned int nr_conv_zones; /* number of conventional zones */
+ char *base_dir; /* base directory of the zone files, NULL for the default */
+ unsigned int nr_queues; /* number of hardware queues */
+ unsigned int queue_depth; /* depth of each hardware queue */
+ bool buffered_io; /* use buffered instead of direct IO on zone files */
+};
+
+/*
+ * Device states.
+ *
+ * A device is Zlo_creating while zloop_ctl_add() sets it up, becomes Zlo_live
+ * once its disk has been added and is switched to Zlo_deleting by
+ * zloop_ctl_remove().
+ */
+enum {
+ Zlo_creating = 0,
+ Zlo_live,
+ Zlo_deleting,
+};
+
+/* Bit numbers for zloop_zone.flags (used with set_bit()/test_bit()). */
+enum zloop_zone_flags {
+ ZLOOP_ZONE_CONV = 0, /* conventional zone */
+ ZLOOP_ZONE_SEQ_ERROR, /* sequential zone state must be re-read from its file */
+};
+
+/* State of a single zone and of the file backing it. */
+struct zloop_zone {
+ struct file *file; /* backing file of the zone */
+
+ unsigned long flags; /* ZLOOP_ZONE_* bits */
+ struct mutex lock; /* held while cond and wp are checked or updated */
+ enum blk_zone_cond cond; /* current zone condition */
+ sector_t start; /* first sector of the zone */
+ sector_t wp; /* write pointer position, as a device sector */
+
+ gfp_t old_gfp_mask; /* mask restored on the file mapping by zloop_free_disk() */
+};
+
+/* A zoned loop device; the zone array is allocated together with it. */
+struct zloop_device {
+ unsigned int id; /* ID allocated from zloop_index_idr */
+ unsigned int state; /* Zlo_creating, Zlo_live or Zlo_deleting */
+
+ struct blk_mq_tag_set tag_set;
+ struct gendisk *disk;
+
+ struct workqueue_struct *workqueue; /* executes the commands queued by zloop_queue_rq() */
+ bool buffered_io; /* when false, zone file IOs use IOCB_DIRECT */
+
+ const char *base_dir; /* directory containing the per-device directory */
+ struct file *data_dir; /* <base_dir>/<id>, holding the zone files */
+
+ unsigned int zone_shift; /* ilog2() of zone_size */
+ sector_t zone_size; /* in sectors */
+ sector_t zone_capacity; /* in sectors */
+ unsigned int nr_zones;
+ unsigned int nr_conv_zones; /* zones [0, nr_conv_zones) are conventional */
+ unsigned int block_size; /* in bytes, see zloop_get_block_size() */
+
+ struct zloop_zone zones[] __counted_by(nr_zones);
+};
+
+/* Per-request driver data (the blk-mq PDU, see tag_set.cmd_size). */
+struct zloop_cmd {
+ struct work_struct work; /* runs zloop_cmd_workfn() */
+ atomic_t ref; /* request is completed when this drops to zero */
+ sector_t sector; /* start sector (the write pointer for zone append) */
+ sector_t nr_sectors; /* request size in sectors */
+ long ret; /* result: negative errno, or byte count for reads/writes */
+ struct kiocb iocb; /* IO control block used on the zone file */
+ struct bio_vec *bvec; /* bvec array allocated for multi-bio requests */
+};
+
+/* Maps device IDs to devices; accessed with zloop_ctl_mutex held. */
+static DEFINE_IDR(zloop_index_idr);
+/* Serializes ID allocation/removal and device state changes. */
+static DEFINE_MUTEX(zloop_ctl_mutex);
+
+/* Return the number of the zone containing the first sector of @rq. */
+static unsigned int rq_zone_no(struct request *rq)
+{
+	struct zloop_device *dev = rq->q->queuedata;
+	sector_t pos = blk_rq_pos(rq);
+
+	return pos >> dev->zone_shift;
+}
+
+/*
+ * Recompute the condition and write pointer of sequential zone @zone_no from
+ * the size of its backing file. Must be called with the zone lock held.
+ *
+ * Returns 0 on success, the vfs_getattr() error (after flagging the zone with
+ * ZLOOP_ZONE_SEQ_ERROR) if the file size cannot be read, or -EINVAL if the
+ * file is larger than the zone capacity or not block size aligned.
+ */
+static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	sector_t file_sectors;
+	unsigned int blk_mask;
+	struct kstat st;
+	int err;
+
+	lockdep_assert_held(&zone->lock);
+
+	err = vfs_getattr(&zone->file->f_path, &st, STATX_SIZE, 0);
+	if (err < 0) {
+		pr_err("Failed to get zone %u file stat (err=%d)\n",
+		       zone_no, err);
+		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+		return err;
+	}
+
+	file_sectors = st.size >> SECTOR_SHIFT;
+	if (file_sectors > zlo->zone_capacity) {
+		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
+		       zone_no, file_sectors, zlo->zone_capacity);
+		return -EINVAL;
+	}
+
+	blk_mask = (zlo->block_size >> SECTOR_SHIFT) - 1;
+	if (file_sectors & blk_mask) {
+		pr_err("Zone %u file size not aligned to block size %u\n",
+		       zone_no, zlo->block_size);
+		return -EINVAL;
+	}
+
+	/* The file size is the amount of data written to the zone. */
+	if (!file_sectors) {
+		zone->cond = BLK_ZONE_COND_EMPTY;
+		zone->wp = zone->start;
+		return 0;
+	}
+
+	if (file_sectors == zlo->zone_capacity) {
+		zone->cond = BLK_ZONE_COND_FULL;
+		zone->wp = zone->start + zlo->zone_size;
+		return 0;
+	}
+
+	zone->cond = BLK_ZONE_COND_CLOSED;
+	zone->wp = zone->start + file_sectors;
+
+	return 0;
+}
+
+/*
+ * Explicitly open zone @zone_no.
+ *
+ * Empty, closed and implicitly open zones become explicitly open; an already
+ * explicitly open zone is left untouched. Returns -EIO for conventional zones
+ * and for zones in any other condition, or the error of
+ * zloop_update_seq_zone() if the zone state had to be refreshed and that
+ * failed.
+ */
+static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	int ret = 0;
+
+	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+		return -EIO;
+
+	mutex_lock(&zone->lock);
+
+	/* Refresh the zone state first if a previous write failed. */
+	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags))
+		ret = zloop_update_seq_zone(zlo, zone_no);
+
+	if (!ret) {
+		if (zone->cond == BLK_ZONE_COND_EMPTY ||
+		    zone->cond == BLK_ZONE_COND_CLOSED ||
+		    zone->cond == BLK_ZONE_COND_IMP_OPEN)
+			zone->cond = BLK_ZONE_COND_EXP_OPEN;
+		else if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
+			ret = -EIO;
+	}
+
+	mutex_unlock(&zone->lock);
+
+	return ret;
+}
+
+/*
+ * Close zone @zone_no.
+ *
+ * An implicitly or explicitly open zone becomes empty if nothing was written
+ * to it and closed otherwise; an already closed zone is left untouched.
+ * Returns -EIO for conventional zones and for zones in any other condition,
+ * or the error of zloop_update_seq_zone() if the zone state had to be
+ * refreshed and that failed.
+ */
+static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	int ret = 0;
+
+	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+		return -EIO;
+
+	mutex_lock(&zone->lock);
+
+	/* Refresh the zone state first if a previous write failed. */
+	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags))
+		ret = zloop_update_seq_zone(zlo, zone_no);
+
+	if (!ret) {
+		if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+		    zone->cond == BLK_ZONE_COND_EXP_OPEN) {
+			if (zone->wp == zone->start)
+				zone->cond = BLK_ZONE_COND_EMPTY;
+			else
+				zone->cond = BLK_ZONE_COND_CLOSED;
+		} else if (zone->cond != BLK_ZONE_COND_CLOSED) {
+			ret = -EIO;
+		}
+	}
+
+	mutex_unlock(&zone->lock);
+
+	return ret;
+}
+
+/*
+ * Reset zone @zone_no by truncating its backing file to zero.
+ *
+ * Nothing is done for a zone that is already empty and has no pending error.
+ * Returns -EIO for conventional zones or if the truncation fails, in which
+ * case the zone is flagged with ZLOOP_ZONE_SEQ_ERROR.
+ */
+static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	int ret = 0;
+
+	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+		return -EIO;
+
+	mutex_lock(&zone->lock);
+
+	if (test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) ||
+	    zone->cond != BLK_ZONE_COND_EMPTY) {
+		if (vfs_truncate(&zone->file->f_path, 0)) {
+			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+			ret = -EIO;
+		} else {
+			zone->cond = BLK_ZONE_COND_EMPTY;
+			zone->wp = zone->start;
+			clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+		}
+	}
+
+	mutex_unlock(&zone->lock);
+
+	return ret;
+}
+
+/*
+ * Reset every sequential zone of the device, stopping at and returning the
+ * first error. Conventional zones are skipped.
+ */
+static int zloop_reset_all_zones(struct zloop_device *zlo)
+{
+	unsigned int zone_no = zlo->nr_conv_zones;
+
+	while (zone_no < zlo->nr_zones) {
+		int err = zloop_reset_zone(zlo, zone_no);
+
+		if (err)
+			return err;
+		zone_no++;
+	}
+
+	return 0;
+}
+
+/*
+ * Finish zone @zone_no by extending its backing file to the zone size and
+ * marking the zone full.
+ *
+ * Nothing is done for a zone that is already full and has no pending error.
+ * Returns -EIO for conventional zones or if the truncation fails, in which
+ * case the zone is flagged with ZLOOP_ZONE_SEQ_ERROR.
+ */
+static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	int ret = 0;
+
+	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
+		return -EIO;
+
+	mutex_lock(&zone->lock);
+
+	if (test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) ||
+	    zone->cond != BLK_ZONE_COND_FULL) {
+		if (vfs_truncate(&zone->file->f_path,
+				 zlo->zone_size << SECTOR_SHIFT)) {
+			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+			ret = -EIO;
+		} else {
+			zone->cond = BLK_ZONE_COND_FULL;
+			zone->wp = zone->start + zlo->zone_size;
+			clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+		}
+	}
+
+	mutex_unlock(&zone->lock);
+
+	return ret;
+}
+
+/*
+ * Drop a reference on @cmd. When the last reference goes away, free the bvec
+ * array and complete the request, unless the block layer asks for a fake
+ * timeout on this queue.
+ */
+static void zloop_put_cmd(struct zloop_cmd *cmd)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+	if (atomic_dec_and_test(&cmd->ref)) {
+		kfree(cmd->bvec);
+		cmd->bvec = NULL;
+
+		if (unlikely(blk_should_fake_timeout(rq->q)))
+			return;
+		blk_mq_complete_request(rq);
+	}
+}
+
+/*
+ * IO completion callback (installed as ki_complete by zloop_rw()): record the
+ * result in the command embedding @iocb and drop one reference on it.
+ */
+static void zloop_rw_complete(struct kiocb *iocb, long ret)
+{
+	struct zloop_cmd *owner = container_of(iocb, struct zloop_cmd, iocb);
+
+	owner->ret = ret;
+	zloop_put_cmd(owner);
+}
+
+/*
+ * Execute a read, write or zone append request on the backing file of the
+ * target zone.
+ *
+ * The command starts with two references: one is dropped at the end of this
+ * function, the other by zloop_rw_complete(), which is installed as the
+ * ki_complete callback and is called directly from here when the
+ * read_iter/write_iter call did not return -EIOCBQUEUED (including all early
+ * error paths). The request is completed when the last reference is dropped.
+ *
+ * For writes to sequential zones, the zone lock is held from the write
+ * pointer check until the IO has been submitted.
+ */
+static void zloop_rw(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = rq_zone_no(rq);
+ sector_t sector = blk_rq_pos(rq);
+ sector_t nr_sectors = blk_rq_sectors(rq);
+ bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
+ bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
+ int rw = is_write ? ITER_SOURCE : ITER_DEST;
+ struct req_iterator rq_iter;
+ struct zloop_zone *zone;
+ struct iov_iter iter;
+ struct bio_vec tmp;
+ sector_t zone_end;
+ int nr_bvec = 0;
+ int ret;
+
+ atomic_set(&cmd->ref, 2);
+ cmd->sector = sector;
+ cmd->nr_sectors = nr_sectors;
+ cmd->ret = 0;
+
+ /* We should never get an I/O beyond the device capacity. */
+ if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
+ ret = -EIO;
+ goto out;
+ }
+ zone = &zlo->zones[zone_no];
+ zone_end = zone->start + zlo->zone_capacity;
+
+ /*
+ * The block layer should never send requests that are not fully
+ * contained within the zone.
+ */
+ if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ /* A previous write failed: re-read the zone state from the file size. */
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ mutex_lock(&zone->lock);
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ mutex_unlock(&zone->lock);
+ if (ret)
+ goto out;
+ }
+
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
+ mutex_lock(&zone->lock);
+
+ /* Zone append writes go to the current write pointer. */
+ if (is_append) {
+ sector = zone->wp;
+ cmd->sector = sector;
+ }
+
+ /*
+ * Write operations must be aligned to the write pointer and
+ * fully contained within the zone capacity.
+ */
+ if (sector != zone->wp || zone->wp + nr_sectors > zone_end) {
+ pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
+ zone_no, sector, zone->wp);
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /* Implicitly open the target zone. */
+ if (zone->cond == BLK_ZONE_COND_CLOSED ||
+ zone->cond == BLK_ZONE_COND_EMPTY)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ /*
+ * Advance the write pointer of sequential zones. If the write
+ * fails, the wp position will be corrected when the next I/O
+ * completes.
+ */
+ zone->wp += nr_sectors;
+ if (zone->wp == zone_end)
+ zone->cond = BLK_ZONE_COND_FULL;
+ }
+
+ /* Count the bvecs of the request to size the iterator. */
+ rq_for_each_bvec(tmp, rq, rq_iter)
+ nr_bvec++;
+
+ if (rq->bio != rq->biotail) {
+ struct bio_vec *bvec;
+
+ /* Several bios: gather their bvecs in a single array. */
+ cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO);
+ if (!cmd->bvec) {
+ ret = -EIO;
+ goto unlock;
+ }
+
+ /*
+ * The bios of the request may be started from the middle of
+ * the 'bvec' because of bio splitting, so we can't directly
+ * copy bio->bi_io_vec to new bvec. The rq_for_each_bvec
+ * API will take care of all details for us.
+ */
+ bvec = cmd->bvec;
+ rq_for_each_bvec(tmp, rq, rq_iter) {
+ *bvec = tmp;
+ bvec++;
+ }
+ iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+ } else {
+ /*
+ * Same here, this bio may be started from the middle of the
+ * 'bvec' because of bio splitting, so offset from the bvec
+ * must be passed to iov iterator
+ */
+ iov_iter_bvec(&iter, rw,
+ __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+ nr_bvec, blk_rq_bytes(rq));
+ iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
+ }
+
+ /* The file offset is the offset of the IO within its zone. */
+ cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
+ cmd->iocb.ki_filp = zone->file;
+ cmd->iocb.ki_complete = zloop_rw_complete;
+ if (!zlo->buffered_io)
+ cmd->iocb.ki_flags = IOCB_DIRECT;
+ cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+ if (rw == ITER_SOURCE)
+ ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
+unlock:
+ /* Same condition as the one under which the zone lock was taken. */
+ if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
+ mutex_unlock(&zone->lock);
+out:
+ /* Not queued: the completion callback will not be called for us. */
+ if (ret != -EIOCBQUEUED)
+ zloop_rw_complete(&cmd->iocb, ret);
+ zloop_put_cmd(cmd);
+}
+
+/*
+ * Dispatch a command according to the operation of its request.
+ *
+ * Reads, writes and zone appends are handed to zloop_rw(), which takes care
+ * of completing the request. All other operations are executed synchronously:
+ * their result is stored in cmd->ret and the request is completed here.
+ */
+static void zloop_handle_cmd(struct zloop_cmd *cmd)
+{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+ struct zloop_device *zlo = rq->q->queuedata;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ /*
+ * zloop_rw() always executes asynchronously or completes
+ * directly.
+ */
+ zloop_rw(cmd);
+ return;
+ case REQ_OP_FLUSH:
+ /*
+ * Sync the entire FS containing the zone files instead of
+ * walking all files
+ */
+ cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb);
+ break;
+ case REQ_OP_ZONE_RESET:
+ cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_RESET_ALL:
+ cmd->ret = zloop_reset_all_zones(zlo);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_OPEN:
+ cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ pr_err("Unsupported operation %d\n", req_op(rq));
+ cmd->ret = -EOPNOTSUPP;
+ break;
+ }
+
+ blk_mq_complete_request(rq);
+}
+
+/*
+ * Work function of a command: run it with PF_LOCAL_THROTTLE and
+ * PF_MEMALLOC_NOIO set on the worker, then restore the worker's task flags.
+ */
+static void zloop_cmd_workfn(struct work_struct *work)
+{
+	int saved_flags = current->flags;
+	struct zloop_cmd *cmd;
+
+	cmd = container_of(work, struct zloop_cmd, work);
+
+	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+	zloop_handle_cmd(cmd);
+	current->flags = saved_flags;
+}
+
+/*
+ * blk-mq ->complete handler: post-process the command result stored in
+ * cmd->ret and end the request with the matching block status.
+ */
+static void zloop_complete_rq(struct request *rq)
+{
+ struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ struct zloop_device *zlo = rq->q->queuedata;
+ unsigned int zone_no = cmd->sector >> zlo->zone_shift;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+ blk_status_t sts = BLK_STS_OK;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
+ zone_no, cmd->sector, cmd->nr_sectors);
+
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ /* short read */
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ }
+ break;
+ case REQ_OP_WRITE:
+ case REQ_OP_ZONE_APPEND:
+ if (cmd->ret < 0)
+ pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
+ zone_no,
+ req_op(rq) == REQ_OP_WRITE ? "" : "append ",
+ cmd->sector, cmd->nr_sectors);
+
+ /* A partial write is reported as an IO error. */
+ if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
+ pr_err("Zone %u: partial write %ld/%u B\n",
+ zone_no, cmd->ret, blk_rq_bytes(rq));
+ cmd->ret = -EIO;
+ }
+
+ if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ /*
+ * A write to a sequential zone file failed: mark the
+ * zone as having an error. This will be corrected and
+ * cleared when the next IO is submitted.
+ */
+ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
+ break;
+ }
+ /* Report the sector the data was written at for zone append. */
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ rq->__sector = cmd->sector;
+
+ break;
+ default:
+ break;
+ }
+
+ if (cmd->ret < 0)
+ sts = errno_to_blk_status(cmd->ret);
+ blk_mq_end_request(rq, sts);
+}
+
+/*
+ * blk-mq ->queue_rq handler: start the request and defer its execution to
+ * the device workqueue. Requests are failed with BLK_STS_IOERR once the
+ * device is being deleted.
+ */
+static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
+				   const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct zloop_device *zlo = rq->q->queuedata;
+	struct zloop_cmd *cmd;
+
+	if (zlo->state == Zlo_deleting)
+		return BLK_STS_IOERR;
+
+	blk_mq_start_request(rq);
+
+	cmd = blk_mq_rq_to_pdu(rq);
+	INIT_WORK(&cmd->work, zloop_cmd_workfn);
+	queue_work(zlo->workqueue, &cmd->work);
+
+	return BLK_STS_OK;
+}
+
+/* blk-mq operations: requests are queued to a workqueue and completed later. */
+static const struct blk_mq_ops zloop_mq_ops = {
+ .queue_rq = zloop_queue_rq,
+ .complete = zloop_complete_rq,
+};
+
+/*
+ * Block device open: only allowed while the device is live. Returns -ENXIO
+ * otherwise, or the mutex_lock_killable() error if the wait for the control
+ * mutex was interrupted.
+ */
+static int zloop_open(struct gendisk *disk, blk_mode_t mode)
+{
+	struct zloop_device *zlo = disk->private_data;
+	bool is_live;
+	int err;
+
+	err = mutex_lock_killable(&zloop_ctl_mutex);
+	if (err)
+		return err;
+
+	is_live = zlo->state == Zlo_live;
+	mutex_unlock(&zloop_ctl_mutex);
+
+	return is_live ? 0 : -ENXIO;
+}
+
+/*
+ * Report up to @nr_zones zones starting with the zone containing @sector,
+ * calling @cb for each of them.
+ *
+ * Returns the number of zones reported (0 if @sector is beyond the last
+ * zone), or the first error returned by zloop_update_seq_zone() or @cb.
+ */
+static int zloop_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+ struct zloop_device *zlo = disk->private_data;
+ struct blk_zone blkz = {};
+ unsigned int first, i;
+ int ret;
+
+ first = disk_zone_no(disk, sector);
+ if (first >= zlo->nr_zones)
+ return 0;
+ /* Do not report past the last zone of the device. */
+ nr_zones = min(nr_zones, zlo->nr_zones - first);
+
+ for (i = 0; i < nr_zones; i++) {
+ unsigned int zone_no = first + i;
+ struct zloop_zone *zone = &zlo->zones[zone_no];
+
+ mutex_lock(&zone->lock);
+
+ /* A previous write failed: refresh the zone state first. */
+ if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
+ ret = zloop_update_seq_zone(zlo, zone_no);
+ if (ret) {
+ mutex_unlock(&zone->lock);
+ return ret;
+ }
+ }
+
+ blkz.start = zone->start;
+ blkz.len = zlo->zone_size;
+ blkz.wp = zone->wp;
+ blkz.cond = zone->cond;
+ if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
+ blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ blkz.capacity = zlo->zone_size;
+ } else {
+ blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+ blkz.capacity = zlo->zone_capacity;
+ }
+
+ mutex_unlock(&zone->lock);
+
+ /* The callback is invoked without the zone lock held. */
+ ret = cb(&blkz, i, data);
+ if (ret)
+ return ret;
+ }
+
+ return nr_zones;
+}
+
+/*
+ * ->free_disk handler: release everything owned by the device, that is the
+ * zone files, the data directory, the workqueue, the base directory string
+ * and the device structure itself.
+ */
+static void zloop_free_disk(struct gendisk *disk)
+{
+ struct zloop_device *zlo = disk->private_data;
+ unsigned int i;
+
+ for (i = 0; i < zlo->nr_zones; i++) {
+ struct zloop_zone *zone = &zlo->zones[i];
+
+ /*
+ * NOTE(review): no assignment to old_gfp_mask is visible in
+ * this part of the file - confirm where the mask is saved.
+ */
+ mapping_set_gfp_mask(zone->file->f_mapping,
+ zone->old_gfp_mask);
+ fput(zone->file);
+ }
+
+ fput(zlo->data_dir);
+ destroy_workqueue(zlo->workqueue);
+ kfree(zlo->base_dir);
+ kvfree(zlo);
+}
+
+/* Block device operations of zloop disks. */
+static const struct block_device_operations zloop_fops = {
+ .owner = THIS_MODULE,
+ .open = zloop_open,
+ .report_zones = zloop_report_zones,
+ .free_disk = zloop_free_disk,
+};
+
+/*
+ * filp_open() a file whose path is built from a printf-style format.
+ * Returns the opened file, the ERR_PTR() of filp_open(), or
+ * ERR_PTR(-ENOMEM) if the path could not be allocated.
+ */
+__printf(3, 4)
+static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
+					const char *fmt, ...)
+{
+	struct file *filp;
+	va_list args;
+	char *path;
+
+	va_start(args, fmt);
+	path = kvasprintf(GFP_KERNEL, fmt, args);
+	va_end(args);
+	if (path == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	filp = filp_open(path, oflags, mode);
+	kfree(path);
+
+	return filp;
+}
+
+/*
+ * Set the device block size (zlo->block_size) from the properties of the
+ * file system holding the file of @zone.
+ *
+ * Returns 0 on success and -EINVAL if the zone capacity is not aligned to
+ * the block size that was selected.
+ */
+static int zloop_get_block_size(struct zloop_device *zlo,
+ struct zloop_zone *zone)
+{
+ struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
+ struct kstat st;
+
+ /*
+ * If the FS block size is lower than or equal to 4K, use that as the
+ * device block size. Otherwise, fallback to the FS direct IO alignment
+ * constraint if that is provided, and to the FS underlying device
+ * physical block size if the direct IO alignment is unknown.
+ */
+ if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
+ zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
+ else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
+ (st.result_mask & STATX_DIOALIGN))
+ zlo->block_size = st.dio_offset_align;
+ else if (sb_bdev)
+ zlo->block_size = bdev_physical_block_size(sb_bdev);
+ else
+ /* No backing block device to query: use the sector size. */
+ zlo->block_size = SECTOR_SIZE;
+
+ /* zone_capacity is in sectors, so compare in sector units. */
+ if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
+ pr_err("Zone capacity is not aligned to block size %u\n",
+ zlo->block_size);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Open the backing file of zone @zone_no and initialize the zone from it.
+ *
+ * The file is created if it does not exist, unless @restore is set, that is,
+ * unless an existing device is being re-added. Conventional zone files are
+ * sized to the zone size (and must already have that size when restoring).
+ * The condition and write pointer of sequential zones are derived from the
+ * size of their file. The device block size is determined from the first zone
+ * file that is opened.
+ *
+ * Returns 0 on success or a negative error code. On failure, the zone file is
+ * closed again, so that the caller only has to release the files of the zones
+ * that were successfully initialized.
+ */
+static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
+			   unsigned int zone_no, bool restore)
+{
+	struct zloop_zone *zone = &zlo->zones[zone_no];
+	int oflags = O_RDWR;
+	struct kstat stat;
+	sector_t file_sectors;
+	int ret;
+
+	mutex_init(&zone->lock);
+	zone->start = (sector_t)zone_no << zlo->zone_shift;
+
+	if (!restore)
+		oflags |= O_CREAT;
+
+	if (!opts->buffered_io)
+		oflags |= O_DIRECT;
+
+	if (zone_no < zlo->nr_conv_zones) {
+		/* Conventional zone file. */
+		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
+		zone->cond = BLK_ZONE_COND_NOT_WP;
+		zone->wp = U64_MAX;
+
+		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
+					zlo->base_dir, zlo->id, zone_no);
+		if (IS_ERR(zone->file)) {
+			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)\n",
+			       zone_no, zlo->base_dir, zlo->id, zone_no,
+			       PTR_ERR(zone->file));
+			return PTR_ERR(zone->file);
+		}
+
+		if (!zlo->block_size) {
+			ret = zloop_get_block_size(zlo, zone);
+			if (ret)
+				goto out_put_file;
+		}
+
+		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
+		if (ret < 0) {
+			pr_err("Failed to get zone %u file stat\n", zone_no);
+			goto out_put_file;
+		}
+		file_sectors = stat.size >> SECTOR_SHIFT;
+
+		/*
+		 * vfs_getattr() succeeded, so ret is 0 here: an explicit error
+		 * code is needed to report the size mismatch.
+		 */
+		if (restore && file_sectors != zlo->zone_size) {
+			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
+			       zone_no, file_sectors, zlo->zone_size);
+			ret = -EINVAL;
+			goto out_put_file;
+		}
+
+		ret = vfs_truncate(&zone->file->f_path,
+				   zlo->zone_size << SECTOR_SHIFT);
+		if (ret < 0) {
+			pr_err("Failed to truncate zone %u file (err=%d)\n",
+			       zone_no, ret);
+			goto out_put_file;
+		}
+
+		return 0;
+	}
+
+	/* Sequential zone file. */
+	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
+					 zlo->base_dir, zlo->id, zone_no);
+	if (IS_ERR(zone->file)) {
+		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)\n",
+		       zone_no, zlo->base_dir, zlo->id, zone_no,
+		       PTR_ERR(zone->file));
+		return PTR_ERR(zone->file);
+	}
+
+	if (!zlo->block_size) {
+		ret = zloop_get_block_size(zlo, zone);
+		if (ret)
+			goto out_put_file;
+	}
+
+	mutex_lock(&zone->lock);
+	ret = zloop_update_seq_zone(zlo, zone_no);
+	mutex_unlock(&zone->lock);
+	if (ret)
+		goto out_put_file;
+
+	return 0;
+
+out_put_file:
+	/* The caller does not close the file of a zone that failed to init. */
+	fput(zone->file);
+	zone->file = NULL;
+	return ret;
+}
+
+/*
+ * Tell whether zone files already exist for the device, by trying to open
+ * the file of zone 0 both as a conventional and as a sequential zone file.
+ */
+static bool zloop_dev_exists(struct zloop_device *zlo)
+{
+	struct file *cnv, *seq;
+	bool found = false;
+
+	cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
+				  zlo->base_dir, zlo->id, 0);
+	seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
+				  zlo->base_dir, zlo->id, 0);
+
+	if (!IS_ERR(cnv)) {
+		found = true;
+		fput(cnv);
+	}
+	if (!IS_ERR(seq)) {
+		found = true;
+		fput(seq);
+	}
+
+	return found;
+}
+
+/*
+ * Create (or restore) a zoned loop device according to @opts.
+ *
+ * This allocates the device and its ID, opens the device data directory
+ * <base_dir>/<id> (which must already exist), initializes all zones from
+ * their backing files and finally allocates and adds the disk. A module
+ * reference is held for as long as the device exists.
+ *
+ * Returns 0 on success or a negative error code, with -ENOENT reported as
+ * -EINVAL.
+ */
+static int zloop_ctl_add(struct zloop_options *opts)
+{
+	struct queue_limits lim = {
+		.max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
+		.max_hw_zone_append_sectors = SZ_1M >> SECTOR_SHIFT,
+		.chunk_sectors = opts->zone_size,
+		.features = BLK_FEAT_ZONED,
+	};
+	unsigned int nr_zones, i, j;
+	struct zloop_device *zlo;
+	int ret = -EINVAL;
+	bool restore;
+
+	__module_get(THIS_MODULE);
+
+	nr_zones = opts->capacity >> ilog2(opts->zone_size);
+	if (opts->nr_conv_zones >= nr_zones) {
+		pr_err("Invalid number of conventional zones %u\n",
+		       opts->nr_conv_zones);
+		goto out;
+	}
+
+	zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
+	if (!zlo) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	zlo->state = Zlo_creating;
+
+	ret = mutex_lock_killable(&zloop_ctl_mutex);
+	if (ret)
+		goto out_free_dev;
+
+	/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
+	if (opts->id >= 0) {
+		ret = idr_alloc(&zloop_index_idr, zlo,
+				opts->id, opts->id + 1, GFP_KERNEL);
+		if (ret == -ENOSPC)
+			ret = -EEXIST;
+	} else {
+		ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
+	}
+	mutex_unlock(&zloop_ctl_mutex);
+	if (ret < 0)
+		goto out_free_dev;
+
+	zlo->id = ret;
+	zlo->zone_shift = ilog2(opts->zone_size);
+	zlo->zone_size = opts->zone_size;
+	if (opts->zone_capacity)
+		zlo->zone_capacity = opts->zone_capacity;
+	else
+		zlo->zone_capacity = zlo->zone_size;
+	zlo->nr_zones = nr_zones;
+	zlo->nr_conv_zones = opts->nr_conv_zones;
+	zlo->buffered_io = opts->buffered_io;
+
+	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
+				opts->nr_queues * opts->queue_depth, zlo->id);
+	if (!zlo->workqueue) {
+		ret = -ENOMEM;
+		goto out_free_idr;
+	}
+
+	if (opts->base_dir)
+		zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
+	else
+		zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
+	if (!zlo->base_dir) {
+		ret = -ENOMEM;
+		goto out_destroy_workqueue;
+	}
+
+	zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
+					    zlo->base_dir, zlo->id);
+	if (IS_ERR(zlo->data_dir)) {
+		ret = PTR_ERR(zlo->data_dir);
+		pr_warn("Failed to open directory %s/%u (err=%d)\n",
+			zlo->base_dir, zlo->id, ret);
+		goto out_free_base_dir;
+	}
+
+	/*
+	 * If we already have zone files, we are restoring a device created by a
+	 * previous add operation. In this case, zloop_init_zone() will check
+	 * that the zone files are consistent with the zone configuration given.
+	 */
+	restore = zloop_dev_exists(zlo);
+	for (i = 0; i < nr_zones; i++) {
+		ret = zloop_init_zone(zlo, opts, i, restore);
+		if (ret)
+			goto out_close_files;
+	}
+
+	lim.physical_block_size = zlo->block_size;
+	lim.logical_block_size = zlo->block_size;
+
+	zlo->tag_set.ops = &zloop_mq_ops;
+	zlo->tag_set.nr_hw_queues = opts->nr_queues;
+	zlo->tag_set.queue_depth = opts->queue_depth;
+	zlo->tag_set.numa_node = NUMA_NO_NODE;
+	zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
+	zlo->tag_set.driver_data = zlo;
+
+	ret = blk_mq_alloc_tag_set(&zlo->tag_set);
+	if (ret) {
+		pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
+		goto out_close_files;
+	}
+
+	zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
+	if (IS_ERR(zlo->disk)) {
+		/* Fetch the error first so that the message reports it. */
+		ret = PTR_ERR(zlo->disk);
+		pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
+		goto out_cleanup_tags;
+	}
+	zlo->disk->flags = GENHD_FL_NO_PART;
+	zlo->disk->fops = &zloop_fops;
+	zlo->disk->private_data = zlo;
+	sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
+	set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
+
+	ret = blk_revalidate_disk_zones(zlo->disk);
+	if (ret)
+		goto out_cleanup_disk;
+
+	ret = add_disk(zlo->disk);
+	if (ret) {
+		pr_err("add_disk failed (err=%d)\n", ret);
+		goto out_cleanup_disk;
+	}
+
+	mutex_lock(&zloop_ctl_mutex);
+	zlo->state = Zlo_live;
+	mutex_unlock(&zloop_ctl_mutex);
+
+	pr_info("Added device %d: %u zones of %llu MB, %u B block size\n",
+		zlo->id, zlo->nr_zones,
+		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
+		zlo->block_size);
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(zlo->disk);
+out_cleanup_tags:
+	blk_mq_free_tag_set(&zlo->tag_set);
+out_close_files:
+	/* Only the zones [0, i) were successfully initialized. */
+	for (j = 0; j < i; j++) {
+		struct zloop_zone *zone = &zlo->zones[j];
+
+		if (!IS_ERR_OR_NULL(zone->file))
+			fput(zone->file);
+	}
+	fput(zlo->data_dir);
+out_free_base_dir:
+	kfree(zlo->base_dir);
+out_destroy_workqueue:
+	destroy_workqueue(zlo->workqueue);
+out_free_idr:
+	mutex_lock(&zloop_ctl_mutex);
+	idr_remove(&zloop_index_idr, zlo->id);
+	mutex_unlock(&zloop_ctl_mutex);
+out_free_dev:
+	kvfree(zlo);
+out:
+	module_put(THIS_MODULE);
+	if (ret == -ENOENT)
+		ret = -EINVAL;
+	return ret;
+}
+
+/*
+ * Remove the device whose ID is given in @opts.
+ *
+ * Returns -EINVAL if no ID was specified or if the device is already being
+ * deleted, -ENODEV if there is no such device or if it is still being
+ * created, and 0 once the disk has been deleted. The module reference taken
+ * by zloop_ctl_add() is dropped on success.
+ */
+static int zloop_ctl_remove(struct zloop_options *opts)
+{
+ struct zloop_device *zlo;
+ int ret;
+
+ if (!(opts->mask & ZLOOP_OPT_ID)) {
+ pr_err("No ID specified\n");
+ return -EINVAL;
+ }
+
+ ret = mutex_lock_killable(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ zlo = idr_find(&zloop_index_idr, opts->id);
+ if (!zlo || zlo->state == Zlo_creating) {
+ ret = -ENODEV;
+ } else if (zlo->state == Zlo_deleting) {
+ ret = -EINVAL;
+ } else {
+ /* Release the ID and make zloop_queue_rq() fail new requests. */
+ idr_remove(&zloop_index_idr, zlo->id);
+ zlo->state = Zlo_deleting;
+ }
+
+ mutex_unlock(&zloop_ctl_mutex);
+ if (ret)
+ return ret;
+
+ del_gendisk(zlo->disk);
+ put_disk(zlo->disk);
+ blk_mq_free_tag_set(&zlo->tag_set);
+
+ pr_info("Removed device %d\n", opts->id);
+
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+/*
+ * Parse the option string @buf (options separated by ',' or '\n') into @opts.
+ *
+ * @opts is first loaded with the driver defaults. Each recognized option then
+ * overrides its field and has its token OR-ed into opts->mask. A NULL @buf
+ * leaves the defaults untouched.
+ *
+ * Sizes given in MB (capacity, zone size, zone capacity) are stored as a
+ * number of sectors. A base directory string is duplicated into
+ * opts->base_dir and is not freed here, including on error.
+ *
+ * Returns 0 on success, -ENOMEM if a string duplication fails, or -EINVAL for
+ * an unknown option, a malformed or rejected value, or an inconsistent
+ * capacity / zone size / zone capacity combination.
+ */
+static int zloop_parse_options(struct zloop_options *opts, const char *buf)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *options, *cursor, *item, *dir;
+	unsigned int token, val;
+	int ret = -EINVAL;
+
+	/* Start from the defaults; parsed options override them. */
+	opts->mask = 0;
+	opts->id = ZLOOP_DEF_ID;
+	opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
+	opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
+	opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
+	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
+	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
+	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
+
+	if (!buf)
+		return 0;
+
+	/* Ignore any whitespace in front of the first option. */
+	while (isspace(*buf))
+		buf++;
+
+	/* strsep() modifies its input, so work on a private copy. */
+	options = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+	cursor = options;
+
+	/*
+	 * Walk the options one at a time, doing only light checks on the
+	 * values. ret is preset to -EINVAL, so the invalid-value paths simply
+	 * jump to the exit; only the allocation failure overrides it.
+	 */
+	while ((item = strsep(&cursor, ",\n")) != NULL) {
+		/* Skip empty items, e.g. the one after a trailing newline. */
+		if (*item == '\0')
+			continue;
+
+		token = match_token(item, zloop_opt_tokens, args);
+		opts->mask |= token;
+
+		switch (token) {
+		case ZLOOP_OPT_ID:
+			if (match_int(args, &opts->id))
+				goto out;
+			break;
+		case ZLOOP_OPT_CAPACITY:
+			if (match_uint(args, &val))
+				goto out;
+			if (!val) {
+				pr_err("Invalid capacity\n");
+				goto out;
+			}
+			opts->capacity =
+				((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+			break;
+		case ZLOOP_OPT_ZONE_SIZE:
+			if (match_uint(args, &val))
+				goto out;
+			if (!val || val > ZLOOP_MAX_ZONE_SIZE_MB ||
+			    !is_power_of_2(val)) {
+				pr_err("Invalid zone size %u\n", val);
+				goto out;
+			}
+			opts->zone_size =
+				((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+			break;
+		case ZLOOP_OPT_ZONE_CAPACITY:
+			if (match_uint(args, &val))
+				goto out;
+			if (!val) {
+				pr_err("Invalid zone capacity\n");
+				goto out;
+			}
+			opts->zone_capacity =
+				((sector_t)val * SZ_1M) >> SECTOR_SHIFT;
+			break;
+		case ZLOOP_OPT_NR_CONV_ZONES:
+			if (match_uint(args, &val))
+				goto out;
+			opts->nr_conv_zones = val;
+			break;
+		case ZLOOP_OPT_BASE_DIR:
+			dir = match_strdup(args);
+			if (!dir) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			/* A repeated option replaces the previous string. */
+			kfree(opts->base_dir);
+			opts->base_dir = dir;
+			break;
+		case ZLOOP_OPT_NR_QUEUES:
+			if (match_uint(args, &val))
+				goto out;
+			if (!val) {
+				pr_err("Invalid number of queues\n");
+				goto out;
+			}
+			/* Never use more queues than there are online CPUs. */
+			opts->nr_queues = min(val, num_online_cpus());
+			break;
+		case ZLOOP_OPT_QUEUE_DEPTH:
+			if (match_uint(args, &val))
+				goto out;
+			if (!val) {
+				pr_err("Invalid queue depth\n");
+				goto out;
+			}
+			opts->queue_depth = val;
+			break;
+		case ZLOOP_OPT_BUFFERED_IO:
+			opts->buffered_io = true;
+			break;
+		case ZLOOP_OPT_ERR:
+		default:
+			pr_warn("unknown parameter or missing value '%s'\n",
+				item);
+			goto out;
+		}
+	}
+
+	/* Cross-check the values once all options have been seen. */
+	if (opts->capacity <= opts->zone_size) {
+		pr_err("Invalid capacity\n");
+		goto out;
+	}
+
+	if (opts->zone_capacity > opts->zone_size) {
+		pr_err("Invalid zone capacity\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	kfree(options);
+	return ret;
+}
+
+/* Codes of the operations accepted by the control device. */
+enum {
+	ZLOOP_CTL_ADD,
+	ZLOOP_CTL_REMOVE,
+};
+
+/*
+ * Operation name to operation code table. The entry with a NULL name marks
+ * the end of the table. Keep "add" and "remove" at index 0 and 1:
+ * zloop_ctl_show() refers to them by position.
+ */
+static struct zloop_ctl_op {
+	int		code;
+	const char	*name;
+} zloop_ctl_ops[] = {
+	{ .code = ZLOOP_CTL_ADD,	.name = "add" },
+	{ .code = ZLOOP_CTL_REMOVE,	.name = "remove" },
+	{ .code = -1,			.name = NULL },
+};
+
+/*
+ * Write handler of the control device. The written text starts with an
+ * operation name from zloop_ctl_ops[], optionally followed by the options for
+ * that operation. Writes larger than PAGE_SIZE are rejected.
+ *
+ * Returns @count on success or a negative error code.
+ */
+static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
+			       size_t count, loff_t *pos)
+{
+	struct zloop_options opts = { };
+	struct zloop_ctl_op *op;
+	const char *buf, *opts_buf = NULL;
+	size_t name_len;
+	int ret;
+
+	if (count > PAGE_SIZE)
+		return -ENOMEM;
+
+	/* Get a NUL-terminated kernel copy of the user data. */
+	buf = memdup_user_nul(ubuf, count);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	/* Look for the operation whose name is a prefix of the input. */
+	for (op = zloop_ctl_ops; op->name; op++) {
+		if (!strncmp(buf, op->name, strlen(op->name)))
+			break;
+	}
+
+	/* Reaching the NULL-named terminator means nothing matched. */
+	if (!op->name) {
+		pr_err("Invalid operation\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Whatever follows the operation name is the option string. */
+	name_len = strlen(op->name);
+	if (count > name_len)
+		opts_buf = buf + name_len;
+
+	ret = zloop_parse_options(&opts, opts_buf);
+	if (ret) {
+		pr_err("Failed to parse options\n");
+		goto out;
+	}
+
+	switch (op->code) {
+	case ZLOOP_CTL_ADD:
+		ret = zloop_ctl_add(&opts);
+		break;
+	case ZLOOP_CTL_REMOVE:
+		ret = zloop_ctl_remove(&opts);
+		break;
+	default:
+		pr_err("Invalid operation\n");
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	kfree(opts.base_dir);
+	kfree(buf);
+	if (ret)
+		return ret;
+	return count;
+}
+
+/*
+ * Show handler of the control device: print one line per operation, giving
+ * the operation name followed by the option patterns.
+ *
+ * Always returns 0.
+ */
+static int zloop_ctl_show(struct seq_file *seq_file, void *private)
+{
+	const struct match_token *first = zloop_opt_tokens;
+	const struct match_token *end = first + ARRAY_SIZE(zloop_opt_tokens);
+	const struct match_token *tok;
+
+	/* "add" line: every option pattern, up to the first NULL pattern. */
+	seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
+	for (tok = first; tok < end && tok->pattern; tok++) {
+		/* Comma-separate all patterns but the first one. */
+		if (tok != first)
+			seq_putc(seq_file, ',');
+		seq_puts(seq_file, tok->pattern);
+	}
+	seq_putc(seq_file, '\n');
+
+	/* "remove" line: the operation name followed by the ID pattern. */
+	seq_puts(seq_file, zloop_ctl_ops[1].name);
+	seq_puts(seq_file, " id=%d\n");
+
+	return 0;
+}
+
+/*
+ * Open handler of the control device: set up a single-record seq_file whose
+ * content is produced by zloop_ctl_show(). Returns what single_open() returns.
+ */
+static int zloop_ctl_open(struct inode *inode, struct file *file)
+{
+ /*
+  * NOTE(review): presumably cleared because the misc core leaves its own
+  * pointer in private_data and the seq_file code expects to find it
+  * unset when opening - confirm against misc_open() and seq_open().
+  */
+ file->private_data = NULL;
+ return single_open(file, zloop_ctl_show, NULL);
+}
+
+/*
+ * Release handler of the control device: undo zloop_ctl_open() by handing the
+ * file back to single_release().
+ */
+static int zloop_ctl_release(struct inode *inode, struct file *file)
+{
+ return single_release(inode, file);
+}
+
+/*
+ * File operations of the control device: reads go through the seq_file set up
+ * at open time, writes carry the add/remove commands.
+ */
+static const struct file_operations zloop_ctl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= zloop_ctl_open,
+	.read		= seq_read,
+	.write		= zloop_ctl_write,
+	.release	= zloop_ctl_release,
+};
+
+/* The "zloop-control" misc device, registered with a dynamic minor number. */
+static struct miscdevice zloop_misc = {
+	.name	= "zloop-control",
+	.minor	= MISC_DYNAMIC_MINOR,
+	.fops	= &zloop_ctl_fops,
+};
+
+/*
+ * Module initialization: register the control misc device.
+ *
+ * Returns 0 on success or the error returned by misc_register().
+ */
+static int __init zloop_init(void)
+{
+	int err = misc_register(&zloop_misc);
+
+	if (!err) {
+		pr_info("Module loaded\n");
+		return 0;
+	}
+
+	pr_err("Failed to register misc device: %d\n", err);
+	return err;
+}
+
+/*
+ * Module exit: unregister the control misc device, then destroy the device ID
+ * IDR.
+ */
+static void __exit zloop_exit(void)
+{
+ /* Unregister the control device before tearing down the ID allocator. */
+ misc_deregister(&zloop_misc);
+ idr_destroy(&zloop_index_idr);
+}
+
+/* Module entry points and metadata. */
+module_init(zloop_init);
+module_exit(zloop_exit);
+
+MODULE_DESCRIPTION("Zoned loopback device");
+MODULE_LICENSE("GPL");