// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2025, Christoph Hellwig.
 * Copyright (c) 2025, Western Digital Corporation or its affiliates.
 *
 * Zoned Loop Device driver - exports a zoned block device using one file per
 * zone as backing storage.
 */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/blkzoned.h>
#include <linux/pagemap.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>

/*
 * Options for adding (and removing) a device.
 */
enum {
	ZLOOP_OPT_ERR			= 0,
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
};

static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d"			},
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
	{ ZLOOP_OPT_ERR,		NULL			}
};

/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID			-1
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384

struct zloop_options {
	unsigned int		mask;
	int			id;
	sector_t		capacity;
	sector_t		zone_size;
	sector_t		zone_capacity;
	unsigned int		nr_conv_zones;
	char			*base_dir;
	unsigned int		nr_queues;
	unsigned int		queue_depth;
	bool			buffered_io;
};

/*
 * Device states.
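 * A device is in Zlo_creating while zloop_ctl_add() sets it up, becomes
 * Zlo_live once its gendisk has been added, and switches to Zlo_deleting
 * while a remove operation tears it down.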
*/ enum { Zlo_creating = 0, Zlo_live, Zlo_deleting, }; enum zloop_zone_flags { ZLOOP_ZONE_CONV = 0, ZLOOP_ZONE_SEQ_ERROR, }; struct zloop_zone { struct file *file; unsigned long flags; struct mutex lock; enum blk_zone_cond cond; sector_t start; sector_t wp; gfp_t old_gfp_mask; }; struct zloop_device { unsigned int id; unsigned int state; struct blk_mq_tag_set tag_set; struct gendisk *disk; struct workqueue_struct *workqueue; bool buffered_io; const char *base_dir; struct file *data_dir; unsigned int zone_shift; sector_t zone_size; sector_t zone_capacity; unsigned int nr_zones; unsigned int nr_conv_zones; unsigned int block_size; struct zloop_zone zones[] __counted_by(nr_zones); }; struct zloop_cmd { struct work_struct work; atomic_t ref; sector_t sector; sector_t nr_sectors; long ret; struct kiocb iocb; struct bio_vec *bvec; }; static DEFINE_IDR(zloop_index_idr); static DEFINE_MUTEX(zloop_ctl_mutex); static unsigned int rq_zone_no(struct request *rq) { struct zloop_device *zlo = rq->q->queuedata; return blk_rq_pos(rq) >> zlo->zone_shift; } static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; struct kstat stat; sector_t file_sectors; int ret; lockdep_assert_held(&zone->lock); ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); if (ret < 0) { pr_err("Failed to get zone %u file stat (err=%d)\n", zone_no, ret); set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); return ret; } file_sectors = stat.size >> SECTOR_SHIFT; if (file_sectors > zlo->zone_capacity) { pr_err("Zone %u file too large (%llu sectors > %llu)\n", zone_no, file_sectors, zlo->zone_capacity); return -EINVAL; } if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { pr_err("Zone %u file size not aligned to block size %u\n", zone_no, zlo->block_size); return -EINVAL; } if (!file_sectors) { zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; } else if (file_sectors == zlo->zone_capacity) { zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zlo->zone_size; } else { zone->cond = BLK_ZONE_COND_CLOSED; zone->wp = zone->start + file_sectors; } return 0; } static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) return -EIO; mutex_lock(&zone->lock); if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { ret = zloop_update_seq_zone(zlo, zone_no); if (ret) goto unlock; } switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: break; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_CLOSED: case BLK_ZONE_COND_IMP_OPEN: zone->cond = BLK_ZONE_COND_EXP_OPEN; break; case BLK_ZONE_COND_FULL: default: ret = -EIO; break; } unlock: mutex_unlock(&zone->lock); return ret; } static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) return -EIO; mutex_lock(&zone->lock); if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { ret = zloop_update_seq_zone(zlo, zone_no); if (ret) goto unlock; } switch (zone->cond) { case BLK_ZONE_COND_CLOSED: break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: if (zone->wp == zone->start) zone->cond = BLK_ZONE_COND_EMPTY; else zone->cond = BLK_ZONE_COND_CLOSED; break; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_FULL: default: ret = -EIO; break; } unlock: mutex_unlock(&zone->lock); return ret; } static int zloop_reset_zone(struct zloop_device *zlo, unsigned int 
zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) return -EIO; mutex_lock(&zone->lock); if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && zone->cond == BLK_ZONE_COND_EMPTY) goto unlock; if (vfs_truncate(&zone->file->f_path, 0)) { set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); ret = -EIO; goto unlock; } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); unlock: mutex_unlock(&zone->lock); return ret; } static int zloop_reset_all_zones(struct zloop_device *zlo) { unsigned int i; int ret; for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) { ret = zloop_reset_zone(zlo, i); if (ret) return ret; } return 0; } static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; int ret = 0; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) return -EIO; mutex_lock(&zone->lock); if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && zone->cond == BLK_ZONE_COND_FULL) goto unlock; if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) { set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); ret = -EIO; goto unlock; } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zlo->zone_size; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); unlock: mutex_unlock(&zone->lock); return ret; } static void zloop_put_cmd(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); if (!atomic_dec_and_test(&cmd->ref)) return; kfree(cmd->bvec); cmd->bvec = NULL; if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } static void zloop_rw_complete(struct kiocb *iocb, long ret) { struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb); cmd->ret = ret; zloop_put_cmd(cmd); } static void zloop_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct zloop_device *zlo = rq->q->queuedata; unsigned int zone_no = rq_zone_no(rq); sector_t sector = blk_rq_pos(rq); sector_t nr_sectors = blk_rq_sectors(rq); bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; int rw = is_write ? ITER_SOURCE : ITER_DEST; struct req_iterator rq_iter; struct zloop_zone *zone; struct iov_iter iter; struct bio_vec tmp; sector_t zone_end; int nr_bvec = 0; int ret; atomic_set(&cmd->ref, 2); cmd->sector = sector; cmd->nr_sectors = nr_sectors; cmd->ret = 0; /* We should never get an I/O beyond the device capacity. */ if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { ret = -EIO; goto out; } zone = &zlo->zones[zone_no]; zone_end = zone->start + zlo->zone_capacity; /* * The block layer should never send requests that are not fully * contained within the zone. */ if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { ret = -EIO; goto out; } if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { mutex_lock(&zone->lock); ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); if (ret) goto out; } if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { mutex_lock(&zone->lock); if (is_append) { sector = zone->wp; cmd->sector = sector; } /* * Write operations must be aligned to the write pointer and * fully contained within the zone capacity. */ if (sector != zone->wp || zone->wp + nr_sectors > zone_end) { pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", zone_no, sector, zone->wp); ret = -EIO; goto unlock; } /* Implicitly open the target zone. 
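		 * Per the zoned block device state machine, a write to an
		 * EMPTY or CLOSED zone transitions it to IMPLICIT OPEN.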
*/ if (zone->cond == BLK_ZONE_COND_CLOSED || zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; /* * Advance the write pointer of sequential zones. If the write * fails, the wp position will be corrected when the next I/O * copmpletes. */ zone->wp += nr_sectors; if (zone->wp == zone_end) zone->cond = BLK_ZONE_COND_FULL; } rq_for_each_bvec(tmp, rq, rq_iter) nr_bvec++; if (rq->bio != rq->biotail) { struct bio_vec *bvec; cmd->bvec = kmalloc_array(nr_bvec, sizeof(*cmd->bvec), GFP_NOIO); if (!cmd->bvec) { ret = -EIO; goto unlock; } /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec * API will take care of all details for us. */ bvec = cmd->bvec; rq_for_each_bvec(tmp, rq, rq_iter) { *bvec = tmp; bvec++; } iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); } else { /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ iov_iter_bvec(&iter, rw, __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter), nr_bvec, blk_rq_bytes(rq)); iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) cmd->iocb.ki_flags = IOCB_DIRECT; cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); else ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); unlock: if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) mutex_unlock(&zone->lock); out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); zloop_put_cmd(cmd); } static void zloop_handle_cmd(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct zloop_device *zlo = rq->q->queuedata; switch (req_op(rq)) { case REQ_OP_READ: case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: /* * zloop_rw() always executes asynchronously or completes * directly. 
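		 * Completion is driven by zloop_rw_complete() and
		 * zloop_put_cmd(), so do not complete the request here.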
*/ zloop_rw(cmd); return; case REQ_OP_FLUSH: /* * Sync the entire FS containing the zone files instead of * walking all files */ cmd->ret = sync_filesystem(file_inode(zlo->data_dir)->i_sb); break; case REQ_OP_ZONE_RESET: cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq)); break; case REQ_OP_ZONE_RESET_ALL: cmd->ret = zloop_reset_all_zones(zlo); break; case REQ_OP_ZONE_FINISH: cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq)); break; case REQ_OP_ZONE_OPEN: cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq)); break; case REQ_OP_ZONE_CLOSE: cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq)); break; default: WARN_ON_ONCE(1); pr_err("Unsupported operation %d\n", req_op(rq)); cmd->ret = -EOPNOTSUPP; break; } blk_mq_complete_request(rq); } static void zloop_cmd_workfn(struct work_struct *work) { struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work); int orig_flags = current->flags; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; zloop_handle_cmd(cmd); current->flags = orig_flags; } static void zloop_complete_rq(struct request *rq) { struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct zloop_device *zlo = rq->q->queuedata; unsigned int zone_no = cmd->sector >> zlo->zone_shift; struct zloop_zone *zone = &zlo->zones[zone_no]; blk_status_t sts = BLK_STS_OK; switch (req_op(rq)) { case REQ_OP_READ: if (cmd->ret < 0) pr_err("Zone %u: failed read sector %llu, %llu sectors\n", zone_no, cmd->sector, cmd->nr_sectors); if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { /* short read */ struct bio *bio; __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } break; case REQ_OP_WRITE: case REQ_OP_ZONE_APPEND: if (cmd->ret < 0) pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n", zone_no, req_op(rq) == REQ_OP_WRITE ? "" : "append ", cmd->sector, cmd->nr_sectors); if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { pr_err("Zone %u: partial write %ld/%u B\n", zone_no, cmd->ret, blk_rq_bytes(rq)); cmd->ret = -EIO; } if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { /* * A write to a sequential zone file failed: mark the * zone as having an error. This will be corrected and * cleared when the next IO is submitted. 
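			 * (zloop_update_seq_zone() then resynchronizes the
			 * zone condition and write pointer from the zone file
			 * size.)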
*/ set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); break; } if (req_op(rq) == REQ_OP_ZONE_APPEND) rq->__sector = cmd->sector; break; default: break; } if (cmd->ret < 0) sts = errno_to_blk_status(cmd->ret); blk_mq_end_request(rq, sts); } static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct zloop_device *zlo = rq->q->queuedata; if (zlo->state == Zlo_deleting) return BLK_STS_IOERR; blk_mq_start_request(rq); INIT_WORK(&cmd->work, zloop_cmd_workfn); queue_work(zlo->workqueue, &cmd->work); return BLK_STS_OK; } static const struct blk_mq_ops zloop_mq_ops = { .queue_rq = zloop_queue_rq, .complete = zloop_complete_rq, }; static int zloop_open(struct gendisk *disk, blk_mode_t mode) { struct zloop_device *zlo = disk->private_data; int ret; ret = mutex_lock_killable(&zloop_ctl_mutex); if (ret) return ret; if (zlo->state != Zlo_live) ret = -ENXIO; mutex_unlock(&zloop_ctl_mutex); return ret; } static int zloop_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct zloop_device *zlo = disk->private_data; struct blk_zone blkz = {}; unsigned int first, i; int ret; first = disk_zone_no(disk, sector); if (first >= zlo->nr_zones) return 0; nr_zones = min(nr_zones, zlo->nr_zones - first); for (i = 0; i < nr_zones; i++) { unsigned int zone_no = first + i; struct zloop_zone *zone = &zlo->zones[zone_no]; mutex_lock(&zone->lock); if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { ret = zloop_update_seq_zone(zlo, zone_no); if (ret) { mutex_unlock(&zone->lock); return ret; } } blkz.start = zone->start; blkz.len = zlo->zone_size; blkz.wp = zone->wp; blkz.cond = zone->cond; if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { blkz.type = BLK_ZONE_TYPE_CONVENTIONAL; blkz.capacity = zlo->zone_size; } else { blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ; blkz.capacity = zlo->zone_capacity; } mutex_unlock(&zone->lock); ret = cb(&blkz, i, data); if (ret) return ret; } return nr_zones; } static void zloop_free_disk(struct gendisk *disk) { struct zloop_device *zlo = disk->private_data; unsigned int i; for (i = 0; i < zlo->nr_zones; i++) { struct zloop_zone *zone = &zlo->zones[i]; mapping_set_gfp_mask(zone->file->f_mapping, zone->old_gfp_mask); fput(zone->file); } fput(zlo->data_dir); destroy_workqueue(zlo->workqueue); kfree(zlo->base_dir); kvfree(zlo); } static const struct block_device_operations zloop_fops = { .owner = THIS_MODULE, .open = zloop_open, .report_zones = zloop_report_zones, .free_disk = zloop_free_disk, }; __printf(3, 4) static struct file *zloop_filp_open_fmt(int oflags, umode_t mode, const char *fmt, ...) { struct file *file; va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(GFP_KERNEL, fmt, ap); va_end(ap); if (!p) return ERR_PTR(-ENOMEM); file = filp_open(p, oflags, mode); kfree(p); return file; } static int zloop_get_block_size(struct zloop_device *zlo, struct zloop_zone *zone) { struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; struct kstat st; /* * If the FS block size is lower than or equal to 4K, use that as the * device block size. Otherwise, fallback to the FS direct IO alignment * constraint if that is provided, and to the FS underlying device * physical block size if the direct IO alignment is unknown. 
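	 * For example, a backing directory on a file system formatted with
	 * 4096 B blocks results in a 4096 B zloop block size.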
	 */
	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
		 (st.result_mask & STATX_DIOALIGN))
		zlo->block_size = st.dio_offset_align;
	else if (sb_bdev)
		zlo->block_size = bdev_physical_block_size(sb_bdev);
	else
		zlo->block_size = SECTOR_SIZE;

	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone capacity is not aligned to block size %u\n",
		       zlo->block_size);
		return -EINVAL;
	}

	return 0;
}

static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
			   unsigned int zone_no, bool restore)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int oflags = O_RDWR;
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	mutex_init(&zone->lock);
	zone->start = (sector_t)zone_no << zlo->zone_shift;

	if (!restore)
		oflags |= O_CREAT;
	if (!opts->buffered_io)
		oflags |= O_DIRECT;

	if (zone_no < zlo->nr_conv_zones) {
		/* Conventional zone file. */
		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
		zone->cond = BLK_ZONE_COND_NOT_WP;
		zone->wp = U64_MAX;

		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
					zlo->base_dir, zlo->id, zone_no);
		if (IS_ERR(zone->file)) {
			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
			       zone_no, zlo->base_dir, zlo->id, zone_no,
			       PTR_ERR(zone->file));
			return PTR_ERR(zone->file);
		}

		if (!zlo->block_size) {
			ret = zloop_get_block_size(zlo, zone);
			if (ret)
				return ret;
		}

		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
		if (ret < 0) {
			pr_err("Failed to get zone %u file stat\n", zone_no);
			return ret;
		}
		file_sectors = stat.size >> SECTOR_SHIFT;

		if (restore && file_sectors != zlo->zone_size) {
			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
			       zone_no, file_sectors, zlo->zone_size);
			return -EINVAL;
		}

		ret = vfs_truncate(&zone->file->f_path,
				   zlo->zone_size << SECTOR_SHIFT);
		if (ret < 0) {
			pr_err("Failed to truncate zone %u file (err=%d)\n",
			       zone_no, ret);
			return ret;
		}

		return 0;
	}

	/*
	 * Sequential zone file.
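	 * The zone condition and write pointer are derived from the zone
	 * file size by zloop_update_seq_zone().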
	 */
	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
					 zlo->base_dir, zlo->id, zone_no);
	if (IS_ERR(zone->file)) {
		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
		       zone_no, zlo->base_dir, zlo->id, zone_no,
		       PTR_ERR(zone->file));
		return PTR_ERR(zone->file);
	}

	if (!zlo->block_size) {
		ret = zloop_get_block_size(zlo, zone);
		if (ret)
			return ret;
	}

	mutex_lock(&zone->lock);
	ret = zloop_update_seq_zone(zlo, zone_no);
	mutex_unlock(&zone->lock);

	return ret;
}

static bool zloop_dev_exists(struct zloop_device *zlo)
{
	struct file *cnv, *seq;
	bool exists;

	cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
				  zlo->base_dir, zlo->id, 0);
	seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
				  zlo->base_dir, zlo->id, 0);
	exists = !IS_ERR(cnv) || !IS_ERR(seq);

	if (!IS_ERR(cnv))
		fput(cnv);
	if (!IS_ERR(seq))
		fput(seq);

	return exists;
}

static int zloop_ctl_add(struct zloop_options *opts)
{
	struct queue_limits lim = {
		.max_hw_sectors			= SZ_1M >> SECTOR_SHIFT,
		.max_hw_zone_append_sectors	= SZ_1M >> SECTOR_SHIFT,
		.chunk_sectors			= opts->zone_size,
		.features			= BLK_FEAT_ZONED,
	};
	unsigned int nr_zones, i, j;
	struct zloop_device *zlo;
	int ret = -EINVAL;
	bool restore;

	__module_get(THIS_MODULE);

	nr_zones = opts->capacity >> ilog2(opts->zone_size);
	if (opts->nr_conv_zones >= nr_zones) {
		pr_err("Invalid number of conventional zones %u\n",
		       opts->nr_conv_zones);
		goto out;
	}

	zlo = kvzalloc(struct_size(zlo, zones, nr_zones), GFP_KERNEL);
	if (!zlo) {
		ret = -ENOMEM;
		goto out;
	}
	zlo->state = Zlo_creating;

	ret = mutex_lock_killable(&zloop_ctl_mutex);
	if (ret)
		goto out_free_dev;

	/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
	if (opts->id >= 0) {
		ret = idr_alloc(&zloop_index_idr, zlo,
				opts->id, opts->id + 1, GFP_KERNEL);
		if (ret == -ENOSPC)
			ret = -EEXIST;
	} else {
		ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
	}
	mutex_unlock(&zloop_ctl_mutex);
	if (ret < 0)
		goto out_free_dev;

	zlo->id = ret;
	zlo->zone_shift = ilog2(opts->zone_size);
	zlo->zone_size = opts->zone_size;
	if (opts->zone_capacity)
		zlo->zone_capacity = opts->zone_capacity;
	else
		zlo->zone_capacity = zlo->zone_size;
	zlo->nr_zones = nr_zones;
	zlo->nr_conv_zones = opts->nr_conv_zones;
	zlo->buffered_io = opts->buffered_io;

	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
					 opts->nr_queues * opts->queue_depth,
					 zlo->id);
	if (!zlo->workqueue) {
		ret = -ENOMEM;
		goto out_free_idr;
	}

	if (opts->base_dir)
		zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
	else
		zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
	if (!zlo->base_dir) {
		ret = -ENOMEM;
		goto out_destroy_workqueue;
	}

	zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
					    zlo->base_dir, zlo->id);
	if (IS_ERR(zlo->data_dir)) {
		ret = PTR_ERR(zlo->data_dir);
		pr_warn("Failed to open directory %s/%u (err=%d)\n",
			zlo->base_dir, zlo->id, ret);
		goto out_free_base_dir;
	}

	/*
	 * If we already have zone files, we are restoring a device created by a
	 * previous add operation. In this case, zloop_init_zone() will check
	 * that the zone files are consistent with the zone configuration given.
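	 * This typically happens when re-adding a device with the same
	 * base_dir and id, e.g. after a reboot or module reload.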
*/ restore = zloop_dev_exists(zlo); for (i = 0; i < nr_zones; i++) { ret = zloop_init_zone(zlo, opts, i, restore); if (ret) goto out_close_files; } lim.physical_block_size = zlo->block_size; lim.logical_block_size = zlo->block_size; zlo->tag_set.ops = &zloop_mq_ops; zlo->tag_set.nr_hw_queues = opts->nr_queues; zlo->tag_set.queue_depth = opts->queue_depth; zlo->tag_set.numa_node = NUMA_NO_NODE; zlo->tag_set.cmd_size = sizeof(struct zloop_cmd); zlo->tag_set.driver_data = zlo; ret = blk_mq_alloc_tag_set(&zlo->tag_set); if (ret) { pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret); goto out_close_files; } zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo); if (IS_ERR(zlo->disk)) { pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret); ret = PTR_ERR(zlo->disk); goto out_cleanup_tags; } zlo->disk->flags = GENHD_FL_NO_PART; zlo->disk->fops = &zloop_fops; zlo->disk->private_data = zlo; sprintf(zlo->disk->disk_name, "zloop%d", zlo->id); set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones); ret = blk_revalidate_disk_zones(zlo->disk); if (ret) goto out_cleanup_disk; ret = add_disk(zlo->disk); if (ret) { pr_err("add_disk failed (err=%d)\n", ret); goto out_cleanup_disk; } mutex_lock(&zloop_ctl_mutex); zlo->state = Zlo_live; mutex_unlock(&zloop_ctl_mutex); pr_info("Added device %d: %u zones of %llu MB, %u B block size\n", zlo->id, zlo->nr_zones, ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20, zlo->block_size); return 0; out_cleanup_disk: put_disk(zlo->disk); out_cleanup_tags: blk_mq_free_tag_set(&zlo->tag_set); out_close_files: for (j = 0; j < i; j++) { struct zloop_zone *zone = &zlo->zones[j]; if (!IS_ERR_OR_NULL(zone->file)) fput(zone->file); } fput(zlo->data_dir); out_free_base_dir: kfree(zlo->base_dir); out_destroy_workqueue: destroy_workqueue(zlo->workqueue); out_free_idr: mutex_lock(&zloop_ctl_mutex); idr_remove(&zloop_index_idr, zlo->id); mutex_unlock(&zloop_ctl_mutex); out_free_dev: kvfree(zlo); out: module_put(THIS_MODULE); if (ret == -ENOENT) ret = -EINVAL; return ret; } static int zloop_ctl_remove(struct zloop_options *opts) { struct zloop_device *zlo; int ret; if (!(opts->mask & ZLOOP_OPT_ID)) { pr_err("No ID specified\n"); return -EINVAL; } ret = mutex_lock_killable(&zloop_ctl_mutex); if (ret) return ret; zlo = idr_find(&zloop_index_idr, opts->id); if (!zlo || zlo->state == Zlo_creating) { ret = -ENODEV; } else if (zlo->state == Zlo_deleting) { ret = -EINVAL; } else { idr_remove(&zloop_index_idr, zlo->id); zlo->state = Zlo_deleting; } mutex_unlock(&zloop_ctl_mutex); if (ret) return ret; del_gendisk(zlo->disk); put_disk(zlo->disk); blk_mq_free_tag_set(&zlo->tag_set); pr_info("Removed device %d\n", opts->id); module_put(THIS_MODULE); return 0; } static int zloop_parse_options(struct zloop_options *opts, const char *buf) { substring_t args[MAX_OPT_ARGS]; char *options, *o, *p; unsigned int token; int ret = 0; /* Set defaults. */ opts->mask = 0; opts->id = ZLOOP_DEF_ID; opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; opts->zone_size = ZLOOP_DEF_ZONE_SIZE; opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; opts->nr_queues = ZLOOP_DEF_NR_QUEUES; opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; if (!buf) return 0; /* Skip leading spaces before the options. */ while (isspace(*buf)) buf++; options = o = kstrdup(buf, GFP_KERNEL); if (!options) return -ENOMEM; /* Parse the options, doing only some light invalid value checks. 
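	 * An example "add" option string (with illustrative values) is:
	 *   "id=0,capacity_mb=16384,zone_size_mb=256,conv_zones=8,nr_queues=4"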
*/ while ((p = strsep(&o, ",\n")) != NULL) { if (!*p) continue; token = match_token(p, zloop_opt_tokens, args); opts->mask |= token; switch (token) { case ZLOOP_OPT_ID: if (match_int(args, &opts->id)) { ret = -EINVAL; goto out; } break; case ZLOOP_OPT_CAPACITY: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } if (!token) { pr_err("Invalid capacity\n"); ret = -EINVAL; goto out; } opts->capacity = ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; break; case ZLOOP_OPT_ZONE_SIZE: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB || !is_power_of_2(token)) { pr_err("Invalid zone size %u\n", token); ret = -EINVAL; goto out; } opts->zone_size = ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; break; case ZLOOP_OPT_ZONE_CAPACITY: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } if (!token) { pr_err("Invalid zone capacity\n"); ret = -EINVAL; goto out; } opts->zone_capacity = ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; break; case ZLOOP_OPT_NR_CONV_ZONES: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } opts->nr_conv_zones = token; break; case ZLOOP_OPT_BASE_DIR: p = match_strdup(args); if (!p) { ret = -ENOMEM; goto out; } kfree(opts->base_dir); opts->base_dir = p; break; case ZLOOP_OPT_NR_QUEUES: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } if (!token) { pr_err("Invalid number of queues\n"); ret = -EINVAL; goto out; } opts->nr_queues = min(token, num_online_cpus()); break; case ZLOOP_OPT_QUEUE_DEPTH: if (match_uint(args, &token)) { ret = -EINVAL; goto out; } if (!token) { pr_err("Invalid queue depth\n"); ret = -EINVAL; goto out; } opts->queue_depth = token; break; case ZLOOP_OPT_BUFFERED_IO: opts->buffered_io = true; break; case ZLOOP_OPT_ERR: default: pr_warn("unknown parameter or missing value '%s'\n", p); ret = -EINVAL; goto out; } } ret = -EINVAL; if (opts->capacity <= opts->zone_size) { pr_err("Invalid capacity\n"); goto out; } if (opts->zone_capacity > opts->zone_size) { pr_err("Invalid zone capacity\n"); goto out; } ret = 0; out: kfree(options); return ret; } enum { ZLOOP_CTL_ADD, ZLOOP_CTL_REMOVE, }; static struct zloop_ctl_op { int code; const char *name; } zloop_ctl_ops[] = { { ZLOOP_CTL_ADD, "add" }, { ZLOOP_CTL_REMOVE, "remove" }, { -1, NULL }, }; static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf, size_t count, loff_t *pos) { struct zloop_options opts = { }; struct zloop_ctl_op *op; const char *buf, *opts_buf; int i, ret; if (count > PAGE_SIZE) return -ENOMEM; buf = memdup_user_nul(ubuf, count); if (IS_ERR(buf)) return PTR_ERR(buf); for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) { op = &zloop_ctl_ops[i]; if (!op->name) { pr_err("Invalid operation\n"); ret = -EINVAL; goto out; } if (!strncmp(buf, op->name, strlen(op->name))) break; } if (count <= strlen(op->name)) opts_buf = NULL; else opts_buf = buf + strlen(op->name); ret = zloop_parse_options(&opts, opts_buf); if (ret) { pr_err("Failed to parse options\n"); goto out; } switch (op->code) { case ZLOOP_CTL_ADD: ret = zloop_ctl_add(&opts); break; case ZLOOP_CTL_REMOVE: ret = zloop_ctl_remove(&opts); break; default: pr_err("Invalid operation\n"); ret = -EINVAL; goto out; } out: kfree(opts.base_dir); kfree(buf); return ret ? 
ret : count; } static int zloop_ctl_show(struct seq_file *seq_file, void *private) { const struct match_token *tok; int i; /* Add operation */ seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name); for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) { tok = &zloop_opt_tokens[i]; if (!tok->pattern) break; if (i) seq_putc(seq_file, ','); seq_puts(seq_file, tok->pattern); } seq_putc(seq_file, '\n'); /* Remove operation */ seq_puts(seq_file, zloop_ctl_ops[1].name); seq_puts(seq_file, " id=%d\n"); return 0; } static int zloop_ctl_open(struct inode *inode, struct file *file) { file->private_data = NULL; return single_open(file, zloop_ctl_show, NULL); } static int zloop_ctl_release(struct inode *inode, struct file *file) { return single_release(inode, file); } static const struct file_operations zloop_ctl_fops = { .owner = THIS_MODULE, .open = zloop_ctl_open, .release = zloop_ctl_release, .write = zloop_ctl_write, .read = seq_read, }; static struct miscdevice zloop_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "zloop-control", .fops = &zloop_ctl_fops, }; static int __init zloop_init(void) { int ret; ret = misc_register(&zloop_misc); if (ret) { pr_err("Failed to register misc device: %d\n", ret); return ret; } pr_info("Module loaded\n"); return 0; } static void __exit zloop_exit(void) { misc_deregister(&zloop_misc); idr_destroy(&zloop_index_idr); } module_init(zloop_init); module_exit(zloop_exit); MODULE_DESCRIPTION("Zoned loopback device"); MODULE_LICENSE("GPL");