diff options
Diffstat (limited to 'drivers/block')
| -rw-r--r-- | drivers/block/Kconfig | 3 | ||||
| -rw-r--r-- | drivers/block/loop.c | 27 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-clt.c | 13 | ||||
| -rw-r--r-- | drivers/block/rnbd/rnbd-clt.h | 2 | ||||
| -rw-r--r-- | drivers/block/ublk_drv.c | 63 | ||||
| -rw-r--r-- | drivers/block/zloop.c | 13 | ||||
| -rw-r--r-- | drivers/block/zram/zram_drv.c | 483 | ||||
| -rw-r--r-- | drivers/block/zram/zram_drv.h | 2 |
8 files changed, 456 insertions, 150 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 77d694448990..858320b6ebb7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -316,9 +316,6 @@ config BLK_DEV_RBD tristate "Rados block device (RBD)" depends on INET && BLOCK select CEPH_LIB - select CRC32 - select CRYPTO_AES - select CRYPTO help Say Y here if you want include the Rados block device, which stripes a block device over objects stored in the Ceph distributed object diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ebe751f39742..32a3a5b13802 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -348,11 +348,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, struct file *file = lo->lo_backing_file; struct bio_vec tmp; unsigned int offset; - int nr_bvec = 0; + unsigned int nr_bvec; int ret; - rq_for_each_bvec(tmp, rq, rq_iter) - nr_bvec++; + nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { @@ -1083,7 +1082,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, /* Order wrt reading lo_state in loop_validate_file(). */ wmb(); - lo->lo_state = Lo_bound; + WRITE_ONCE(lo->lo_state, Lo_bound); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; partscan = lo->lo_flags & LO_FLAGS_PARTSCAN; @@ -1180,7 +1179,7 @@ static void __loop_clr_fd(struct loop_device *lo) if (!part_shift) set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); mutex_lock(&lo->lo_mutex); - lo->lo_state = Lo_unbound; + WRITE_ONCE(lo->lo_state, Lo_unbound); mutex_unlock(&lo->lo_mutex); /* @@ -1219,7 +1218,7 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_flags |= LO_FLAGS_AUTOCLEAR; if (disk_openers(lo->lo_disk) == 1) - lo->lo_state = Lo_rundown; + WRITE_ONCE(lo->lo_state, Lo_rundown); loop_global_unlock(lo, true); return 0; @@ -1744,7 +1743,7 @@ static void lo_release(struct gendisk *disk) mutex_lock(&lo->lo_mutex); if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) - lo->lo_state = Lo_rundown; + WRITE_ONCE(lo->lo_state, Lo_rundown); need_clear = (lo->lo_state == Lo_rundown); mutex_unlock(&lo->lo_mutex); @@ -1859,7 +1858,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - if (lo->lo_state != Lo_bound) + if (data_race(READ_ONCE(lo->lo_state)) != Lo_bound) return BLK_STS_IOERR; switch (req_op(rq)) { @@ -2017,7 +2016,7 @@ static int loop_add(int i) lo->worker_tree = RB_ROOT; INIT_LIST_HEAD(&lo->idle_worker_list); timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE); - lo->lo_state = Lo_unbound; + WRITE_ONCE(lo->lo_state, Lo_unbound); err = mutex_lock_killable(&loop_ctl_mutex); if (err) @@ -2175,7 +2174,7 @@ static int loop_control_remove(int idx) goto mark_visible; } /* Mark this loop device as no more bound, but not quite unbound yet */ - lo->lo_state = Lo_deleting; + WRITE_ONCE(lo->lo_state, Lo_deleting); mutex_unlock(&lo->lo_mutex); loop_remove(lo); @@ -2198,8 +2197,12 @@ static int loop_control_get_free(int idx) if (ret) return ret; idr_for_each_entry(&loop_index_idr, lo, id) { - /* Hitting a race results in creating a new loop device which is harmless. */ - if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound) + /* + * Hitting a race results in creating a new loop device + * which is harmless. + */ + if (lo->idr_visible && + data_race(READ_ONCE(lo->lo_state)) == Lo_unbound) goto found; } mutex_unlock(&loop_ctl_mutex); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index f1409e54010a..d1c354636315 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1423,9 +1423,11 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, goto out_alloc; } - ret = ida_alloc_max(&index_ida, (1 << (MINORBITS - RNBD_PART_BITS)) - 1, - GFP_KERNEL); - if (ret < 0) { + dev->clt_device_id = ida_alloc_max(&index_ida, + (1 << (MINORBITS - RNBD_PART_BITS)) - 1, + GFP_KERNEL); + if (dev->clt_device_id < 0) { + ret = dev->clt_device_id; pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", pathname, sess->sessname, ret); goto out_queues; @@ -1434,10 +1436,9 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, dev->pathname = kstrdup(pathname, GFP_KERNEL); if (!dev->pathname) { ret = -ENOMEM; - goto out_queues; + goto out_ida; } - dev->clt_device_id = ret; dev->sess = sess; dev->access_mode = access_mode; dev->nr_poll_queues = nr_poll_queues; @@ -1453,6 +1454,8 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, return dev; +out_ida: + ida_free(&index_ida, dev->clt_device_id); out_queues: kfree(dev->hw_queues); out_alloc: diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index a48e040abe63..fbc1ed766025 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -112,7 +112,7 @@ struct rnbd_clt_dev { struct rnbd_queue *hw_queues; u32 device_id; /* local Idr index - used to track minor number allocations. */ - u32 clt_device_id; + int clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; refcount_t refcount; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2c715df63f23..49c208457198 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -926,6 +926,7 @@ static size_t ublk_copy_user_pages(const struct request *req, size_t done = 0; rq_for_each_segment(bv, req, iter) { + unsigned len; void *bv_buf; size_t copied; @@ -934,18 +935,17 @@ static size_t ublk_copy_user_pages(const struct request *req, continue; } - bv.bv_offset += offset; - bv.bv_len -= offset; - bv_buf = bvec_kmap_local(&bv); + len = bv.bv_len - offset; + bv_buf = kmap_local_page(bv.bv_page) + bv.bv_offset + offset; if (dir == ITER_DEST) - copied = copy_to_iter(bv_buf, bv.bv_len, uiter); + copied = copy_to_iter(bv_buf, len, uiter); else - copied = copy_from_iter(bv_buf, bv.bv_len, uiter); + copied = copy_from_iter(bv_buf, len, uiter); kunmap_local(bv_buf); done += copied; - if (copied < bv.bv_len) + if (copied < len) break; offset = 0; @@ -1080,12 +1080,20 @@ static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu); } +static void ublk_end_request(struct request *req, blk_status_t error) +{ + local_bh_disable(); + blk_mq_end_request(req, error); + local_bh_enable(); +} + /* todo: handle partial completion */ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, bool need_map) { unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; + bool requeue; /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) @@ -1117,14 +1125,30 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, if (unlikely(unmapped_bytes < io->res)) io->res = unmapped_bytes; - if (blk_update_request(req, BLK_STS_OK, io->res)) + /* + * Run bio->bi_end_io() with softirqs disabled. If the final fput + * happens off this path, then that will prevent ublk's blkdev_release() + * from being called on current's task work, see fput() implementation. + * + * Otherwise, ublk server may not provide forward progress in case of + * reading the partition table from bdev_open() with disk->open_mutex + * held, and causes dead lock as we could already be holding + * disk->open_mutex here. + * + * Preferably we would not be doing IO with a mutex held that is also + * used for release, but this work-around will suffice for now. + */ + local_bh_disable(); + requeue = blk_update_request(req, BLK_STS_OK, io->res); + local_bh_enable(); + if (requeue) blk_mq_requeue_request(req, true); else if (likely(!blk_should_fake_timeout(req->q))) __blk_mq_end_request(req, BLK_STS_OK); return; exit: - blk_mq_end_request(req, res); + ublk_end_request(req, res); } static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io, @@ -1164,7 +1188,7 @@ static inline void __ublk_abort_rq(struct ublk_queue *ubq, if (ublk_nosrv_dev_should_queue_io(ubq->dev)) blk_mq_requeue_request(rq, false); else - blk_mq_end_request(rq, BLK_STS_IOERR); + ublk_end_request(rq, BLK_STS_IOERR); } static void @@ -1209,7 +1233,7 @@ __ublk_do_auto_buf_reg(const struct ublk_queue *ubq, struct request *req, ublk_auto_buf_reg_fallback(ubq, req->tag); return AUTO_BUF_REG_FALLBACK; } - blk_mq_end_request(req, BLK_STS_IOERR); + ublk_end_request(req, BLK_STS_IOERR); return AUTO_BUF_REG_FAIL; } @@ -1583,8 +1607,7 @@ static bool ublk_check_and_reset_active_ref(struct ublk_device *ub) { int i, j; - if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY | - UBLK_F_AUTO_BUF_REG))) + if (!ublk_dev_need_req_ref(ub)) return false; for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { @@ -3673,6 +3696,19 @@ exit: return ret; } +static bool ublk_ctrl_uring_cmd_may_sleep(u32 cmd_op) +{ + switch (_IOC_NR(cmd_op)) { + case UBLK_CMD_GET_QUEUE_AFFINITY: + case UBLK_CMD_GET_DEV_INFO: + case UBLK_CMD_GET_DEV_INFO2: + case _IOC_NR(UBLK_U_CMD_GET_FEATURES): + return false; + default: + return true; + } +} + static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { @@ -3681,7 +3717,8 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, u32 cmd_op = cmd->cmd_op; int ret = -EINVAL; - if (issue_flags & IO_URING_F_NONBLOCK) + if (ublk_ctrl_uring_cmd_may_sleep(cmd_op) && + issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; ublk_ctrl_cmd_dump(cmd); diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 3f50321aa4a7..8e334f5025fc 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -394,7 +394,7 @@ static void zloop_rw(struct zloop_cmd *cmd) struct bio_vec tmp; unsigned long flags; sector_t zone_end; - int nr_bvec = 0; + unsigned int nr_bvec; int ret; atomic_set(&cmd->ref, 2); @@ -487,8 +487,7 @@ static void zloop_rw(struct zloop_cmd *cmd) spin_unlock_irqrestore(&zone->wp_lock, flags); } - rq_for_each_bvec(tmp, rq, rq_iter) - nr_bvec++; + nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { struct bio_vec *bvec; @@ -698,7 +697,7 @@ static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); struct zloop_device *zlo = rq->q->queuedata; - if (zlo->state == Zlo_deleting) + if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) return BLK_STS_IOERR; /* @@ -1003,7 +1002,7 @@ static int zloop_ctl_add(struct zloop_options *opts) ret = -ENOMEM; goto out; } - zlo->state = Zlo_creating; + WRITE_ONCE(zlo->state, Zlo_creating); ret = mutex_lock_killable(&zloop_ctl_mutex); if (ret) @@ -1114,7 +1113,7 @@ static int zloop_ctl_add(struct zloop_options *opts) } mutex_lock(&zloop_ctl_mutex); - zlo->state = Zlo_live; + WRITE_ONCE(zlo->state, Zlo_live); mutex_unlock(&zloop_ctl_mutex); pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n", @@ -1178,7 +1177,7 @@ static int zloop_ctl_remove(struct zloop_options *opts) ret = -EINVAL; } else { idr_remove(&zloop_index_idr, zlo->id); - zlo->state = Zlo_deleting; + WRITE_ONCE(zlo->state, Zlo_deleting); } mutex_unlock(&zloop_ctl_mutex); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a43074657531..5759823d6314 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -500,8 +500,31 @@ out: } #ifdef CONFIG_ZRAM_WRITEBACK +#define INVALID_BDEV_BLOCK (~0UL) + +struct zram_wb_ctl { + /* idle list is accessed only by the writeback task, no concurency */ + struct list_head idle_reqs; + /* done list is accessed concurrently, protect by done_lock */ + struct list_head done_reqs; + wait_queue_head_t done_wait; + spinlock_t done_lock; + atomic_t num_inflight; +}; + +struct zram_wb_req { + unsigned long blk_idx; + struct page *page; + struct zram_pp_slot *pps; + struct bio_vec bio_vec; + struct bio bio; + + struct list_head entry; +}; + static ssize_t writeback_limit_enable_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); u64 val; @@ -510,33 +533,31 @@ static ssize_t writeback_limit_enable_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return ret; - down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); + down_write(&zram->init_lock); zram->wb_limit_enable = val; - spin_unlock(&zram->wb_limit_lock); - up_read(&zram->init_lock); + up_write(&zram->init_lock); ret = len; return ret; } static ssize_t writeback_limit_enable_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, + char *buf) { bool val; struct zram *zram = dev_to_zram(dev); down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); val = zram->wb_limit_enable; - spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); return sysfs_emit(buf, "%d\n", val); } static ssize_t writeback_limit_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) + struct device_attribute *attr, + const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); u64 val; @@ -545,31 +566,71 @@ static ssize_t writeback_limit_store(struct device *dev, if (kstrtoull(buf, 10, &val)) return ret; - down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); + /* + * When the page size is greater than 4KB, if bd_wb_limit is set to + * a value that is not page - size aligned, it will cause value + * wrapping. For example, when the page size is set to 16KB and + * bd_wb_limit is set to 3, a single write - back operation will + * cause bd_wb_limit to become -1. Even more terrifying is that + * bd_wb_limit is an unsigned number. + */ + val = rounddown(val, PAGE_SIZE / 4096); + + down_write(&zram->init_lock); zram->bd_wb_limit = val; - spin_unlock(&zram->wb_limit_lock); - up_read(&zram->init_lock); + up_write(&zram->init_lock); ret = len; return ret; } static ssize_t writeback_limit_show(struct device *dev, - struct device_attribute *attr, char *buf) + struct device_attribute *attr, char *buf) { u64 val; struct zram *zram = dev_to_zram(dev); down_read(&zram->init_lock); - spin_lock(&zram->wb_limit_lock); val = zram->bd_wb_limit; - spin_unlock(&zram->wb_limit_lock); up_read(&zram->init_lock); return sysfs_emit(buf, "%llu\n", val); } +static ssize_t writeback_batch_size_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u32 val; + + if (kstrtouint(buf, 10, &val)) + return -EINVAL; + + if (!val) + return -EINVAL; + + down_write(&zram->init_lock); + zram->wb_batch_size = val; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t writeback_batch_size_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + u32 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->wb_batch_size; + up_read(&zram->init_lock); + + return sysfs_emit(buf, "%u\n", val); +} + static void reset_bdev(struct zram *zram) { if (!zram->backing_dev) @@ -697,23 +758,20 @@ out: return err; } -static unsigned long alloc_block_bdev(struct zram *zram) +static unsigned long zram_reserve_bdev_block(struct zram *zram) { - unsigned long blk_idx = 1; -retry: - /* skip 0 bit to confuse zram.handle = 0 */ - blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx); - if (blk_idx == zram->nr_pages) - return 0; + unsigned long blk_idx; - if (test_and_set_bit(blk_idx, zram->bitmap)) - goto retry; + blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, 0); + if (blk_idx == zram->nr_pages) + return INVALID_BDEV_BLOCK; + set_bit(blk_idx, zram->bitmap); atomic64_inc(&zram->stats.bd_count); return blk_idx; } -static void free_block_bdev(struct zram *zram, unsigned long blk_idx) +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx) { int was_set; @@ -734,32 +792,249 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) +static void release_wb_req(struct zram_wb_req *req) { - unsigned long blk_idx = 0; - struct page *page = NULL; - struct zram_pp_slot *pps; - struct bio_vec bio_vec; - struct bio bio; + __free_page(req->page); + kfree(req); +} + +static void release_wb_ctl(struct zram_wb_ctl *wb_ctl) +{ + if (!wb_ctl) + return; + + /* We should never have inflight requests at this point */ + WARN_ON(atomic_read(&wb_ctl->num_inflight)); + WARN_ON(!list_empty(&wb_ctl->done_reqs)); + + while (!list_empty(&wb_ctl->idle_reqs)) { + struct zram_wb_req *req; + + req = list_first_entry(&wb_ctl->idle_reqs, + struct zram_wb_req, entry); + list_del(&req->entry); + release_wb_req(req); + } + + kfree(wb_ctl); +} + +static struct zram_wb_ctl *init_wb_ctl(struct zram *zram) +{ + struct zram_wb_ctl *wb_ctl; + int i; + + wb_ctl = kmalloc(sizeof(*wb_ctl), GFP_KERNEL); + if (!wb_ctl) + return NULL; + + INIT_LIST_HEAD(&wb_ctl->idle_reqs); + INIT_LIST_HEAD(&wb_ctl->done_reqs); + atomic_set(&wb_ctl->num_inflight, 0); + init_waitqueue_head(&wb_ctl->done_wait); + spin_lock_init(&wb_ctl->done_lock); + + for (i = 0; i < zram->wb_batch_size; i++) { + struct zram_wb_req *req; + + /* + * This is fatal condition only if we couldn't allocate + * any requests at all. Otherwise we just work with the + * requests that we have successfully allocated, so that + * writeback can still proceed, even if there is only one + * request on the idle list. + */ + req = kzalloc(sizeof(*req), GFP_KERNEL | __GFP_NOWARN); + if (!req) + break; + + req->page = alloc_page(GFP_KERNEL | __GFP_NOWARN); + if (!req->page) { + kfree(req); + break; + } + + list_add(&req->entry, &wb_ctl->idle_reqs); + } + + /* We couldn't allocate any requests, so writeabck is not possible */ + if (list_empty(&wb_ctl->idle_reqs)) + goto release_wb_ctl; + + return wb_ctl; + +release_wb_ctl: + release_wb_ctl(wb_ctl); + return NULL; +} + +static void zram_account_writeback_rollback(struct zram *zram) +{ + lockdep_assert_held_read(&zram->init_lock); + + if (zram->wb_limit_enable) + zram->bd_wb_limit += 1UL << (PAGE_SHIFT - 12); +} + +static void zram_account_writeback_submit(struct zram *zram) +{ + lockdep_assert_held_read(&zram->init_lock); + + if (zram->wb_limit_enable && zram->bd_wb_limit > 0) + zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); +} + +static int zram_writeback_complete(struct zram *zram, struct zram_wb_req *req) +{ + u32 index = req->pps->index; + int err; + + err = blk_status_to_errno(req->bio.bi_status); + if (err) { + /* + * Failed wb requests should not be accounted in wb_limit + * (if enabled). + */ + zram_account_writeback_rollback(zram); + zram_release_bdev_block(zram, req->blk_idx); + return err; + } + + atomic64_inc(&zram->stats.bd_writes); + zram_slot_lock(zram, index); + /* + * We release slot lock during writeback so slot can change under us: + * slot_free() or slot_free() and zram_write_page(). In both cases + * slot loses ZRAM_PP_SLOT flag. No concurrent post-processing can + * set ZRAM_PP_SLOT on such slots until current post-processing + * finishes. + */ + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) { + zram_release_bdev_block(zram, req->blk_idx); + goto out; + } + + zram_free_page(zram, index); + zram_set_flag(zram, index, ZRAM_WB); + zram_set_handle(zram, index, req->blk_idx); + atomic64_inc(&zram->stats.pages_stored); + +out: + zram_slot_unlock(zram, index); + return 0; +} + +static void zram_writeback_endio(struct bio *bio) +{ + struct zram_wb_req *req = container_of(bio, struct zram_wb_req, bio); + struct zram_wb_ctl *wb_ctl = bio->bi_private; + unsigned long flags; + + spin_lock_irqsave(&wb_ctl->done_lock, flags); + list_add(&req->entry, &wb_ctl->done_reqs); + spin_unlock_irqrestore(&wb_ctl->done_lock, flags); + + wake_up(&wb_ctl->done_wait); +} + +static void zram_submit_wb_request(struct zram *zram, + struct zram_wb_ctl *wb_ctl, + struct zram_wb_req *req) +{ + /* + * wb_limit (if enabled) should be adjusted before submission, + * so that we don't over-submit. + */ + zram_account_writeback_submit(zram); + atomic_inc(&wb_ctl->num_inflight); + req->bio.bi_private = wb_ctl; + submit_bio(&req->bio); +} + +static int zram_complete_done_reqs(struct zram *zram, + struct zram_wb_ctl *wb_ctl) +{ + struct zram_wb_req *req; + unsigned long flags; int ret = 0, err; - u32 index; - page = alloc_page(GFP_KERNEL); - if (!page) - return -ENOMEM; + while (atomic_read(&wb_ctl->num_inflight) > 0) { + spin_lock_irqsave(&wb_ctl->done_lock, flags); + req = list_first_entry_or_null(&wb_ctl->done_reqs, + struct zram_wb_req, entry); + if (req) + list_del(&req->entry); + spin_unlock_irqrestore(&wb_ctl->done_lock, flags); + + /* ->num_inflight > 0 doesn't mean we have done requests */ + if (!req) + break; + + err = zram_writeback_complete(zram, req); + if (err) + ret = err; + + atomic_dec(&wb_ctl->num_inflight); + release_pp_slot(zram, req->pps); + req->pps = NULL; + + list_add(&req->entry, &wb_ctl->idle_reqs); + } + + return ret; +} + +static struct zram_wb_req *zram_select_idle_req(struct zram_wb_ctl *wb_ctl) +{ + struct zram_wb_req *req; + + req = list_first_entry_or_null(&wb_ctl->idle_reqs, + struct zram_wb_req, entry); + if (req) + list_del(&req->entry); + return req; +} + +static int zram_writeback_slots(struct zram *zram, + struct zram_pp_ctl *ctl, + struct zram_wb_ctl *wb_ctl) +{ + unsigned long blk_idx = INVALID_BDEV_BLOCK; + struct zram_wb_req *req = NULL; + struct zram_pp_slot *pps; + int ret = 0, err = 0; + u32 index = 0; while ((pps = select_pp_slot(ctl))) { - spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { - spin_unlock(&zram->wb_limit_lock); ret = -EIO; break; } - spin_unlock(&zram->wb_limit_lock); - if (!blk_idx) { - blk_idx = alloc_block_bdev(zram); - if (!blk_idx) { + while (!req) { + req = zram_select_idle_req(wb_ctl); + if (req) + break; + + wait_event(wb_ctl->done_wait, + !list_empty(&wb_ctl->done_reqs)); + + err = zram_complete_done_reqs(zram, wb_ctl); + /* + * BIO errors are not fatal, we continue and simply + * attempt to writeback the remaining objects (pages). + * At the same time we need to signal user-space that + * some writes (at least one, but also could be all of + * them) were not successful and we do so by returning + * the most recent BIO error. + */ + if (err) + ret = err; + } + + if (blk_idx == INVALID_BDEV_BLOCK) { + blk_idx = zram_reserve_bdev_block(zram); + if (blk_idx == INVALID_BDEV_BLOCK) { ret = -ENOSPC; break; } @@ -768,74 +1043,54 @@ static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) index = pps->index; zram_slot_lock(zram, index); /* - * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so + * scan_slots() sets ZRAM_PP_SLOT and releases slot lock, so * slots can change in the meantime. If slots are accessed or * freed they lose ZRAM_PP_SLOT flag and hence we don't * post-process them. */ if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - if (zram_read_from_zspool(zram, page, index)) + if (zram_read_from_zspool(zram, req->page, index)) goto next; zram_slot_unlock(zram, index); - bio_init(&bio, zram->bdev, &bio_vec, 1, - REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); - __bio_add_page(&bio, page, PAGE_SIZE, 0); - - /* - * XXX: A single page IO would be inefficient for write - * but it would be not bad as starter. - */ - err = submit_bio_wait(&bio); - if (err) { - release_pp_slot(zram, pps); - /* - * BIO errors are not fatal, we continue and simply - * attempt to writeback the remaining objects (pages). - * At the same time we need to signal user-space that - * some writes (at least one, but also could be all of - * them) were not successful and we do so by returning - * the most recent BIO error. - */ - ret = err; - continue; - } - - atomic64_inc(&zram->stats.bd_writes); - zram_slot_lock(zram, index); /* - * Same as above, we release slot lock during writeback so - * slot can change under us: slot_free() or slot_free() and - * reallocation (zram_write_page()). In both cases slot loses - * ZRAM_PP_SLOT flag. No concurrent post-processing can set - * ZRAM_PP_SLOT on such slots until current post-processing - * finishes. + * From now on pp-slot is owned by the req, remove it from + * its pp bucket. */ - if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) - goto next; + list_del_init(&pps->entry); + + req->blk_idx = blk_idx; + req->pps = pps; + bio_init(&req->bio, zram->bdev, &req->bio_vec, 1, REQ_OP_WRITE); + req->bio.bi_iter.bi_sector = req->blk_idx * (PAGE_SIZE >> 9); + req->bio.bi_end_io = zram_writeback_endio; + __bio_add_page(&req->bio, req->page, PAGE_SIZE, 0); + + zram_submit_wb_request(zram, wb_ctl, req); + blk_idx = INVALID_BDEV_BLOCK; + req = NULL; + cond_resched(); + continue; - zram_free_page(zram, index); - zram_set_flag(zram, index, ZRAM_WB); - zram_set_handle(zram, index, blk_idx); - blk_idx = 0; - atomic64_inc(&zram->stats.pages_stored); - spin_lock(&zram->wb_limit_lock); - if (zram->wb_limit_enable && zram->bd_wb_limit > 0) - zram->bd_wb_limit -= 1UL << (PAGE_SHIFT - 12); - spin_unlock(&zram->wb_limit_lock); next: zram_slot_unlock(zram, index); release_pp_slot(zram, pps); - - cond_resched(); } - if (blk_idx) - free_block_bdev(zram, blk_idx); - if (page) - __free_page(page); + /* + * Selected idle req, but never submitted it due to some error or + * wb limit. + */ + if (req) + release_wb_req(req); + + while (atomic_read(&wb_ctl->num_inflight) > 0) { + wait_event(wb_ctl->done_wait, !list_empty(&wb_ctl->done_reqs)); + err = zram_complete_done_reqs(zram, wb_ctl); + if (err) + ret = err; + } return ret; } @@ -948,7 +1203,8 @@ static ssize_t writeback_store(struct device *dev, struct zram *zram = dev_to_zram(dev); u64 nr_pages = zram->disksize >> PAGE_SHIFT; unsigned long lo = 0, hi = nr_pages; - struct zram_pp_ctl *ctl = NULL; + struct zram_pp_ctl *pp_ctl = NULL; + struct zram_wb_ctl *wb_ctl = NULL; char *args, *param, *val; ssize_t ret = len; int err, mode = 0; @@ -970,8 +1226,14 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - ctl = init_pp_ctl(); - if (!ctl) { + pp_ctl = init_pp_ctl(); + if (!pp_ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + wb_ctl = init_wb_ctl(zram); + if (!wb_ctl) { ret = -ENOMEM; goto release_init_lock; } @@ -1000,7 +1262,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); break; } @@ -1011,7 +1273,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); break; } @@ -1022,7 +1284,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); continue; } @@ -1033,17 +1295,18 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - scan_slots_for_writeback(zram, mode, lo, hi, ctl); + scan_slots_for_writeback(zram, mode, lo, hi, pp_ctl); continue; } } - err = zram_writeback_slots(zram, ctl); + err = zram_writeback_slots(zram, pp_ctl, wb_ctl); if (err) ret = err; release_init_lock: - release_pp_ctl(zram, ctl); + release_pp_ctl(zram, pp_ctl); + release_wb_ctl(wb_ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); @@ -1112,7 +1375,9 @@ static int read_from_bdev(struct zram *zram, struct page *page, return -EIO; } -static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {}; +static void zram_release_bdev_block(struct zram *zram, unsigned long blk_idx) +{ +} #endif #ifdef CONFIG_ZRAM_MEMORY_TRACKING @@ -1634,7 +1899,7 @@ static void zram_free_page(struct zram *zram, size_t index) if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); - free_block_bdev(zram, zram_get_handle(zram, index)); + zram_release_bdev_block(zram, zram_get_handle(zram, index)); goto out; } @@ -1740,14 +2005,14 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, ret = zram_read_from_zspool(zram, page, index); zram_slot_unlock(zram, index); } else { + unsigned long blk_idx = zram_get_handle(zram, index); + /* * The slot should be unlocked before reading from the backing * device. */ zram_slot_unlock(zram, index); - - ret = read_from_bdev(zram, page, zram_get_handle(zram, index), - parent); + ret = read_from_bdev(zram, page, blk_idx, parent); } /* Should NEVER happen. Return bio error if it does. */ @@ -2610,6 +2875,7 @@ static DEVICE_ATTR_RW(backing_dev); static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); +static DEVICE_ATTR_RW(writeback_batch_size); #endif #ifdef CONFIG_ZRAM_MULTI_COMP static DEVICE_ATTR_RW(recomp_algorithm); @@ -2631,6 +2897,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_writeback.attr, &dev_attr_writeback_limit.attr, &dev_attr_writeback_limit_enable.attr, + &dev_attr_writeback_batch_size.attr, #endif &dev_attr_io_stat.attr, &dev_attr_mm_stat.attr, @@ -2692,7 +2959,7 @@ static int zram_add(void) init_rwsem(&zram->init_lock); #ifdef CONFIG_ZRAM_WRITEBACK - spin_lock_init(&zram->wb_limit_lock); + zram->wb_batch_size = 32; #endif /* gendisk structure */ diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 6cee93f9c0d0..c6d94501376c 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -127,8 +127,8 @@ struct zram { bool claim; /* Protected by disk->open_mutex */ #ifdef CONFIG_ZRAM_WRITEBACK struct file *backing_dev; - spinlock_t wb_limit_lock; bool wb_limit_enable; + u32 wb_batch_size; u64 bd_wb_limit; struct block_device *bdev; unsigned long *bitmap; |
