diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-bufio.c | 189 | ||||
-rw-r--r-- | drivers/md/dm-core.h | 4 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 17 | ||||
-rw-r--r-- | drivers/md/dm-dust.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-ebs-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm-flakey.c | 118 | ||||
-rw-r--r-- | drivers/md/dm-ioctl.c | 1 | ||||
-rw-r--r-- | drivers/md/dm-linear.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-log-writes.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 243 | ||||
-rw-r--r-- | drivers/md/dm-raid1.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-rq.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-stripe.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-switch.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 263 | ||||
-rw-r--r-- | drivers/md/dm-vdo/indexer/volume.c | 24 | ||||
-rw-r--r-- | drivers/md/dm-verity-fec.c | 4 | ||||
-rw-r--r-- | drivers/md/dm-verity-target.c | 15 | ||||
-rw-r--r-- | drivers/md/dm-verity-verify-sig.c | 17 | ||||
-rw-r--r-- | drivers/md/dm-zone.c | 98 | ||||
-rw-r--r-- | drivers/md/dm-zoned-target.c | 3 | ||||
-rw-r--r-- | drivers/md/dm.c | 73 | ||||
-rw-r--r-- | drivers/md/dm.h | 6 |
23 files changed, 720 insertions, 388 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index d098e75e3461..ec84ba5e93e5 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -41,16 +41,6 @@ #define DM_BUFIO_LOW_WATERMARK_RATIO 16 /* - * Check buffer ages in this interval (seconds) - */ -#define DM_BUFIO_WORK_TIMER_SECS 30 - -/* - * Free buffers when they are older than this (seconds) - */ -#define DM_BUFIO_DEFAULT_AGE_SECS 300 - -/* * The nr of bytes of cached data to keep around. */ #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) @@ -1057,10 +1047,8 @@ static unsigned long dm_bufio_cache_size_latch; static DEFINE_SPINLOCK(global_spinlock); -/* - * Buffers are freed after this timeout - */ -static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned int dm_bufio_max_age; /* No longer does anything */ + static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; @@ -1088,7 +1076,6 @@ static LIST_HEAD(dm_bufio_all_clients); static DEFINE_MUTEX(dm_bufio_clients_lock); static struct workqueue_struct *dm_bufio_wq; -static struct delayed_work dm_bufio_cleanup_old_work; static struct work_struct dm_bufio_replacement_work; @@ -2680,130 +2667,6 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); /*--------------------------------------------------------------*/ -static unsigned int get_max_age_hz(void) -{ - unsigned int max_age = READ_ONCE(dm_bufio_max_age); - - if (max_age > UINT_MAX / HZ) - max_age = UINT_MAX / HZ; - - return max_age * HZ; -} - -static bool older_than(struct dm_buffer *b, unsigned long age_hz) -{ - return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); -} - -struct evict_params { - gfp_t gfp; - unsigned long age_hz; - - /* - * This gets updated with the largest last_accessed (ie. most - * recently used) of the evicted buffers. It will not be reinitialised - * by __evict_many(), so you can use it across multiple invocations. - */ - unsigned long last_accessed; -}; - -/* - * We may not be able to evict this buffer if IO pending or the client - * is still using it. - * - * And if GFP_NOFS is used, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets - * rerouted to different bufio client. - */ -static enum evict_result select_for_evict(struct dm_buffer *b, void *context) -{ - struct evict_params *params = context; - - if (!(params->gfp & __GFP_FS) || - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { - if (test_bit_acquire(B_READING, &b->state) || - test_bit(B_WRITING, &b->state) || - test_bit(B_DIRTY, &b->state)) - return ER_DONT_EVICT; - } - - return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP; -} - -static unsigned long __evict_many(struct dm_bufio_client *c, - struct evict_params *params, - int list_mode, unsigned long max_count) -{ - unsigned long count; - unsigned long last_accessed; - struct dm_buffer *b; - - for (count = 0; count < max_count; count++) { - b = cache_evict(&c->cache, list_mode, select_for_evict, params); - if (!b) - break; - - last_accessed = READ_ONCE(b->last_accessed); - if (time_after_eq(params->last_accessed, last_accessed)) - params->last_accessed = last_accessed; - - __make_buffer_clean(b); - __free_buffer_wake(b); - - cond_resched(); - } - - return count; -} - -static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) -{ - struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; - unsigned long retain = get_retain_buffers(c); - unsigned long count; - LIST_HEAD(write_list); - - dm_bufio_lock(c); - - __check_watermark(c, &write_list); - if (unlikely(!list_empty(&write_list))) { - dm_bufio_unlock(c); - __flush_write_list(&write_list); - dm_bufio_lock(c); - } - - count = cache_total(&c->cache); - if (count > retain) - __evict_many(c, ¶ms, LIST_CLEAN, count - retain); - - dm_bufio_unlock(c); -} - -static void cleanup_old_buffers(void) -{ - unsigned long max_age_hz = get_max_age_hz(); - struct dm_bufio_client *c; - - mutex_lock(&dm_bufio_clients_lock); - - __cache_size_refresh(); - - list_for_each_entry(c, &dm_bufio_all_clients, client_list) - evict_old_buffers(c, max_age_hz); - - mutex_unlock(&dm_bufio_clients_lock); -} - -static void work_fn(struct work_struct *w) -{ - cleanup_old_buffers(); - - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); -} - -/*--------------------------------------------------------------*/ - /* * Global cleanup tries to evict the oldest buffers from across _all_ * the clients. It does this by repeatedly evicting a few buffers from @@ -2841,27 +2704,51 @@ static void __insert_client(struct dm_bufio_client *new_client) list_add_tail(&new_client->client_list, h); } +static enum evict_result select_for_evict(struct dm_buffer *b, void *context) +{ + /* In no-sleep mode, we cannot wait on IO. */ + if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) { + if (test_bit_acquire(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return ER_DONT_EVICT; + } + return ER_EVICT; +} + static unsigned long __evict_a_few(unsigned long nr_buffers) { - unsigned long count; struct dm_bufio_client *c; - struct evict_params params = { - .gfp = GFP_KERNEL, - .age_hz = 0, - /* set to jiffies in case there are no buffers in this client */ - .last_accessed = jiffies - }; + unsigned long oldest_buffer = jiffies; + unsigned long last_accessed; + unsigned long count; + struct dm_buffer *b; c = __pop_client(); if (!c) return 0; dm_bufio_lock(c); - count = __evict_many(c, ¶ms, LIST_CLEAN, nr_buffers); + + for (count = 0; count < nr_buffers; count++) { + b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL); + if (!b) + break; + + last_accessed = READ_ONCE(b->last_accessed); + if (time_after_eq(oldest_buffer, last_accessed)) + oldest_buffer = last_accessed; + + __make_buffer_clean(b); + __free_buffer_wake(b); + + cond_resched(); + } + dm_bufio_unlock(c); if (count) - c->oldest_buffer = params.last_accessed; + c->oldest_buffer = oldest_buffer; __insert_client(c); return count; @@ -2944,10 +2831,7 @@ static int __init dm_bufio_init(void) if (!dm_bufio_wq) return -ENOMEM; - INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); return 0; } @@ -2959,7 +2843,6 @@ static void __exit dm_bufio_exit(void) { int bug = 0; - cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); destroy_workqueue(dm_bufio_wq); if (dm_bufio_client_count) { @@ -2996,7 +2879,7 @@ module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644); MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644); -MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +MODULE_PARM_DESC(max_age_seconds, "No longer does anything"); module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 3637761f3585..c889332e533b 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -141,6 +141,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; void *zone_revalidate_map; + struct task_struct *revalidate_map_task; #endif #ifdef CONFIG_IMA @@ -162,9 +163,6 @@ struct mapped_device { #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 -void disable_discard(struct mapped_device *md); -void disable_write_zeroes(struct mapped_device *md); - static inline sector_t dm_get_size(struct mapped_device *md) { return get_capacity(md->disk); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index d4cf0ac2a7aa..16d3d454fb0a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -14,11 +14,14 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/device-mapper.h> #define DM_MSG_PREFIX "delay" +#define SLEEP_SHIFT 3 + struct delay_class { struct dm_dev *dev; sector_t start; @@ -34,6 +37,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; + unsigned int worker_sleep_us; bool may_delay; struct delay_class read; @@ -136,6 +140,7 @@ static int flush_worker_fn(void *data) schedule(); } else { spin_unlock(&dc->delayed_bios_lock); + fsleep(dc->worker_sleep_us); cond_resched(); } } @@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; int ret; - unsigned int max_delay; + unsigned int max_delay, min_delay; if (argc != 3 && argc != 6 && argc != 9) { ti->error = "Requires exactly 3, 6 or 9 arguments"; @@ -235,7 +240,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->read, argv); if (ret) goto bad; - max_delay = dc->read.delay; + min_delay = max_delay = dc->read.delay; if (argc == 3) { ret = delay_class_ctr(ti, &dc->write, argv); @@ -251,6 +256,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->write.delay); + min_delay = min_not_zero(min_delay, dc->write.delay); if (argc == 6) { ret = delay_class_ctr(ti, &dc->flush, argv + 3); @@ -263,9 +269,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->flush.delay); + min_delay = min_not_zero(min_delay, dc->flush.delay); out: if (max_delay < 50) { + if (min_delay >> SLEEP_SHIFT) + dc->worker_sleep_us = 1000; + else + dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT; /* * In case of small requested delays, use kthread instead of * timers and workqueue to achieve better latency. @@ -438,7 +449,7 @@ out: static struct target_type delay_target = { .name = "delay", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 1a33820c9f46..e75310232bbf 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type, } } -static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dust_device *dd = ti->private; struct dm_dev *dev = dd->dev; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index b19b0142a690..6abb31ca9662 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -415,7 +415,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type, } } -static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct ebs_c *ec = ti->private; struct dm_dev *dev = ec->dev; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b690905ab89f..c711db6f8f5c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -47,14 +47,15 @@ enum feature_flag_bits { }; struct per_bio_data { - bool bio_submitted; + bool bio_can_corrupt; + struct bvec_iter saved_iter; }; static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, struct dm_target *ti) { - int r; - unsigned int argc; + int r = 0; + unsigned int argc = 0; const char *arg_name; static const struct dm_arg _args[] = { @@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, {0, PROBABILITY_BASE, "Invalid random corrupt argument"}, }; - /* No feature arguments supplied. */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) + if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error))) return r; + /* No feature arguments supplied. */ + if (!argc) + goto error_all_io; + while (argc) { arg_name = dm_shift_arg(as); argc--; @@ -128,8 +128,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> */ if (!strcasecmp(arg_name, "corrupt_bio_byte")) { - if (!argc) { - ti->error = "Feature corrupt_bio_byte requires parameters"; + if (fc->corrupt_bio_byte) { + ti->error = "Feature corrupt_bio_byte duplicated"; + return -EINVAL; + } else if (argc < 4) { + ti->error = "Feature corrupt_bio_byte requires 4 parameters"; return -EINVAL; } @@ -176,7 +179,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_read_corrupt")) { - if (!argc) { + if (fc->random_read_corrupt) { + ti->error = "Feature random_read_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_read_corrupt requires a parameter"; return -EINVAL; } @@ -189,7 +195,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_write_corrupt")) { - if (!argc) { + if (fc->random_write_corrupt) { + ti->error = "Feature random_write_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_write_corrupt requires a parameter"; return -EINVAL; } @@ -205,18 +214,25 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, return -EINVAL; } - if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + if (test_bit(DROP_WRITES, &fc->flags) && + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; - } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + } else if (test_bit(ERROR_WRITES, &fc->flags) && + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } else if (test_bit(ERROR_READS, &fc->flags) && + (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) { + ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set"; return -EINVAL; } if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && !fc->random_read_corrupt && !fc->random_write_corrupt) { +error_all_io: set_bit(ERROR_WRITES, &fc->flags); set_bit(ERROR_READS, &fc->flags); } @@ -278,7 +294,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error); if (r) goto bad; @@ -339,7 +355,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) } static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, - unsigned char corrupt_bio_value) + unsigned char corrupt_bio_value, + struct bvec_iter start) { struct bvec_iter iter; struct bio_vec bvec; @@ -348,7 +365,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, * Overwrite the Nth byte of the bio's data, on whichever page * it falls. */ - bio_for_each_segment(bvec, bio, iter) { + __bio_for_each_segment(bvec, bio, iter, start) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { unsigned char *segment = bvec_kmap_local(&bvec); segment[corrupt_bio_byte] = corrupt_bio_value; @@ -357,36 +374,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", bio, corrupt_bio_value, corrupt_bio_byte, (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size); + (unsigned long long)start.bi_sector, + start.bi_size); break; } corrupt_bio_byte -= bio_iter_len(bio, iter); } } -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc, + struct bvec_iter start) { unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; - if (!bio_has_data(bio)) - return; - - corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start); } -static void corrupt_bio_random(struct bio *bio) +static void corrupt_bio_random(struct bio *bio, struct bvec_iter start) { unsigned int corrupt_byte; unsigned char corrupt_value; - if (!bio_has_data(bio)) - return; - - corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; + corrupt_byte = get_random_u32() % start.bi_size; corrupt_value = get_random_u8(); - corrupt_bio_common(bio, corrupt_byte, corrupt_value); + corrupt_bio_common(bio, corrupt_byte, corrupt_value, start); } static void clone_free(struct bio *clone) @@ -481,7 +493,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) unsigned int elapsed; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - pb->bio_submitted = false; + pb->bio_can_corrupt = false; if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; @@ -490,14 +502,15 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) elapsed = (jiffies - fc->start_time) / HZ; if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { bool corrupt_fixed, corrupt_random; - /* - * Flag this bio as submitted while down. - */ - pb->bio_submitted = true; + + if (bio_has_data(bio)) { + pb->bio_can_corrupt = true; + pb->saved_iter = bio->bi_iter; + } /* - * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. - * Otherwise, flakey_end_io() will decide if the reads should be modified. + * If ERROR_READS isn't set flakey_end_io() will decide if the + * reads should be modified. */ if (bio_data_dir(bio) == READ) { if (test_bit(ERROR_READS, &fc->flags)) @@ -516,6 +529,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + if (!pb->bio_can_corrupt) + goto map_bio; /* * Corrupt matching writes. */ @@ -535,9 +550,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct bio *clone = clone_bio(ti, fc, bio); if (clone) { if (corrupt_fixed) - corrupt_bio_data(clone, fc); + corrupt_bio_data(clone, fc, + clone->bi_iter); if (corrupt_random) - corrupt_bio_random(clone); + corrupt_bio_random(clone, + clone->bi_iter); submit_bio(clone); return DM_MAPIO_SUBMITTED; } @@ -559,28 +576,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; - if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte) { if ((fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* * Corrupt successful matching READs while in down state. */ - corrupt_bio_data(bio, fc); + corrupt_bio_data(bio, fc, pb->saved_iter); } } if (fc->random_read_corrupt) { u64 rnd = get_random_u64(); u32 rem = do_div(rnd, PROBABILITY_BASE); if (rem < fc->random_read_corrupt) - corrupt_bio_random(bio); - } - if (test_bit(ERROR_READS, &fc->flags)) { - /* - * Error read during the down_interval if drop_writes - * and error_writes were not configured. - */ - *error = BLK_STS_IOERR; + corrupt_bio_random(bio, pb->saved_iter); } } @@ -638,7 +648,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d42eac944eb5..4165fef4c170 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, {DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, + {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 66318aba4bdb..15538ec58f8e 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct linear_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8d7df8303d0a..d484e8e1d48a 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6c98f4ae5ea9..81fec2e1e0ef 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -79,6 +79,7 @@ struct multipath { struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ + struct priority_group *last_probed_pg; atomic_t nr_valid_paths; /* Total number of usable paths */ unsigned int nr_priority_groups; @@ -87,6 +88,7 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + wait_queue_head_t probe_wait; /* Wait for probing paths */ unsigned int pg_init_retries; /* Number of times to retry pg_init */ unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ @@ -100,6 +102,7 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ + bool is_suspending; }; /* @@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ +#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) { @@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); + init_waitqueue_head(&m->probe_wait); return 0; } @@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) goto failed; } + /* Don't change PG until it has no remaining paths */ + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + /* Were we instructed to switch PG? */ if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { spin_unlock_irqrestore(&m->lock, flags); - goto check_current_pg; + goto check_all_pgs; } m->next_pg = NULL; spin_unlock_irqrestore(&m->lock, flags); @@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) if (!IS_ERR_OR_NULL(pgpath)) return pgpath; } - - /* Don't change PG until it has no remaining paths */ -check_current_pg: - pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) - return pgpath; - } - +check_all_pgs: /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. @@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio) static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; - unsigned long flags; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); @@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { __multipath_queue_bio(m, bio); pgpath = ERR_PTR(-EAGAIN); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { @@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m) static void process_queued_bios(struct work_struct *work) { int r; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work) bio_list_init(&bios); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (bio_list_empty(&m->queued_bios)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return; } bio_list_merge_init(&bios, &m->queued_bios); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_arg_set as; unsigned int pg_count = 0; unsigned int next_pg_num; - unsigned long flags; as.argc = argc; as.argv = argv; @@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); ti->num_flush_bios = 1; ti->num_discard_bios = 1; @@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { if (m->hw_handler_name) { - unsigned long flags; - if (!atomic_read(&m->pg_init_in_progress)) goto skip; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (atomic_read(&m->pg_init_in_progress) && !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } skip: if (m->queue_mode == DM_TYPE_BIO_BASED) @@ -1370,11 +1370,10 @@ out: static int reinstate_path(struct pgpath *pgpath) { int r = 0, run_queue = 0; - unsigned long flags; struct multipath *m = pgpath->pg->m; unsigned int nr_valid_paths; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (pgpath->is_active) goto out; @@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); out: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); if (run_queue) { dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); @@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action) * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - bool bypassed) + bool bypassed, bool can_be_delayed) { unsigned long flags; spin_lock_irqsave(&m->lock, flags); pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } spin_unlock_irqrestore(&m->lock, flags); @@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) { struct priority_group *pg; unsigned int pgnum; - unsigned long flags; char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || @@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) return -EINVAL; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); list_for_each_entry(pg, &m->priority_groups, list) { pg->bypassed = false; if (--pgnum) continue; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } m->next_pg = pg; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); schedule_work(&m->trigger_event); return 0; @@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) break; } - bypass_pg(m, pg, bypassed); + bypass_pg(m, pg, bypassed, true); return 0; } @@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, true); + bypass_pg(m, pg, true, false); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; + spin_lock_irq(&m->lock); + m->is_suspending = true; + spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); + m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti) test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } /* @@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { int sz = 0, pg_counter, pgpath_counter; - unsigned long flags; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned int pg_num; char state; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Features */ if (type == STATUSTYPE_INFO) @@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", m->nr_priority_groups); - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) + if (m->current_pg) pg_num = m->current_pg->pg_num; + else if (m->next_pg) + pg_num = m->next_pg->pg_num; else pg_num = (m->nr_priority_groups ? 1 : 0); @@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, break; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, @@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg dev_t dev; struct multipath *m = ti->private; action_fn action; - unsigned long flags; mutex_lock(&m->work_mutex); @@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false, __func__); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false, __func__); @@ -2021,14 +2028,132 @@ out: return r; } +/* + * Perform a minimal read from the given path to find out whether the + * path still works. If a path error occurs, fail it. + */ +static int probe_path(struct pgpath *pgpath) +{ + struct block_device *bdev = pgpath->path.dev->bdev; + unsigned int read_size = bdev_logical_block_size(bdev); + struct page *page; + struct bio *bio; + blk_status_t status; + int r = 0; + + if (WARN_ON_ONCE(read_size > PAGE_SIZE)) + return -EINVAL; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Perform a minimal read: Sector 0, length read_size */ + bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); + if (!bio) { + r = -ENOMEM; + goto out; + } + + bio->bi_iter.bi_sector = 0; + __bio_add_page(bio, page, read_size, 0); + submit_bio_wait(bio); + status = bio->bi_status; + bio_put(bio); + + if (status && blk_path_error(status)) + fail_path(pgpath); + +out: + __free_page(page); + return r; +} + +/* + * Probe all active paths in current_pg to find out whether they still work. + * Fail all paths that do not work. + * + * Return -ENOTCONN if no valid path is left (even outside of current_pg). We + * cannot probe paths in other pgs without switching current_pg, so if valid + * paths are only in different pgs, they may or may not work. Additionally + * we should not probe paths in a pathgroup that is in the process of + * Initializing. Userspace can submit a request and we'll switch and wait + * for the pathgroup to be initialized. If the request fails, it may need to + * probe again. + */ +static int probe_active_paths(struct multipath *m) +{ + struct pgpath *pgpath; + struct priority_group *pg = NULL; + int r = 0; + + spin_lock_irq(&m->lock); + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { + wait_event_lock_irq(m->probe_wait, + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), + m->lock); + /* + * if we waited because a probe was already in progress, + * and it probed the current active pathgroup, don't + * reprobe. Just return the number of valid paths + */ + if (m->current_pg == m->last_probed_pg) + goto skip_probe; + } + if (!m->current_pg || m->is_suspending || + test_bit(MPATHF_QUEUE_IO, &m->flags)) + goto skip_probe; + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + pg = m->last_probed_pg = m->current_pg; + spin_unlock_irq(&m->lock); + + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pg != READ_ONCE(m->current_pg) || + READ_ONCE(m->is_suspending)) + goto out; + if (!pgpath->is_active) + continue; + + r = probe_path(pgpath); + if (r < 0) + goto out; + } + +out: + spin_lock_irq(&m->lock); + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } +skip_probe: + if (r == 0 && !atomic_read(&m->nr_valid_paths)) + r = -ENOTCONN; + spin_unlock_irq(&m->lock); + if (pg) + wake_up(&m->probe_wait); + return r; +} + static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct multipath *m = ti->private; struct pgpath *pgpath; - unsigned long flags; int r; + if (_IOC_TYPE(cmd) == DM_IOCTL) { + *forward = false; + switch (cmd) { + case DM_MPATH_PROBE_PATHS: + return probe_active_paths(m); + default: + return -ENOTTY; + } + } + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); @@ -2044,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } else { /* No path is available */ r = -EIO; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } if (r == -ENOTCONN) { @@ -2055,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) (void) __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } @@ -2180,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti) */ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9e615b4f1f5e..785af4816584 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void dispatch_bios(void *context, struct bio_list *bio_list) @@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context) if (!ms->failures.head) should_wake = 1; bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void do_write(struct mirror_set *ms, struct bio *bio) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index e23076f7ece2..a6ca92049c10 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -217,10 +217,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) if (unlikely(error == BLK_STS_TARGET)) { if (req_op(clone) == REQ_OP_DISCARD && !clone->q->limits.max_discard_sectors) - disable_discard(tio->md); + blk_queue_disable_discard(tio->md->queue); else if (req_op(clone) == REQ_OP_WRITE_ZEROES && !clone->q->limits.max_write_zeroes_sectors) - disable_write_zeroes(tio->md); + blk_queue_disable_write_zeroes(tio->md->queue); } switch (r) { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a1b7535c508a..a7dc04bd55e5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -405,7 +405,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { unsigned int i; - char major_minor[16]; + char major_minor[22]; struct stripe_c *sc = ti->private; if (!*error) @@ -417,8 +417,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (*error == BLK_STS_NOTSUPP) return DM_ENDIO_DONE; - memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); + format_dev_t(major_minor, bio_dev(bio)); /* * Test to see which stripe drive triggered the event diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index dfd9fb52a6f3..bb1a70b5a215 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct switch_ctx *sctx = ti->private; unsigned int path_nr; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 6b23e777e10e..24a857ff6d0b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -117,7 +117,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) n_targets = (struct dm_target *) (n_highs + num); memset(n_highs, -1, sizeof(*n_highs) * num); - kvfree(t->highs); t->num_allocated = num; t->highs = n_highs; @@ -257,7 +256,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, if (bdev_is_zoned(bdev)) { unsigned int zone_sectors = bdev_zone_sectors(bdev); - if (start & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, start)) { DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)start, @@ -274,7 +273,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * devices do not end up with a smaller zone in the middle of * the sector range. */ - if (len & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, len)) { DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)len, @@ -431,6 +430,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, return 0; } + mutex_lock(&q->limits_lock); + /* + * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in + * blk_stack_limits(), so do it manually. + */ + limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES); + if (blk_stack_limits(limits, &q->limits, get_start_sect(bdev) + start) < 0) DMWARN("%s: adding target device %pg caused an alignment inconsistency: " @@ -448,6 +454,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (!dm_target_has_integrity(ti->type)) queue_limits_stack_integrity_bdev(limits, bdev); + mutex_unlock(&q->limits_lock); return 0; } @@ -1189,6 +1196,176 @@ put_live_table: return 0; } +enum dm_wrappedkey_op { + DERIVE_SW_SECRET, + IMPORT_KEY, + GENERATE_KEY, + PREPARE_KEY, +}; + +struct dm_wrappedkey_op_args { + enum dm_wrappedkey_op op; + int err; + union { + struct { + const u8 *eph_key; + size_t eph_key_size; + u8 *sw_secret; + } derive_sw_secret; + struct { + const u8 *raw_key; + size_t raw_key_size; + u8 *lt_key; + } import_key; + struct { + u8 *lt_key; + } generate_key; + struct { + const u8 *lt_key; + size_t lt_key_size; + u8 *eph_key; + } prepare_key; + }; +}; + +static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_wrappedkey_op_args *args = data; + struct block_device *bdev = dev->bdev; + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err = -EOPNOTSUPP; + + if (!args->err) + return 0; + + switch (args->op) { + case DERIVE_SW_SECRET: + err = blk_crypto_derive_sw_secret( + bdev, + args->derive_sw_secret.eph_key, + args->derive_sw_secret.eph_key_size, + args->derive_sw_secret.sw_secret); + break; + case IMPORT_KEY: + err = blk_crypto_import_key(profile, + args->import_key.raw_key, + args->import_key.raw_key_size, + args->import_key.lt_key); + break; + case GENERATE_KEY: + err = blk_crypto_generate_key(profile, + args->generate_key.lt_key); + break; + case PREPARE_KEY: + err = blk_crypto_prepare_key(profile, + args->prepare_key.lt_key, + args->prepare_key.lt_key_size, + args->prepare_key.eph_key); + break; + } + args->err = err; + + /* Try another device in case this fails. */ + return 0; +} + +static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, + struct dm_wrappedkey_op_args *args) +{ + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; + struct dm_target *ti; + struct dm_table *t; + int srcu_idx; + int i; + + args->err = -EOPNOTSUPP; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + goto out; + + /* + * blk-crypto currently has no support for multiple incompatible + * implementations of wrapped inline crypto keys on a single system. + * It was already checked earlier that support for wrapped keys was + * declared on all underlying devices. Thus, all the underlying devices + * should support all wrapped key operations and they should behave + * identically, i.e. work with the same keys. So, just executing the + * operation on the first device on which it works suffices for now. + */ + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args); + if (!args->err) + break; + } +out: + dm_put_live_table(md, srcu_idx); + return args->err; +} + +static int dm_derive_sw_secret(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = DERIVE_SW_SECRET, + .derive_sw_secret = { + .eph_key = eph_key, + .eph_key_size = eph_key_size, + .sw_secret = sw_secret, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = IMPORT_KEY, + .import_key = { + .raw_key = raw_key, + .raw_key_size = raw_key_size, + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = GENERATE_KEY, + .generate_key = { + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = PREPARE_KEY, + .prepare_key = { + .lt_key = lt_key, + .lt_key_size = lt_key_size, + .eph_key = eph_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + static int device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1263,6 +1440,13 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile); } + if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) { + profile->ll_ops.derive_sw_secret = dm_derive_sw_secret; + profile->ll_ops.import_key = dm_import_key; + profile->ll_ops.generate_key = dm_generate_key; + profile->ll_ops.prepare_key = dm_prepare_key; + } + if (t->md->queue && !blk_crypto_has_capabilities(profile, t->md->queue->crypto_profile)) { @@ -1490,6 +1674,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t) return true; } +bool dm_table_is_wildcard(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_is_wildcard(ti->type)) + return false; + } + + return true; +} + static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1721,8 +1917,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); + int b; - return !q->limits.max_write_zeroes_sectors; + mutex_lock(&q->limits_lock); + b = !q->limits.max_write_zeroes_sectors; + mutex_unlock(&q->limits_lock); + return b; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1830,10 +2030,24 @@ static bool dm_table_supports_atomic_writes(struct dm_table *t) return true; } +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size) +{ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) && + old_size != new_size) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change size", + dm_device_name(t->md)); + return false; + } + return true; +} + int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { int r; + struct queue_limits old_limits; if (!dm_table_supports_nowait(t)) limits->features &= ~BLK_FEAT_NOWAIT; @@ -1860,28 +2074,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_supports_flush(t)) limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; - if (dm_table_supports_dax(t, device_not_dax_capable)) { + if (dm_table_supports_dax(t, device_not_dax_capable)) limits->features |= BLK_FEAT_DAX; - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) - set_dax_synchronous(t->md->dax_dev); - } else + else limits->features &= ~BLK_FEAT_DAX; - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) - dax_write_cache(t->md->dax_dev, true); - /* For a zoned table, setup the zone related queue attributes. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - (limits->features & BLK_FEAT_ZONED)) { - r = dm_set_zones_restrictions(t, q, limits); - if (r) - return r; + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + if (limits->features & BLK_FEAT_ZONED) { + r = dm_set_zones_restrictions(t, q, limits); + if (r) + return r; + } else if (dm_has_zone_plugs(t->md)) { + DMWARN("%s: device has zone write plug resources. " + "Cannot switch to non-zoned table.", + dm_device_name(t->md)); + return -EINVAL; + } } if (dm_table_supports_atomic_writes(t)) limits->features |= BLK_FEAT_ATOMIC_WRITES; - r = queue_limits_set(q, limits); + old_limits = queue_limits_start_update(q); + r = queue_limits_commit_update(q, limits); if (r) return r; @@ -1892,10 +2108,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (limits->features & BLK_FEAT_ZONED)) { r = dm_revalidate_zones(t, q); - if (r) + if (r) { + queue_limits_set(q, &old_limits); return r; + } } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + dm_finalize_zone_settings(t, limits); + + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) + set_dax_synchronous(t->md->dax_dev); + + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) + dax_write_cache(t->md->dax_dev, true); + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 655453bb276b..425b3a74f4db 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * u32 physical_page, struct cached_page **page_ptr) { struct cached_page *page; + unsigned int zone_number = request->zone_number; get_page_from_cache(&volume->page_cache, physical_page, &page); if (page != NULL) { - if (request->zone_number == 0) { + if (zone_number == 0) { /* Only one zone is allowed to update the LRU. */ make_page_most_recent(&volume->page_cache, page); } @@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * } /* Prepare to enqueue a read for the page. */ - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); mutex_lock(&volume->read_threads_mutex); /* @@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * the order does not matter for correctness as it does below. */ mutex_unlock(&volume->read_threads_mutex); - begin_pending_search(&volume->page_cache, physical_page, - request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); return UDS_QUEUED; } @@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * "search pending" state in careful order so no other thread can mess with the data before * the caller gets to look at it. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); mutex_unlock(&volume->read_threads_mutex); *page_ptr = page; return UDS_SUCCESS; @@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r { int result; struct cached_page *page = NULL; + unsigned int zone_number = request->zone_number; u32 physical_page = map_to_physical_page(volume->geometry, chapter, index_page_number); @@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } result = uds_search_chapter_index_page(&page->index_page, volume->geometry, &request->record_name, record_page_number); - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req { struct cached_page *record_page; struct index_geometry *geometry = volume->geometry; + unsigned int zone_number = request->zone_number; int result; u32 physical_page, page_number; @@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &record_page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req &request->record_name, geometry, &request->old_metadata)) *found = true; - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return UDS_SUCCESS; } diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0c41949db784..631a887b487c 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -593,6 +593,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + if (v->fec->dev) { + ti->error = "FEC device already specified"; + return -EINVAL; + } r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 3c427f18a04b..81186bded1ce 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -682,7 +682,8 @@ static void verity_bh_work(struct work_struct *w) static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) { return ioprio <= IOPRIO_CLASS_IDLE && - bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) && + !need_resched(); } static void verity_end_io(struct bio *bio) @@ -993,7 +994,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dm_verity *v = ti->private; @@ -1120,6 +1123,9 @@ static int verity_alloc_most_once(struct dm_verity *v) { struct dm_target *ti = v->ti; + if (v->validated_blocks) + return 0; + /* the bitset can only handle INT_MAX blocks */ if (v->data_blocks > INT_MAX) { ti->error = "device too large to use check_at_most_once"; @@ -1143,6 +1149,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) struct dm_verity_io *io; u8 *zero_data; + if (v->zero_digest) + return 0; + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->zero_digest) @@ -1577,7 +1586,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* Root hash signature is a optional parameter*/ + /* Root hash signature is an optional parameter */ r = verity_verify_root_hash(root_hash_digest_to_validate, strlen(root_hash_digest_to_validate), verify_args.sig, diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index a9e2c6c0a33c..d5261a0e4232 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, const char *arg_name) { struct dm_target *ti = v->ti; - int ret = 0; + int ret; const char *sig_key = NULL; + if (v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified"); + return -EINVAL; + } + if (!*argc) { ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); return -EINVAL; @@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, (*argc)--; ret = verity_verify_get_sig_from_key(sig_key, sig_opts); - if (ret < 0) + if (ret < 0) { ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + return ret; + } v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); - if (!v->signature_key_desc) + if (!v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key"); return -ENOMEM; + } - return ret; + return 0; } /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 20edd3fabbab..3d31b82e0730 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -56,24 +56,31 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, { struct mapped_device *md = disk->private_data; struct dm_table *map; - int srcu_idx, ret; + struct dm_table *zone_revalidate_map = md->zone_revalidate_map; + int srcu_idx, ret = -EIO; + bool put_table = false; - if (!md->zone_revalidate_map) { - /* Regular user context */ + if (!zone_revalidate_map || md->revalidate_map_task != current) { + /* + * Regular user context or + * Zone revalidation during __bind() is in progress, but this + * call is from a different process + */ if (dm_suspended_md(md)) return -EAGAIN; map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + put_table = true; } else { /* Zone revalidation during __bind() */ - map = md->zone_revalidate_map; + map = zone_revalidate_map; } - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + if (map) + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, + data); - if (!md->zone_revalidate_map) + if (put_table) dm_put_live_table(md, srcu_idx); return ret; @@ -153,33 +160,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; + unsigned int nr_zones = disk->nr_zones; int ret; if (!get_capacity(disk)) return 0; - /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { - DMINFO("%s using %s zone append", - disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - md->nr_zones = 0; - } - - if (md->nr_zones) + /* + * Do not revalidate if zone write plug resources have already + * been allocated. + */ + if (dm_has_zone_plugs(md)) return 0; + DMINFO("%s using %s zone append", disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + /* * Our table is not live yet. So the call to dm_get_live_table() * in dm_blk_report_zones() will fail. Set a temporary pointer to * our table for dm_blk_report_zones() to use directly. */ md->zone_revalidate_map = t; + md->revalidate_map_task = current; ret = blk_revalidate_disk_zones(disk); + md->revalidate_map_task = NULL; md->zone_revalidate_map = NULL; if (ret) { DMERR("Revalidate zones failed %d", ret); + disk->nr_zones = nr_zones; return ret; } @@ -337,15 +347,15 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, /* * Check if zone append is natively supported, and if not, set the - * mapped device queue as needing zone append emulation. + * mapped device queue as needing zone append emulation. If zone + * append is natively supported, make sure that + * max_hw_zone_append_sectors is not set to 0. */ WARN_ON_ONCE(queue_is_mq(q)); - if (dm_table_supports_zone_append(t)) { - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - } else { - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!dm_table_supports_zone_append(t)) lim->max_hw_zone_append_sectors = 0; - } + else if (lim->max_hw_zone_append_sectors == 0) + lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors; /* * Determine the max open and max active zone limits for the mapped @@ -380,15 +390,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_open_zones = 0; lim->max_active_zones = 0; lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; - disk->nr_zones = 0; return 0; } + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) { + if (q->limits.chunk_sectors != lim->chunk_sectors) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change zone size", + disk->disk_name); + return -EINVAL; + } + if (lim->max_hw_zone_append_sectors != 0 && + !dm_table_is_wildcard(t)) { + DMWARN("%s: device has zone write plug resources. " + "New table must emulate zone append", + disk->disk_name); + return -EINVAL; + } + } /* * Warn once (when the capacity is not yet set) if the mapped device is * partially using zone resources of the target devices as that leads to @@ -408,6 +431,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, return 0; } +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) +{ + struct mapped_device *md = t->md; + + if (lim->features & BLK_FEAT_ZONED) { + if (dm_table_supports_zone_append(t)) + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + else + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + } else { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; + md->disk->nr_zones = 0; + } +} + + /* * IO completion callback called from clone_endio(). */ @@ -423,9 +463,9 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) */ if (clone->bi_status == BLK_STS_OK && bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; + orig_bio->bi_iter.bi_sector += + bdev_offset_from_zone_start(disk->part0, + clone->bi_iter.bi_sector); } return; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6141fc25d842..5da3db06da10 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. */ -static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = &dmz->dev[0]; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5ab7574c0c76..1726f0f828cc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -411,7 +411,8 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, - struct block_device **bdev) + struct block_device **bdev, unsigned int cmd, + unsigned long arg, bool *forward) { struct dm_target *ti; struct dm_table *map; @@ -434,8 +435,8 @@ retry: if (dm_suspended_md(md)) return -EAGAIN; - r = ti->type->prepare_ioctl(ti, bdev); - if (r == -ENOTCONN && !fatal_signal_pending(current)) { + r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); + if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); fsleep(10000); goto retry; @@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, { struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); + if (!forward || r < 0) goto out; if (r > 0) { @@ -1082,22 +1084,6 @@ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md) return &md->queue->limits; } -void disable_discard(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support DISCARD, disable it */ - limits->max_hw_discard_sectors = 0; -} - -void disable_write_zeroes(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support WRITE ZEROES, disable it */ - limits->max_write_zeroes_sectors = 0; -} - static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) { return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); @@ -1115,10 +1101,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); + blk_queue_disable_discard(md->queue); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bdev_write_zeroes_sectors(bio->bi_bdev)) - disable_write_zeroes(md); + blk_queue_disable_write_zeroes(md->queue); } if (static_branch_unlikely(&zoned_enabled) && @@ -2421,21 +2407,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct queue_limits *limits) { struct dm_table *old_map; - sector_t size; + sector_t size, old_size; int ret; lockdep_assert_held(&md->suspend_lock); size = dm_table_get_size(t); + old_size = dm_get_size(md); + + if (!dm_table_supports_size_change(t, old_size, size)) { + old_map = ERR_PTR(-EINVAL); + goto out; + } + + set_capacity(md->disk, size); + + ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } + /* * Wipe any geometry if the size of the table changed. */ - if (size != dm_get_size(md)) + if (size != old_size) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - dm_table_event_callback(t, event_callback, md); if (dm_table_request_based(t)) { @@ -2453,10 +2453,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep. */ - if (!md->mempools) { + if (!md->mempools) md->mempools = t->mempools; - t->mempools = NULL; - } + else + dm_free_md_mempools(t->mempools); } else { /* * The md may already have mempools that need changing. @@ -2465,14 +2465,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ dm_free_md_mempools(md->mempools); md->mempools = t->mempools; - t->mempools = NULL; - } - - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - old_map = ERR_PTR(ret); - goto out; } + t->mempools = NULL; old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); @@ -3638,10 +3632,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + /* Not a real ioctl, but targets must not interpret non-DM ioctls */ + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); if (r < 0) goto out; + WARN_ON_ONCE(!forward); ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a0a8ff119815..245f52b59215 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); +bool dm_table_is_wildcard(struct dm_table *t); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); @@ -102,6 +105,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, @@ -110,12 +114,14 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset); +#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL) #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { return false; } +#define dm_has_zone_plugs(md) false #endif /* |