diff options
Diffstat (limited to 'drivers/md')
32 files changed, 884 insertions, 501 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index ed40d8600656..2cc2eb24dc8a 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -36,6 +36,7 @@ #include <linux/sched/clock.h> #include <linux/rculist.h> #include <linux/delay.h> +#include <linux/sort.h> #include <trace/events/bcache.h> /* @@ -559,8 +560,6 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) } } -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int btree_lock_cmp_fn(const struct lockdep_map *_a, const struct lockdep_map *_b) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cc1de925111c..1a2ce1a4b456 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META; bio->bi_iter.bi_sector = SB_SECTOR; - __bio_add_page(bio, virt_to_page(out), SB_SIZE, - offset_in_page(out)); + bio_add_virt_nofail(bio, out, SB_SIZE); out->offset = cpu_to_le64(sb->offset); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9c8ed65cd87e..ec84ba5e93e5 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -41,16 +41,6 @@ #define DM_BUFIO_LOW_WATERMARK_RATIO 16 /* - * Check buffer ages in this interval (seconds) - */ -#define DM_BUFIO_WORK_TIMER_SECS 30 - -/* - * Free buffers when they are older than this (seconds) - */ -#define DM_BUFIO_DEFAULT_AGE_SECS 300 - -/* * The nr of bytes of cached data to keep around. */ #define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024) @@ -68,6 +58,8 @@ #define LIST_DIRTY 1 #define LIST_SIZE 2 +#define SCAN_RESCHED_CYCLE 16 + /*--------------------------------------------------------------*/ /* @@ -1055,10 +1047,8 @@ static unsigned long dm_bufio_cache_size_latch; static DEFINE_SPINLOCK(global_spinlock); -/* - * Buffers are freed after this timeout - */ -static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; +static unsigned int dm_bufio_max_age; /* No longer does anything */ + static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; @@ -1086,7 +1076,6 @@ static LIST_HEAD(dm_bufio_all_clients); static DEFINE_MUTEX(dm_bufio_clients_lock); static struct workqueue_struct *dm_bufio_wq; -static struct delayed_work dm_bufio_cleanup_old_work; static struct work_struct dm_bufio_replacement_work; @@ -1362,7 +1351,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector, ptr = (char *)b->data + offset; len = n_sectors << SECTOR_SHIFT; - __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr)); + bio_add_virt_nofail(bio, ptr, len); submit_bio(bio); } @@ -2424,7 +2413,12 @@ static void __scan(struct dm_bufio_client *c) atomic_long_dec(&c->need_shrink); freed++; - cond_resched(); + + if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) { + dm_bufio_unlock(c); + cond_resched(); + dm_bufio_lock(c); + } } } } @@ -2673,130 +2667,6 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset); /*--------------------------------------------------------------*/ -static unsigned int get_max_age_hz(void) -{ - unsigned int max_age = READ_ONCE(dm_bufio_max_age); - - if (max_age > UINT_MAX / HZ) - max_age = UINT_MAX / HZ; - - return max_age * HZ; -} - -static bool older_than(struct dm_buffer *b, unsigned long age_hz) -{ - return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz); -} - -struct evict_params { - gfp_t gfp; - unsigned long age_hz; - - /* - * This gets updated with the largest last_accessed (ie. most - * recently used) of the evicted buffers. It will not be reinitialised - * by __evict_many(), so you can use it across multiple invocations. - */ - unsigned long last_accessed; -}; - -/* - * We may not be able to evict this buffer if IO pending or the client - * is still using it. - * - * And if GFP_NOFS is used, we must not do any I/O because we hold - * dm_bufio_clients_lock and we would risk deadlock if the I/O gets - * rerouted to different bufio client. - */ -static enum evict_result select_for_evict(struct dm_buffer *b, void *context) -{ - struct evict_params *params = context; - - if (!(params->gfp & __GFP_FS) || - (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) { - if (test_bit_acquire(B_READING, &b->state) || - test_bit(B_WRITING, &b->state) || - test_bit(B_DIRTY, &b->state)) - return ER_DONT_EVICT; - } - - return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP; -} - -static unsigned long __evict_many(struct dm_bufio_client *c, - struct evict_params *params, - int list_mode, unsigned long max_count) -{ - unsigned long count; - unsigned long last_accessed; - struct dm_buffer *b; - - for (count = 0; count < max_count; count++) { - b = cache_evict(&c->cache, list_mode, select_for_evict, params); - if (!b) - break; - - last_accessed = READ_ONCE(b->last_accessed); - if (time_after_eq(params->last_accessed, last_accessed)) - params->last_accessed = last_accessed; - - __make_buffer_clean(b); - __free_buffer_wake(b); - - cond_resched(); - } - - return count; -} - -static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) -{ - struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0}; - unsigned long retain = get_retain_buffers(c); - unsigned long count; - LIST_HEAD(write_list); - - dm_bufio_lock(c); - - __check_watermark(c, &write_list); - if (unlikely(!list_empty(&write_list))) { - dm_bufio_unlock(c); - __flush_write_list(&write_list); - dm_bufio_lock(c); - } - - count = cache_total(&c->cache); - if (count > retain) - __evict_many(c, ¶ms, LIST_CLEAN, count - retain); - - dm_bufio_unlock(c); -} - -static void cleanup_old_buffers(void) -{ - unsigned long max_age_hz = get_max_age_hz(); - struct dm_bufio_client *c; - - mutex_lock(&dm_bufio_clients_lock); - - __cache_size_refresh(); - - list_for_each_entry(c, &dm_bufio_all_clients, client_list) - evict_old_buffers(c, max_age_hz); - - mutex_unlock(&dm_bufio_clients_lock); -} - -static void work_fn(struct work_struct *w) -{ - cleanup_old_buffers(); - - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); -} - -/*--------------------------------------------------------------*/ - /* * Global cleanup tries to evict the oldest buffers from across _all_ * the clients. It does this by repeatedly evicting a few buffers from @@ -2834,27 +2704,51 @@ static void __insert_client(struct dm_bufio_client *new_client) list_add_tail(&new_client->client_list, h); } +static enum evict_result select_for_evict(struct dm_buffer *b, void *context) +{ + /* In no-sleep mode, we cannot wait on IO. */ + if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) { + if (test_bit_acquire(B_READING, &b->state) || + test_bit(B_WRITING, &b->state) || + test_bit(B_DIRTY, &b->state)) + return ER_DONT_EVICT; + } + return ER_EVICT; +} + static unsigned long __evict_a_few(unsigned long nr_buffers) { - unsigned long count; struct dm_bufio_client *c; - struct evict_params params = { - .gfp = GFP_KERNEL, - .age_hz = 0, - /* set to jiffies in case there are no buffers in this client */ - .last_accessed = jiffies - }; + unsigned long oldest_buffer = jiffies; + unsigned long last_accessed; + unsigned long count; + struct dm_buffer *b; c = __pop_client(); if (!c) return 0; dm_bufio_lock(c); - count = __evict_many(c, ¶ms, LIST_CLEAN, nr_buffers); + + for (count = 0; count < nr_buffers; count++) { + b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL); + if (!b) + break; + + last_accessed = READ_ONCE(b->last_accessed); + if (time_after_eq(oldest_buffer, last_accessed)) + oldest_buffer = last_accessed; + + __make_buffer_clean(b); + __free_buffer_wake(b); + + cond_resched(); + } + dm_bufio_unlock(c); if (count) - c->oldest_buffer = params.last_accessed; + c->oldest_buffer = oldest_buffer; __insert_client(c); return count; @@ -2937,10 +2831,7 @@ static int __init dm_bufio_init(void) if (!dm_bufio_wq) return -ENOMEM; - INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn); INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup); - queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work, - DM_BUFIO_WORK_TIMER_SECS * HZ); return 0; } @@ -2952,7 +2843,6 @@ static void __exit dm_bufio_exit(void) { int bug = 0; - cancel_delayed_work_sync(&dm_bufio_cleanup_old_work); destroy_workqueue(dm_bufio_wq); if (dm_bufio_client_count) { @@ -2989,7 +2879,7 @@ module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644); MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644); -MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); +MODULE_PARM_DESC(max_age_seconds, "No longer does anything"); module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 3637761f3585..c889332e533b 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -141,6 +141,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; void *zone_revalidate_map; + struct task_struct *revalidate_map_task; #endif #ifdef CONFIG_IMA @@ -162,9 +163,6 @@ struct mapped_device { #define DMF_POST_SUSPENDING 8 #define DMF_EMULATE_ZONE_APPEND 9 -void disable_discard(struct mapped_device *md); -void disable_write_zeroes(struct mapped_device *md); - static inline sector_t dm_get_size(struct mapped_device *md) { return get_capacity(md->disk); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index d4cf0ac2a7aa..16d3d454fb0a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -14,11 +14,14 @@ #include <linux/bio.h> #include <linux/slab.h> #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/device-mapper.h> #define DM_MSG_PREFIX "delay" +#define SLEEP_SHIFT 3 + struct delay_class { struct dm_dev *dev; sector_t start; @@ -34,6 +37,7 @@ struct delay_c { struct work_struct flush_expired_bios; struct list_head delayed_bios; struct task_struct *worker; + unsigned int worker_sleep_us; bool may_delay; struct delay_class read; @@ -136,6 +140,7 @@ static int flush_worker_fn(void *data) schedule(); } else { spin_unlock(&dc->delayed_bios_lock); + fsleep(dc->worker_sleep_us); cond_resched(); } } @@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) { struct delay_c *dc; int ret; - unsigned int max_delay; + unsigned int max_delay, min_delay; if (argc != 3 && argc != 6 && argc != 9) { ti->error = "Requires exactly 3, 6 or 9 arguments"; @@ -235,7 +240,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) ret = delay_class_ctr(ti, &dc->read, argv); if (ret) goto bad; - max_delay = dc->read.delay; + min_delay = max_delay = dc->read.delay; if (argc == 3) { ret = delay_class_ctr(ti, &dc->write, argv); @@ -251,6 +256,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->write.delay); + min_delay = min_not_zero(min_delay, dc->write.delay); if (argc == 6) { ret = delay_class_ctr(ti, &dc->flush, argv + 3); @@ -263,9 +269,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; max_delay = max(max_delay, dc->flush.delay); + min_delay = min_not_zero(min_delay, dc->flush.delay); out: if (max_delay < 50) { + if (min_delay >> SLEEP_SHIFT) + dc->worker_sleep_us = 1000; + else + dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT; /* * In case of small requested delays, use kthread instead of * timers and workqueue to achieve better latency. @@ -438,7 +449,7 @@ out: static struct target_type delay_target = { .name = "delay", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 1a33820c9f46..e75310232bbf 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type, } } -static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dust_device *dd = ti->private; struct dm_dev *dev = dd->dev; diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index b19b0142a690..6abb31ca9662 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -415,7 +415,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type, } } -static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct ebs_c *ec = ti->private; struct dm_dev *dev = ec->dev; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index b690905ab89f..c711db6f8f5c 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -47,14 +47,15 @@ enum feature_flag_bits { }; struct per_bio_data { - bool bio_submitted; + bool bio_can_corrupt; + struct bvec_iter saved_iter; }; static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, struct dm_target *ti) { - int r; - unsigned int argc; + int r = 0; + unsigned int argc = 0; const char *arg_name; static const struct dm_arg _args[] = { @@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, {0, PROBABILITY_BASE, "Invalid random corrupt argument"}, }; - /* No feature arguments supplied. */ - if (!as->argc) - return 0; - - r = dm_read_arg_group(_args, as, &argc, &ti->error); - if (r) + if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error))) return r; + /* No feature arguments supplied. */ + if (!argc) + goto error_all_io; + while (argc) { arg_name = dm_shift_arg(as); argc--; @@ -128,8 +128,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags> */ if (!strcasecmp(arg_name, "corrupt_bio_byte")) { - if (!argc) { - ti->error = "Feature corrupt_bio_byte requires parameters"; + if (fc->corrupt_bio_byte) { + ti->error = "Feature corrupt_bio_byte duplicated"; + return -EINVAL; + } else if (argc < 4) { + ti->error = "Feature corrupt_bio_byte requires 4 parameters"; return -EINVAL; } @@ -176,7 +179,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_read_corrupt")) { - if (!argc) { + if (fc->random_read_corrupt) { + ti->error = "Feature random_read_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_read_corrupt requires a parameter"; return -EINVAL; } @@ -189,7 +195,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, } if (!strcasecmp(arg_name, "random_write_corrupt")) { - if (!argc) { + if (fc->random_write_corrupt) { + ti->error = "Feature random_write_corrupt duplicated"; + return -EINVAL; + } else if (!argc) { ti->error = "Feature random_write_corrupt requires a parameter"; return -EINVAL; } @@ -205,18 +214,25 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, return -EINVAL; } - if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + if (test_bit(DROP_WRITES, &fc->flags) && + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; return -EINVAL; - } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) { - ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set"; + } else if (test_bit(ERROR_WRITES, &fc->flags) && + (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) { + ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set"; + return -EINVAL; + } else if (test_bit(ERROR_READS, &fc->flags) && + (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) { + ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set"; return -EINVAL; } if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) && !test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) && !fc->random_read_corrupt && !fc->random_write_corrupt) { +error_all_io: set_bit(ERROR_WRITES, &fc->flags); set_bit(ERROR_READS, &fc->flags); } @@ -278,7 +294,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error); + r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error); if (r) goto bad; @@ -339,7 +355,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) } static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, - unsigned char corrupt_bio_value) + unsigned char corrupt_bio_value, + struct bvec_iter start) { struct bvec_iter iter; struct bio_vec bvec; @@ -348,7 +365,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, * Overwrite the Nth byte of the bio's data, on whichever page * it falls. */ - bio_for_each_segment(bvec, bio, iter) { + __bio_for_each_segment(bvec, bio, iter, start) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { unsigned char *segment = bvec_kmap_local(&bvec); segment[corrupt_bio_byte] = corrupt_bio_value; @@ -357,36 +374,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte, "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", bio, corrupt_bio_value, corrupt_bio_byte, (bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size); + (unsigned long long)start.bi_sector, + start.bi_size); break; } corrupt_bio_byte -= bio_iter_len(bio, iter); } } -static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) +static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc, + struct bvec_iter start) { unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1; - if (!bio_has_data(bio)) - return; - - corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value); + corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start); } -static void corrupt_bio_random(struct bio *bio) +static void corrupt_bio_random(struct bio *bio, struct bvec_iter start) { unsigned int corrupt_byte; unsigned char corrupt_value; - if (!bio_has_data(bio)) - return; - - corrupt_byte = get_random_u32() % bio->bi_iter.bi_size; + corrupt_byte = get_random_u32() % start.bi_size; corrupt_value = get_random_u8(); - corrupt_bio_common(bio, corrupt_byte, corrupt_value); + corrupt_bio_common(bio, corrupt_byte, corrupt_value, start); } static void clone_free(struct bio *clone) @@ -481,7 +493,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) unsigned int elapsed; struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); - pb->bio_submitted = false; + pb->bio_can_corrupt = false; if (op_is_zone_mgmt(bio_op(bio))) goto map_bio; @@ -490,14 +502,15 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) elapsed = (jiffies - fc->start_time) / HZ; if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) { bool corrupt_fixed, corrupt_random; - /* - * Flag this bio as submitted while down. - */ - pb->bio_submitted = true; + + if (bio_has_data(bio)) { + pb->bio_can_corrupt = true; + pb->saved_iter = bio->bi_iter; + } /* - * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set. - * Otherwise, flakey_end_io() will decide if the reads should be modified. + * If ERROR_READS isn't set flakey_end_io() will decide if the + * reads should be modified. */ if (bio_data_dir(bio) == READ) { if (test_bit(ERROR_READS, &fc->flags)) @@ -516,6 +529,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } + if (!pb->bio_can_corrupt) + goto map_bio; /* * Corrupt matching writes. */ @@ -535,9 +550,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) struct bio *clone = clone_bio(ti, fc, bio); if (clone) { if (corrupt_fixed) - corrupt_bio_data(clone, fc); + corrupt_bio_data(clone, fc, + clone->bi_iter); if (corrupt_random) - corrupt_bio_random(clone); + corrupt_bio_random(clone, + clone->bi_iter); submit_bio(clone); return DM_MAPIO_SUBMITTED; } @@ -559,28 +576,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, if (op_is_zone_mgmt(bio_op(bio))) return DM_ENDIO_DONE; - if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { + if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) { if (fc->corrupt_bio_byte) { if ((fc->corrupt_bio_rw == READ) && all_corrupt_bio_flags_match(bio, fc)) { /* * Corrupt successful matching READs while in down state. */ - corrupt_bio_data(bio, fc); + corrupt_bio_data(bio, fc, pb->saved_iter); } } if (fc->random_read_corrupt) { u64 rnd = get_random_u64(); u32 rem = do_div(rnd, PROBABILITY_BASE); if (rem < fc->random_read_corrupt) - corrupt_bio_random(bio); - } - if (test_bit(ERROR_READS, &fc->flags)) { - /* - * Error read during the down_interval if drop_writes - * and error_writes were not configured. - */ - *error = BLK_STS_IOERR; + corrupt_bio_random(bio, pb->saved_iter); } } @@ -638,7 +648,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type, } } -static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct flakey_c *fc = ti->private; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 2a283feb3319..1f626066e8cc 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w) char *mem; outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios); - - r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0); - if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) { - bio_put(outgoing_bio); - bio->bi_status = BLK_STS_RESOURCE; - bio_endio(bio); - return; - } + bio_add_virt_nofail(outgoing_bio, outgoing_data, + ic->sectors_per_block << SECTOR_SHIFT); bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1); if (IS_ERR(bip)) { @@ -3211,7 +3205,8 @@ next_chunk: bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios); bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; - __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + bio_add_virt_nofail(bio, recalc_buffer, + range.n_sectors << SECTOR_SHIFT); r = submit_bio_wait(bio); bio_put(bio); if (unlikely(r)) { @@ -3228,7 +3223,8 @@ next_chunk: bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios); bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector; - __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer)); + bio_add_virt_nofail(bio, recalc_buffer, + range.n_sectors << SECTOR_SHIFT); bip = bio_integrity_alloc(bio, GFP_NOIO, 1); if (unlikely(IS_ERR(bip))) { @@ -5164,7 +5160,7 @@ static void dm_integrity_dtr(struct dm_target *ti) BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); BUG_ON(!list_empty(&ic->wait_list)); - if (ic->mode == 'B') + if (ic->mode == 'B' && ic->bitmap_flush_work.work.func) cancel_delayed_work_sync(&ic->bitmap_flush_work); if (ic->metadata_wq) destroy_workqueue(ic->metadata_wq); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index d42eac944eb5..4165fef4c170 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}, {DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll}, {DM_GET_TARGET_VERSION_CMD, 0, get_target_version}, + {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */ }; if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 66318aba4bdb..15538ec58f8e 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type, } } -static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct linear_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 8d7df8303d0a..d484e8e1d48a 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type, } static int log_writes_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct log_writes_c *lc = ti->private; struct dm_dev *dev = lc->dev; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6c98f4ae5ea9..81fec2e1e0ef 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -79,6 +79,7 @@ struct multipath { struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ + struct priority_group *last_probed_pg; atomic_t nr_valid_paths; /* Total number of usable paths */ unsigned int nr_priority_groups; @@ -87,6 +88,7 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + wait_queue_head_t probe_wait; /* Wait for probing paths */ unsigned int pg_init_retries; /* Number of times to retry pg_init */ unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ @@ -100,6 +102,7 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ + bool is_suspending; }; /* @@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ +#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) { @@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); + init_waitqueue_head(&m->probe_wait); return 0; } @@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) goto failed; } + /* Don't change PG until it has no remaining paths */ + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + /* Were we instructed to switch PG? */ if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { spin_unlock_irqrestore(&m->lock, flags); - goto check_current_pg; + goto check_all_pgs; } m->next_pg = NULL; spin_unlock_irqrestore(&m->lock, flags); @@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) if (!IS_ERR_OR_NULL(pgpath)) return pgpath; } - - /* Don't change PG until it has no remaining paths */ -check_current_pg: - pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) - return pgpath; - } - +check_all_pgs: /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. @@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio) static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) { struct pgpath *pgpath; - unsigned long flags; /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); @@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); if (!pgpath) { - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { __multipath_queue_bio(m, bio); pgpath = ERR_PTR(-EAGAIN); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) || mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) { @@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m) static void process_queued_bios(struct work_struct *work) { int r; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work) bio_list_init(&bios); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (bio_list_empty(&m->queued_bios)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); return; } bio_list_merge_init(&bios, &m->queued_bios); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) struct dm_arg_set as; unsigned int pg_count = 0; unsigned int next_pg_num; - unsigned long flags; as.argc = argc; as.argv = argv; @@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); ti->num_flush_bios = 1; ti->num_discard_bios = 1; @@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m) static void flush_multipath_work(struct multipath *m) { if (m->hw_handler_name) { - unsigned long flags; - if (!atomic_read(&m->pg_init_in_progress)) goto skip; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (atomic_read(&m->pg_init_in_progress) && !test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) { - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); flush_workqueue(kmpath_handlerd); multipath_wait_for_pg_init_completion(m); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags); } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } skip: if (m->queue_mode == DM_TYPE_BIO_BASED) @@ -1370,11 +1370,10 @@ out: static int reinstate_path(struct pgpath *pgpath) { int r = 0, run_queue = 0; - unsigned long flags; struct multipath *m = pgpath->pg->m; unsigned int nr_valid_paths; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (pgpath->is_active) goto out; @@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); out: - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); if (run_queue) { dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); @@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action) * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - bool bypassed) + bool bypassed, bool can_be_delayed) { unsigned long flags; spin_lock_irqsave(&m->lock, flags); pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } spin_unlock_irqrestore(&m->lock, flags); @@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) { struct priority_group *pg; unsigned int pgnum; - unsigned long flags; char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || @@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) return -EINVAL; } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); list_for_each_entry(pg, &m->priority_groups, list) { pg->bypassed = false; if (--pgnum) continue; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } m->next_pg = pg; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); schedule_work(&m->trigger_event); return 0; @@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) break; } - bypass_pg(m, pg, bypassed); + bypass_pg(m, pg, bypassed, true); return 0; } @@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, true); + bypass_pg(m, pg, true, false); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; + spin_lock_irq(&m->lock); + m->is_suspending = true; + spin_unlock_irq(&m->lock); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti) static void multipath_resume(struct dm_target *ti) { struct multipath *m = ti->private; - unsigned long flags; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); + m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti) test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags), test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } /* @@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { int sz = 0, pg_counter, pgpath_counter; - unsigned long flags; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned int pg_num; char state; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); /* Features */ if (type == STATUSTYPE_INFO) @@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", m->nr_priority_groups); - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) + if (m->current_pg) pg_num = m->current_pg->pg_num; + else if (m->next_pg) + pg_num = m->next_pg->pg_num; else pg_num = (m->nr_priority_groups ? 1 : 0); @@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, break; } - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv, @@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg dev_t dev; struct multipath *m = ti->private; action_fn action; - unsigned long flags; mutex_lock(&m->work_mutex); @@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false, __func__); - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); enable_nopath_timeout(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false, __func__); @@ -2021,14 +2028,132 @@ out: return r; } +/* + * Perform a minimal read from the given path to find out whether the + * path still works. If a path error occurs, fail it. + */ +static int probe_path(struct pgpath *pgpath) +{ + struct block_device *bdev = pgpath->path.dev->bdev; + unsigned int read_size = bdev_logical_block_size(bdev); + struct page *page; + struct bio *bio; + blk_status_t status; + int r = 0; + + if (WARN_ON_ONCE(read_size > PAGE_SIZE)) + return -EINVAL; + + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + /* Perform a minimal read: Sector 0, length read_size */ + bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL); + if (!bio) { + r = -ENOMEM; + goto out; + } + + bio->bi_iter.bi_sector = 0; + __bio_add_page(bio, page, read_size, 0); + submit_bio_wait(bio); + status = bio->bi_status; + bio_put(bio); + + if (status && blk_path_error(status)) + fail_path(pgpath); + +out: + __free_page(page); + return r; +} + +/* + * Probe all active paths in current_pg to find out whether they still work. + * Fail all paths that do not work. + * + * Return -ENOTCONN if no valid path is left (even outside of current_pg). We + * cannot probe paths in other pgs without switching current_pg, so if valid + * paths are only in different pgs, they may or may not work. Additionally + * we should not probe paths in a pathgroup that is in the process of + * Initializing. Userspace can submit a request and we'll switch and wait + * for the pathgroup to be initialized. If the request fails, it may need to + * probe again. + */ +static int probe_active_paths(struct multipath *m) +{ + struct pgpath *pgpath; + struct priority_group *pg = NULL; + int r = 0; + + spin_lock_irq(&m->lock); + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { + wait_event_lock_irq(m->probe_wait, + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), + m->lock); + /* + * if we waited because a probe was already in progress, + * and it probed the current active pathgroup, don't + * reprobe. Just return the number of valid paths + */ + if (m->current_pg == m->last_probed_pg) + goto skip_probe; + } + if (!m->current_pg || m->is_suspending || + test_bit(MPATHF_QUEUE_IO, &m->flags)) + goto skip_probe; + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + pg = m->last_probed_pg = m->current_pg; + spin_unlock_irq(&m->lock); + + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pg != READ_ONCE(m->current_pg) || + READ_ONCE(m->is_suspending)) + goto out; + if (!pgpath->is_active) + continue; + + r = probe_path(pgpath); + if (r < 0) + goto out; + } + +out: + spin_lock_irq(&m->lock); + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } +skip_probe: + if (r == 0 && !atomic_read(&m->nr_valid_paths)) + r = -ENOTCONN; + spin_unlock_irq(&m->lock); + if (pg) + wake_up(&m->probe_wait); + return r; +} + static int multipath_prepare_ioctl(struct dm_target *ti, - struct block_device **bdev) + struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct multipath *m = ti->private; struct pgpath *pgpath; - unsigned long flags; int r; + if (_IOC_TYPE(cmd) == DM_IOCTL) { + *forward = false; + switch (cmd) { + case DM_MPATH_PROBE_PATHS: + return probe_active_paths(m); + default: + return -ENOTTY; + } + } + pgpath = READ_ONCE(m->current_pgpath); if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m)) pgpath = choose_pgpath(m, 0); @@ -2044,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, } else { /* No path is available */ r = -EIO; - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) r = -ENOTCONN; - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); } if (r == -ENOTCONN) { @@ -2055,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti, /* Path status changed, redo selection */ (void) choose_pgpath(m, 0); } - spin_lock_irqsave(&m->lock, flags); + spin_lock_irq(&m->lock); if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) (void) __pg_init_all_paths(m); - spin_unlock_irqrestore(&m->lock, flags); + spin_unlock_irq(&m->lock); dm_table_run_md_queue_async(m->ti->table); process_queued_io_list(m); } @@ -2180,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti) */ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE | DM_TARGET_PASSES_INTEGRITY, .module = THIS_MODULE, diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6adc55fd90d3..127138c61be5 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -14,6 +14,7 @@ #include "raid5.h" #include "raid10.h" #include "md-bitmap.h" +#include "dm-core.h" #include <linux/device-mapper.h> @@ -3308,6 +3309,7 @@ size_check: /* Disable/enable discard support on raid set. */ configure_discard_support(rs); + rs->md.dm_gendisk = ti->table->md->disk; mddev_unlock(&rs->md); return 0; @@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti) mddev_lock_nointr(&rs->md); md_stop(&rs->md); + rs->md.dm_gendisk = NULL; mddev_unlock(&rs->md); if (work_pending(&rs->md.event_work)) diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9e615b4f1f5e..785af4816584 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) spin_lock_irqsave(&ms->lock, flags); should_wake = !(bl->head); bio_list_add(bl, bio); - spin_unlock_irqrestore(&ms->lock, flags); - if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void dispatch_bios(void *context, struct bio_list *bio_list) @@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context) if (!ms->failures.head) should_wake = 1; bio_list_add(&ms->failures, bio); - spin_unlock_irqrestore(&ms->lock, flags); if (should_wake) wakeup_mirrord(ms); + spin_unlock_irqrestore(&ms->lock, flags); } static void do_write(struct mirror_set *ms, struct bio *bio) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index e23076f7ece2..a6ca92049c10 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -217,10 +217,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped) if (unlikely(error == BLK_STS_TARGET)) { if (req_op(clone) == REQ_OP_DISCARD && !clone->q->limits.max_discard_sectors) - disable_discard(tio->md); + blk_queue_disable_discard(tio->md->queue); else if (req_op(clone) == REQ_OP_WRITE_ZEROES && !clone->q->limits.max_write_zeroes_sectors) - disable_write_zeroes(tio->md); + blk_queue_disable_write_zeroes(tio->md->queue); } switch (r) { diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index a1b7535c508a..a7dc04bd55e5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -405,7 +405,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error) { unsigned int i; - char major_minor[16]; + char major_minor[22]; struct stripe_c *sc = ti->private; if (!*error) @@ -417,8 +417,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio, if (*error == BLK_STS_NOTSUPP) return DM_ENDIO_DONE; - memset(major_minor, 0, sizeof(major_minor)); - sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio))); + format_dev_t(major_minor, bio_dev(bio)); /* * Test to see which stripe drive triggered the event diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c index dfd9fb52a6f3..bb1a70b5a215 100644 --- a/drivers/md/dm-switch.c +++ b/drivers/md/dm-switch.c @@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type, * * Passthrough all ioctls to the path for sector 0 */ -static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct switch_ctx *sctx = ti->private; unsigned int path_nr; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 35100a435c88..24a857ff6d0b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -117,7 +117,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num) n_targets = (struct dm_target *) (n_highs + num); memset(n_highs, -1, sizeof(*n_highs) * num); - kvfree(t->highs); t->num_allocated = num; t->highs = n_highs; @@ -257,7 +256,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, if (bdev_is_zoned(bdev)) { unsigned int zone_sectors = bdev_zone_sectors(bdev); - if (start & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, start)) { DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)start, @@ -274,7 +273,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * devices do not end up with a smaller zone in the middle of * the sector range. */ - if (len & (zone_sectors - 1)) { + if (!bdev_is_zone_aligned(bdev, len)) { DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg", dm_device_name(ti->table->md), (unsigned long long)len, @@ -431,6 +430,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, return 0; } + mutex_lock(&q->limits_lock); + /* + * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in + * blk_stack_limits(), so do it manually. + */ + limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES); + if (blk_stack_limits(limits, &q->limits, get_start_sect(bdev) + start) < 0) DMWARN("%s: adding target device %pg caused an alignment inconsistency: " @@ -448,6 +454,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (!dm_target_has_integrity(ti->type)) queue_limits_stack_integrity_bdev(limits, bdev); + mutex_unlock(&q->limits_lock); return 0; } @@ -523,8 +530,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv) gfp = GFP_NOIO; } argv = kmalloc_array(new_size, sizeof(*argv), gfp); - if (argv && old_argv) { - memcpy(argv, old_argv, *size * sizeof(*argv)); + if (argv) { + if (old_argv) + memcpy(argv, old_argv, *size * sizeof(*argv)); *size = new_size; } @@ -1049,7 +1057,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned int min_pool_size = 0, pool_size; struct dm_md_mempools *pools; unsigned int bioset_flags = 0; - bool mempool_needs_integrity = t->integrity_supported; if (unlikely(type == DM_TYPE_NONE)) { DMERR("no table type is set, can't allocate mempools"); @@ -1074,8 +1081,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); - - mempool_needs_integrity |= ti->mempool_needs_integrity; } pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, @@ -1175,7 +1180,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, t = dm_get_live_table(md, &srcu_idx); if (!t) - return 0; + goto put_live_table; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1186,10 +1191,181 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, (void *)key); } +put_live_table: dm_put_live_table(md, srcu_idx); return 0; } +enum dm_wrappedkey_op { + DERIVE_SW_SECRET, + IMPORT_KEY, + GENERATE_KEY, + PREPARE_KEY, +}; + +struct dm_wrappedkey_op_args { + enum dm_wrappedkey_op op; + int err; + union { + struct { + const u8 *eph_key; + size_t eph_key_size; + u8 *sw_secret; + } derive_sw_secret; + struct { + const u8 *raw_key; + size_t raw_key_size; + u8 *lt_key; + } import_key; + struct { + u8 *lt_key; + } generate_key; + struct { + const u8 *lt_key; + size_t lt_key_size; + u8 *eph_key; + } prepare_key; + }; +}; + +static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) +{ + struct dm_wrappedkey_op_args *args = data; + struct block_device *bdev = dev->bdev; + struct blk_crypto_profile *profile = + bdev_get_queue(bdev)->crypto_profile; + int err = -EOPNOTSUPP; + + if (!args->err) + return 0; + + switch (args->op) { + case DERIVE_SW_SECRET: + err = blk_crypto_derive_sw_secret( + bdev, + args->derive_sw_secret.eph_key, + args->derive_sw_secret.eph_key_size, + args->derive_sw_secret.sw_secret); + break; + case IMPORT_KEY: + err = blk_crypto_import_key(profile, + args->import_key.raw_key, + args->import_key.raw_key_size, + args->import_key.lt_key); + break; + case GENERATE_KEY: + err = blk_crypto_generate_key(profile, + args->generate_key.lt_key); + break; + case PREPARE_KEY: + err = blk_crypto_prepare_key(profile, + args->prepare_key.lt_key, + args->prepare_key.lt_key_size, + args->prepare_key.eph_key); + break; + } + args->err = err; + + /* Try another device in case this fails. */ + return 0; +} + +static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile, + struct dm_wrappedkey_op_args *args) +{ + struct mapped_device *md = + container_of(profile, struct dm_crypto_profile, profile)->md; + struct dm_target *ti; + struct dm_table *t; + int srcu_idx; + int i; + + args->err = -EOPNOTSUPP; + + t = dm_get_live_table(md, &srcu_idx); + if (!t) + goto out; + + /* + * blk-crypto currently has no support for multiple incompatible + * implementations of wrapped inline crypto keys on a single system. + * It was already checked earlier that support for wrapped keys was + * declared on all underlying devices. Thus, all the underlying devices + * should support all wrapped key operations and they should behave + * identically, i.e. work with the same keys. So, just executing the + * operation on the first device on which it works suffices for now. + */ + for (i = 0; i < t->num_targets; i++) { + ti = dm_table_get_target(t, i); + if (!ti->type->iterate_devices) + continue; + ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args); + if (!args->err) + break; + } +out: + dm_put_live_table(md, srcu_idx); + return args->err; +} + +static int dm_derive_sw_secret(struct blk_crypto_profile *profile, + const u8 *eph_key, size_t eph_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = DERIVE_SW_SECRET, + .derive_sw_secret = { + .eph_key = eph_key, + .eph_key_size = eph_key_size, + .sw_secret = sw_secret, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_import_key(struct blk_crypto_profile *profile, + const u8 *raw_key, size_t raw_key_size, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = IMPORT_KEY, + .import_key = { + .raw_key = raw_key, + .raw_key_size = raw_key_size, + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_generate_key(struct blk_crypto_profile *profile, + u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = GENERATE_KEY, + .generate_key = { + .lt_key = lt_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + +static int dm_prepare_key(struct blk_crypto_profile *profile, + const u8 *lt_key, size_t lt_key_size, + u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE]) +{ + struct dm_wrappedkey_op_args args = { + .op = PREPARE_KEY, + .prepare_key = { + .lt_key = lt_key, + .lt_key_size = lt_key_size, + .eph_key = eph_key, + }, + }; + return dm_exec_wrappedkey_op(profile, &args); +} + static int device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) @@ -1264,6 +1440,13 @@ static int dm_table_construct_crypto_profile(struct dm_table *t) profile); } + if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) { + profile->ll_ops.derive_sw_secret = dm_derive_sw_secret; + profile->ll_ops.import_key = dm_import_key; + profile->ll_ops.generate_key = dm_generate_key; + profile->ll_ops.prepare_key = dm_prepare_key; + } + if (t->md->queue && !blk_crypto_has_capabilities(profile, t->md->queue->crypto_profile)) { @@ -1491,6 +1674,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t) return true; } +bool dm_table_is_wildcard(struct dm_table *t) +{ + for (unsigned int i = 0; i < t->num_targets; i++) { + struct dm_target *ti = dm_table_get_target(t, i); + + if (!dm_target_is_wildcard(ti->type)) + return false; + } + + return true; +} + static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1722,8 +1917,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev * sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); + int b; - return !q->limits.max_write_zeroes_sectors; + mutex_lock(&q->limits_lock); + b = !q->limits.max_write_zeroes_sectors; + mutex_unlock(&q->limits_lock); + return b; } static bool dm_table_supports_write_zeroes(struct dm_table *t) @@ -1831,10 +2030,24 @@ static bool dm_table_supports_atomic_writes(struct dm_table *t) return true; } +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size) +{ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) && + old_size != new_size) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change size", + dm_device_name(t->md)); + return false; + } + return true; +} + int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { int r; + struct queue_limits old_limits; if (!dm_table_supports_nowait(t)) limits->features &= ~BLK_FEAT_NOWAIT; @@ -1861,28 +2074,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_supports_flush(t)) limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA; - if (dm_table_supports_dax(t, device_not_dax_capable)) { + if (dm_table_supports_dax(t, device_not_dax_capable)) limits->features |= BLK_FEAT_DAX; - if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) - set_dax_synchronous(t->md->dax_dev); - } else + else limits->features &= ~BLK_FEAT_DAX; - if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) - dax_write_cache(t->md->dax_dev, true); - /* For a zoned table, setup the zone related queue attributes. */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - (limits->features & BLK_FEAT_ZONED)) { - r = dm_set_zones_restrictions(t, q, limits); - if (r) - return r; + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { + if (limits->features & BLK_FEAT_ZONED) { + r = dm_set_zones_restrictions(t, q, limits); + if (r) + return r; + } else if (dm_has_zone_plugs(t->md)) { + DMWARN("%s: device has zone write plug resources. " + "Cannot switch to non-zoned table.", + dm_device_name(t->md)); + return -EINVAL; + } } if (dm_table_supports_atomic_writes(t)) limits->features |= BLK_FEAT_ATOMIC_WRITES; - r = queue_limits_set(q, limits); + old_limits = queue_limits_start_update(q); + r = queue_limits_commit_update(q, limits); if (r) return r; @@ -1893,10 +2108,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && (limits->features & BLK_FEAT_ZONED)) { r = dm_revalidate_zones(t, q); - if (r) + if (r) { + queue_limits_set(q, &old_limits); return r; + } } + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + dm_finalize_zone_settings(t, limits); + + if (dm_table_supports_dax(t, device_not_dax_synchronous_capable)) + set_dax_synchronous(t->md->dax_dev); + + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) + dax_write_cache(t->md->dax_dev, true); + dm_update_crypto_profile(q, t); return 0; } diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c index 655453bb276b..425b3a74f4db 100644 --- a/drivers/md/dm-vdo/indexer/volume.c +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * u32 physical_page, struct cached_page **page_ptr) { struct cached_page *page; + unsigned int zone_number = request->zone_number; get_page_from_cache(&volume->page_cache, physical_page, &page); if (page != NULL) { - if (request->zone_number == 0) { + if (zone_number == 0) { /* Only one zone is allowed to update the LRU. */ make_page_most_recent(&volume->page_cache, page); } @@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * } /* Prepare to enqueue a read for the page. */ - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); mutex_lock(&volume->read_threads_mutex); /* @@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * the order does not matter for correctness as it does below. */ mutex_unlock(&volume->read_threads_mutex); - begin_pending_search(&volume->page_cache, physical_page, - request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); return UDS_QUEUED; } @@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request * * "search pending" state in careful order so no other thread can mess with the data before * the caller gets to look at it. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); mutex_unlock(&volume->read_threads_mutex); *page_ptr = page; return UDS_SUCCESS; @@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r { int result; struct cached_page *page = NULL; + unsigned int zone_number = request->zone_number; u32 physical_page = map_to_physical_page(volume->geometry, chapter, index_page_number); @@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } result = uds_search_chapter_index_page(&page->index_page, volume->geometry, &request->record_name, record_page_number); - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req { struct cached_page *record_page; struct index_geometry *geometry = volume->geometry; + unsigned int zone_number = request->zone_number; int result; u32 physical_page, page_number; @@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req * invalidation by the reader thread, before the reader thread has noticed that the * invalidate_counter has been incremented. */ - begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + begin_pending_search(&volume->page_cache, physical_page, zone_number); result = get_volume_page_protected(volume, request, physical_page, &record_page); if (result != UDS_SUCCESS) { - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return result; } @@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req &request->record_name, geometry, &request->old_metadata)) *found = true; - end_pending_search(&volume->page_cache, request->zone_number); + end_pending_search(&volume->page_cache, zone_number); return UDS_SUCCESS; } diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 0c41949db784..631a887b487c 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -593,6 +593,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { + if (v->fec->dev) { + ti->error = "FEC device already specified"; + return -EINVAL; + } r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 3c427f18a04b..81186bded1ce 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -682,7 +682,8 @@ static void verity_bh_work(struct work_struct *w) static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) { return ioprio <= IOPRIO_CLASS_IDLE && - bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) && + !need_resched(); } static void verity_end_io(struct bio *bio) @@ -993,7 +994,9 @@ static void verity_status(struct dm_target *ti, status_type_t type, } } -static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, + bool *forward) { struct dm_verity *v = ti->private; @@ -1120,6 +1123,9 @@ static int verity_alloc_most_once(struct dm_verity *v) { struct dm_target *ti = v->ti; + if (v->validated_blocks) + return 0; + /* the bitset can only handle INT_MAX blocks */ if (v->data_blocks > INT_MAX) { ti->error = "device too large to use check_at_most_once"; @@ -1143,6 +1149,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v) struct dm_verity_io *io; u8 *zero_data; + if (v->zero_digest) + return 0; + v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL); if (!v->zero_digest) @@ -1577,7 +1586,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - /* Root hash signature is a optional parameter*/ + /* Root hash signature is an optional parameter */ r = verity_verify_root_hash(root_hash_digest_to_validate, strlen(root_hash_digest_to_validate), verify_args.sig, diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c index a9e2c6c0a33c..d5261a0e4232 100644 --- a/drivers/md/dm-verity-verify-sig.c +++ b/drivers/md/dm-verity-verify-sig.c @@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, const char *arg_name) { struct dm_target *ti = v->ti; - int ret = 0; + int ret; const char *sig_key = NULL; + if (v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified"); + return -EINVAL; + } + if (!*argc) { ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified"); return -EINVAL; @@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as, (*argc)--; ret = verity_verify_get_sig_from_key(sig_key, sig_opts); - if (ret < 0) + if (ret < 0) { ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified"); + return ret; + } v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL); - if (!v->signature_key_desc) + if (!v->signature_key_desc) { + ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key"); return -ENOMEM; + } - return ret; + return 0; } /* diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 20edd3fabbab..3d31b82e0730 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -56,24 +56,31 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, { struct mapped_device *md = disk->private_data; struct dm_table *map; - int srcu_idx, ret; + struct dm_table *zone_revalidate_map = md->zone_revalidate_map; + int srcu_idx, ret = -EIO; + bool put_table = false; - if (!md->zone_revalidate_map) { - /* Regular user context */ + if (!zone_revalidate_map || md->revalidate_map_task != current) { + /* + * Regular user context or + * Zone revalidation during __bind() is in progress, but this + * call is from a different process + */ if (dm_suspended_md(md)) return -EAGAIN; map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + put_table = true; } else { /* Zone revalidation during __bind() */ - map = md->zone_revalidate_map; + map = zone_revalidate_map; } - ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); + if (map) + ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, + data); - if (!md->zone_revalidate_map) + if (put_table) dm_put_live_table(md, srcu_idx); return ret; @@ -153,33 +160,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; struct gendisk *disk = md->disk; + unsigned int nr_zones = disk->nr_zones; int ret; if (!get_capacity(disk)) return 0; - /* Revalidate only if something changed. */ - if (!disk->nr_zones || disk->nr_zones != md->nr_zones) { - DMINFO("%s using %s zone append", - disk->disk_name, - queue_emulates_zone_append(q) ? "emulated" : "native"); - md->nr_zones = 0; - } - - if (md->nr_zones) + /* + * Do not revalidate if zone write plug resources have already + * been allocated. + */ + if (dm_has_zone_plugs(md)) return 0; + DMINFO("%s using %s zone append", disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + /* * Our table is not live yet. So the call to dm_get_live_table() * in dm_blk_report_zones() will fail. Set a temporary pointer to * our table for dm_blk_report_zones() to use directly. */ md->zone_revalidate_map = t; + md->revalidate_map_task = current; ret = blk_revalidate_disk_zones(disk); + md->revalidate_map_task = NULL; md->zone_revalidate_map = NULL; if (ret) { DMERR("Revalidate zones failed %d", ret); + disk->nr_zones = nr_zones; return ret; } @@ -337,15 +347,15 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, /* * Check if zone append is natively supported, and if not, set the - * mapped device queue as needing zone append emulation. + * mapped device queue as needing zone append emulation. If zone + * append is natively supported, make sure that + * max_hw_zone_append_sectors is not set to 0. */ WARN_ON_ONCE(queue_is_mq(q)); - if (dm_table_supports_zone_append(t)) { - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - } else { - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + if (!dm_table_supports_zone_append(t)) lim->max_hw_zone_append_sectors = 0; - } + else if (lim->max_hw_zone_append_sectors == 0) + lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors; /* * Determine the max open and max active zone limits for the mapped @@ -380,15 +390,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, lim->max_open_zones = 0; lim->max_active_zones = 0; lim->max_hw_zone_append_sectors = 0; + lim->max_zone_append_sectors = 0; lim->zone_write_granularity = 0; lim->chunk_sectors = 0; lim->features &= ~BLK_FEAT_ZONED; - clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - md->nr_zones = 0; - disk->nr_zones = 0; return 0; } + if (get_capacity(disk) && dm_has_zone_plugs(t->md)) { + if (q->limits.chunk_sectors != lim->chunk_sectors) { + DMWARN("%s: device has zone write plug resources. " + "Cannot change zone size", + disk->disk_name); + return -EINVAL; + } + if (lim->max_hw_zone_append_sectors != 0 && + !dm_table_is_wildcard(t)) { + DMWARN("%s: device has zone write plug resources. " + "New table must emulate zone append", + disk->disk_name); + return -EINVAL; + } + } /* * Warn once (when the capacity is not yet set) if the mapped device is * partially using zone resources of the target devices as that leads to @@ -408,6 +431,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, return 0; } +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim) +{ + struct mapped_device *md = t->md; + + if (lim->features & BLK_FEAT_ZONED) { + if (dm_table_supports_zone_append(t)) + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + else + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + } else { + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + md->nr_zones = 0; + md->disk->nr_zones = 0; + } +} + + /* * IO completion callback called from clone_endio(). */ @@ -423,9 +463,9 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) */ if (clone->bi_status == BLK_STS_OK && bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; + orig_bio->bi_iter.bi_sector += + bdev_offset_from_zone_start(disk->part0, + clone->bi_iter.bi_sector); } return; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6141fc25d842..5da3db06da10 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits) /* * Pass on ioctl to the backend device. */ -static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) +static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev, + unsigned int cmd, unsigned long arg, bool *forward) { struct dmz_target *dmz = ti->private; struct dmz_dev *dev = &dmz->dev[0]; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5ab7574c0c76..1726f0f828cc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -411,7 +411,8 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, - struct block_device **bdev) + struct block_device **bdev, unsigned int cmd, + unsigned long arg, bool *forward) { struct dm_target *ti; struct dm_table *map; @@ -434,8 +435,8 @@ retry: if (dm_suspended_md(md)) return -EAGAIN; - r = ti->type->prepare_ioctl(ti, bdev); - if (r == -ENOTCONN && !fatal_signal_pending(current)) { + r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward); + if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) { dm_put_live_table(md, *srcu_idx); fsleep(10000); goto retry; @@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, { struct mapped_device *md = bdev->bd_disk->private_data; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); - if (r < 0) + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward); + if (!forward || r < 0) goto out; if (r > 0) { @@ -1082,22 +1084,6 @@ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md) return &md->queue->limits; } -void disable_discard(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support DISCARD, disable it */ - limits->max_hw_discard_sectors = 0; -} - -void disable_write_zeroes(struct mapped_device *md) -{ - struct queue_limits *limits = dm_get_queue_limits(md); - - /* device doesn't really support WRITE ZEROES, disable it */ - limits->max_write_zeroes_sectors = 0; -} - static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) { return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); @@ -1115,10 +1101,10 @@ static void clone_endio(struct bio *bio) if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && !bdev_max_discard_sectors(bio->bi_bdev)) - disable_discard(md); + blk_queue_disable_discard(md->queue); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bdev_write_zeroes_sectors(bio->bi_bdev)) - disable_write_zeroes(md); + blk_queue_disable_write_zeroes(md->queue); } if (static_branch_unlikely(&zoned_enabled) && @@ -2421,21 +2407,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct queue_limits *limits) { struct dm_table *old_map; - sector_t size; + sector_t size, old_size; int ret; lockdep_assert_held(&md->suspend_lock); size = dm_table_get_size(t); + old_size = dm_get_size(md); + + if (!dm_table_supports_size_change(t, old_size, size)) { + old_map = ERR_PTR(-EINVAL); + goto out; + } + + set_capacity(md->disk, size); + + ret = dm_table_set_restrictions(t, md->queue, limits); + if (ret) { + set_capacity(md->disk, old_size); + old_map = ERR_PTR(ret); + goto out; + } + /* * Wipe any geometry if the size of the table changed. */ - if (size != dm_get_size(md)) + if (size != old_size) memset(&md->geometry, 0, sizeof(md->geometry)); - set_capacity(md->disk, size); - dm_table_event_callback(t, event_callback, md); if (dm_table_request_based(t)) { @@ -2453,10 +2453,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * requests in the queue may refer to bio from the old bioset, * so you must walk through the queue to unprep. */ - if (!md->mempools) { + if (!md->mempools) md->mempools = t->mempools; - t->mempools = NULL; - } + else + dm_free_md_mempools(t->mempools); } else { /* * The md may already have mempools that need changing. @@ -2465,14 +2465,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, */ dm_free_md_mempools(md->mempools); md->mempools = t->mempools; - t->mempools = NULL; - } - - ret = dm_table_set_restrictions(t, md->queue, limits); - if (ret) { - old_map = ERR_PTR(ret); - goto out; } + t->mempools = NULL; old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); rcu_assign_pointer(md->map, (void *)t); @@ -3638,10 +3632,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; int r, srcu_idx; + bool forward = true; - r = dm_prepare_ioctl(md, &srcu_idx, &bdev); + /* Not a real ioctl, but targets must not interpret non-DM ioctls */ + r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward); if (r < 0) goto out; + WARN_ON_ONCE(!forward); ops = bdev->bd_disk->fops->pr_ops; if (ops && ops->pr_clear) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a0a8ff119815..245f52b59215 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t, void (*fn)(void *), void *context); struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector); bool dm_table_has_no_data_devices(struct dm_table *table); +bool dm_table_is_wildcard(struct dm_table *t); int dm_calculate_queue_limits(struct dm_table *table, struct queue_limits *limits); int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); struct dm_target *dm_table_get_immutable_target(struct dm_table *t); struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); +bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size, + sector_t new_size); void dm_lock_md_type(struct mapped_device *md); void dm_unlock_md_type(struct mapped_device *md); @@ -102,6 +105,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *lim); int dm_revalidate_zones(struct dm_table *t, struct request_queue *q); +void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED int dm_blk_report_zones(struct gendisk *disk, sector_t sector, @@ -110,12 +114,14 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t, sector_t sector, unsigned int nr_zones, unsigned long *need_reset); +#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL) #else #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { return false; } +#define dm_has_zone_plugs(md) false #endif /* diff --git a/drivers/md/md.c b/drivers/md/md.c index 9daa78c5fe33..0fde115e921f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* Default safemode delay: 200 msec */ #define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* - * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' - * is 1000 KB/sec, so the extra system load does not show up that much. - * Increase it if you want to have more _guaranteed_ speed. Note that - * the RAID driver will use the maximum available bandwidth if the IO - * subsystem is idle. There is also an 'absolute maximum' reconstruction - * speed limit - in case reconstruction slows down your system despite - * idle IO detection. + * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit' + * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load + * does not show up that much. Increase it if you want to have more guaranteed + * speed. Note that the RAID driver will use the maximum bandwidth + * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle. * - * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. - * or /sys/block/mdX/md/sync_speed_{min,max} + * Background sync IO speed control: + * + * - below speed min: + * no limit; + * - above speed min and below speed max: + * a) if mddev is idle, then no limit; + * b) if mddev is busy handling normal IO, then limit inflight sync IO + * to sync_io_depth; + * - above speed max: + * sync IO can't be issued; + * + * Following configurations can be changed via /proc/sys/dev/raid/ for system + * or /sys/block/mdX/md/ for one array. */ - static int sysctl_speed_limit_min = 1000; static int sysctl_speed_limit_max = 200000; -static inline int speed_min(struct mddev *mddev) +static int sysctl_sync_io_depth = 32; + +static int speed_min(struct mddev *mddev) { return mddev->sync_speed_min ? mddev->sync_speed_min : sysctl_speed_limit_min; } -static inline int speed_max(struct mddev *mddev) +static int speed_max(struct mddev *mddev) { return mddev->sync_speed_max ? mddev->sync_speed_max : sysctl_speed_limit_max; } +static int sync_io_depth(struct mddev *mddev) +{ + return mddev->sync_io_depth ? + mddev->sync_io_depth : sysctl_sync_io_depth; +} + static void rdev_uninit_serial(struct md_rdev *rdev) { if (!test_and_clear_bit(CollisionCheck, &rdev->flags)) @@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = { .procname = "speed_limit_min", .data = &sysctl_speed_limit_min, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "speed_limit_max", .data = &sysctl_speed_limit_max, .maxlen = sizeof(int), - .mode = S_IRUGO|S_IWUSR, + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_io_depth", + .data = &sysctl_sync_io_depth, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_dointvec, }, }; @@ -5091,7 +5114,7 @@ static ssize_t sync_min_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_min(mddev), - mddev->sync_speed_min ? "local": "system"); + mddev->sync_speed_min ? "local" : "system"); } static ssize_t @@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len) unsigned int min; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { min = 0; } else { rv = kstrtouint(buf, 10, &min); @@ -5120,7 +5143,7 @@ static ssize_t sync_max_show(struct mddev *mddev, char *page) { return sprintf(page, "%d (%s)\n", speed_max(mddev), - mddev->sync_speed_max ? "local": "system"); + mddev->sync_speed_max ? "local" : "system"); } static ssize_t @@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len) unsigned int max; int rv; - if (strncmp(buf, "system", 6)==0) { + if (strncmp(buf, "system", 6) == 0) { max = 0; } else { rv = kstrtouint(buf, 10, &max); @@ -5146,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max = __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); static ssize_t +sync_io_depth_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%d (%s)\n", sync_io_depth(mddev), + mddev->sync_io_depth ? "local" : "system"); +} + +static ssize_t +sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned int max; + int rv; + + if (strncmp(buf, "system", 6) == 0) { + max = 0; + } else { + rv = kstrtouint(buf, 10, &max); + if (rv < 0) + return rv; + if (max == 0) + return -EINVAL; + } + mddev->sync_io_depth = max; + return len; +} + +static struct md_sysfs_entry md_sync_io_depth = +__ATTR_RW(sync_io_depth); + +static ssize_t degraded_show(struct mddev *mddev, char *page) { return sprintf(page, "%d\n", mddev->degraded); @@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_mismatches.attr, &md_sync_min.attr, &md_sync_max.attr, + &md_sync_io_depth.attr, &md_sync_speed.attr, &md_sync_force_parallel.attr, &md_sync_completed.attr, @@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev) put_cluster_ops(mddev); } -static int is_mddev_idle(struct mddev *mddev, int init) +static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init) { + unsigned long last_events = rdev->last_events; + + if (!bdev_is_partition(rdev->bdev)) + return true; + + /* + * If rdev is partition, and user doesn't issue IO to the array, the + * array is still not idle if user issues IO to other partitions. + */ + rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0, + sectors) - + part_stat_read_accum(rdev->bdev, sectors); + + return init || rdev->last_events <= last_events; +} + +/* + * mddev is idle if following conditions are matched since last check: + * 1) mddev doesn't have normal IO completed; + * 2) mddev doesn't have inflight normal IO; + * 3) if any member disk is partition, and other partitions don't have IO + * completed; + * + * Noted this checking rely on IO accounting is enabled. + */ +static bool is_mddev_idle(struct mddev *mddev, int init) +{ + unsigned long last_events = mddev->normal_io_events; + struct gendisk *disk; struct md_rdev *rdev; - int idle; - int curr_events; + bool idle = true; - idle = 1; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) { - struct gendisk *disk = rdev->bdev->bd_disk; + disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk; + if (!disk) + return true; - if (!init && !blk_queue_io_stat(disk->queue)) - continue; + mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors); + if (!init && (mddev->normal_io_events > last_events || + bdev_count_inflight(disk->part0))) + idle = false; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&disk->sync_io); - /* sync IO will cause sync_io to increase before the disk_stats - * as sync_io is counted when a request starts, and - * disk_stats is counted when it completes. - * So resync activity will cause curr_events to be smaller than - * when there was no such activity. - * non-sync IO will cause disk_stat to increase without - * increasing sync_io so curr_events will (eventually) - * be larger than it was before. Once it becomes - * substantially larger, the test below will cause - * the array to appear non-idle, and resync will slow - * down. - * If there is a lot of outstanding resync activity when - * we set last_event to curr_events, then all that activity - * completing might cause the array to appear non-idle - * and resync will be slowed down even though there might - * not have been non-resync activity. This will only - * happen once though. 'last_events' will soon reflect - * the state where there is little or no outstanding - * resync requests, and further resync activity will - * always make curr_events less than last_events. - * - */ - if (init || curr_events - rdev->last_events > 64) { - rdev->last_events = curr_events; - idle = 0; - } - } + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (!is_rdev_holder_idle(rdev, init)) + idle = false; rcu_read_unlock(); + return idle; } @@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action) } } +static bool sync_io_within_limit(struct mddev *mddev) +{ + int io_sectors; + + /* + * For raid456, sync IO is stripe(4k) per IO, for other levels, it's + * RESYNC_PAGES(64k) per IO. + */ + if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6) + io_sectors = 8; + else + io_sectors = 128; + + return atomic_read(&mddev->recovery_active) < + io_sectors * sync_io_depth(mddev); +} + #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) #define UPDATE_FREQUENCY (5*60*HZ) @@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread) msleep(500); goto repeat; } - if (!is_mddev_idle(mddev, 0)) { + if (!sync_io_within_limit(mddev) && + !is_mddev_idle(mddev, 0)) { /* * Give other IO more of a chance. * The faster the devices, the less we wait. diff --git a/drivers/md/md.h b/drivers/md/md.h index 1cf00a04bcdd..d45a9e6ead80 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -132,7 +132,7 @@ struct md_rdev { sector_t sectors; /* Device size (in 512bytes sectors) */ struct mddev *mddev; /* RAID array if running */ - int last_events; /* IO event timestamp */ + unsigned long last_events; /* IO event timestamp */ /* * If meta_bdev is non-NULL, it means that a separate device is @@ -404,7 +404,8 @@ struct mddev { * are happening, so run/ * takeover/stop are not safe */ - struct gendisk *gendisk; + struct gendisk *gendisk; /* mdraid gendisk */ + struct gendisk *dm_gendisk; /* dm-raid gendisk */ struct kobject kobj; int hold_active; @@ -483,6 +484,7 @@ struct mddev { /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; + int sync_io_depth; /* resync even though the same disks are shared among md-devices */ int parallel_resync; @@ -518,6 +520,7 @@ struct mddev { * adding a spare */ + unsigned long normal_io_events; /* IO event timestamp */ atomic_t recovery_active; /* blocks scheduled, but not written */ wait_queue_head_t recovery_wait; sector_t recovery_cp; @@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev) } extern void mddev_unlock(struct mddev *mddev); -static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) -{ - if (blk_queue_io_stat(bdev->bd_disk->queue)) - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); -} - -static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) -{ - md_sync_acct(bio->bi_bdev, nr_sectors); -} - struct md_personality { struct md_submodule_head head; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index de9bccbe7337..657d481525be 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) wbio->bi_end_io = end_sync_write; atomic_inc(&r1_bio->remaining); - md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } @@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r1_bio->bios[i]; if (bio->bi_end_io == end_sync_read) { read_targets--; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); @@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, } else { atomic_set(&r1_bio->remaining, 1); bio = r1_bio->bios[r1_bio->read_disk]; - md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; submit_bio_noacct(bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ba32bac975b8..dce06bf65016 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&conf->mirrors[d].rdev->nr_pending); atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio)); if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) tbio->bi_opf |= MD_FAILFAST; @@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) bio_copy_data(tbio, fbio); d = r10_bio->devs[i].devnum; atomic_inc(&r10_bio->remaining); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(tbio)); submit_bio_noacct(tbio); } @@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) d = r10_bio->devs[1].devnum; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); - md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); submit_bio_noacct(wbio); } if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); - md_sync_acct(conf->mirrors[d].replacement->bdev, - bio_sectors(wbio2)); submit_bio_noacct(wbio2); } } @@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->sectors = nr_sectors; if (bio->bi_end_io == end_sync_read) { - md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; submit_bio_noacct(bio); } @@ -4880,7 +4873,6 @@ read_more: r10_bio->sectors = nr_sectors; /* Now submit the read */ - md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; submit_bio_noacct(read_bio); @@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) continue; atomic_inc(&rdev->nr_pending); - md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; submit_bio_noacct(b); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6389383166c0..ca5b0e8ba707 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1240,10 +1240,6 @@ again: } if (rdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags); @@ -1300,10 +1296,6 @@ again: submit_bio_noacct(bi); } if (rrdev) { - if (s->syncing || s->expanding || s->expanded - || s->replacing) - md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); - set_bit(STRIPE_IO_STARTED, &sh->state); bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags); |