summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/btree.c3
-rw-r--r--drivers/md/bcache/super.c3
-rw-r--r--drivers/md/dm-bufio.c200
-rw-r--r--drivers/md/dm-core.h4
-rw-r--r--drivers/md/dm-delay.c17
-rw-r--r--drivers/md/dm-dust.c4
-rw-r--r--drivers/md/dm-ebs-target.c3
-rw-r--r--drivers/md/dm-flakey.c118
-rw-r--r--drivers/md/dm-integrity.c18
-rw-r--r--drivers/md/dm-ioctl.c1
-rw-r--r--drivers/md/dm-linear.c4
-rw-r--r--drivers/md/dm-log-writes.c4
-rw-r--r--drivers/md/dm-mpath.c243
-rw-r--r--drivers/md/dm-raid.c3
-rw-r--r--drivers/md/dm-raid1.c5
-rw-r--r--drivers/md/dm-rq.c4
-rw-r--r--drivers/md/dm-stripe.c5
-rw-r--r--drivers/md/dm-switch.c4
-rw-r--r--drivers/md/dm-table.c274
-rw-r--r--drivers/md/dm-vdo/indexer/volume.c24
-rw-r--r--drivers/md/dm-verity-fec.c4
-rw-r--r--drivers/md/dm-verity-target.c15
-rw-r--r--drivers/md/dm-verity-verify-sig.c17
-rw-r--r--drivers/md/dm-zone.c98
-rw-r--r--drivers/md/dm-zoned-target.c3
-rw-r--r--drivers/md/dm.c73
-rw-r--r--drivers/md/dm.h6
-rw-r--r--drivers/md/md.c190
-rw-r--r--drivers/md/md.h18
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c8
32 files changed, 884 insertions, 501 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ed40d8600656..2cc2eb24dc8a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -36,6 +36,7 @@
#include <linux/sched/clock.h>
#include <linux/rculist.h>
#include <linux/delay.h>
+#include <linux/sort.h>
#include <trace/events/bcache.h>
/*
@@ -559,8 +560,6 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
}
}
-#define cmp_int(l, r) ((l > r) - (l < r))
-
#ifdef CONFIG_PROVE_LOCKING
static int btree_lock_cmp_fn(const struct lockdep_map *_a,
const struct lockdep_map *_b)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index cc1de925111c..1a2ce1a4b456 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 9c8ed65cd87e..ec84ba5e93e5 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -41,16 +41,6 @@
#define DM_BUFIO_LOW_WATERMARK_RATIO 16
/*
- * Check buffer ages in this interval (seconds)
- */
-#define DM_BUFIO_WORK_TIMER_SECS 30
-
-/*
- * Free buffers when they are older than this (seconds)
- */
-#define DM_BUFIO_DEFAULT_AGE_SECS 300
-
-/*
* The nr of bytes of cached data to keep around.
*/
#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
@@ -68,6 +58,8 @@
#define LIST_DIRTY 1
#define LIST_SIZE 2
+#define SCAN_RESCHED_CYCLE 16
+
/*--------------------------------------------------------------*/
/*
@@ -1055,10 +1047,8 @@ static unsigned long dm_bufio_cache_size_latch;
static DEFINE_SPINLOCK(global_spinlock);
-/*
- * Buffers are freed after this timeout
- */
-static unsigned int dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned int dm_bufio_max_age; /* No longer does anything */
+
static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
static unsigned long dm_bufio_peak_allocated;
@@ -1086,7 +1076,6 @@ static LIST_HEAD(dm_bufio_all_clients);
static DEFINE_MUTEX(dm_bufio_clients_lock);
static struct workqueue_struct *dm_bufio_wq;
-static struct delayed_work dm_bufio_cleanup_old_work;
static struct work_struct dm_bufio_replacement_work;
@@ -1362,7 +1351,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
+ bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}
@@ -2424,7 +2413,12 @@ static void __scan(struct dm_bufio_client *c)
atomic_long_dec(&c->need_shrink);
freed++;
- cond_resched();
+
+ if (unlikely(freed % SCAN_RESCHED_CYCLE == 0)) {
+ dm_bufio_unlock(c);
+ cond_resched();
+ dm_bufio_lock(c);
+ }
}
}
}
@@ -2673,130 +2667,6 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
/*--------------------------------------------------------------*/
-static unsigned int get_max_age_hz(void)
-{
- unsigned int max_age = READ_ONCE(dm_bufio_max_age);
-
- if (max_age > UINT_MAX / HZ)
- max_age = UINT_MAX / HZ;
-
- return max_age * HZ;
-}
-
-static bool older_than(struct dm_buffer *b, unsigned long age_hz)
-{
- return time_after_eq(jiffies, READ_ONCE(b->last_accessed) + age_hz);
-}
-
-struct evict_params {
- gfp_t gfp;
- unsigned long age_hz;
-
- /*
- * This gets updated with the largest last_accessed (ie. most
- * recently used) of the evicted buffers. It will not be reinitialised
- * by __evict_many(), so you can use it across multiple invocations.
- */
- unsigned long last_accessed;
-};
-
-/*
- * We may not be able to evict this buffer if IO pending or the client
- * is still using it.
- *
- * And if GFP_NOFS is used, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
- * rerouted to different bufio client.
- */
-static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
-{
- struct evict_params *params = context;
-
- if (!(params->gfp & __GFP_FS) ||
- (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep)) {
- if (test_bit_acquire(B_READING, &b->state) ||
- test_bit(B_WRITING, &b->state) ||
- test_bit(B_DIRTY, &b->state))
- return ER_DONT_EVICT;
- }
-
- return older_than(b, params->age_hz) ? ER_EVICT : ER_STOP;
-}
-
-static unsigned long __evict_many(struct dm_bufio_client *c,
- struct evict_params *params,
- int list_mode, unsigned long max_count)
-{
- unsigned long count;
- unsigned long last_accessed;
- struct dm_buffer *b;
-
- for (count = 0; count < max_count; count++) {
- b = cache_evict(&c->cache, list_mode, select_for_evict, params);
- if (!b)
- break;
-
- last_accessed = READ_ONCE(b->last_accessed);
- if (time_after_eq(params->last_accessed, last_accessed))
- params->last_accessed = last_accessed;
-
- __make_buffer_clean(b);
- __free_buffer_wake(b);
-
- cond_resched();
- }
-
- return count;
-}
-
-static void evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
-{
- struct evict_params params = {.gfp = 0, .age_hz = age_hz, .last_accessed = 0};
- unsigned long retain = get_retain_buffers(c);
- unsigned long count;
- LIST_HEAD(write_list);
-
- dm_bufio_lock(c);
-
- __check_watermark(c, &write_list);
- if (unlikely(!list_empty(&write_list))) {
- dm_bufio_unlock(c);
- __flush_write_list(&write_list);
- dm_bufio_lock(c);
- }
-
- count = cache_total(&c->cache);
- if (count > retain)
- __evict_many(c, &params, LIST_CLEAN, count - retain);
-
- dm_bufio_unlock(c);
-}
-
-static void cleanup_old_buffers(void)
-{
- unsigned long max_age_hz = get_max_age_hz();
- struct dm_bufio_client *c;
-
- mutex_lock(&dm_bufio_clients_lock);
-
- __cache_size_refresh();
-
- list_for_each_entry(c, &dm_bufio_all_clients, client_list)
- evict_old_buffers(c, max_age_hz);
-
- mutex_unlock(&dm_bufio_clients_lock);
-}
-
-static void work_fn(struct work_struct *w)
-{
- cleanup_old_buffers();
-
- queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
- DM_BUFIO_WORK_TIMER_SECS * HZ);
-}
-
-/*--------------------------------------------------------------*/
-
/*
* Global cleanup tries to evict the oldest buffers from across _all_
* the clients. It does this by repeatedly evicting a few buffers from
@@ -2834,27 +2704,51 @@ static void __insert_client(struct dm_bufio_client *new_client)
list_add_tail(&new_client->client_list, h);
}
+static enum evict_result select_for_evict(struct dm_buffer *b, void *context)
+{
+ /* In no-sleep mode, we cannot wait on IO. */
+ if (static_branch_unlikely(&no_sleep_enabled) && b->c->no_sleep) {
+ if (test_bit_acquire(B_READING, &b->state) ||
+ test_bit(B_WRITING, &b->state) ||
+ test_bit(B_DIRTY, &b->state))
+ return ER_DONT_EVICT;
+ }
+ return ER_EVICT;
+}
+
static unsigned long __evict_a_few(unsigned long nr_buffers)
{
- unsigned long count;
struct dm_bufio_client *c;
- struct evict_params params = {
- .gfp = GFP_KERNEL,
- .age_hz = 0,
- /* set to jiffies in case there are no buffers in this client */
- .last_accessed = jiffies
- };
+ unsigned long oldest_buffer = jiffies;
+ unsigned long last_accessed;
+ unsigned long count;
+ struct dm_buffer *b;
c = __pop_client();
if (!c)
return 0;
dm_bufio_lock(c);
- count = __evict_many(c, &params, LIST_CLEAN, nr_buffers);
+
+ for (count = 0; count < nr_buffers; count++) {
+ b = cache_evict(&c->cache, LIST_CLEAN, select_for_evict, NULL);
+ if (!b)
+ break;
+
+ last_accessed = READ_ONCE(b->last_accessed);
+ if (time_after_eq(oldest_buffer, last_accessed))
+ oldest_buffer = last_accessed;
+
+ __make_buffer_clean(b);
+ __free_buffer_wake(b);
+
+ cond_resched();
+ }
+
dm_bufio_unlock(c);
if (count)
- c->oldest_buffer = params.last_accessed;
+ c->oldest_buffer = oldest_buffer;
__insert_client(c);
return count;
@@ -2937,10 +2831,7 @@ static int __init dm_bufio_init(void)
if (!dm_bufio_wq)
return -ENOMEM;
- INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
- queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
- DM_BUFIO_WORK_TIMER_SECS * HZ);
return 0;
}
@@ -2952,7 +2843,6 @@ static void __exit dm_bufio_exit(void)
{
int bug = 0;
- cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
destroy_workqueue(dm_bufio_wq);
if (dm_bufio_client_count) {
@@ -2989,7 +2879,7 @@ module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, 0644);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
module_param_named(max_age_seconds, dm_bufio_max_age, uint, 0644);
-MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
+MODULE_PARM_DESC(max_age_seconds, "No longer does anything");
module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, 0644);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3637761f3585..c889332e533b 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -141,6 +141,7 @@ struct mapped_device {
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
void *zone_revalidate_map;
+ struct task_struct *revalidate_map_task;
#endif
#ifdef CONFIG_IMA
@@ -162,9 +163,6 @@ struct mapped_device {
#define DMF_POST_SUSPENDING 8
#define DMF_EMULATE_ZONE_APPEND 9
-void disable_discard(struct mapped_device *md);
-void disable_write_zeroes(struct mapped_device *md);
-
static inline sector_t dm_get_size(struct mapped_device *md)
{
return get_capacity(md->disk);
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index d4cf0ac2a7aa..16d3d454fb0a 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -14,11 +14,14 @@
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <linux/device-mapper.h>
#define DM_MSG_PREFIX "delay"
+#define SLEEP_SHIFT 3
+
struct delay_class {
struct dm_dev *dev;
sector_t start;
@@ -34,6 +37,7 @@ struct delay_c {
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
struct task_struct *worker;
+ unsigned int worker_sleep_us;
bool may_delay;
struct delay_class read;
@@ -136,6 +140,7 @@ static int flush_worker_fn(void *data)
schedule();
} else {
spin_unlock(&dc->delayed_bios_lock);
+ fsleep(dc->worker_sleep_us);
cond_resched();
}
}
@@ -212,7 +217,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
int ret;
- unsigned int max_delay;
+ unsigned int max_delay, min_delay;
if (argc != 3 && argc != 6 && argc != 9) {
ti->error = "Requires exactly 3, 6 or 9 arguments";
@@ -235,7 +240,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ret = delay_class_ctr(ti, &dc->read, argv);
if (ret)
goto bad;
- max_delay = dc->read.delay;
+ min_delay = max_delay = dc->read.delay;
if (argc == 3) {
ret = delay_class_ctr(ti, &dc->write, argv);
@@ -251,6 +256,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
max_delay = max(max_delay, dc->write.delay);
+ min_delay = min_not_zero(min_delay, dc->write.delay);
if (argc == 6) {
ret = delay_class_ctr(ti, &dc->flush, argv + 3);
@@ -263,9 +269,14 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (ret)
goto bad;
max_delay = max(max_delay, dc->flush.delay);
+ min_delay = min_not_zero(min_delay, dc->flush.delay);
out:
if (max_delay < 50) {
+ if (min_delay >> SLEEP_SHIFT)
+ dc->worker_sleep_us = 1000;
+ else
+ dc->worker_sleep_us = (min_delay * 1000) >> SLEEP_SHIFT;
/*
* In case of small requested delays, use kthread instead of
* timers and workqueue to achieve better latency.
@@ -438,7 +449,7 @@ out:
static struct target_type delay_target = {
.name = "delay",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
.features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = delay_ctr,
diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c
index 1a33820c9f46..e75310232bbf 100644
--- a/drivers/md/dm-dust.c
+++ b/drivers/md/dm-dust.c
@@ -534,7 +534,9 @@ static void dust_status(struct dm_target *ti, status_type_t type,
}
}
-static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int dust_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct dust_device *dd = ti->private;
struct dm_dev *dev = dd->dev;
diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c
index b19b0142a690..6abb31ca9662 100644
--- a/drivers/md/dm-ebs-target.c
+++ b/drivers/md/dm-ebs-target.c
@@ -415,7 +415,8 @@ static void ebs_status(struct dm_target *ti, status_type_t type,
}
}
-static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int ebs_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg, bool *forward)
{
struct ebs_c *ec = ti->private;
struct dm_dev *dev = ec->dev;
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b690905ab89f..c711db6f8f5c 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -47,14 +47,15 @@ enum feature_flag_bits {
};
struct per_bio_data {
- bool bio_submitted;
+ bool bio_can_corrupt;
+ struct bvec_iter saved_iter;
};
static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
struct dm_target *ti)
{
- int r;
- unsigned int argc;
+ int r = 0;
+ unsigned int argc = 0;
const char *arg_name;
static const struct dm_arg _args[] = {
@@ -65,14 +66,13 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
{0, PROBABILITY_BASE, "Invalid random corrupt argument"},
};
- /* No feature arguments supplied. */
- if (!as->argc)
- return 0;
-
- r = dm_read_arg_group(_args, as, &argc, &ti->error);
- if (r)
+ if (as->argc && (r = dm_read_arg_group(_args, as, &argc, &ti->error)))
return r;
+ /* No feature arguments supplied. */
+ if (!argc)
+ goto error_all_io;
+
while (argc) {
arg_name = dm_shift_arg(as);
argc--;
@@ -128,8 +128,11 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
* corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
*/
if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
- if (!argc) {
- ti->error = "Feature corrupt_bio_byte requires parameters";
+ if (fc->corrupt_bio_byte) {
+ ti->error = "Feature corrupt_bio_byte duplicated";
+ return -EINVAL;
+ } else if (argc < 4) {
+ ti->error = "Feature corrupt_bio_byte requires 4 parameters";
return -EINVAL;
}
@@ -176,7 +179,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
}
if (!strcasecmp(arg_name, "random_read_corrupt")) {
- if (!argc) {
+ if (fc->random_read_corrupt) {
+ ti->error = "Feature random_read_corrupt duplicated";
+ return -EINVAL;
+ } else if (!argc) {
ti->error = "Feature random_read_corrupt requires a parameter";
return -EINVAL;
}
@@ -189,7 +195,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
}
if (!strcasecmp(arg_name, "random_write_corrupt")) {
- if (!argc) {
+ if (fc->random_write_corrupt) {
+ ti->error = "Feature random_write_corrupt duplicated";
+ return -EINVAL;
+ } else if (!argc) {
ti->error = "Feature random_write_corrupt requires a parameter";
return -EINVAL;
}
@@ -205,18 +214,25 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
return -EINVAL;
}
- if (test_bit(DROP_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
- ti->error = "drop_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ if (test_bit(DROP_WRITES, &fc->flags) &&
+ (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) {
+ ti->error = "drop_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set";
return -EINVAL;
- } else if (test_bit(ERROR_WRITES, &fc->flags) && (fc->corrupt_bio_rw == WRITE)) {
- ti->error = "error_writes is incompatible with corrupt_bio_byte with the WRITE flag set";
+ } else if (test_bit(ERROR_WRITES, &fc->flags) &&
+ (fc->corrupt_bio_rw == WRITE || fc->random_write_corrupt)) {
+ ti->error = "error_writes is incompatible with random_write_corrupt or corrupt_bio_byte with the WRITE flag set";
+ return -EINVAL;
+ } else if (test_bit(ERROR_READS, &fc->flags) &&
+ (fc->corrupt_bio_rw == READ || fc->random_read_corrupt)) {
+ ti->error = "error_reads is incompatible with random_read_corrupt or corrupt_bio_byte with the READ flag set";
return -EINVAL;
}
if (!fc->corrupt_bio_byte && !test_bit(ERROR_READS, &fc->flags) &&
!test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags) &&
!fc->random_read_corrupt && !fc->random_write_corrupt) {
+error_all_io:
set_bit(ERROR_WRITES, &fc->flags);
set_bit(ERROR_READS, &fc->flags);
}
@@ -278,7 +294,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- r = dm_read_arg(_args, &as, &fc->down_interval, &ti->error);
+ r = dm_read_arg(_args + 1, &as, &fc->down_interval, &ti->error);
if (r)
goto bad;
@@ -339,7 +355,8 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
}
static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
- unsigned char corrupt_bio_value)
+ unsigned char corrupt_bio_value,
+ struct bvec_iter start)
{
struct bvec_iter iter;
struct bio_vec bvec;
@@ -348,7 +365,7 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
* Overwrite the Nth byte of the bio's data, on whichever page
* it falls.
*/
- bio_for_each_segment(bvec, bio, iter) {
+ __bio_for_each_segment(bvec, bio, iter, start) {
if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
unsigned char *segment = bvec_kmap_local(&bvec);
segment[corrupt_bio_byte] = corrupt_bio_value;
@@ -357,36 +374,31 @@ static void corrupt_bio_common(struct bio *bio, unsigned int corrupt_bio_byte,
"(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
bio, corrupt_bio_value, corrupt_bio_byte,
(bio_data_dir(bio) == WRITE) ? 'w' : 'r', bio->bi_opf,
- (unsigned long long)bio->bi_iter.bi_sector,
- bio->bi_iter.bi_size);
+ (unsigned long long)start.bi_sector,
+ start.bi_size);
break;
}
corrupt_bio_byte -= bio_iter_len(bio, iter);
}
}
-static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
+static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc,
+ struct bvec_iter start)
{
unsigned int corrupt_bio_byte = fc->corrupt_bio_byte - 1;
- if (!bio_has_data(bio))
- return;
-
- corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value);
+ corrupt_bio_common(bio, corrupt_bio_byte, fc->corrupt_bio_value, start);
}
-static void corrupt_bio_random(struct bio *bio)
+static void corrupt_bio_random(struct bio *bio, struct bvec_iter start)
{
unsigned int corrupt_byte;
unsigned char corrupt_value;
- if (!bio_has_data(bio))
- return;
-
- corrupt_byte = get_random_u32() % bio->bi_iter.bi_size;
+ corrupt_byte = get_random_u32() % start.bi_size;
corrupt_value = get_random_u8();
- corrupt_bio_common(bio, corrupt_byte, corrupt_value);
+ corrupt_bio_common(bio, corrupt_byte, corrupt_value, start);
}
static void clone_free(struct bio *clone)
@@ -481,7 +493,7 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
unsigned int elapsed;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
- pb->bio_submitted = false;
+ pb->bio_can_corrupt = false;
if (op_is_zone_mgmt(bio_op(bio)))
goto map_bio;
@@ -490,14 +502,15 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
bool corrupt_fixed, corrupt_random;
- /*
- * Flag this bio as submitted while down.
- */
- pb->bio_submitted = true;
+
+ if (bio_has_data(bio)) {
+ pb->bio_can_corrupt = true;
+ pb->saved_iter = bio->bi_iter;
+ }
/*
- * Error reads if neither corrupt_bio_byte or drop_writes or error_writes are set.
- * Otherwise, flakey_end_io() will decide if the reads should be modified.
+ * If ERROR_READS isn't set flakey_end_io() will decide if the
+ * reads should be modified.
*/
if (bio_data_dir(bio) == READ) {
if (test_bit(ERROR_READS, &fc->flags))
@@ -516,6 +529,8 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ if (!pb->bio_can_corrupt)
+ goto map_bio;
/*
* Corrupt matching writes.
*/
@@ -535,9 +550,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct bio *clone = clone_bio(ti, fc, bio);
if (clone) {
if (corrupt_fixed)
- corrupt_bio_data(clone, fc);
+ corrupt_bio_data(clone, fc,
+ clone->bi_iter);
if (corrupt_random)
- corrupt_bio_random(clone);
+ corrupt_bio_random(clone,
+ clone->bi_iter);
submit_bio(clone);
return DM_MAPIO_SUBMITTED;
}
@@ -559,28 +576,21 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
if (op_is_zone_mgmt(bio_op(bio)))
return DM_ENDIO_DONE;
- if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
+ if (!*error && pb->bio_can_corrupt && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte) {
if ((fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
/*
* Corrupt successful matching READs while in down state.
*/
- corrupt_bio_data(bio, fc);
+ corrupt_bio_data(bio, fc, pb->saved_iter);
}
}
if (fc->random_read_corrupt) {
u64 rnd = get_random_u64();
u32 rem = do_div(rnd, PROBABILITY_BASE);
if (rem < fc->random_read_corrupt)
- corrupt_bio_random(bio);
- }
- if (test_bit(ERROR_READS, &fc->flags)) {
- /*
- * Error read during the down_interval if drop_writes
- * and error_writes were not configured.
- */
- *error = BLK_STS_IOERR;
+ corrupt_bio_random(bio, pb->saved_iter);
}
}
@@ -638,7 +648,9 @@ static void flakey_status(struct dm_target *ti, status_type_t type,
}
}
-static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct flakey_c *fc = ti->private;
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 2a283feb3319..1f626066e8cc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
-
- r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
- if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
- bio_put(outgoing_bio);
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return;
- }
+ bio_add_virt_nofail(outgoing_bio, outgoing_data,
+ ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@@ -3211,7 +3205,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@@ -3228,7 +3223,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {
@@ -5164,7 +5160,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
BUG_ON(!list_empty(&ic->wait_list));
- if (ic->mode == 'B')
+ if (ic->mode == 'B' && ic->bitmap_flush_work.work.func)
cancel_delayed_work_sync(&ic->bitmap_flush_work);
if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq);
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index d42eac944eb5..4165fef4c170 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1885,6 +1885,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
{DM_DEV_ARM_POLL_CMD, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
{DM_GET_TARGET_VERSION_CMD, 0, get_target_version},
+ {DM_MPATH_PROBE_PATHS_CMD, 0, NULL}, /* block device ioctl */
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 66318aba4bdb..15538ec58f8e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -119,7 +119,9 @@ static void linear_status(struct dm_target *ti, status_type_t type,
}
}
-static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct linear_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 8d7df8303d0a..d484e8e1d48a 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -818,7 +818,9 @@ static void log_writes_status(struct dm_target *ti, status_type_t type,
}
static int log_writes_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev)
+ struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct log_writes_c *lc = ti->private;
struct dm_dev *dev = lc->dev;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6c98f4ae5ea9..81fec2e1e0ef 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -79,6 +79,7 @@ struct multipath {
struct pgpath *current_pgpath;
struct priority_group *current_pg;
struct priority_group *next_pg; /* Switch to this PG if set */
+ struct priority_group *last_probed_pg;
atomic_t nr_valid_paths; /* Total number of usable paths */
unsigned int nr_priority_groups;
@@ -87,6 +88,7 @@ struct multipath {
const char *hw_handler_name;
char *hw_handler_params;
wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */
+ wait_queue_head_t probe_wait; /* Wait for probing paths */
unsigned int pg_init_retries; /* Number of times to retry pg_init */
unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */
atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */
@@ -100,6 +102,7 @@ struct multipath {
struct bio_list queued_bios;
struct timer_list nopath_timer; /* Timeout for queue_if_no_path */
+ bool is_suspending;
};
/*
@@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t);
#define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */
#define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */
#define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */
+#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */
+#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */
static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m)
{
@@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m)
atomic_set(&m->pg_init_count, 0);
m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT;
init_waitqueue_head(&m->pg_init_wait);
+ init_waitqueue_head(&m->probe_wait);
return 0;
}
@@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
goto failed;
}
+ /* Don't change PG until it has no remaining paths */
+ pg = READ_ONCE(m->current_pg);
+ if (pg) {
+ pgpath = choose_path_in_pg(m, pg, nr_bytes);
+ if (!IS_ERR_OR_NULL(pgpath))
+ return pgpath;
+ }
+
/* Were we instructed to switch PG? */
if (READ_ONCE(m->next_pg)) {
spin_lock_irqsave(&m->lock, flags);
pg = m->next_pg;
if (!pg) {
spin_unlock_irqrestore(&m->lock, flags);
- goto check_current_pg;
+ goto check_all_pgs;
}
m->next_pg = NULL;
spin_unlock_irqrestore(&m->lock, flags);
@@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
if (!IS_ERR_OR_NULL(pgpath))
return pgpath;
}
-
- /* Don't change PG until it has no remaining paths */
-check_current_pg:
- pg = READ_ONCE(m->current_pg);
- if (pg) {
- pgpath = choose_path_in_pg(m, pg, nr_bytes);
- if (!IS_ERR_OR_NULL(pgpath))
- return pgpath;
- }
-
+check_all_pgs:
/*
* Loop through priority groups until we find a valid path.
* First time we skip PGs marked 'bypassed'.
@@ -612,7 +617,6 @@ static void multipath_queue_bio(struct multipath *m, struct bio *bio)
static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
{
struct pgpath *pgpath;
- unsigned long flags;
/* Do we need to select a new pgpath? */
pgpath = READ_ONCE(m->current_pgpath);
@@ -620,12 +624,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio)
pgpath = choose_pgpath(m, bio->bi_iter.bi_size);
if (!pgpath) {
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
__multipath_queue_bio(m, bio);
pgpath = ERR_PTR(-EAGAIN);
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
} else if (mpath_double_check_test_bit(MPATHF_QUEUE_IO, m) ||
mpath_double_check_test_bit(MPATHF_PG_INIT_REQUIRED, m)) {
@@ -688,7 +692,6 @@ static void process_queued_io_list(struct multipath *m)
static void process_queued_bios(struct work_struct *work)
{
int r;
- unsigned long flags;
struct bio *bio;
struct bio_list bios;
struct blk_plug plug;
@@ -697,16 +700,16 @@ static void process_queued_bios(struct work_struct *work)
bio_list_init(&bios);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (bio_list_empty(&m->queued_bios)) {
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
return;
}
bio_list_merge_init(&bios, &m->queued_bios);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
blk_start_plug(&plug);
while ((bio = bio_list_pop(&bios))) {
@@ -1190,7 +1193,6 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv)
struct dm_arg_set as;
unsigned int pg_count = 0;
unsigned int next_pg_num;
- unsigned long flags;
as.argc = argc;
as.argv = argv;
@@ -1255,9 +1257,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
enable_nopath_timeout(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
@@ -1292,23 +1294,21 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
static void flush_multipath_work(struct multipath *m)
{
if (m->hw_handler_name) {
- unsigned long flags;
-
if (!atomic_read(&m->pg_init_in_progress))
goto skip;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (atomic_read(&m->pg_init_in_progress) &&
!test_and_set_bit(MPATHF_PG_INIT_DISABLED, &m->flags)) {
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
clear_bit(MPATHF_PG_INIT_DISABLED, &m->flags);
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
skip:
if (m->queue_mode == DM_TYPE_BIO_BASED)
@@ -1370,11 +1370,10 @@ out:
static int reinstate_path(struct pgpath *pgpath)
{
int r = 0, run_queue = 0;
- unsigned long flags;
struct multipath *m = pgpath->pg->m;
unsigned int nr_valid_paths;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (pgpath->is_active)
goto out;
@@ -1404,7 +1403,7 @@ static int reinstate_path(struct pgpath *pgpath)
schedule_work(&m->trigger_event);
out:
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
if (run_queue) {
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
@@ -1439,15 +1438,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action)
* Temporarily try to avoid having to use the specified PG
*/
static void bypass_pg(struct multipath *m, struct priority_group *pg,
- bool bypassed)
+ bool bypassed, bool can_be_delayed)
{
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
pg->bypassed = bypassed;
- m->current_pgpath = NULL;
- m->current_pg = NULL;
+ if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
+ set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
+ else {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
spin_unlock_irqrestore(&m->lock, flags);
@@ -1461,7 +1464,6 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
{
struct priority_group *pg;
unsigned int pgnum;
- unsigned long flags;
char dummy;
if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
@@ -1470,17 +1472,21 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
return -EINVAL;
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
list_for_each_entry(pg, &m->priority_groups, list) {
pg->bypassed = false;
if (--pgnum)
continue;
- m->current_pgpath = NULL;
- m->current_pg = NULL;
+ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags))
+ set_bit(MPATHF_NEED_PG_SWITCH, &m->flags);
+ else {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
m->next_pg = pg;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
schedule_work(&m->trigger_event);
return 0;
@@ -1507,7 +1513,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed)
break;
}
- bypass_pg(m, pg, bypassed);
+ bypass_pg(m, pg, bypassed, true);
return 0;
}
@@ -1561,7 +1567,7 @@ static void pg_init_done(void *data, int errors)
* Probably doing something like FW upgrade on the
* controller so try the other pg.
*/
- bypass_pg(m, pg, true);
+ bypass_pg(m, pg, true, false);
break;
case SCSI_DH_RETRY:
/* Wait before retrying. */
@@ -1742,6 +1748,9 @@ static void multipath_presuspend(struct dm_target *ti)
{
struct multipath *m = ti->private;
+ spin_lock_irq(&m->lock);
+ m->is_suspending = true;
+ spin_unlock_irq(&m->lock);
/* FIXME: bio-based shouldn't need to always disable queue_if_no_path */
if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti))
queue_if_no_path(m, false, true, __func__);
@@ -1762,9 +1771,9 @@ static void multipath_postsuspend(struct dm_target *ti)
static void multipath_resume(struct dm_target *ti)
{
struct multipath *m = ti->private;
- unsigned long flags;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
+ m->is_suspending = false;
if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) {
set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags);
clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags);
@@ -1775,7 +1784,7 @@ static void multipath_resume(struct dm_target *ti)
test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags),
test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags));
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
/*
@@ -1798,14 +1807,13 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
unsigned int status_flags, char *result, unsigned int maxlen)
{
int sz = 0, pg_counter, pgpath_counter;
- unsigned long flags;
struct multipath *m = ti->private;
struct priority_group *pg;
struct pgpath *p;
unsigned int pg_num;
char state;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
/* Features */
if (type == STATUSTYPE_INFO)
@@ -1845,10 +1853,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
DMEMIT("%u ", m->nr_priority_groups);
- if (m->next_pg)
- pg_num = m->next_pg->pg_num;
- else if (m->current_pg)
+ if (m->current_pg)
pg_num = m->current_pg->pg_num;
+ else if (m->next_pg)
+ pg_num = m->next_pg->pg_num;
else
pg_num = (m->nr_priority_groups ? 1 : 0);
@@ -1951,7 +1959,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
break;
}
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
static int multipath_message(struct dm_target *ti, unsigned int argc, char **argv,
@@ -1961,7 +1969,6 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
dev_t dev;
struct multipath *m = ti->private;
action_fn action;
- unsigned long flags;
mutex_lock(&m->work_mutex);
@@ -1973,9 +1980,9 @@ static int multipath_message(struct dm_target *ti, unsigned int argc, char **arg
if (argc == 1) {
if (!strcasecmp(argv[0], "queue_if_no_path")) {
r = queue_if_no_path(m, true, false, __func__);
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
enable_nopath_timeout(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
goto out;
} else if (!strcasecmp(argv[0], "fail_if_no_path")) {
r = queue_if_no_path(m, false, false, __func__);
@@ -2021,14 +2028,132 @@ out:
return r;
}
+/*
+ * Perform a minimal read from the given path to find out whether the
+ * path still works. If a path error occurs, fail it.
+ */
+static int probe_path(struct pgpath *pgpath)
+{
+ struct block_device *bdev = pgpath->path.dev->bdev;
+ unsigned int read_size = bdev_logical_block_size(bdev);
+ struct page *page;
+ struct bio *bio;
+ blk_status_t status;
+ int r = 0;
+
+ if (WARN_ON_ONCE(read_size > PAGE_SIZE))
+ return -EINVAL;
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ /* Perform a minimal read: Sector 0, length read_size */
+ bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);
+ if (!bio) {
+ r = -ENOMEM;
+ goto out;
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ __bio_add_page(bio, page, read_size, 0);
+ submit_bio_wait(bio);
+ status = bio->bi_status;
+ bio_put(bio);
+
+ if (status && blk_path_error(status))
+ fail_path(pgpath);
+
+out:
+ __free_page(page);
+ return r;
+}
+
+/*
+ * Probe all active paths in current_pg to find out whether they still work.
+ * Fail all paths that do not work.
+ *
+ * Return -ENOTCONN if no valid path is left (even outside of current_pg). We
+ * cannot probe paths in other pgs without switching current_pg, so if valid
+ * paths are only in different pgs, they may or may not work. Additionally
+ * we should not probe paths in a pathgroup that is in the process of
+ * Initializing. Userspace can submit a request and we'll switch and wait
+ * for the pathgroup to be initialized. If the request fails, it may need to
+ * probe again.
+ */
+static int probe_active_paths(struct multipath *m)
+{
+ struct pgpath *pgpath;
+ struct priority_group *pg = NULL;
+ int r = 0;
+
+ spin_lock_irq(&m->lock);
+ if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) {
+ wait_event_lock_irq(m->probe_wait,
+ !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags),
+ m->lock);
+ /*
+ * if we waited because a probe was already in progress,
+ * and it probed the current active pathgroup, don't
+ * reprobe. Just return the number of valid paths
+ */
+ if (m->current_pg == m->last_probed_pg)
+ goto skip_probe;
+ }
+ if (!m->current_pg || m->is_suspending ||
+ test_bit(MPATHF_QUEUE_IO, &m->flags))
+ goto skip_probe;
+ set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
+ pg = m->last_probed_pg = m->current_pg;
+ spin_unlock_irq(&m->lock);
+
+ list_for_each_entry(pgpath, &pg->pgpaths, list) {
+ if (pg != READ_ONCE(m->current_pg) ||
+ READ_ONCE(m->is_suspending))
+ goto out;
+ if (!pgpath->is_active)
+ continue;
+
+ r = probe_path(pgpath);
+ if (r < 0)
+ goto out;
+ }
+
+out:
+ spin_lock_irq(&m->lock);
+ clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags);
+ if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) {
+ m->current_pgpath = NULL;
+ m->current_pg = NULL;
+ }
+skip_probe:
+ if (r == 0 && !atomic_read(&m->nr_valid_paths))
+ r = -ENOTCONN;
+ spin_unlock_irq(&m->lock);
+ if (pg)
+ wake_up(&m->probe_wait);
+ return r;
+}
+
static int multipath_prepare_ioctl(struct dm_target *ti,
- struct block_device **bdev)
+ struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct multipath *m = ti->private;
struct pgpath *pgpath;
- unsigned long flags;
int r;
+ if (_IOC_TYPE(cmd) == DM_IOCTL) {
+ *forward = false;
+ switch (cmd) {
+ case DM_MPATH_PROBE_PATHS:
+ return probe_active_paths(m);
+ default:
+ return -ENOTTY;
+ }
+ }
+
pgpath = READ_ONCE(m->current_pgpath);
if (!pgpath || !mpath_double_check_test_bit(MPATHF_QUEUE_IO, m))
pgpath = choose_pgpath(m, 0);
@@ -2044,10 +2169,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
} else {
/* No path is available */
r = -EIO;
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))
r = -ENOTCONN;
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
}
if (r == -ENOTCONN) {
@@ -2055,10 +2180,10 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
/* Path status changed, redo selection */
(void) choose_pgpath(m, 0);
}
- spin_lock_irqsave(&m->lock, flags);
+ spin_lock_irq(&m->lock);
if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
(void) __pg_init_all_paths(m);
- spin_unlock_irqrestore(&m->lock, flags);
+ spin_unlock_irq(&m->lock);
dm_table_run_md_queue_async(m->ti->table);
process_queued_io_list(m);
}
@@ -2180,7 +2305,7 @@ static int multipath_busy(struct dm_target *ti)
*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 14, 0},
+ .version = {1, 15, 0},
.features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE |
DM_TARGET_PASSES_INTEGRITY,
.module = THIS_MODULE,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6adc55fd90d3..127138c61be5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -14,6 +14,7 @@
#include "raid5.h"
#include "raid10.h"
#include "md-bitmap.h"
+#include "dm-core.h"
#include <linux/device-mapper.h>
@@ -3308,6 +3309,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
+ rs->md.dm_gendisk = ti->table->md->disk;
mddev_unlock(&rs->md);
return 0;
@@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
+ rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9e615b4f1f5e..785af4816584 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -133,10 +133,9 @@ static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
spin_lock_irqsave(&ms->lock, flags);
should_wake = !(bl->head);
bio_list_add(bl, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
-
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void dispatch_bios(void *context, struct bio_list *bio_list)
@@ -646,9 +645,9 @@ static void write_callback(unsigned long error, void *context)
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
- spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
+ spin_unlock_irqrestore(&ms->lock, flags);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index e23076f7ece2..a6ca92049c10 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -217,10 +217,10 @@ static void dm_done(struct request *clone, blk_status_t error, bool mapped)
if (unlikely(error == BLK_STS_TARGET)) {
if (req_op(clone) == REQ_OP_DISCARD &&
!clone->q->limits.max_discard_sectors)
- disable_discard(tio->md);
+ blk_queue_disable_discard(tio->md->queue);
else if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
!clone->q->limits.max_write_zeroes_sectors)
- disable_write_zeroes(tio->md);
+ blk_queue_disable_write_zeroes(tio->md->queue);
}
switch (r) {
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index a1b7535c508a..a7dc04bd55e5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -405,7 +405,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
blk_status_t *error)
{
unsigned int i;
- char major_minor[16];
+ char major_minor[22];
struct stripe_c *sc = ti->private;
if (!*error)
@@ -417,8 +417,7 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
if (*error == BLK_STS_NOTSUPP)
return DM_ENDIO_DONE;
- memset(major_minor, 0, sizeof(major_minor));
- sprintf(major_minor, "%d:%d", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)));
+ format_dev_t(major_minor, bio_dev(bio));
/*
* Test to see which stripe drive triggered the event
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index dfd9fb52a6f3..bb1a70b5a215 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -517,7 +517,9 @@ static void switch_status(struct dm_target *ti, status_type_t type,
*
* Passthrough all ioctls to the path for sector 0
*/
-static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct switch_ctx *sctx = ti->private;
unsigned int path_nr;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 35100a435c88..24a857ff6d0b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -117,7 +117,6 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
n_targets = (struct dm_target *) (n_highs + num);
memset(n_highs, -1, sizeof(*n_highs) * num);
- kvfree(t->highs);
t->num_allocated = num;
t->highs = n_highs;
@@ -257,7 +256,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
if (bdev_is_zoned(bdev)) {
unsigned int zone_sectors = bdev_zone_sectors(bdev);
- if (start & (zone_sectors - 1)) {
+ if (!bdev_is_zone_aligned(bdev, start)) {
DMERR("%s: start=%llu not aligned to h/w zone size %u of %pg",
dm_device_name(ti->table->md),
(unsigned long long)start,
@@ -274,7 +273,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
* devices do not end up with a smaller zone in the middle of
* the sector range.
*/
- if (len & (zone_sectors - 1)) {
+ if (!bdev_is_zone_aligned(bdev, len)) {
DMERR("%s: len=%llu not aligned to h/w zone size %u of %pg",
dm_device_name(ti->table->md),
(unsigned long long)len,
@@ -431,6 +430,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
return 0;
}
+ mutex_lock(&q->limits_lock);
+ /*
+ * BLK_FEAT_ATOMIC_WRITES is not inherited from the bottom device in
+ * blk_stack_limits(), so do it manually.
+ */
+ limits->features |= (q->limits.features & BLK_FEAT_ATOMIC_WRITES);
+
if (blk_stack_limits(limits, &q->limits,
get_start_sect(bdev) + start) < 0)
DMWARN("%s: adding target device %pg caused an alignment inconsistency: "
@@ -448,6 +454,7 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
*/
if (!dm_target_has_integrity(ti->type))
queue_limits_stack_integrity_bdev(limits, bdev);
+ mutex_unlock(&q->limits_lock);
return 0;
}
@@ -523,8 +530,9 @@ static char **realloc_argv(unsigned int *size, char **old_argv)
gfp = GFP_NOIO;
}
argv = kmalloc_array(new_size, sizeof(*argv), gfp);
- if (argv && old_argv) {
- memcpy(argv, old_argv, *size * sizeof(*argv));
+ if (argv) {
+ if (old_argv)
+ memcpy(argv, old_argv, *size * sizeof(*argv));
*size = new_size;
}
@@ -1049,7 +1057,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
unsigned int min_pool_size = 0, pool_size;
struct dm_md_mempools *pools;
unsigned int bioset_flags = 0;
- bool mempool_needs_integrity = t->integrity_supported;
if (unlikely(type == DM_TYPE_NONE)) {
DMERR("no table type is set, can't allocate mempools");
@@ -1074,8 +1081,6 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
per_io_data_size = max(per_io_data_size, ti->per_io_data_size);
min_pool_size = max(min_pool_size, ti->num_flush_bios);
-
- mempool_needs_integrity |= ti->mempool_needs_integrity;
}
pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
front_pad = roundup(per_io_data_size,
@@ -1175,7 +1180,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
t = dm_get_live_table(md, &srcu_idx);
if (!t)
- return 0;
+ goto put_live_table;
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
@@ -1186,10 +1191,181 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile,
(void *)key);
}
+put_live_table:
dm_put_live_table(md, srcu_idx);
return 0;
}
+enum dm_wrappedkey_op {
+ DERIVE_SW_SECRET,
+ IMPORT_KEY,
+ GENERATE_KEY,
+ PREPARE_KEY,
+};
+
+struct dm_wrappedkey_op_args {
+ enum dm_wrappedkey_op op;
+ int err;
+ union {
+ struct {
+ const u8 *eph_key;
+ size_t eph_key_size;
+ u8 *sw_secret;
+ } derive_sw_secret;
+ struct {
+ const u8 *raw_key;
+ size_t raw_key_size;
+ u8 *lt_key;
+ } import_key;
+ struct {
+ u8 *lt_key;
+ } generate_key;
+ struct {
+ const u8 *lt_key;
+ size_t lt_key_size;
+ u8 *eph_key;
+ } prepare_key;
+ };
+};
+
+static int dm_wrappedkey_op_callback(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
+{
+ struct dm_wrappedkey_op_args *args = data;
+ struct block_device *bdev = dev->bdev;
+ struct blk_crypto_profile *profile =
+ bdev_get_queue(bdev)->crypto_profile;
+ int err = -EOPNOTSUPP;
+
+ if (!args->err)
+ return 0;
+
+ switch (args->op) {
+ case DERIVE_SW_SECRET:
+ err = blk_crypto_derive_sw_secret(
+ bdev,
+ args->derive_sw_secret.eph_key,
+ args->derive_sw_secret.eph_key_size,
+ args->derive_sw_secret.sw_secret);
+ break;
+ case IMPORT_KEY:
+ err = blk_crypto_import_key(profile,
+ args->import_key.raw_key,
+ args->import_key.raw_key_size,
+ args->import_key.lt_key);
+ break;
+ case GENERATE_KEY:
+ err = blk_crypto_generate_key(profile,
+ args->generate_key.lt_key);
+ break;
+ case PREPARE_KEY:
+ err = blk_crypto_prepare_key(profile,
+ args->prepare_key.lt_key,
+ args->prepare_key.lt_key_size,
+ args->prepare_key.eph_key);
+ break;
+ }
+ args->err = err;
+
+ /* Try another device in case this fails. */
+ return 0;
+}
+
+static int dm_exec_wrappedkey_op(struct blk_crypto_profile *profile,
+ struct dm_wrappedkey_op_args *args)
+{
+ struct mapped_device *md =
+ container_of(profile, struct dm_crypto_profile, profile)->md;
+ struct dm_target *ti;
+ struct dm_table *t;
+ int srcu_idx;
+ int i;
+
+ args->err = -EOPNOTSUPP;
+
+ t = dm_get_live_table(md, &srcu_idx);
+ if (!t)
+ goto out;
+
+ /*
+ * blk-crypto currently has no support for multiple incompatible
+ * implementations of wrapped inline crypto keys on a single system.
+ * It was already checked earlier that support for wrapped keys was
+ * declared on all underlying devices. Thus, all the underlying devices
+ * should support all wrapped key operations and they should behave
+ * identically, i.e. work with the same keys. So, just executing the
+ * operation on the first device on which it works suffices for now.
+ */
+ for (i = 0; i < t->num_targets; i++) {
+ ti = dm_table_get_target(t, i);
+ if (!ti->type->iterate_devices)
+ continue;
+ ti->type->iterate_devices(ti, dm_wrappedkey_op_callback, args);
+ if (!args->err)
+ break;
+ }
+out:
+ dm_put_live_table(md, srcu_idx);
+ return args->err;
+}
+
+static int dm_derive_sw_secret(struct blk_crypto_profile *profile,
+ const u8 *eph_key, size_t eph_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = DERIVE_SW_SECRET,
+ .derive_sw_secret = {
+ .eph_key = eph_key,
+ .eph_key_size = eph_key_size,
+ .sw_secret = sw_secret,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_import_key(struct blk_crypto_profile *profile,
+ const u8 *raw_key, size_t raw_key_size,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = IMPORT_KEY,
+ .import_key = {
+ .raw_key = raw_key,
+ .raw_key_size = raw_key_size,
+ .lt_key = lt_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_generate_key(struct blk_crypto_profile *profile,
+ u8 lt_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = GENERATE_KEY,
+ .generate_key = {
+ .lt_key = lt_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
+static int dm_prepare_key(struct blk_crypto_profile *profile,
+ const u8 *lt_key, size_t lt_key_size,
+ u8 eph_key[BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE])
+{
+ struct dm_wrappedkey_op_args args = {
+ .op = PREPARE_KEY,
+ .prepare_key = {
+ .lt_key = lt_key,
+ .lt_key_size = lt_key_size,
+ .eph_key = eph_key,
+ },
+ };
+ return dm_exec_wrappedkey_op(profile, &args);
+}
+
static int
device_intersect_crypto_capabilities(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
@@ -1264,6 +1440,13 @@ static int dm_table_construct_crypto_profile(struct dm_table *t)
profile);
}
+ if (profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED) {
+ profile->ll_ops.derive_sw_secret = dm_derive_sw_secret;
+ profile->ll_ops.import_key = dm_import_key;
+ profile->ll_ops.generate_key = dm_generate_key;
+ profile->ll_ops.prepare_key = dm_prepare_key;
+ }
+
if (t->md->queue &&
!blk_crypto_has_capabilities(profile,
t->md->queue->crypto_profile)) {
@@ -1491,6 +1674,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t)
return true;
}
+bool dm_table_is_wildcard(struct dm_table *t)
+{
+ for (unsigned int i = 0; i < t->num_targets; i++) {
+ struct dm_target *ti = dm_table_get_target(t, i);
+
+ if (!dm_target_is_wildcard(ti->type))
+ return false;
+ }
+
+ return true;
+}
+
static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1722,8 +1917,12 @@ static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *
sector_t start, sector_t len, void *data)
{
struct request_queue *q = bdev_get_queue(dev->bdev);
+ int b;
- return !q->limits.max_write_zeroes_sectors;
+ mutex_lock(&q->limits_lock);
+ b = !q->limits.max_write_zeroes_sectors;
+ mutex_unlock(&q->limits_lock);
+ return b;
}
static bool dm_table_supports_write_zeroes(struct dm_table *t)
@@ -1831,10 +2030,24 @@ static bool dm_table_supports_atomic_writes(struct dm_table *t)
return true;
}
+bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size,
+ sector_t new_size)
+{
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && dm_has_zone_plugs(t->md) &&
+ old_size != new_size) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot change size",
+ dm_device_name(t->md));
+ return false;
+ }
+ return true;
+}
+
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *limits)
{
int r;
+ struct queue_limits old_limits;
if (!dm_table_supports_nowait(t))
limits->features &= ~BLK_FEAT_NOWAIT;
@@ -1861,28 +2074,30 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (dm_table_supports_flush(t))
limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
- if (dm_table_supports_dax(t, device_not_dax_capable)) {
+ if (dm_table_supports_dax(t, device_not_dax_capable))
limits->features |= BLK_FEAT_DAX;
- if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
- set_dax_synchronous(t->md->dax_dev);
- } else
+ else
limits->features &= ~BLK_FEAT_DAX;
- if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
- dax_write_cache(t->md->dax_dev, true);
-
/* For a zoned table, setup the zone related queue attributes. */
- if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
- (limits->features & BLK_FEAT_ZONED)) {
- r = dm_set_zones_restrictions(t, q, limits);
- if (r)
- return r;
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
+ if (limits->features & BLK_FEAT_ZONED) {
+ r = dm_set_zones_restrictions(t, q, limits);
+ if (r)
+ return r;
+ } else if (dm_has_zone_plugs(t->md)) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot switch to non-zoned table.",
+ dm_device_name(t->md));
+ return -EINVAL;
+ }
}
if (dm_table_supports_atomic_writes(t))
limits->features |= BLK_FEAT_ATOMIC_WRITES;
- r = queue_limits_set(q, limits);
+ old_limits = queue_limits_start_update(q);
+ r = queue_limits_commit_update(q, limits);
if (r)
return r;
@@ -1893,10 +2108,21 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
(limits->features & BLK_FEAT_ZONED)) {
r = dm_revalidate_zones(t, q);
- if (r)
+ if (r) {
+ queue_limits_set(q, &old_limits);
return r;
+ }
}
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ dm_finalize_zone_settings(t, limits);
+
+ if (dm_table_supports_dax(t, device_not_dax_synchronous_capable))
+ set_dax_synchronous(t->md->dax_dev);
+
+ if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
+ dax_write_cache(t->md->dax_dev, true);
+
dm_update_crypto_profile(q, t);
return 0;
}
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
index 655453bb276b..425b3a74f4db 100644
--- a/drivers/md/dm-vdo/indexer/volume.c
+++ b/drivers/md/dm-vdo/indexer/volume.c
@@ -754,10 +754,11 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
u32 physical_page, struct cached_page **page_ptr)
{
struct cached_page *page;
+ unsigned int zone_number = request->zone_number;
get_page_from_cache(&volume->page_cache, physical_page, &page);
if (page != NULL) {
- if (request->zone_number == 0) {
+ if (zone_number == 0) {
/* Only one zone is allowed to update the LRU. */
make_page_most_recent(&volume->page_cache, page);
}
@@ -767,7 +768,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
}
/* Prepare to enqueue a read for the page. */
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
mutex_lock(&volume->read_threads_mutex);
/*
@@ -787,8 +788,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* the order does not matter for correctness as it does below.
*/
mutex_unlock(&volume->read_threads_mutex);
- begin_pending_search(&volume->page_cache, physical_page,
- request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
return UDS_QUEUED;
}
@@ -797,7 +797,7 @@ static int get_volume_page_protected(struct volume *volume, struct uds_request *
* "search pending" state in careful order so no other thread can mess with the data before
* the caller gets to look at it.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
mutex_unlock(&volume->read_threads_mutex);
*page_ptr = page;
return UDS_SUCCESS;
@@ -849,6 +849,7 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
{
int result;
struct cached_page *page = NULL;
+ unsigned int zone_number = request->zone_number;
u32 physical_page = map_to_physical_page(volume->geometry, chapter,
index_page_number);
@@ -858,18 +859,18 @@ static int search_cached_index_page(struct volume *volume, struct uds_request *r
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
&request->record_name,
record_page_number);
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -882,6 +883,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
{
struct cached_page *record_page;
struct index_geometry *geometry = volume->geometry;
+ unsigned int zone_number = request->zone_number;
int result;
u32 physical_page, page_number;
@@ -905,11 +907,11 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
* invalidation by the reader thread, before the reader thread has noticed that the
* invalidate_counter has been incremented.
*/
- begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+ begin_pending_search(&volume->page_cache, physical_page, zone_number);
result = get_volume_page_protected(volume, request, physical_page, &record_page);
if (result != UDS_SUCCESS) {
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return result;
}
@@ -917,7 +919,7 @@ int uds_search_cached_record_page(struct volume *volume, struct uds_request *req
&request->record_name, geometry, &request->old_metadata))
*found = true;
- end_pending_search(&volume->page_cache, request->zone_number);
+ end_pending_search(&volume->page_cache, zone_number);
return UDS_SUCCESS;
}
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index 0c41949db784..631a887b487c 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -593,6 +593,10 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
(*argc)--;
if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) {
+ if (v->fec->dev) {
+ ti->error = "FEC device already specified";
+ return -EINVAL;
+ }
r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev);
if (r) {
ti->error = "FEC device lookup failed";
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 3c427f18a04b..81186bded1ce 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -682,7 +682,8 @@ static void verity_bh_work(struct work_struct *w)
static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio)
{
return ioprio <= IOPRIO_CLASS_IDLE &&
- bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]);
+ bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]) &&
+ !need_resched();
}
static void verity_end_io(struct bio *bio)
@@ -993,7 +994,9 @@ static void verity_status(struct dm_target *ti, status_type_t type,
}
}
-static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int verity_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg,
+ bool *forward)
{
struct dm_verity *v = ti->private;
@@ -1120,6 +1123,9 @@ static int verity_alloc_most_once(struct dm_verity *v)
{
struct dm_target *ti = v->ti;
+ if (v->validated_blocks)
+ return 0;
+
/* the bitset can only handle INT_MAX blocks */
if (v->data_blocks > INT_MAX) {
ti->error = "device too large to use check_at_most_once";
@@ -1143,6 +1149,9 @@ static int verity_alloc_zero_digest(struct dm_verity *v)
struct dm_verity_io *io;
u8 *zero_data;
+ if (v->zero_digest)
+ return 0;
+
v->zero_digest = kmalloc(v->digest_size, GFP_KERNEL);
if (!v->zero_digest)
@@ -1577,7 +1586,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- /* Root hash signature is a optional parameter*/
+ /* Root hash signature is an optional parameter */
r = verity_verify_root_hash(root_hash_digest_to_validate,
strlen(root_hash_digest_to_validate),
verify_args.sig,
diff --git a/drivers/md/dm-verity-verify-sig.c b/drivers/md/dm-verity-verify-sig.c
index a9e2c6c0a33c..d5261a0e4232 100644
--- a/drivers/md/dm-verity-verify-sig.c
+++ b/drivers/md/dm-verity-verify-sig.c
@@ -71,9 +71,14 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
const char *arg_name)
{
struct dm_target *ti = v->ti;
- int ret = 0;
+ int ret;
const char *sig_key = NULL;
+ if (v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("root_hash_sig_key_desc already specified");
+ return -EINVAL;
+ }
+
if (!*argc) {
ti->error = DM_VERITY_VERIFY_ERR("Signature key not specified");
return -EINVAL;
@@ -83,14 +88,18 @@ int verity_verify_sig_parse_opt_args(struct dm_arg_set *as,
(*argc)--;
ret = verity_verify_get_sig_from_key(sig_key, sig_opts);
- if (ret < 0)
+ if (ret < 0) {
ti->error = DM_VERITY_VERIFY_ERR("Invalid key specified");
+ return ret;
+ }
v->signature_key_desc = kstrdup(sig_key, GFP_KERNEL);
- if (!v->signature_key_desc)
+ if (!v->signature_key_desc) {
+ ti->error = DM_VERITY_VERIFY_ERR("Could not allocate memory for signature key");
return -ENOMEM;
+ }
- return ret;
+ return 0;
}
/*
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 20edd3fabbab..3d31b82e0730 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -56,24 +56,31 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
{
struct mapped_device *md = disk->private_data;
struct dm_table *map;
- int srcu_idx, ret;
+ struct dm_table *zone_revalidate_map = md->zone_revalidate_map;
+ int srcu_idx, ret = -EIO;
+ bool put_table = false;
- if (!md->zone_revalidate_map) {
- /* Regular user context */
+ if (!zone_revalidate_map || md->revalidate_map_task != current) {
+ /*
+ * Regular user context or
+ * Zone revalidation during __bind() is in progress, but this
+ * call is from a different process
+ */
if (dm_suspended_md(md))
return -EAGAIN;
map = dm_get_live_table(md, &srcu_idx);
- if (!map)
- return -EIO;
+ put_table = true;
} else {
/* Zone revalidation during __bind() */
- map = md->zone_revalidate_map;
+ map = zone_revalidate_map;
}
- ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
+ if (map)
+ ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb,
+ data);
- if (!md->zone_revalidate_map)
+ if (put_table)
dm_put_live_table(md, srcu_idx);
return ret;
@@ -153,33 +160,36 @@ int dm_revalidate_zones(struct dm_table *t, struct request_queue *q)
{
struct mapped_device *md = t->md;
struct gendisk *disk = md->disk;
+ unsigned int nr_zones = disk->nr_zones;
int ret;
if (!get_capacity(disk))
return 0;
- /* Revalidate only if something changed. */
- if (!disk->nr_zones || disk->nr_zones != md->nr_zones) {
- DMINFO("%s using %s zone append",
- disk->disk_name,
- queue_emulates_zone_append(q) ? "emulated" : "native");
- md->nr_zones = 0;
- }
-
- if (md->nr_zones)
+ /*
+ * Do not revalidate if zone write plug resources have already
+ * been allocated.
+ */
+ if (dm_has_zone_plugs(md))
return 0;
+ DMINFO("%s using %s zone append", disk->disk_name,
+ queue_emulates_zone_append(q) ? "emulated" : "native");
+
/*
* Our table is not live yet. So the call to dm_get_live_table()
* in dm_blk_report_zones() will fail. Set a temporary pointer to
* our table for dm_blk_report_zones() to use directly.
*/
md->zone_revalidate_map = t;
+ md->revalidate_map_task = current;
ret = blk_revalidate_disk_zones(disk);
+ md->revalidate_map_task = NULL;
md->zone_revalidate_map = NULL;
if (ret) {
DMERR("Revalidate zones failed %d", ret);
+ disk->nr_zones = nr_zones;
return ret;
}
@@ -337,15 +347,15 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
/*
* Check if zone append is natively supported, and if not, set the
- * mapped device queue as needing zone append emulation.
+ * mapped device queue as needing zone append emulation. If zone
+ * append is natively supported, make sure that
+ * max_hw_zone_append_sectors is not set to 0.
*/
WARN_ON_ONCE(queue_is_mq(q));
- if (dm_table_supports_zone_append(t)) {
- clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- } else {
- set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ if (!dm_table_supports_zone_append(t))
lim->max_hw_zone_append_sectors = 0;
- }
+ else if (lim->max_hw_zone_append_sectors == 0)
+ lim->max_hw_zone_append_sectors = lim->max_zone_append_sectors;
/*
* Determine the max open and max active zone limits for the mapped
@@ -380,15 +390,28 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
lim->max_open_zones = 0;
lim->max_active_zones = 0;
lim->max_hw_zone_append_sectors = 0;
+ lim->max_zone_append_sectors = 0;
lim->zone_write_granularity = 0;
lim->chunk_sectors = 0;
lim->features &= ~BLK_FEAT_ZONED;
- clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
- md->nr_zones = 0;
- disk->nr_zones = 0;
return 0;
}
+ if (get_capacity(disk) && dm_has_zone_plugs(t->md)) {
+ if (q->limits.chunk_sectors != lim->chunk_sectors) {
+ DMWARN("%s: device has zone write plug resources. "
+ "Cannot change zone size",
+ disk->disk_name);
+ return -EINVAL;
+ }
+ if (lim->max_hw_zone_append_sectors != 0 &&
+ !dm_table_is_wildcard(t)) {
+ DMWARN("%s: device has zone write plug resources. "
+ "New table must emulate zone append",
+ disk->disk_name);
+ return -EINVAL;
+ }
+ }
/*
* Warn once (when the capacity is not yet set) if the mapped device is
* partially using zone resources of the target devices as that leads to
@@ -408,6 +431,23 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
return 0;
}
+void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim)
+{
+ struct mapped_device *md = t->md;
+
+ if (lim->features & BLK_FEAT_ZONED) {
+ if (dm_table_supports_zone_append(t))
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ else
+ set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ } else {
+ clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
+ md->nr_zones = 0;
+ md->disk->nr_zones = 0;
+ }
+}
+
+
/*
* IO completion callback called from clone_endio().
*/
@@ -423,9 +463,9 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone)
*/
if (clone->bi_status == BLK_STS_OK &&
bio_op(clone) == REQ_OP_ZONE_APPEND) {
- sector_t mask = bdev_zone_sectors(disk->part0) - 1;
-
- orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask;
+ orig_bio->bi_iter.bi_sector +=
+ bdev_offset_from_zone_start(disk->part0,
+ clone->bi_iter.bi_sector);
}
return;
diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c
index 6141fc25d842..5da3db06da10 100644
--- a/drivers/md/dm-zoned-target.c
+++ b/drivers/md/dm-zoned-target.c
@@ -1015,7 +1015,8 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
/*
* Pass on ioctl to the backend device.
*/
-static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
+static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev,
+ unsigned int cmd, unsigned long arg, bool *forward)
{
struct dmz_target *dmz = ti->private;
struct dmz_dev *dev = &dmz->dev[0];
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5ab7574c0c76..1726f0f828cc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -411,7 +411,8 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
- struct block_device **bdev)
+ struct block_device **bdev, unsigned int cmd,
+ unsigned long arg, bool *forward)
{
struct dm_target *ti;
struct dm_table *map;
@@ -434,8 +435,8 @@ retry:
if (dm_suspended_md(md))
return -EAGAIN;
- r = ti->type->prepare_ioctl(ti, bdev);
- if (r == -ENOTCONN && !fatal_signal_pending(current)) {
+ r = ti->type->prepare_ioctl(ti, bdev, cmd, arg, forward);
+ if (r == -ENOTCONN && *forward && !fatal_signal_pending(current)) {
dm_put_live_table(md, *srcu_idx);
fsleep(10000);
goto retry;
@@ -454,9 +455,10 @@ static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode,
{
struct mapped_device *md = bdev->bd_disk->private_data;
int r, srcu_idx;
+ bool forward = true;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
- if (r < 0)
+ r = dm_prepare_ioctl(md, &srcu_idx, &bdev, cmd, arg, &forward);
+ if (!forward || r < 0)
goto out;
if (r > 0) {
@@ -1082,22 +1084,6 @@ static inline struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
return &md->queue->limits;
}
-void disable_discard(struct mapped_device *md)
-{
- struct queue_limits *limits = dm_get_queue_limits(md);
-
- /* device doesn't really support DISCARD, disable it */
- limits->max_hw_discard_sectors = 0;
-}
-
-void disable_write_zeroes(struct mapped_device *md)
-{
- struct queue_limits *limits = dm_get_queue_limits(md);
-
- /* device doesn't really support WRITE ZEROES, disable it */
- limits->max_write_zeroes_sectors = 0;
-}
-
static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
@@ -1115,10 +1101,10 @@ static void clone_endio(struct bio *bio)
if (unlikely(error == BLK_STS_TARGET)) {
if (bio_op(bio) == REQ_OP_DISCARD &&
!bdev_max_discard_sectors(bio->bi_bdev))
- disable_discard(md);
+ blk_queue_disable_discard(md->queue);
else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bdev_write_zeroes_sectors(bio->bi_bdev))
- disable_write_zeroes(md);
+ blk_queue_disable_write_zeroes(md->queue);
}
if (static_branch_unlikely(&zoned_enabled) &&
@@ -2421,21 +2407,35 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
struct queue_limits *limits)
{
struct dm_table *old_map;
- sector_t size;
+ sector_t size, old_size;
int ret;
lockdep_assert_held(&md->suspend_lock);
size = dm_table_get_size(t);
+ old_size = dm_get_size(md);
+
+ if (!dm_table_supports_size_change(t, old_size, size)) {
+ old_map = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ set_capacity(md->disk, size);
+
+ ret = dm_table_set_restrictions(t, md->queue, limits);
+ if (ret) {
+ set_capacity(md->disk, old_size);
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
+
/*
* Wipe any geometry if the size of the table changed.
*/
- if (size != dm_get_size(md))
+ if (size != old_size)
memset(&md->geometry, 0, sizeof(md->geometry));
- set_capacity(md->disk, size);
-
dm_table_event_callback(t, event_callback, md);
if (dm_table_request_based(t)) {
@@ -2453,10 +2453,10 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* requests in the queue may refer to bio from the old bioset,
* so you must walk through the queue to unprep.
*/
- if (!md->mempools) {
+ if (!md->mempools)
md->mempools = t->mempools;
- t->mempools = NULL;
- }
+ else
+ dm_free_md_mempools(t->mempools);
} else {
/*
* The md may already have mempools that need changing.
@@ -2465,14 +2465,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
*/
dm_free_md_mempools(md->mempools);
md->mempools = t->mempools;
- t->mempools = NULL;
- }
-
- ret = dm_table_set_restrictions(t, md->queue, limits);
- if (ret) {
- old_map = ERR_PTR(ret);
- goto out;
}
+ t->mempools = NULL;
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
@@ -3638,10 +3632,13 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
int r, srcu_idx;
+ bool forward = true;
- r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
+ /* Not a real ioctl, but targets must not interpret non-DM ioctls */
+ r = dm_prepare_ioctl(md, &srcu_idx, &bdev, 0, 0, &forward);
if (r < 0)
goto out;
+ WARN_ON_ONCE(!forward);
ops = bdev->bd_disk->fops->pr_ops;
if (ops && ops->pr_clear)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a0a8ff119815..245f52b59215 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -58,6 +58,7 @@ void dm_table_event_callback(struct dm_table *t,
void (*fn)(void *), void *context);
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
bool dm_table_has_no_data_devices(struct dm_table *table);
+bool dm_table_is_wildcard(struct dm_table *t);
int dm_calculate_queue_limits(struct dm_table *table,
struct queue_limits *limits);
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
@@ -72,6 +73,8 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
struct dm_target *dm_table_get_immutable_target(struct dm_table *t);
struct dm_target *dm_table_get_wildcard_target(struct dm_table *t);
bool dm_table_request_based(struct dm_table *t);
+bool dm_table_supports_size_change(struct dm_table *t, sector_t old_size,
+ sector_t new_size);
void dm_lock_md_type(struct mapped_device *md);
void dm_unlock_md_type(struct mapped_device *md);
@@ -102,6 +105,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q,
struct queue_limits *lim);
int dm_revalidate_zones(struct dm_table *t, struct request_queue *q);
+void dm_finalize_zone_settings(struct dm_table *t, struct queue_limits *lim);
void dm_zone_endio(struct dm_io *io, struct bio *clone);
#ifdef CONFIG_BLK_DEV_ZONED
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
@@ -110,12 +114,14 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
int dm_zone_get_reset_bitmap(struct mapped_device *md, struct dm_table *t,
sector_t sector, unsigned int nr_zones,
unsigned long *need_reset);
+#define dm_has_zone_plugs(md) ((md)->disk->zone_wplugs_hash != NULL)
#else
#define dm_blk_report_zones NULL
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
return false;
}
+#define dm_has_zone_plugs(md) false
#endif
/*
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..0fde115e921f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
+ *
+ * Following configurations can be changed via /proc/sys/dev/raid/ for system
+ * or /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -5091,7 +5114,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5120,7 +5143,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5146,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is partition, and user doesn't issue IO to the array, the
+ * array is still not idle if user issues IO to other partitions.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if following conditions are matched since last check:
+ * 1) mddev doesn't have normal IO completed;
+ * 2) mddev doesn't have inflight normal IO;
+ * 3) if any member disk is partition, and other partitions don't have IO
+ * completed;
+ *
+ * Noted this checking rely on IO accounting is enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
@@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ int io_sectors;
+
+ /*
+ * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
+ * RESYNC_PAGES(64k) per IO.
+ */
+ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
+ io_sectors = 8;
+ else
+ io_sectors = 128;
+
+ return atomic_read(&mddev->recovery_active) <
+ io_sectors * sync_io_depth(mddev);
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1cf00a04bcdd..d45a9e6ead80 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -132,7 +132,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -404,7 +404,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
- struct gendisk *gendisk;
+ struct gendisk *gendisk; /* mdraid gendisk */
+ struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@@ -483,6 +484,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
+ int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@@ -518,6 +520,7 @@ struct mddev {
* adding a spare
*/
+ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
@@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
}
extern void mddev_unlock(struct mddev *mddev);
-static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-{
- if (blk_queue_io_stat(bdev->bd_disk->queue))
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
-}
-
-static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
-{
- md_sync_acct(bio->bi_bdev, nr_sectors);
-}
-
struct md_personality
{
struct md_submodule_head head;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index de9bccbe7337..657d481525be 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba32bac975b8..dce06bf65016 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@@ -4880,7 +4873,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
- md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
- md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6389383166c0..ca5b0e8ba707 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1240,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1300,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);