summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/super.c6
-rw-r--r--drivers/md/dm-bufio.c2
-rw-r--r--drivers/md/dm-integrity.c16
-rw-r--r--drivers/md/dm-raid.c3
-rw-r--r--drivers/md/md.c190
-rw-r--r--drivers/md/md.h18
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c9
-rw-r--r--drivers/md/raid5.c8
9 files changed, 151 insertions, 104 deletions
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 813b38aec3e4..1a2ce1a4b456 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -293,8 +293,7 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
bio->bi_iter.bi_sector = SB_SECTOR;
- __bio_add_page(bio, virt_to_page(out), SB_SIZE,
- offset_in_page(out));
+ bio_add_virt_nofail(bio, out, SB_SIZE);
out->offset = cpu_to_le64(sb->offset);
@@ -546,7 +545,8 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
- static const char zero_uuid[16] = { 0 };
+ static const char zero_uuid[16] __nonstring =
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
return uuid_find(c, zero_uuid);
}
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index f0b5a6931161..d098e75e3461 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1364,7 +1364,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
ptr = (char *)b->data + offset;
len = n_sectors << SECTOR_SHIFT;
- __bio_add_page(bio, virt_to_page(ptr), len, offset_in_page(ptr));
+ bio_add_virt_nofail(bio, ptr, len);
submit_bio(bio);
}
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index cc3d3897ef42..1f626066e8cc 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -2557,14 +2557,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
char *mem;
outgoing_bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recheck_bios);
-
- r = bio_add_page(outgoing_bio, virt_to_page(outgoing_data), ic->sectors_per_block << SECTOR_SHIFT, 0);
- if (unlikely(r != (ic->sectors_per_block << SECTOR_SHIFT))) {
- bio_put(outgoing_bio);
- bio->bi_status = BLK_STS_RESOURCE;
- bio_endio(bio);
- return;
- }
+ bio_add_virt_nofail(outgoing_bio, outgoing_data,
+ ic->sectors_per_block << SECTOR_SHIFT);
bip = bio_integrity_alloc(outgoing_bio, GFP_NOIO, 1);
if (IS_ERR(bip)) {
@@ -3211,7 +3205,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_READ, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
r = submit_bio_wait(bio);
bio_put(bio);
if (unlikely(r)) {
@@ -3228,7 +3223,8 @@ next_chunk:
bio = bio_alloc_bioset(ic->dev->bdev, 1, REQ_OP_WRITE, GFP_NOIO, &ic->recalc_bios);
bio->bi_iter.bi_sector = ic->start + SB_SECTORS + range.logical_sector;
- __bio_add_page(bio, virt_to_page(recalc_buffer), range.n_sectors << SECTOR_SHIFT, offset_in_page(recalc_buffer));
+ bio_add_virt_nofail(bio, recalc_buffer,
+ range.n_sectors << SECTOR_SHIFT);
bip = bio_integrity_alloc(bio, GFP_NOIO, 1);
if (unlikely(IS_ERR(bip))) {
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6adc55fd90d3..127138c61be5 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -14,6 +14,7 @@
#include "raid5.h"
#include "raid10.h"
#include "md-bitmap.h"
+#include "dm-core.h"
#include <linux/device-mapper.h>
@@ -3308,6 +3309,7 @@ size_check:
/* Disable/enable discard support on raid set. */
configure_discard_support(rs);
+ rs->md.dm_gendisk = ti->table->md->disk;
mddev_unlock(&rs->md);
return 0;
@@ -3327,6 +3329,7 @@ static void raid_dtr(struct dm_target *ti)
mddev_lock_nointr(&rs->md);
md_stop(&rs->md);
+ rs->md.dm_gendisk = NULL;
mddev_unlock(&rs->md);
if (work_pending(&rs->md.event_work))
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..0fde115e921f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
+ *
+ * Following configurations can be changed via /proc/sys/dev/raid/ for system
+ * or /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -5091,7 +5114,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5100,7 +5123,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5120,7 +5143,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5129,7 +5152,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5146,6 +5169,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5671,6 +5723,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -8572,50 +8625,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is partition, and user doesn't issue IO to the array, the
+ * array is still not idle if user issues IO to other partitions.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if following conditions are matched since last check:
+ * 1) mddev doesn't have normal IO completed;
+ * 2) mddev doesn't have inflight normal IO;
+ * 3) if any member disk is partition, and other partitions don't have IO
+ * completed;
+ *
+ * Noted this checking rely on IO accounting is enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
@@ -8927,6 +8985,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ int io_sectors;
+
+ /*
+ * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
+ * RESYNC_PAGES(64k) per IO.
+ */
+ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
+ io_sectors = 8;
+ else
+ io_sectors = 128;
+
+ return atomic_read(&mddev->recovery_active) <
+ io_sectors * sync_io_depth(mddev);
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -9195,7 +9270,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1cf00a04bcdd..d45a9e6ead80 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -132,7 +132,7 @@ struct md_rdev {
sector_t sectors; /* Device size (in 512bytes sectors) */
struct mddev *mddev; /* RAID array if running */
- int last_events; /* IO event timestamp */
+ unsigned long last_events; /* IO event timestamp */
/*
* If meta_bdev is non-NULL, it means that a separate device is
@@ -404,7 +404,8 @@ struct mddev {
* are happening, so run/
* takeover/stop are not safe
*/
- struct gendisk *gendisk;
+ struct gendisk *gendisk; /* mdraid gendisk */
+ struct gendisk *dm_gendisk; /* dm-raid gendisk */
struct kobject kobj;
int hold_active;
@@ -483,6 +484,7 @@ struct mddev {
/* if zero, use the system-wide default */
int sync_speed_min;
int sync_speed_max;
+ int sync_io_depth;
/* resync even though the same disks are shared among md-devices */
int parallel_resync;
@@ -518,6 +520,7 @@ struct mddev {
* adding a spare
*/
+ unsigned long normal_io_events; /* IO event timestamp */
atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait;
sector_t recovery_cp;
@@ -714,17 +717,6 @@ static inline int mddev_trylock(struct mddev *mddev)
}
extern void mddev_unlock(struct mddev *mddev);
-static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
-{
- if (blk_queue_io_stat(bdev->bd_disk->queue))
- atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
-}
-
-static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
-{
- md_sync_acct(bio->bi_bdev, nr_sectors);
-}
-
struct md_personality
{
struct md_submodule_head head;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index de9bccbe7337..657d481525be 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2382,7 +2382,6 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
wbio->bi_end_io = end_sync_write;
atomic_inc(&r1_bio->remaining);
- md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
@@ -3055,7 +3054,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r1_bio->bios[i];
if (bio->bi_end_io == end_sync_read) {
read_targets--;
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
@@ -3064,7 +3062,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
} else {
atomic_set(&r1_bio->remaining, 1);
bio = r1_bio->bios[r1_bio->read_disk];
- md_sync_acct_bio(bio, nr_sectors);
if (read_targets == 1)
bio->bi_opf &= ~MD_FAILFAST;
submit_bio_noacct(bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index ba32bac975b8..dce06bf65016 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2426,7 +2426,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
tbio->bi_opf |= MD_FAILFAST;
@@ -2448,8 +2447,6 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
bio_copy_data(tbio, fbio);
d = r10_bio->devs[i].devnum;
atomic_inc(&r10_bio->remaining);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(tbio));
submit_bio_noacct(tbio);
}
@@ -2583,13 +2580,10 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
d = r10_bio->devs[1].devnum;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
- md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
submit_bio_noacct(wbio);
}
if (wbio2) {
atomic_inc(&conf->mirrors[d].replacement->nr_pending);
- md_sync_acct(conf->mirrors[d].replacement->bdev,
- bio_sectors(wbio2));
submit_bio_noacct(wbio2);
}
}
@@ -3757,7 +3751,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
r10_bio->sectors = nr_sectors;
if (bio->bi_end_io == end_sync_read) {
- md_sync_acct_bio(bio, nr_sectors);
bio->bi_status = 0;
submit_bio_noacct(bio);
}
@@ -4880,7 +4873,6 @@ read_more:
r10_bio->sectors = nr_sectors;
/* Now submit the read */
- md_sync_acct_bio(read_bio, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
read_bio->bi_next = NULL;
submit_bio_noacct(read_bio);
@@ -4940,7 +4932,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
continue;
atomic_inc(&rdev->nr_pending);
- md_sync_acct_bio(b, r10_bio->sectors);
atomic_inc(&r10_bio->remaining);
b->bi_next = NULL;
submit_bio_noacct(b);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6389383166c0..ca5b0e8ba707 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1240,10 +1240,6 @@ again:
}
if (rdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(bi, rdev->bdev, &dev->vec, 1, op | op_flags);
@@ -1300,10 +1296,6 @@ again:
submit_bio_noacct(bi);
}
if (rrdev) {
- if (s->syncing || s->expanding || s->expanded
- || s->replacing)
- md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf));
-
set_bit(STRIPE_IO_STARTED, &sh->state);
bio_init(rbi, rrdev->bdev, &dev->rvec, 1, op | op_flags);