summaryrefslogtreecommitdiff
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c435
1 files changed, 303 insertions, 132 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..4e033c26fdd4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -111,32 +111,48 @@ static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Default safemode delay: 200 msec */
#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1)
/*
- * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
- * is 1000 KB/sec, so the extra system load does not show up that much.
- * Increase it if you want to have more _guaranteed_ speed. Note that
- * the RAID driver will use the maximum available bandwidth if the IO
- * subsystem is idle. There is also an 'absolute maximum' reconstruction
- * speed limit - in case reconstruction slows down your system despite
- * idle IO detection.
+ * Current RAID-1,4,5,6,10 parallel reconstruction 'guaranteed speed limit'
+ * is sysctl_speed_limit_min, 1000 KB/sec by default, so the extra system load
+ * does not show up that much. Increase it if you want to have more guaranteed
+ * speed. Note that the RAID driver will use the maximum bandwidth
+ * sysctl_speed_limit_max, 200 MB/sec by default, if the IO subsystem is idle.
*
- * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
- * or /sys/block/mdX/md/sync_speed_{min,max}
+ * Background sync IO speed control:
+ *
+ * - below speed min:
+ * no limit;
+ * - above speed min and below speed max:
+ * a) if mddev is idle, then no limit;
+ * b) if mddev is busy handling normal IO, then limit inflight sync IO
+ * to sync_io_depth;
+ * - above speed max:
+ * sync IO can't be issued;
+ *
+ * Following configurations can be changed via /proc/sys/dev/raid/ for system
+ * or /sys/block/mdX/md/ for one array.
*/
-
static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
-static inline int speed_min(struct mddev *mddev)
+static int sysctl_sync_io_depth = 32;
+
+static int speed_min(struct mddev *mddev)
{
return mddev->sync_speed_min ?
mddev->sync_speed_min : sysctl_speed_limit_min;
}
-static inline int speed_max(struct mddev *mddev)
+static int speed_max(struct mddev *mddev)
{
return mddev->sync_speed_max ?
mddev->sync_speed_max : sysctl_speed_limit_max;
}
+static int sync_io_depth(struct mddev *mddev)
+{
+ return mddev->sync_io_depth ?
+ mddev->sync_io_depth : sysctl_sync_io_depth;
+}
+
static void rdev_uninit_serial(struct md_rdev *rdev)
{
if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
@@ -293,14 +309,21 @@ static const struct ctl_table raid_table[] = {
.procname = "speed_limit_min",
.data = &sysctl_speed_limit_min,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "speed_limit_max",
.data = &sysctl_speed_limit_max,
.maxlen = sizeof(int),
- .mode = S_IRUGO|S_IWUSR,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_io_depth",
+ .data = &sysctl_sync_io_depth,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
};
@@ -316,6 +339,7 @@ static int start_readonly;
* so all the races disappear.
*/
static bool create_on_open = true;
+static bool legacy_async_del_gendisk = true;
/*
* We have a system wide 'event count' that is incremented
@@ -613,9 +637,12 @@ static void __mddev_put(struct mddev *mddev)
mddev->ctime || mddev->hold_active)
return;
- /* Array is not configured at all, and not held active, so destroy it */
+ /*
+ * If array is freed by stopping array, MD_DELETED is set by
+ * do_md_stop(), MD_DELETED is still set here in case mddev is freed
+ * directly by closing a mddev that is created by create_on_open.
+ */
set_bit(MD_DELETED, &mddev->flags);
-
/*
* Call queue_work inside the spinlock so that flush_workqueue() after
* mddev_find will succeed in waiting for the work to be done.
@@ -850,6 +877,19 @@ void mddev_unlock(struct mddev *mddev)
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
}
+
+ if (!legacy_async_del_gendisk) {
+ /*
+ * Call del_gendisk after release reconfig_mutex to avoid
+ * deadlock (e.g. call del_gendisk under the lock and an
+ * access to sysfs files waits the lock)
+ * And MD_DELETED is only used for md raid which is set in
+ * do_md_stop. dm raid only uses md_stop to stop. So dm raid
+ * doesn't need to check MD_DELETED when getting reconfig lock
+ */
+ if (test_bit(MD_DELETED, &mddev->flags))
+ del_gendisk(mddev->gendisk);
+ }
}
EXPORT_SYMBOL_GPL(mddev_unlock);
@@ -1379,13 +1419,13 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, stru
mddev->layout = -1;
if (sb->state & (1<<MD_SB_CLEAN))
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
else {
if (sb->events_hi == sb->cp_events_hi &&
sb->events_lo == sb->cp_events_lo) {
- mddev->recovery_cp = sb->recovery_cp;
+ mddev->resync_offset = sb->recovery_cp;
} else
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
}
memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
@@ -1511,10 +1551,10 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
mddev->minor_version = sb->minor_version;
if (mddev->in_sync)
{
- sb->recovery_cp = mddev->recovery_cp;
+ sb->recovery_cp = mddev->resync_offset;
sb->cp_events_hi = (mddev->events>>32);
sb->cp_events_lo = (u32)mddev->events;
- if (mddev->recovery_cp == MaxSector)
+ if (mddev->resync_offset == MaxSector)
sb->state = (1<< MD_SB_CLEAN);
} else
sb->recovery_cp = 0;
@@ -1865,7 +1905,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struc
mddev->bitmap_info.default_space = (4096-1024) >> 9;
mddev->reshape_backwards = 0;
- mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+ mddev->resync_offset = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2;
@@ -2051,7 +2091,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->utime = cpu_to_le64((__u64)mddev->utime);
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
- sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
+ sb->resync_offset = cpu_to_le64(mddev->resync_offset);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector);
else
@@ -2731,7 +2771,7 @@ repeat:
/* If this is just a dirty<->clean transition, and the array is clean
* and 'events' is odd, we can roll back to the previous clean state */
if (nospares
- && (mddev->in_sync && mddev->recovery_cp == MaxSector)
+ && (mddev->in_sync && mddev->resync_offset == MaxSector)
&& mddev->can_decrease_events
&& mddev->events != 1) {
mddev->events--;
@@ -4267,9 +4307,9 @@ __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
static ssize_t
resync_start_show(struct mddev *mddev, char *page)
{
- if (mddev->recovery_cp == MaxSector)
+ if (mddev->resync_offset == MaxSector)
return sprintf(page, "none\n");
- return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
+ return sprintf(page, "%llu\n", (unsigned long long)mddev->resync_offset);
}
static ssize_t
@@ -4295,7 +4335,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
err = -EBUSY;
if (!err) {
- mddev->recovery_cp = n;
+ mddev->resync_offset = n;
if (mddev->pers)
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
}
@@ -4799,9 +4839,42 @@ out_unlock:
static struct md_sysfs_entry md_metadata =
__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
+static bool rdev_needs_recovery(struct md_rdev *rdev, sector_t sectors)
+{
+ return rdev->raid_disk >= 0 &&
+ !test_bit(Journal, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags) &&
+ !test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < sectors;
+}
+
+static enum sync_action md_get_active_sync_action(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ bool is_recover = false;
+
+ if (mddev->resync_offset < MaxSector)
+ return ACTION_RESYNC;
+
+ if (mddev->reshape_position != MaxSector)
+ return ACTION_RESHAPE;
+
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev_needs_recovery(rdev, MaxSector)) {
+ is_recover = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return is_recover ? ACTION_RECOVER : ACTION_IDLE;
+}
+
enum sync_action md_sync_action(struct mddev *mddev)
{
unsigned long recovery = mddev->recovery;
+ enum sync_action active_action;
/*
* frozen has the highest priority, means running sync_thread will be
@@ -4825,8 +4898,17 @@ enum sync_action md_sync_action(struct mddev *mddev)
!test_bit(MD_RECOVERY_NEEDED, &recovery))
return ACTION_IDLE;
- if (test_bit(MD_RECOVERY_RESHAPE, &recovery) ||
- mddev->reshape_position != MaxSector)
+ /*
+ * Check if any sync operation (resync/recover/reshape) is
+ * currently active. This ensures that only one sync operation
+ * can run at a time. Returns the type of active operation, or
+ * ACTION_IDLE if none are active.
+ */
+ active_action = md_get_active_sync_action(mddev);
+ if (active_action != ACTION_IDLE)
+ return active_action;
+
+ if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
return ACTION_RESHAPE;
if (test_bit(MD_RECOVERY_RECOVER, &recovery))
@@ -5091,7 +5173,7 @@ static ssize_t
sync_min_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_min(mddev),
- mddev->sync_speed_min ? "local": "system");
+ mddev->sync_speed_min ? "local" : "system");
}
static ssize_t
@@ -5100,7 +5182,7 @@ sync_min_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int min;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
min = 0;
} else {
rv = kstrtouint(buf, 10, &min);
@@ -5120,7 +5202,7 @@ static ssize_t
sync_max_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d (%s)\n", speed_max(mddev),
- mddev->sync_speed_max ? "local": "system");
+ mddev->sync_speed_max ? "local" : "system");
}
static ssize_t
@@ -5129,7 +5211,7 @@ sync_max_store(struct mddev *mddev, const char *buf, size_t len)
unsigned int max;
int rv;
- if (strncmp(buf, "system", 6)==0) {
+ if (strncmp(buf, "system", 6) == 0) {
max = 0;
} else {
rv = kstrtouint(buf, 10, &max);
@@ -5146,6 +5228,35 @@ static struct md_sysfs_entry md_sync_max =
__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
static ssize_t
+sync_io_depth_show(struct mddev *mddev, char *page)
+{
+ return sprintf(page, "%d (%s)\n", sync_io_depth(mddev),
+ mddev->sync_io_depth ? "local" : "system");
+}
+
+static ssize_t
+sync_io_depth_store(struct mddev *mddev, const char *buf, size_t len)
+{
+ unsigned int max;
+ int rv;
+
+ if (strncmp(buf, "system", 6) == 0) {
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
+ }
+ mddev->sync_io_depth = max;
+ return len;
+}
+
+static struct md_sysfs_entry md_sync_io_depth =
+__ATTR_RW(sync_io_depth);
+
+static ssize_t
degraded_show(struct mddev *mddev, char *page)
{
return sprintf(page, "%d\n", mddev->degraded);
@@ -5671,6 +5782,7 @@ static struct attribute *md_redundancy_attrs[] = {
&md_mismatches.attr,
&md_sync_min.attr,
&md_sync_max.attr,
+ &md_sync_io_depth.attr,
&md_sync_speed.attr,
&md_sync_force_parallel.attr,
&md_sync_completed.attr,
@@ -5721,19 +5833,30 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
struct mddev *mddev = container_of(kobj, struct mddev, kobj);
ssize_t rv;
+ struct kernfs_node *kn = NULL;
if (!entry->store)
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
+
+ if (entry->store == array_state_store && cmd_match(page, "clear"))
+ kn = sysfs_break_active_protection(kobj, attr);
+
spin_lock(&all_mddevs_lock);
if (!mddev_get(mddev)) {
spin_unlock(&all_mddevs_lock);
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
return -EBUSY;
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
+
+ if (kn)
+ sysfs_unbreak_active_protection(kn);
+
return rv;
}
@@ -5741,12 +5864,13 @@ static void md_kobj_release(struct kobject *ko)
{
struct mddev *mddev = container_of(ko, struct mddev, kobj);
- if (mddev->sysfs_state)
- sysfs_put(mddev->sysfs_state);
- if (mddev->sysfs_level)
- sysfs_put(mddev->sysfs_level);
-
- del_gendisk(mddev->gendisk);
+ if (legacy_async_del_gendisk) {
+ if (mddev->sysfs_state)
+ sysfs_put(mddev->sysfs_state);
+ if (mddev->sysfs_level)
+ sysfs_put(mddev->sysfs_level);
+ del_gendisk(mddev->gendisk);
+ }
put_disk(mddev->gendisk);
}
@@ -5950,6 +6074,9 @@ static int md_alloc_and_put(dev_t dev, char *name)
{
struct mddev *mddev = md_alloc(dev, name);
+ if (legacy_async_del_gendisk)
+ pr_warn("md: async del_gendisk mode will be removed in future, please upgrade to mdadm-4.5+\n");
+
if (IS_ERR(mddev))
return PTR_ERR(mddev);
mddev_put(mddev);
@@ -5995,7 +6122,7 @@ static int add_named_array(const char *val, const struct kernel_param *kp)
static void md_safemode_timeout(struct timer_list *t)
{
- struct mddev *mddev = from_timer(mddev, t, safemode_timer);
+ struct mddev *mddev = timer_container_of(mddev, t, safemode_timer);
mddev->safemode = 1;
if (mddev->external)
@@ -6172,7 +6299,7 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- err = mddev->bitmap_ops->create(mddev, -1);
+ err = mddev->bitmap_ops->create(mddev);
if (err)
pr_warn("%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
@@ -6352,7 +6479,7 @@ static void md_clean(struct mddev *mddev)
mddev->external_size = 0;
mddev->dev_sectors = 0;
mddev->raid_disks = 0;
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->reshape_position = MaxSector;
@@ -6360,15 +6487,22 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
+
/*
- * Don't clear MD_CLOSING, or mddev can be opened again.
- * 'hold_active != 0' means mddev is still in the creation
- * process and will be used later.
+ * For legacy_async_del_gendisk mode, it can stop the array in the
+ * middle of assembling it, then it still can access the array. So
+ * it needs to clear MD_CLOSING. If not legacy_async_del_gendisk,
+ * it can't open the array again after stopping it. So it doesn't
+ * clear MD_CLOSING.
*/
- if (mddev->hold_active)
- mddev->flags = 0;
- else
+ if (legacy_async_del_gendisk && mddev->hold_active) {
+ clear_bit(MD_CLOSING, &mddev->flags);
+ } else {
+ /* if UNTIL_STOP is set, it's cleared here */
+ mddev->hold_active = 0;
+ /* Don't clear MD_CLOSING, or mddev can be opened again. */
mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
+ }
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
@@ -6463,8 +6597,6 @@ static void __md_stop(struct mddev *mddev)
if (mddev->private)
pers->free(mddev, mddev->private);
mddev->private = NULL;
- if (pers->sync_request && mddev->to_remove == NULL)
- mddev->to_remove = &md_redundancy_group;
put_pers(pers);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@@ -6593,10 +6725,9 @@ static int do_md_stop(struct mddev *mddev, int mode)
mddev->bitmap_info.offset = 0;
export_array(mddev);
-
md_clean(mddev);
- if (mddev->hold_active == UNTIL_STOP)
- mddev->hold_active = 0;
+ if (!legacy_async_del_gendisk)
+ set_bit(MD_DELETED, &mddev->flags);
}
md_new_event();
sysfs_notify_dirent_safe(mddev->sysfs_state);
@@ -7232,7 +7363,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
err = 0;
if (mddev->pers) {
if (fd >= 0) {
- err = mddev->bitmap_ops->create(mddev, -1);
+ err = mddev->bitmap_ops->create(mddev);
if (!err)
err = mddev->bitmap_ops->load(mddev);
@@ -7306,9 +7437,9 @@ int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info)
* openned
*/
if (info->state & (1<<MD_SB_CLEAN))
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
else
- mddev->recovery_cp = 0;
+ mddev->resync_offset = 0;
mddev->persistent = ! info->not_persistent;
mddev->external = 0;
@@ -7548,7 +7679,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
- rv = mddev->bitmap_ops->create(mddev, -1);
+ rv = mddev->bitmap_ops->create(mddev);
if (!rv)
rv = mddev->bitmap_ops->load(mddev);
@@ -8247,7 +8378,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev)
seq_printf(seq, "\tresync=REMOTE");
return 1;
}
- if (mddev->recovery_cp < MaxSector) {
+ if (mddev->resync_offset < MaxSector) {
seq_printf(seq, "\tresync=PENDING");
return 1;
}
@@ -8572,50 +8703,55 @@ void md_cluster_stop(struct mddev *mddev)
put_cluster_ops(mddev);
}
-static int is_mddev_idle(struct mddev *mddev, int init)
+static bool is_rdev_holder_idle(struct md_rdev *rdev, bool init)
{
+ unsigned long last_events = rdev->last_events;
+
+ if (!bdev_is_partition(rdev->bdev))
+ return true;
+
+ /*
+ * If rdev is partition, and user doesn't issue IO to the array, the
+ * array is still not idle if user issues IO to other partitions.
+ */
+ rdev->last_events = part_stat_read_accum(rdev->bdev->bd_disk->part0,
+ sectors) -
+ part_stat_read_accum(rdev->bdev, sectors);
+
+ return init || rdev->last_events <= last_events;
+}
+
+/*
+ * mddev is idle if following conditions are matched since last check:
+ * 1) mddev doesn't have normal IO completed;
+ * 2) mddev doesn't have inflight normal IO;
+ * 3) if any member disk is partition, and other partitions don't have IO
+ * completed;
+ *
+ * Noted this checking rely on IO accounting is enabled.
+ */
+static bool is_mddev_idle(struct mddev *mddev, int init)
+{
+ unsigned long last_events = mddev->normal_io_events;
+ struct gendisk *disk;
struct md_rdev *rdev;
- int idle;
- int curr_events;
+ bool idle = true;
- idle = 1;
- rcu_read_lock();
- rdev_for_each_rcu(rdev, mddev) {
- struct gendisk *disk = rdev->bdev->bd_disk;
+ disk = mddev_is_dm(mddev) ? mddev->dm_gendisk : mddev->gendisk;
+ if (!disk)
+ return true;
- if (!init && !blk_queue_io_stat(disk->queue))
- continue;
+ mddev->normal_io_events = part_stat_read_accum(disk->part0, sectors);
+ if (!init && (mddev->normal_io_events > last_events ||
+ bdev_count_inflight(disk->part0)))
+ idle = false;
- curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
- atomic_read(&disk->sync_io);
- /* sync IO will cause sync_io to increase before the disk_stats
- * as sync_io is counted when a request starts, and
- * disk_stats is counted when it completes.
- * So resync activity will cause curr_events to be smaller than
- * when there was no such activity.
- * non-sync IO will cause disk_stat to increase without
- * increasing sync_io so curr_events will (eventually)
- * be larger than it was before. Once it becomes
- * substantially larger, the test below will cause
- * the array to appear non-idle, and resync will slow
- * down.
- * If there is a lot of outstanding resync activity when
- * we set last_event to curr_events, then all that activity
- * completing might cause the array to appear non-idle
- * and resync will be slowed down even though there might
- * not have been non-resync activity. This will only
- * happen once though. 'last_events' will soon reflect
- * the state where there is little or no outstanding
- * resync requests, and further resync activity will
- * always make curr_events less than last_events.
- *
- */
- if (init || curr_events - rdev->last_events > 64) {
- rdev->last_events = curr_events;
- idle = 0;
- }
- }
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev)
+ if (!is_rdev_holder_idle(rdev, init))
+ idle = false;
rcu_read_unlock();
+
return idle;
}
@@ -8741,14 +8877,14 @@ static void md_bitmap_start(struct mddev *mddev,
mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
&md_io_clone->sectors);
- mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ mddev->bitmap_ops->start_write(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
}
static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
{
- mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset,
- md_io_clone->sectors);
+ mddev->bitmap_ops->end_write(mddev, md_io_clone->offset,
+ md_io_clone->sectors);
}
static void md_end_clone_io(struct bio *bio)
@@ -8885,7 +9021,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
return mddev->resync_min;
case ACTION_RESYNC:
if (!mddev->bitmap)
- return mddev->recovery_cp;
+ return mddev->resync_offset;
return 0;
case ACTION_RESHAPE:
/*
@@ -8901,11 +9037,7 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
start = MaxSector;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < start)
+ if (rdev_needs_recovery(rdev, start))
start = rdev->recovery_offset;
rcu_read_unlock();
@@ -8927,6 +9059,23 @@ static sector_t md_sync_position(struct mddev *mddev, enum sync_action action)
}
}
+static bool sync_io_within_limit(struct mddev *mddev)
+{
+ int io_sectors;
+
+ /*
+ * For raid456, sync IO is stripe(4k) per IO, for other levels, it's
+ * RESYNC_PAGES(64k) per IO.
+ */
+ if (mddev->level == 4 || mddev->level == 5 || mddev->level == 6)
+ io_sectors = 8;
+ else
+ io_sectors = 128;
+
+ return atomic_read(&mddev->recovery_active) <
+ io_sectors * sync_io_depth(mddev);
+}
+
#define SYNC_MARKS 10
#define SYNC_MARK_STEP (3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
@@ -8976,6 +9125,11 @@ void md_do_sync(struct md_thread *thread)
}
action = md_sync_action(mddev);
+ if (action == ACTION_FROZEN || action == ACTION_IDLE) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
+ goto skip;
+ }
+
desc = md_sync_action_name(action);
mddev->last_sync_action = action;
@@ -9106,8 +9260,8 @@ void md_do_sync(struct md_thread *thread)
atomic_read(&mddev->recovery_active) == 0);
mddev->curr_resync_completed = j;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
- j > mddev->recovery_cp)
- mddev->recovery_cp = j;
+ j > mddev->resync_offset)
+ mddev->resync_offset = j;
update_time = jiffies;
set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
@@ -9195,7 +9349,8 @@ void md_do_sync(struct md_thread *thread)
msleep(500);
goto repeat;
}
- if (!is_mddev_idle(mddev, 0)) {
+ if (!sync_io_within_limit(mddev) &&
+ !is_mddev_idle(mddev, 0)) {
/*
* Give other IO more of a chance.
* The faster the devices, the less we wait.
@@ -9226,19 +9381,19 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync > MD_RESYNC_ACTIVE) {
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
- if (mddev->curr_resync >= mddev->recovery_cp) {
+ if (mddev->curr_resync >= mddev->resync_offset) {
pr_debug("md: checkpointing %s of %s.\n",
desc, mdname(mddev));
if (test_bit(MD_RECOVERY_ERROR,
&mddev->recovery))
- mddev->recovery_cp =
+ mddev->resync_offset =
mddev->curr_resync_completed;
else
- mddev->recovery_cp =
+ mddev->resync_offset =
mddev->curr_resync;
}
} else
- mddev->recovery_cp = MaxSector;
+ mddev->resync_offset = MaxSector;
} else {
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
mddev->curr_resync = MaxSector;
@@ -9246,12 +9401,8 @@ void md_do_sync(struct md_thread *thread)
test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
- if (rdev->raid_disk >= 0 &&
- mddev->delta_disks >= 0 &&
- !test_bit(Journal, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags) &&
- !test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < mddev->curr_resync)
+ if (mddev->delta_disks >= 0 &&
+ rdev_needs_recovery(rdev, mddev->curr_resync))
rdev->recovery_offset = mddev->curr_resync;
rcu_read_unlock();
}
@@ -9342,6 +9493,12 @@ static bool rdev_is_spare(struct md_rdev *rdev)
static bool rdev_addable(struct md_rdev *rdev)
{
+ struct mddev *mddev;
+
+ mddev = READ_ONCE(rdev->mddev);
+ if (!mddev)
+ return false;
+
/* rdev is already used, don't add it again. */
if (test_bit(Candidate, &rdev->flags) || rdev->raid_disk >= 0 ||
test_bit(Faulty, &rdev->flags))
@@ -9352,7 +9509,7 @@ static bool rdev_addable(struct md_rdev *rdev)
return true;
/* Allow to add if array is read-write. */
- if (md_is_rdwr(rdev->mddev))
+ if (md_is_rdwr(mddev))
return true;
/*
@@ -9380,17 +9537,11 @@ static bool md_spares_need_change(struct mddev *mddev)
return false;
}
-static int remove_and_add_spares(struct mddev *mddev,
- struct md_rdev *this)
+static int remove_spares(struct mddev *mddev, struct md_rdev *this)
{
struct md_rdev *rdev;
- int spares = 0;
int removed = 0;
- if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
- /* Mustn't remove devices when resync thread is running */
- return 0;
-
rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
!mddev->pers->hot_remove_disk(mddev, rdev)) {
@@ -9404,6 +9555,21 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
+ return removed;
+}
+
+static int remove_and_add_spares(struct mddev *mddev,
+ struct md_rdev *this)
+{
+ struct md_rdev *rdev;
+ int spares = 0;
+ int removed = 0;
+
+ if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ /* Mustn't remove devices when resync thread is running */
+ return 0;
+
+ removed = remove_spares(mddev, this);
if (this && removed)
goto no_add;
@@ -9445,7 +9611,8 @@ static bool md_choose_sync_action(struct mddev *mddev, int *spares)
}
/* Check if resync is in progress. */
- if (mddev->recovery_cp < MaxSector) {
+ if (mddev->resync_offset < MaxSector) {
+ remove_spares(mddev, NULL);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
return true;
@@ -9625,7 +9792,7 @@ void md_check_recovery(struct mddev *mddev)
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
(mddev->external == 0 && mddev->safemode == 1) ||
(mddev->safemode == 2
- && !mddev->in_sync && mddev->recovery_cp == MaxSector)
+ && !mddev->in_sync && mddev->resync_offset == MaxSector)
))
return;
@@ -9682,8 +9849,8 @@ void md_check_recovery(struct mddev *mddev)
* remove disk.
*/
rdev_for_each_safe(rdev, tmp, mddev) {
- if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
- rdev->raid_disk < 0)
+ if (rdev->raid_disk < 0 &&
+ test_and_clear_bit(ClusterRemove, &rdev->flags))
md_kick_rdev_from_array(rdev);
}
}
@@ -9989,8 +10156,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
/* Check for change of roles in the active devices */
rdev_for_each_safe(rdev2, tmp, mddev) {
- if (test_bit(Faulty, &rdev2->flags))
+ if (test_bit(Faulty, &rdev2->flags)) {
+ if (test_bit(ClusterRemove, &rdev2->flags))
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
continue;
+ }
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
@@ -10288,6 +10458,7 @@ module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
+module_param(legacy_async_del_gendisk, bool, 0600);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");