From eae1701fbd264cfc7efbaf7cd4cd999760070e27 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:23 -0800 Subject: [PATCH] md: initial sysfs support for md Start using kobjects in mddevs, and provide a couple of simple attributes (level and disks). Attributes live in /sys/block/mdX/md/attr-name Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9ecf51ee596f..a68ad8547325 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -181,7 +181,7 @@ static void mddev_put(mddev_t *mddev) if (!mddev->raid_disks && list_empty(&mddev->disks)) { list_del(&mddev->all_mddevs); blk_put_queue(mddev->queue); - kfree(mddev); + kobject_unregister(&mddev->kobj); } spin_unlock(&all_mddevs_lock); } @@ -1551,6 +1551,85 @@ static void analyze_sbs(mddev_t * mddev) } +struct md_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mddev_t *, char *); + ssize_t (*store)(mddev_t *, const char *, size_t); +}; + +static ssize_t +md_show_level(mddev_t *mddev, char *page) +{ + mdk_personality_t *p = mddev->pers; + if (p == NULL) + return 0; + if (mddev->level >= 0) + return sprintf(page, "RAID-%d\n", mddev->level); + else + return sprintf(page, "%s\n", p->name); +} + +static struct md_sysfs_entry md_level = { + .attr = {.name = "level", .mode = S_IRUGO }, + .show = md_show_level, +}; + +static ssize_t +md_show_rdisks(mddev_t *mddev, char *page) +{ + return sprintf(page, "%d\n", mddev->raid_disks); +} + +static struct md_sysfs_entry md_raid_disks = { + .attr = {.name = "raid_disks", .mode = S_IRUGO }, + .show = md_show_rdisks, +}; + +static struct attribute *md_default_attrs[] = { + &md_level.attr, + &md_raid_disks.attr, + NULL, +}; + +static ssize_t +md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); + mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + + if (!entry->show) + return -EIO; + return entry->show(mddev, page); +} + +static ssize_t +md_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); + mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + + if (!entry->store) + return -EIO; + return entry->store(mddev, page, length); +} + +static void md_free(struct kobject *ko) +{ + mddev_t *mddev = container_of(ko, mddev_t, kobj); + kfree(mddev); +} + +static struct sysfs_ops md_sysfs_ops = { + .show = md_attr_show, + .store = md_attr_store, +}; +static struct kobj_type md_ktype = { + .release = md_free, + .sysfs_ops = &md_sysfs_ops, + .default_attrs = md_default_attrs, +}; + int mdp_major = 0; static struct kobject *md_probe(dev_t dev, int *part, void *data) @@ -1592,6 +1671,11 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) add_disk(disk); mddev->gendisk = disk; up(&disks_sem); + mddev->kobj.parent = kobject_get(&disk->kobj); + mddev->kobj.k_name = NULL; + snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); + mddev->kobj.ktype = &md_ktype; + kobject_register(&mddev->kobj); return NULL; } -- cgit From 86e6ffdd243a06663713e637ee683fb27dce8e0c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:24 -0800 Subject: [PATCH] md: extend md sysfs support to component devices. Each device in an md array how has a corresponding /sys/block/mdX/md/devNN/ directory which can contain attributes. Currently there is only 'state' which summarises the state, nd 'super' which has a copy of the superblock, and 'block' which is a symlink to the block device. Also, /sys/block/mdX/md/rdNN represents slot 'NN' in the array, and is a symlink to the relevant 'devNN'. Obviously spare devices do not have a slot in the array, and so don't have such a symlink. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 160 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index a68ad8547325..74520b50c307 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -711,6 +711,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) */ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; + unsigned int fixdesc=0; rdev->sb_size = MD_SB_BYTES; @@ -758,16 +759,28 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->disks[0].state = (1<raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) - rdev2->desc_nr = rdev2->raid_disk; + desc_nr = rdev2->raid_disk; else - rdev2->desc_nr = next_spare++; + desc_nr = next_spare++; + if (desc_nr != rdev2->desc_nr) { + fixdesc |= (1 << desc_nr); + rdev2->desc_nr = desc_nr; + if (rdev2->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev2->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + sysfs_remove_link(&rdev2->kobj, "block"); + kobject_del(&rdev2->kobj); + } d = &sb->disks[rdev2->desc_nr]; nr_disks++; d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && rdev->in_sync && !rdev2->faulty) + if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ @@ -787,7 +800,22 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<desc_nr)) { + snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", + rdev2->desc_nr); + kobject_add(&rdev2->kobj); + sysfs_create_link(&rdev2->kobj, + &rdev2->bdev->bd_disk->kobj, + "block"); + if (rdev2->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev2->raid_disk); + sysfs_create_link(&mddev->kobj, + &rdev2->kobj, nm); + } + } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { mdp_disk_t *d = &sb->disks[i]; @@ -1147,6 +1175,13 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) list_add(&rdev->same_set, &mddev->disks); rdev->mddev = mddev; printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); + + rdev->kobj.k_name = NULL; + snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); + rdev->kobj.parent = kobject_get(&mddev->kobj); + kobject_add(&rdev->kobj); + + sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); return 0; } @@ -1160,6 +1195,8 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) list_del_init(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; + sysfs_remove_link(&rdev->kobj, "block"); + kobject_del(&rdev->kobj); } /* @@ -1215,7 +1252,7 @@ static void export_rdev(mdk_rdev_t * rdev) md_autodetect_dev(rdev->bdev->bd_dev); #endif unlock_rdev(rdev); - kfree(rdev); + kobject_put(&rdev->kobj); } static void kick_rdev_from_array(mdk_rdev_t * rdev) @@ -1414,6 +1451,94 @@ repeat: } +struct rdev_sysfs_entry { + struct attribute attr; + ssize_t (*show)(mdk_rdev_t *, char *); + ssize_t (*store)(mdk_rdev_t *, const char *, size_t); +}; + +static ssize_t +rdev_show_state(mdk_rdev_t *rdev, char *page) +{ + char *sep = ""; + int len=0; + + if (rdev->faulty) { + len+= sprintf(page+len, "%sfaulty",sep); + sep = ","; + } + if (rdev->in_sync) { + len += sprintf(page+len, "%sin_sync",sep); + sep = ","; + } + if (!rdev->faulty && !rdev->in_sync) { + len += sprintf(page+len, "%sspare", sep); + sep = ","; + } + return len+sprintf(page+len, "\n"); +} + +static struct rdev_sysfs_entry rdev_state = { + .attr = {.name = "state", .mode = S_IRUGO }, + .show = rdev_show_state, +}; + +static ssize_t +rdev_show_super(mdk_rdev_t *rdev, char *page) +{ + if (rdev->sb_loaded && rdev->sb_size) { + memcpy(page, page_address(rdev->sb_page), rdev->sb_size); + return rdev->sb_size; + } else + return 0; +} +static struct rdev_sysfs_entry rdev_super = { + .attr = {.name = "super", .mode = S_IRUGO }, + .show = rdev_show_super, +}; +static struct attribute *rdev_default_attrs[] = { + &rdev_state.attr, + &rdev_super.attr, + NULL, +}; +static ssize_t +rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + + if (!entry->show) + return -EIO; + return entry->show(rdev, page); +} + +static ssize_t +rdev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); + mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); + + if (!entry->store) + return -EIO; + return entry->store(rdev, page, length); +} + +static void rdev_free(struct kobject *ko) +{ + mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); + kfree(rdev); +} +static struct sysfs_ops rdev_sysfs_ops = { + .show = rdev_attr_show, + .store = rdev_attr_store, +}; +static struct kobj_type rdev_ktype = { + .release = rdev_free, + .sysfs_ops = &rdev_sysfs_ops, + .default_attrs = rdev_default_attrs, +}; + /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -1445,6 +1570,10 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi if (err) goto abort_free; + rdev->kobj.parent = NULL; + rdev->kobj.ktype = &rdev_ktype; + kobject_init(&rdev->kobj); + rdev->desc_nr = -1; rdev->faulty = 0; rdev->in_sync = 0; @@ -1820,6 +1949,13 @@ static int do_md_run(mddev_t * mddev) mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */ mddev->in_sync = 1; + + ITERATE_RDEV(mddev,rdev,tmp) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -1941,9 +2077,18 @@ static int do_md_stop(mddev_t * mddev, int ro) * Free resources if final stop */ if (!ro) { + mdk_rdev_t *rdev; + struct list_head *tmp; struct gendisk *disk; printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); + ITERATE_RDEV(mddev,rdev,tmp) + if (rdev->raid_disk >= 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + } + export_array(mddev); mddev->array_size = 0; @@ -3962,17 +4107,24 @@ void md_check_recovery(mddev_t *mddev) if (rdev->raid_disk >= 0 && (rdev->faulty || ! rdev->in_sync) && atomic_read(&rdev->nr_pending)==0) { - if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) + if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { + char nm[20]; + sprintf(nm,"rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); rdev->raid_disk = -1; + } } if (mddev->degraded) { ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk < 0 && !rdev->faulty) { - if (mddev->pers->hot_add_disk(mddev,rdev)) + if (mddev->pers->hot_add_disk(mddev,rdev)) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); spares++; - else + } else break; } } -- cgit From 24dd469d728dae07f40c5d79ea6dedd38cdf1a30 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:26 -0800 Subject: [PATCH] md: allow a manual resync with md You can trigger a 'check' with echo check > /sys/block/mdX/md/scan_mode or a check-and-repair errors with echo repair > /sys/block/mdX/md/scan_mode and read the current state from the same file. Note: personalities need to know the different between 'check' and 'repair', but don't yet. Until they do, 'check' will be the same as 'repair' and will just do a normal resync pass. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 9 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 74520b50c307..37400873b879 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1714,9 +1714,60 @@ static struct md_sysfs_entry md_raid_disks = { .show = md_show_rdisks, }; +static ssize_t +md_show_scan(mddev_t *mddev, char *page) +{ + char *type = "none"; + if (mddev->recovery & + ((1<recovery & (1<recovery)) + type = "resync"; + else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + type = "check"; + else + type = "repair"; + } else + type = "recover"; + } + return sprintf(page, "%s\n", type); +} + +static ssize_t +md_store_scan(mddev_t *mddev, const char *page, size_t len) +{ + int canscan=0; + if (mddev->recovery & + ((1<reconfig_sem); + if (mddev->pers && mddev->pers->sync_request) + canscan=1; + up(&mddev->reconfig_sem); + if (!canscan) + return -EINVAL; + + if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + return len; +} + +static struct md_sysfs_entry md_scan_mode = { + .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, + .show = md_show_scan, + .store = md_store_scan, +}; + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, + &md_scan_mode.attr, NULL, }; @@ -3855,7 +3906,8 @@ static void md_do_sync(mddev_t *mddev) is_mddev_idle(mddev); /* this also initializes IO event counters */ /* we don't use the checkpoint if there's a bitmap */ - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap + && ! test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->recovery_cp; else j = 0; @@ -4093,9 +4145,13 @@ void md_check_recovery(mddev_t *mddev) set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } - if (mddev->recovery) - /* probably just the RECOVERY_NEEDED flag */ - mddev->recovery = 0; + /* Clear some bits that don't mean anything, but + * might be left set + */ + clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + clear_bit(MD_RECOVERY_ERR, &mddev->recovery); + clear_bit(MD_RECOVERY_INTR, &mddev->recovery); + clear_bit(MD_RECOVERY_DONE, &mddev->recovery); /* no recovery is running. * remove any failed drives, then @@ -4129,14 +4185,17 @@ void md_check_recovery(mddev_t *mddev) } } - if (!spares && (mddev->recovery_cp == MaxSector )) { - /* nothing we can do ... */ + if (spares) { + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + } else if (mddev->recovery_cp < MaxSector) { + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* nothing to be done ... */ goto unlock; - } + if (mddev->pers->sync_request) { set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - if (!spares) - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); if (spares && mddev->bitmap && ! mddev->bitmap->file) { /* We are adding a device or devices to an array * which has the bitmap stored on all devices. -- cgit From 9d88883e68f404d5581bd391713ceef470ea53a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:26 -0800 Subject: [PATCH] md: teach raid5 the difference between 'check' and 'repair'. With this, raid5 can be asked to check parity without repairing it. It also keeps a count of the number of incorrect parity blocks found (mismatches) and reports them through sysfs. Signed-off-by: Neil Brown Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 37400873b879..e58d61d9f31b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1758,16 +1758,29 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) return len; } +static ssize_t +md_show_mismatch(mddev_t *mddev, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long) mddev->resync_mismatches); +} + static struct md_sysfs_entry md_scan_mode = { .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, .show = md_show_scan, .store = md_store_scan, }; +static struct md_sysfs_entry md_mismatches = { + .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, + .show = md_show_mismatch, +}; + static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, &md_scan_mode.attr, + &md_mismatches.attr, NULL, }; @@ -3888,12 +3901,13 @@ static void md_do_sync(mddev_t *mddev) } } while (mddev->curr_resync < 2); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { /* resync follows the size requested by the personality, * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - else + mddev->resync_mismatches = 0; + } else /* recovery follows the physical size of devices */ max_sectors = mddev->size << 1; -- cgit From 9c79197761b4c181a143dc6a6044f4e47d44bdcc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:28 -0800 Subject: [PATCH] md: fix ref-counting problems with kobjects in md Thanks Greg. Cc: Greg KH Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index e58d61d9f31b..fe0137a5b002 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -805,7 +805,11 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (fixdesc & (1<desc_nr)) { snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev2->desc_nr); + /* kobject_add gets a ref on the parent, so + * we have to drop the one we already have + */ kobject_add(&rdev2->kobj); + kobject_put(rdev->kobj.parent); sysfs_create_link(&rdev2->kobj, &rdev2->bdev->bd_disk->kobj, "block"); @@ -1178,7 +1182,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) rdev->kobj.k_name = NULL; snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); - rdev->kobj.parent = kobject_get(&mddev->kobj); + rdev->kobj.parent = &mddev->kobj; kobject_add(&rdev->kobj); sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); @@ -1864,7 +1868,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data) add_disk(disk); mddev->gendisk = disk; up(&disks_sem); - mddev->kobj.parent = kobject_get(&disk->kobj); + mddev->kobj.parent = &disk->kobj; mddev->kobj.k_name = NULL; snprintf(mddev->kobj.name, KOBJ_NAME_LEN, "%s", "md"); mddev->kobj.ktype = &md_ktype; -- cgit From 31399d9e56abeec4d819f07eefc97f30b5d5ed75 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:29 -0800 Subject: [PATCH] md: minor MD fixes 1/ Use reduce stack usage, because 'gcc' apparently doesn't overlay different variables that are in separate scopes... 2/ Use test_bit instead of ( .. & 1<< ..) which in this case is buggy. Thanks to Andrew Morton Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index fe0137a5b002..013f2f27589c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -698,6 +698,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) struct list_head *tmp; mdk_rdev_t *rdev2; int next_spare = mddev->raid_disks; + char nm[20]; /* make rdev->sb match mddev data.. * @@ -768,7 +769,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) fixdesc |= (1 << desc_nr); rdev2->desc_nr = desc_nr; if (rdev2->raid_disk >= 0) { - char nm[20]; sprintf(nm, "rd%d", rdev2->raid_disk); sysfs_remove_link(&mddev->kobj, nm); } @@ -814,7 +814,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) &rdev2->bdev->bd_disk->kobj, "block"); if (rdev2->raid_disk >= 0) { - char nm[20]; sprintf(nm, "rd%d", rdev2->raid_disk); sysfs_create_link(&mddev->kobj, &rdev2->kobj, nm); @@ -1722,9 +1721,9 @@ static ssize_t md_show_scan(mddev_t *mddev, char *page) { char *type = "none"; - if (mddev->recovery & - ((1<recovery & (1<recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) type = "resync"; else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) @@ -1741,8 +1740,9 @@ static ssize_t md_store_scan(mddev_t *mddev, const char *page, size_t len) { int canscan=0; - if (mddev->recovery & - ((1<recovery) || + test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; down(&mddev->reconfig_sem); if (mddev->pers && mddev->pers->sync_request) -- cgit From 007583c9253fed363a0bd71b039e9b40a0f6855e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:30 -0800 Subject: [PATCH] md: change raid5 sysfs attribute to not create a new directory There isn't really a need for raid5 attributes to be an a subdirectory, so this patch moves them from /sys/block/mdX/md/raid5/attribute to /sys/block/mdX/md/attribute This suggests that all md personalities should co-operate about namespace usage, but that shouldn't be a problem. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 013f2f27589c..3db5c3513072 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1683,12 +1683,6 @@ static void analyze_sbs(mddev_t * mddev) } -struct md_sysfs_entry { - struct attribute attr; - ssize_t (*show)(mddev_t *, char *); - ssize_t (*store)(mddev_t *, const char *, size_t); -}; - static ssize_t md_show_level(mddev_t *mddev, char *page) { -- cgit From ba22dcbf106338a5c46d6979f9b19564faae3d49 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: improvements to raid5 handling of read errors Two refinements to the 'attempt-overwrite-on-read-error' mechanism. 1/ If the array is read-only, don't attempt an over-write. 2/ If there are more than max_nr_stripes read errors on a device with no success, fail the drive. This will make sure a dead drive will be eventually kicked even when we aren't trying to rewrite (which would normally kick a dead drive more quickly. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db5c3513072..3fb80397f8aa 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1582,6 +1582,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->in_sync = 0; rdev->data_offset = 0; atomic_set(&rdev->nr_pending, 0); + atomic_set(&rdev->read_errors, 0); size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; if (!size) { -- cgit From b2d444d7ad975d555bb919601bcdc0e58975a40e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:31 -0800 Subject: [PATCH] md: convert 'faulty' and 'in_sync' fields to bits in 'flags' field This has the advantage of removing the confusion caused by 'rdev_t' and 'mddev_t' both having 'in_sync' fields. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 92 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 45 insertions(+), 47 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 3fb80397f8aa..9dfa063d1c6a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -610,7 +610,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); rdev->raid_disk = -1; - rdev->in_sync = 0; + rdev->flags = 0; if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; @@ -671,21 +671,19 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) return 0; if (mddev->level != LEVEL_MULTIPATH) { - rdev->faulty = 0; - rdev->flags = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<faulty = 1; + set_bit(Faulty, &rdev->flags); else if (desc->state & (1<raid_disk < mddev->raid_disks) { - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); rdev->raid_disk = desc->raid_disk; } if (desc->state & (1<flags); } else /* MULTIPATH are always insync */ - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); return 0; } @@ -761,7 +759,8 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) ITERATE_RDEV(mddev,rdev2,tmp) { mdp_disk_t *d; int desc_nr; - if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) + && !test_bit(Faulty, &rdev2->flags)) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; @@ -780,14 +779,15 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) d->number = rdev2->desc_nr; d->major = MAJOR(rdev2->bdev->bd_dev); d->minor = MINOR(rdev2->bdev->bd_dev); - if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty) + if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) + && !test_bit(Faulty, &rdev2->flags)) d->raid_disk = rdev2->raid_disk; else d->raid_disk = rdev2->desc_nr; /* compatibility */ - if (rdev2->faulty) { + if (test_bit(Faulty, &rdev2->flags)) { d->state = (1<in_sync) { + } else if (test_bit(In_sync, &rdev2->flags)) { d->state = (1<state |= (1<sb_page); rdev->raid_disk = -1; - rdev->in_sync = 0; + rdev->flags = 0; if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; @@ -1027,22 +1027,19 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ - rdev->faulty = 0; break; case 0xfffe: /* faulty */ - rdev->faulty = 1; + set_bit(Faulty, &rdev->flags); break; default: - rdev->in_sync = 1; - rdev->faulty = 0; + set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; break; } - rdev->flags = 0; if (sb->devflags & WriteMostly1) set_bit(WriteMostly, &rdev->flags); } else /* MULTIPATH are always insync */ - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); return 0; } @@ -1086,9 +1083,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) ITERATE_RDEV(mddev,rdev2,tmp) { i = rdev2->desc_nr; - if (rdev2->faulty) + if (test_bit(Faulty, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(0xfffe); - else if (rdev2->in_sync) + else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1327,7 +1324,8 @@ static void print_rdev(mdk_rdev_t *rdev) char b[BDEVNAME_SIZE]; printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n", bdevname(rdev->bdev,b), (unsigned long long)rdev->size, - rdev->faulty, rdev->in_sync, rdev->desc_nr); + test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), + rdev->desc_nr); if (rdev->sb_loaded) { printk(KERN_INFO "md: rdev superblock:\n"); print_sb((mdp_super_t*)page_address(rdev->sb_page)); @@ -1421,11 +1419,11 @@ repeat: ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) dprintk("(skipping faulty "); dprintk("%s ", bdevname(rdev->bdev,b)); - if (!rdev->faulty) { + if (!test_bit(Faulty, &rdev->flags)) { md_super_write(mddev,rdev, rdev->sb_offset<<1, rdev->sb_size, rdev->sb_page); @@ -1466,15 +1464,16 @@ rdev_show_state(mdk_rdev_t *rdev, char *page) char *sep = ""; int len=0; - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (rdev->in_sync) { + if (test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (!rdev->faulty && !rdev->in_sync) { + if (!test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } @@ -1578,8 +1577,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi kobject_init(&rdev->kobj); rdev->desc_nr = -1; - rdev->faulty = 0; - rdev->in_sync = 0; + rdev->flags = 0; rdev->data_offset = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); @@ -1670,7 +1668,7 @@ static void analyze_sbs(mddev_t * mddev) if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; - rdev->in_sync = 1; + set_bit(In_sync, &rdev->flags); } } @@ -1939,7 +1937,7 @@ static int do_md_run(mddev_t * mddev) /* devices must have minimum size of one chunk */ ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) continue; if (rdev->size < chunk_size / 1024) { printk(KERN_WARNING @@ -1967,7 +1965,7 @@ static int do_md_run(mddev_t * mddev) * Also find largest hardsector size */ ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) continue; sync_blockdev(rdev->bdev); invalidate_bdev(rdev->bdev, 0); @@ -2304,7 +2302,7 @@ static int autostart_array(dev_t startdev) return err; } - if (start_rdev->faulty) { + if (test_bit(Faulty, &start_rdev->flags)) { printk(KERN_WARNING "md: can not autostart based on faulty %s!\n", bdevname(start_rdev->bdev,b)); @@ -2363,11 +2361,11 @@ static int get_array_info(mddev_t * mddev, void __user * arg) nr=working=active=failed=spare=0; ITERATE_RDEV(mddev,rdev,tmp) { nr++; - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) failed++; else { working++; - if (rdev->in_sync) + if (test_bit(In_sync, &rdev->flags)) active++; else spare++; @@ -2458,9 +2456,9 @@ static int get_disk_info(mddev_t * mddev, void __user * arg) info.minor = MINOR(rdev->bdev->bd_dev); info.raid_disk = rdev->raid_disk; info.state = 0; - if (rdev->faulty) + if (test_bit(Faulty, &rdev->flags)) info.state |= (1<in_sync) { + else if (test_bit(In_sync, &rdev->flags)) { info.state |= (1<saved_raid_disk = rdev->raid_disk; - rdev->in_sync = 0; /* just to be sure */ + clear_bit(In_sync, &rdev->flags); /* just to be sure */ if (info->state & (1<flags); @@ -2591,11 +2589,11 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) else rdev->raid_disk = -1; - rdev->faulty = 0; + rdev->flags = 0; + if (rdev->raid_disk < mddev->raid_disks) - rdev->in_sync = (info->state & (1<in_sync = 0; + if (info->state & (1<flags); if (info->state & (1<flags); @@ -2694,14 +2692,14 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) goto abort_export; } - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { printk(KERN_WARNING "md: can not hot-add faulty %s disk to %s!\n", bdevname(rdev->bdev,b), mdname(mddev)); err = -EINVAL; goto abort_export; } - rdev->in_sync = 0; + clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; bind_rdev_to_array(rdev, mddev); @@ -3428,7 +3426,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) return; } - if (!rdev || rdev->faulty) + if (!rdev || test_bit(Faulty, &rdev->flags)) return; /* dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", @@ -3626,7 +3624,7 @@ static int md_seq_show(struct seq_file *seq, void *v) bdevname(rdev->bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; } else if (rdev->raid_disk < 0) @@ -4174,7 +4172,7 @@ void md_check_recovery(mddev_t *mddev) */ ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk >= 0 && - (rdev->faulty || ! rdev->in_sync) && + (test_bit(Faulty, &rdev->flags) || ! test_bit(In_sync, &rdev->flags)) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0) { char nm[20]; @@ -4187,7 +4185,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->degraded) { ITERATE_RDEV(mddev,rdev,rtmp) if (rdev->raid_disk < 0 - && !rdev->faulty) { + && !test_bit(Faulty, &rdev->flags)) { if (mddev->pers->hot_add_disk(mddev,rdev)) { char nm[20]; sprintf(nm, "rd%d", rdev->raid_disk); @@ -4347,7 +4345,7 @@ static void autostart_arrays(int part) if (IS_ERR(rdev)) continue; - if (rdev->faulty) { + if (test_bit(Faulty, &rdev->flags)) { MD_BUG(); continue; } -- cgit From bd926c63b7a6843d3ce2728396c0891e54fce5c4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:32 -0800 Subject: [PATCH] md: make md on-disk bitmaps not host-endian Current bitmaps use set_bit et.al and so are host-endian, which means not-portable. Oops. Define a new version number (4) for which bitmaps are little-endian. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9dfa063d1c6a..caa4add00c1b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4281,7 +4281,7 @@ static int __init md_init(void) " MD_SB_DISKS=%d\n", MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); - printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, + printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR_HI, BITMAP_MINOR); if (register_blkdev(MAJOR_NR, "md")) -- cgit From a9701a30470856408d08657eb1bd7ae29a146190 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:34 -0800 Subject: [PATCH] md: support BIO_RW_BARRIER for md/raid1 We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.... So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assumes barriers are OK. When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly. If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag. When writing the real raid1, write requests which were BIO_RW_BARRIER but which aresn't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry needed) we remove it from the r1bio, so that devices needing retry are easy to find. We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if: 1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can! or 2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock. Signed-off-by: Neil Brown Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 16 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index caa4add00c1b..199016932de5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -330,18 +330,46 @@ static void free_disk_sb(mdk_rdev_t * rdev) static int super_written(struct bio *bio, unsigned int bytes_done, int error) { mdk_rdev_t *rdev = bio->bi_private; + mddev_t *mddev = rdev->mddev; if (bio->bi_size) return 1; if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) - md_error(rdev->mddev, rdev); + md_error(mddev, rdev); - if (atomic_dec_and_test(&rdev->mddev->pending_writes)) - wake_up(&rdev->mddev->sb_wait); + if (atomic_dec_and_test(&mddev->pending_writes)) + wake_up(&mddev->sb_wait); bio_put(bio); return 0; } +static int super_written_barrier(struct bio *bio, unsigned int bytes_done, int error) +{ + struct bio *bio2 = bio->bi_private; + mdk_rdev_t *rdev = bio2->bi_private; + mddev_t *mddev = rdev->mddev; + if (bio->bi_size) + return 1; + + if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + error == -EOPNOTSUPP) { + unsigned long flags; + /* barriers don't appear to be supported :-( */ + set_bit(BarriersNotsupp, &rdev->flags); + mddev->barriers_work = 0; + spin_lock_irqsave(&mddev->write_lock, flags); + bio2->bi_next = mddev->biolist; + mddev->biolist = bio2; + spin_unlock_irqrestore(&mddev->write_lock, flags); + wake_up(&mddev->sb_wait); + bio_put(bio); + return 0; + } + bio_put(bio2); + bio->bi_private = rdev; + return super_written(bio, bytes_done, error); +} + void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page) { @@ -350,16 +378,54 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, * and decrement it on completion, waking up sb_wait * if zero is reached. * If an error occurred, call md_error + * + * As we might need to resubmit the request if BIO_RW_BARRIER + * causes ENOTSUPP, we allocate a spare bio... */ struct bio *bio = bio_alloc(GFP_NOIO, 1); + int rw = (1<bi_bdev = rdev->bdev; bio->bi_sector = sector; bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; + bio->bi_rw = rw; + atomic_inc(&mddev->pending_writes); - submit_bio((1<flags)) { + struct bio *rbio; + rw |= (1<bi_private = bio; + rbio->bi_end_io = super_written_barrier; + submit_bio(rw, rbio); + } else + submit_bio(rw, bio); +} + +void md_super_wait(mddev_t *mddev) +{ + /* wait for all superblock writes that were scheduled to complete. + * if any had to be retried (due to BARRIER problems), retry them + */ + DEFINE_WAIT(wq); + for(;;) { + prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); + if (atomic_read(&mddev->pending_writes)==0) + break; + while (mddev->biolist) { + struct bio *bio; + spin_lock_irq(&mddev->write_lock); + bio = mddev->biolist; + mddev->biolist = bio->bi_next ; + bio->bi_next = NULL; + spin_unlock_irq(&mddev->write_lock); + submit_bio(bio->bi_rw, bio); + } + schedule(); + } + finish_wait(&mddev->sb_wait, &wq); } static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) @@ -1382,7 +1448,7 @@ static void md_update_sb(mddev_t * mddev) int sync_req; repeat: - spin_lock(&mddev->write_lock); + spin_lock_irq(&mddev->write_lock); sync_req = mddev->in_sync; mddev->utime = get_seconds(); mddev->events ++; @@ -1405,11 +1471,11 @@ repeat: */ if (!mddev->persistent) { mddev->sb_dirty = 0; - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); return; } - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); dprintk(KERN_INFO "md: updating %s RAID superblock on device (in sync %d)\n", @@ -1437,17 +1503,17 @@ repeat: /* only need to write one superblock... */ break; } - wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + md_super_wait(mddev); /* if there was a failure, sb_dirty was set to 1, and we re-write super */ - spin_lock(&mddev->write_lock); + spin_lock_irq(&mddev->write_lock); if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { /* have to write it out again */ - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); goto repeat; } mddev->sb_dirty = 0; - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); wake_up(&mddev->sb_wait); } @@ -1989,6 +2055,7 @@ static int do_md_run(mddev_t * mddev) mddev->recovery = 0; mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ + mddev->barriers_work = 1; /* before we start the array running, initialise the bitmap */ err = bitmap_create(mddev); @@ -2107,7 +2174,7 @@ static int do_md_stop(mddev_t * mddev, int ro) mddev->ro = 1; } else { bitmap_flush(mddev); - wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + md_super_wait(mddev); if (mddev->ro) set_disk_ro(disk, 0); blk_queue_make_request(mddev->queue, md_fail_request); @@ -3796,13 +3863,13 @@ void md_write_start(mddev_t *mddev, struct bio *bi) atomic_inc(&mddev->writes_pending); if (mddev->in_sync) { - spin_lock(&mddev->write_lock); + spin_lock_irq(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; mddev->sb_dirty = 1; md_wakeup_thread(mddev->thread); } - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); } wait_event(mddev->sb_wait, mddev->sb_dirty==0); } @@ -4112,7 +4179,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev_trylock(mddev)==0) { int spares =0; - spin_lock(&mddev->write_lock); + spin_lock_irq(&mddev->write_lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && mddev->recovery_cp == MaxSector) { mddev->in_sync = 1; @@ -4120,7 +4187,7 @@ void md_check_recovery(mddev_t *mddev) } if (mddev->safemode == 1) mddev->safemode = 0; - spin_unlock(&mddev->write_lock); + spin_unlock_irq(&mddev->write_lock); if (mddev->sb_dirty) md_update_sb(mddev); -- cgit From 19133a4298223422742db7b5f940e8c54c6a1846 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:35 -0800 Subject: [PATCH] md: Remove attempt to use dynamic names in sysfs for component devices on an MD array. With version-0.90 superblock, component devices on an md device to not have any stable name related to the array -(version-1 assigns a fixed index when a device is added to an array, and this remains despit any hot-swap). The intial code for making these devices appear in sysfs used dynamic names, which would change whenever a hot-spare was swapped for a failed or missing device. This turns out not to be practical in sysfs for a number of reasons. This patch changes then naming of component devices to be based on the result of 'bdevname'. This is stable and should be unique. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 199016932de5..8238c1c0ec09 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -762,7 +762,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) struct list_head *tmp; mdk_rdev_t *rdev2; int next_spare = mddev->raid_disks; - char nm[20]; + /* make rdev->sb match mddev data.. * @@ -776,7 +776,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) */ int i; int active=0, working=0,failed=0,spare=0,nr_disks=0; - unsigned int fixdesc=0; rdev->sb_size = MD_SB_BYTES; @@ -830,16 +829,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) desc_nr = rdev2->raid_disk; else desc_nr = next_spare++; - if (desc_nr != rdev2->desc_nr) { - fixdesc |= (1 << desc_nr); - rdev2->desc_nr = desc_nr; - if (rdev2->raid_disk >= 0) { - sprintf(nm, "rd%d", rdev2->raid_disk); - sysfs_remove_link(&mddev->kobj, nm); - } - sysfs_remove_link(&rdev2->kobj, "block"); - kobject_del(&rdev2->kobj); - } + rdev2->desc_nr = desc_nr; d = &sb->disks[rdev2->desc_nr]; nr_disks++; d->number = rdev2->desc_nr; @@ -866,25 +856,6 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (test_bit(WriteMostly, &rdev2->flags)) d->state |= (1<desc_nr)) { - snprintf(rdev2->kobj.name, KOBJ_NAME_LEN, "dev%d", - rdev2->desc_nr); - /* kobject_add gets a ref on the parent, so - * we have to drop the one we already have - */ - kobject_add(&rdev2->kobj); - kobject_put(rdev->kobj.parent); - sysfs_create_link(&rdev2->kobj, - &rdev2->bdev->bd_disk->kobj, - "block"); - if (rdev2->raid_disk >= 0) { - sprintf(nm, "rd%d", rdev2->raid_disk); - sysfs_create_link(&mddev->kobj, - &rdev2->kobj, nm); - } - } /* now set the "removed" and "faulty" bits on any missing devices */ for (i=0 ; i < mddev->raid_disks ; i++) { mdp_disk_t *d = &sb->disks[i]; @@ -1237,13 +1208,14 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) if (find_rdev_nr(mddev, rdev->desc_nr)) return -EBUSY; } + bdevname(rdev->bdev,b); + if (kobject_set_name(&rdev->kobj, "dev-%s", b) < 0) + return -ENOMEM; list_add(&rdev->same_set, &mddev->disks); rdev->mddev = mddev; - printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b)); + printk(KERN_INFO "md: bind<%s>\n", b); - rdev->kobj.k_name = NULL; - snprintf(rdev->kobj.name, KOBJ_NAME_LEN, "dev%d", rdev->desc_nr); rdev->kobj.parent = &mddev->kobj; kobject_add(&rdev->kobj); -- cgit From f91de92ed6bfb70a3ff607558c910c7bf34d45e9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:36 -0800 Subject: [PATCH] md: allow md arrays to be started read-only (module parameter). When an md array is started, the superblock will be written, and resync may commense. This is not good if you want to be completely read-only as, for example, when preparing to resume from a suspend-to-disk image. So introduce a module parameter "start_ro" which can be set to '1' at boot, at module load, or via /sys/module/md_mod/parameters/start_ro When this is set, new arrays get an 'auto-ro' mode, which disables all internal io (superblock updates, resync, recovery) and is automatically switched to 'rw' when the first write request arrives. The array can be set to true 'ro' mode using 'mdadm -r' before the first write request, or resync can be started without a write using 'mdadm -w'. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 51 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8238c1c0ec09..d002b8301fc2 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -131,6 +131,8 @@ static ctl_table raid_root_table[] = { static struct block_device_operations md_fops; +static int start_readonly; + /* * Enables to iterate over all existing md arrays * all_mddevs_lock protects this list. @@ -2029,6 +2031,9 @@ static int do_md_run(mddev_t * mddev) mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ mddev->barriers_work = 1; + if (start_readonly) + mddev->ro = 2; /* read-only, but switch on first write */ + /* before we start the array running, initialise the bitmap */ err = bitmap_create(mddev); if (err) @@ -2141,7 +2146,7 @@ static int do_md_stop(mddev_t * mddev, int ro) if (ro) { err = -ENXIO; - if (mddev->ro) + if (mddev->ro==1) goto out; mddev->ro = 1; } else { @@ -3258,12 +3263,22 @@ static int md_ioctl(struct inode *inode, struct file *file, /* * The remaining ioctls are changing the state of the - * superblock, so we do not allow read-only arrays - * here: + * superblock, so we do not allow them on read-only arrays. + * However non-MD ioctls (e.g. get-size) will still come through + * here and hit the 'default' below, so only disallow + * 'md' ioctls, and switch to rw mode if started auto-readonly. */ - if (mddev->ro) { - err = -EROFS; - goto abort_unlock; + if (_IOC_TYPE(cmd) == MD_MAJOR && + mddev->ro && mddev->pers) { + if (mddev->ro == 2) { + mddev->ro = 0; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + + } else { + err = -EROFS; + goto abort_unlock; + } } switch (cmd) @@ -3651,8 +3666,10 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { - if (mddev->ro) + if (mddev->ro==1) seq_printf(seq, " (read-only)"); + if (mddev->ro==2) + seq_printf(seq, "(auto-read-only)"); seq_printf(seq, " %s", mddev->pers->name); } @@ -3696,7 +3713,9 @@ static int md_seq_show(struct seq_file *seq, void *v) status_resync (seq, mddev); seq_printf(seq, "\n "); } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) - seq_printf(seq, " resync=DELAYED\n "); + seq_printf(seq, "\tresync=DELAYED\n "); + else if (mddev->recovery_cp < MaxSector) + seq_printf(seq, "\tresync=PENDING\n "); } else seq_printf(seq, "\n "); @@ -3833,6 +3852,13 @@ void md_write_start(mddev_t *mddev, struct bio *bi) if (bio_data_dir(bi) != WRITE) return; + BUG_ON(mddev->ro == 1); + if (mddev->ro == 2) { + /* need to switch to read/write */ + mddev->ro = 0; + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } atomic_inc(&mddev->writes_pending); if (mddev->in_sync) { spin_lock_irq(&mddev->write_lock); @@ -4431,6 +4457,23 @@ static __exit void md_exit(void) module_init(md_init) module_exit(md_exit) +static int get_ro(char *buffer, struct kernel_param *kp) +{ + return sprintf(buffer, "%d", start_readonly); +} +static int set_ro(const char *val, struct kernel_param *kp) +{ + char *e; + int num = simple_strtoul(val, &e, 10); + if (*val && (*e == '\0' || *e == '\n')) { + start_readonly = num; + return 0;; + } + return -EINVAL; +} + +module_param_call(start_ro, set_ro, get_ro, NULL, 0600); + EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); EXPORT_SYMBOL(md_error); -- cgit From f637b9f9fc195e4f4635faf495fd8b462c21b411 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:37 -0800 Subject: [PATCH] md: make sure /block link in /sys/.../md/ goes to correct devices If a block_device is a partition, then it's kobject is bdev->bd_part->kobj otherwise (if it is a full device), the kobject is bdev->bd_disk->kobj As md wants back-links to the correct object (whether partition or not), we need to respect this difference... (Thus current code shows a link to the whole device, whether we are using a partition or not, which is wrong). Signed-off-by: Neil Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d002b8301fc2..2b5928976095 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1182,6 +1182,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { mdk_rdev_t *same_pdev; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; + struct kobject *ko; if (rdev->mddev) { MD_BUG(); @@ -1221,7 +1222,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) rdev->kobj.parent = &mddev->kobj; kobject_add(&rdev->kobj); - sysfs_create_link(&rdev->kobj, &rdev->bdev->bd_disk->kobj, "block"); + if (rdev->bdev->bd_part) + ko = &rdev->bdev->bd_part->kobj; + else + ko = &rdev->bdev->bd_disk->kobj; + sysfs_create_link(&rdev->kobj, ko, "block"); return 0; } -- cgit From 96de1e663cda65ba9275afb1bc007f34e5068ba1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:39 -0800 Subject: [PATCH] md: fix some locking and module refcounting issues with md's use of sysfs 1/ I really should be using the __ATTR macros for defining attributes, so that the .owner field get set properly, otherwise modules can be removed while sysfs files are open. This also involves some name changes of _show routines. 2/ Always lock the mddev (against reconfiguration) for all sysfs attribute access. This easily avoid certain races and is completely consistant with other interfaces (ioctl and /proc/mdstat both always lock against reconfiguration). 3/ raid5 attributes must check that the 'conf' structure actually exists (the array could have been stopped while an attribute file was open). 4/ A missing 'kfree' from when the raid5_conf_t was converted to have a kobject embedded, and then converted back again. Signed-off-by: Neil Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 62 ++++++++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 34 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2b5928976095..b2d8813ed716 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1504,7 +1504,7 @@ struct rdev_sysfs_entry { }; static ssize_t -rdev_show_state(mdk_rdev_t *rdev, char *page) +state_show(mdk_rdev_t *rdev, char *page) { char *sep = ""; int len=0; @@ -1525,13 +1525,11 @@ rdev_show_state(mdk_rdev_t *rdev, char *page) return len+sprintf(page+len, "\n"); } -static struct rdev_sysfs_entry rdev_state = { - .attr = {.name = "state", .mode = S_IRUGO }, - .show = rdev_show_state, -}; +static struct rdev_sysfs_entry +rdev_state = __ATTR_RO(state); static ssize_t -rdev_show_super(mdk_rdev_t *rdev, char *page) +super_show(mdk_rdev_t *rdev, char *page) { if (rdev->sb_loaded && rdev->sb_size) { memcpy(page, page_address(rdev->sb_page), rdev->sb_size); @@ -1539,10 +1537,8 @@ rdev_show_super(mdk_rdev_t *rdev, char *page) } else return 0; } -static struct rdev_sysfs_entry rdev_super = { - .attr = {.name = "super", .mode = S_IRUGO }, - .show = rdev_show_super, -}; +static struct rdev_sysfs_entry rdev_super = __ATTR_RO(super); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_super.attr, @@ -1728,7 +1724,7 @@ static void analyze_sbs(mddev_t * mddev) } static ssize_t -md_show_level(mddev_t *mddev, char *page) +level_show(mddev_t *mddev, char *page) { mdk_personality_t *p = mddev->pers; if (p == NULL) @@ -1739,21 +1735,15 @@ md_show_level(mddev_t *mddev, char *page) return sprintf(page, "%s\n", p->name); } -static struct md_sysfs_entry md_level = { - .attr = {.name = "level", .mode = S_IRUGO }, - .show = md_show_level, -}; +static struct md_sysfs_entry md_level = __ATTR_RO(level); static ssize_t -md_show_rdisks(mddev_t *mddev, char *page) +raid_disks_show(mddev_t *mddev, char *page) { return sprintf(page, "%d\n", mddev->raid_disks); } -static struct md_sysfs_entry md_raid_disks = { - .attr = {.name = "raid_disks", .mode = S_IRUGO }, - .show = md_show_rdisks, -}; +static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); static ssize_t md_show_scan(mddev_t *mddev, char *page) @@ -1782,10 +1772,10 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - down(&mddev->reconfig_sem); + if (mddev->pers && mddev->pers->sync_request) canscan=1; - up(&mddev->reconfig_sem); + if (!canscan) return -EINVAL; @@ -1801,22 +1791,18 @@ md_store_scan(mddev_t *mddev, const char *page, size_t len) } static ssize_t -md_show_mismatch(mddev_t *mddev, char *page) +mismatch_cnt_show(mddev_t *mddev, char *page) { return sprintf(page, "%llu\n", (unsigned long long) mddev->resync_mismatches); } -static struct md_sysfs_entry md_scan_mode = { - .attr = {.name = "scan_mode", .mode = S_IRUGO|S_IWUSR }, - .show = md_show_scan, - .store = md_store_scan, -}; +static struct md_sysfs_entry +md_scan_mode = __ATTR(scan_mode, S_IRUGO|S_IWUSR, md_show_scan, md_store_scan); -static struct md_sysfs_entry md_mismatches = { - .attr = {.name = "mismatch_cnt", .mode = S_IRUGO }, - .show = md_show_mismatch, -}; + +static struct md_sysfs_entry +md_mismatches = __ATTR_RO(mismatch_cnt); static struct attribute *md_default_attrs[] = { &md_level.attr, @@ -1831,10 +1817,14 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + ssize_t rv; if (!entry->show) return -EIO; - return entry->show(mddev, page); + mddev_lock(mddev); + rv = entry->show(mddev, page); + mddev_unlock(mddev); + return rv; } static ssize_t @@ -1843,10 +1833,14 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, { struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); + ssize_t rv; if (!entry->store) return -EIO; - return entry->store(mddev, page, length); + mddev_lock(mddev); + rv = entry->store(mddev, page, length); + mddev_unlock(mddev); + return rv; } static void md_free(struct kobject *ko) -- cgit From 411036fa1924f5e5b0f7f9d04ae5d8cdc72fb839 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:40 -0800 Subject: [PATCH] md: split off some md attributes in sysfs to a separate group Some, but not all, md array support data redundancy and hence support checking and restoring that redundancy (resync, rebuild). Some attributes apply specifically to functions involving this redundancy, and so should only appear for md arrays for which they are meaningful. i.e. they should not appear for raid0, linear, multpath, faulty. This patch separates these into a distinct group and creates the group only if the personality supports sync_request. Signed-off-by: Neil Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index b2d8813ed716..292dad31d5e5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1807,10 +1807,19 @@ md_mismatches = __ATTR_RO(mismatch_cnt); static struct attribute *md_default_attrs[] = { &md_level.attr, &md_raid_disks.attr, + NULL, +}; + +static struct attribute *md_redundancy_attrs[] = { &md_scan_mode.attr, &md_mismatches.attr, NULL, }; +static struct attribute_group md_redundancy_group = { + .name = NULL, + .attrs = md_redundancy_attrs, +}; + static ssize_t md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) @@ -2047,6 +2056,8 @@ static int do_md_run(mddev_t * mddev) bitmap_destroy(mddev); return err; } + if (mddev->pers->sync_request) + sysfs_create_group(&mddev->kobj, &md_redundancy_group); atomic_set(&mddev->writes_pending,0); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; @@ -2155,6 +2166,9 @@ static int do_md_stop(mddev_t * mddev, int ro) set_disk_ro(disk, 0); blk_queue_make_request(mddev->queue, md_fail_request); mddev->pers->stop(mddev); + if (mddev->pers->sync_request) + sysfs_remove_group(&mddev->kobj, &md_redundancy_group); + module_put(mddev->pers->owner); mddev->pers = NULL; if (mddev->ro) -- cgit From 8e1b39d623359e5ef7983a2bd0fb676be45cba31 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:41 -0800 Subject: [PATCH] md: only try to print recovery/resync status for personalities that support recovery The introduction of 'resync=PENDING' (for read-only devices) caused that message to appear for non-syncable arrays like raid0 and linear. Simplest thing is to not try to print any resync info unless the personality clearly supports it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 292dad31d5e5..47b8685d4bdf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3722,13 +3722,15 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->pers) { mddev->pers->status (seq, mddev); seq_printf(seq, "\n "); - if (mddev->curr_resync > 2) { - status_resync (seq, mddev); - seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) - seq_printf(seq, "\tresync=DELAYED\n "); - else if (mddev->recovery_cp < MaxSector) - seq_printf(seq, "\tresync=PENDING\n "); + if (mddev->pers->sync_request) { + if (mddev->curr_resync > 2) { + status_resync (seq, mddev); + seq_printf(seq, "\n "); + } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, "\tresync=DELAYED\n "); + else if (mddev->recovery_cp < MaxSector) + seq_printf(seq, "\tresync=PENDING\n "); + } } else seq_printf(seq, "\n "); -- cgit From fd9d49cac46f5758d513ccf831b599dd4412546f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:42 -0800 Subject: [PATCH] md: ignore auto-readonly flag for arrays where it isn't meaningful The 'auto-readonly' flag (which suppresses resync and superblock updates until the first write) is not meaningful for personalities that don't support resync or superblock writes (raid0, linear, etc). So clear the setting early to avoid it confusing anything - e.g. appearing in /proc/mdstat Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 47b8685d4bdf..25f2bbfe6a2b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2058,6 +2058,9 @@ static int do_md_run(mddev_t * mddev) } if (mddev->pers->sync_request) sysfs_create_group(&mddev->kobj, &md_redundancy_group); + else if (mddev->ro == 2) /* auto-readonly not meaningful */ + mddev->ro = 0; + atomic_set(&mddev->writes_pending,0); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; -- cgit From 787453c2397edcc3261efebb661739acd8c38547 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:43 -0800 Subject: [PATCH] md: complete conversion of md to use kthreads There are a few loose ends following the conversion of md to use kthreads: - Some fields in mdk_thread_t that aren't needed (kthreads does it's own completion and manages it's own name). - thread->run is now never NULL, so no need to check - Some tests for signal_pending that aren't needed (As we don't use signals to stop threads any more) - Some flush_signals are not needed - Some waits are interruptible and don't need to be. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 25f2bbfe6a2b..097ae1b5484b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3424,21 +3424,17 @@ static int md_thread(void * arg) */ allow_signal(SIGKILL); - complete(thread->event); while (!kthread_should_stop()) { - void (*run)(mddev_t *); - wait_event_interruptible_timeout(thread->wqueue, - test_bit(THREAD_WAKEUP, &thread->flags) - || kthread_should_stop(), - thread->timeout); + wait_event_timeout(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags) + || kthread_should_stop(), + thread->timeout); try_to_freeze(); clear_bit(THREAD_WAKEUP, &thread->flags); - run = thread->run; - if (run) - run(thread->mddev); + thread->run(thread->mddev); } return 0; @@ -3457,7 +3453,6 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, const char *name) { mdk_thread_t *thread; - struct completion event; thread = kmalloc(sizeof(mdk_thread_t), GFP_KERNEL); if (!thread) @@ -3466,18 +3461,14 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, memset(thread, 0, sizeof(mdk_thread_t)); init_waitqueue_head(&thread->wqueue); - init_completion(&event); - thread->event = &event; thread->run = run; thread->mddev = mddev; - thread->name = name; thread->timeout = MAX_SCHEDULE_TIMEOUT; thread->tsk = kthread_run(md_thread, thread, name, mdname(thread->mddev)); if (IS_ERR(thread->tsk)) { kfree(thread); return NULL; } - wait_for_completion(&event); return thread; } @@ -3941,9 +3932,7 @@ static void md_do_sync(mddev_t *mddev) mddev->curr_resync = 2; try_again: - if (signal_pending(current) || - kthread_should_stop()) { - flush_signals(current); + if (kthread_should_stop()) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto skip; } @@ -3963,9 +3952,8 @@ static void md_do_sync(mddev_t *mddev) * time 'round when curr_resync == 2 */ continue; - prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); - if (!signal_pending(current) && - !kthread_should_stop() && + prepare_to_wait(&resync_wait, &wq, TASK_UNINTERRUPTIBLE); + if (!kthread_should_stop() && mddev2->curr_resync >= mddev->curr_resync) { printk(KERN_INFO "md: delaying resync of %s" " until %s has finished resync (they" @@ -4074,13 +4062,12 @@ static void md_do_sync(mddev_t *mddev) } - if (signal_pending(current) || kthread_should_stop()) { + if (kthread_should_stop()) { /* * got a signal, exit. */ printk(KERN_INFO "md: md_do_sync() got signal ... exiting\n"); - flush_signals(current); set_bit(MD_RECOVERY_INTR, &mddev->recovery); goto out; } @@ -4102,7 +4089,7 @@ static void md_do_sync(mddev_t *mddev) if (currspeed > sysctl_speed_limit_min) { if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { - msleep_interruptible(250); + msleep(250); goto repeat; } } -- cgit From 7eec314d7512d5281742263cff3853b43df431db Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:44 -0800 Subject: [PATCH] md: improve 'scan_mode' and rename it to 'sync_action' The current sync_action for an array can be one of idle - nothing happening resync - reduncancy being recalcualted recover - missing device being recoverred to spare check - user initiated check of redundancy repair - like resync but user-initiated and ignores bitmap optimisation. Each of these strings can also be written to the 'sync_action' file to cause that action to happen (if appropriate). While 'sync' is not technically correct, as a recovery is *not* a 'sync', I think it is the most servicable word here. Also 'action' is a strong word than 'mode'. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 097ae1b5484b..023aecd0295e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1746,9 +1746,9 @@ raid_disks_show(mddev_t *mddev, char *page) static struct md_sysfs_entry md_raid_disks = __ATTR_RO(raid_disks); static ssize_t -md_show_scan(mddev_t *mddev, char *page) +action_show(mddev_t *mddev, char *page) { - char *type = "none"; + char *type = "idle"; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -1765,27 +1765,36 @@ md_show_scan(mddev_t *mddev, char *page) } static ssize_t -md_store_scan(mddev_t *mddev, const char *page, size_t len) +action_store(mddev_t *mddev, const char *page, size_t len) { - int canscan=0; + if (!mddev->pers || !mddev->pers->sync_request) + return -EINVAL; + + if (strcmp(page, "idle")==0 || strcmp(page, "idle\n")==0) { + if (mddev->sync_thread) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(mddev->sync_thread); + mddev->sync_thread = NULL; + mddev->recovery = 0; + } + return len; + } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) return -EBUSY; - - if (mddev->pers && mddev->pers->sync_request) - canscan=1; - - if (!canscan) - return -EINVAL; - - if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) - set_bit(MD_RECOVERY_CHECK, &mddev->recovery); - else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) - return -EINVAL; - set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + if (strcmp(page, "resync")==0 || strcmp(page, "resync\n")==0 || + strcmp(page, "recover")==0 || strcmp(page, "recover\n")==0) + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + else { + if (strcmp(page, "check")==0 || strcmp(page, "check\n")==0) + set_bit(MD_RECOVERY_CHECK, &mddev->recovery); + else if (strcmp(page, "repair")!=0 && strcmp(page, "repair\n")!=0) + return -EINVAL; + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); + set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + } md_wakeup_thread(mddev->thread); return len; } @@ -1798,7 +1807,7 @@ mismatch_cnt_show(mddev_t *mddev, char *page) } static struct md_sysfs_entry -md_scan_mode = __ATTR(scan_mode, S_IRUGO|S_IWUSR, md_show_scan, md_store_scan); +md_scan_mode = __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static struct md_sysfs_entry -- cgit From bb636547b02411ca5eef87b1d030ea3fc090a717 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Nov 2005 21:39:45 -0800 Subject: [PATCH] md: document sysfs usage of md, and make a couple of small refinements Document in Documentation/md.txt the files that now appear in sysfs, and make a couple of small refinements to exactly when 'level' and 'raid_disks' are empty, to make it match the documentation. Signed-off-by: Neil Brown Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/md.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 023aecd0295e..adf960d8a7c9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1727,7 +1727,7 @@ static ssize_t level_show(mddev_t *mddev, char *page) { mdk_personality_t *p = mddev->pers; - if (p == NULL) + if (p == NULL && mddev->raid_disks == 0) return 0; if (mddev->level >= 0) return sprintf(page, "RAID-%d\n", mddev->level); @@ -1740,6 +1740,8 @@ static struct md_sysfs_entry md_level = __ATTR_RO(level); static ssize_t raid_disks_show(mddev_t *mddev, char *page) { + if (mddev->raid_disks == 0) + return 0; return sprintf(page, "%d\n", mddev->raid_disks); } -- cgit