diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/btree.c | 2 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 55 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 6 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 35 | ||||
-rw-r--r-- | drivers/md/md-bitmap.h | 17 | ||||
-rw-r--r-- | drivers/md/md.c | 14 | ||||
-rw-r--r-- | drivers/md/raid1-10.c | 10 | ||||
-rw-r--r-- | drivers/md/raid1.c | 19 | ||||
-rw-r--r-- | drivers/md/raid10.c | 11 |
9 files changed, 106 insertions, 63 deletions
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 2cc2eb24dc8a..1d0100677357 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -89,8 +89,6 @@ * Test module load/unload */ -#define MAX_NEED_GC 64 -#define MAX_SAVE_PRIO 72 #define MAX_GC_TIMES 100 #define MIN_GC_NODES 100 #define GC_SLEEP_MS 100 diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1a2ce1a4b456..1efb768b2890 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1733,7 +1733,12 @@ static CLOSURE_CALLBACK(cache_set_flush) mutex_unlock(&b->write_lock); } - if (ca->alloc_thread) + /* + * If the register_cache_set() call to bch_cache_set_alloc() failed, + * ca has not been assigned a value and return error. + * So we need check ca is not NULL during bch_cache_set_unregister(). + */ + if (ca && ca->alloc_thread) kthread_stop(ca->alloc_thread); if (c->journal.cur) { @@ -2233,15 +2238,47 @@ static int cache_alloc(struct cache *ca) bio_init(&ca->journal.bio, NULL, ca->journal.bio.bi_inline_vecs, 8, 0); /* - * when ca->sb.njournal_buckets is not zero, journal exists, - * and in bch_journal_replay(), tree node may split, - * so bucket of RESERVE_BTREE type is needed, - * the worst situation is all journal buckets are valid journal, - * and all the keys need to replay, - * so the number of RESERVE_BTREE type buckets should be as much - * as journal buckets + * When the cache disk is first registered, ca->sb.njournal_buckets + * is zero, and it is assigned in run_cache_set(). + * + * When ca->sb.njournal_buckets is not zero, journal exists, + * and in bch_journal_replay(), tree node may split. + * The worst situation is all journal buckets are valid journal, + * and all the keys need to replay, so the number of RESERVE_BTREE + * type buckets should be as much as journal buckets. + * + * If the number of RESERVE_BTREE type buckets is too few, the + * bch_allocator_thread() may hang up and unable to allocate + * bucket. The situation is roughly as follows: + * + * 1. In bch_data_insert_keys(), if the operation is not op->replace, + * it will call the bch_journal(), which increments the journal_ref + * counter. This counter is only decremented after bch_btree_insert + * completes. + * + * 2. When calling bch_btree_insert, if the btree needs to split, + * it will call btree_split() and btree_check_reserve() to check + * whether there are enough reserved buckets in the RESERVE_BTREE + * slot. If not enough, bcache_btree_root() will repeatedly retry. + * + * 3. Normally, the bch_allocator_thread is responsible for filling + * the reservation slots from the free_inc bucket list. When the + * free_inc bucket list is exhausted, the bch_allocator_thread + * will call invalidate_buckets() until free_inc is refilled. + * Then bch_allocator_thread calls bch_prio_write() once. and + * bch_prio_write() will call bch_journal_meta() and waits for + * the journal write to complete. + * + * 4. During journal_write, journal_write_unlocked() is be called. + * If journal full occurs, journal_reclaim() and btree_flush_write() + * will be called sequentially, then retry journal_write. + * + * 5. When 2 and 4 occur together, IO will hung up and cannot recover. + * + * Therefore, reserve more RESERVE_BTREE type buckets. */ - btree_buckets = ca->sb.njournal_buckets ?: 8; + btree_buckets = clamp_t(size_t, ca->sb.nbuckets >> 7, + 32, SB_JOURNAL_BUCKETS); free = roundup_pow_of_two(ca->sb.nbuckets) >> 10; if (!free) { ret = -EPERM; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 127138c61be5..d296770478b2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1356,11 +1356,7 @@ static int parse_raid_params(struct raid_set *rs, struct dm_arg_set *as, return -EINVAL; } - /* - * In device-mapper, we specify things in sectors, but - * MD records this value in kB - */ - if (value < 0 || value / 2 > COUNTER_MAX) { + if (value < 0) { rs->ti->error = "Max write-behind limit out of range"; return -EINVAL; } diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 37b08f26c62f..bd694910b01b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -105,9 +105,19 @@ * */ +typedef __u16 bitmap_counter_t; + #define PAGE_BITS (PAGE_SIZE << 3) #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) + #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) @@ -789,7 +799,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. */ write_behind = bitmap->mddev->bitmap_info.max_write_behind; - if (write_behind > COUNTER_MAX) + if (write_behind > COUNTER_MAX / 2) write_behind = COUNTER_MAX / 2; sb->write_behind = cpu_to_le32(write_behind); bitmap->mddev->bitmap_info.max_write_behind = write_behind; @@ -1672,13 +1682,13 @@ __acquires(bitmap->lock) &(bitmap->bp[page].map[pageoff]); } -static int bitmap_startwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors) +static void bitmap_start_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) - return 0; + return; while (sectors) { sector_t blocks; @@ -1688,7 +1698,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); if (!bmc) { spin_unlock_irq(&bitmap->counts.lock); - return 0; + return; } if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { @@ -1724,11 +1734,10 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, else sectors = 0; } - return 0; } -static void bitmap_endwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors) +static void bitmap_end_write(struct mddev *mddev, sector_t offset, + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; @@ -2205,9 +2214,9 @@ static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) return ERR_PTR(err); } -static int bitmap_create(struct mddev *mddev, int slot) +static int bitmap_create(struct mddev *mddev) { - struct bitmap *bitmap = __bitmap_create(mddev, slot); + struct bitmap *bitmap = __bitmap_create(mddev, -1); if (IS_ERR(bitmap)) return PTR_ERR(bitmap); @@ -2670,7 +2679,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len) } mddev->bitmap_info.offset = offset; - rv = bitmap_create(mddev, -1); + rv = bitmap_create(mddev); if (rv) goto out; @@ -3003,8 +3012,8 @@ static struct bitmap_operations bitmap_ops = { .end_behind_write = bitmap_end_behind_write, .wait_behind_writes = bitmap_wait_behind_writes, - .startwrite = bitmap_startwrite, - .endwrite = bitmap_endwrite, + .start_write = bitmap_start_write, + .end_write = bitmap_end_write, .start_sync = bitmap_start_sync, .end_sync = bitmap_end_sync, .cond_end_sync = bitmap_cond_end_sync, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 31c93019c76b..59e9dd45cfde 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -9,15 +9,6 @@ #define BITMAP_MAGIC 0x6d746962 -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) - /* use these for bitmap->flags and bitmap->sb->state bit-fields */ enum bitmap_state { BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ @@ -72,7 +63,7 @@ struct md_bitmap_stats { struct bitmap_operations { bool (*enabled)(struct mddev *mddev); - int (*create)(struct mddev *mddev, int slot); + int (*create)(struct mddev *mddev); int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, bool init); @@ -89,10 +80,10 @@ struct bitmap_operations { void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); - int (*startwrite)(struct mddev *mddev, sector_t offset, + void (*start_write)(struct mddev *mddev, sector_t offset, + unsigned long sectors); + void (*end_write)(struct mddev *mddev, sector_t offset, unsigned long sectors); - void (*endwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0fde115e921f..09042b060086 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6225,7 +6225,7 @@ int md_run(struct mddev *mddev) } if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (err) pr_warn("%s: failed to create bitmap (%d)\n", mdname(mddev), err); @@ -7285,7 +7285,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = 0; if (mddev->pers) { if (fd >= 0) { - err = mddev->bitmap_ops->create(mddev, -1); + err = mddev->bitmap_ops->create(mddev); if (!err) err = mddev->bitmap_ops->load(mddev); @@ -7601,7 +7601,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - rv = mddev->bitmap_ops->create(mddev, -1); + rv = mddev->bitmap_ops->create(mddev); if (!rv) rv = mddev->bitmap_ops->load(mddev); @@ -8799,14 +8799,14 @@ static void md_bitmap_start(struct mddev *mddev, mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, &md_io_clone->sectors); - mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->start_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) { - mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, - md_io_clone->sectors); + mddev->bitmap_ops->end_write(mddev, md_io_clone->offset, + md_io_clone->sectors); } static void md_end_clone_io(struct bio *bio) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index c7efd8aab675..b8b3a9069701 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -293,3 +293,13 @@ static inline bool raid1_should_read_first(struct mddev *mddev, return false; } + +/* + * bio with REQ_RAHEAD or REQ_NOWAIT can fail at anytime, before such IO is + * submitted to the underlying disks, hence don't record badblocks or retry + * in this case. + */ +static inline bool raid1_should_handle_error(struct bio *bio) +{ + return !(bio->bi_opf & (REQ_RAHEAD | REQ_NOWAIT)); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 657d481525be..19c5a0ce5a40 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -373,14 +373,16 @@ static void raid1_end_read_request(struct bio *bio) */ update_head_pos(r1_bio->read_disk, r1_bio); - if (uptodate) + if (uptodate) { set_bit(R1BIO_Uptodate, &r1_bio->state); - else if (test_bit(FailFast, &rdev->flags) && - test_bit(R1BIO_FailFast, &r1_bio->state)) + } else if (test_bit(FailFast, &rdev->flags) && + test_bit(R1BIO_FailFast, &r1_bio->state)) { /* This was a fail-fast read so we definitely * want to retry */ ; - else { + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; + } else { /* If all other devices have failed, we want to return * the error upwards rather than fail the last device. * Here we redefine "uptodate" to mean "Don't want to retry" @@ -451,16 +453,15 @@ static void raid1_end_write_request(struct bio *bio) struct bio *to_put = NULL; int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; - bool discard_error; sector_t lo = r1_bio->sector; sector_t hi = r1_bio->sector + r1_bio->sectors; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); /* * 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & @@ -511,7 +512,7 @@ static void raid1_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && - !discard_error) { + !ignore_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index dce06bf65016..b74780af4c22 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -399,6 +399,8 @@ static void raid10_end_read_request(struct bio *bio) * wait for the 'master' bio. */ set_bit(R10BIO_Uptodate, &r10_bio->state); + } else if (!raid1_should_handle_error(bio)) { + uptodate = 1; } else { /* If all other devices that store this block have * failed, we want to return the error upwards rather @@ -456,9 +458,8 @@ static void raid10_end_write_request(struct bio *bio) int slot, repl; struct md_rdev *rdev = NULL; struct bio *to_put = NULL; - bool discard_error; - - discard_error = bio->bi_status && bio_op(bio) == REQ_OP_DISCARD; + bool ignore_error = !raid1_should_handle_error(bio) || + (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); @@ -472,7 +473,7 @@ static void raid10_end_write_request(struct bio *bio) /* * this branch is our 'one mirror IO has finished' event handler: */ - if (bio->bi_status && !discard_error) { + if (bio->bi_status && !ignore_error) { if (repl) /* Never record new bad blocks to replacement, * just fail it. @@ -527,7 +528,7 @@ static void raid10_end_write_request(struct bio *bio) /* Maybe we can clear some bad blocks. */ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, r10_bio->sectors) && - !discard_error) { + !ignore_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; |